library(tidyverse) library(dplyr) library(broom) library(ggnewscale) library(pcadapt) library(pcadapt) library(vcfR) library(factoextra) library(ggfortify) library(viridis) library(scales) library(adegenet) library(pegas) library(poppr) nuclear_snps <- read.csv("silky_30x_SNPs_popoolation_FINAL-NOMULTIALLELIC.csv") #set up new names old_names <- c("AF1", "GOM1", "BRA1", "WNA1", "EPAC1", "NCP1", "SCP1", "PNG1", "TAI1", "RSC1", "IO1") new_names <- c("AFR", "GOM", "BRA", "NWA", "EPAC", "NCP", "SCP", "PNG", "TAI", "RDS", "IDO") nuclear_snps_test <- nuclear_snps %>% select(snpid, pair) %>% mutate(pair = str_remove(pair, "silky9_")) %>% mutate(pair = str_split(pair, "_")) %>% group_by(snpid) %>% summarize(allpops = list(pair)) %>% mutate(allpops = lapply(allpops, unlist)) %>% mutate(allpops = lapply(allpops, unique)) %>% mutate(numpops = lapply(allpops, length)) %>% mutate(numpops = as.numeric(numpops)) %>% ungroup() %>% left_join(unique(nuclear_snps[,1:2])) #write chrom + pos to file #vcftools --vcf filtered_TotalRawSNPs_ddocent_manual_snp_calling.vcf --positions silky_nuc_chrompos.tsv --recode --recode-INFO-all --out silky_nuclear_final_snps.vcf #silky_nuclear_final_snps.vcf.recode.vcf write_tsv(unique(nuclear_snps[,2:3]), file = "silky_nuc_chrompos.tsv", col_names = FALSE) #mean read depth per pool #vcftools --vcf silky_nuclear_final_snps.vcf.recode.vcf --depth --out depth #depth.idepth all_depths <- read.table("site-depth.ldepth", header = T) mean(all_depths$SUM_DEPTH) #count SNPs and loci per pool all_counts <- nuclear_snps_test %>% mutate(AF1 = as.numeric(lapply(allpops, function(x) ifelse("AF1" %in% x, 1, 0)))) %>% mutate(GOM1 = as.numeric(lapply(allpops, function(x) ifelse("GOM1" %in% x, 1, 0)))) %>% mutate(BRA1 = as.numeric(lapply(allpops, function(x) ifelse("BRA1" %in% x, 1, 0)))) %>% mutate(WNA1 = as.numeric(lapply(allpops, function(x) ifelse("WNA1" %in% x, 1, 0)))) %>% mutate(EPAC1 = as.numeric(lapply(allpops, function(x) ifelse("EPAC1" %in% x, 1, 0)))) %>% mutate(NCP1 = as.numeric(lapply(allpops, function(x) ifelse("NCP1" %in% x, 1, 0)))) %>% mutate(SCP1 = as.numeric(lapply(allpops, function(x) ifelse("SCP1" %in% x, 1, 0)))) %>% mutate(PNG1 = as.numeric(lapply(allpops, function(x) ifelse("PNG1" %in% x, 1, 0)))) %>% mutate(TAI1 = as.numeric(lapply(allpops, function(x) ifelse("TAI1" %in% x, 1, 0)))) %>% mutate(RSC1 = as.numeric(lapply(allpops, function(x) ifelse("RSC1" %in% x, 1, 0)))) %>% mutate(IO1 = as.numeric(lapply(allpops, function(x) ifelse("IO1" %in% x, 1, 0)))) snp_counts <- all_counts %>% select(5:15) %>% colSums() locus_counts <- all_counts %>% select(4:15) %>% unique() %>% select(-CHROM) %>% colSums() nuclear_snps_table <- nuclear_snps %>% select(snpid, CHROM, POS, value, pair) %>% unique() %>% group_by(pair) %>% summarize(mean(value)) empty_same_pop <- data.frame("pop1" = old_names, "pop2" = old_names, "fst" = NA) nuclear_snps_table_for_plot <- nuclear_snps_table %>% mutate(pair = str_remove(pair, "silky9_")) %>% mutate(pair = str_split(pair, "_")) names(nuclear_snps_table_for_plot)[2] = "fst" nuclear_snps_table_for_plot <- nuclear_snps_table_for_plot %>% mutate(pair = lapply(pair, function(x) x[order(match(x, old_names))])) %>% mutate(pair = lapply(pair, function(x) paste(x, collapse = "_"))) %>% separate(pair, into = c("pop1", "pop2")) %>% bind_rows(., empty_same_pop) %>% mutate(pop1_f = factor(pop1, levels = old_names, labels = new_names)) %>% mutate(pop2_f = factor(pop2, levels = old_names, labels = new_names)) #plot ggplot(nuclear_snps_table_for_plot, aes(y = pop1_f, x = pop2_f)) + geom_tile(aes(fill = fst)) + scale_fill_gradient(low = "yellow", high = "red") + geom_text(aes(label = round(fst, 3))) + guides(fill = FALSE) + xlab("") + ylab("") #####TEMPORARY until get mitochondrial table from server mt_snps_table_for_plot <- read.csv("silky-mt-copy.csv") names(mt_snps_table_for_plot)[1] <- "pop1" mt_snps_table_for_plot <- mt_snps_table_for_plot %>% mutate(pop1_f = factor(pop1, levels = old_names, labels = new_names)) %>% mutate(pop2_f = factor(pop2, levels = old_names, labels = new_names)) #plot ggplot(mt_snps_table_for_plot, aes(y = pop2_f, x = pop1_f)) + geom_tile(aes(fill = fst)) + scale_fill_gradient(low = "yellow", high = "red") + geom_text(aes(label = round(fst, 3))) + guides(fill = FALSE) + xlab("") + ylab("") #plot both ggplot() + geom_tile(data = nuclear_snps_table_for_plot, aes(fill = fst, y = pop1_f, x = pop2_f)) + scale_fill_gradient(low = "yellow", high = "red", na.value="transparent") + geom_text(data = nuclear_snps_table_for_plot, aes(y = pop1_f, x = pop2_f, label = round(fst, 3))) + guides(fill = FALSE) + ggnewscale::new_scale_fill() + geom_tile(data = mt_snps_table_for_plot, aes(fill = fst, x = pop1_f, y = pop2_f)) + scale_fill_distiller(palette = "Blues", direction = 1) + geom_text(data = mt_snps_table_for_plot, aes(x = pop1_f, y = pop2_f, label = round(fst, 3))) + guides(fill = FALSE) + xlab("") + ylab("") + theme_bw() + theme(panel.border = element_rect(colour = "black", fill=NA, size=5), axis.text.x = element_text(angle=90, vjust=.5, hjust=1, size=20), axis.text.y = element_text(size=20)) #wilcoxon rank sum test - ????? not right p_out <- pairwise.wilcox.test(nuclear_snps$value, nuclear_snps$pair, p.adjust.method = "BH", alternative = "two.sided") p_out_tidy <- tidy(p_out) #PCA - VCF (012 file) #make bed #./plink --vcf silky_nuclear_final_snps.vcf.recode.vcf --make-bed --out silky_nuclear_final_snps --allow-extra-chr --missing-phenotype 9 pc <- read.pcadapt(input = "silky_nuclear_final_snps.bed", type = "bed") test <- pcadapt::bed2matrix("silky_nuclear_final_snps.bed") pops <- c("AF_1", "BRA_1", "EPAC_1", "GOM_1", "IO_1", "NCP_1", "PNG_1", "RSC_1", "SCP_1", "TAI_1", "WNA_1") pops_2 <- c("Atlantic", "Atlantic", "Indo-Pacific", "Atlantic", "Indo-Pacific", "Indo-Pacific", "Indo-Pacific", "Indo-Pacific", "Indo-Pacific", "Indo-Pacific", "Atlantic") #PCA - pool (MAF matrix) vcf <- read.vcfR("silky_nuclear_final_snps.vcf.recode.vcf") vcf_tidy <- vcfR2tidy(vcf) summary_out <- vcf_tidy$fix genos_out <- vcf_tidy$gt #get min/maj alleles summary_out <- summary_out %>% select(ChromKey, POS, RO, AO) %>% mutate(RO = as.numeric(RO)) %>% mutate(AO = as.numeric(AO)) %>% mutate(which_major = ifelse(RO > AO, "RO", "AO")) %>% select(-RO, -AO) #add to genos genos_out <- genos_out %>% left_join(., summary_out, by = c("ChromKey", "POS")) %>% mutate(gt_RO = as.numeric(gt_RO)) %>% mutate(gt_AO = as.numeric(gt_AO)) %>% mutate(major_count = ifelse(which_major == "RO", gt_RO, gt_AO)) %>% mutate(major_freq = major_count / gt_DP) major_freqs <- genos_out %>% select(ChromKey, POS, Indiv, major_freq) %>% unite(c("ChromKey", "POS"), col = "snpid", sep = "_") %>% pivot_wider(., names_from = Indiv, values_from = major_freq) %>% select(-snpid) %>% as.matrix() major_freqs_df <- major_freqs %>% t() %>% as_tibble() %>% bind_cols(., data.frame("Pops" = pops)) %>% bind_cols(., data.frame("Ocean" = pops_2)) pc_out_2 <- prcomp(t(major_freqs), scale = F, center = F, tol = 1e-04) autoplot(pc_out_2, data = major_freqs_df, colour = "Ocean", size = 6) + scale_color_viridis_d(end = 0.6) + scale_y_continuous(labels = label_number(accuracy = 0.01)) + scale_x_continuous(labels = label_number(accuracy = 0.01)) + theme_bw() + theme(panel.border = element_rect(colour = "black", fill=NA, size=5), axis.text = element_text(size=18), axis.title = element_text(size=20), legend.text = element_text(size=20), legend.title = element_text(size=20)) # AMOVA genind <- vcfR2genlight(x = vcf) pop(genind) <- pops my_strata <- data.frame(regions = pops_2, populations = pops) strata(genind) <- my_strata setPop(genind) <- ~regions genind_dist <- pairDist(genind, within = T) dat <- genind_dist$data %>% separate(., col = groups, into = c("pop1", "pop2"), sep = "-") %>% pivot_wider(names_from = pop2, values_from = distance) dat_mat <- as.matrix(dat) rownames(dat_mat) <- dat_mat[,1] colnames(dat_mat)[1] <- "AF_1" dat_mat[,1] <- NA dat_mat <- t(dat_mat) dat_dist <- as.dist(dat_mat, diag = T) amova_out <- poppr.amova(genind, hier = ~regions, method = "ade4", within = F, freq = T) # $call # ade4::amova(samples = xtab, distances = xdist, structures = xstruct) # # $results # Df Sum Sq Mean Sq # Between samples 1 299.8182 299.8182 # Within samples 9 792.0000 88.0000 # Total 10 1091.8182 109.1818 # # $componentsofcovariance # Sigma % # Variations Between samples 41.60714 32.10251 # Variations Within samples 88.00000 67.89749 # Total variations 129.60714 100.00000 # # $statphi # Phi # Phi-samples-total 0.3210251 rt_out <- randtest(amova_out) # Monte-Carlo test # Call: as.randtest(sim = res, obs = sigma[1]) # # Observation: 41.60714 # # Based on 99 replicates # Simulated p-value: 0.01 # Alternative hypothesis: greater # # Std.Obs Expectation Variance # 7.347787 -1.573365 34.535229 #DAPC dapc_out <- dapc(genind, scale = FALSE, var.contrib = TRUE, var.loadings = TRUE, pca.info = TRUE) par(cex = 1.2, cex.axis = 1.5, cex.lab = 1.5, lwd = 4) scatter(dapc_out, leg=TRUE, col = c("#460a5a", "#49a987")) predict.dapc(dapc_out) loadingplot(dapc_out$var.contr)