--- title: "Storage temperature" output: pdf_document: default html_notebook: default word_document: default html_document: df_print: paged --- ### Set up Clear the R environment, set seed, import packages and read in result file from DADA2. Tax table and ASV count table are in the same table and must be split into separate tables. ```{r set_up} rm(list=ls()) set.seed(711) library(ggplot2) library(phyloseq) library(DECIPHER) library(phangorn) library(decontam) library(dplyr) ``` # General sequence statistics ```{r seq_stats} track <- read.csv("trackSequences.csv", row.names=1) # Remove control samples track <- track[-c(1:3), ] min(track$input) ; max(track$input) ; mean(track$input) ; sum(track$input) min(track$nonchim) ; max(track$nonchim) ; mean(track$nonchim) ; sum(track$nonchim) ``` ```{r phyloseq_object} asvs <- read.csv("dada2ASVs.csv", quote="", row.names = 1) # Sequence counts before removing singletons prior_singletons <- colSums(asvs[1:19]) # Remove singletons asvs_to_remove <- which(rowSums(asvs[1:19]) == 1) asvs <- asvs[-asvs_to_remove,] # Post singletons post_singletons <- colSums(asvs[1:19]) mean(post_singletons) ; max(post_singletons) ; min(post_singletons) # Apply 80% confidence threshold to taxonomic assignments asvs$Phylum <- as.factor(ifelse(asvs$phylum.conf < 80, NA, asvs$Phylum)) asvs$Class <- as.factor(ifelse(asvs$class.conf < 80, NA, asvs$Class)) asvs$Order <- as.factor(ifelse(asvs$order.conf < 80, NA, asvs$Order)) asvs$Family <- as.factor(ifelse(asvs$family.conf < 80, NA, asvs$Family)) asvs$Genus <- as.factor(ifelse(asvs$genus.conf < 80, NA,asvs$Genus)) # ASV abundances count_table <- asvs[c(1:19)] # Taxonomy table tax_table <- as.matrix(asvs["phylum.conf" >= 80,c(20:31)]) # Sample data metadata <- read.csv("all_metadata.csv", header = TRUE, row.names = 1) sample_colors <- rep(RColorBrewer::brewer.pal(8,"Dark2"), each = 2) #read in phylogenetic tree (neighbour-joining tree) # see lines 333 - 343 to create tree tree <- readRDS("fitGTR_unoptimized.rds") ps <- phyloseq(otu_table(count_table, taxa_are_rows=T), tax_table(tax_table), sample_data(metadata), phy_tree(tree$tree)) # Remove contaminants using decontam sample_data(ps)$is.neg <- sample_data(ps)$Sample_or_Control == "Control" contamdf.prev <- isContaminant(ps, method = "prevalence", neg = "is.neg") head(which(contamdf.prev$contaminant)) ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0)) ps.pa.neg <- prune_samples(sample_data(ps.pa)$Sample_or_Control == "Control", ps.pa) ps.pa.pos <- prune_samples(sample_data(ps.pa)$Sample_or_Control == "True Sample", ps.pa) df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg), contaminant=contamdf.prev$contaminant) # Visualize presence ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_jitter() + xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)") # Remove identified contaminant ASVs and control samples py.no.rar <- prune_taxa(!contamdf.prev$contaminant, ps) py.no.rar <- subset_samples(py.no.rar, sample_names(py.no.rar) != c("NTC_water", "SI_2014_16S_Blank1", "SI_2014_16S_Blank2")) # Make rarified phyloseq object py.rar<- rarefy_even_depth(py.no.rar, rngseed=711) # Proportion of asvs assigned to 80% confidence genus, family and phylum count_table <- otu_table(py.no.rar, taxa_are_rows = T) tax_table <- tax_table(py.no.rar) merged <- merge(count_table, tax_table) merged$phylum.conf <- as.numeric(merged$phylum.conf) merged$family.conf <- as.numeric(merged$family.conf) merged$genus.conf <- as.numeric(merged$genus.conf) length(which(merged$phylum.conf >= 80)) / length(merged$phylum.conf) length(which(merged$family.conf >= 80)) / length(merged$family.conf) length(which(merged$genus.conf >= 80)) / length(merged$genus.conf) ``` # Alpha Diversity ### ASVS Create a phyloseq object and compute alpha diversity indices. Plot observed, Chao1, Shannon and Simpson indices for -20 and -80 samples (colored by individual). ```{r alpha_div_asv} sample_data(py.rar)$Observed <- estimate_richness(py.rar)[,1] sample_data(py.rar)$Chao1 <- estimate_richness(py.rar)[,2] sample_data(py.rar)$Shannon <- estimate_richness(py.rar)[,6] sample_data(py.rar)$Simpson <- estimate_richness(py.rar)[,7] # Modify the phyloseq function for minor aesthetic changes plot_RichnessCustom <- function (physeq, x = "samples", color = NULL, shape = NULL, title = NULL, scales = "free_y", nrow = 1, shsi = NULL, measures = NULL, sortby = NULL) { erDF = estimate_richness(physeq, split = TRUE, measures = measures) measures = colnames(erDF) ses = colnames(erDF)[grep("^se\\.", colnames(erDF))] measures = measures[!measures %in% ses] if (!is.null(sample_data(physeq, errorIfNULL = FALSE))) { DF <- data.frame(erDF, sample_data(physeq)) } else { DF <- data.frame(erDF) } if (!"samples" %in% colnames(DF)) { DF$samples <- sample_names(physeq) } if (!is.null(x)) { if (x %in% c("sample", "samples", "sample_names", "sample.names")) { x <- "samples" } } else { x <- "samples" } mdf = reshape2::melt(DF, measure.vars = measures) mdf$se <- NA_integer_ if (length(ses) > 0) { selabs = ses names(selabs) <- substr(selabs, 4, 100) substr(names(selabs), 1, 1) <- toupper(substr(names(selabs), 1, 1)) mdf$wse <- sapply(as.character(mdf$variable), function(i, selabs) { selabs[i] }, selabs) for (i in 1:nrow(mdf)) { if (!is.na(mdf[i, "wse"])) { mdf[i, "se"] <- mdf[i, (mdf[i, "wse"])] } } mdf <- mdf[, -which(colnames(mdf) %in% c(selabs, "wse"))] } if (!is.null(measures)) { if (any(measures %in% as.character(mdf$variable))) { mdf <- mdf[as.character(mdf$variable) %in% measures, ] } else { warning("Argument to `measures` not supported. All alpha-diversity measures (should be) included in plot.") } } if (!is.null(shsi)) { warning("shsi no longer supported option in plot_richness. Please use `measures` instead") } if (!is.null(sortby)) { if (!all(sortby %in% levels(mdf$variable))) { warning("`sortby` argument not among `measures`. Ignored.") } if (!is.discrete(mdf[, x])) { warning("`sortby` argument provided, but `x` not a discrete variable. `sortby` is ignored.") } if (all(sortby %in% levels(mdf$variable)) & is.discrete(mdf[, x])) { wh.sortby = which(mdf$variable %in% sortby) mdf[, x] <- factor(mdf[, x], levels = names(sort(tapply(X = mdf[wh.sortby, "value"], INDEX = mdf[wh.sortby, x], mean, na.rm = TRUE, simplify = TRUE)))) } } richness_map = aes_string(x = x, y = "value", colour = color, shape = shape) p = ggplot(mdf, richness_map) + geom_point(na.rm = TRUE) if (any(!is.na(mdf[, "se"]))) { p = p + geom_errorbar(aes(ymax = value + se, ymin = value - se), width = 0.1) } #p = p + theme(axis.text.x = element_text(hjust = 0)) p = p + ylab("Alpha Diversity Measure") p = p + facet_wrap(~variable, nrow = nrow, scales = scales) if (!is.null(title)) { p <- p + ggtitle(title) } p = p + geom_line(aes(group = Individual)) + scale_color_manual(values = sample_colors) return(p) } sample_colors <- rep(RColorBrewer::brewer.pal(8,"Dark2"), each = 2) sample_colors <- RColorBrewer::brewer.pal(8, "Dark2") plot_RichnessCustom(py.rar, x="Temperature", color="Individual", measures=c("Observed","Chao1", "Shannon")) # Summarize alpha diversity below max(sample_data(py.rar)$Observed) ; min(sample_data(py.rar)$Observed) ; mean(sample_data(py.rar)$Observed) ; sd(sample_data(py.rar)$Observed) max(sample_data(py.rar)$Chao1) ; min(sample_data(py.rar)$Chao1) ; mean(sample_data(py.rar)$Chao1) ; sd(sample_data(py.rar)$Chao1) max(sample_data(py.rar)$Shannon) ; min(sample_data(py.rar)$Shannon) ; mean(sample_data(py.rar)$Shannon) ; sd(sample_data(py.rar)$Shannon) ``` ```{r normalcy} #inspect the distribution of variables hist(sample_data(py.rar)$Observed) hist(sample_data(py.rar)$Shannon) hist(sample_data(py.rar)$Chao1) #test for normalcy shapiro.test(sample_data(py.rar)$Observed) # normal shapiro.test(sample_data(py.rar)$Chao1) # normal shapiro.test(sample_data(py.rar)$Shannon) # not normal, just transform (log, sqrt)? shapiro.test(sqrt(sample_data(py.rar)$Shannon)) # not normal shapiro.test(log(sample_data(py.rar)$Shannon)) # not normal # try a boxcox transformation but need to identify suitable lambda forecast::BoxCox.lambda(sample_data(py.rar)$Shannon, lower = -14 , upper = 2) # Best lambda was -14, use Wilcoxon sample_data(py.rar)$Shannon.boxcox <- (((sample_data(py.rar)$Shannon^-14)-1)/-14) qqnorm(sample_data(py.rar)$Shannon.boxcox) shapiro.test(sample_data(py.rar)$Shannon.boxcox) ``` ### Significance testing We will use the non-parametric Wilcoxon test to compare alpha diversity measures of samples stored at -20 and -80. Shapiro tests returned Observed and Chao1 results as normal, so we used a paired t-test for those values. ```{r} twenty <- sample_data(py.rar)[sample_data(py.rar)$Temp == 20, c("sample","Observed", "Shannon", "Chao1", "Simpson")] eighty <- sample_data(py.rar)[sample_data(py.rar)$Temp == 80, c("sample","Observed", "Shannon", "Chao1", "Simpson")] t.test(twenty$Observed, eighty$Observed, alternative = "two.sided", paired=T) wilcox.test(twenty$Shannon, eighty$Shannon, alternative = "two.sided", paired=T) t.test(twenty$Chao1, eighty$Chao1, alternative = "two.sided", paired=T) ``` None of the above tests are significant. P-value for Shannon diversity was the lowest at 0.1094. # Beta diversity ```{r ordinate, results="hide"} set.seed(711) clr <- microbiome::transform(py.no.rar, "clr") euc <- ordinate(clr, method="NMDS", distance="euclidean") jaccard <- ordinate(py.no.rar, method="NMDS", distance="jaccard") unweighted <- ordinate(py.no.rar, method="NMDS", distance="unifrac") weighted <- ordinate(py.no.rar, method="NMDS", distance="wunifrac") ``` ```{r plot_ordination_asv} sample_colors <- RColorBrewer::brewer.pal(8,"Dark2") euc_ord <- plot_ordination(clr, euc, type="samples", color="Individual", shape="Temperature") + scale_colour_manual(values=sample_colors) + geom_point(size=4) + ggtitle("A") + ggnewscale::new_scale_color() + theme_classic() + stat_ellipse(aes(group = Temperature, color = Temperature, level=0.95, type="t")) jac_ord <- plot_ordination(py.no.rar, jaccard, type="samples", color="Individual", shape="Temperature") + scale_colour_manual(values=sample_colors) + geom_point(size=4) + ggtitle("B") + ggnewscale::new_scale_color() + theme_classic() + stat_ellipse(aes(group = Temperature, color = Temperature, level=0.95, type="t")) uni_ord <- plot_ordination(py.no.rar, unweighted, type="samples", color="Individual", shape="Temperature") + scale_colour_manual(values=sample_colors) + geom_point(size=4) + ggtitle("C") + ggnewscale::new_scale_color() + theme_classic() + stat_ellipse(aes(group = Temperature, color = Temperature, level=0.95, type="t")) wuni_ord <- plot_ordination(py.no.rar, weighted, type="samples", color="Individual", shape="Temperature") + scale_colour_manual(values=sample_colors) + geom_point(size=4) + ggtitle("D") + ggnewscale::new_scale_color() + theme_classic() + stat_ellipse(aes(group = Temperature, color = Temperature, level=0.95, type="t")) ggpubr::ggarrange(euc_ord, jac_ord, uni_ord, wuni_ord, nrow=2, ncol=2, common.legend=TRUE, legend="bottom") ``` ### PERMANOVA - ASV ```{r permanova_asv} set.seed(711) #distances euc_dist <- phyloseq::distance(clr, "euclidean") jac_dist <- phyloseq::distance(py.no.rar, "jaccard") uni_dist <- phyloseq::distance(py.no.rar, "unifrac") wuni_dist <- phyloseq::distance(py.no.rar, "wunifrac") vegan::adonis2(euc_dist ~ Temp + Individual, by = "margin", as(sample_data(clr), "data.frame"), permutations=9999) vegan::adonis2(jac_dist ~ Temp + Individual, by = "margin", as(sample_data(py.no.rar), "data.frame"), permutations=9999) vegan::adonis2(uni_dist ~ Temp + Individual, by = "margin", as(sample_data(py.no.rar), "data.frame"), permutations=9999) vegan::adonis2(wuni_dist ~ Temp + Individual, by = "margin", as(sample_data(py.no.rar), "data.frame"), permutations=9999) ``` ### Beta dispersion Checking that beta dispersion between samples at -20 and -80 are homogenous. ```{r betadisper_asv} set.seed(711) disp.euc <- vegan::betadisper(phyloseq::distance(clr, "euclidean"), sample_data(clr)$Temperature) disp.euc.sample <- vegan::betadisper(phyloseq::distance(clr, "euclidean"), sample_data(clr)$Individual) vegan::permutest(disp.euc, pairwise=T, permutations = 9999) disp.jac <- vegan::betadisper(phyloseq::distance(py.no.rar, "jaccard"), sample_data(py.no.rar)$Temperature) vegan::permutest(disp.jac, pairwise=T, permutations = 9999) disp.uni <- vegan::betadisper(phyloseq::distance(py.no.rar, "unifrac"), sample_data(py.no.rar)$Temperature) vegan::permutest(disp.uni, pairwise=T, permutations = 9999) disp.wuni <- vegan::betadisper(phyloseq::distance(py.no.rar, "wunifrac"), sample_data(py.no.rar)$Temperature) vegan::permutest(disp.wuni, pairwise=T, permutations = 9999) ``` ### generate phylogenetic tree ```{r tree, eval=FALSE} alignment <- AlignSeqs(seqs) saveRDS(alignment, "alignment.rds") alignment <- readRDS("alignment.rds") phang.align <- phyDat(as(alignment, "matrix"), type="DNA") dm <- dist.ml(phang.align) saveRDS(dm, "dm.rds") treeNJ <- NJ(dm) fit = pnl(treeNJ, data=phang.align) fitGTR <- update(fit, k=4, inv=0.2) saveRDS(fitGTR, "fitGTR_unoptimized.rds") ``` ### Proportional abundance bar plots ```{r plotCompositionalAbundance} # Function to display summary statistics and plot compositional abundances at a specified taxonomic rank # Takes unrarefied phyloseq object # Option to aggregate taxa below a specified percentage threshold plotCompositional <- function(py_object, rank, aggregate = FALSE) { py_comp <- transform_sample_counts(py_object, function(x) x / sum(x)) py <- tax_glom(py_comp, rank, NArm = FALSE) # Replace NA with unclassified, identify uniques within rank tax <- tax_table(py) tax <- tidyr::replace_na(tax, "Unidentified") tax_table(py) <- as.matrix(tax) rankList <- unique(tax_table(py)[, rank]) # Melt phyloseq object abund <- psmelt(py) abund$Bar <- paste(abund$Individual, abund$Temperature) if(aggregate == TRUE) { abund[, rank] <- as.character(abund[, rank]) abund[abund$Abundance < 0.01, rank] <- "< 1% abund." length(unique(abund[, rank])) # 24 families total rankList <- unique(abund[ , rank]) } # Specify desired colors colorPalette <- as.vector(c(RColorBrewer::brewer.pal(12, "Paired"), "lightsteelblue2","red4", "darkseagreen3", "rosybrown", "cyan","#FBB4AE", "#FC8D62", "snow3", "olivedrab2", "orange", "gold", "black")) colorPalette <- colorPalette[1:length(rankList)] names(colorPalette) <- rankList ggplot(abund, aes_string(x = "Bar", y = "Abundance", fill = rank)) + geom_bar(stat = "identity", position = "stack") + theme(axis.text.x = element_text(hjust = 0)) + scale_fill_manual(values = colorPalette) + coord_flip() + theme(axis.title.y = element_blank()) + guides(fill = guide_legend(ncol=1)) } plotCompositional(py.no.rar, "Phylum") plotCompositional(py.no.rar, "Family", aggregate = TRUE) plotCompositional(py.no.rar, "Genus", aggregate = TRUE) ``` # Summary statistics for the phylum level ```{r} py_comp <- transform_sample_counts(py.no.rar, function(x) x / sum(x)) py <- tax_glom(py_comp, "Phylum", NArm = FALSE) abund <- psmelt(py) phyla <- abund[, c("Sample","Abundance", "Phylum")] phyla_by_sample <- as.data.frame(phyla %>% group_by(Phylum, Sample) %>% summarise(sum(Abundance))) #Bacteroidota mean(phyla_by_sample[c(17:32), 3]) ; max(phyla_by_sample[c(17:32), 3]) ; min(phyla_by_sample[c(17:32), 3]) #Firmicutes mean(phyla_by_sample[c(161:176), 3]) ; max(phyla_by_sample[c(161:176), 3]) ; min(phyla_by_sample[c(161:176), 3]) ```