---
title: "Storage temperature"
output:
  pdf_document: default
  html_notebook: default
  word_document: default
  html_document:
    df_print: paged
---

### Set up
  Clear the R environment, set seed, import packages and read in result file from DADA2. Tax table and ASV count table are in the same table and must be split into separate tables.
```{r set_up}
rm(list=ls())

set.seed(711)

library(ggplot2)
library(phyloseq)
library(DECIPHER)
library(phangorn)
library(decontam)
library(dplyr)
```
# General sequence statistics
```{r seq_stats}
track <- read.csv("trackSequences.csv", row.names=1)
# Remove control samples
track <- track[-c(1:3), ]

min(track$input) ; max(track$input) ; mean(track$input) ; sum(track$input)
min(track$nonchim) ; max(track$nonchim) ; mean(track$nonchim) ; sum(track$nonchim)
```

```{r phyloseq_object}
asvs <- read.csv("dada2ASVs.csv", quote="", row.names = 1)
# Sequence counts before removing singletons
prior_singletons <- colSums(asvs[1:19])
# Remove singletons
asvs_to_remove <- which(rowSums(asvs[1:19]) == 1)
asvs <- asvs[-asvs_to_remove,]
# Post singletons
post_singletons <- colSums(asvs[1:19])
mean(post_singletons) ; max(post_singletons) ; min(post_singletons)

# Apply 80% confidence threshold to taxonomic assignments
asvs$Phylum <- as.factor(ifelse(asvs$phylum.conf < 80, NA, asvs$Phylum))
asvs$Class <- as.factor(ifelse(asvs$class.conf < 80, NA, asvs$Class))
asvs$Order <- as.factor(ifelse(asvs$order.conf < 80, NA, asvs$Order))
asvs$Family <- as.factor(ifelse(asvs$family.conf < 80, NA, asvs$Family))
asvs$Genus <- as.factor(ifelse(asvs$genus.conf < 80, NA,asvs$Genus))

# ASV abundances
count_table <- asvs[c(1:19)]

# Taxonomy table
tax_table <- as.matrix(asvs["phylum.conf" >= 80,c(20:31)])

# Sample data
metadata <- read.csv("all_metadata.csv", header = TRUE, row.names = 1)
sample_colors <- rep(RColorBrewer::brewer.pal(8,"Dark2"), each = 2)

#read in phylogenetic tree (neighbour-joining tree)
# see lines 333 - 343 to create tree
tree <- readRDS("fitGTR_unoptimized.rds")

ps <- phyloseq(otu_table(count_table, taxa_are_rows=T), 
                      tax_table(tax_table), sample_data(metadata), 
                      phy_tree(tree$tree))

# Remove contaminants using decontam
sample_data(ps)$is.neg <- sample_data(ps)$Sample_or_Control == "Control"
contamdf.prev <- isContaminant(ps, method = "prevalence", neg = "is.neg")
head(which(contamdf.prev$contaminant))
ps.pa <- transform_sample_counts(ps, function(abund) 1*(abund>0))
ps.pa.neg <- prune_samples(sample_data(ps.pa)$Sample_or_Control == "Control", ps.pa)
ps.pa.pos <- prune_samples(sample_data(ps.pa)$Sample_or_Control == "True Sample", ps.pa)
df.pa <- data.frame(pa.pos=taxa_sums(ps.pa.pos), pa.neg=taxa_sums(ps.pa.neg),
                      contaminant=contamdf.prev$contaminant)
# Visualize presence
ggplot(data=df.pa, aes(x=pa.neg, y=pa.pos, color=contaminant)) + geom_jitter() +
  xlab("Prevalence (Negative Controls)") + ylab("Prevalence (True Samples)")

# Remove identified contaminant ASVs and control samples
py.no.rar <- prune_taxa(!contamdf.prev$contaminant, ps)
py.no.rar <- subset_samples(py.no.rar, sample_names(py.no.rar) != 
                              c("NTC_water", "SI_2014_16S_Blank1",
                                "SI_2014_16S_Blank2"))

# Make rarified phyloseq object
py.rar<- rarefy_even_depth(py.no.rar, rngseed=711)

# Proportion of asvs assigned to 80% confidence genus, family and phylum
count_table <- otu_table(py.no.rar, taxa_are_rows = T)
tax_table <- tax_table(py.no.rar)
merged <- merge(count_table, tax_table)

merged$phylum.conf <- as.numeric(merged$phylum.conf)
merged$family.conf <- as.numeric(merged$family.conf)
merged$genus.conf <- as.numeric(merged$genus.conf)

length(which(merged$phylum.conf >= 80)) / length(merged$phylum.conf)
length(which(merged$family.conf >= 80)) / length(merged$family.conf)
length(which(merged$genus.conf >= 80)) / length(merged$genus.conf)
```
# Alpha Diversity   

### ASVS
  Create a phyloseq object and compute alpha diversity indices. Plot observed, Chao1, Shannon and Simpson indices for -20 and -80 samples (colored by individual).
```{r alpha_div_asv}
sample_data(py.rar)$Observed <- estimate_richness(py.rar)[,1]
sample_data(py.rar)$Chao1 <- estimate_richness(py.rar)[,2]
sample_data(py.rar)$Shannon <- estimate_richness(py.rar)[,6]
sample_data(py.rar)$Simpson <- estimate_richness(py.rar)[,7]

# Modify the phyloseq function for minor aesthetic changes
plot_RichnessCustom <- function (physeq, x = "samples", color = NULL, shape = NULL, 
    title = NULL, scales = "free_y", nrow = 1, shsi = NULL, measures = NULL, 
    sortby = NULL) 
{
    erDF = estimate_richness(physeq, split = TRUE, measures = measures)
    measures = colnames(erDF)
    ses = colnames(erDF)[grep("^se\\.", colnames(erDF))]
    measures = measures[!measures %in% ses]
    if (!is.null(sample_data(physeq, errorIfNULL = FALSE))) {
        DF <- data.frame(erDF, sample_data(physeq))
    }
    else {
        DF <- data.frame(erDF)
    }
    if (!"samples" %in% colnames(DF)) {
        DF$samples <- sample_names(physeq)
    }
    if (!is.null(x)) {
        if (x %in% c("sample", "samples", "sample_names", "sample.names")) {
            x <- "samples"
        }
    }
    else {
        x <- "samples"
    }
    mdf = reshape2::melt(DF, measure.vars = measures)
    mdf$se <- NA_integer_
    if (length(ses) > 0) {
        selabs = ses
        names(selabs) <- substr(selabs, 4, 100)
        substr(names(selabs), 1, 1) <- toupper(substr(names(selabs), 
            1, 1))
        mdf$wse <- sapply(as.character(mdf$variable), function(i, 
            selabs) {
            selabs[i]
        }, selabs)
        for (i in 1:nrow(mdf)) {
            if (!is.na(mdf[i, "wse"])) {
                mdf[i, "se"] <- mdf[i, (mdf[i, "wse"])]
            }
        }
        mdf <- mdf[, -which(colnames(mdf) %in% c(selabs, "wse"))]
    }
    if (!is.null(measures)) {
        if (any(measures %in% as.character(mdf$variable))) {
            mdf <- mdf[as.character(mdf$variable) %in% measures, 
                ]
        }
        else {
            warning("Argument to `measures` not supported. All alpha-diversity measures (should be) included in plot.")
        }
    }
    if (!is.null(shsi)) {
        warning("shsi no longer supported option in plot_richness. Please use `measures` instead")
    }
    if (!is.null(sortby)) {
        if (!all(sortby %in% levels(mdf$variable))) {
            warning("`sortby` argument not among `measures`. Ignored.")
        }
        if (!is.discrete(mdf[, x])) {
            warning("`sortby` argument provided, but `x` not a discrete variable. `sortby` is ignored.")
        }
        if (all(sortby %in% levels(mdf$variable)) & is.discrete(mdf[, 
            x])) {
            wh.sortby = which(mdf$variable %in% sortby)
            mdf[, x] <- factor(mdf[, x], levels = names(sort(tapply(X = mdf[wh.sortby, 
                "value"], INDEX = mdf[wh.sortby, x], mean, na.rm = TRUE, 
                simplify = TRUE))))
        }
    }
    richness_map = aes_string(x = x, y = "value", colour = color, 
        shape = shape)
    p = ggplot(mdf, richness_map) + geom_point(na.rm = TRUE)
    if (any(!is.na(mdf[, "se"]))) {
        p = p + geom_errorbar(aes(ymax = value + se, ymin = value - 
            se), width = 0.1)
    }
    #p = p + theme(axis.text.x = element_text(hjust = 0))
    p = p + ylab("Alpha Diversity Measure")
    p = p + facet_wrap(~variable, nrow = nrow, scales = scales)
    if (!is.null(title)) {
        p <- p + ggtitle(title)
    }
    p = p + geom_line(aes(group = Individual)) +
      scale_color_manual(values = sample_colors)
    return(p)
}
sample_colors <- rep(RColorBrewer::brewer.pal(8,"Dark2"), each = 2)
sample_colors <- RColorBrewer::brewer.pal(8, "Dark2")
plot_RichnessCustom(py.rar, x="Temperature", color="Individual", measures=c("Observed","Chao1", "Shannon"))

# Summarize alpha diversity below
max(sample_data(py.rar)$Observed) ; min(sample_data(py.rar)$Observed) ; mean(sample_data(py.rar)$Observed) ; sd(sample_data(py.rar)$Observed)
max(sample_data(py.rar)$Chao1) ; min(sample_data(py.rar)$Chao1) ; mean(sample_data(py.rar)$Chao1) ; sd(sample_data(py.rar)$Chao1)
max(sample_data(py.rar)$Shannon) ; min(sample_data(py.rar)$Shannon) ; mean(sample_data(py.rar)$Shannon) ; sd(sample_data(py.rar)$Shannon)
```

```{r normalcy}
#inspect the distribution of variables
hist(sample_data(py.rar)$Observed)
hist(sample_data(py.rar)$Shannon)
hist(sample_data(py.rar)$Chao1)

#test for normalcy
shapiro.test(sample_data(py.rar)$Observed) # normal
shapiro.test(sample_data(py.rar)$Chao1) # normal

shapiro.test(sample_data(py.rar)$Shannon) # not normal, just transform (log, sqrt)?
shapiro.test(sqrt(sample_data(py.rar)$Shannon)) # not normal
shapiro.test(log(sample_data(py.rar)$Shannon)) # not normal
# try a boxcox transformation but need to identify suitable lambda
forecast::BoxCox.lambda(sample_data(py.rar)$Shannon, lower = -14 , upper = 2)
# Best lambda was -14, use Wilcoxon 
sample_data(py.rar)$Shannon.boxcox <- (((sample_data(py.rar)$Shannon^-14)-1)/-14)
qqnorm(sample_data(py.rar)$Shannon.boxcox)
shapiro.test(sample_data(py.rar)$Shannon.boxcox)
```


### Significance testing
  We will use the non-parametric Wilcoxon test to compare alpha diversity measures of samples stored at -20 and -80. Shapiro tests returned Observed and Chao1 results as normal, so we used a paired t-test for those values.
```{r}
twenty <- sample_data(py.rar)[sample_data(py.rar)$Temp == 20, c("sample","Observed", "Shannon", "Chao1", "Simpson")]
eighty <- sample_data(py.rar)[sample_data(py.rar)$Temp == 80, c("sample","Observed", "Shannon", "Chao1", "Simpson")]

t.test(twenty$Observed, eighty$Observed, alternative = "two.sided", paired=T)
wilcox.test(twenty$Shannon, eighty$Shannon, alternative = "two.sided", paired=T)
t.test(twenty$Chao1, eighty$Chao1, alternative = "two.sided", paired=T)
```
  None of the above tests are significant. P-value for Shannon diversity was the lowest at 0.1094. 

# Beta diversity
```{r ordinate, results="hide"}
set.seed(711)
clr <- microbiome::transform(py.no.rar, "clr")
euc <- ordinate(clr, method="NMDS", distance="euclidean")
jaccard <- ordinate(py.no.rar, method="NMDS", distance="jaccard")
unweighted <- ordinate(py.no.rar, method="NMDS", distance="unifrac")
weighted <- ordinate(py.no.rar, method="NMDS", distance="wunifrac")
```


```{r plot_ordination_asv}
sample_colors <- RColorBrewer::brewer.pal(8,"Dark2")

euc_ord <- plot_ordination(clr, euc, type="samples", color="Individual", shape="Temperature") +
  scale_colour_manual(values=sample_colors) + geom_point(size=4) + ggtitle("A") + 
  ggnewscale::new_scale_color() + theme_classic() +
  stat_ellipse(aes(group = Temperature, color = Temperature, level=0.95, type="t"))

jac_ord <- plot_ordination(py.no.rar, jaccard, type="samples", color="Individual", shape="Temperature") +
  scale_colour_manual(values=sample_colors) + geom_point(size=4) + ggtitle("B") + 
  ggnewscale::new_scale_color() + theme_classic() +
  stat_ellipse(aes(group = Temperature, color = Temperature, level=0.95, type="t"))

uni_ord <- plot_ordination(py.no.rar, unweighted, type="samples", color="Individual", shape="Temperature") +
  scale_colour_manual(values=sample_colors) + geom_point(size=4) + ggtitle("C") + 
  ggnewscale::new_scale_color() + theme_classic() +
  stat_ellipse(aes(group = Temperature, color = Temperature, level=0.95, type="t"))

wuni_ord <- plot_ordination(py.no.rar, weighted, type="samples", color="Individual", shape="Temperature") +
  scale_colour_manual(values=sample_colors) + geom_point(size=4) + ggtitle("D") + 
  ggnewscale::new_scale_color() + theme_classic() +
  stat_ellipse(aes(group = Temperature, color = Temperature, level=0.95, type="t"))

ggpubr::ggarrange(euc_ord, jac_ord, uni_ord, wuni_ord, 
          nrow=2, ncol=2, common.legend=TRUE, legend="bottom")
```

### PERMANOVA - ASV
```{r permanova_asv}
set.seed(711)
#distances
euc_dist <- phyloseq::distance(clr, "euclidean")
jac_dist <- phyloseq::distance(py.no.rar, "jaccard")
uni_dist <- phyloseq::distance(py.no.rar, "unifrac")
wuni_dist <- phyloseq::distance(py.no.rar, "wunifrac")

vegan::adonis2(euc_dist ~ Temp + Individual, by = "margin",
               as(sample_data(clr), "data.frame"), permutations=9999)

vegan::adonis2(jac_dist ~  Temp + Individual, by = "margin",
        as(sample_data(py.no.rar), "data.frame"), permutations=9999)

vegan::adonis2(uni_dist ~  Temp + Individual, by = "margin",
        as(sample_data(py.no.rar), "data.frame"), permutations=9999)

vegan::adonis2(wuni_dist ~  Temp + Individual, by = "margin",
        as(sample_data(py.no.rar), "data.frame"), permutations=9999)
```

### Beta dispersion
  Checking that beta dispersion between samples at -20 and -80 are homogenous.
```{r betadisper_asv}
set.seed(711)
disp.euc <- vegan::betadisper(phyloseq::distance(clr, "euclidean"), sample_data(clr)$Temperature)
disp.euc.sample <- vegan::betadisper(phyloseq::distance(clr, "euclidean"), 
                                     sample_data(clr)$Individual)
vegan::permutest(disp.euc, pairwise=T, permutations = 9999)

disp.jac <- vegan::betadisper(phyloseq::distance(py.no.rar, "jaccard"), sample_data(py.no.rar)$Temperature)
vegan::permutest(disp.jac, pairwise=T, permutations = 9999)

disp.uni <- vegan::betadisper(phyloseq::distance(py.no.rar, "unifrac"), sample_data(py.no.rar)$Temperature)
vegan::permutest(disp.uni, pairwise=T, permutations = 9999)

disp.wuni <- vegan::betadisper(phyloseq::distance(py.no.rar, "wunifrac"), sample_data(py.no.rar)$Temperature)
vegan::permutest(disp.wuni, pairwise=T, permutations = 9999)

```


### generate phylogenetic tree
```{r tree, eval=FALSE}
alignment <- AlignSeqs(seqs)
saveRDS(alignment, "alignment.rds")
alignment <- readRDS("alignment.rds")
phang.align <- phyDat(as(alignment, "matrix"), type="DNA")
dm <- dist.ml(phang.align)
saveRDS(dm, "dm.rds")
treeNJ <- NJ(dm)
fit = pnl(treeNJ, data=phang.align)

fitGTR <- update(fit, k=4, inv=0.2)
saveRDS(fitGTR, "fitGTR_unoptimized.rds")
```

### Proportional abundance bar plots
```{r plotCompositionalAbundance}
# Function to display summary statistics and plot compositional abundances at a specified taxonomic rank
# Takes unrarefied phyloseq object
# Option to aggregate taxa below a specified percentage threshold
plotCompositional <- function(py_object, rank, aggregate = FALSE) {
  py_comp <- transform_sample_counts(py_object, function(x) x / sum(x))
  py <- tax_glom(py_comp, rank, NArm = FALSE)
  
  # Replace NA with unclassified, identify uniques within rank
  tax <- tax_table(py)
  tax <- tidyr::replace_na(tax, "Unidentified")
  tax_table(py) <- as.matrix(tax)
  rankList <- unique(tax_table(py)[, rank])
  
  # Melt phyloseq object
  abund <- psmelt(py)
  abund$Bar <- paste(abund$Individual, abund$Temperature)
  
  if(aggregate == TRUE) {
    abund[, rank] <- as.character(abund[, rank])
    abund[abund$Abundance < 0.01, rank] <-  "< 1% abund."
    length(unique(abund[, rank])) # 24 families total
    rankList <- unique(abund[ , rank])
  }
  
  # Specify desired colors
  colorPalette <- as.vector(c(RColorBrewer::brewer.pal(12, "Paired"), 
                              "lightsteelblue2","red4", "darkseagreen3", 
                              "rosybrown", "cyan","#FBB4AE", "#FC8D62", 
                              "snow3", "olivedrab2", "orange", "gold", "black"))
  colorPalette <- colorPalette[1:length(rankList)]
  names(colorPalette) <- rankList
  
  ggplot(abund, aes_string(x = "Bar", y = "Abundance", fill = rank)) +
    geom_bar(stat = "identity", position = "stack") +
    theme(axis.text.x = element_text(hjust = 0)) +
    scale_fill_manual(values = colorPalette) + coord_flip() +
    theme(axis.title.y = element_blank()) + 
    guides(fill = guide_legend(ncol=1))
}

plotCompositional(py.no.rar, "Phylum")
plotCompositional(py.no.rar, "Family", aggregate = TRUE)
plotCompositional(py.no.rar, "Genus", aggregate = TRUE)
```

# Summary statistics for the phylum level
```{r}
py_comp <- transform_sample_counts(py.no.rar, function(x) x / sum(x))
py <- tax_glom(py_comp, "Phylum", NArm = FALSE)
abund <- psmelt(py)
phyla <- abund[, c("Sample","Abundance", "Phylum")]
phyla_by_sample <- as.data.frame(phyla %>% group_by(Phylum, Sample) %>% summarise(sum(Abundance)))
#Bacteroidota
mean(phyla_by_sample[c(17:32), 3]) ; max(phyla_by_sample[c(17:32), 3]) ; min(phyla_by_sample[c(17:32), 3])
#Firmicutes
mean(phyla_by_sample[c(161:176), 3]) ; max(phyla_by_sample[c(161:176), 3]) ; min(phyla_by_sample[c(161:176), 3])
```