---
title: "Gut.Microbiome.Analysis.2020.06.29"
author: "jirayu.tanp@nomnomnow.com"
date: "2020/06/29"
output: html_document
editor_options: 
  chunk_output_type: console
---

##	R version and citation
```{r, echo=TRUE}
version
citation()
```

##	Set paths
```{r, echo=TRUE}
### fastq files downloded from gdrive --> CoreBiome
### demultiplexing and OTU construction completed by CoreBiome
### Phyloseq objected created using in-house NomNomNow pipeline ver 0.2
  my_dir = c("...")
  setwd(my_dir)
  print(paste("Working directory is:",getwd()))
  seed=20200629
  set.seed(seed)
  group_cols=c(rep("cyan",12),rep("lightsteelblue",12),
               rep("dodgerblue",12),rep("navy",12),rep("purple",12),
               rep("pink",12),rep("darkgoldenrod",12),rep("chocolate1",12),
               rep("indianred1",12),rep("brown",12))
  pop.kolor=c("navy","gold", "red")
  dark.colors = c("#454849","#7c8286") 
  nn.colors = c("#A7A8A9","#EE2737","#69B3E7")
```

## Load libraries 
```{r, echo=TRUE}
#if (!requireNamespace("BiocManager"))
#    install.packages("BiocManager")
#   BiocManager::install("DESeq2")
#BiocManager::install("dada2","msa","phyloseq","plyr","dplyr","reshape2","ade4","ggrepel","DECIPHER","vegan","DESeq2","phanghorn")
#BiocManager::install("ShotgunFunctionalize.R")


library("ggplot2");packageVersion("ggplot2");citation("ggplot2")
library("gridExtra");packageVersion("gridExtra");citation("gridExtra")
library("phyloseq");packageVersion("phyloseq");citation("phyloseq")
library("plyr");packageVersion("plyr");citation("plyr")
library("dplyr");packageVersion("dplyr");citation("dplyr")
library("reshape2");packageVersion("reshape2");citation("reshape2")
library("ggrepel");packageVersion("ggrepel");citation("ggrepel")
library("vegan");packageVersion("vegan");citation("vegan")
library("tidyr");packageVersion("tidyr");citation("tidyr")
library("DESeq2");packageVersion("DESeq2");citation("DESeq2")
library("Rmisc");packageVersion("Rmisc");citation("Rmisc")
library("PMCMR");packageVersion("PMCMR");citation("PMCMR")
library("nlme");packageVersion("nlme");citation("nlme")
library("lme4");packageVersion("lme4");citation("lme4")
library("dunn.test");packageVersion("dunn.test");citation("dunn.test")
library("randomForest");packageVersion("randomForest");citation("randomForest")
library("ROCR");packageVersion("ROCR");citation("ROCR")
library("verification");packageVersion("verification");citation("verification")
library("qvalue");packageVersion("qvalue");citation("qvalue")
library("caret");packageVersion("caret");citation("caret")
library("e1071");packageVersion("e1071");citation("e1071")
library("caTools");packageVersion("caTools");citation("caTools")
library("rpart");packageVersion("rpart");citation("rpart")
library("ggridges");packageVersion("ggridges");citation("ggridges")
library("RVAideMemoire")
#library("");packageVersion("")
#library("");packageVersion("")

```

## Necessary functions
```{r, echo=TRUE}
# sequencing depths
  sample_coverage<-function(phyloseqobj){
    xx = data.frame(nreads = sample_sums(phyloseqobj), 
                    sorted = 1:nsamples(phyloseqobj),
                    type = "Samples")
    xx$SampleID=rownames(xx)
    xx = xx[order(xx$nreads),]
    print(paste("coverage summary: "));print(summary(xx$nreads))
    print(paste("total reads = ",sum(xx$nreads),sep=""))
    print(paste("stdev = ",sd(xx$nreads),sep=""))
    #return(xx)
  }
  
# clean phyloseq objects     
  clean_phyloseq<-function(phyloseqobj){
    readsumsdf = data.frame(nreads = sort(taxa_sums(phyloseqobj), TRUE), 
                            sorted = 1:ntaxa(phyloseqobj), type = "Taxa")
    readsumsdf = rbind(readsumsdf, data.frame(nreads = sort(sample_sums(phyloseqobj), TRUE), 
                                            sorted = 1:nsamples(phyloseqobj),type = "Samples"))
    pdf("Taxa_and_sample_coverage.plot.pdf",width=4,height=3)
      title = "Total number of reads in dataset"
      ggplot(readsumsdf, aes(x = sorted, y = nreads)) + 
        geom_bar(stat = "identity", color = dark.colors[1], fill = dark.colors[1]) + 
        ggtitle(title)  + facet_wrap(~type, 1, scales = "free") +
        xlab("Sequencing depth (number of reads)")  +
        theme_bw() + theme(panel.border = element_blank(), 
                         panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), 
                         axis.line = element_line(colour = "black")) +
        theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
    dev.off()
  
    print("assessing phyla abundance...")
    prev0 = apply(X = otu_table(phyloseqobj),
                  MARGIN = ifelse(taxa_are_rows(phyloseqobj), yes = 1, no = 2),
                  FUN = function(x){sum(x > 0)})
    prevdf = data.frame(Prevalence = prev0,
                      TotalAbundance = taxa_sums(phyloseqobj),
                      tax_table(phyloseqobj))
    prevdf1 = subset(prevdf, Rank2 %in% get_taxa_unique(phyloseqobj, "Rank2"))
  
    pdf("Taxa_prevalence.precleanup.pdf", width=10, height=6)
      ggplot(prevdf1, aes(TotalAbundance, Prevalence/nsamples(phyloseqobj), color = Rank2)) +
        geom_point(size = 2, alpha = 0.7) + scale_x_log10() + xlab("Total Abundance") +
        ylab("Prevalence [Frac. Samples]") + facet_wrap(~Rank2) + theme(legend.position="none")  +
        theme_bw() + theme(panel.border = element_blank(), 
                         panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), 
                         axis.line = element_line(colour = "black")) +
        theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) +
        theme(legend.position = "none")
    dev.off()

  print("removing unknown taxa and low abundance phyla (N<5)...")
    temp=plyr::ddply(prevdf, "Rank2",
              function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})
    colnames(temp)=c("Phylum","Var1","Var2")
    filtered_phyla=subset(temp,Var2<dim(sample_data(pets.raw))[1]*.05)
    filtered_phyla=unlist(as.character(filtered_phyla$Phylum));class(filtered_phyla)
    ps1 = subset_taxa(pets.raw, !is.na(Rank2) & !Rank2 %in% filtered_phyla)
    print(ps1); print(pets.raw)
  
  # save cleaned ps object   
  return(ps1)
  print(ps1)
  }  
  
# Abundance plot
  plot_abundance = function(physeq, mapvar, Facet="Phylum", Color="Phylum"){
    mphyseq = psmelt(physeq)
    mphyseq <- subset(mphyseq, Abundance > 0)
    ggplot(data = mphyseq, mapping = aes_string(x = mapvar, y = "Abundance", color = Color, fill = Color)) +
      geom_violin(fill = NA) + geom_point(size = 1, alpha = 0.3, position = position_jitter(width = 0.3)) +
      geom_point(colour = "#454849", shape=21, alpha=0.5) +
      facet_wrap(facets = Facet) + scale_y_log10() +
      theme(legend.position="none") + theme(axis.text.x = element_text(angle = 90, hjust = 1))
  }

# Plot PCoA  
  plotPCoA<-function(phyloseqobj,pcoa,evals,kolor){
    plot_ordination(phyloseqobj, pcoa) + 
      geom_point(aes(color = Timepoint), size = 1) + scale_color_manual(values = kolor) +
      geom_point(colour = "#454849", shape=21, alpha=0.5) +
      geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + 
      coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() +  
      theme(panel.border = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  }

# Calculating alpha diversity
  compute_alphadiv<-function(phyloseqobj,trials,rarefaction_depth){
    min_lib= rarefaction_depth
    nsamp = nsamples(phyloseqobj)
    trials = trials
   
    richness <- matrix(nrow = nsamp, ncol = trials)
    row.names(richness) <- sample_names(phyloseqobj)
    
    evennessF <- matrix(nrow = nsamp, ncol = trials)
    row.names(evennessF) <- sample_names(phyloseqobj)
    
    evennessS <- matrix(nrow = nsamp, ncol = trials)
    row.names(evennessS) <- sample_names(phyloseqobj)
    
    evennessI <- matrix(nrow = nsamp, ncol = trials)
    row.names(evennessI) <- sample_names(phyloseqobj)
  
    set.seed(007)
    for (i in 1:trials) {
      # Subsample
      r <- rarefy_even_depth(phyloseqobj, sample.size = min_lib, verbose = FALSE, replace = TRUE)
      
      # Calculate richness
      rich <- as.numeric(as.matrix(estimate_richness(r, measures = "Observed")))
      richness[ ,i] <- rich
      
      # Calculate evenness Fisher
      evenF <- as.numeric(as.matrix(estimate_richness(r, measures = "Fisher")))
      evennessF[ ,i] <- evenF
    
      # Calculate evenness Shannon
      evenS <- as.numeric(as.matrix(estimate_richness(r, measures = "Shannon")))
      evennessS[ ,i] <- evenS
      
      # Calculate evenness Simpson
      evenI <- as.numeric(as.matrix(estimate_richness(r, measures = "Simpson")))
      evennessI[ ,i] <- evenI
    }
  
    SampleID <- row.names(richness)
    mean <- apply(richness, 1, mean)
    sd <- apply(richness, 1, sd)
    measure <- rep("Richness", nsamp)
    rich_stats <- data.frame(SampleID, mean, sd, measure)
  
    SampleID <- row.names(evennessF)
    mean <- apply(evennessF, 1, mean)
    sd <- apply(evennessF, 1, sd)
    measure <- rep("Fisher", nsamp)
    even_statsF <- data.frame(SampleID, mean, sd, measure)
  
    SampleID <- row.names(evennessS)
    mean <- apply(evennessS, 1, mean)
    sd <- apply(evennessS, 1, sd)
    measure <- rep("Shannon", nsamp)
    even_statsS <- data.frame(SampleID, mean, sd, measure)
  
    SampleID <- row.names(evennessI)
    median <- apply(evennessI, 1, mean)
    sd <- apply(evennessI, 1, sd)
    measure <- rep("Simpson", nsamp)
    even_statsI <- data.frame(SampleID, mean, sd, measure)
  
    alpha <- rbind(rich_stats, even_statsF, even_statsS, even_statsI)
  
    s <- data.frame(sample_data(phyloseqobj))
    alphadiv <- merge(alpha, s, by = "SampleID")
    alphadiv$rarefaction=rep(min_lib,dim(alphadiv)[1])
    return(alphadiv)
 }

# Standard error calculation
  st.err <- function(x) {
    sd(x)/sqrt(length(x))
     }

# PAM clustering
  pamPCoA = function(x, k) {
    list(cluster = pam(x[,1:2], k, cluster.only = TRUE))
}
 
# Plot heatmap of taxa
  make_hcb <- function(data, var, name = NULL, fillScale = NULL, ...) {
  hcb <- ggplot(data=data, aes_string(x="index", y=1, fill=var)) + 
          geom_raster() +
          scale_y_continuous(expand=c(0,0), breaks=1, labels=name) + 
          scale_x_continuous(expand=c(0,0)) +
          xlab(NULL) + ylab(NULL) +
          theme(axis.title=element_blank(), axis.ticks=element_blank()) +
          theme(axis.text.x=element_blank()) +
          theme(axis.text.y=element_text(size=8, face="bold")) +
          #theme(plot.margin=unit(c(0,0,0,0),"lines"), axis.ticks.margin = unit(0,"null"), ...) +
          guides(fill=F)
  if(!is.null(fillScale)) hcb <- hcb + fillScale
  return(hcb)
}
  
  plot_heatmap.2 <- function(ps, sample.label=NULL, taxa.label=NULL, ...) {
    hm <- plot_heatmap(ps, taxa.label="Rank8", sample.order=sample.order, taxa.order = taxa.order)
    hm <- hm + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
    low = "#000033"; high = "#66CCFF"; trans = scales::log_trans(4); na.value = "#454849"
    #low = "#10cfc9"; high = "#f5a800"; trans = scales::log_trans(4); na.value = "black"
    new_gradient <- scale_fill_gradient(low = low, high = high, trans = trans, na.value = na.value,
                                      breaks = c(0.001, 0.01, 0.1, 1), name="Relative\nabundance")
    hm <- hm + theme(plot.margin=unit(c(0,0.5,0.5,0.5),"lines"))
    hm <- hm + new_gradient
    hm <- hm + geom_raster()
    hm <- hm + ylab("Taxa")
    hm$layers <- hm$layers[2]
    return(hm)
  }
  
  makeProfilePlot <- function(mylist,names) {
     require(RColorBrewer)
     # find out how many variables we want to include
     numvariables <- length(mylist)
     # choose 'numvariables' random colours
     qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
     col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors,
                                rownames(qual_col_pals)))
     colours <- sample(col_vector,numvariables)
     # find out the minimum and maximum values of the variables:
     mymin <- 1e+20
     mymax <- 1e-20
     for (i in 1:numvariables)
     {
        vectori <- mylist[[i]]
        mini <- min(vectori)
        maxi <- max(vectori)
        if (mini < mymin) { mymin <- mini }
        if (maxi > mymax) { mymax <- maxi }
     }
     # plot the variables
     for (i in 1:numvariables)
     {
        vectori <- mylist[[i]]
        namei <- names[i]
        colouri <- colours[i]
        if (i == 1) { plot(vectori,col=colouri,type="l",ylim=c(mymin,mymax)) }
        else         { points(vectori, col=colouri,type="l")                                     }
        lastxval <- length(vectori)
        lastyval <- vectori[length(vectori)]
        text((lastxval-10),(lastyval),namei,col="black",cex=0.6)
     }
  }  
  
```


## ##########################################
## Microbiome analysis: Week 4 vs Baseline ##
## ##########################################

## Read phyloseq object
```{r, echo=TRUE}
# read phyloseq object
  ps.raw=readRDS("Study.metadata.integrated.rds")  
  print(ps.raw)
  head(sample_data(ps.raw))
```

## Filtering and QC
```{r, echo=TRUE}
# sequencing depths
  sample_coverage(ps.raw)

# plot sample and taxa coverage pre-cleanup
  readsumsdf = data.frame(nreads = sort(taxa_sums(ps.raw), TRUE), 
                          sorted = 1:ntaxa(ps.raw), type = "Taxa")
  readsumsdf = rbind(readsumsdf, data.frame(nreads = sort(sample_sums(ps.raw), TRUE), 
                                            sorted = 1:nsamples(ps.raw),type = "Samples"))
  pdf("Taxa_and_sample_coverage_pre-cleanup.pdf",width=4,height=3)
    title = "Total number of reads pre-cleanup"
    ggplot(readsumsdf, aes(x = sorted, y = nreads)) + 
      geom_bar(stat = "identity", color = dark.colors[1], fill = dark.colors[1]) + 
      ggtitle(title)  + facet_wrap(~type, 1, scales = "free") +
      xlab("Sequencing depth (number of reads)")  +
      theme_bw() + theme(panel.border = element_blank(), 
                         panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), 
                         axis.line = element_line(colour = "black")) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
  dev.off()
  
# assess abundance of phyla pre-cleanup
  prev0 = apply(X = otu_table(ps.raw),
                MARGIN = ifelse(taxa_are_rows(ps.raw), yes = 1, no = 2),
                FUN = function(x){sum(x > 0)})
  
  prevdf = data.frame(Prevalence = prev0,
                      TotalAbundance = taxa_sums(ps.raw),
                      tax_table(ps.raw))
  prevdf1 = subset(prevdf, Rank2 %in% get_taxa_unique(ps.raw, "Rank2"))
  
  pdf("Taxa_prevalence_pre-cleanup.pdf", width=10, height=6)
    ggplot(prevdf1, aes(TotalAbundance, Prevalence/nsamples(ps.raw), color = Rank2)) +
      geom_point(size = 2, alpha = 0.7) + scale_x_log10() + xlab("Total Abundance") +
      ylab("Prevalence [Frac. Samples]") + facet_wrap(~Rank2) +
      theme(legend.position="none")  +
      theme_bw() + theme(panel.border = element_blank(), 
                         panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), 
                         axis.line = element_line(colour = "black")) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) +
      theme(legend.position = "none")
  dev.off()

# remove rare and low abundant taxa
  print(ps.raw)
  tmp = filter_taxa(ps.raw, function(x) sum(x > 2) > (0.05*length(x)), TRUE)
  print(tmp)
  
# remove missing phyla, if any
  ps1 = subset_taxa(tmp, !is.na(Rank2))
  print(ps1)
  print(ps.raw)
  
# Filter entries with prevalence = 0, if any .
  readsumsdf = data.frame(nreads = sort(taxa_sums(ps1), TRUE), 
                          sorted = 1:ntaxa(ps1), 
                          type = "Taxa")
  readsumsdf = rbind(readsumsdf, data.frame(nreads = sort(sample_sums(ps1), TRUE), 
                                            sorted = 1:nsamples(ps1), type = "Samples"))
# Plot post-filtering coverage statistics  
  pdf("Taxa_and_sample_coverage_post-filtering.pdf")
    title = "Total number of reads post-cleanup"
    p = ggplot(readsumsdf, aes(x = sorted, y = nreads)) + 
      geom_bar(stat = "identity", color = dark.colors[2], fill = dark.colors[2])
    p + ggtitle(title) +  facet_wrap(~type, 1, scales = "free") +
      xlab("Sequencing depth (number of reads)")    +
      theme_bw() + theme(panel.border = element_blank(), 
                         panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), 
                         axis.line = element_line(colour = "black")) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
  dev.off()

# calculate coverage post-filtering    
  sample_coverage(ps1)

# assess abundance of phyla post filtering
  prev0 = apply(X = otu_table(ps1),
                MARGIN = ifelse(taxa_are_rows(ps1), yes = 1, no = 2),
                FUN = function(x){sum(x > 0)})
  prevdf = data.frame(Prevalence = prev0,
                      TotalAbundance = taxa_sums(ps1),
                      tax_table(ps1))
  prevdf1 = subset(prevdf, Rank2 %in% get_taxa_unique(ps1, "Rank2"))
  
  pdf("Taxa_prevalence_postfiltering.pdf")
    ggplot(prevdf1, aes(TotalAbundance, Prevalence/nsamples(ps1), color = Rank2)) +
      geom_point(size = 2, alpha = 0.7) + scale_x_log10() + xlab("Total Abundance") +
      ylab("Prevalence [Frac. Samples]") + facet_wrap(~Rank2) +
      theme(legend.position="none")  +
      theme_bw() + theme(panel.border = element_blank(), 
                         panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), 
                         axis.line = element_line(colour = "black")) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) +
      theme(legend.position = "none")
  dev.off()

# save cleaned ps object   
  print(ps1)
  saveRDS(ps1,"Study.metadata.integrated.cln.rds")
```

## Read cleaned phyloseq object
```{r, echo=TRUE}
# read phyloseq object
  ps.cln=readRDS("Study.metadata.integrated.cln.rds")  
  print(ps.cln) 
  head(sample_data(ps.cln))
  sample_data(ps.cln)$Label=paste(sample_data(ps.cln)$Time)
  table(sample_data(ps.cln)$Label)
  
#include only baseline
  ps.cln.baseline <- ps.cln %>%
    subset_samples(  Label == "baseline" 
                     ) %>%
    prune_taxa(taxa_sums(.) > 0, .)
  
#include only week4
  ps.cln.week4 <- ps.cln %>%
    subset_samples(  Label == "week4"  
                     ) %>%
    prune_taxa(taxa_sums(.) > 0, .)
  
```

## Calculate alpha diversity 
```{r, echo=TRUE}
  alldat=ps.cln
  
# Rarefaction statistics
  xx = data.frame(sample_sums(alldat))
  xx$SampleID=rownames(xx)
  colnames(xx)=c("Coverage","SampleID")
  xx = xx[order(xx$Coverage),]
  min(xx$Coverage)
  depth=10000
  maximm=round(min(xx$Coverage)/depth,0)
  
for ( i in 1:maximm){
  # calculate alpha diversity by subsampling at the rarefaction depth 
    rfxn=compute_alphadiv(alldat,100,depth)
    # head(rfxn)
 
  # calculate evenness  
    evenness = subset(rfxn, measure=="Shannon")
    evn=evenness$mean/log(dim(otu_table(alldat))[2])
    evenness$mean=evn
    evenness$sd=rep("NA",dim(evenness)[1])
    evenness$measure=rep("Evenness",dim(evenness)[1])
  
    rfxn2=rbind(rfxn,evenness)
    rfxn2=rfxn2[order(rfxn2$SampleID),]
    filename=paste("AlphaDiversity",depth,"txt",sep=".")
    write.table(rfxn2, file = filename,
                append = FALSE, quote = FALSE, sep = "\t", 
                eol = "\n", na = "NA", dec = ".", row.names = FALSE, col.names = TRUE)
  # increase depth by 500  
    depth=depth+10000
}

  ad2=read.table("AlphaDiversity.10000.txt", header=TRUE)
  ad3=read.table("AlphaDiversity.20000.txt", header=TRUE)
  ad4=read.table("AlphaDiversity.30000.txt", header=TRUE)
  ad5=read.table("AlphaDiversity.40000.txt", header=TRUE)
  ad6=read.table("AlphaDiversity.50000.txt", header=TRUE)
  ad7=read.table("AlphaDiversity.60000.txt", header=TRUE)
  ad8=read.table("AlphaDiversity.70000.txt", header=TRUE)
  ad9=read.table("AlphaDiversity.80000.txt", header=TRUE)
  ad10=read.table("AlphaDiversity.90000.txt", header=TRUE)
  ad11=read.table("AlphaDiversity.1e+05.txt", header=TRUE)
  ad12=read.table("AlphaDiversity.110000.txt", header=TRUE)
  ad13=read.table("AlphaDiversity.120000.txt", header=TRUE)
  ad14=read.table("AlphaDiversity.130000.txt", header=TRUE)
  ad15=read.table("AlphaDiversity.140000.txt", header=TRUE)
  ad16=read.table("AlphaDiversity.150000.txt", header=TRUE)
  ad17=read.table("AlphaDiversity.160000.txt", header=TRUE)
  ad18=read.table("AlphaDiversity.170000.txt", header=TRUE)
  ad19=read.table("AlphaDiversity.180000.txt", header=TRUE)
  ad20=read.table("AlphaDiversity.190000.txt", header=TRUE)
  ad21=read.table("AlphaDiversity.2e+05.txt", header=TRUE)
  ad22=read.table("AlphaDiversity.210000.txt", header=TRUE)
  ad23=read.table("AlphaDiversity.220000.txt", header=TRUE)
  ad24=read.table("AlphaDiversity.230000.txt", header=TRUE)
  ad25=read.table("AlphaDiversity.240000.txt", header=TRUE)
  ad26=read.table("AlphaDiversity.250000.txt", header=TRUE)
  ad27=read.table("AlphaDiversity.260000.txt", header=TRUE)
  ad28=read.table("AlphaDiversity.270000.txt", header=TRUE)
  ad29=read.table("AlphaDiversity.280000.txt", header=TRUE)
  ad30=read.table("AlphaDiversity.290000.txt", header=TRUE)
  ad31=read.table("AlphaDiversity.3e+05.txt", header=TRUE)
  ad32=read.table("AlphaDiversity.310000.txt", header=TRUE)
  ad33=read.table("AlphaDiversity.320000.txt", header=TRUE)
  ad34=read.table("AlphaDiversity.330000.txt", header=TRUE)
  ad35=read.table("AlphaDiversity.340000.txt", header=TRUE)
  ad36=read.table("AlphaDiversity.350000.txt", header=TRUE)
  ad37=read.table("AlphaDiversity.360000.txt", header=TRUE)
  ad38=read.table("AlphaDiversity.370000.txt", header=TRUE)
  ad39=read.table("AlphaDiversity.380000.txt", header=TRUE)
  ad40=read.table("AlphaDiversity.390000.txt", header=TRUE)
  ad41=read.table("AlphaDiversity.4e+05.txt", header=TRUE)
  ad42=read.table("AlphaDiversity.410000.txt", header=TRUE)
  ad43=read.table("AlphaDiversity.420000.txt", header=TRUE)
  ad44=read.table("AlphaDiversity.430000.txt", header=TRUE)
  ad45=read.table("AlphaDiversity.440000.txt", header=TRUE)
  ad46=read.table("AlphaDiversity.450000.txt", header=TRUE)
  ad47=read.table("AlphaDiversity.460000.txt", header=TRUE)
  ad48=read.table("AlphaDiversity.470000.txt", header=TRUE)
  
  ad=rbind(ad2,ad3,ad4,ad5,ad6,ad7,ad8,ad9,ad10,
         ad11,ad12,ad13,ad14,ad15,ad16,ad17,ad18,ad19,ad20,
         ad21,ad22,ad23,ad24,ad25,ad26,ad27,ad28,ad29,ad30,
         ad31,ad32,ad33,ad34,ad35,ad36,ad37,ad38,ad39,ad40,
         ad41,ad42,ad43,ad44,ad45,ad46,ad47,ad48)
  head(ad)
  head(mer)
  
  ad2=merge(mer, ad, by=c("SampleID"))
  ad2$Dog.y=c()
  head(ad2)
  colnames(ad2)=c("SampleID", "Dog", "Time.x", "mean", "sd", "measure", "Time.y", "Label", "PCoA1", "PCoA2", "PCoA3", "PCoA4", "PCoA5", "PCoA6", "PCoA7", "rarefaction")
  ad2=ad2[order(ad2$rarefaction,ad2$Dog,ad2$Label,ad2$measure),]
  
  write.table(ad2, file = "AlphaDiversity.combined.txt", append = FALSE, quote = FALSE, 
            sep = "\t",eol = "\n", na = "NA", dec = ".", row.names = FALSE,
            col.names = TRUE, qmethod = c("escape", "double"),
            fileEncoding = "")
```

### Plot alpha diversity (rarefaction curves)
```{r, echo=TRUE}
# Read the concatenated alphadiv file
  df = read.table("AlphaDiversity.combined.txt", header=TRUE)
  head(df);dim(df)
  
# Plot Richness  
  rich=data.frame(subset(df,measure=="Richness"))
  rich$mean=as.numeric(paste(rich$mean))
  rich$sd=as.numeric(paste(rich$sd))
  rich$rarefaction=as.numeric(paste(rich$rarefaction))
  head(rich);dim(rich)
 
# Rarefaction curves by Diet
  rich_agg.mean = aggregate(rich$mean, by=list(rich$Label,rich$rarefaction),
                            FUN=mean, na.rm=TRUE)
  rich_agg.sd = aggregate(rich$mean, by=list(rich$Label,rich$rarefaction),
                          FUN=sd, na.rm=TRUE)
  rich_agg.sem = aggregate(rich$mean, by=list(rich$Label,rich$rarefaction),
                                         FUN=st.err)
  
  rich_agg = cbind(rich_agg.mean,rich_agg.sd, rich_agg.sem)
  head(rich_agg)
  rich_agg[,c(4:5,7:8)]=c()
  colnames(rich_agg)=c("Label","Rarefaction_depth","mean","sd","se")
  head(rich_agg)
  
  
  pdf("RarefactionCurves_richness_time.pdf", 6,4)
    ggplot(rich_agg, aes(Rarefaction_depth, mean, ymin = mean - sd, ymax = mean + sd,
                   colour = Label, group=Label)) + scale_color_manual(values=nn.colors[-1]) + 
      geom_point() + geom_pointrange(aes(ymin=mean-sd, ymax=mean+sd), alpha=0.3) +
      geom_line() + geom_pointrange(alpha=0.3) + scale_x_continuous(breaks=seq(0,470000,100000)) +
      ylab("Species Richness") + xlab("Rarefaction depth") + theme_bw() + 
      theme(panel.border = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()    
  
# Richness: assess significance (using highest rarefaction depth)
  depth = max(rich$rarefaction);depth  
  mxs = subset(rich, rarefaction == depth)
  set.seed(seed)
  shapiro.test(mxs$mean) # normally distributed
  set.seed(seed) 
  rich.aes=aov(mean~Label,mxs)
  summary(rich.aes) # p=0.777
  set.seed(seed)
  TukeyHSD(rich.aes)
  
 head(mxs)
 pdf("Richness_time_boxplot.pdf", 4,4)
  ggplot(mxs, aes(Label, mean)) +
    geom_boxplot(aes(fill=Label), outlier.shape=NA) +
    geom_jitter(width=0.1,shape=21,col="black") +
    scale_fill_manual(values=nn.colors[-1], name = "Label") + 
    ylab("Species Richness") + xlab("Time") + theme_bw()  
 dev.off()    
  
# Plot shannon  
  head(df)
  shannon=data.frame(subset(df,measure=="Shannon"))
  shannon$mean=as.numeric(paste(shannon$mean))
  shannon$sd=as.numeric(paste(shannon$sd))
  shannon$rarefaction=as.numeric(paste(shannon$rarefaction))
  head(shannon);dim(shannon)
 
# rarefaction curves by Diet
  shannon_agg.mean = aggregate(shannon$mean, by=list(shannon$Label,shannon$rarefaction),
                            FUN=mean, na.rm=TRUE)
  shannon_agg.sd = aggregate(shannon$mean, by=list(shannon$Label,shannon$rarefaction),
                          FUN=sd, na.rm=TRUE)
  shannon_agg.sem = aggregate(shannon$mean, by=list(shannon$Label,shannon$rarefaction),
                                         FUN=st.err)
  
  shannon_agg = cbind(shannon_agg.mean,shannon_agg.sd, shannon_agg.sem)
  head(shannon_agg)
  shannon_agg[,c(4:5,7:8)]=c()
  colnames(shannon_agg)=c("Label","Rarefaction_depth","mean","sd","se")
  head(shannon_agg)
  
  pdf("RarefactionCurves_shannon_time.pdf", 6,4)
    ggplot(shannon_agg, aes(Rarefaction_depth, mean, ymin = mean - sd, ymax = mean + sd,
          colour=Label, group=Label)) + scale_color_manual(values=nn.colors[-1]) + geom_point(alpha=0.3) +
      geom_line() + geom_pointrange(aes(ymin=mean-sd, ymax=mean+sd),alpha=0.3) +
      geom_pointrange(alpha=0.3) + scale_x_continuous(breaks=seq(0,470000,100000)) +
      ylab("Shannon's H") + xlab("Rarefaction depth") + theme_bw() + 
      theme(panel.border = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()    
  
# shannon: assess significance (using highest rarefaction depth)
  depth = max(shannon$rarefaction);depth  
  mxs = subset(shannon, rarefaction == depth);head(mxs)
  set.seed(seed)
  shapiro.test(mxs$mean) # normally distributed
  set.seed(seed) 
  shannon.aes=aov(mean~Label,mxs)
  summary(shannon.aes)
  kruskal.test(mean~Label,mxs)
  
  pdf("Shannon_time_boxplot.pdf", 4,4)
    ggplot(mxs, aes(Label, mean)) +
      geom_boxplot(aes(fill=Label),outlier.shape = NA, color="black") +
      geom_jitter(width=0.1,shape=21, col="black") +
      scale_fill_manual(values=nn.colors[-1]) + 
      ylab("Shannon's H") + xlab("Time") + theme_bw() 
  dev.off()    
```

### Bray-Curtis distances and statistical assocations
```{r, echo=TRUE}
# Load the cleaned ps object
  print(ps.cln)  
  sample_coverage(ps.cln)  

# Agglomerate at different taxonomic level
  ps2.phylum  =  tax_glom(ps.cln, "Rank2", NArm = FALSE) #phylum
  ps2.class   =  tax_glom(ps.cln, "Rank3", NArm = FALSE) #class
  ps2.order   =  tax_glom(ps.cln, "Rank4", NArm = FALSE) #order
  ps2.family  =  tax_glom(ps.cln, "Rank5", NArm = FALSE) #family
  ps2.genus   =  tax_glom(ps.cln, "Rank6", NArm = FALSE) #genus
  ps2.species =  tax_glom(ps.cln, "Rank7", NArm = FALSE) #species
  
  saveRDS(ps2.phylum,"Study.metadata.integrated.cln.phylum.rds")
  saveRDS(ps2.class,"Study.metadata.integrated.cln.class.rds")
  saveRDS(ps2.order,"Study.metadata.integrated.cln.order.rds")
  saveRDS(ps2.family,"Study.metadata.integrated.cln.family.rds")
  saveRDS(ps2.genus,"Study.metadata.integrated.cln.genus.rds")
  saveRDS(ps2.species,"Study.metadata.integrated.cln.species.rds")
  
# Change ps2 to different taxonomic levels
  ps2.species = readRDS("Study.metadata.integrated.cln.species.rds")
  ps2 = ps2.species
  print(ps2)
  sample_coverage(ps2)  
  
# Optional - transform count to relative abundance
  ps.ra  = transform_sample_counts(ps2, function(x) x / sum(x) )
  write.csv(otu_table(ps.ra),'relative.abundance.species.csv',row.names = TRUE) 
  
# Log transform the raw count data
  pslog <- transform_sample_counts(ps2, function(x) log(1 + x))
  print(pslog)
                                   
# Calculate Bray-Curtis distance
  set.seed(seed)
  bcdist = phyloseq::distance(pslog, method="bray",normalized=TRUE,
                              parallel=FALSE,fast=TRUE) 
  pcoa <- ordinate(pslog, method = "MDS", distance = bcdist)
  pcoa$vectors[,1] = pcoa$vectors[,1]*-1
  evals <- pcoa$values$Eigenvalues

# Screeplot
  pdf("PCoA_screeplot.species.pdf",6,4)
    plot_scree(pcoa) + xlim(as.character(seq(1,25))) + 
      ggtitle("Bray-Curtis ordination eigenvalues") +
      theme_bw() + 
      theme(panel.border = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()

  names(pcoa)
  pcoa$values [1:7,]
       
# Prepare the metadata  
  sample_data(pslog)$PCoA1=pcoa$vectors[,1]
  sample_data(pslog)$PCoA2=pcoa$vectors[,2]
  sample_data(pslog)$PCoA3=pcoa$vectors[,3]
  sample_data(pslog)$PCoA4=pcoa$vectors[,4]
  sample_data(pslog)$PCoA5=pcoa$vectors[,5]
  sample_data(pslog)$PCoA6=pcoa$vectors[,6]
  sample_data(pslog)$PCoA7=pcoa$vectors[,7]
  sample_data(ps2)=sample_data(pslog)
  sampledf=data.frame(sample_data(pslog))
  head(sampledf)  

  write.csv(sample_data(pslog), "MB.PCoA.scores.csv")
  
# Add HR, MR, LR to the metadata
  sample_data(pslog)$PCo.Group <-
    c("LR", "MR", "MR", "LR", "MR",
      "MR", "LR", "MR", "MR", "LR",
      "MR", "MR", "HR", "HR", "LR",
      "LR", "HR", "HR", "HR", "HR",
      "HR", "LR", "MR", "MR", "LR",
      "HR", "MR", "HR", "HR", "HR",
      "HR", "MR", "LR", "LR", "MR",
      "LR", "MR", "LR", "LR", "LR",
      "HR", "HR", "MR", "HR", "LR",
      "MR", "HR", "MR", "HR", "LR",
      "LR", "MR", "LR", "HR", "LR",
      "LR")
  sample_data(ps2)=sample_data(pslog)
  sampledf=data.frame(sample_data(pslog))
  head(sampledf)
  
# Assess significance (using Permanova and GLM; validation of KW test)
  names(sampledf)
  set.seed(seed)
  prm_ab=adonis2(bcdist ~ Label, 
                data = sampledf, permutations = 9999)
  prm_ab
  
# Check dispersion
  groups <- sampledf[["Label"]]
  dispgrps <-betadisper(bcdist, groups, type=c("median"))
  anova(dispgrps)
  plot(dispgrps, col=pop.kolor)
  dispgrps
  boxplot(dispgrps) 
      
# Plot PCoA 
  pdf("PCoA_abundance_diet.pdf", 6,3)
    plot_ordination(pslog, pcoa, color="Label") + 
      scale_color_manual(values = nn.colors[-1]) +
      geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + 
      coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() +  
      theme(panel.border = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()

  pdf("PCoA1_ridges.pdf",6,6)
    ggplot(sampledf, aes(x = PCoA1, y = Label, fill=Label)) +
    geom_density_ridges(scale = 1,alpha=1, aes(fill=Label)) +
    scale_fill_manual(values=nn.colors[-1]) +
    scale_y_discrete(expand = c(0.01, 0), name="Diet") +
    scale_x_continuous(expand = c(0.01, 0)) +
    theme_ridges() #+ facet_wrap(~Diet)
  dev.off()

  pdf("PCoA2_ridges.pdf",6,6)
    ggplot(sampledf, aes(x = PCoA2, y = Label, fill=Label)) +
    geom_density_ridges(scale = 1,alpha=1, aes(fill=Label)) +
    scale_fill_manual(values=nn.colors[-1]) +
    scale_y_discrete(expand = c(0.01, 0)) +
    scale_x_continuous(expand = c(0.01, 0)) +
    theme_ridges() #+ facet_wrap(~Diet)
  dev.off()
  
# Statistical association of PCoAs
# Pcoa1
  shapiro.test(sampledf$PCoA1) # normally distributed
  set.seed(seed)
  kruskal.test(PCoA1~Label,data=sampledf) # p-value = 0.00286
  sampledf <- sampledf %>% arrange(Time, Dog)
  wilcox.test(sampledf$PCoA1[1:28], sampledf$PCoA1[29:56], paired=T)
 
# Visualize the difference using boxplots 
pdf("PCoA1_boxplot.pdf",6,4)
  ggplot(sampledf,aes(Label,PCoA1, fill=Label)) + 
    geom_violin() + scale_fill_manual(values=nn.colors[-1]) +
    geom_jitter(width=0.05,col="black",size=1,shape=21) + xlab("") +
    theme_bw() + theme(legend.position="none") #+ facet_wrap(~Diet)
dev.off()
  
# Pcoa2
  shapiro.test(sampledf$PCoA2) # normally distributed
  set.seed(seed)
  kruskal.test(PCoA2~Label,data=sampledf) # p-value = 0.2582

# visualize the difference using boxplots   
pdf("PCoA2_boxplot.pdf",6,4)
  ggplot(sampledf,aes(Label,PCoA2, fill=Label)) + 
    geom_violin() + scale_fill_manual(values=nn.colors[-1]) +
    geom_jitter(width=0.05,col="black",size=1,shape=21) + xlab("") +
    theme_bw() + theme(legend.position="none") 
  dev.off() 

# Pcoa3
  shapiro.test(sampledf$PCoA3) # normally distributed
  set.seed(seed)
  kruskal.test(PCoA3~Label,data=sampledf) # p-value = 0.4126
  
# Pcoa4
  shapiro.test(sampledf$PCoA4) # normally distributed
  set.seed(seed)
  kruskal.test(PCoA4~Label,data=sampledf) # p-value = 0.3419
  
# Pcoa5
  shapiro.test(sampledf$PCoA5) # normally distributed
  set.seed(seed)
  kruskal.test(PCoA5~Label,data=sampledf) # p-value = 0.6346

  m2 <- glm(PCoA1 ~ Label, data = sampledf)
  qqnorm(residuals(m2))
  qqline(residuals(m2),col="red")
  summary(m2) # associated
  # the glm corroborates the findings from the kw analyses for PCoA1 

  m3 <- glm(PCoA2 ~ Label, data = sampledf)
  qqnorm(residuals(m3))
  qqline(residuals(m3),col="red")
  summary(m3)
  
  m4 <- glm(PCoA3 ~ Label, data = sampledf)
  qqnorm(residuals(m4))
  qqline(residuals(m4),col="red")
  summary(m4)
  
  m5 <- glm(PCoA4 ~ Label, data = sampledf)
  qqnorm(residuals(m5))
  qqline(residuals(m5),col="red")
  summary(m5) 
  
  m6 <- glm(PCoA5 ~ Label, data = sampledf)
  qqnorm(residuals(m6))
  qqline(residuals(m6),col="red")
  summary(m6)  # associated
  
  m7 <- glm(PCoA6 ~ Label, data = sampledf)
  qqnorm(residuals(m7))
  qqline(residuals(m7),col="red")
  summary(m7)
  
  m8 <- glm(PCoA7 ~ Label, data = sampledf)
  qqnorm(residuals(m8))
  qqline(residuals(m8),col="red")
  summary(m8)

```

### PAM Cluster by Ordination analysis - baseline vs week 4
```{r, echo=TRUE}
# Cluster
  library(cluster)
  NDIM <- 2
  x <- pcoa$vectors[,1:NDIM]  # rows=sample, cols=MDS axes, entries = value
  gs = clusGap(x, FUN = pamPCoA, K.max = 5, B = 500)
  pdf("GapStatistics.pdf", width=4, height=4)
    plot_clusgap(gs) + scale_x_continuous(breaks=c(seq(0, 5, 1))) +
      theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()  

# The gap statistic suggests 2 distinct clusters
  K <- 2
  x <- pcoa$vectors[,1:NDIM]
  clust <- as.factor(pam(x, k=K, cluster.only=T))
  sample_data(pslog)$CST <- paste("Cluster",clust,sep="")
  CSTs <- as.character(seq(K))
  
  write.csv(sample_data(pslog), "PAM.baseline.week4.csv")

# Cluster by Ordination (MDS)
  CSTColors <- c("#EE2737", "#69B3E7")
  names(CSTColors) <- paste("Cluster",CSTs,sep="")#CSTs
  CSTColors
  CSTColorScale <- scale_colour_manual(name = "CST", values = CSTColors[1:7])
  CSTFillScale <- scale_fill_manual(name = "CST", values = CSTColors[1:7])

  pdf("PCoA_CST_species.pdf", 6, 4)
    plot_ordination(pslog, pcoa, color = "CST", shape="Time") +
      geom_point(colour = "black", aes(shape=Time), size = 2, alpha=0.75) +
      geom_point(aes(color = CST, shape=Time), size = 1, alpha=0.75) + 
      
        CSTColorScale +
        geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) +
        coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() + 
        #xlab("PCoA1 [9.5%]") + ylab("PCoA2 [8.7%]") + 
      theme(legend.position = "none") +
        theme(panel.border = element_blank(), panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()
  table(sample_data(pslog)$Time,sample_data(pslog)$CST)
  
  mcnemar <- matrix(c(4,1,11,12), nrow=2)
  mcnemar.test(mcnemar)
```  

### DESeq2 to calculate differences in taxa abundance (week4 vs baseline)
```{r, echo=TRUE}
  ps2
  base <- ps2 %>%
    subset_samples(Label=="baseline" | Label=="week4") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  base
  table(sample_data(base)$Label)
  diagdds = phyloseq_to_deseq2(base, ~ Label)
  diagdds = DESeq(diagdds, fitType = c("parametric"))
  
  write.csv(otu_table(base),'DESeq.input.phylum.csv',row.names = TRUE)
  write.csv(otu_table(base),'DESeq.input.class.csv',row.names = TRUE)
  write.csv(otu_table(base),'DESeq.input.order.csv',row.names = TRUE)
  write.csv(otu_table(base),'DESeq.input.family.csv',row.names = TRUE)
  write.csv(otu_table(base),'DESeq.input.genus.csv',row.names = TRUE)
  write.csv(otu_table(base),'DESeq.input.species.csv',row.names = TRUE)
  
  res = results(diagdds, cooksCutoff = FALSE)
  hist(res$pvalue)
  alpha = 1
  sigtab = res[which(res$padj < alpha), ]
  sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps2)[rownames(sigtab), ], "matrix"))
  sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),]
  head(sigtab)
  dim(sigtab)
  sigtab$Taxaname=row.names(sigtab)
  names(sigtab)
  sigtab.all=data.frame(sigtab)
  
  kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue",
            "magenta","darkgreen","linen",
            "mistyrose","coral3","peachpuff3","lightcyan2","gold4")
  
  pdf("DESeq_week4_vs_baseline.pdf",6,4)
    ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) +
      geom_point(size=1,alpha=1,shape=21) +
      scale_colour_manual(values=kolor) +
      geom_vline(xintercept = -2, lty=3, color="black") + 
      geom_vline(xintercept = 2, lty=3, color="black") + 
      geom_hline(yintercept = -log10(0.05), lty=3, color="black") +
      ylim(0,11) +
      theme_bw() + theme(panel.border = element_blank(), 
                         panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), 
                         axis.line = element_line(colour = "black")) + 
      theme(axis.text.x = element_text(angle = 0, hjust = 0, vjust=0.5))
  dev.off()
  
  write.csv(res, file = "DESeq.output.phylum.csv", row.names = TRUE)
  write.csv(res, file = "DESeq.output.class.csv", row.names = TRUE)
  write.csv(res, file = "DESeq.output.order.csv", row.names = TRUE)
  write.csv(res, file = "DESeq.output.family.csv", row.names = TRUE)
  write.csv(res, file = "DESeq.output.genus.csv", row.names = TRUE)
  write.csv(res, file = "DESeq.output.species.csv", row.names = TRUE)
  
```

  
## ######################################
## Microbiome Analysis with LR, MR, HR ##
## ######################################

#Lineplot
```{r,echo=T}

pcoa1 <- sample_data(ps2)

pdf("PCoA1.lineplots.HR.MR.LR.pdf",6,6)
  ggplot(pcoa1, aes(x=Time, y=PCoA1, color=PCo.Group, group=Dog)) + 
    geom_line() +
    geom_point(shape=20, alpha=1, color='black') + scale_color_manual(values = c(nn.colors[-1], nn.colors[1],"#E6A65D")) + 
    theme_bw() +  
    theme(panel.border = element_blank(), panel.grid.major = element_blank(),
          panel.grid.minor = element_blank(), axis.line = element_line(colour =
                                                                         "black")) +
    scale_x_discrete(name ="Time point") 
  dev.off()
```

# Calculate Bray-Curtis distance only for baseline samples for HR, MR, LR
```{r, echo=TRUE}

  ps2;
  pslog; 
  
  ps2.baseline <- ps2 %>%
    subset_samples(Label == "baseline") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  
  pslog.baseline <- pslog %>%
    subset_samples(Label == "baseline") %>%
    prune_taxa(taxa_sums(.) > 0, .) 

  set.seed(seed)
  bcdist = phyloseq::distance(pslog.baseline, method="bray",normalized=TRUE,
                              parallel=FALSE,fast=TRUE) 
  pcoa.baseline <- ordinate(pslog.baseline, method = "MDS", distance = bcdist)
  evals <- pcoa.baseline$values$Eigenvalues
  
  pcoa.baseline$vectors[,1] <- pcoa.baseline$vectors[,1]*-1
  
  sampledf=data.frame(sample_data(pslog.baseline))
  head(sampledf)
  
  #Meta data
  sample_data(pslog.baseline)$PCoA1=pcoa.baseline$vectors[,1]
  sample_data(pslog.baseline)$PCoA2=pcoa.baseline$vectors[,2]
  sample_data(pslog.baseline)$PCoA3=pcoa.baseline$vectors[,3]
  sample_data(pslog.baseline)$PCoA4=pcoa.baseline$vectors[,4]
  sample_data(pslog.baseline)$PCoA5=pcoa.baseline$vectors[,5]
  sample_data(pslog.baseline)$PCoA6=pcoa.baseline$vectors[,6]
  sample_data(pslog.baseline)$PCoA7=pcoa.baseline$vectors[,7]
  sample_data(ps2.baseline)=sample_data(pslog.baseline)
  sampledf=data.frame(sample_data(pslog.baseline))
  head(sampledf)
  
  # assess significance (using Permanova and GLM; validation of KW test)
  names(sampledf)
  set.seed(seed)
  prm_ab=adonis2(bcdist ~ PCo.Group, 
                data = sampledf, permutations = 9999)
  prm_ab
  
  # Plot PCoA  
  pdf("PCoA_abundance_HR_MR_LR.baseline.pdf", 6,3)
    plot_ordination(pslog.baseline, pcoa.baseline, color="PCo.Group") + 
      scale_color_manual(values = nn.colors) +
      geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + 
      coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() +  
      theme(panel.border = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()
  
```

# PAM Cluster by Ordination analysis - HR, MR, LR
```{r, echo=TRUE}
# Cluster
  library(cluster)
  NDIM <- 2
  x <- pcoa.baseline$vectors[,1:NDIM]  # rows=sample, cols=MDS axes, entries = value
  gs = clusGap(x, FUN = pamPCoA, K.max = 5, B = 500)
  pdf("GapStatistics.HR.MR.LR.pdf", width=4, height=4)
    plot_clusgap(gs) + scale_x_continuous(breaks=c(seq(0, 5, 1))) +
      theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()  

# The gap statistic suggests 2 distinct clusters
  K <- 2
  x <- pcoa.baseline$vectors[,1:NDIM]
  clust <- as.factor(pam(x, k=K, cluster.only=T))
  sample_data(pslog.baseline)$CST <- paste("Cluster",clust,sep="")
  CSTs <- as.character(seq(K))

# Cluster by Ordination (MDS)
  CSTColors <- c(nn.colors, "#FF9E1B", "#EF82A9")
  names(CSTColors) <- paste("Cluster",CSTs,sep="")#CSTs
  CSTColors
  CSTColorScale <- scale_colour_manual(name = "CST", values = CSTColors[1:7])
  CSTFillScale <- scale_fill_manual(name = "CST", values = CSTColors[1:7])

  pdf("PCoA_CST_species.HR.MR.LR.pdf", 6, 4)
    plot_ordination(pslog.baseline, pcoa.baseline, color = "PCo.Group", shape="CST") +
      geom_point(colour = "black", aes(shape=CST), size = 2, alpha=0.75) +
      geom_point(aes(color = PCo.Group, shape=CST), size = 2, alpha=0.75) + 
      
        CSTColorScale +
        geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) +
        coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() + 
        #xlab("PCoA1 [9.5%]") + ylab("PCoA2 [8.7%]") + 
      theme(legend.position = "right") +
        theme(panel.border = element_blank(), panel.grid.major = element_blank(),
              panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()
  table(sample_data(pslog.baseline)$PCo.Group,sample_data(pslog.baseline)$CST)
  
  fisher <- matrix(c(8,2,2,1,8,7), nrow = 3)
  fisher.test(fisher)
```  

# Supervised learning - Random Forest Test (Abundance) for HR, MR, LR
```{r, echo=TRUE}
  
  ps2.RF <- ps2.baseline;
  pslog.RF <- pslog.baseline;
  
  sample_data(ps2.RF)$PCo.Group
  sample_data(pslog.RF)$PCo.Group
  
  sample_data(ps2.RF)$PCo.Group <-
    c("LM", "LM", "LM", "LM", "LM",
      "LM", "HR", "LM", "HR", "HR",
      "LM", "LM", "HR", "LM", "HR",
      "LM", "LM", "LM", "LM", "LM",
      "HR", "HR", "LM", "LM", "HR",
      "LM", "HR", "LM")
  
  sample_data(pslog.RF)$PCo.Group <-
    c("LM", "LM", "LM", "LM", "LM",
      "LM", "HR", "LM", "HR", "HR",
      "LM", "LM", "HR", "LM", "HR",
      "LM", "LM", "LM", "LM", "LM",
      "HR", "HR", "LM", "LM", "HR",
      "LM", "HR", "LM")

# prepare the data
  dataMatrix <- data.frame(var = sample_data(pslog.RF)$PCo.Group, 
                           t(otu_table(pslog.RF)))
  set.seed(seed)
  spl <- sample.split(dataMatrix$var, SplitRatio = 0.7)
  Train <- subset(dataMatrix, spl == TRUE);dim(Train)
  Test <- subset(dataMatrix, spl == FALSE);dim(Test)
  
# run randomForest with repeated cross validation (10X CV times 3) 
  control <- trainControl(method="repeatedcv", number=5, repeats=3,
                          search="random")
  set.seed(seed)
  mtry <- sqrt(ncol(dataMatrix)-1)
  rf_train <- train(var~., data=Train, method="rf", metric="Accuracy",
                     tuneLength=15, trControl=control)
  print(rf_train)
  plot(rf_train)
  rf_train$finalModel
  head(names(Test[,-1]))
  rf_pred <- predict(rf_train, Test[,-1])
  confusionMatrix(rf_pred, as.factor(Test$var))
    
  set.seed(seed)
  prediction_for_roc_curve <- predict(rf_train,Test[-1],type="prob")
  classes=levels(Test$var)
  prediction_for_roc_curve <- predict(rf_train,Test,type="prob")

  kolor=pop.kolor
  pdf("AUC_abundance_Labels.pdf",3,3)
    for (i in 1:3){
    # Define which observations belong to class[i]
    true_values <- ifelse(Test$var==classes[i],1,0)
    
    # Assess the performance of classifier for class[i]
    pred <- prediction(prediction_for_roc_curve[,i],true_values)
    perf <- performance(pred, "tpr", "fpr")
    if (i==1){
       plot(perf,col=kolor[i], lwd=2) 
   }
   else{
       plot(perf,col=kolor[i],add=TRUE,lwd=2) 
   }
   # Calculate the AUC and print it to screen
    auc.perf <- performance(pred, measure = "auc")
    print(auc.perf@y.values)
    }
  dev.off()
  
```

##Abundance by DESeq - baseline HR vs LR
```{r,echo=T}

  # pull up only week 0 data
  ps2.baseline <- ps2 %>%
    subset_samples(Label == "baseline") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  ps2.baseline
  sample_data(ps2.baseline)
  sample_data(ps2.baseline)$Dog
  
  #remove MR
  baseline <- ps2.baseline %>%
    subset_samples(PCo.Group != "MR") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  baseline
  sample_data(baseline)
  
  table(sample_data(baseline)$PCo.Group)
  diagdds = phyloseq_to_deseq2(baseline, ~ PCo.Group)
  diagdds = DESeq(diagdds, fitType = c("parametric"))
  
  res = results(diagdds, cooksCutoff = FALSE)
  hist(res$pvalue)
  alpha = 1
  sigtab = res[which(res$padj < alpha), ]
  sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(baseline)[rownames(sigtab), ], "matrix"))
  sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),]
  sigtab$log2FoldChange <- (-1)*sigtab$log2FoldChange
  head(sigtab)
  dim(sigtab)
  sigtab$Taxaname=row.names(sigtab)
  names(sigtab)
  sigtab.all=data.frame(sigtab)
  
  kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue",
            "magenta","darkgreen","linen",
            "mistyrose","coral3","peachpuff3","lightcyan2","gold4")
  
  pdf("DESeq.DASpp.week0.HR.vs.LR.pdf",6,4)
    ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + 
      geom_point(size=1,alpha=1,shape=21) +
      scale_colour_manual(values=kolor) +
      geom_vline(xintercept = -1, lty=3, color="black") + geom_vline(xintercept = 1, lty=3, color="black") + 
      geom_hline(yintercept = -log10(0.05), lty=3, color="black") +
      theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + 
      ylim(0,11.5) + xlim(-30,30) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
  dev.off()
  write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.week0.HR.vs.LR.csv", row.names = FALSE)

```

##Alpha-diversity in HR, LR, MR
```{r,echo=T}

  ##read data
  adiv <- read.csv("Diversity and evenness at 470000 reads.csv",header = T)

  ##Evenness at baseline, week 4
  evenness <- adiv[adiv$measure=="Evenness",]
  evenness.0 <- evenness[evenness$Time=="baseline",]
  evenness.4 <- evenness[evenness$Time=="week4",]
  kruskal.test(mean ~ Label, data = evenness.0) #p = 0.3534
  kruskal.test(mean ~ Label, data = evenness.4) #p = 0.9472
  
  ##Fisher at baseline, week 4
  fisher <- adiv[adiv$measure=="Fisher",]
  fisher.0 <- fisher[fisher$Time=="baseline",]
  fisher.4 <- fisher[fisher$Time=="week4",]
  kruskal.test(mean ~ Label, data = fisher.0) #p = 0.9574
  kruskal.test(mean ~ Label, data = fisher.4) #p = 0.7568
  
  ##Fisher at baseline, week 4
  richness <- adiv[adiv$measure=="Richness",]
  richness.0 <- richness[richness$Time=="baseline",]
  richness.4 <- richness[richness$Time=="week4",]
  kruskal.test(mean ~ Label, data = richness.0) #p = 0.9574
  kruskal.test(mean ~ Label, data = richness.4) #p = 0.7568
  
  ##Shannon at baseline, week 4
  shannon <- adiv[adiv$measure=="Shannon",]
  shannon.0 <- shannon[shannon$Time=="baseline",]
  shannon.4 <- shannon[shannon$Time=="week4",]
  kruskal.test(mean ~ Label, data = shannon.0) #p = 0.3534
  kruskal.test(mean ~ Label, data = shannon.4) #p = 0.9472
  
  ##Simpson at baseline, week 4
  simpson <- adiv[adiv$measure=="Simpson",]
  simpson.0 <- simpson[simpson$Time=="baseline",]
  simpson.4 <- simpson[simpson$Time=="week4",]
  kruskal.test(mean ~ Label, data = simpson.0) #p = 0.3534
  kruskal.test(mean ~ Label, data = simpson.4) #p = 0.9472

```

##PCoA score at baseline and week 4 in HR, MR, LR
```{r,echo=T}
#read data
  pcoa.data <- sample_data(ps2)
  pcoa.data$PCoA1.group <- pcoa.data$PCo.Group
  head(pcoa.data)
  names(pcoa.data)
  
  #baseline PCoA scores
  summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="LR" & pcoa.data$Time=="baseline"])  
  sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="LR" & pcoa.data$Time=="baseline"])
  
  summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="MR" & pcoa.data$Time=="baseline"])  
  sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="MR" & pcoa.data$Time=="baseline"])
  
  summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="HR" & pcoa.data$Time=="baseline"])  
  sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="HR" & pcoa.data$Time=="baseline"])

  #kruskal test
  kruskal.test(pcoa.data$PCoA1[pcoa.data$Time=="baseline"] ~ pcoa.data$PCoA1.group[pcoa.data$Time=="baseline"])
  pairwise.wilcox.test(pcoa.data$PCoA1[pcoa.data$Time=="baseline"], pcoa.data$PCoA1.group[pcoa.data$Time=="baseline"], p.adjust.method = "BH")
  posthoc.kruskal.dunn.test(pcoa.data$PCoA1[pcoa.data$Time=="baseline"], as.factor(pcoa.data$PCoA1.group[pcoa.data$Time=="baseline"]), p.adjust.method = "BH")
  
  #plot
  pcoa.data$PCoA1.group <- factor(pcoa.data$PCoA1.group, levels=c("HR", "MR", "LR"))
  plot(pcoa.data$PCoA1[pcoa.data$Time=="baseline"] ~ pcoa.data$PCoA1.group[pcoa.data$Time=="baseline"], ylab="PCoA1 at baseline", xlab="PCoA1 change")
  
  #week 4 PCoA scores
  summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="LR" & pcoa.data$Time=="week4"])  
  sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="LR" & pcoa.data$Time=="week4"])
  
  summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="MR" & pcoa.data$Time=="week4"])  
  sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="MR" & pcoa.data$Time=="week4"])
  
  summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="HR" & pcoa.data$Time=="week4"])  
  sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="HR" & pcoa.data$Time=="week4"])

  #kruskal test
  kruskal.test(pcoa.data$PCoA1[pcoa.data$Time=="week4"] ~ pcoa.data$PCoA1.group[pcoa.data$Time=="week4"])
  
```

##Abundance by DESeq - week 4 HR vs LR
```{r,echo=T}

  # pull up only week 4 data
  ps2.week4 <- ps2 %>%
    subset_samples(Label == "week4") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  week4
  sample_data(ps2.week4)
  sample_data(ps2.week4)$Dog
  
# remove MR
  
  week4 <- ps2.week4 %>%
    subset_samples(PCo.Group != "MR") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  week4
  sample_data(week4)
  
  table(sample_data(week4)$PCo.Group)
  diagdds = phyloseq_to_deseq2(week4, ~ PCo.Group)
  diagdds = DESeq(diagdds, fitType = c("parametric"))
  
  res = results(diagdds, cooksCutoff = FALSE)
  hist(res$pvalue)
  alpha = 1
  sigtab = res[which(res$padj < alpha), ]
  sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(week4)[rownames(sigtab), ], "matrix"))
  sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),]
  sigtab$log2FoldChange <- (-1)*sigtab$log2FoldChange
  head(sigtab)
  dim(sigtab)
  sigtab$Taxaname=row.names(sigtab)
  names(sigtab)
  sigtab.all=data.frame(sigtab)
  
  kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue",
            "magenta","darkgreen","linen",
            "mistyrose","coral3","peachpuff3","lightcyan2","gold4")
  
  pdf("DESeq.DASpp.week4.HR.vs.LR.pdf",6,4)
    ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + 
      #geom_point(size=2,color="black") + geom_point(size=1,color="white") + 
      geom_point(size=1,alpha=1,shape=21) +
      scale_colour_manual(values=kolor) +
      geom_vline(xintercept = -2, lty=3, color="black") + 
      geom_vline(xintercept = 2, lty=3, color="black") + 
      geom_hline(yintercept = -log10(0.05), lty=3, color="black") +
      theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + 
      ylim(0,11.5) + xlim(-30,30) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
  dev.off()
  write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.week4.HR.vs.LR.csv", row.names = FALSE)

```

##Abundance by DESeq - HR week 4 vs baseline
```{r,echo=T}

  HR <- ps2 %>%
    subset_samples(PCo.Group == "HR") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  HR
  sample_data(HR)
  
  table(sample_data(HR)$Label)
  diagdds = phyloseq_to_deseq2(HR, ~ Label)
  diagdds = DESeq(diagdds, fitType = c("parametric"))
  
  res = results(diagdds, cooksCutoff = FALSE)
  hist(res$pvalue)
  alpha = 1
  sigtab = res[which(res$padj < alpha), ]
  sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps2)[rownames(sigtab), ], "matrix"))
  sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),]
  head(sigtab)
  dim(sigtab)
  sigtab$Taxaname=row.names(sigtab)
  names(sigtab)
  sigtab.all=data.frame(sigtab)
  
  kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue",
            "magenta","darkgreen","linen",
            "mistyrose","coral3","peachpuff3","lightcyan2","gold4")
  
  pdf("DESeq.DASpp.HR.week4.vs.week0.pdf",6,4)
    ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + 
      #geom_point(size=2,color="black") + geom_point(size=1,color="white") + 
      geom_point(size=1,alpha=1,shape=21) +
      scale_colour_manual(values=kolor) +
      geom_vline(xintercept = -2, lty=3, color="black") + 
      geom_vline(xintercept = 2, lty=3, color="black") + 
      geom_hline(yintercept = -log10(0.05), lty=3, color="black") +
      theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + 
      ylim(0,35) + xlim(-30,30) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
  dev.off()
  write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.HR.week4.vs.week0.csv", row.names = FALSE)

```

##Abundance by DESeq - LR week 4 vs baseline
```{r,echo=T}

  LR <- ps2 %>%
    subset_samples(PCo.Group == "LR") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  LR
  sample_data(LR)
  
  table(sample_data(LR)$Label)
  diagdds = phyloseq_to_deseq2(LR, ~ Label)
  diagdds = DESeq(diagdds, fitType = c("parametric"))
  
  res = results(diagdds, cooksCutoff = FALSE)
  hist(res$pvalue)
  alpha = 1
  sigtab = res[which(res$padj < alpha), ]
  sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps2)[rownames(sigtab), ], "matrix"))
  sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),]
  head(sigtab)
  dim(sigtab)
  sigtab$Taxaname=row.names(sigtab)
  names(sigtab)
  sigtab.all=data.frame(sigtab)
  
  kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue",
            "magenta","darkgreen","linen",
            "mistyrose","coral3","peachpuff3","lightcyan2","gold4")
  
  pdf("DESeq.DASpp.LR.week4.vs.week0.pdf",6,4)
    ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + 
      #geom_point(size=2,color="black") + geom_point(size=1,color="white") + 
      geom_point(size=1,alpha=1,shape=21) +
      scale_colour_manual(values=kolor) +
      geom_vline(xintercept = -2, lty=3, color="black") + 
      geom_vline(xintercept = 2, lty=3, color="black") + 
      geom_hline(yintercept = -log10(0.05), lty=3, color="black") +
      theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + 
      ylim(0,35) + xlim(-30,30) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
  dev.off()
  write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.LR.week4.vs.week0.csv", row.names = FALSE)

```

##Abundance by DESeq - Baseline Better vs About the same health
```{r,echo=T}
  
# add overall.health data
  sample_data(ps2)$overall.health <-  
    c("Better", "Better", "Better", "Better", "Same",
      "Same", "Same", "Better", "Better", "Same",
      "Better", "Better", "Better", "Better", "Better",
      "Better", "Same", "Same", "Better", "Same",
      "Same", "Same", "Same", "Same", "Same",
      "Better", "No data", "Better", "Better", "Better",
      "Better", "Better", "Better", "Same", "Same",
      "Better", "Better", "Better", "Same", "Same",
      "Same", "Better", "Better", "Same", "No data",
      "Better", "Better", "Same", "Better", "Better",
      "No data", "No data", "Better", "Better", "Better",
      "Same")
  
  health <- ps2 %>%
    subset_samples(overall.health != "No data") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  
  baseline.health <- health %>%
    subset_samples(Label == "baseline") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  
  baseline.health
  sample_data(baseline.health)
  
  table(sample_data(baseline.health)$overall.health)
  diagdds = phyloseq_to_deseq2(baseline.health, ~ overall.health)
  diagdds = DESeq(diagdds, fitType = c("parametric"))
  
  res = results(diagdds, cooksCutoff = FALSE)
  hist(res$pvalue)
  alpha = 1
  sigtab = res[which(res$padj < alpha), ]
  sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(baseline.health)[rownames(sigtab), ], "matrix"))
  sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),]
  head(sigtab)
  dim(sigtab)
  sigtab$Taxaname=row.names(sigtab)
  names(sigtab)
  sigtab.all=data.frame(sigtab)
  
  kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue",
            "magenta","darkgreen","linen",
            "mistyrose","coral3","peachpuff3","lightcyan2","gold4")
  
  pdf("DESeq.DASpp.LR.week4.vs.week0.pdf",6,4)
    ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + 
      #geom_point(size=2,color="black") + geom_point(size=1,color="white") + 
      geom_point(size=1,alpha=1,shape=21) +
      scale_colour_manual(values=kolor) +
      geom_vline(xintercept = -2, lty=3, color="black") + 
      geom_vline(xintercept = 2, lty=3, color="black") + 
      geom_hline(yintercept = -log10(0.05), lty=3, color="black") +
      theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + 
      ylim(0,11.5) + xlim(-30,30) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
  dev.off()
  write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.week0.better.health.vs.null.health.csv", row.names = FALSE)

```

##Abundance by DESeq - Week 4 Better vs About the same health
```{r,echo=T}
  
  health <- ps2 %>%
    subset_samples(overall.health != "No data") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  
  week4.health <- health %>%
    subset_samples(Label == "week4") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  
  week4.health
  sample_data(week4.health)
  
  table(sample_data(week4.health)$overall.health)
  diagdds = phyloseq_to_deseq2(week4.health, ~ overall.health)
  diagdds = DESeq(diagdds, fitType = c("parametric"))
  
  res = results(diagdds, cooksCutoff = FALSE)
  hist(res$pvalue)
  alpha = 1
  sigtab = res[which(res$padj < alpha), ]
  sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(week4.health)[rownames(sigtab), ], "matrix"))
  sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),]
  head(sigtab)
  dim(sigtab)
  sigtab$Taxaname=row.names(sigtab)
  names(sigtab)
  sigtab.all=data.frame(sigtab)
  
  kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue",
            "magenta","darkgreen","linen",
            "mistyrose","coral3","peachpuff3","lightcyan2","gold4")
  
  pdf("DESeq.DASpp.week4.better.health.vs.null.health.pdf",6,4)
    ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + 
      #geom_point(size=2,color="black") + geom_point(size=1,color="white") + 
      geom_point(size=1,alpha=1,shape=21) +
      scale_colour_manual(values=kolor) +
      geom_vline(xintercept = -2, lty=3, color="black") + 
      geom_vline(xintercept = 2, lty=3, color="black") + 
      geom_hline(yintercept = -log10(0.05), lty=3, color="black") +
      theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + 
      ylim(0,4) + xlim(-30,30) +
      theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5))
  dev.off()
  write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.week4.better.health.vs.null.health.csv", row.names = FALSE)

```

##Alpha-diversity - Better vs About the same health
```{r,echo=T}

  ##read data
  adiv <- read.csv("Diversity and evenness at 470000 reads.csv",header = T)

  ##Evenness at baseline, week 4
  evenness <- adiv[adiv$measure=="Evenness",]
  table(evenness$Health)
  evenness <- evenness[evenness$Health!="",]
  table(evenness$Health)
  evenness.0 <- evenness[evenness$Time=="baseline",]
  evenness.4 <- evenness[evenness$Time=="week4",]
  
  kruskal.test(mean ~ Health, data = evenness.0)
  kruskal.test(mean ~ Health, data = evenness.4)
  
  summary(evenness.4$mean[evenness.4$Health == "About the same"])
  sd(evenness.4$mean[evenness.4$Health == "About the same"])
  summary(evenness.4$mean[evenness.4$Health == "Better"])
  sd(evenness.4$mean[evenness.4$Health == "Better"])
  
  ##Fisher at baseline, week 4
  fisher <- adiv[adiv$measure=="Fisher",]
  table(fisher$Health)
  fisher <- fisher[fisher$Health!="",]
  table(fisher$Health)
  fisher.0 <- fisher[fisher$Time=="baseline",]
  fisher.4 <- fisher[fisher$Time=="week4",]
  
  kruskal.test(mean ~ Health, data = fisher.0)
  kruskal.test(mean ~ Health, data = fisher.4)
  
  ##Richness at baseline, week 4
  richness <- adiv[adiv$measure=="Richness",]
  table(richness$Health)
  richness <- richness[richness$Health!="",]
  table(richness$Health)
  richness.0 <- richness[richness$Time=="baseline",]
  richness.4 <- richness[richness$Time=="week4",]
  
  kruskal.test(mean ~ Health, data = richness.0)
  kruskal.test(mean ~ Health, data = richness.4)
  
  ##Shannon at baseline, week 4
  shannon <- adiv[adiv$measure=="Shannon",]
  table(shannon$Health)
  shannon <- shannon[shannon$Health!="",]
  table(shannon$Health)
  shannon.0 <- shannon[shannon$Time=="baseline",]
  shannon.4 <- shannon[shannon$Time=="week4",]
  
  kruskal.test(mean ~ Health, data = shannon.0)
  kruskal.test(mean ~ Health, data = shannon.4)
  
  summary(shannon.4$mean[shannon.4$Health == "About the same"])
  sd(shannon.4$mean[shannon.4$Health == "About the same"])
  summary(shannon.4$mean[shannon.4$Health == "Better"])
  sd(shannon.4$mean[shannon.4$Health == "Better"])
  
  ##Simpson at baseline, week 4
  simpson <- adiv[adiv$measure=="Simpson",]
  table(simpson$Health)
  simpson <- simpson[simpson$Health!="",]
  table(simpson$Health)
  simpson.0 <- simpson[simpson$Time=="baseline",]
  simpson.4 <- simpson[simpson$Time=="week4",]
  
  kruskal.test(mean ~ Health, data = simpson.0) #p = 0.4606
  kruskal.test(mean ~ Health, data = simpson.4) #p = 0.02343
  
  summary(simpson.4$mean[simpson.4$Health == "About the same"])
  sd(simpson.4$mean[simpson.4$Health == "About the same"])
  summary(simpson.4$mean[simpson.4$Health == "Better"])
  sd(simpson.4$mean[simpson.4$Health == "Better"])
  
```

## ####################################
## Microbiome analysis: KO data      ##
## ####################################

## read phyloseq object (KO terms)
```{r, echo=TRUE}
# read phyloseq object
  ko1=readRDS("KO.CBfiltered.metadata.integrated.rds")
  print(ko1)
  head(sample_data(ko1))
  head(tax_table(ko1))
  head(otu_table(ko1))
  table(sample_data(ko1)$Dog,sample_data(ko1)$Time) 
  ko2=ko1
```

##	CLEAN DATA
```{r, echo=TRUE}
# check for KO terms with 0 reads
  print(ko2)
  ko2= prune_taxa(taxa_sums(ko2)>0,ko2)
  ko0 <- subset_taxa(ko2, !is.na(KO) & !KO %in% c("", "XXX"))
  print(ko0)
  
# assess abundance of KO terms
  prev0 = apply(X = otu_table(ko0),
                MARGIN = ifelse(taxa_are_rows(ko0), yes = 1, no = 2),
                FUN = function(x){sum(x > 0)})
  
  prevdf = data.frame(Prevalence = prev0,
                      TotalAbundance = taxa_sums(ko0),
                      tax_table(ko0))
  
  prevdf1 = subset(prevdf,KO %in% get_taxa_unique(ko0, "KO"))

# remove low abundance terms (single read or <=5% of samples)
  xx=subset(prevdf1,TotalAbundance<=1 | Prevalence/nsamples(ko0)<=0.05) #if using count data
  head(xx);dim(xx)
  xx$Type=c("Ignore")
  yy=subset(prevdf1,TotalAbundance>1 & Prevalence/nsamples(ko0)>0.05) #if using count data
  head(yy);dim(yy)
  yy$Type=c("Include")
  prevdf1=rbind(xx,yy)
  head(prevdf1);dim(prevdf1)
  print(ko0)
  
  pdf("KO_prevalence.relative.prefiltering.pdf")
    ggplot(prevdf1, aes(y=TotalAbundance, x=Prevalence, color=Type)) +
      geom_point(shape=21, size = 1, alpha = 0.5) + scale_color_manual(values=nn.colors) +
      ylab("TotalAbundance") + xlab("Prevalence") + 
      theme(legend.position="none") + theme_minimal() + theme_bw()
  dev.off()
  
  filterKOs = xx$KO
  ko3 = subset_taxa(ko0, !KO %in% filterKOs)
  print(ko3)
  
# save cleaned ps object   
  ko4 = prune_samples(sample_sums(ko3)>=0.9, ko3)
  print(ko4)
  saveRDS(ko4,"KO.CBfiltered.metadata.merged.cln.rds")
```

## read phyloseq object (KO terms)
```{r, echo=TRUE}
# read phyloseq object
  ko.cln=readRDS("KO.CBfiltered.metadata.merged.cln.rds")
  
  write.table(tax_table(ko.cln), file = "module.clean.txt", append = FALSE, quote = FALSE, sep = "\t",
              eol = "\n", na = "NA", dec = ".", row.names = TRUE, col.names = TRUE)
  
  print(ko.cln)
  head(sample_data(ko.cln))
  sample_data(ko.cln)$Label=sample_data(ko.cln)$Time
  table(sample_data(ko.cln)$Label)
  head(tax_table(ko.cln))
  table(sample_data(ko.cln)$Label) 
  ko.cln
```

### Bray-Curtis distances and statistical assocations
```{r, echo=TRUE}
# load the cleaned ps object
  print(ko.cln)  

# log transform the raw count data
  ko.cln.log <- transform_sample_counts(ko.cln, function(x) log(1 + x))
  print(ko.cln.log)

# calculate Bray-Curtis distance
  set.seed(seed)
  bcdist = phyloseq::distance(ko.cln.log, method="bray",normalized=TRUE,
                              parallel=FALSE,fast=TRUE) 
  pcoa <- ordinate(ko.cln.log, method = "MDS", distance = bcdist)
  evals <- pcoa$values$Eigenvalues

# screeplot
  pdf("PCoA_screeplot.KO.CBfiltered.log.pdf",6,4)
    plot_scree(pcoa) + xlim(as.character(seq(1,25))) + 
      ggtitle("Bray-Curtis ordination eigenvalues") +
      theme_bw() + 
      theme(panel.border = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()

  names(pcoa)
  pcoa$values [1:5,]
       
# prepare the metadata  
  sample_data(ko.cln.log)$KO.PCoA1=pcoa$vectors[,1]
  sample_data(ko.cln.log)$KO.PCoA2=pcoa$vectors[,2]
  sample_data(ko.cln.log)$KO.PCoA3=pcoa$vectors[,3]
  sample_data(ko.cln.log)$KO.PCoA4=pcoa$vectors[,4]
  sample_data(ko.cln.log)$KO.PCoA5=pcoa$vectors[,5]
  sampledf=data.frame(sample_data(ko.cln.log))
  names(sampledf)  
  sampledf$Time=as.factor(sampledf$Label)
  sample_data(ko.cln.log)$Time=as.factor(sample_data(ko.cln.log)$Label)
  
  
# Plot PCoA  
  pdf("PCoA_KO_time.CBfiltered.log.pdf", 6,3)
    plot_ordination(ko.cln.log, pcoa, color="Label") + 
      scale_color_manual(values = nn.colors[-1]) +
      geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + 
      coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() +  
      theme(panel.border = element_blank(), panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))
  dev.off()

  head(sampledf)
  pdf("PCoA1_KO_ridges.count.pdf",6,6)
    ggplot(sampledf, aes(x = KO.PCoA1, y = Label, fill=Label)) +
    geom_density_ridges(scale = 1,alpha=1, aes(fill=Label)) +
    scale_fill_manual(values=nn.colors[-1]) +
    scale_y_discrete(expand = c(0.01, 0)) +
    scale_x_continuous(expand = c(0.01, 0)) +
    theme_ridges() #+ facet_wrap(~Diet)
  dev.off()
  
  pdf("PCoA1_KO_violin.pdf",6,4)
  ggplot(sampledf,aes(Label,KO.PCoA1, fill=Label)) + 
    geom_violin() + scale_fill_manual(values=nn.colors[-1]) +
    geom_jitter(width=0.05,col="black",size=1,shape=21) + xlab("") +
    theme_bw() + theme(legend.position="none") + labs(y= "KO PCoA1") #+ facet_wrap(~Diet)
  dev.off()

  pdf("PCoA2_KO_ridges.count.pdf",6,6)
    ggplot(sampledf, aes(x = KO.PCoA2, y = Label, fill=Label)) +
    geom_density_ridges(scale = 1,alpha=1, aes(fill=Label)) +
    scale_fill_manual(values=nn.colors[-1]) +
    scale_y_discrete(expand = c(0.01, 0)) +
    scale_x_continuous(expand = c(0.01, 0)) +
    theme_ridges() #+ facet_wrap(~Diet)
  dev.off()
  
# statistical association of PCoAs
# pcoa1
  shapiro.test(sampledf$KO.PCoA1) # not normally distributed
  set.seed(seed)
  kruskal.test(KO.PCoA1~Label,data=sampledf)
  sampledf <- sampledf %>% arrange(Time, Dog)
  wilcox.test(sampledf$KO.PCoA1[1:28], sampledf$KO.PCoA1[29:56], paired=T)
 
# pcoa2
  shapiro.test(sampledf$KO.PCoA2) # not normally distributed
  set.seed(seed)
  kruskal.test(KO.PCoA2~Label,data=sampledf) # p-value = 0.3588
    
# visualize the difference using boxplots   
  pdf("PCoA1_KO_boxplot.count.pdf",6,4)
  ggplot(sampledf,aes(Label,KO.PCoA1, fill=Label)) + 
    geom_boxplot(outlier.shape = NA, notch = TRUE) + scale_fill_manual(values=nn.colors[-1]) +
    geom_jitter(width=0.1,fill="gold",size=1,shape=21,alpha=0.75) + xlab("") + 
    theme_bw() + theme(legend.position="none") #+ facet_wrap(~Diet)
  dev.off()
  
# visualize the difference using boxplots   
  pdf("PCoA2_KO_boxplot.count.pdf",6,4)
  ggplot(sampledf,aes(Label,KO.PCoA2, fill=Label)) + 
    geom_boxplot(outlier.shape = NA, notch = TRUE) + scale_fill_manual(values=nn.colors[-1]) +
    geom_jitter(width=0.1,fill="gold",size=1,shape=21,alpha=0.75) + xlab("") + 
    theme_bw() + theme(legend.position="none") #+ facet_wrap(~Diet)
  dev.off()

# assess significance (simple model using PERMANOVA)
  sampledf <- data.frame(sample_data(ko.cln.log))
  names(sampledf)
  set.seed(seed)
  prm_mod=adonis2(bcdist ~ Label, 
                data = sampledf, permutations = 9999)
  prm_mod
```

### DESeq2 to calculate differences in KO terms (week4 vs baseline)
```{r, echo=TRUE}
  #use non-log data
  ko.cln=readRDS("KO.CBfiltered.metadata.merged.cln.rds")
  ko.cln
  base <- ko.cln %>%
    subset_samples(Time=="baseline" | Time=="week4") %>%
    prune_taxa(taxa_sums(.) > 0, .)
  base
  table(sample_data(base)$Time)
  diagdds = phyloseq_to_deseq2(base, ~ Time)
  diagdds = DESeq(diagdds, fitType = c("parametric"))
  
  res = results(diagdds, cooksCutoff = FALSE)
  hist(res$pvalue)
  res = cbind(as(res, "data.frame"), as(tax_table(ko.cln)[rownames(res), ], "matrix"))
  res = res[order(-abs(res$log2FoldChange),res$padj),]
  
  pdf("DESeq_week4_vs_baseline.KO.pdf",6,4)
    ggplot(res, aes(log2FoldChange, -log10(padj))) +
      geom_point(size=2,color="black") + geom_point(size=1,color="white") +
      geom_point(size=1,alpha=0.75) +
      scale_colour_manual(values=kolor) +
      geom_vline(xintercept = -2, lty=3, color="black") + 
      geom_vline(xintercept = 2, lty=3, color="black") + 
      geom_hline(yintercept = -log10(0.05), lty=3, color="black") +
      ylim(0,18) +
      theme_bw() + theme(panel.border = element_blank(), 
                         panel.grid.major = element_blank(),
                         panel.grid.minor = element_blank(), 
                         axis.line = element_line(colour = "black")) + 
      theme(axis.text.x = element_text(angle = 0, hjust = 0, vjust=0.5))
  dev.off()
  res$padj
  write.csv(res, file = "DESeq.KO.count.csv", row.names = TRUE)

```

## ################
## Diet analysis ##
## ################

## PROTEIN ##
```{r,echo=T}
#protein baseline diets
#percent dry-matter
a.pr.dm <- c(25.56, 34.09, 33.33, 26.67, 32.95, 30, 43.18, 35.56, 28.11, 35.56, 26.67, 26.67, 21.33, 27.28, 37.78)
#caloric basis
a.pr.cal <- c(65.92, 71.8, 78.43, 66.43, 71.89, 73.31, 97.44, 81.10, 75.30, 86.04, 66.32, 65.66, 49.64, 66.77, 91.94)

#protein fresh diets
#percent dry-matter 
a.pr.nn <- c(37.04, 36.96, 32, 36.67)
#caloric basis
a.pr.nncal <- c(80.71, 67.73, 64.21, 74.37)
```

## FAT ##
```{r,echo=T}
#fat baseline diets
#percent dry-matter
a.fa.dm <- c(12.22, 22.73, 20, 15.56, 19.32, 17.78, 20.45, 18.34, 8.56, 20, 13.33, 16.67, 18.33, 15.56, 17.78)
#caloric basis
a.fa.cal <- c(31.53, 47.87, 47.06, 38.75, 42.14, 43.44, 46.15, 41.72, 22.92, 48.4, 33.16, 41.04, 42.66, 37.39, 43.27)

#fat fresh diets
#percent dry-matter
a.fa.nn <- c(18.52, 26.09, 20, 16.67)
#caloric basis
a.fa.nncal <- c(40.36, 47.81, 40.13, 33.81)
```

## FIBER ##
```{r,echo=T}
#fiber baseline diets
#percent dry-matter
a.fi.dm <- c(6.33, 3.41, 4.44, 5.56, 3.41, 3.33, 4.55, 4.72, 1.78, 4.44, 4.44, 5.56, 9.22, 3.78, 4.44)
#caloric-basis
a.fi.cal <- c(16.34, 7.18, 10.46, 13.84, 7.44, 8.15, 10.26, 10.76, 4.76, 10.76, 11.05, 13.68, 21.46, 9.08, 10.82)

#fiber fresh diets
#percent dry-matter
a.fi.nn <- c(3.7, 4.35, 8, 3.33)
#caloric basis
a.fi.nncal <- c(8.07, 7.97, 16.05, 6.76)
```

## MOISTURE ##
```{r,echo=T}
#percent moisture baseline diets
a.mo <- c(10, 12, 10, 10, 12, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10)
#percent moisture fresh diets
a.mo.nn <- c(73, 77, 75, 70)
```

## CALORIES ## 
```{r,echo=T}
#calories baseline diets
#calorie density kcal/kg dry-matter
a.ca.dm <- c(3876.67, 4747.73, 4250, 4014.44, 4584.09, 4092.22, 4431.82, 4386.67, 3733.33, 4132.22, 4021.11, 4061.11, 4297.78, 4160, 4108.89)

#calories fresh diets
##calorie density kcal/kg dry-matter
a.ca.nn <- c(4588.889, 5456.52, 4984, 4930)
```

## IMPORT METDATA ##
```{r,echo=T}
#group
b <- c("LR", "MR", "MR", "HR", "MR", "HR", "HR", "HR", "MR", "HR", "MR", "LR", "LR", "LR", "MR")

#PCoA values
PCoA1.baseline <- c(-0.2912434092, -0.05353604557, 0.2881100368, -0.2444149455, 0.03010995198, 0.20169154, -0.3020782323, -0.3866607864, -0.2521250402, -0.3069681731, 0.06988740119, 0.2963869123, 0.1423442863, -0.009072526405, -0.3499042852)

PCoA1.change <- c(0.05854814808, 0.1301768087, 0.2053459868, 0.3444328039,  0.07591902822, 0.2533414719, 0.5462731419, 0.2833942851, 0.2208376496, 0.3976327893, 0.07112995234, -0.06805278555, -0.1079867444, -0.01973125854, 0.1485097625)
```

## Kruskal.wallis tests ##
# protein DM
```{r,echo=T}
kruskal.test(a.pr.dm, b)

LR.pr.dm <-a.pr.dm[c(1, 12, 13, 14)]
mean(LR.pr.dm) #25.21
sd(LR.pr.dm) #2.682872
MR.pr.dm <- a.pr.dm[c(2, 3, 5, 9, 11, 15)]
mean(MR.pr.dm) #32.155
sd(MR.pr.dm) #4.094991
HR.pr.dm <- a.pr.dm[c(4, 6, 7, 8, 10)]
mean(HR.pr.dm) #4.194
sd(HR.pr.dm) #6.298419
```

# protein g/1000 kcal
```{r,echo=T}
kruskal.test(a.pr.cal, b) #0.02652

LR.pr.cal <-a.pr.cal[c(1, 12, 13, 14)] 
mean(LR.pr.cal) #61.9975
sd(LR.pr.cal) #8.251959
MR.pr.cal <- a.pr.cal[c(2, 3, 5, 9, 11, 15)]
mean(MR.pr.cal) # 75.94667
sd(MR.pr.cal) #8.816983
HR.pr.cal <- a.pr.cal[c(4, 6, 7, 8, 10)]
mean(HR.pr.cal) #80.864
sd(HR.pr.cal) #11.90602

```

# fat DM
```{r,echo=T}
kruskal.test(a.fa.dm, b) #0.3421

LR.fa.dm <-a.fa.dm[c(1, 12, 13, 14)] 
mean(LR.fa.dm) #15.695
sd(LR.fa.dm) #2.581195
MR.fa.dm <- a.fa.dm[c(2, 3, 5, 9, 11, 15)]
mean(MR.fa.dm) #16.95333
sd(MR.fa.dm) #5.14807
HR.fa.dm <- a.fa.dm[c(4, 6, 7, 8, 10)]
mean(HR.fa.dm) #18.426
sd(HR.fa.dm) #1.950174
```

# fat g/1000 kcal
```{r,echo=T}
kruskal.test(a.fa.cal, b) #0.2835

LR.fa.cal <-a.fa.cal[c(1, 12, 13, 14)] 
mean(LR.fa.cal) #38.155
sd(LR.fa.cal) #4.936061
MR.fa.cal <- a.fa.cal[c(2, 3, 5, 9, 11, 15)]
mean(MR.fa.cal) #39.40333
sd(MR.fa.cal) #9.627346
HR.fa.cal <- a.fa.cal[c(4, 6, 7, 8, 10)]
mean(HR.fa.cal) #43.692
sd(HR.fa.cal) #3.761073

```

# fiber DM
```{r,echo=T}
kruskal.test(a.fi.dm, b) #0.06769

LR.fi.dm <-a.fi.dm[c(1, 12, 13, 14)] 
mean(LR.fi.dm) #6.2225
sd(LR.fi.dm) #2.26578
MR.fi.dm <- a.fi.dm[c(2, 3, 5, 9, 11, 15)]
mean(MR.fi.dm) #3.653333
sd(MR.fi.dm) #1.047314
HR.fi.dm <- a.fi.dm[c(4, 6, 7, 8, 10)]
mean(HR.fi.dm) #4.52
sd(HR.fi.dm) # 0.7976528
```

# fiber g/1000kcal
```{r,echo=T}
kruskal.test(a.fi.cal, b) #0.1293

LR.fi.cal <-a.fi.cal[c(1, 12, 13, 14)]
mean(LR.fi.cal) #15.14
sd(LR.fi.cal) #5.171641
MR.fi.cal <- a.fi.cal[c(2, 3, 5, 9, 11, 15)]
mean(MR.fi.cal) #8.618333
sd(MR.fi.cal) #2.549356
HR.fi.cal <- a.fi.cal[c(4, 6, 7, 8, 10)]
mean(HR.fi.cal) #10.754
sd(HR.fi.cal) #2.033981
```

#moisture
```{r,echo=T}
kruskal.test(a.mo, b) #0.4594

LR.mo <-a.mo[c(1, 12, 13, 14)] 
mean(LR.mo) #10
sd(LR.mo)
MR.mo <- a.mo[c(2, 3, 5, 9, 11, 15)]
mean(MR.mo) #10.66667
sd(MR.mo)
HR.mo <- a.mo[c(4, 6, 7, 8, 10)]
mean(HR.mo) #10.4
sd(HR.mo)
```

## calorie denisty DM 
```{r,echo=T}
kruskal.test(a.ca.dm, b) #0.8076
LR.ca.dm <-a.ca.dm[c(1, 12, 13, 14)]
mean(LR.ca.dm) # 4098.89
sd(LR.ca.dm)#177.107
MR.ca.dm <- a.ca.dm[c(2, 3, 5, 9, 11, 15)]
mean(MR.ca.dm) # 4240.858
sd(MR.ca.dm) # 373.6577
HR.ca.dm <- a.ca.dm[c(4, 6, 7, 8, 10)]
mean(HR.ca.dm) #4211.474
sd(HR.ca.dm) 
```

## Correlation tests ##
#protein
```{r,echo=T}
cor.test(a.pr.dm, PCoA1.baseline, method = "spearman")
cor.test(a.pr.cal, PCoA1.baseline, method = "spearman")
cor.test(a.pr.dm, PCoA1.change, method = "spearman")
cor.test(a.pr.cal, PCoA1.change, method = "spearman")
```

#fat
```{r,echo=T}
cor.test(a.fa.dm, PCoA1.baseline, method = "spearman")
cor.test(a.fa.cal, PCoA1.baseline, method = "spearman")
cor.test(a.fa.dm, PCoA1.change, method = "spearman")
cor.test(a.fa.cal, PCoA1.change, method = "spearman")
```

#fiber
```{r,echo=T}
cor.test(a.fi.dm, PCoA1.baseline, method = "spearman")
cor.test(a.fi.cal, PCoA1.baseline, method = "spearman")
cor.test(a.fi.dm, PCoA1.change, method = "spearman")
cor.test(a.fi.cal, PCoA1.change, method = "spearman")

```

#moisture - baseline
```{r,echo=T}
cor.test(a.mo, PCoA1.baseline, method = "spearman")
cor.test(a.mo, PCoA1.change, method = "spearman")
cor.test(a.ca.dm, PCoA1.baseline, method = "spearman")
cor.test(a.ca.dm, PCoA1.change, method = "spearman")
```

## Pairwise wilcox test for protein and fiber ##
```{r,echo=T}
#protein
pairwise.wilcox.test(a.pr.dm, b, p.adjust.method = "BH")
pairwise.wilcox.test(a.pr.cal, b, p.adjust.method = "BH")

#fiber
pairwise.wilcox.test(a.fi.dm, b, p.adjust.method = "BH")
```

## #######################
## Baseline health data ##
## #######################

## Compare HR MR LR baseline ##
#Read data
```{r,echo=T}
library(plyr)
insights <- read.csv(file = 'survey_prestudy_has_MB.csv')
table(insights$PCo1)
```

#age
```{r,echo=T}
mean<-ddply(insights, .(PCo1), summarize, mean=mean(age))
mean
sd<-ddply(insights, .(PCo1), summarize, sd=sd(age))
sd
kruskal.test(age~PCo1, data=insights)
```

#sex
```{r,echo=T}
table(insights$sex, insights$PCo1)
fisher <- matrix(c(5,4,6,3,2,6), nrow = 2)
fisher.test(fisher)
```

#spayed
```{r,echo=T}
table  (insights$spayed, insights$PCo1)
fisher <- matrix(c(1,8,2,8,1,8), nrow = 2)
fisher.test(fisher)
```

#prestudy.weight
```{r,echo=T}
mean<-ddply(insights, .(PCo1), summarize, mean=mean(prestudy.weight))
mean
sd<-ddply(insights, .(PCo1), summarize, sd=sd(prestudy.weight))
sd
kruskal.test(prestudy.weight~PCo1, data=insights)
```

#ideal.weight
```{r,echo=T}
mean<-ddply(insights, .(PCo1), summarize, mean=mean(ideal.weight))
mean
sd<-ddply(insights, .(PCo1), summarize, sd=sd(ideal.weight))
sd
kruskal.test(ideal.weight~PCo1, data=insights)
```

#bcs
```{r,echo=T}
table  (insights$bcs, insights$PCo1)
fisher <- matrix(c(8,1,6,2,6,4), nrow = 2)
fisher.test(fisher)
```

#activity_level
```{r,echo=T}
table  (insights$activity_level, insights$PCo1)
fisher <- matrix(c(3,3,3,4,0,4,3,3,4), nrow = 3)
fisher.test(fisher)
```

#overall.health
```{r,echo=T}
table  (insights$overall.health, insights$PCo1)
fisher <- matrix(c(6,2,1,6,2,0,5,4,1), nrow = 3)
fisher.test(fisher)
```

#food.motivation
```{r,echo=T}
table  (insights$food.motivation, insights$PCo1)
fisher <- matrix(c(8,0,1,7,1,0,8,0,2), nrow = 3)
fisher.test(fisher)
```

#bristol.chart
```{r,echo=T}
insights1 <- insights[-15,]
mean<-ddply(insights1, .(PCo1), summarize, mean=mean(bristol.chart))
mean
sd<-ddply(insights1, .(PCo1), summarize, sd=sd(bristol.chart))
sd
kruskal.test(bristol.chart~PCo1, data=insights1)
```

#defecation.frequency
```{r,echo=T}
table  (insights$defecation.frequency, insights$PCo1)
fisher <- matrix(c(3,5,1,3,5,0,2,7,1), nrow = 3)
fisher.test(fisher)
```

#flatulence.frequency
```{r,echo=T}
table  (insights$flatulence.frequency, insights$PCo1)
fisher <- matrix(c(0,9,1,7,5,5), nrow = 2)
fisher.test(fisher)
```

## ##################################
## Health survey analysis - week 4 ##
## ##################################
#read dataset - exit survey
```{r,echo=T}
data <- read.csv(file = 'exit.survey.csv')
data.include <- data[data$exclude.missing.meals != TRUE & data$exclude.antibiotics != TRUE,]
data.exclude <- data[data$exclude.missing.meals == TRUE | data$exclude.antibiotics == TRUE,]
```

#overall.health
```{r,echo=T}
summary(data.include$overall.health)
fisher <- matrix(c(19,12,0,10,10,10), nrow = 3)
fisher.test(fisher)
fisher.multcomp(fisher, p.method = "fdr")
```

#physical.activity
```{r,echo=T}
summary(data.include$physical.activity)
fisher <- matrix(c(15,16,0,10,10,10), nrow = 3)
fisher.test(fisher)
fisher.multcomp(fisher, p.method = "fdr")
```

#weight.change.lbs
```{r,echo=T}
mean(data.include$weight.change.lbs)
sd(data.include$weight.change.lbs)
wilcox.test(data.include$prestudy.weight.lbs, data.include$poststudy.weight.lbs, pair=T)
t.test(data.include$prestudy.weight.lbs, data.include$poststudy.weight.lbs, pair=T)
```

#fecal.score
```{r,echo=T}
mean(data.include$prestudy.fecal.score[-c(6,14,17,27)], na.rm=T)
sd(data.include$prestudy.fecal.score[-c(6,14,17,27)], na.rm=T)
mean(data.exclude$prestudy.fecal.score, na.rm=T)
sd(data.exclude$prestudy.fecal.score, na.rm=T)
wilcox.test(data.include$prestudy.fecal.score[-c(6,14,17,27)], data.exclude$prestudy.fecal.score, pair=F)

mean(data.include$fecal.score.change, na.rm=T)
sd(data.include$fecal.score.change, na.rm=T)
mean(data.include$poststudy.fecal.score[-c(6,14,17,27)], na.rm=T)
sd(data.include$poststudy.fecal.score[-c(6,14,17,27)], na.rm=T)
wilcox.test(data.include$prestudy.fecal.score[-c(6,14,17,27)], data.include$poststudy.fecal.score[-c(6,14,17,27)], pair=T)
t.test(data.include$prestudy.weight.lbs, data.include$poststudy.weight.lbs, pair=T)
```

#defecation.frequency
```{r,echo=T}
summary(data.include$defecation.frequency)
fisher <- matrix(c(2,11,18,10,10,10), nrow = 3)
fisher.test(fisher)
fisher.multcomp(fisher, p.method = "fdr")
```

#flatulence.frequency
```{r,echo=T}
summary(data.include$flatulence.frequency)
fisher <- matrix(c(2,19,10,10,10,10), nrow = 3)
fisher.test(fisher)
fisher.multcomp(fisher, p.method = "fdr")
```

#coat.condition
```{r,echo=T}
summary(data.include$coat.condition)
fisher <- matrix(c(15,16,0,10,10,10), nrow = 3)
fisher.test(fisher)
fisher.multcomp(fisher, p.method = "fdr")
```

#appetite
```{r,echo=T}
summary(data.include$appetite)
fisher <- matrix(c(19,11,1,10,10,10), nrow = 3)
fisher.test(fisher)
fisher.multcomp(fisher, p.method = "fdr")
```

## Compare HR MR LR week 4 ##
#read data
```{r,echo=T}
exit <- read.csv(file = 'exit.survey.csv')
exit$fecal.score.change <- as.numeric(exit$fecal.score.change)
table(exit$PCo1)
```

#overall.health
```{r,echo=T}
table  (exit$overall.health, exit$PCo1)
fisher <- matrix(c(6,3,0,5,3,0,5,4,0), nrow = 3)
fisher.test(fisher)
fisher <- matrix(c(6,3,5,3,5,4), nrow = 2)
fisher.test(fisher)
```

#physical.activity
```{r,echo=T}
table  (exit$physical.activity, exit$PCo1)
fisher <- matrix(c(4,5,0,3,5,0,5,4,0), nrow = 3)
fisher.test(fisher)
fisher <- matrix(c(4,5,3,5,5,4), nrow = 2)
fisher.test(fisher)
```

#weight.change.percent
```{r,echo=T}
exit1 <- exit[-c(1,2,8,15,17,19,22,29,33,34,38),]
table(exit1$PCo1)
mean<-ddply(exit1, .(PCo1), summarize, mean=mean(weight.change.percent))
mean
sd<-ddply(exit1, .(PCo1), summarize, sd=sd(weight.change.percent))
sd
kruskal.test(weight.change.percent~PCo1, data=exit1)
```

#poststudy.fecal.score
```{r,echo=T}
exit1 <- exit[-c(1,2,8,15,17,19,22,29,33,34,38),]
table(exit1$PCo1)
mean<-ddply(exit1, .(PCo1), summarize, mean=mean(poststudy.fecal.score))
mean
sd<-ddply(exit1, .(PCo1), summarize, sd=sd(poststudy.fecal.score))
sd
kruskal.test(poststudy.fecal.score~PCo1, data=exit1)
```

#fecal.score.change
```{r,echo=T}
exit1 <- exit[-c(1,2,8,15,17,19,21,22,29,33,34,38),]
table(exit1$PCo1)
mean<-ddply(exit1, .(PCo1), summarize, mean=mean(fecal.score.change))
mean
sd<-ddply(exit1, .(PCo1), summarize, sd=sd(fecal.score.change))
sd
kruskal.test(fecal.score.change~PCo1, data=exit1)
```

#defecation.frequency
```{r,echo=T}
table  (exit$defecation.frequency, exit$PCo1)
fisher <- matrix(c(0,3,6,0,2,6,1,4,4), nrow = 3)
fisher.test(fisher)
```

#flatulence.frequency
```{r,echo=T}
table  (exit$flatulence.frequency, exit$PCo1)
fisher <- matrix(c(0,8,1,0,4,4,1,5,3), nrow = 3)
fisher.test(fisher)
```

#coat.condition
```{r,echo=T}
table  (exit$coat.condition, exit$PCo1)
fisher <- matrix(c(4,5,0,3,5,0,5,4,0), nrow = 3)
fisher.test(fisher)
fisher <- matrix(c(4,5,3,5,5,4), nrow = 2)
fisher.test(fisher)
```

#appetite
```{r,echo=T}
table  (exit$appetite, exit$PCo1)
fisher <- matrix(c(5,4,0,5,3,0,6,3,0), nrow = 3)
fisher.test(fisher)
```