--- title: "Gut.Microbiome.Analysis.2020.06.29" author: "jirayu.tanp@nomnomnow.com" date: "2020/06/29" output: html_document editor_options: chunk_output_type: console --- ## R version and citation ```{r, echo=TRUE} version citation() ``` ## Set paths ```{r, echo=TRUE} ### fastq files downloded from gdrive --> CoreBiome ### demultiplexing and OTU construction completed by CoreBiome ### Phyloseq objected created using in-house NomNomNow pipeline ver 0.2 my_dir = c("...") setwd(my_dir) print(paste("Working directory is:",getwd())) seed=20200629 set.seed(seed) group_cols=c(rep("cyan",12),rep("lightsteelblue",12), rep("dodgerblue",12),rep("navy",12),rep("purple",12), rep("pink",12),rep("darkgoldenrod",12),rep("chocolate1",12), rep("indianred1",12),rep("brown",12)) pop.kolor=c("navy","gold", "red") dark.colors = c("#454849","#7c8286") nn.colors = c("#A7A8A9","#EE2737","#69B3E7") ``` ## Load libraries ```{r, echo=TRUE} #if (!requireNamespace("BiocManager")) # install.packages("BiocManager") # BiocManager::install("DESeq2") #BiocManager::install("dada2","msa","phyloseq","plyr","dplyr","reshape2","ade4","ggrepel","DECIPHER","vegan","DESeq2","phanghorn") #BiocManager::install("ShotgunFunctionalize.R") library("ggplot2");packageVersion("ggplot2");citation("ggplot2") library("gridExtra");packageVersion("gridExtra");citation("gridExtra") library("phyloseq");packageVersion("phyloseq");citation("phyloseq") library("plyr");packageVersion("plyr");citation("plyr") library("dplyr");packageVersion("dplyr");citation("dplyr") library("reshape2");packageVersion("reshape2");citation("reshape2") library("ggrepel");packageVersion("ggrepel");citation("ggrepel") library("vegan");packageVersion("vegan");citation("vegan") library("tidyr");packageVersion("tidyr");citation("tidyr") library("DESeq2");packageVersion("DESeq2");citation("DESeq2") library("Rmisc");packageVersion("Rmisc");citation("Rmisc") library("PMCMR");packageVersion("PMCMR");citation("PMCMR") library("nlme");packageVersion("nlme");citation("nlme") library("lme4");packageVersion("lme4");citation("lme4") library("dunn.test");packageVersion("dunn.test");citation("dunn.test") library("randomForest");packageVersion("randomForest");citation("randomForest") library("ROCR");packageVersion("ROCR");citation("ROCR") library("verification");packageVersion("verification");citation("verification") library("qvalue");packageVersion("qvalue");citation("qvalue") library("caret");packageVersion("caret");citation("caret") library("e1071");packageVersion("e1071");citation("e1071") library("caTools");packageVersion("caTools");citation("caTools") library("rpart");packageVersion("rpart");citation("rpart") library("ggridges");packageVersion("ggridges");citation("ggridges") library("RVAideMemoire") #library("");packageVersion("") #library("");packageVersion("") ``` ## Necessary functions ```{r, echo=TRUE} # sequencing depths sample_coverage<-function(phyloseqobj){ xx = data.frame(nreads = sample_sums(phyloseqobj), sorted = 1:nsamples(phyloseqobj), type = "Samples") xx$SampleID=rownames(xx) xx = xx[order(xx$nreads),] print(paste("coverage summary: "));print(summary(xx$nreads)) print(paste("total reads = ",sum(xx$nreads),sep="")) print(paste("stdev = ",sd(xx$nreads),sep="")) #return(xx) } # clean phyloseq objects clean_phyloseq<-function(phyloseqobj){ readsumsdf = data.frame(nreads = sort(taxa_sums(phyloseqobj), TRUE), sorted = 1:ntaxa(phyloseqobj), type = "Taxa") readsumsdf = rbind(readsumsdf, data.frame(nreads = sort(sample_sums(phyloseqobj), TRUE), sorted = 1:nsamples(phyloseqobj),type = "Samples")) pdf("Taxa_and_sample_coverage.plot.pdf",width=4,height=3) title = "Total number of reads in dataset" ggplot(readsumsdf, aes(x = sorted, y = nreads)) + geom_bar(stat = "identity", color = dark.colors[1], fill = dark.colors[1]) + ggtitle(title) + facet_wrap(~type, 1, scales = "free") + xlab("Sequencing depth (number of reads)") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) dev.off() print("assessing phyla abundance...") prev0 = apply(X = otu_table(phyloseqobj), MARGIN = ifelse(taxa_are_rows(phyloseqobj), yes = 1, no = 2), FUN = function(x){sum(x > 0)}) prevdf = data.frame(Prevalence = prev0, TotalAbundance = taxa_sums(phyloseqobj), tax_table(phyloseqobj)) prevdf1 = subset(prevdf, Rank2 %in% get_taxa_unique(phyloseqobj, "Rank2")) pdf("Taxa_prevalence.precleanup.pdf", width=10, height=6) ggplot(prevdf1, aes(TotalAbundance, Prevalence/nsamples(phyloseqobj), color = Rank2)) + geom_point(size = 2, alpha = 0.7) + scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") + facet_wrap(~Rank2) + theme(legend.position="none") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) + theme(legend.position = "none") dev.off() print("removing unknown taxa and low abundance phyla (N<5)...") temp=plyr::ddply(prevdf, "Rank2", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))}) colnames(temp)=c("Phylum","Var1","Var2") filtered_phyla=subset(temp,Var2 0) ggplot(data = mphyseq, mapping = aes_string(x = mapvar, y = "Abundance", color = Color, fill = Color)) + geom_violin(fill = NA) + geom_point(size = 1, alpha = 0.3, position = position_jitter(width = 0.3)) + geom_point(colour = "#454849", shape=21, alpha=0.5) + facet_wrap(facets = Facet) + scale_y_log10() + theme(legend.position="none") + theme(axis.text.x = element_text(angle = 90, hjust = 1)) } # Plot PCoA plotPCoA<-function(phyloseqobj,pcoa,evals,kolor){ plot_ordination(phyloseqobj, pcoa) + geom_point(aes(color = Timepoint), size = 1) + scale_color_manual(values = kolor) + geom_point(colour = "#454849", shape=21, alpha=0.5) + geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) } # Calculating alpha diversity compute_alphadiv<-function(phyloseqobj,trials,rarefaction_depth){ min_lib= rarefaction_depth nsamp = nsamples(phyloseqobj) trials = trials richness <- matrix(nrow = nsamp, ncol = trials) row.names(richness) <- sample_names(phyloseqobj) evennessF <- matrix(nrow = nsamp, ncol = trials) row.names(evennessF) <- sample_names(phyloseqobj) evennessS <- matrix(nrow = nsamp, ncol = trials) row.names(evennessS) <- sample_names(phyloseqobj) evennessI <- matrix(nrow = nsamp, ncol = trials) row.names(evennessI) <- sample_names(phyloseqobj) set.seed(007) for (i in 1:trials) { # Subsample r <- rarefy_even_depth(phyloseqobj, sample.size = min_lib, verbose = FALSE, replace = TRUE) # Calculate richness rich <- as.numeric(as.matrix(estimate_richness(r, measures = "Observed"))) richness[ ,i] <- rich # Calculate evenness Fisher evenF <- as.numeric(as.matrix(estimate_richness(r, measures = "Fisher"))) evennessF[ ,i] <- evenF # Calculate evenness Shannon evenS <- as.numeric(as.matrix(estimate_richness(r, measures = "Shannon"))) evennessS[ ,i] <- evenS # Calculate evenness Simpson evenI <- as.numeric(as.matrix(estimate_richness(r, measures = "Simpson"))) evennessI[ ,i] <- evenI } SampleID <- row.names(richness) mean <- apply(richness, 1, mean) sd <- apply(richness, 1, sd) measure <- rep("Richness", nsamp) rich_stats <- data.frame(SampleID, mean, sd, measure) SampleID <- row.names(evennessF) mean <- apply(evennessF, 1, mean) sd <- apply(evennessF, 1, sd) measure <- rep("Fisher", nsamp) even_statsF <- data.frame(SampleID, mean, sd, measure) SampleID <- row.names(evennessS) mean <- apply(evennessS, 1, mean) sd <- apply(evennessS, 1, sd) measure <- rep("Shannon", nsamp) even_statsS <- data.frame(SampleID, mean, sd, measure) SampleID <- row.names(evennessI) median <- apply(evennessI, 1, mean) sd <- apply(evennessI, 1, sd) measure <- rep("Simpson", nsamp) even_statsI <- data.frame(SampleID, mean, sd, measure) alpha <- rbind(rich_stats, even_statsF, even_statsS, even_statsI) s <- data.frame(sample_data(phyloseqobj)) alphadiv <- merge(alpha, s, by = "SampleID") alphadiv$rarefaction=rep(min_lib,dim(alphadiv)[1]) return(alphadiv) } # Standard error calculation st.err <- function(x) { sd(x)/sqrt(length(x)) } # PAM clustering pamPCoA = function(x, k) { list(cluster = pam(x[,1:2], k, cluster.only = TRUE)) } # Plot heatmap of taxa make_hcb <- function(data, var, name = NULL, fillScale = NULL, ...) { hcb <- ggplot(data=data, aes_string(x="index", y=1, fill=var)) + geom_raster() + scale_y_continuous(expand=c(0,0), breaks=1, labels=name) + scale_x_continuous(expand=c(0,0)) + xlab(NULL) + ylab(NULL) + theme(axis.title=element_blank(), axis.ticks=element_blank()) + theme(axis.text.x=element_blank()) + theme(axis.text.y=element_text(size=8, face="bold")) + #theme(plot.margin=unit(c(0,0,0,0),"lines"), axis.ticks.margin = unit(0,"null"), ...) + guides(fill=F) if(!is.null(fillScale)) hcb <- hcb + fillScale return(hcb) } plot_heatmap.2 <- function(ps, sample.label=NULL, taxa.label=NULL, ...) { hm <- plot_heatmap(ps, taxa.label="Rank8", sample.order=sample.order, taxa.order = taxa.order) hm <- hm + theme(axis.text.x=element_blank(), axis.ticks.x=element_blank()) low = "#000033"; high = "#66CCFF"; trans = scales::log_trans(4); na.value = "#454849" #low = "#10cfc9"; high = "#f5a800"; trans = scales::log_trans(4); na.value = "black" new_gradient <- scale_fill_gradient(low = low, high = high, trans = trans, na.value = na.value, breaks = c(0.001, 0.01, 0.1, 1), name="Relative\nabundance") hm <- hm + theme(plot.margin=unit(c(0,0.5,0.5,0.5),"lines")) hm <- hm + new_gradient hm <- hm + geom_raster() hm <- hm + ylab("Taxa") hm$layers <- hm$layers[2] return(hm) } makeProfilePlot <- function(mylist,names) { require(RColorBrewer) # find out how many variables we want to include numvariables <- length(mylist) # choose 'numvariables' random colours qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',] col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals))) colours <- sample(col_vector,numvariables) # find out the minimum and maximum values of the variables: mymin <- 1e+20 mymax <- 1e-20 for (i in 1:numvariables) { vectori <- mylist[[i]] mini <- min(vectori) maxi <- max(vectori) if (mini < mymin) { mymin <- mini } if (maxi > mymax) { mymax <- maxi } } # plot the variables for (i in 1:numvariables) { vectori <- mylist[[i]] namei <- names[i] colouri <- colours[i] if (i == 1) { plot(vectori,col=colouri,type="l",ylim=c(mymin,mymax)) } else { points(vectori, col=colouri,type="l") } lastxval <- length(vectori) lastyval <- vectori[length(vectori)] text((lastxval-10),(lastyval),namei,col="black",cex=0.6) } } ``` ## ########################################## ## Microbiome analysis: Week 4 vs Baseline ## ## ########################################## ## Read phyloseq object ```{r, echo=TRUE} # read phyloseq object ps.raw=readRDS("Study.metadata.integrated.rds") print(ps.raw) head(sample_data(ps.raw)) ``` ## Filtering and QC ```{r, echo=TRUE} # sequencing depths sample_coverage(ps.raw) # plot sample and taxa coverage pre-cleanup readsumsdf = data.frame(nreads = sort(taxa_sums(ps.raw), TRUE), sorted = 1:ntaxa(ps.raw), type = "Taxa") readsumsdf = rbind(readsumsdf, data.frame(nreads = sort(sample_sums(ps.raw), TRUE), sorted = 1:nsamples(ps.raw),type = "Samples")) pdf("Taxa_and_sample_coverage_pre-cleanup.pdf",width=4,height=3) title = "Total number of reads pre-cleanup" ggplot(readsumsdf, aes(x = sorted, y = nreads)) + geom_bar(stat = "identity", color = dark.colors[1], fill = dark.colors[1]) + ggtitle(title) + facet_wrap(~type, 1, scales = "free") + xlab("Sequencing depth (number of reads)") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) dev.off() # assess abundance of phyla pre-cleanup prev0 = apply(X = otu_table(ps.raw), MARGIN = ifelse(taxa_are_rows(ps.raw), yes = 1, no = 2), FUN = function(x){sum(x > 0)}) prevdf = data.frame(Prevalence = prev0, TotalAbundance = taxa_sums(ps.raw), tax_table(ps.raw)) prevdf1 = subset(prevdf, Rank2 %in% get_taxa_unique(ps.raw, "Rank2")) pdf("Taxa_prevalence_pre-cleanup.pdf", width=10, height=6) ggplot(prevdf1, aes(TotalAbundance, Prevalence/nsamples(ps.raw), color = Rank2)) + geom_point(size = 2, alpha = 0.7) + scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") + facet_wrap(~Rank2) + theme(legend.position="none") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) + theme(legend.position = "none") dev.off() # remove rare and low abundant taxa print(ps.raw) tmp = filter_taxa(ps.raw, function(x) sum(x > 2) > (0.05*length(x)), TRUE) print(tmp) # remove missing phyla, if any ps1 = subset_taxa(tmp, !is.na(Rank2)) print(ps1) print(ps.raw) # Filter entries with prevalence = 0, if any . readsumsdf = data.frame(nreads = sort(taxa_sums(ps1), TRUE), sorted = 1:ntaxa(ps1), type = "Taxa") readsumsdf = rbind(readsumsdf, data.frame(nreads = sort(sample_sums(ps1), TRUE), sorted = 1:nsamples(ps1), type = "Samples")) # Plot post-filtering coverage statistics pdf("Taxa_and_sample_coverage_post-filtering.pdf") title = "Total number of reads post-cleanup" p = ggplot(readsumsdf, aes(x = sorted, y = nreads)) + geom_bar(stat = "identity", color = dark.colors[2], fill = dark.colors[2]) p + ggtitle(title) + facet_wrap(~type, 1, scales = "free") + xlab("Sequencing depth (number of reads)") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) dev.off() # calculate coverage post-filtering sample_coverage(ps1) # assess abundance of phyla post filtering prev0 = apply(X = otu_table(ps1), MARGIN = ifelse(taxa_are_rows(ps1), yes = 1, no = 2), FUN = function(x){sum(x > 0)}) prevdf = data.frame(Prevalence = prev0, TotalAbundance = taxa_sums(ps1), tax_table(ps1)) prevdf1 = subset(prevdf, Rank2 %in% get_taxa_unique(ps1, "Rank2")) pdf("Taxa_prevalence_postfiltering.pdf") ggplot(prevdf1, aes(TotalAbundance, Prevalence/nsamples(ps1), color = Rank2)) + geom_point(size = 2, alpha = 0.7) + scale_x_log10() + xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") + facet_wrap(~Rank2) + theme(legend.position="none") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) + theme(legend.position = "none") dev.off() # save cleaned ps object print(ps1) saveRDS(ps1,"Study.metadata.integrated.cln.rds") ``` ## Read cleaned phyloseq object ```{r, echo=TRUE} # read phyloseq object ps.cln=readRDS("Study.metadata.integrated.cln.rds") print(ps.cln) head(sample_data(ps.cln)) sample_data(ps.cln)$Label=paste(sample_data(ps.cln)$Time) table(sample_data(ps.cln)$Label) #include only baseline ps.cln.baseline <- ps.cln %>% subset_samples( Label == "baseline" ) %>% prune_taxa(taxa_sums(.) > 0, .) #include only week4 ps.cln.week4 <- ps.cln %>% subset_samples( Label == "week4" ) %>% prune_taxa(taxa_sums(.) > 0, .) ``` ## Calculate alpha diversity ```{r, echo=TRUE} alldat=ps.cln # Rarefaction statistics xx = data.frame(sample_sums(alldat)) xx$SampleID=rownames(xx) colnames(xx)=c("Coverage","SampleID") xx = xx[order(xx$Coverage),] min(xx$Coverage) depth=10000 maximm=round(min(xx$Coverage)/depth,0) for ( i in 1:maximm){ # calculate alpha diversity by subsampling at the rarefaction depth rfxn=compute_alphadiv(alldat,100,depth) # head(rfxn) # calculate evenness evenness = subset(rfxn, measure=="Shannon") evn=evenness$mean/log(dim(otu_table(alldat))[2]) evenness$mean=evn evenness$sd=rep("NA",dim(evenness)[1]) evenness$measure=rep("Evenness",dim(evenness)[1]) rfxn2=rbind(rfxn,evenness) rfxn2=rfxn2[order(rfxn2$SampleID),] filename=paste("AlphaDiversity",depth,"txt",sep=".") write.table(rfxn2, file = filename, append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", row.names = FALSE, col.names = TRUE) # increase depth by 500 depth=depth+10000 } ad2=read.table("AlphaDiversity.10000.txt", header=TRUE) ad3=read.table("AlphaDiversity.20000.txt", header=TRUE) ad4=read.table("AlphaDiversity.30000.txt", header=TRUE) ad5=read.table("AlphaDiversity.40000.txt", header=TRUE) ad6=read.table("AlphaDiversity.50000.txt", header=TRUE) ad7=read.table("AlphaDiversity.60000.txt", header=TRUE) ad8=read.table("AlphaDiversity.70000.txt", header=TRUE) ad9=read.table("AlphaDiversity.80000.txt", header=TRUE) ad10=read.table("AlphaDiversity.90000.txt", header=TRUE) ad11=read.table("AlphaDiversity.1e+05.txt", header=TRUE) ad12=read.table("AlphaDiversity.110000.txt", header=TRUE) ad13=read.table("AlphaDiversity.120000.txt", header=TRUE) ad14=read.table("AlphaDiversity.130000.txt", header=TRUE) ad15=read.table("AlphaDiversity.140000.txt", header=TRUE) ad16=read.table("AlphaDiversity.150000.txt", header=TRUE) ad17=read.table("AlphaDiversity.160000.txt", header=TRUE) ad18=read.table("AlphaDiversity.170000.txt", header=TRUE) ad19=read.table("AlphaDiversity.180000.txt", header=TRUE) ad20=read.table("AlphaDiversity.190000.txt", header=TRUE) ad21=read.table("AlphaDiversity.2e+05.txt", header=TRUE) ad22=read.table("AlphaDiversity.210000.txt", header=TRUE) ad23=read.table("AlphaDiversity.220000.txt", header=TRUE) ad24=read.table("AlphaDiversity.230000.txt", header=TRUE) ad25=read.table("AlphaDiversity.240000.txt", header=TRUE) ad26=read.table("AlphaDiversity.250000.txt", header=TRUE) ad27=read.table("AlphaDiversity.260000.txt", header=TRUE) ad28=read.table("AlphaDiversity.270000.txt", header=TRUE) ad29=read.table("AlphaDiversity.280000.txt", header=TRUE) ad30=read.table("AlphaDiversity.290000.txt", header=TRUE) ad31=read.table("AlphaDiversity.3e+05.txt", header=TRUE) ad32=read.table("AlphaDiversity.310000.txt", header=TRUE) ad33=read.table("AlphaDiversity.320000.txt", header=TRUE) ad34=read.table("AlphaDiversity.330000.txt", header=TRUE) ad35=read.table("AlphaDiversity.340000.txt", header=TRUE) ad36=read.table("AlphaDiversity.350000.txt", header=TRUE) ad37=read.table("AlphaDiversity.360000.txt", header=TRUE) ad38=read.table("AlphaDiversity.370000.txt", header=TRUE) ad39=read.table("AlphaDiversity.380000.txt", header=TRUE) ad40=read.table("AlphaDiversity.390000.txt", header=TRUE) ad41=read.table("AlphaDiversity.4e+05.txt", header=TRUE) ad42=read.table("AlphaDiversity.410000.txt", header=TRUE) ad43=read.table("AlphaDiversity.420000.txt", header=TRUE) ad44=read.table("AlphaDiversity.430000.txt", header=TRUE) ad45=read.table("AlphaDiversity.440000.txt", header=TRUE) ad46=read.table("AlphaDiversity.450000.txt", header=TRUE) ad47=read.table("AlphaDiversity.460000.txt", header=TRUE) ad48=read.table("AlphaDiversity.470000.txt", header=TRUE) ad=rbind(ad2,ad3,ad4,ad5,ad6,ad7,ad8,ad9,ad10, ad11,ad12,ad13,ad14,ad15,ad16,ad17,ad18,ad19,ad20, ad21,ad22,ad23,ad24,ad25,ad26,ad27,ad28,ad29,ad30, ad31,ad32,ad33,ad34,ad35,ad36,ad37,ad38,ad39,ad40, ad41,ad42,ad43,ad44,ad45,ad46,ad47,ad48) head(ad) head(mer) ad2=merge(mer, ad, by=c("SampleID")) ad2$Dog.y=c() head(ad2) colnames(ad2)=c("SampleID", "Dog", "Time.x", "mean", "sd", "measure", "Time.y", "Label", "PCoA1", "PCoA2", "PCoA3", "PCoA4", "PCoA5", "PCoA6", "PCoA7", "rarefaction") ad2=ad2[order(ad2$rarefaction,ad2$Dog,ad2$Label,ad2$measure),] write.table(ad2, file = "AlphaDiversity.combined.txt", append = FALSE, quote = FALSE, sep = "\t",eol = "\n", na = "NA", dec = ".", row.names = FALSE, col.names = TRUE, qmethod = c("escape", "double"), fileEncoding = "") ``` ### Plot alpha diversity (rarefaction curves) ```{r, echo=TRUE} # Read the concatenated alphadiv file df = read.table("AlphaDiversity.combined.txt", header=TRUE) head(df);dim(df) # Plot Richness rich=data.frame(subset(df,measure=="Richness")) rich$mean=as.numeric(paste(rich$mean)) rich$sd=as.numeric(paste(rich$sd)) rich$rarefaction=as.numeric(paste(rich$rarefaction)) head(rich);dim(rich) # Rarefaction curves by Diet rich_agg.mean = aggregate(rich$mean, by=list(rich$Label,rich$rarefaction), FUN=mean, na.rm=TRUE) rich_agg.sd = aggregate(rich$mean, by=list(rich$Label,rich$rarefaction), FUN=sd, na.rm=TRUE) rich_agg.sem = aggregate(rich$mean, by=list(rich$Label,rich$rarefaction), FUN=st.err) rich_agg = cbind(rich_agg.mean,rich_agg.sd, rich_agg.sem) head(rich_agg) rich_agg[,c(4:5,7:8)]=c() colnames(rich_agg)=c("Label","Rarefaction_depth","mean","sd","se") head(rich_agg) pdf("RarefactionCurves_richness_time.pdf", 6,4) ggplot(rich_agg, aes(Rarefaction_depth, mean, ymin = mean - sd, ymax = mean + sd, colour = Label, group=Label)) + scale_color_manual(values=nn.colors[-1]) + geom_point() + geom_pointrange(aes(ymin=mean-sd, ymax=mean+sd), alpha=0.3) + geom_line() + geom_pointrange(alpha=0.3) + scale_x_continuous(breaks=seq(0,470000,100000)) + ylab("Species Richness") + xlab("Rarefaction depth") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() # Richness: assess significance (using highest rarefaction depth) depth = max(rich$rarefaction);depth mxs = subset(rich, rarefaction == depth) set.seed(seed) shapiro.test(mxs$mean) # normally distributed set.seed(seed) rich.aes=aov(mean~Label,mxs) summary(rich.aes) # p=0.777 set.seed(seed) TukeyHSD(rich.aes) head(mxs) pdf("Richness_time_boxplot.pdf", 4,4) ggplot(mxs, aes(Label, mean)) + geom_boxplot(aes(fill=Label), outlier.shape=NA) + geom_jitter(width=0.1,shape=21,col="black") + scale_fill_manual(values=nn.colors[-1], name = "Label") + ylab("Species Richness") + xlab("Time") + theme_bw() dev.off() # Plot shannon head(df) shannon=data.frame(subset(df,measure=="Shannon")) shannon$mean=as.numeric(paste(shannon$mean)) shannon$sd=as.numeric(paste(shannon$sd)) shannon$rarefaction=as.numeric(paste(shannon$rarefaction)) head(shannon);dim(shannon) # rarefaction curves by Diet shannon_agg.mean = aggregate(shannon$mean, by=list(shannon$Label,shannon$rarefaction), FUN=mean, na.rm=TRUE) shannon_agg.sd = aggregate(shannon$mean, by=list(shannon$Label,shannon$rarefaction), FUN=sd, na.rm=TRUE) shannon_agg.sem = aggregate(shannon$mean, by=list(shannon$Label,shannon$rarefaction), FUN=st.err) shannon_agg = cbind(shannon_agg.mean,shannon_agg.sd, shannon_agg.sem) head(shannon_agg) shannon_agg[,c(4:5,7:8)]=c() colnames(shannon_agg)=c("Label","Rarefaction_depth","mean","sd","se") head(shannon_agg) pdf("RarefactionCurves_shannon_time.pdf", 6,4) ggplot(shannon_agg, aes(Rarefaction_depth, mean, ymin = mean - sd, ymax = mean + sd, colour=Label, group=Label)) + scale_color_manual(values=nn.colors[-1]) + geom_point(alpha=0.3) + geom_line() + geom_pointrange(aes(ymin=mean-sd, ymax=mean+sd),alpha=0.3) + geom_pointrange(alpha=0.3) + scale_x_continuous(breaks=seq(0,470000,100000)) + ylab("Shannon's H") + xlab("Rarefaction depth") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() # shannon: assess significance (using highest rarefaction depth) depth = max(shannon$rarefaction);depth mxs = subset(shannon, rarefaction == depth);head(mxs) set.seed(seed) shapiro.test(mxs$mean) # normally distributed set.seed(seed) shannon.aes=aov(mean~Label,mxs) summary(shannon.aes) kruskal.test(mean~Label,mxs) pdf("Shannon_time_boxplot.pdf", 4,4) ggplot(mxs, aes(Label, mean)) + geom_boxplot(aes(fill=Label),outlier.shape = NA, color="black") + geom_jitter(width=0.1,shape=21, col="black") + scale_fill_manual(values=nn.colors[-1]) + ylab("Shannon's H") + xlab("Time") + theme_bw() dev.off() ``` ### Bray-Curtis distances and statistical assocations ```{r, echo=TRUE} # Load the cleaned ps object print(ps.cln) sample_coverage(ps.cln) # Agglomerate at different taxonomic level ps2.phylum = tax_glom(ps.cln, "Rank2", NArm = FALSE) #phylum ps2.class = tax_glom(ps.cln, "Rank3", NArm = FALSE) #class ps2.order = tax_glom(ps.cln, "Rank4", NArm = FALSE) #order ps2.family = tax_glom(ps.cln, "Rank5", NArm = FALSE) #family ps2.genus = tax_glom(ps.cln, "Rank6", NArm = FALSE) #genus ps2.species = tax_glom(ps.cln, "Rank7", NArm = FALSE) #species saveRDS(ps2.phylum,"Study.metadata.integrated.cln.phylum.rds") saveRDS(ps2.class,"Study.metadata.integrated.cln.class.rds") saveRDS(ps2.order,"Study.metadata.integrated.cln.order.rds") saveRDS(ps2.family,"Study.metadata.integrated.cln.family.rds") saveRDS(ps2.genus,"Study.metadata.integrated.cln.genus.rds") saveRDS(ps2.species,"Study.metadata.integrated.cln.species.rds") # Change ps2 to different taxonomic levels ps2.species = readRDS("Study.metadata.integrated.cln.species.rds") ps2 = ps2.species print(ps2) sample_coverage(ps2) # Optional - transform count to relative abundance ps.ra = transform_sample_counts(ps2, function(x) x / sum(x) ) write.csv(otu_table(ps.ra),'relative.abundance.species.csv',row.names = TRUE) # Log transform the raw count data pslog <- transform_sample_counts(ps2, function(x) log(1 + x)) print(pslog) # Calculate Bray-Curtis distance set.seed(seed) bcdist = phyloseq::distance(pslog, method="bray",normalized=TRUE, parallel=FALSE,fast=TRUE) pcoa <- ordinate(pslog, method = "MDS", distance = bcdist) pcoa$vectors[,1] = pcoa$vectors[,1]*-1 evals <- pcoa$values$Eigenvalues # Screeplot pdf("PCoA_screeplot.species.pdf",6,4) plot_scree(pcoa) + xlim(as.character(seq(1,25))) + ggtitle("Bray-Curtis ordination eigenvalues") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() names(pcoa) pcoa$values [1:7,] # Prepare the metadata sample_data(pslog)$PCoA1=pcoa$vectors[,1] sample_data(pslog)$PCoA2=pcoa$vectors[,2] sample_data(pslog)$PCoA3=pcoa$vectors[,3] sample_data(pslog)$PCoA4=pcoa$vectors[,4] sample_data(pslog)$PCoA5=pcoa$vectors[,5] sample_data(pslog)$PCoA6=pcoa$vectors[,6] sample_data(pslog)$PCoA7=pcoa$vectors[,7] sample_data(ps2)=sample_data(pslog) sampledf=data.frame(sample_data(pslog)) head(sampledf) write.csv(sample_data(pslog), "MB.PCoA.scores.csv") # Add HR, MR, LR to the metadata sample_data(pslog)$PCo.Group <- c("LR", "MR", "MR", "LR", "MR", "MR", "LR", "MR", "MR", "LR", "MR", "MR", "HR", "HR", "LR", "LR", "HR", "HR", "HR", "HR", "HR", "LR", "MR", "MR", "LR", "HR", "MR", "HR", "HR", "HR", "HR", "MR", "LR", "LR", "MR", "LR", "MR", "LR", "LR", "LR", "HR", "HR", "MR", "HR", "LR", "MR", "HR", "MR", "HR", "LR", "LR", "MR", "LR", "HR", "LR", "LR") sample_data(ps2)=sample_data(pslog) sampledf=data.frame(sample_data(pslog)) head(sampledf) # Assess significance (using Permanova and GLM; validation of KW test) names(sampledf) set.seed(seed) prm_ab=adonis2(bcdist ~ Label, data = sampledf, permutations = 9999) prm_ab # Check dispersion groups <- sampledf[["Label"]] dispgrps <-betadisper(bcdist, groups, type=c("median")) anova(dispgrps) plot(dispgrps, col=pop.kolor) dispgrps boxplot(dispgrps) # Plot PCoA pdf("PCoA_abundance_diet.pdf", 6,3) plot_ordination(pslog, pcoa, color="Label") + scale_color_manual(values = nn.colors[-1]) + geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() pdf("PCoA1_ridges.pdf",6,6) ggplot(sampledf, aes(x = PCoA1, y = Label, fill=Label)) + geom_density_ridges(scale = 1,alpha=1, aes(fill=Label)) + scale_fill_manual(values=nn.colors[-1]) + scale_y_discrete(expand = c(0.01, 0), name="Diet") + scale_x_continuous(expand = c(0.01, 0)) + theme_ridges() #+ facet_wrap(~Diet) dev.off() pdf("PCoA2_ridges.pdf",6,6) ggplot(sampledf, aes(x = PCoA2, y = Label, fill=Label)) + geom_density_ridges(scale = 1,alpha=1, aes(fill=Label)) + scale_fill_manual(values=nn.colors[-1]) + scale_y_discrete(expand = c(0.01, 0)) + scale_x_continuous(expand = c(0.01, 0)) + theme_ridges() #+ facet_wrap(~Diet) dev.off() # Statistical association of PCoAs # Pcoa1 shapiro.test(sampledf$PCoA1) # normally distributed set.seed(seed) kruskal.test(PCoA1~Label,data=sampledf) # p-value = 0.00286 sampledf <- sampledf %>% arrange(Time, Dog) wilcox.test(sampledf$PCoA1[1:28], sampledf$PCoA1[29:56], paired=T) # Visualize the difference using boxplots pdf("PCoA1_boxplot.pdf",6,4) ggplot(sampledf,aes(Label,PCoA1, fill=Label)) + geom_violin() + scale_fill_manual(values=nn.colors[-1]) + geom_jitter(width=0.05,col="black",size=1,shape=21) + xlab("") + theme_bw() + theme(legend.position="none") #+ facet_wrap(~Diet) dev.off() # Pcoa2 shapiro.test(sampledf$PCoA2) # normally distributed set.seed(seed) kruskal.test(PCoA2~Label,data=sampledf) # p-value = 0.2582 # visualize the difference using boxplots pdf("PCoA2_boxplot.pdf",6,4) ggplot(sampledf,aes(Label,PCoA2, fill=Label)) + geom_violin() + scale_fill_manual(values=nn.colors[-1]) + geom_jitter(width=0.05,col="black",size=1,shape=21) + xlab("") + theme_bw() + theme(legend.position="none") dev.off() # Pcoa3 shapiro.test(sampledf$PCoA3) # normally distributed set.seed(seed) kruskal.test(PCoA3~Label,data=sampledf) # p-value = 0.4126 # Pcoa4 shapiro.test(sampledf$PCoA4) # normally distributed set.seed(seed) kruskal.test(PCoA4~Label,data=sampledf) # p-value = 0.3419 # Pcoa5 shapiro.test(sampledf$PCoA5) # normally distributed set.seed(seed) kruskal.test(PCoA5~Label,data=sampledf) # p-value = 0.6346 m2 <- glm(PCoA1 ~ Label, data = sampledf) qqnorm(residuals(m2)) qqline(residuals(m2),col="red") summary(m2) # associated # the glm corroborates the findings from the kw analyses for PCoA1 m3 <- glm(PCoA2 ~ Label, data = sampledf) qqnorm(residuals(m3)) qqline(residuals(m3),col="red") summary(m3) m4 <- glm(PCoA3 ~ Label, data = sampledf) qqnorm(residuals(m4)) qqline(residuals(m4),col="red") summary(m4) m5 <- glm(PCoA4 ~ Label, data = sampledf) qqnorm(residuals(m5)) qqline(residuals(m5),col="red") summary(m5) m6 <- glm(PCoA5 ~ Label, data = sampledf) qqnorm(residuals(m6)) qqline(residuals(m6),col="red") summary(m6) # associated m7 <- glm(PCoA6 ~ Label, data = sampledf) qqnorm(residuals(m7)) qqline(residuals(m7),col="red") summary(m7) m8 <- glm(PCoA7 ~ Label, data = sampledf) qqnorm(residuals(m8)) qqline(residuals(m8),col="red") summary(m8) ``` ### PAM Cluster by Ordination analysis - baseline vs week 4 ```{r, echo=TRUE} # Cluster library(cluster) NDIM <- 2 x <- pcoa$vectors[,1:NDIM] # rows=sample, cols=MDS axes, entries = value gs = clusGap(x, FUN = pamPCoA, K.max = 5, B = 500) pdf("GapStatistics.pdf", width=4, height=4) plot_clusgap(gs) + scale_x_continuous(breaks=c(seq(0, 5, 1))) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() # The gap statistic suggests 2 distinct clusters K <- 2 x <- pcoa$vectors[,1:NDIM] clust <- as.factor(pam(x, k=K, cluster.only=T)) sample_data(pslog)$CST <- paste("Cluster",clust,sep="") CSTs <- as.character(seq(K)) write.csv(sample_data(pslog), "PAM.baseline.week4.csv") # Cluster by Ordination (MDS) CSTColors <- c("#EE2737", "#69B3E7") names(CSTColors) <- paste("Cluster",CSTs,sep="")#CSTs CSTColors CSTColorScale <- scale_colour_manual(name = "CST", values = CSTColors[1:7]) CSTFillScale <- scale_fill_manual(name = "CST", values = CSTColors[1:7]) pdf("PCoA_CST_species.pdf", 6, 4) plot_ordination(pslog, pcoa, color = "CST", shape="Time") + geom_point(colour = "black", aes(shape=Time), size = 2, alpha=0.75) + geom_point(aes(color = CST, shape=Time), size = 1, alpha=0.75) + CSTColorScale + geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() + #xlab("PCoA1 [9.5%]") + ylab("PCoA2 [8.7%]") + theme(legend.position = "none") + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() table(sample_data(pslog)$Time,sample_data(pslog)$CST) mcnemar <- matrix(c(4,1,11,12), nrow=2) mcnemar.test(mcnemar) ``` ### DESeq2 to calculate differences in taxa abundance (week4 vs baseline) ```{r, echo=TRUE} ps2 base <- ps2 %>% subset_samples(Label=="baseline" | Label=="week4") %>% prune_taxa(taxa_sums(.) > 0, .) base table(sample_data(base)$Label) diagdds = phyloseq_to_deseq2(base, ~ Label) diagdds = DESeq(diagdds, fitType = c("parametric")) write.csv(otu_table(base),'DESeq.input.phylum.csv',row.names = TRUE) write.csv(otu_table(base),'DESeq.input.class.csv',row.names = TRUE) write.csv(otu_table(base),'DESeq.input.order.csv',row.names = TRUE) write.csv(otu_table(base),'DESeq.input.family.csv',row.names = TRUE) write.csv(otu_table(base),'DESeq.input.genus.csv',row.names = TRUE) write.csv(otu_table(base),'DESeq.input.species.csv',row.names = TRUE) res = results(diagdds, cooksCutoff = FALSE) hist(res$pvalue) alpha = 1 sigtab = res[which(res$padj < alpha), ] sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps2)[rownames(sigtab), ], "matrix")) sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),] head(sigtab) dim(sigtab) sigtab$Taxaname=row.names(sigtab) names(sigtab) sigtab.all=data.frame(sigtab) kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue", "magenta","darkgreen","linen", "mistyrose","coral3","peachpuff3","lightcyan2","gold4") pdf("DESeq_week4_vs_baseline.pdf",6,4) ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + geom_point(size=1,alpha=1,shape=21) + scale_colour_manual(values=kolor) + geom_vline(xintercept = -2, lty=3, color="black") + geom_vline(xintercept = 2, lty=3, color="black") + geom_hline(yintercept = -log10(0.05), lty=3, color="black") + ylim(0,11) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(axis.text.x = element_text(angle = 0, hjust = 0, vjust=0.5)) dev.off() write.csv(res, file = "DESeq.output.phylum.csv", row.names = TRUE) write.csv(res, file = "DESeq.output.class.csv", row.names = TRUE) write.csv(res, file = "DESeq.output.order.csv", row.names = TRUE) write.csv(res, file = "DESeq.output.family.csv", row.names = TRUE) write.csv(res, file = "DESeq.output.genus.csv", row.names = TRUE) write.csv(res, file = "DESeq.output.species.csv", row.names = TRUE) ``` ## ###################################### ## Microbiome Analysis with LR, MR, HR ## ## ###################################### #Lineplot ```{r,echo=T} pcoa1 <- sample_data(ps2) pdf("PCoA1.lineplots.HR.MR.LR.pdf",6,6) ggplot(pcoa1, aes(x=Time, y=PCoA1, color=PCo.Group, group=Dog)) + geom_line() + geom_point(shape=20, alpha=1, color='black') + scale_color_manual(values = c(nn.colors[-1], nn.colors[1],"#E6A65D")) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + scale_x_discrete(name ="Time point") dev.off() ``` # Calculate Bray-Curtis distance only for baseline samples for HR, MR, LR ```{r, echo=TRUE} ps2; pslog; ps2.baseline <- ps2 %>% subset_samples(Label == "baseline") %>% prune_taxa(taxa_sums(.) > 0, .) pslog.baseline <- pslog %>% subset_samples(Label == "baseline") %>% prune_taxa(taxa_sums(.) > 0, .) set.seed(seed) bcdist = phyloseq::distance(pslog.baseline, method="bray",normalized=TRUE, parallel=FALSE,fast=TRUE) pcoa.baseline <- ordinate(pslog.baseline, method = "MDS", distance = bcdist) evals <- pcoa.baseline$values$Eigenvalues pcoa.baseline$vectors[,1] <- pcoa.baseline$vectors[,1]*-1 sampledf=data.frame(sample_data(pslog.baseline)) head(sampledf) #Meta data sample_data(pslog.baseline)$PCoA1=pcoa.baseline$vectors[,1] sample_data(pslog.baseline)$PCoA2=pcoa.baseline$vectors[,2] sample_data(pslog.baseline)$PCoA3=pcoa.baseline$vectors[,3] sample_data(pslog.baseline)$PCoA4=pcoa.baseline$vectors[,4] sample_data(pslog.baseline)$PCoA5=pcoa.baseline$vectors[,5] sample_data(pslog.baseline)$PCoA6=pcoa.baseline$vectors[,6] sample_data(pslog.baseline)$PCoA7=pcoa.baseline$vectors[,7] sample_data(ps2.baseline)=sample_data(pslog.baseline) sampledf=data.frame(sample_data(pslog.baseline)) head(sampledf) # assess significance (using Permanova and GLM; validation of KW test) names(sampledf) set.seed(seed) prm_ab=adonis2(bcdist ~ PCo.Group, data = sampledf, permutations = 9999) prm_ab # Plot PCoA pdf("PCoA_abundance_HR_MR_LR.baseline.pdf", 6,3) plot_ordination(pslog.baseline, pcoa.baseline, color="PCo.Group") + scale_color_manual(values = nn.colors) + geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() ``` # PAM Cluster by Ordination analysis - HR, MR, LR ```{r, echo=TRUE} # Cluster library(cluster) NDIM <- 2 x <- pcoa.baseline$vectors[,1:NDIM] # rows=sample, cols=MDS axes, entries = value gs = clusGap(x, FUN = pamPCoA, K.max = 5, B = 500) pdf("GapStatistics.HR.MR.LR.pdf", width=4, height=4) plot_clusgap(gs) + scale_x_continuous(breaks=c(seq(0, 5, 1))) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() # The gap statistic suggests 2 distinct clusters K <- 2 x <- pcoa.baseline$vectors[,1:NDIM] clust <- as.factor(pam(x, k=K, cluster.only=T)) sample_data(pslog.baseline)$CST <- paste("Cluster",clust,sep="") CSTs <- as.character(seq(K)) # Cluster by Ordination (MDS) CSTColors <- c(nn.colors, "#FF9E1B", "#EF82A9") names(CSTColors) <- paste("Cluster",CSTs,sep="")#CSTs CSTColors CSTColorScale <- scale_colour_manual(name = "CST", values = CSTColors[1:7]) CSTFillScale <- scale_fill_manual(name = "CST", values = CSTColors[1:7]) pdf("PCoA_CST_species.HR.MR.LR.pdf", 6, 4) plot_ordination(pslog.baseline, pcoa.baseline, color = "PCo.Group", shape="CST") + geom_point(colour = "black", aes(shape=CST), size = 2, alpha=0.75) + geom_point(aes(color = PCo.Group, shape=CST), size = 2, alpha=0.75) + CSTColorScale + geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() + #xlab("PCoA1 [9.5%]") + ylab("PCoA2 [8.7%]") + theme(legend.position = "right") + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() table(sample_data(pslog.baseline)$PCo.Group,sample_data(pslog.baseline)$CST) fisher <- matrix(c(8,2,2,1,8,7), nrow = 3) fisher.test(fisher) ``` # Supervised learning - Random Forest Test (Abundance) for HR, MR, LR ```{r, echo=TRUE} ps2.RF <- ps2.baseline; pslog.RF <- pslog.baseline; sample_data(ps2.RF)$PCo.Group sample_data(pslog.RF)$PCo.Group sample_data(ps2.RF)$PCo.Group <- c("LM", "LM", "LM", "LM", "LM", "LM", "HR", "LM", "HR", "HR", "LM", "LM", "HR", "LM", "HR", "LM", "LM", "LM", "LM", "LM", "HR", "HR", "LM", "LM", "HR", "LM", "HR", "LM") sample_data(pslog.RF)$PCo.Group <- c("LM", "LM", "LM", "LM", "LM", "LM", "HR", "LM", "HR", "HR", "LM", "LM", "HR", "LM", "HR", "LM", "LM", "LM", "LM", "LM", "HR", "HR", "LM", "LM", "HR", "LM", "HR", "LM") # prepare the data dataMatrix <- data.frame(var = sample_data(pslog.RF)$PCo.Group, t(otu_table(pslog.RF))) set.seed(seed) spl <- sample.split(dataMatrix$var, SplitRatio = 0.7) Train <- subset(dataMatrix, spl == TRUE);dim(Train) Test <- subset(dataMatrix, spl == FALSE);dim(Test) # run randomForest with repeated cross validation (10X CV times 3) control <- trainControl(method="repeatedcv", number=5, repeats=3, search="random") set.seed(seed) mtry <- sqrt(ncol(dataMatrix)-1) rf_train <- train(var~., data=Train, method="rf", metric="Accuracy", tuneLength=15, trControl=control) print(rf_train) plot(rf_train) rf_train$finalModel head(names(Test[,-1])) rf_pred <- predict(rf_train, Test[,-1]) confusionMatrix(rf_pred, as.factor(Test$var)) set.seed(seed) prediction_for_roc_curve <- predict(rf_train,Test[-1],type="prob") classes=levels(Test$var) prediction_for_roc_curve <- predict(rf_train,Test,type="prob") kolor=pop.kolor pdf("AUC_abundance_Labels.pdf",3,3) for (i in 1:3){ # Define which observations belong to class[i] true_values <- ifelse(Test$var==classes[i],1,0) # Assess the performance of classifier for class[i] pred <- prediction(prediction_for_roc_curve[,i],true_values) perf <- performance(pred, "tpr", "fpr") if (i==1){ plot(perf,col=kolor[i], lwd=2) } else{ plot(perf,col=kolor[i],add=TRUE,lwd=2) } # Calculate the AUC and print it to screen auc.perf <- performance(pred, measure = "auc") print(auc.perf@y.values) } dev.off() ``` ##Abundance by DESeq - baseline HR vs LR ```{r,echo=T} # pull up only week 0 data ps2.baseline <- ps2 %>% subset_samples(Label == "baseline") %>% prune_taxa(taxa_sums(.) > 0, .) ps2.baseline sample_data(ps2.baseline) sample_data(ps2.baseline)$Dog #remove MR baseline <- ps2.baseline %>% subset_samples(PCo.Group != "MR") %>% prune_taxa(taxa_sums(.) > 0, .) baseline sample_data(baseline) table(sample_data(baseline)$PCo.Group) diagdds = phyloseq_to_deseq2(baseline, ~ PCo.Group) diagdds = DESeq(diagdds, fitType = c("parametric")) res = results(diagdds, cooksCutoff = FALSE) hist(res$pvalue) alpha = 1 sigtab = res[which(res$padj < alpha), ] sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(baseline)[rownames(sigtab), ], "matrix")) sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),] sigtab$log2FoldChange <- (-1)*sigtab$log2FoldChange head(sigtab) dim(sigtab) sigtab$Taxaname=row.names(sigtab) names(sigtab) sigtab.all=data.frame(sigtab) kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue", "magenta","darkgreen","linen", "mistyrose","coral3","peachpuff3","lightcyan2","gold4") pdf("DESeq.DASpp.week0.HR.vs.LR.pdf",6,4) ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + geom_point(size=1,alpha=1,shape=21) + scale_colour_manual(values=kolor) + geom_vline(xintercept = -1, lty=3, color="black") + geom_vline(xintercept = 1, lty=3, color="black") + geom_hline(yintercept = -log10(0.05), lty=3, color="black") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ylim(0,11.5) + xlim(-30,30) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) dev.off() write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.week0.HR.vs.LR.csv", row.names = FALSE) ``` ##Alpha-diversity in HR, LR, MR ```{r,echo=T} ##read data adiv <- read.csv("Diversity and evenness at 470000 reads.csv",header = T) ##Evenness at baseline, week 4 evenness <- adiv[adiv$measure=="Evenness",] evenness.0 <- evenness[evenness$Time=="baseline",] evenness.4 <- evenness[evenness$Time=="week4",] kruskal.test(mean ~ Label, data = evenness.0) #p = 0.3534 kruskal.test(mean ~ Label, data = evenness.4) #p = 0.9472 ##Fisher at baseline, week 4 fisher <- adiv[adiv$measure=="Fisher",] fisher.0 <- fisher[fisher$Time=="baseline",] fisher.4 <- fisher[fisher$Time=="week4",] kruskal.test(mean ~ Label, data = fisher.0) #p = 0.9574 kruskal.test(mean ~ Label, data = fisher.4) #p = 0.7568 ##Fisher at baseline, week 4 richness <- adiv[adiv$measure=="Richness",] richness.0 <- richness[richness$Time=="baseline",] richness.4 <- richness[richness$Time=="week4",] kruskal.test(mean ~ Label, data = richness.0) #p = 0.9574 kruskal.test(mean ~ Label, data = richness.4) #p = 0.7568 ##Shannon at baseline, week 4 shannon <- adiv[adiv$measure=="Shannon",] shannon.0 <- shannon[shannon$Time=="baseline",] shannon.4 <- shannon[shannon$Time=="week4",] kruskal.test(mean ~ Label, data = shannon.0) #p = 0.3534 kruskal.test(mean ~ Label, data = shannon.4) #p = 0.9472 ##Simpson at baseline, week 4 simpson <- adiv[adiv$measure=="Simpson",] simpson.0 <- simpson[simpson$Time=="baseline",] simpson.4 <- simpson[simpson$Time=="week4",] kruskal.test(mean ~ Label, data = simpson.0) #p = 0.3534 kruskal.test(mean ~ Label, data = simpson.4) #p = 0.9472 ``` ##PCoA score at baseline and week 4 in HR, MR, LR ```{r,echo=T} #read data pcoa.data <- sample_data(ps2) pcoa.data$PCoA1.group <- pcoa.data$PCo.Group head(pcoa.data) names(pcoa.data) #baseline PCoA scores summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="LR" & pcoa.data$Time=="baseline"]) sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="LR" & pcoa.data$Time=="baseline"]) summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="MR" & pcoa.data$Time=="baseline"]) sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="MR" & pcoa.data$Time=="baseline"]) summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="HR" & pcoa.data$Time=="baseline"]) sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="HR" & pcoa.data$Time=="baseline"]) #kruskal test kruskal.test(pcoa.data$PCoA1[pcoa.data$Time=="baseline"] ~ pcoa.data$PCoA1.group[pcoa.data$Time=="baseline"]) pairwise.wilcox.test(pcoa.data$PCoA1[pcoa.data$Time=="baseline"], pcoa.data$PCoA1.group[pcoa.data$Time=="baseline"], p.adjust.method = "BH") posthoc.kruskal.dunn.test(pcoa.data$PCoA1[pcoa.data$Time=="baseline"], as.factor(pcoa.data$PCoA1.group[pcoa.data$Time=="baseline"]), p.adjust.method = "BH") #plot pcoa.data$PCoA1.group <- factor(pcoa.data$PCoA1.group, levels=c("HR", "MR", "LR")) plot(pcoa.data$PCoA1[pcoa.data$Time=="baseline"] ~ pcoa.data$PCoA1.group[pcoa.data$Time=="baseline"], ylab="PCoA1 at baseline", xlab="PCoA1 change") #week 4 PCoA scores summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="LR" & pcoa.data$Time=="week4"]) sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="LR" & pcoa.data$Time=="week4"]) summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="MR" & pcoa.data$Time=="week4"]) sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="MR" & pcoa.data$Time=="week4"]) summary(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="HR" & pcoa.data$Time=="week4"]) sd(pcoa.data$PCoA1[pcoa.data$PCoA1.group=="HR" & pcoa.data$Time=="week4"]) #kruskal test kruskal.test(pcoa.data$PCoA1[pcoa.data$Time=="week4"] ~ pcoa.data$PCoA1.group[pcoa.data$Time=="week4"]) ``` ##Abundance by DESeq - week 4 HR vs LR ```{r,echo=T} # pull up only week 4 data ps2.week4 <- ps2 %>% subset_samples(Label == "week4") %>% prune_taxa(taxa_sums(.) > 0, .) week4 sample_data(ps2.week4) sample_data(ps2.week4)$Dog # remove MR week4 <- ps2.week4 %>% subset_samples(PCo.Group != "MR") %>% prune_taxa(taxa_sums(.) > 0, .) week4 sample_data(week4) table(sample_data(week4)$PCo.Group) diagdds = phyloseq_to_deseq2(week4, ~ PCo.Group) diagdds = DESeq(diagdds, fitType = c("parametric")) res = results(diagdds, cooksCutoff = FALSE) hist(res$pvalue) alpha = 1 sigtab = res[which(res$padj < alpha), ] sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(week4)[rownames(sigtab), ], "matrix")) sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),] sigtab$log2FoldChange <- (-1)*sigtab$log2FoldChange head(sigtab) dim(sigtab) sigtab$Taxaname=row.names(sigtab) names(sigtab) sigtab.all=data.frame(sigtab) kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue", "magenta","darkgreen","linen", "mistyrose","coral3","peachpuff3","lightcyan2","gold4") pdf("DESeq.DASpp.week4.HR.vs.LR.pdf",6,4) ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + #geom_point(size=2,color="black") + geom_point(size=1,color="white") + geom_point(size=1,alpha=1,shape=21) + scale_colour_manual(values=kolor) + geom_vline(xintercept = -2, lty=3, color="black") + geom_vline(xintercept = 2, lty=3, color="black") + geom_hline(yintercept = -log10(0.05), lty=3, color="black") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ylim(0,11.5) + xlim(-30,30) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) dev.off() write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.week4.HR.vs.LR.csv", row.names = FALSE) ``` ##Abundance by DESeq - HR week 4 vs baseline ```{r,echo=T} HR <- ps2 %>% subset_samples(PCo.Group == "HR") %>% prune_taxa(taxa_sums(.) > 0, .) HR sample_data(HR) table(sample_data(HR)$Label) diagdds = phyloseq_to_deseq2(HR, ~ Label) diagdds = DESeq(diagdds, fitType = c("parametric")) res = results(diagdds, cooksCutoff = FALSE) hist(res$pvalue) alpha = 1 sigtab = res[which(res$padj < alpha), ] sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps2)[rownames(sigtab), ], "matrix")) sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),] head(sigtab) dim(sigtab) sigtab$Taxaname=row.names(sigtab) names(sigtab) sigtab.all=data.frame(sigtab) kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue", "magenta","darkgreen","linen", "mistyrose","coral3","peachpuff3","lightcyan2","gold4") pdf("DESeq.DASpp.HR.week4.vs.week0.pdf",6,4) ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + #geom_point(size=2,color="black") + geom_point(size=1,color="white") + geom_point(size=1,alpha=1,shape=21) + scale_colour_manual(values=kolor) + geom_vline(xintercept = -2, lty=3, color="black") + geom_vline(xintercept = 2, lty=3, color="black") + geom_hline(yintercept = -log10(0.05), lty=3, color="black") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ylim(0,35) + xlim(-30,30) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) dev.off() write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.HR.week4.vs.week0.csv", row.names = FALSE) ``` ##Abundance by DESeq - LR week 4 vs baseline ```{r,echo=T} LR <- ps2 %>% subset_samples(PCo.Group == "LR") %>% prune_taxa(taxa_sums(.) > 0, .) LR sample_data(LR) table(sample_data(LR)$Label) diagdds = phyloseq_to_deseq2(LR, ~ Label) diagdds = DESeq(diagdds, fitType = c("parametric")) res = results(diagdds, cooksCutoff = FALSE) hist(res$pvalue) alpha = 1 sigtab = res[which(res$padj < alpha), ] sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(ps2)[rownames(sigtab), ], "matrix")) sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),] head(sigtab) dim(sigtab) sigtab$Taxaname=row.names(sigtab) names(sigtab) sigtab.all=data.frame(sigtab) kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue", "magenta","darkgreen","linen", "mistyrose","coral3","peachpuff3","lightcyan2","gold4") pdf("DESeq.DASpp.LR.week4.vs.week0.pdf",6,4) ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + #geom_point(size=2,color="black") + geom_point(size=1,color="white") + geom_point(size=1,alpha=1,shape=21) + scale_colour_manual(values=kolor) + geom_vline(xintercept = -2, lty=3, color="black") + geom_vline(xintercept = 2, lty=3, color="black") + geom_hline(yintercept = -log10(0.05), lty=3, color="black") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ylim(0,35) + xlim(-30,30) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) dev.off() write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.LR.week4.vs.week0.csv", row.names = FALSE) ``` ##Abundance by DESeq - Baseline Better vs About the same health ```{r,echo=T} # add overall.health data sample_data(ps2)$overall.health <- c("Better", "Better", "Better", "Better", "Same", "Same", "Same", "Better", "Better", "Same", "Better", "Better", "Better", "Better", "Better", "Better", "Same", "Same", "Better", "Same", "Same", "Same", "Same", "Same", "Same", "Better", "No data", "Better", "Better", "Better", "Better", "Better", "Better", "Same", "Same", "Better", "Better", "Better", "Same", "Same", "Same", "Better", "Better", "Same", "No data", "Better", "Better", "Same", "Better", "Better", "No data", "No data", "Better", "Better", "Better", "Same") health <- ps2 %>% subset_samples(overall.health != "No data") %>% prune_taxa(taxa_sums(.) > 0, .) baseline.health <- health %>% subset_samples(Label == "baseline") %>% prune_taxa(taxa_sums(.) > 0, .) baseline.health sample_data(baseline.health) table(sample_data(baseline.health)$overall.health) diagdds = phyloseq_to_deseq2(baseline.health, ~ overall.health) diagdds = DESeq(diagdds, fitType = c("parametric")) res = results(diagdds, cooksCutoff = FALSE) hist(res$pvalue) alpha = 1 sigtab = res[which(res$padj < alpha), ] sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(baseline.health)[rownames(sigtab), ], "matrix")) sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),] head(sigtab) dim(sigtab) sigtab$Taxaname=row.names(sigtab) names(sigtab) sigtab.all=data.frame(sigtab) kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue", "magenta","darkgreen","linen", "mistyrose","coral3","peachpuff3","lightcyan2","gold4") pdf("DESeq.DASpp.LR.week4.vs.week0.pdf",6,4) ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + #geom_point(size=2,color="black") + geom_point(size=1,color="white") + geom_point(size=1,alpha=1,shape=21) + scale_colour_manual(values=kolor) + geom_vline(xintercept = -2, lty=3, color="black") + geom_vline(xintercept = 2, lty=3, color="black") + geom_hline(yintercept = -log10(0.05), lty=3, color="black") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ylim(0,11.5) + xlim(-30,30) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) dev.off() write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.week0.better.health.vs.null.health.csv", row.names = FALSE) ``` ##Abundance by DESeq - Week 4 Better vs About the same health ```{r,echo=T} health <- ps2 %>% subset_samples(overall.health != "No data") %>% prune_taxa(taxa_sums(.) > 0, .) week4.health <- health %>% subset_samples(Label == "week4") %>% prune_taxa(taxa_sums(.) > 0, .) week4.health sample_data(week4.health) table(sample_data(week4.health)$overall.health) diagdds = phyloseq_to_deseq2(week4.health, ~ overall.health) diagdds = DESeq(diagdds, fitType = c("parametric")) res = results(diagdds, cooksCutoff = FALSE) hist(res$pvalue) alpha = 1 sigtab = res[which(res$padj < alpha), ] sigtab = cbind(as(sigtab, "data.frame"), as(tax_table(week4.health)[rownames(sigtab), ], "matrix")) sigtab = sigtab[order(-abs(sigtab$log2FoldChange),sigtab$padj),] head(sigtab) dim(sigtab) sigtab$Taxaname=row.names(sigtab) names(sigtab) sigtab.all=data.frame(sigtab) kolor = c("darkgrey","olivedrab","navy","orange","red","dodgerblue", "magenta","darkgreen","linen", "mistyrose","coral3","peachpuff3","lightcyan2","gold4") pdf("DESeq.DASpp.week4.better.health.vs.null.health.pdf",6,4) ggplot(sigtab.all, aes(log2FoldChange, -log10(padj), color=Rank2)) + #geom_point(size=2,color="black") + geom_point(size=1,color="white") + geom_point(size=1,alpha=1,shape=21) + scale_colour_manual(values=kolor) + geom_vline(xintercept = -2, lty=3, color="black") + geom_vline(xintercept = 2, lty=3, color="black") + geom_hline(yintercept = -log10(0.05), lty=3, color="black") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ylim(0,4) + xlim(-30,30) + theme(axis.text.x = element_text(angle = -90, hjust = 0, vjust=0.5)) dev.off() write.csv(sigtab.all, file = "DifferentiallyAbundantSpecies.week4.better.health.vs.null.health.csv", row.names = FALSE) ``` ##Alpha-diversity - Better vs About the same health ```{r,echo=T} ##read data adiv <- read.csv("Diversity and evenness at 470000 reads.csv",header = T) ##Evenness at baseline, week 4 evenness <- adiv[adiv$measure=="Evenness",] table(evenness$Health) evenness <- evenness[evenness$Health!="",] table(evenness$Health) evenness.0 <- evenness[evenness$Time=="baseline",] evenness.4 <- evenness[evenness$Time=="week4",] kruskal.test(mean ~ Health, data = evenness.0) kruskal.test(mean ~ Health, data = evenness.4) summary(evenness.4$mean[evenness.4$Health == "About the same"]) sd(evenness.4$mean[evenness.4$Health == "About the same"]) summary(evenness.4$mean[evenness.4$Health == "Better"]) sd(evenness.4$mean[evenness.4$Health == "Better"]) ##Fisher at baseline, week 4 fisher <- adiv[adiv$measure=="Fisher",] table(fisher$Health) fisher <- fisher[fisher$Health!="",] table(fisher$Health) fisher.0 <- fisher[fisher$Time=="baseline",] fisher.4 <- fisher[fisher$Time=="week4",] kruskal.test(mean ~ Health, data = fisher.0) kruskal.test(mean ~ Health, data = fisher.4) ##Richness at baseline, week 4 richness <- adiv[adiv$measure=="Richness",] table(richness$Health) richness <- richness[richness$Health!="",] table(richness$Health) richness.0 <- richness[richness$Time=="baseline",] richness.4 <- richness[richness$Time=="week4",] kruskal.test(mean ~ Health, data = richness.0) kruskal.test(mean ~ Health, data = richness.4) ##Shannon at baseline, week 4 shannon <- adiv[adiv$measure=="Shannon",] table(shannon$Health) shannon <- shannon[shannon$Health!="",] table(shannon$Health) shannon.0 <- shannon[shannon$Time=="baseline",] shannon.4 <- shannon[shannon$Time=="week4",] kruskal.test(mean ~ Health, data = shannon.0) kruskal.test(mean ~ Health, data = shannon.4) summary(shannon.4$mean[shannon.4$Health == "About the same"]) sd(shannon.4$mean[shannon.4$Health == "About the same"]) summary(shannon.4$mean[shannon.4$Health == "Better"]) sd(shannon.4$mean[shannon.4$Health == "Better"]) ##Simpson at baseline, week 4 simpson <- adiv[adiv$measure=="Simpson",] table(simpson$Health) simpson <- simpson[simpson$Health!="",] table(simpson$Health) simpson.0 <- simpson[simpson$Time=="baseline",] simpson.4 <- simpson[simpson$Time=="week4",] kruskal.test(mean ~ Health, data = simpson.0) #p = 0.4606 kruskal.test(mean ~ Health, data = simpson.4) #p = 0.02343 summary(simpson.4$mean[simpson.4$Health == "About the same"]) sd(simpson.4$mean[simpson.4$Health == "About the same"]) summary(simpson.4$mean[simpson.4$Health == "Better"]) sd(simpson.4$mean[simpson.4$Health == "Better"]) ``` ## #################################### ## Microbiome analysis: KO data ## ## #################################### ## read phyloseq object (KO terms) ```{r, echo=TRUE} # read phyloseq object ko1=readRDS("KO.CBfiltered.metadata.integrated.rds") print(ko1) head(sample_data(ko1)) head(tax_table(ko1)) head(otu_table(ko1)) table(sample_data(ko1)$Dog,sample_data(ko1)$Time) ko2=ko1 ``` ## CLEAN DATA ```{r, echo=TRUE} # check for KO terms with 0 reads print(ko2) ko2= prune_taxa(taxa_sums(ko2)>0,ko2) ko0 <- subset_taxa(ko2, !is.na(KO) & !KO %in% c("", "XXX")) print(ko0) # assess abundance of KO terms prev0 = apply(X = otu_table(ko0), MARGIN = ifelse(taxa_are_rows(ko0), yes = 1, no = 2), FUN = function(x){sum(x > 0)}) prevdf = data.frame(Prevalence = prev0, TotalAbundance = taxa_sums(ko0), tax_table(ko0)) prevdf1 = subset(prevdf,KO %in% get_taxa_unique(ko0, "KO")) # remove low abundance terms (single read or <=5% of samples) xx=subset(prevdf1,TotalAbundance<=1 | Prevalence/nsamples(ko0)<=0.05) #if using count data head(xx);dim(xx) xx$Type=c("Ignore") yy=subset(prevdf1,TotalAbundance>1 & Prevalence/nsamples(ko0)>0.05) #if using count data head(yy);dim(yy) yy$Type=c("Include") prevdf1=rbind(xx,yy) head(prevdf1);dim(prevdf1) print(ko0) pdf("KO_prevalence.relative.prefiltering.pdf") ggplot(prevdf1, aes(y=TotalAbundance, x=Prevalence, color=Type)) + geom_point(shape=21, size = 1, alpha = 0.5) + scale_color_manual(values=nn.colors) + ylab("TotalAbundance") + xlab("Prevalence") + theme(legend.position="none") + theme_minimal() + theme_bw() dev.off() filterKOs = xx$KO ko3 = subset_taxa(ko0, !KO %in% filterKOs) print(ko3) # save cleaned ps object ko4 = prune_samples(sample_sums(ko3)>=0.9, ko3) print(ko4) saveRDS(ko4,"KO.CBfiltered.metadata.merged.cln.rds") ``` ## read phyloseq object (KO terms) ```{r, echo=TRUE} # read phyloseq object ko.cln=readRDS("KO.CBfiltered.metadata.merged.cln.rds") write.table(tax_table(ko.cln), file = "module.clean.txt", append = FALSE, quote = FALSE, sep = "\t", eol = "\n", na = "NA", dec = ".", row.names = TRUE, col.names = TRUE) print(ko.cln) head(sample_data(ko.cln)) sample_data(ko.cln)$Label=sample_data(ko.cln)$Time table(sample_data(ko.cln)$Label) head(tax_table(ko.cln)) table(sample_data(ko.cln)$Label) ko.cln ``` ### Bray-Curtis distances and statistical assocations ```{r, echo=TRUE} # load the cleaned ps object print(ko.cln) # log transform the raw count data ko.cln.log <- transform_sample_counts(ko.cln, function(x) log(1 + x)) print(ko.cln.log) # calculate Bray-Curtis distance set.seed(seed) bcdist = phyloseq::distance(ko.cln.log, method="bray",normalized=TRUE, parallel=FALSE,fast=TRUE) pcoa <- ordinate(ko.cln.log, method = "MDS", distance = bcdist) evals <- pcoa$values$Eigenvalues # screeplot pdf("PCoA_screeplot.KO.CBfiltered.log.pdf",6,4) plot_scree(pcoa) + xlim(as.character(seq(1,25))) + ggtitle("Bray-Curtis ordination eigenvalues") + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() names(pcoa) pcoa$values [1:5,] # prepare the metadata sample_data(ko.cln.log)$KO.PCoA1=pcoa$vectors[,1] sample_data(ko.cln.log)$KO.PCoA2=pcoa$vectors[,2] sample_data(ko.cln.log)$KO.PCoA3=pcoa$vectors[,3] sample_data(ko.cln.log)$KO.PCoA4=pcoa$vectors[,4] sample_data(ko.cln.log)$KO.PCoA5=pcoa$vectors[,5] sampledf=data.frame(sample_data(ko.cln.log)) names(sampledf) sampledf$Time=as.factor(sampledf$Label) sample_data(ko.cln.log)$Time=as.factor(sample_data(ko.cln.log)$Label) # Plot PCoA pdf("PCoA_KO_time.CBfiltered.log.pdf", 6,3) plot_ordination(ko.cln.log, pcoa, color="Label") + scale_color_manual(values = nn.colors[-1]) + geom_hline(yintercept =0,lty=3) + geom_vline(xintercept =0,lty=3) + coord_fixed(sqrt(evals[2] / evals[1])) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) dev.off() head(sampledf) pdf("PCoA1_KO_ridges.count.pdf",6,6) ggplot(sampledf, aes(x = KO.PCoA1, y = Label, fill=Label)) + geom_density_ridges(scale = 1,alpha=1, aes(fill=Label)) + scale_fill_manual(values=nn.colors[-1]) + scale_y_discrete(expand = c(0.01, 0)) + scale_x_continuous(expand = c(0.01, 0)) + theme_ridges() #+ facet_wrap(~Diet) dev.off() pdf("PCoA1_KO_violin.pdf",6,4) ggplot(sampledf,aes(Label,KO.PCoA1, fill=Label)) + geom_violin() + scale_fill_manual(values=nn.colors[-1]) + geom_jitter(width=0.05,col="black",size=1,shape=21) + xlab("") + theme_bw() + theme(legend.position="none") + labs(y= "KO PCoA1") #+ facet_wrap(~Diet) dev.off() pdf("PCoA2_KO_ridges.count.pdf",6,6) ggplot(sampledf, aes(x = KO.PCoA2, y = Label, fill=Label)) + geom_density_ridges(scale = 1,alpha=1, aes(fill=Label)) + scale_fill_manual(values=nn.colors[-1]) + scale_y_discrete(expand = c(0.01, 0)) + scale_x_continuous(expand = c(0.01, 0)) + theme_ridges() #+ facet_wrap(~Diet) dev.off() # statistical association of PCoAs # pcoa1 shapiro.test(sampledf$KO.PCoA1) # not normally distributed set.seed(seed) kruskal.test(KO.PCoA1~Label,data=sampledf) sampledf <- sampledf %>% arrange(Time, Dog) wilcox.test(sampledf$KO.PCoA1[1:28], sampledf$KO.PCoA1[29:56], paired=T) # pcoa2 shapiro.test(sampledf$KO.PCoA2) # not normally distributed set.seed(seed) kruskal.test(KO.PCoA2~Label,data=sampledf) # p-value = 0.3588 # visualize the difference using boxplots pdf("PCoA1_KO_boxplot.count.pdf",6,4) ggplot(sampledf,aes(Label,KO.PCoA1, fill=Label)) + geom_boxplot(outlier.shape = NA, notch = TRUE) + scale_fill_manual(values=nn.colors[-1]) + geom_jitter(width=0.1,fill="gold",size=1,shape=21,alpha=0.75) + xlab("") + theme_bw() + theme(legend.position="none") #+ facet_wrap(~Diet) dev.off() # visualize the difference using boxplots pdf("PCoA2_KO_boxplot.count.pdf",6,4) ggplot(sampledf,aes(Label,KO.PCoA2, fill=Label)) + geom_boxplot(outlier.shape = NA, notch = TRUE) + scale_fill_manual(values=nn.colors[-1]) + geom_jitter(width=0.1,fill="gold",size=1,shape=21,alpha=0.75) + xlab("") + theme_bw() + theme(legend.position="none") #+ facet_wrap(~Diet) dev.off() # assess significance (simple model using PERMANOVA) sampledf <- data.frame(sample_data(ko.cln.log)) names(sampledf) set.seed(seed) prm_mod=adonis2(bcdist ~ Label, data = sampledf, permutations = 9999) prm_mod ``` ### DESeq2 to calculate differences in KO terms (week4 vs baseline) ```{r, echo=TRUE} #use non-log data ko.cln=readRDS("KO.CBfiltered.metadata.merged.cln.rds") ko.cln base <- ko.cln %>% subset_samples(Time=="baseline" | Time=="week4") %>% prune_taxa(taxa_sums(.) > 0, .) base table(sample_data(base)$Time) diagdds = phyloseq_to_deseq2(base, ~ Time) diagdds = DESeq(diagdds, fitType = c("parametric")) res = results(diagdds, cooksCutoff = FALSE) hist(res$pvalue) res = cbind(as(res, "data.frame"), as(tax_table(ko.cln)[rownames(res), ], "matrix")) res = res[order(-abs(res$log2FoldChange),res$padj),] pdf("DESeq_week4_vs_baseline.KO.pdf",6,4) ggplot(res, aes(log2FoldChange, -log10(padj))) + geom_point(size=2,color="black") + geom_point(size=1,color="white") + geom_point(size=1,alpha=0.75) + scale_colour_manual(values=kolor) + geom_vline(xintercept = -2, lty=3, color="black") + geom_vline(xintercept = 2, lty=3, color="black") + geom_hline(yintercept = -log10(0.05), lty=3, color="black") + ylim(0,18) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(axis.text.x = element_text(angle = 0, hjust = 0, vjust=0.5)) dev.off() res$padj write.csv(res, file = "DESeq.KO.count.csv", row.names = TRUE) ``` ## ################ ## Diet analysis ## ## ################ ## PROTEIN ## ```{r,echo=T} #protein baseline diets #percent dry-matter a.pr.dm <- c(25.56, 34.09, 33.33, 26.67, 32.95, 30, 43.18, 35.56, 28.11, 35.56, 26.67, 26.67, 21.33, 27.28, 37.78) #caloric basis a.pr.cal <- c(65.92, 71.8, 78.43, 66.43, 71.89, 73.31, 97.44, 81.10, 75.30, 86.04, 66.32, 65.66, 49.64, 66.77, 91.94) #protein fresh diets #percent dry-matter a.pr.nn <- c(37.04, 36.96, 32, 36.67) #caloric basis a.pr.nncal <- c(80.71, 67.73, 64.21, 74.37) ``` ## FAT ## ```{r,echo=T} #fat baseline diets #percent dry-matter a.fa.dm <- c(12.22, 22.73, 20, 15.56, 19.32, 17.78, 20.45, 18.34, 8.56, 20, 13.33, 16.67, 18.33, 15.56, 17.78) #caloric basis a.fa.cal <- c(31.53, 47.87, 47.06, 38.75, 42.14, 43.44, 46.15, 41.72, 22.92, 48.4, 33.16, 41.04, 42.66, 37.39, 43.27) #fat fresh diets #percent dry-matter a.fa.nn <- c(18.52, 26.09, 20, 16.67) #caloric basis a.fa.nncal <- c(40.36, 47.81, 40.13, 33.81) ``` ## FIBER ## ```{r,echo=T} #fiber baseline diets #percent dry-matter a.fi.dm <- c(6.33, 3.41, 4.44, 5.56, 3.41, 3.33, 4.55, 4.72, 1.78, 4.44, 4.44, 5.56, 9.22, 3.78, 4.44) #caloric-basis a.fi.cal <- c(16.34, 7.18, 10.46, 13.84, 7.44, 8.15, 10.26, 10.76, 4.76, 10.76, 11.05, 13.68, 21.46, 9.08, 10.82) #fiber fresh diets #percent dry-matter a.fi.nn <- c(3.7, 4.35, 8, 3.33) #caloric basis a.fi.nncal <- c(8.07, 7.97, 16.05, 6.76) ``` ## MOISTURE ## ```{r,echo=T} #percent moisture baseline diets a.mo <- c(10, 12, 10, 10, 12, 10, 12, 10, 10, 10, 10, 10, 10, 10, 10) #percent moisture fresh diets a.mo.nn <- c(73, 77, 75, 70) ``` ## CALORIES ## ```{r,echo=T} #calories baseline diets #calorie density kcal/kg dry-matter a.ca.dm <- c(3876.67, 4747.73, 4250, 4014.44, 4584.09, 4092.22, 4431.82, 4386.67, 3733.33, 4132.22, 4021.11, 4061.11, 4297.78, 4160, 4108.89) #calories fresh diets ##calorie density kcal/kg dry-matter a.ca.nn <- c(4588.889, 5456.52, 4984, 4930) ``` ## IMPORT METDATA ## ```{r,echo=T} #group b <- c("LR", "MR", "MR", "HR", "MR", "HR", "HR", "HR", "MR", "HR", "MR", "LR", "LR", "LR", "MR") #PCoA values PCoA1.baseline <- c(-0.2912434092, -0.05353604557, 0.2881100368, -0.2444149455, 0.03010995198, 0.20169154, -0.3020782323, -0.3866607864, -0.2521250402, -0.3069681731, 0.06988740119, 0.2963869123, 0.1423442863, -0.009072526405, -0.3499042852) PCoA1.change <- c(0.05854814808, 0.1301768087, 0.2053459868, 0.3444328039, 0.07591902822, 0.2533414719, 0.5462731419, 0.2833942851, 0.2208376496, 0.3976327893, 0.07112995234, -0.06805278555, -0.1079867444, -0.01973125854, 0.1485097625) ``` ## Kruskal.wallis tests ## # protein DM ```{r,echo=T} kruskal.test(a.pr.dm, b) LR.pr.dm <-a.pr.dm[c(1, 12, 13, 14)] mean(LR.pr.dm) #25.21 sd(LR.pr.dm) #2.682872 MR.pr.dm <- a.pr.dm[c(2, 3, 5, 9, 11, 15)] mean(MR.pr.dm) #32.155 sd(MR.pr.dm) #4.094991 HR.pr.dm <- a.pr.dm[c(4, 6, 7, 8, 10)] mean(HR.pr.dm) #4.194 sd(HR.pr.dm) #6.298419 ``` # protein g/1000 kcal ```{r,echo=T} kruskal.test(a.pr.cal, b) #0.02652 LR.pr.cal <-a.pr.cal[c(1, 12, 13, 14)] mean(LR.pr.cal) #61.9975 sd(LR.pr.cal) #8.251959 MR.pr.cal <- a.pr.cal[c(2, 3, 5, 9, 11, 15)] mean(MR.pr.cal) # 75.94667 sd(MR.pr.cal) #8.816983 HR.pr.cal <- a.pr.cal[c(4, 6, 7, 8, 10)] mean(HR.pr.cal) #80.864 sd(HR.pr.cal) #11.90602 ``` # fat DM ```{r,echo=T} kruskal.test(a.fa.dm, b) #0.3421 LR.fa.dm <-a.fa.dm[c(1, 12, 13, 14)] mean(LR.fa.dm) #15.695 sd(LR.fa.dm) #2.581195 MR.fa.dm <- a.fa.dm[c(2, 3, 5, 9, 11, 15)] mean(MR.fa.dm) #16.95333 sd(MR.fa.dm) #5.14807 HR.fa.dm <- a.fa.dm[c(4, 6, 7, 8, 10)] mean(HR.fa.dm) #18.426 sd(HR.fa.dm) #1.950174 ``` # fat g/1000 kcal ```{r,echo=T} kruskal.test(a.fa.cal, b) #0.2835 LR.fa.cal <-a.fa.cal[c(1, 12, 13, 14)] mean(LR.fa.cal) #38.155 sd(LR.fa.cal) #4.936061 MR.fa.cal <- a.fa.cal[c(2, 3, 5, 9, 11, 15)] mean(MR.fa.cal) #39.40333 sd(MR.fa.cal) #9.627346 HR.fa.cal <- a.fa.cal[c(4, 6, 7, 8, 10)] mean(HR.fa.cal) #43.692 sd(HR.fa.cal) #3.761073 ``` # fiber DM ```{r,echo=T} kruskal.test(a.fi.dm, b) #0.06769 LR.fi.dm <-a.fi.dm[c(1, 12, 13, 14)] mean(LR.fi.dm) #6.2225 sd(LR.fi.dm) #2.26578 MR.fi.dm <- a.fi.dm[c(2, 3, 5, 9, 11, 15)] mean(MR.fi.dm) #3.653333 sd(MR.fi.dm) #1.047314 HR.fi.dm <- a.fi.dm[c(4, 6, 7, 8, 10)] mean(HR.fi.dm) #4.52 sd(HR.fi.dm) # 0.7976528 ``` # fiber g/1000kcal ```{r,echo=T} kruskal.test(a.fi.cal, b) #0.1293 LR.fi.cal <-a.fi.cal[c(1, 12, 13, 14)] mean(LR.fi.cal) #15.14 sd(LR.fi.cal) #5.171641 MR.fi.cal <- a.fi.cal[c(2, 3, 5, 9, 11, 15)] mean(MR.fi.cal) #8.618333 sd(MR.fi.cal) #2.549356 HR.fi.cal <- a.fi.cal[c(4, 6, 7, 8, 10)] mean(HR.fi.cal) #10.754 sd(HR.fi.cal) #2.033981 ``` #moisture ```{r,echo=T} kruskal.test(a.mo, b) #0.4594 LR.mo <-a.mo[c(1, 12, 13, 14)] mean(LR.mo) #10 sd(LR.mo) MR.mo <- a.mo[c(2, 3, 5, 9, 11, 15)] mean(MR.mo) #10.66667 sd(MR.mo) HR.mo <- a.mo[c(4, 6, 7, 8, 10)] mean(HR.mo) #10.4 sd(HR.mo) ``` ## calorie denisty DM ```{r,echo=T} kruskal.test(a.ca.dm, b) #0.8076 LR.ca.dm <-a.ca.dm[c(1, 12, 13, 14)] mean(LR.ca.dm) # 4098.89 sd(LR.ca.dm)#177.107 MR.ca.dm <- a.ca.dm[c(2, 3, 5, 9, 11, 15)] mean(MR.ca.dm) # 4240.858 sd(MR.ca.dm) # 373.6577 HR.ca.dm <- a.ca.dm[c(4, 6, 7, 8, 10)] mean(HR.ca.dm) #4211.474 sd(HR.ca.dm) ``` ## Correlation tests ## #protein ```{r,echo=T} cor.test(a.pr.dm, PCoA1.baseline, method = "spearman") cor.test(a.pr.cal, PCoA1.baseline, method = "spearman") cor.test(a.pr.dm, PCoA1.change, method = "spearman") cor.test(a.pr.cal, PCoA1.change, method = "spearman") ``` #fat ```{r,echo=T} cor.test(a.fa.dm, PCoA1.baseline, method = "spearman") cor.test(a.fa.cal, PCoA1.baseline, method = "spearman") cor.test(a.fa.dm, PCoA1.change, method = "spearman") cor.test(a.fa.cal, PCoA1.change, method = "spearman") ``` #fiber ```{r,echo=T} cor.test(a.fi.dm, PCoA1.baseline, method = "spearman") cor.test(a.fi.cal, PCoA1.baseline, method = "spearman") cor.test(a.fi.dm, PCoA1.change, method = "spearman") cor.test(a.fi.cal, PCoA1.change, method = "spearman") ``` #moisture - baseline ```{r,echo=T} cor.test(a.mo, PCoA1.baseline, method = "spearman") cor.test(a.mo, PCoA1.change, method = "spearman") cor.test(a.ca.dm, PCoA1.baseline, method = "spearman") cor.test(a.ca.dm, PCoA1.change, method = "spearman") ``` ## Pairwise wilcox test for protein and fiber ## ```{r,echo=T} #protein pairwise.wilcox.test(a.pr.dm, b, p.adjust.method = "BH") pairwise.wilcox.test(a.pr.cal, b, p.adjust.method = "BH") #fiber pairwise.wilcox.test(a.fi.dm, b, p.adjust.method = "BH") ``` ## ####################### ## Baseline health data ## ## ####################### ## Compare HR MR LR baseline ## #Read data ```{r,echo=T} library(plyr) insights <- read.csv(file = 'survey_prestudy_has_MB.csv') table(insights$PCo1) ``` #age ```{r,echo=T} mean<-ddply(insights, .(PCo1), summarize, mean=mean(age)) mean sd<-ddply(insights, .(PCo1), summarize, sd=sd(age)) sd kruskal.test(age~PCo1, data=insights) ``` #sex ```{r,echo=T} table(insights$sex, insights$PCo1) fisher <- matrix(c(5,4,6,3,2,6), nrow = 2) fisher.test(fisher) ``` #spayed ```{r,echo=T} table (insights$spayed, insights$PCo1) fisher <- matrix(c(1,8,2,8,1,8), nrow = 2) fisher.test(fisher) ``` #prestudy.weight ```{r,echo=T} mean<-ddply(insights, .(PCo1), summarize, mean=mean(prestudy.weight)) mean sd<-ddply(insights, .(PCo1), summarize, sd=sd(prestudy.weight)) sd kruskal.test(prestudy.weight~PCo1, data=insights) ``` #ideal.weight ```{r,echo=T} mean<-ddply(insights, .(PCo1), summarize, mean=mean(ideal.weight)) mean sd<-ddply(insights, .(PCo1), summarize, sd=sd(ideal.weight)) sd kruskal.test(ideal.weight~PCo1, data=insights) ``` #bcs ```{r,echo=T} table (insights$bcs, insights$PCo1) fisher <- matrix(c(8,1,6,2,6,4), nrow = 2) fisher.test(fisher) ``` #activity_level ```{r,echo=T} table (insights$activity_level, insights$PCo1) fisher <- matrix(c(3,3,3,4,0,4,3,3,4), nrow = 3) fisher.test(fisher) ``` #overall.health ```{r,echo=T} table (insights$overall.health, insights$PCo1) fisher <- matrix(c(6,2,1,6,2,0,5,4,1), nrow = 3) fisher.test(fisher) ``` #food.motivation ```{r,echo=T} table (insights$food.motivation, insights$PCo1) fisher <- matrix(c(8,0,1,7,1,0,8,0,2), nrow = 3) fisher.test(fisher) ``` #bristol.chart ```{r,echo=T} insights1 <- insights[-15,] mean<-ddply(insights1, .(PCo1), summarize, mean=mean(bristol.chart)) mean sd<-ddply(insights1, .(PCo1), summarize, sd=sd(bristol.chart)) sd kruskal.test(bristol.chart~PCo1, data=insights1) ``` #defecation.frequency ```{r,echo=T} table (insights$defecation.frequency, insights$PCo1) fisher <- matrix(c(3,5,1,3,5,0,2,7,1), nrow = 3) fisher.test(fisher) ``` #flatulence.frequency ```{r,echo=T} table (insights$flatulence.frequency, insights$PCo1) fisher <- matrix(c(0,9,1,7,5,5), nrow = 2) fisher.test(fisher) ``` ## ################################## ## Health survey analysis - week 4 ## ## ################################## #read dataset - exit survey ```{r,echo=T} data <- read.csv(file = 'exit.survey.csv') data.include <- data[data$exclude.missing.meals != TRUE & data$exclude.antibiotics != TRUE,] data.exclude <- data[data$exclude.missing.meals == TRUE | data$exclude.antibiotics == TRUE,] ``` #overall.health ```{r,echo=T} summary(data.include$overall.health) fisher <- matrix(c(19,12,0,10,10,10), nrow = 3) fisher.test(fisher) fisher.multcomp(fisher, p.method = "fdr") ``` #physical.activity ```{r,echo=T} summary(data.include$physical.activity) fisher <- matrix(c(15,16,0,10,10,10), nrow = 3) fisher.test(fisher) fisher.multcomp(fisher, p.method = "fdr") ``` #weight.change.lbs ```{r,echo=T} mean(data.include$weight.change.lbs) sd(data.include$weight.change.lbs) wilcox.test(data.include$prestudy.weight.lbs, data.include$poststudy.weight.lbs, pair=T) t.test(data.include$prestudy.weight.lbs, data.include$poststudy.weight.lbs, pair=T) ``` #fecal.score ```{r,echo=T} mean(data.include$prestudy.fecal.score[-c(6,14,17,27)], na.rm=T) sd(data.include$prestudy.fecal.score[-c(6,14,17,27)], na.rm=T) mean(data.exclude$prestudy.fecal.score, na.rm=T) sd(data.exclude$prestudy.fecal.score, na.rm=T) wilcox.test(data.include$prestudy.fecal.score[-c(6,14,17,27)], data.exclude$prestudy.fecal.score, pair=F) mean(data.include$fecal.score.change, na.rm=T) sd(data.include$fecal.score.change, na.rm=T) mean(data.include$poststudy.fecal.score[-c(6,14,17,27)], na.rm=T) sd(data.include$poststudy.fecal.score[-c(6,14,17,27)], na.rm=T) wilcox.test(data.include$prestudy.fecal.score[-c(6,14,17,27)], data.include$poststudy.fecal.score[-c(6,14,17,27)], pair=T) t.test(data.include$prestudy.weight.lbs, data.include$poststudy.weight.lbs, pair=T) ``` #defecation.frequency ```{r,echo=T} summary(data.include$defecation.frequency) fisher <- matrix(c(2,11,18,10,10,10), nrow = 3) fisher.test(fisher) fisher.multcomp(fisher, p.method = "fdr") ``` #flatulence.frequency ```{r,echo=T} summary(data.include$flatulence.frequency) fisher <- matrix(c(2,19,10,10,10,10), nrow = 3) fisher.test(fisher) fisher.multcomp(fisher, p.method = "fdr") ``` #coat.condition ```{r,echo=T} summary(data.include$coat.condition) fisher <- matrix(c(15,16,0,10,10,10), nrow = 3) fisher.test(fisher) fisher.multcomp(fisher, p.method = "fdr") ``` #appetite ```{r,echo=T} summary(data.include$appetite) fisher <- matrix(c(19,11,1,10,10,10), nrow = 3) fisher.test(fisher) fisher.multcomp(fisher, p.method = "fdr") ``` ## Compare HR MR LR week 4 ## #read data ```{r,echo=T} exit <- read.csv(file = 'exit.survey.csv') exit$fecal.score.change <- as.numeric(exit$fecal.score.change) table(exit$PCo1) ``` #overall.health ```{r,echo=T} table (exit$overall.health, exit$PCo1) fisher <- matrix(c(6,3,0,5,3,0,5,4,0), nrow = 3) fisher.test(fisher) fisher <- matrix(c(6,3,5,3,5,4), nrow = 2) fisher.test(fisher) ``` #physical.activity ```{r,echo=T} table (exit$physical.activity, exit$PCo1) fisher <- matrix(c(4,5,0,3,5,0,5,4,0), nrow = 3) fisher.test(fisher) fisher <- matrix(c(4,5,3,5,5,4), nrow = 2) fisher.test(fisher) ``` #weight.change.percent ```{r,echo=T} exit1 <- exit[-c(1,2,8,15,17,19,22,29,33,34,38),] table(exit1$PCo1) mean<-ddply(exit1, .(PCo1), summarize, mean=mean(weight.change.percent)) mean sd<-ddply(exit1, .(PCo1), summarize, sd=sd(weight.change.percent)) sd kruskal.test(weight.change.percent~PCo1, data=exit1) ``` #poststudy.fecal.score ```{r,echo=T} exit1 <- exit[-c(1,2,8,15,17,19,22,29,33,34,38),] table(exit1$PCo1) mean<-ddply(exit1, .(PCo1), summarize, mean=mean(poststudy.fecal.score)) mean sd<-ddply(exit1, .(PCo1), summarize, sd=sd(poststudy.fecal.score)) sd kruskal.test(poststudy.fecal.score~PCo1, data=exit1) ``` #fecal.score.change ```{r,echo=T} exit1 <- exit[-c(1,2,8,15,17,19,21,22,29,33,34,38),] table(exit1$PCo1) mean<-ddply(exit1, .(PCo1), summarize, mean=mean(fecal.score.change)) mean sd<-ddply(exit1, .(PCo1), summarize, sd=sd(fecal.score.change)) sd kruskal.test(fecal.score.change~PCo1, data=exit1) ``` #defecation.frequency ```{r,echo=T} table (exit$defecation.frequency, exit$PCo1) fisher <- matrix(c(0,3,6,0,2,6,1,4,4), nrow = 3) fisher.test(fisher) ``` #flatulence.frequency ```{r,echo=T} table (exit$flatulence.frequency, exit$PCo1) fisher <- matrix(c(0,8,1,0,4,4,1,5,3), nrow = 3) fisher.test(fisher) ``` #coat.condition ```{r,echo=T} table (exit$coat.condition, exit$PCo1) fisher <- matrix(c(4,5,0,3,5,0,5,4,0), nrow = 3) fisher.test(fisher) fisher <- matrix(c(4,5,3,5,5,4), nrow = 2) fisher.test(fisher) ``` #appetite ```{r,echo=T} table (exit$appetite, exit$PCo1) fisher <- matrix(c(5,4,0,5,3,0,6,3,0), nrow = 3) fisher.test(fisher) ```