setwd("C:/Users/Administrator/Google Drive/CUHK projects/16S methodology validation/redemux") library(vegan) read.table('env.txt',header=T,sep='\t',row.names=1)->env read.table("esv_table_counts_nochlor.tsv",header=T,row.names=1,sep='\t')->tmp ncol(env) ## 10 columns in env env[,11] <- NA colSums(tmp[,-193]) -> env[,11] names(env)[names(env)=="V11"] <- "read_count" # now get the samples with 1000 or more reads, and their corresponding entries in env file subset(t(tmp[,-193]),env$read_count > 999) -> tmp env[env$read_count>999,] -> env ## filter away samples with less than 1000 reads # and make sure rownames are the same in both ESV table and env files rownames(tmp)==rownames(env) # CLR transformation: # from https://github.com/joey711/shiny-phyloseq/blob/master/panels/paneldoc/Transform.md gm_mean = function(x, na.rm=TRUE){ exp(sum(log(x[x > 0 & !is.na(x)]), na.rm=na.rm) / length(x)) } # from https://github.com/joey711/shiny-phyloseq/blob/master/panels/paneldoc/Transform.md clr = function(x, base=2){ x <- log((x / gm_mean(x)), base) x[!is.finite(x) | is.na(x)] <- 0.0 return(x) } #Run CLR on the ESV count matrix (rows ESVs, cols samples) esv.clr <- t(apply(t(tmp), MARGIN = 2, FUN = clr)) ## tmp is transposed because ESVs are in columns and samples are in rows #also create a corresponding esv table based on relative abundances to obtain relative abundance values later decostand(tmp,"total") -> esv ###### subset commands ###### subset(esv.clr, env$norgen=="n") -> esv.clr.n subset(env, env$norgen=="n") -> env.n subset(esv.clr,env$subject=="A") -> esv.clr.A subset(esv.clr,env$subject=="B") -> esv.clr.B subset(esv.clr,env$subject=="C") -> esv.clr.C subset(env,env$subject=="A") -> env.A subset(env,env$subject=="B") -> env.B subset(env,env$subject=="C") -> env.C subset(esv.clr.A, env.A$storage_groups=="a") -> esv.clr.A.a subset(esv.clr.B, env.B$storage_groups=="a") -> esv.clr.B.a subset(esv.clr.C, env.C$storage_groups=="a") -> esv.clr.C.a subset(env.A, env.A$storage_groups=="a") -> env.A.a subset(env.B, env.B$storage_groups=="a") -> env.B.a subset(env.C, env.C$storage_groups=="a") -> env.C.a subset(esv.clr,env$subject_visit=="1") -> esv.clr.1 subset(esv,env$subject_visit=="1") -> esv.1 subset(env,env$subject_visit=="1") -> env.1 subset(esv.clr.1, env.1$subject=="A") -> esv.clr.1.A subset(esv.clr.1, env.1$subject=="B") -> esv.clr.1.B subset(esv.clr.1, env.1$subject=="C") -> esv.clr.1.C subset(env.1, env.1$subject=="A") -> env.1.A subset(env.1, env.1$subject=="B") -> env.1.B subset(env.1, env.1$subject=="C") -> env.1.C subset(esv.clr.1, env.1$norgen=="n") -> esv.clr.1.n subset(env.1, env.1$norgen=="n") -> env.1.n subset(esv.clr.1.n, env.1.n$subject=="A") -> esv.clr.1.n.A subset(env.1.n, env.1.n$subject=="A") -> env.1.n.A subset(esv.clr.1.n, env.1.n$subject=="B") -> esv.clr.1.n.B subset(env.1.n, env.1.n$subject=="B") -> env.1.n.B subset(esv.clr.1.n, env.1.n$subject=="C") -> esv.clr.1.n.C subset(env.1.n, env.1.n$subject=="C") -> env.1.n.C subset(esv.1.n, env.1.n$subject=="A") -> esv.1.n.A subset(esv.1.y, env.1.y$subject=="A") -> esv.1.y.A subset(env.1.y, env.1.y$subject=="A") -> env.1.y.A subset(esv.1.n, env.1.n$subject=="B") -> esv.1.n.B subset(esv.1.y, env.1.y$subject=="B") -> esv.1.y.B subset(env.1.y, env.1.y$subject=="B") -> env.1.y.B subset(esv.1.n, env.1.n$subject=="C") -> esv.1.n.C subset(esv.1.y, env.1.y$subject=="C") -> esv.1.y.C subset(env.1.y, env.1.y$subject=="C") -> env.1.y.C subset(esv.clr.1.n.A,env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="b") -> esv.clr.1.n.A.ab 
subset(esv.clr.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="c") -> esv.clr.1.n.A.ac
subset(env.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="b") -> env.1.n.A.ab
subset(env.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="c") -> env.1.n.A.ac
subset(esv.clr.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="b") -> esv.clr.1.n.B.ab
subset(esv.clr.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="c") -> esv.clr.1.n.B.ac
subset(env.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="b") -> env.1.n.B.ab
subset(env.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="c") -> env.1.n.B.ac
subset(esv.clr.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="b") -> esv.clr.1.n.C.ab
subset(esv.clr.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="c") -> esv.clr.1.n.C.ac
subset(env.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="b") -> env.1.n.C.ab
subset(env.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="c") -> env.1.n.C.ac

subset(esv.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="b") -> esv.1.n.A.ab
subset(esv.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="b") -> esv.1.n.B.ab
subset(esv.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="b") -> esv.1.n.C.ab
subset(esv.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="c") -> esv.1.n.A.ac
subset(esv.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="c") -> esv.1.n.B.ac
subset(esv.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="c") -> esv.1.n.C.ac

## can differences be detected in post 2 hour samples?
subset(esv.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="b") -> esv.1.n.A.ab.02
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="b") -> env.1.n.A.ab.02
subset(esv.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="b") -> esv.1.n.B.ab.02
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="b") -> env.1.n.B.ab.02
subset(esv.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="b") -> esv.1.n.C.ab.02
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="b") -> env.1.n.C.ab.02
subset(esv.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="b") -> esv.1.n.A.ac.02
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="b") -> env.1.n.A.ac.02
subset(esv.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="b") -> esv.1.n.B.ac.02
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="b") -> env.1.n.B.ac.02
subset(esv.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="b") -> esv.1.n.C.ac.02
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="b") -> env.1.n.C.ac.02

subset(esv.clr.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="b") -> esv.clr.1.n.A.ab.02
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="b") -> env.1.n.A.ab.02
subset(esv.clr.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="b") -> esv.clr.1.n.B.ab.02
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="b") -> env.1.n.B.ab.02
subset(esv.clr.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="b") -> esv.clr.1.n.C.ab.02
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="b") -> env.1.n.C.ab.02
subset(esv.clr.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="b") -> esv.clr.1.n.A.ac.02
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="b") -> env.1.n.A.ac.02
subset(esv.clr.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="b") -> esv.clr.1.n.B.ac.02
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="b") -> env.1.n.B.ac.02
subset(esv.clr.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="b") -> esv.clr.1.n.C.ac.02
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="b") -> env.1.n.C.ac.02

## can differences be detected in post 5 hour samples?
subset(esv.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="c") -> esv.1.n.A.ab.05
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="c") -> env.1.n.A.ab.05
subset(esv.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="c") -> esv.1.n.B.ab.05
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="c") -> env.1.n.B.ab.05
subset(esv.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="c") -> esv.1.n.C.ab.05
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="c") -> env.1.n.C.ab.05
subset(esv.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="c") -> esv.1.n.A.ac.05
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="c") -> env.1.n.A.ac.05
subset(esv.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="c") -> esv.1.n.B.ac.05
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="c") -> env.1.n.B.ac.05
subset(esv.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="c") -> esv.1.n.C.ac.05
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="c") -> env.1.n.C.ac.05

subset(esv.clr.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="c") -> esv.clr.1.n.A.ab.05
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="c") -> env.1.n.A.ab.05
subset(esv.clr.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="c") -> esv.clr.1.n.B.ab.05
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="c") -> env.1.n.B.ab.05
subset(esv.clr.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="c") -> esv.clr.1.n.C.ab.05
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="c") -> env.1.n.C.ab.05
subset(esv.clr.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="c") -> esv.clr.1.n.A.ac.05
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="c") -> env.1.n.A.ac.05
subset(esv.clr.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="c") -> esv.clr.1.n.B.ac.05
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="c") -> env.1.n.B.ac.05
subset(esv.clr.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="c") -> esv.clr.1.n.C.ac.05
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="c") -> env.1.n.C.ac.05

## can differences be detected in post 24 hour samples?
subset(esv.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="d") -> esv.1.n.A.ab.024
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="d") -> env.1.n.A.ab.024
subset(esv.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="d") -> esv.1.n.B.ab.024
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="d") -> env.1.n.B.ab.024
subset(esv.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="d") -> esv.1.n.C.ab.024
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="d") -> env.1.n.C.ab.024
subset(esv.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="d") -> esv.1.n.A.ac.024
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="d") -> env.1.n.A.ac.024
subset(esv.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="d") -> esv.1.n.B.ac.024
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="d") -> env.1.n.B.ac.024
subset(esv.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="d") -> esv.1.n.C.ac.024
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="d") -> env.1.n.C.ac.024

subset(esv.clr.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="d") -> esv.clr.1.n.A.ab.024
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="d") -> env.1.n.A.ab.024
subset(esv.clr.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="d") -> esv.clr.1.n.B.ab.024
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="d") -> env.1.n.B.ab.024
subset(esv.clr.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="d") -> esv.clr.1.n.C.ab.024
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="d") -> env.1.n.C.ab.024
subset(esv.clr.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="d") -> esv.clr.1.n.A.ac.024
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="d") -> env.1.n.A.ac.024
subset(esv.clr.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="d") -> esv.clr.1.n.B.ac.024
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="d") -> env.1.n.B.ac.024
subset(esv.clr.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="d") -> esv.clr.1.n.C.ac.024
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="d") -> env.1.n.C.ac.024

## for point 5, effect of sites in the one stool on community composition:
subset(esv.clr.1.A, env.1.A$storage_groups=="a") -> esv.clr.1.A.a
subset(esv.clr.1.B, env.1.B$storage_groups=="a") -> esv.clr.1.B.a
subset(esv.clr.1.C, env.1.C$storage_groups=="a") -> esv.clr.1.C.a
subset(env.1.A, env.1.A$storage_groups=="a") -> env.1.A.a
subset(env.1.B, env.1.B$storage_groups=="a") -> env.1.B.a
subset(env.1.C, env.1.C$storage_groups=="a") -> env.1.C.a
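## The paired subset() calls above always apply one logical filter to a sample
## table and to its metadata; a small helper (hypothetical, not part of the
## original analysis) expresses the same pattern once:
subset_pair <- function(dat, env_df, keep){
  # 'keep' is a logical vector aligned with the rows of both objects
  list(dat = dat[keep, , drop = FALSE], env = env_df[keep, , drop = FALSE])
}
## e.g. the visit-1, no-Norgen, subject A samples:
## sub <- subset_pair(esv.clr, env, env$subject_visit=="1" & env$norgen=="n" & env$subject=="A")
## sub$dat matches esv.clr.1.n.A and sub$env matches env.1.n.A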
############### alpha diversity #################
read.table("alpha_div_collated.txt", header=T, row.names=1, sep='\t') -> alpha_div

## REMOVE THE ENRICHED SAMPLES!
alpha_div[-4:-6,] -> alpha_div
# remove samples with zero values for sobs, shannon and faith pd
# (these are the samples with fewer than 1000 reads)
alpha_div[alpha_div$sobs_1086>0,] -> alpha_div
rownames(alpha_div)

kruskal.test(sobs_1086 ~ subject, data=alpha_div)
kruskal.test(shannon_1086 ~ subject, data=alpha_div)
kruskal.test(pd_1086 ~ subject, data=alpha_div)

library(ggplot2)
## <--- Fig S1
ggplot(alpha_div, aes(x=subject, y=sobs_1086)) + geom_boxplot(outlier.shape=NA) +
  geom_jitter(width = 0.1, aes(colour = factor(subject_visit)))
ggplot(alpha_div, aes(x=subject, y=shannon_1086)) + geom_boxplot(outlier.shape=NA) +
  geom_jitter(width = 0.1, aes(colour = factor(subject_visit)))
ggplot(alpha_div, aes(x=subject, y=pd_1086)) + geom_boxplot(outlier.shape=NA) +
  geom_jitter(width = 0.1, aes(colour = factor(subject_visit)))

# alpha diversity metrics of the sampling occasion 1 stools, comparing Norgen vs no Norgen:
subset(alpha_div, alpha_div$subject_visit=="1") -> alpha_div.1
subset(alpha_div.1, alpha_div.1$subject=="A") -> alpha_div.1.A
subset(alpha_div.1, alpha_div.1$subject=="B") -> alpha_div.1.B
subset(alpha_div.1, alpha_div.1$subject=="C") -> alpha_div.1.C

subset(alpha_div.1.A, alpha_div.1.A$storage_groups=="a" | alpha_div.1.A$storage_groups=="d") -> alpha_div.1.A.ad
subset(alpha_div.1.A, alpha_div.1.A$storage_groups=="b" | alpha_div.1.A$storage_groups=="d") -> alpha_div.1.A.bd
subset(alpha_div.1.A, alpha_div.1.A$storage_groups=="c" | alpha_div.1.A$storage_groups=="d") -> alpha_div.1.A.cd
subset(alpha_div.1.B, alpha_div.1.B$storage_groups=="a" | alpha_div.1.B$storage_groups=="d") -> alpha_div.1.B.ad
subset(alpha_div.1.B, alpha_div.1.B$storage_groups=="b" | alpha_div.1.B$storage_groups=="d") -> alpha_div.1.B.bd
subset(alpha_div.1.B, alpha_div.1.B$storage_groups=="c" | alpha_div.1.B$storage_groups=="d") -> alpha_div.1.B.cd
subset(alpha_div.1.C, alpha_div.1.C$storage_groups=="a" | alpha_div.1.C$storage_groups=="d") -> alpha_div.1.C.ad
subset(alpha_div.1.C, alpha_div.1.C$storage_groups=="b" | alpha_div.1.C$storage_groups=="d") -> alpha_div.1.C.bd
subset(alpha_div.1.C, alpha_div.1.C$storage_groups=="c" | alpha_div.1.C$storage_groups=="d") -> alpha_div.1.C.cd

ggplot(alpha_div.1, aes(x=subject, y=sobs_1086)) +
  geom_boxplot(outlier.shape=NA, aes(fill=storage_groups)) +
  geom_jitter(aes(colour=storage_groups))
ggplot(alpha_div.1, aes(x=subject, y=shannon_1086)) +
  geom_boxplot(outlier.shape=NA, aes(fill=storage_groups)) +
  geom_jitter(aes(colour=storage_groups))
ggplot(alpha_div.1, aes(x=subject, y=pd_1086)) +
  geom_boxplot(outlier.shape=NA, aes(fill=storage_groups)) +
  geom_jitter(aes(colour=storage_groups))
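## The Fig. S1 panels above differ only in the y variable; a short loop over the
## three metrics draws all of them (a sketch; assumes a ggplot2 version that
## supports the .data pronoun):
for (metric in c("sobs_1086", "shannon_1086", "pd_1086")){
  p <- ggplot(alpha_div, aes(x = subject, y = .data[[metric]])) +
    geom_boxplot(outlier.shape = NA) +
    geom_jitter(width = 0.1, aes(colour = factor(subject_visit))) +
    labs(y = metric)
  print(p)
}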
##################### COMMUNITY COMPOSITION ANALYSIS #####################

## 1. Effect of subject overrides all other factors
rda(esv.clr) -> esv.clr.pca
plot(esv.clr.pca, type='n') ## <--- fig 1A
points(esv.clr.pca, pch=21, cex=1, bg=env$storage_groups)
legend("bottomright", legend=unique(factor(env$storage_groups)), pch=19, col=unique(factor(env$storage_groups))) # PCA figure
ordispider(esv.clr.pca, env$subject, label=T)

## Overall model here: ## <----- Table 1A and 1B
adonis(esv.clr ~ subject + factor(subject_visit) + subject:factor(subject_visit) + factor(storage_groups) + factor(storage_groups)/time,
       data=env, strata = env$subject, method = "euclidean", permu=5000)
adonis(esv.clr.n ~ subject + factor(subject_visit) + subject:factor(subject_visit) + factor(storage_groups) + factor(storage_groups)/time,
       data=env.n, strata = env.n$subject, method = "euclidean", permu=5000)

## 2. Temporal variation within subjects ## <---- fig 2b-d
## Need to look at all samples from immediate -80C storage, exclude the other
## storage conditions, and look at the individuals separately.
rda(esv.clr.A) -> esv.clr.A.pca
plot(esv.clr.A.pca, type='n') ## <--- fig 2b
points(esv.clr.A.pca, pch=21, cex=1.5, bg=env.A$storage_groups)
text(esv.clr.A.pca, labels=env.A$days_from_start, col="gray")
ordispider(esv.clr.A.pca, env.A$days_from_start, label=T)
legend("bottomright", legend=unique(factor(env.A$storage_groups)), pch=19, col=unique(factor(env.A$storage_groups)))

rda(esv.clr.B) -> esv.clr.B.pca
plot(esv.clr.B.pca, type='n') ## <--- fig 2c
points(esv.clr.B.pca, pch=21, cex=1.5, bg=env.B$storage_groups)
text(esv.clr.B.pca, labels=env.B$days_from_start, col="gray")
legend("bottomright", legend=unique(factor(env.B$storage_groups)), pch=19, col=unique(factor(env.B$storage_groups)))

rda(esv.clr.C) -> esv.clr.C.pca
plot(esv.clr.C.pca, type='n') ## <--- fig 2d
points(esv.clr.C.pca, pch=21, cex=1.5, bg=env.C$storage_groups)
text(esv.clr.C.pca, labels=env.C$days_from_start, col="gray")
legend("bottomright", legend=unique(factor(env.C$storage_groups)), pch=19, col=unique(factor(env.C$storage_groups)))

adonis(esv.clr.A.a ~ factor(subject_visit), data=env.A.a, method = "euclidean")
adonis(esv.clr.B.a ~ factor(subject_visit), data=env.B.a, method = "euclidean")
adonis(esv.clr.C.a ~ factor(subject_visit), data=env.C.a, method = "euclidean")

# Hclust to show temporal clustering ## <------ Fig. S2
# one distance matrix per subject, each clustered and plotted separately
hc.A <- vegdist(esv.clr.A.a, method="euclidean")
plot(hclust(hc.A))
hc.B <- vegdist(esv.clr.B.a, method="euclidean")
plot(hclust(hc.B))
hc.C <- vegdist(esv.clr.C.a, method="euclidean")
plot(hclust(hc.C))
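## The fig 2b-d panels repeat the same plotting commands for each subject; a small
## wrapper (hypothetical, not part of the original script) captures the pattern:
plot_subject_pca <- function(clr_mat, env_df){
  pca <- rda(clr_mat)
  plot(pca, type = "n")
  points(pca, pch = 21, cex = 1.5, bg = env_df$storage_groups)
  text(pca, labels = env_df$days_from_start, col = "gray")
  legend("bottomright", legend = unique(factor(env_df$storage_groups)),
         pch = 19, col = unique(factor(env_df$storage_groups)))
  invisible(pca)
}
## e.g. plot_subject_pca(esv.clr.A, env.A)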
## 3. Effect of storage conditions
## Use the visit 1 samples, since only these have all four treatments.
## Norgen vs RT, chilled and immediately frozen, alpha diversity:

# norgen vs frozen
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.A.ad)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.A.ad)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.A.ad)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.B.ad)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.B.ad)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.B.ad)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.C.ad)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.C.ad)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.C.ad)

# room temp vs norgen
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.A.bd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.A.bd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.A.bd)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.B.bd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.B.bd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.B.bd)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.C.bd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.C.bd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.C.bd)

# chilled vs norgen
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.A.cd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.A.cd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.A.cd)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.B.cd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.B.cd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.B.cd)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.C.cd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.C.cd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.C.cd)

# table S2 containing three permanova outputs
adonis(esv.clr.1.A ~ norgen + factor(storage_groups) + time + factor(storage_groups)/time,
       data=env.1.A, strata = env.1.A$storage_groups, method = "euclidean", permu=5000)
adonis(esv.clr.1.B ~ norgen + factor(storage_groups) + time + factor(storage_groups)/time,
       data=env.1.B, strata = env.1.B$storage_groups, method = "euclidean", permu=5000)
adonis(esv.clr.1.C ~ norgen + factor(storage_groups) + time + factor(storage_groups)/time,
       data=env.1.C, strata = env.1.C$storage_groups, method = "euclidean", permu=5000)
adonis(esv.clr.1.A ~ norgen, data=env.1.A, method = "euclidean", permu=5000)

## plot PCA plots that show ESV composition from visit 1 only, by subject:
rda(esv.clr.1.A) -> esv.clr.1.A.pca
plot(esv.clr.1.A.pca, type='n') ## <--- fig 3
points(esv.clr.1.A.pca, pch=21, cex=2, bg=env.1.A$storage_groups)
legend("bottomright", legend=unique(factor(env.1.A$storage_groups)), pch=19, col=unique(factor(env.1.A$storage_groups)))
text(esv.clr.1.A.pca, labels=env.1.A$time, col="white")
ordispider(esv.clr.1.A.pca, env.1.A$time, label=T)

rda(esv.clr.1.B) -> esv.clr.1.B.pca
plot(esv.clr.1.B.pca, type='n') ## <--- fig 3
points(esv.clr.1.B.pca, pch=21, cex=2, bg=env.1.B$storage_groups)
legend("bottomright", legend=unique(factor(env.1.B$storage_groups)), pch=19, col=unique(factor(env.1.B$storage_groups)))
text(esv.clr.1.B.pca, labels=env.1.B$time, col="white")
ordispider(esv.clr.1.B.pca, env.1.B$time, label=T)

rda(esv.clr.1.C) -> esv.clr.1.C.pca
plot(esv.clr.1.C.pca, type='n') ## <--- fig 3
points(esv.clr.1.C.pca, pch=21, cex=2, bg=env.1.C$storage_groups)
legend("bottomright", legend=unique(factor(env.1.C$storage_groups)), pch=19, col=unique(factor(env.1.C$storage_groups)))
text(esv.clr.1.C.pca, labels=env.1.C$time, col="white")
ordispider(esv.clr.1.C.pca, env.1.C$time, label=T)
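## To write a panel to file, the svg() device has to be opened before the plotting
## commands and closed afterwards. A minimal sketch for the subject A panel
## (file name from the script, plotting commands repeated from above):
svg("esv.clr.1.A.pca.svg")
plot(esv.clr.1.A.pca, type='n')
points(esv.clr.1.A.pca, pch=21, cex=2, bg=env.1.A$storage_groups)
legend("bottomright", legend=unique(factor(env.1.A$storage_groups)), pch=19, col=unique(factor(env.1.A$storage_groups)))
text(esv.clr.1.A.pca, labels=env.1.A$time, col="white")
ordispider(esv.clr.1.A.pca, env.1.A$time, label=T)
dev.off()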
## Export of the three panels follows the pattern sketched above, writing
## "esv.clr.1.A.pca.svg", "esv.clr.1.B.pca.svg" and "esv.clr.1.C.pca.svg"
## (svg() before the plotting commands for that panel, dev.off() after).

# Which taxa are associated with Norgen use? Use a GLM: ## <--- Table S2
p_vals <- NULL
for (i in 1:ncol(esv.clr.1)){
  tmp2 <- summary(glm(esv.clr.1[,i] ~ env.1$norgen + env.1$subject, family="gaussian"))$coefficients[2,4]
  p_vals <- rbind(p_vals, tmp2)
}
p_vals
p.adjust(p_vals,"BH")

subset(esv, env$subject_visit=="1") -> esv.1
subset(esv.1, env.1$norgen=="n") -> esv.1.n
subset(env.1, env.1$norgen=="n") -> env.1.n
subset(esv.1, env.1$norgen=="y") -> esv.1.y
subset(env.1, env.1$norgen=="y") -> env.1.y
rownames(esv.1)==rownames(esv.clr.1)
colMeans(esv.1.n) ## OR colMeans(esv.1[env.1$norgen=="n",]) # ESV relative abundances in the NO-Norgen samples
colMeans(esv.1.y) ## OR colMeans(esv.1[env.1$norgen=="y",]) # ESV relative abundances in the WITH-Norgen samples

# Perform sPLSDA to corroborate the above GLM:
# Which taxa are associated with which storage condition, with Norgen?
library(mixOmics)
pca(esv.clr.1, ncomp=10, logratio='none') -> esv.clr.1.pca
plot(esv.clr.1.pca) # barplot of variance explained

plsda(esv.clr.1, env.1$storage_groups, ncomp = 5) -> plsda.fit
perf.plsda <- perf(plsda.fit, validation = 'Mfold', folds = 5, progressBar = TRUE, nrepeat = 20)
plot(perf.plsda, overlay = 'measure', sd=TRUE)
plotIndiv(plsda.fit, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, legend = TRUE, title = 'visit 1, PLSDA comp 1 - 2')

## pick five components based on the perf.plsda plot
splsda.tune = tune.splsda(esv.clr.1, env.1$storage_groups, ncomp = 5, logratio = 'none',
                          test.keepX = c(seq(5,150, 5)), validation = 'Mfold', folds = 5,
                          dist = 'max.dist', nrepeat = 50)
plot(splsda.tune) # lines indicate that components 1 to 5 are sufficient
select.keepX = splsda.tune$choice.keepX[1:5]
splsda.res <- splsda(esv.clr.1, env.1$storage_groups, ncomp = 5, keepX = select.keepX)
plotIndiv(splsda.res, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, legend = TRUE, title = 'visit 1, sPLSDA comp 1 - 2')
splsda.perf = perf(splsda.res, validation = 'Mfold', folds = 20, progressBar = TRUE, nrepeat = 50)
splsda.perf$error.rate
plot(splsda.perf)
(selectVar(splsda.res, comp = 1)$value)
plotLoadings(splsda.res, comp = 1, method = 'mean', contrib = 'max')
plotLoadings(splsda.res, comp = 2, method = 'mean', contrib = 'max')
selectVar(splsda.res, comp = 1)$value

# GLM on each donor separately to identify Norgen-associated ESVs: ## <-- table s3
## donor A
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.A)){
  tmp2 <- summary(glm(esv.clr.1.A[,i] ~ env.1.A$norgen, family="gaussian"))$coefficients[2,4]
  p_vals <- rbind(p_vals, tmp2)
}
p_vals
p.adjust(p_vals,"BH") ## <- table s3
rownames(esv.1)==rownames(esv.clr.1)
colMeans(esv.1.n.A) ## OR colMeans(esv.1.n[env.1.n$subject=="A",]) # ESV relative abundances in the NO-Norgen samples
colMeans(esv.1.y.A) ## OR colMeans(esv.1.y[env.1.y$subject=="A",]) # ESV relative abundances in the WITH-Norgen samples

## donor B
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.B)){
  tmp2 <- summary(glm(esv.clr.1.B[,i] ~ env.1.B$norgen, family="gaussian"))$coefficients[2,4]
  p_vals <- rbind(p_vals, tmp2)
}
p_vals
p.adjust(p_vals,"BH") ## <- table s3
rownames(esv.1)==rownames(esv.clr.1)
colMeans(esv.1.n.B) ## OR colMeans(esv.1.n[env.1.n$subject=="B",]) # ESV relative abundances in the NO-Norgen samples
colMeans(esv.1.y.B) ## OR colMeans(esv.1.y[env.1.y$subject=="B",]) # ESV relative abundances in the WITH-Norgen samples

## donor C
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.C)){
  tmp2 <- summary(glm(esv.clr.1.C[,i] ~ env.1.C$norgen, family="gaussian"))$coefficients[2,4]
  p_vals <- rbind(p_vals, tmp2)
}
p_vals
p.adjust(p_vals,"BH") ## <- table s3
rownames(esv.1)==rownames(esv.clr.1)
colMeans(esv.1.n.C) ## OR colMeans(esv.1.n[env.1.n$subject=="C",]) # ESV relative abundances in the NO-Norgen samples
colMeans(esv.1.y.C) ## OR colMeans(esv.1.y[env.1.y$subject=="C",]) # ESV relative abundances in the WITH-Norgen samples
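## For table s3 it can help to collect each donor's output in a single data frame;
## a sketch for donor C (object and column names here are illustrative only):
tabS3.C <- data.frame(esv = colnames(esv.clr.1.C),
                      p_BH = p.adjust(as.vector(p_vals), "BH"),
                      relab_no_norgen   = unname(colMeans(esv.1.n.C)),
                      relab_with_norgen = unname(colMeans(esv.1.y.C)))
head(tabS3.C[order(tabS3.C$p_BH), ])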
## 4. In terms of real-world application, would a 24-hour delay make a difference
##    under both the ice-box and room-temperature conditions?
adonis(esv.clr.1.n ~ subject * storage_groups * time_h, data=env.1.n, strata=env.1.n$subject, method="euclidean", permu=5000)
rda(esv.clr.1.n) -> esv.clr.1.n.pca
plot(esv.clr.1.n.pca)

## the effect of the individual is overarching, so test storage conditions for each donor separately instead:
adonis(esv.clr.1.A ~ storage_groups * time_h, data=env.1.A, strata=env.1.A$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.B ~ storage_groups * time_h, data=env.1.B, strata=env.1.B$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.C ~ storage_groups * time_h, data=env.1.C, strata=env.1.C$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.A ~ storage_groups * time, data=env.1.A, strata=env.1.A$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.B ~ storage_groups * time, data=env.1.B, strata=env.1.B$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.C ~ storage_groups * time, data=env.1.C, strata=env.1.C$storage_groups, method="euclidean", permu=5000)

adonis(esv.clr.1.n.A ~ storage_groups * time, data=env.1.n.A, strata=env.1.n.A$storage_groups, method="euclidean", permu=5000) ## <--- table S4
adonis(esv.clr.1.n.B ~ storage_groups * time, data=env.1.n.B, strata=env.1.n.B$storage_groups, method="euclidean", permu=5000) ## <--- table S4
adonis(esv.clr.1.n.C ~ storage_groups * time, data=env.1.n.C, strata=env.1.n.C$storage_groups, method="euclidean", permu=5000) ## <--- table S4

## can differences be detected in post 2 hour samples?
adonis(esv.1.n.A.ab.02 ~ env.1.n.A.ab.02$time, permu=5000, method="euclidean") ## <-- table S5
adonis(esv.clr.1.n.A.ab.02 ~ env.1.n.A.ab.02$time, permu=5000, method="euclidean") ## room temp vs immediately frozen
adonis(esv.clr.1.n.B.ab.02 ~ env.1.n.B.ab.02$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.C.ab.02 ~ env.1.n.C.ab.02$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.A.ac.02 ~ env.1.n.A.ac.02$time, permu=5000, method="euclidean") ## chilled vs immediately frozen
adonis(esv.clr.1.n.B.ac.02 ~ env.1.n.B.ac.02$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.C.ac.02 ~ env.1.n.C.ac.02$time, permu=5000, method="euclidean")

## can differences be detected in post 5 hour samples?
adonis(esv.clr.1.n.A.ab.05 ~ env.1.n.A.ab.05$time, permu=5000, method="euclidean") ## room temp vs immediately frozen
adonis(esv.clr.1.n.B.ab.05 ~ env.1.n.B.ab.05$time, permu=5000, method="euclidean") # p<0.05
adonis(esv.clr.1.n.C.ab.05 ~ env.1.n.C.ab.05$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.A.ac.05 ~ env.1.n.A.ac.05$time, permu=5000, method="euclidean") ## chilled vs immediately frozen
adonis(esv.clr.1.n.B.ac.05 ~ env.1.n.B.ac.05$time, permu=5000, method="euclidean") # p<0.05
adonis(esv.clr.1.n.C.ac.05 ~ env.1.n.C.ac.05$time, permu=5000, method="euclidean")
## can differences be detected in post 24 hour samples?
adonis(esv.clr.1.n.A.ab.024 ~ env.1.n.A.ab.024$time, permu=5000, method="euclidean") ## room temp vs immediately frozen
adonis(esv.clr.1.n.B.ab.024 ~ env.1.n.B.ab.024$time, permu=5000, method="euclidean") # p<0.05
adonis(esv.clr.1.n.C.ab.024 ~ env.1.n.C.ab.024$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.A.ac.024 ~ env.1.n.A.ac.024$time, permu=5000, method="euclidean") ## chilled vs immediately frozen
adonis(esv.clr.1.n.B.ac.024 ~ env.1.n.B.ac.024$time, permu=5000, method="euclidean") # p<0.05
adonis(esv.clr.1.n.C.ac.024 ~ env.1.n.C.ac.024$time, permu=5000, method="euclidean")

# Which taxa are enriched at the later time points (2, 5 and 24 h) for the ice box and RT conditions?
# Subset to get immediately frozen + room temp (2, 5, 24 h) and immediately frozen + ice box (2, 5, 24 h) samples.

## GLM for room temp vs immediately frozen:
# subject A
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.A.ab)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.A.ab[,i] ~ env.1.n.A.ab$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
## BH adjustment is applied across all ESVs within each time contrast
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.A.ab[env.1.n.A.ab$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.A.ab[env.1.n.A.ab$time=="b",]) # proportions in the RT t=2 h samples
colMeans(esv.1.n.A.ab[env.1.n.A.ab$time=="c",]) # proportions in the RT t=5 h samples
colMeans(esv.1.n.A.ab[env.1.n.A.ab$time=="d",]) # proportions in the RT t=24 h samples

# subject B
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.B.ab)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.B.ab[,i] ~ env.1.n.B.ab$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.B.ab[env.1.n.B.ab$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.B.ab[env.1.n.B.ab$time=="b",]) # proportions in the RT t=2 h samples
colMeans(esv.1.n.B.ab[env.1.n.B.ab$time=="c",]) # proportions in the RT t=5 h samples
colMeans(esv.1.n.B.ab[env.1.n.B.ab$time=="d",]) # proportions in the RT t=24 h samples

# subject C
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.C.ab)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.C.ab[,i] ~ env.1.n.C.ab$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.C.ab[env.1.n.C.ab$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.C.ab[env.1.n.C.ab$time=="b",]) # proportions in the RT t=2 h samples
colMeans(esv.1.n.C.ab[env.1.n.C.ab$time=="c",]) # proportions in the RT t=5 h samples
colMeans(esv.1.n.C.ab[env.1.n.C.ab$time=="d",]) # proportions in the RT t=24 h samples
## GLM for ice box vs immediately frozen:
# subject A
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.A.ac)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.A.ac[,i] ~ env.1.n.A.ac$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.A.ac[env.1.n.A.ac$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.A.ac[env.1.n.A.ac$time=="b",]) # proportions in the icebox t=2 h samples
colMeans(esv.1.n.A.ac[env.1.n.A.ac$time=="c",]) # proportions in the icebox t=5 h samples
colMeans(esv.1.n.A.ac[env.1.n.A.ac$time=="d",]) # proportions in the icebox t=24 h samples

# subject B
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.B.ac)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.B.ac[,i] ~ env.1.n.B.ac$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.B.ac[env.1.n.B.ac$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.B.ac[env.1.n.B.ac$time=="b",]) # proportions in the icebox t=2 h samples
colMeans(esv.1.n.B.ac[env.1.n.B.ac$time=="c",]) # proportions in the icebox t=5 h samples
colMeans(esv.1.n.B.ac[env.1.n.B.ac$time=="d",]) # proportions in the icebox t=24 h samples

# subject C
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.C.ac)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.C.ac[,i] ~ env.1.n.C.ac$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.C.ac[env.1.n.C.ac$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.C.ac[env.1.n.C.ac$time=="b",]) # proportions in the icebox t=2 h samples
colMeans(esv.1.n.C.ac[env.1.n.C.ac$time=="c",]) # proportions in the icebox t=5 h samples
colMeans(esv.1.n.C.ac[env.1.n.C.ac$time=="d",]) # proportions in the icebox t=24 h samples
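## The six GLM loops above differ only in their input matrices; a small wrapper
## (hypothetical, not part of the original analysis) captures the shared pattern:
time_glm_pvals <- function(clr_mat, env_df){
  p_raw <- t(sapply(seq_len(ncol(clr_mat)), function(i){
    x <- summary(glm(clr_mat[, i] ~ env_df$time, family = "gaussian"))
    c(h2 = x$coefficients[2, 4], h5 = x$coefficients[3, 4], h24 = x$coefficients[4, 4])
  }))
  rownames(p_raw) <- colnames(clr_mat)
  apply(p_raw, 2, p.adjust, method = "BH")  # BH across ESVs within each contrast
}
## e.g. time_glm_pvals(esv.clr.1.n.A.ac, env.1.n.A.ac)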
## 5. Does collection from different sites of the same stool make a difference?
adonis(esv.clr.1.A.a ~ site, data=env.1.A.a, method="euclidean", permu=5000) ## <--- table S6
adonis(esv.clr.1.B.a ~ site, data=env.1.B.a, method="euclidean", permu=5000) ## <--- table S6
adonis(esv.clr.1.C.a ~ site, data=env.1.C.a, method="euclidean", permu=5000) ## <--- table S6
# yes! but only very slightly.

## If we combine all three donors, inter-donor variability overwhelms intra-sample variability:
rbind(esv.clr.1.A.a, esv.clr.1.B.a, esv.clr.1.C.a) -> esv.clr.1.a
rbind(env.1.A.a, env.1.B.a, env.1.C.a) -> env.1.a
adonis(esv.clr.1.a ~ subject*site, data=env.1.a, strata=env.1.a$subject, method="euclidean", permu=5000) ## <--- table S4

rda(esv.clr.1.A.a) -> esv.clr.1.A.a.pca
plot(esv.clr.1.A.a.pca, type='n') ## <--- Fig S4
points(esv.clr.1.A.a.pca, pch=21, cex=2, bg=env.1.A.a$site)
text(esv.clr.1.A.a.pca, labels=env.1.A.a$site, cex=3)
rda(esv.clr.1.A.a ~ env.1.A.a$site) -> esv.clr.1.A.a.rda
permutest(esv.clr.1.A.a.rda, permu=2000)

rda(esv.clr.1.B.a) -> esv.clr.1.B.a.pca
plot(esv.clr.1.B.a.pca, type='n') ## <--- Fig S4
points(esv.clr.1.B.a.pca, pch=21, cex=2, bg=env.1.B.a$site)
text(esv.clr.1.B.a.pca, labels=env.1.B.a$site, cex=3)
rda(esv.clr.1.B.a ~ env.1.B.a$site) -> esv.clr.1.B.a.rda
permutest(esv.clr.1.B.a.rda, permu=2000)

rda(esv.clr.1.C.a) -> esv.clr.1.C.a.pca
plot(esv.clr.1.C.a.pca, type='n') ## <--- Fig S4
points(esv.clr.1.C.a.pca, pch=21, cex=2, bg=env.1.C.a$site)
text(esv.clr.1.C.a.pca, labels=env.1.C.a$site, cex=3)
rda(esv.clr.1.C.a ~ env.1.C.a$site) -> esv.clr.1.C.a.rda
permutest(esv.clr.1.C.a.rda, permu=2000)

## 6. Power calculations
read.table("esv_table_counts_nochlor - power.txt", header=T, row.names=1, sep='\t') -> tmp
## the counts file above has been processed as follows:
# 1. chloroplast sequences removed
# 2. ESVs with fewer than 20 reads across all samples removed
tmp

ncol(env) ## read_count (column 11) was added above and is simply recomputed below for the new counts table
env[,11] <- NA
colSums(tmp[,-193]) -> env[,11]
names(env)[names(env)=="V11"] <- "read_count"
subset(t(tmp[,-193]), env$read_count > 999) -> tmp
env[env$read_count>999,] -> env ## filter away samples with less than 1000 reads
# and make sure rownames are the same in both ESV table and env files
rownames(tmp)==rownames(env)

## REMOVE THE ENRICHED SAMPLES!
tmp[-4:-6,] -> tmp
env[-4:-6,] -> env

subset(tmp, env$subject_visit=="1") -> counts ## just get the first-visit samples
subset(counts, env.1$norgen=="y") -> counts.y
subset(counts, env.1$norgen=="n") -> counts.n

library(HMP) ## DM.MoM and MC.Xdc.statistics come from the HMP package
DM.MoM(counts.n) -> fit.counts.n
DM.MoM(counts.y) -> fit.counts.y
numMC <- 1000
nrow(counts.n)
nrow(counts.y)
nrsGrp1 <- rep(2000, 81) ## 81 no-Norgen samples, 2000 reads each
nrsGrp2 <- rep(2000, 36) ## 36 Norgen samples, 2000 reads each
group.Nrs <- list(nrsGrp1, nrsGrp2)

### Computing Power of the test statistics (Type II error)
alphap <- rbind(fit.counts.n$gamma, fit.counts.y$gamma)
pwr <- MC.Xdc.statistics(group.Nrs, numMC, alphap, "ha")
pwr
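## Hypothetical extension (not in the original analysis): repeat the same power
## calculation over a range of per-sample read depths, using the identical
## MC.Xdc.statistics() call and only changing the library sizes:
depths <- c(1000, 2000, 5000)
pwr.by.depth <- sapply(depths, function(d){
  MC.Xdc.statistics(list(rep(d, nrow(counts.n)), rep(d, nrow(counts.y))), numMC, alphap, "ha")
})
names(pwr.by.depth) <- paste0(depths, " reads")
pwr.by.depth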