setwd("C:/Users/Administrator/Google Drive/CUHK projects/16S methodology validation/redemux") library(vegan) read.table('env.txt',header=T,sep='\t',row.names=1)->env read.table("esv_table_counts_nochlor.tsv",header=T,row.names=1,sep='\t')->tmp ncol(env) ## 10 columns in env env[,11] <- NA colSums(tmp[,-193]) -> env[,11] names(env)[names(env)=="V11"] <- "read_count" # now get the samples with 1000 or more reads, and their corresponding entries in env file subset(t(tmp[,-193]),env$read_count > 999) -> tmp env[env$read_count>999,] -> env ## filter away samples with less than 1000 reads # and make sure rownames are the same in both ESV table and env files rownames(tmp)==rownames(env) # CLR transformation: # from https://github.com/joey711/shiny-phyloseq/blob/master/panels/paneldoc/Transform.md gm_mean = function(x, na.rm=TRUE){ exp(sum(log(x[x > 0 & !is.na(x)]), na.rm=na.rm) / length(x)) } # from https://github.com/joey711/shiny-phyloseq/blob/master/panels/paneldoc/Transform.md clr = function(x, base=2){ x <- log((x / gm_mean(x)), base) x[!is.finite(x) | is.na(x)] <- 0.0 return(x) } #Run CLR on the ESV count matrix (rows ESVs, cols samples) esv.clr <- t(apply(t(tmp), MARGIN = 2, FUN = clr)) ## tmp is transposed because ESVs are in columns and samples are in rows #also create a corresponding esv table based on relative abundances to obtain relative abundance values later decostand(tmp,"total") -> esv ###### subset commands ###### subset(esv.clr, env$norgen=="n") -> esv.clr.n subset(env, env$norgen=="n") -> env.n subset(esv.clr,env$subject=="A") -> esv.clr.A subset(esv.clr,env$subject=="B") -> esv.clr.B subset(esv.clr,env$subject=="C") -> esv.clr.C subset(env,env$subject=="A") -> env.A subset(env,env$subject=="B") -> env.B subset(env,env$subject=="C") -> env.C subset(esv.clr.A, env.A$storage_groups=="a") -> esv.clr.A.a subset(esv.clr.B, env.B$storage_groups=="a") -> esv.clr.B.a subset(esv.clr.C, env.C$storage_groups=="a") -> esv.clr.C.a subset(env.A, env.A$storage_groups=="a") -> env.A.a subset(env.B, env.B$storage_groups=="a") -> env.B.a subset(env.C, env.C$storage_groups=="a") -> env.C.a subset(esv.clr,env$subject_visit=="1") -> esv.clr.1 subset(esv,env$subject_visit=="1") -> esv.1 subset(env,env$subject_visit=="1") -> env.1 subset(esv.clr.1, env.1$subject=="A") -> esv.clr.1.A subset(esv.clr.1, env.1$subject=="B") -> esv.clr.1.B subset(esv.clr.1, env.1$subject=="C") -> esv.clr.1.C subset(env.1, env.1$subject=="A") -> env.1.A subset(env.1, env.1$subject=="B") -> env.1.B subset(env.1, env.1$subject=="C") -> env.1.C subset(esv.clr.1, env.1$norgen=="n") -> esv.clr.1.n subset(env.1, env.1$norgen=="n") -> env.1.n subset(esv.clr.1.n, env.1.n$subject=="A") -> esv.clr.1.n.A subset(env.1.n, env.1.n$subject=="A") -> env.1.n.A subset(esv.clr.1.n, env.1.n$subject=="B") -> esv.clr.1.n.B subset(env.1.n, env.1.n$subject=="B") -> env.1.n.B subset(esv.clr.1.n, env.1.n$subject=="C") -> esv.clr.1.n.C subset(env.1.n, env.1.n$subject=="C") -> env.1.n.C subset(esv.1.n, env.1.n$subject=="A") -> esv.1.n.A subset(esv.1.y, env.1.y$subject=="A") -> esv.1.y.A subset(env.1.y, env.1.y$subject=="A") -> env.1.y.A subset(esv.1.n, env.1.n$subject=="B") -> esv.1.n.B subset(esv.1.y, env.1.y$subject=="B") -> esv.1.y.B subset(env.1.y, env.1.y$subject=="B") -> env.1.y.B subset(esv.1.n, env.1.n$subject=="C") -> esv.1.n.C subset(esv.1.y, env.1.y$subject=="C") -> esv.1.y.C subset(env.1.y, env.1.y$subject=="C") -> env.1.y.C subset(esv.clr.1.n.A,env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="b") -> esv.clr.1.n.A.ab 
subset(esv.clr.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="c") -> esv.clr.1.n.A.ac
subset(env.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="b") -> env.1.n.A.ab
subset(env.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="c") -> env.1.n.A.ac
subset(esv.clr.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="b") -> esv.clr.1.n.B.ab
subset(esv.clr.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="c") -> esv.clr.1.n.B.ac
subset(env.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="b") -> env.1.n.B.ab
subset(env.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="c") -> env.1.n.B.ac
subset(esv.clr.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="b") -> esv.clr.1.n.C.ab
subset(esv.clr.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="c") -> esv.clr.1.n.C.ac
subset(env.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="b") -> env.1.n.C.ab
subset(env.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="c") -> env.1.n.C.ac

subset(esv.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="b") -> esv.1.n.A.ab
subset(esv.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="b") -> esv.1.n.B.ab
subset(esv.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="b") -> esv.1.n.C.ab
subset(esv.1.n.A, env.1.n.A$storage_groups=="a" | env.1.n.A$storage_groups=="c") -> esv.1.n.A.ac
subset(esv.1.n.B, env.1.n.B$storage_groups=="a" | env.1.n.B$storage_groups=="c") -> esv.1.n.B.ac
subset(esv.1.n.C, env.1.n.C$storage_groups=="a" | env.1.n.C$storage_groups=="c") -> esv.1.n.C.ac

## can differences be detected in post 2 hour samples?
subset(esv.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="b") -> esv.1.n.A.ab.02
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="b") -> env.1.n.A.ab.02
subset(esv.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="b") -> esv.1.n.B.ab.02
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="b") -> env.1.n.B.ab.02
subset(esv.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="b") -> esv.1.n.C.ab.02
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="b") -> env.1.n.C.ab.02
subset(esv.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="b") -> esv.1.n.A.ac.02
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="b") -> env.1.n.A.ac.02
subset(esv.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="b") -> esv.1.n.B.ac.02
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="b") -> env.1.n.B.ac.02
subset(esv.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="b") -> esv.1.n.C.ac.02
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="b") -> env.1.n.C.ac.02

subset(esv.clr.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="b") -> esv.clr.1.n.A.ab.02
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="b") -> env.1.n.A.ab.02
subset(esv.clr.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="b") -> esv.clr.1.n.B.ab.02
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="b") -> env.1.n.B.ab.02
subset(esv.clr.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="b") -> esv.clr.1.n.C.ab.02
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="b") -> env.1.n.C.ab.02
subset(esv.clr.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="b") -> esv.clr.1.n.A.ac.02
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="b") -> env.1.n.A.ac.02
subset(esv.clr.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="b") -> esv.clr.1.n.B.ac.02
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="b") -> env.1.n.B.ac.02
subset(esv.clr.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="b") -> esv.clr.1.n.C.ac.02
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="b") -> env.1.n.C.ac.02

## can differences be detected in post 5 hour samples?
subset(esv.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="c") -> esv.1.n.A.ab.05
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="c") -> env.1.n.A.ab.05
subset(esv.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="c") -> esv.1.n.B.ab.05
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="c") -> env.1.n.B.ab.05
subset(esv.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="c") -> esv.1.n.C.ab.05
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="c") -> env.1.n.C.ab.05
subset(esv.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="c") -> esv.1.n.A.ac.05
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="c") -> env.1.n.A.ac.05
subset(esv.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="c") -> esv.1.n.B.ac.05
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="c") -> env.1.n.B.ac.05
subset(esv.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="c") -> esv.1.n.C.ac.05
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="c") -> env.1.n.C.ac.05

subset(esv.clr.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="c") -> esv.clr.1.n.A.ab.05
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="c") -> env.1.n.A.ab.05
subset(esv.clr.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="c") -> esv.clr.1.n.B.ab.05
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="c") -> env.1.n.B.ab.05
subset(esv.clr.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="c") -> esv.clr.1.n.C.ab.05
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="c") -> env.1.n.C.ab.05
subset(esv.clr.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="c") -> esv.clr.1.n.A.ac.05
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="c") -> env.1.n.A.ac.05
subset(esv.clr.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="c") -> esv.clr.1.n.B.ac.05
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="c") -> env.1.n.B.ac.05
subset(esv.clr.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="c") -> esv.clr.1.n.C.ac.05
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="c") -> env.1.n.C.ac.05

## can differences be detected in post 24 hour samples?
subset(esv.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="d") -> esv.1.n.A.ab.024
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="d") -> env.1.n.A.ab.024
subset(esv.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="d") -> esv.1.n.B.ab.024
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="d") -> env.1.n.B.ab.024
subset(esv.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="d") -> esv.1.n.C.ab.024
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="d") -> env.1.n.C.ab.024
subset(esv.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="d") -> esv.1.n.A.ac.024
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="d") -> env.1.n.A.ac.024
subset(esv.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="d") -> esv.1.n.B.ac.024
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="d") -> env.1.n.B.ac.024
subset(esv.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="d") -> esv.1.n.C.ac.024
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="d") -> env.1.n.C.ac.024

subset(esv.clr.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="d") -> esv.clr.1.n.A.ab.024
subset(env.1.n.A.ab, env.1.n.A.ab$time=="a" | env.1.n.A.ab$time=="d") -> env.1.n.A.ab.024
subset(esv.clr.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="d") -> esv.clr.1.n.B.ab.024
subset(env.1.n.B.ab, env.1.n.B.ab$time=="a" | env.1.n.B.ab$time=="d") -> env.1.n.B.ab.024
subset(esv.clr.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="d") -> esv.clr.1.n.C.ab.024
subset(env.1.n.C.ab, env.1.n.C.ab$time=="a" | env.1.n.C.ab$time=="d") -> env.1.n.C.ab.024
subset(esv.clr.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="d") -> esv.clr.1.n.A.ac.024
subset(env.1.n.A.ac, env.1.n.A.ac$time=="a" | env.1.n.A.ac$time=="d") -> env.1.n.A.ac.024
subset(esv.clr.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="d") -> esv.clr.1.n.B.ac.024
subset(env.1.n.B.ac, env.1.n.B.ac$time=="a" | env.1.n.B.ac$time=="d") -> env.1.n.B.ac.024
subset(esv.clr.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="d") -> esv.clr.1.n.C.ac.024
subset(env.1.n.C.ac, env.1.n.C.ac$time=="a" | env.1.n.C.ac$time=="d") -> env.1.n.C.ac.024

## for point 5, effect of sites in the one stool on community composition:
subset(esv.clr.1.A, env.1.A$storage_groups=="a") -> esv.clr.1.A.a
subset(esv.clr.1.B, env.1.B$storage_groups=="a") -> esv.clr.1.B.a
subset(esv.clr.1.C, env.1.C$storage_groups=="a") -> esv.clr.1.C.a
subset(env.1.A, env.1.A$storage_groups=="a") -> env.1.A.a
subset(env.1.B, env.1.B$storage_groups=="a") -> env.1.B.a
subset(env.1.C, env.1.C$storage_groups=="a") -> env.1.C.a
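## The paired subset() calls above always apply one logical filter to a sample
## table and to its metadata; a small helper (hypothetical, not part of the
## original analysis) expresses the same pattern once:
subset_pair <- function(dat, env_df, keep){
  # 'keep' is a logical vector aligned with the rows of both objects
  list(dat = dat[keep, , drop = FALSE], env = env_df[keep, , drop = FALSE])
}
## e.g. the visit-1, no-Norgen, subject A samples:
## sub <- subset_pair(esv.clr, env, env$subject_visit=="1" & env$norgen=="n" & env$subject=="A")
## sub$dat matches esv.clr.1.n.A and sub$env matches env.1.n.A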
############### alpha diversity #################
read.table("alpha_div_collated.txt", header=T, row.names=1, sep='\t') -> alpha_div

## REMOVE THE ENRICHED SAMPLES!
alpha_div[-4:-6,] -> alpha_div
# remove samples with zero values for sobs, shannon and faith pd
# (these are the samples with fewer than 1000 reads)
alpha_div[alpha_div$sobs_1086>0,] -> alpha_div
rownames(alpha_div)

kruskal.test(sobs_1086 ~ subject, data=alpha_div)
kruskal.test(shannon_1086 ~ subject, data=alpha_div)
kruskal.test(pd_1086 ~ subject, data=alpha_div)

library(ggplot2)
## <--- Fig S1
ggplot(alpha_div, aes(x=subject, y=sobs_1086)) + geom_boxplot(outlier.shape=NA) +
  geom_jitter(width = 0.1, aes(colour = factor(subject_visit)))
ggplot(alpha_div, aes(x=subject, y=shannon_1086)) + geom_boxplot(outlier.shape=NA) +
  geom_jitter(width = 0.1, aes(colour = factor(subject_visit)))
ggplot(alpha_div, aes(x=subject, y=pd_1086)) + geom_boxplot(outlier.shape=NA) +
  geom_jitter(width = 0.1, aes(colour = factor(subject_visit)))

# alpha diversity metrics of the sampling occasion 1 stools, comparing Norgen vs no Norgen:
subset(alpha_div, alpha_div$subject_visit=="1") -> alpha_div.1
subset(alpha_div.1, alpha_div.1$subject=="A") -> alpha_div.1.A
subset(alpha_div.1, alpha_div.1$subject=="B") -> alpha_div.1.B
subset(alpha_div.1, alpha_div.1$subject=="C") -> alpha_div.1.C

subset(alpha_div.1.A, alpha_div.1.A$storage_groups=="a" | alpha_div.1.A$storage_groups=="d") -> alpha_div.1.A.ad
subset(alpha_div.1.A, alpha_div.1.A$storage_groups=="b" | alpha_div.1.A$storage_groups=="d") -> alpha_div.1.A.bd
subset(alpha_div.1.A, alpha_div.1.A$storage_groups=="c" | alpha_div.1.A$storage_groups=="d") -> alpha_div.1.A.cd
subset(alpha_div.1.B, alpha_div.1.B$storage_groups=="a" | alpha_div.1.B$storage_groups=="d") -> alpha_div.1.B.ad
subset(alpha_div.1.B, alpha_div.1.B$storage_groups=="b" | alpha_div.1.B$storage_groups=="d") -> alpha_div.1.B.bd
subset(alpha_div.1.B, alpha_div.1.B$storage_groups=="c" | alpha_div.1.B$storage_groups=="d") -> alpha_div.1.B.cd
subset(alpha_div.1.C, alpha_div.1.C$storage_groups=="a" | alpha_div.1.C$storage_groups=="d") -> alpha_div.1.C.ad
subset(alpha_div.1.C, alpha_div.1.C$storage_groups=="b" | alpha_div.1.C$storage_groups=="d") -> alpha_div.1.C.bd
subset(alpha_div.1.C, alpha_div.1.C$storage_groups=="c" | alpha_div.1.C$storage_groups=="d") -> alpha_div.1.C.cd

ggplot(alpha_div.1, aes(x=subject, y=sobs_1086)) +
  geom_boxplot(outlier.shape=NA, aes(fill=storage_groups)) +
  geom_jitter(aes(colour=storage_groups))
ggplot(alpha_div.1, aes(x=subject, y=shannon_1086)) +
  geom_boxplot(outlier.shape=NA, aes(fill=storage_groups)) +
  geom_jitter(aes(colour=storage_groups))
ggplot(alpha_div.1, aes(x=subject, y=pd_1086)) +
  geom_boxplot(outlier.shape=NA, aes(fill=storage_groups)) +
  geom_jitter(aes(colour=storage_groups))
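## The Fig. S1 panels above differ only in the y variable; a short loop over the
## three metrics draws all of them (a sketch; assumes a ggplot2 version that
## supports the .data pronoun):
for (metric in c("sobs_1086", "shannon_1086", "pd_1086")){
  p <- ggplot(alpha_div, aes(x = subject, y = .data[[metric]])) +
    geom_boxplot(outlier.shape = NA) +
    geom_jitter(width = 0.1, aes(colour = factor(subject_visit))) +
    labs(y = metric)
  print(p)
}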
##################### COMMUNITY COMPOSITION ANALYSIS #####################

## 1. Effect of subject overrides all other factors
rda(esv.clr) -> esv.clr.pca
plot(esv.clr.pca, type='n') ## <--- fig 1A
points(esv.clr.pca, pch=21, cex=1, bg=env$storage_groups)
legend("bottomright", legend=unique(factor(env$storage_groups)), pch=19, col=unique(factor(env$storage_groups))) # PCA figure
ordispider(esv.clr.pca, env$subject, label=T)

## Overall model here: ## <----- Table 1A and 1B
adonis(esv.clr ~ subject + factor(subject_visit) + subject:factor(subject_visit) + factor(storage_groups) + factor(storage_groups)/time,
       data=env, strata = env$subject, method = "euclidean", permu=5000)
adonis(esv.clr.n ~ subject + factor(subject_visit) + subject:factor(subject_visit) + factor(storage_groups) + factor(storage_groups)/time,
       data=env.n, strata = env.n$subject, method = "euclidean", permu=5000)

## 2. Temporal variation within subjects ## <---- fig 2b-d
## Need to look at all samples from immediate -80C storage, exclude the other
## storage conditions, and look at the individuals separately.
rda(esv.clr.A) -> esv.clr.A.pca
plot(esv.clr.A.pca, type='n') ## <--- fig 2b
points(esv.clr.A.pca, pch=21, cex=1.5, bg=env.A$storage_groups)
text(esv.clr.A.pca, labels=env.A$days_from_start, col="gray")
ordispider(esv.clr.A.pca, env.A$days_from_start, label=T)
legend("bottomright", legend=unique(factor(env.A$storage_groups)), pch=19, col=unique(factor(env.A$storage_groups)))

rda(esv.clr.B) -> esv.clr.B.pca
plot(esv.clr.B.pca, type='n') ## <--- fig 2c
points(esv.clr.B.pca, pch=21, cex=1.5, bg=env.B$storage_groups)
text(esv.clr.B.pca, labels=env.B$days_from_start, col="gray")
legend("bottomright", legend=unique(factor(env.B$storage_groups)), pch=19, col=unique(factor(env.B$storage_groups)))

rda(esv.clr.C) -> esv.clr.C.pca
plot(esv.clr.C.pca, type='n') ## <--- fig 2d
points(esv.clr.C.pca, pch=21, cex=1.5, bg=env.C$storage_groups)
text(esv.clr.C.pca, labels=env.C$days_from_start, col="gray")
legend("bottomright", legend=unique(factor(env.C$storage_groups)), pch=19, col=unique(factor(env.C$storage_groups)))

adonis(esv.clr.A.a ~ factor(subject_visit), data=env.A.a, method = "euclidean")
adonis(esv.clr.B.a ~ factor(subject_visit), data=env.B.a, method = "euclidean")
adonis(esv.clr.C.a ~ factor(subject_visit), data=env.C.a, method = "euclidean")

# Hclust to show temporal clustering ## <------ Fig. S2
# one distance matrix per subject, each clustered and plotted separately
hc.A <- vegdist(esv.clr.A.a, method="euclidean")
plot(hclust(hc.A))
hc.B <- vegdist(esv.clr.B.a, method="euclidean")
plot(hclust(hc.B))
hc.C <- vegdist(esv.clr.C.a, method="euclidean")
plot(hclust(hc.C))
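## The fig 2b-d panels repeat the same plotting commands for each subject; a small
## wrapper (hypothetical, not part of the original script) captures the pattern:
plot_subject_pca <- function(clr_mat, env_df){
  pca <- rda(clr_mat)
  plot(pca, type = "n")
  points(pca, pch = 21, cex = 1.5, bg = env_df$storage_groups)
  text(pca, labels = env_df$days_from_start, col = "gray")
  legend("bottomright", legend = unique(factor(env_df$storage_groups)),
         pch = 19, col = unique(factor(env_df$storage_groups)))
  invisible(pca)
}
## e.g. plot_subject_pca(esv.clr.A, env.A)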
## 3. Effect of storage conditions
## Use the visit 1 samples, since only these have all four treatments.
## Norgen vs RT, chilled and immediately frozen, alpha diversity:

# norgen vs frozen
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.A.ad)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.A.ad)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.A.ad)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.B.ad)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.B.ad)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.B.ad)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.C.ad)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.C.ad)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.C.ad)

# room temp vs norgen
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.A.bd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.A.bd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.A.bd)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.B.bd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.B.bd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.B.bd)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.C.bd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.C.bd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.C.bd)

# chilled vs norgen
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.A.cd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.A.cd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.A.cd)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.B.cd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.B.cd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.B.cd)
wilcox.test(sobs_1086 ~ norgen, data=alpha_div.1.C.cd)
wilcox.test(shannon_1086 ~ norgen, data=alpha_div.1.C.cd)
wilcox.test(pd_1086 ~ norgen, data=alpha_div.1.C.cd)

# table S2 containing three permanova outputs
adonis(esv.clr.1.A ~ norgen + factor(storage_groups) + time + factor(storage_groups)/time,
       data=env.1.A, strata = env.1.A$storage_groups, method = "euclidean", permu=5000)
adonis(esv.clr.1.B ~ norgen + factor(storage_groups) + time + factor(storage_groups)/time,
       data=env.1.B, strata = env.1.B$storage_groups, method = "euclidean", permu=5000)
adonis(esv.clr.1.C ~ norgen + factor(storage_groups) + time + factor(storage_groups)/time,
       data=env.1.C, strata = env.1.C$storage_groups, method = "euclidean", permu=5000)
adonis(esv.clr.1.A ~ norgen, data=env.1.A, method = "euclidean", permu=5000)

## plot PCA plots that show ESV composition from visit 1 only, by subject:
rda(esv.clr.1.A) -> esv.clr.1.A.pca
plot(esv.clr.1.A.pca, type='n') ## <--- fig 3
points(esv.clr.1.A.pca, pch=21, cex=2, bg=env.1.A$storage_groups)
legend("bottomright", legend=unique(factor(env.1.A$storage_groups)), pch=19, col=unique(factor(env.1.A$storage_groups)))
text(esv.clr.1.A.pca, labels=env.1.A$time, col="white")
ordispider(esv.clr.1.A.pca, env.1.A$time, label=T)

rda(esv.clr.1.B) -> esv.clr.1.B.pca
plot(esv.clr.1.B.pca, type='n') ## <--- fig 3
points(esv.clr.1.B.pca, pch=21, cex=2, bg=env.1.B$storage_groups)
legend("bottomright", legend=unique(factor(env.1.B$storage_groups)), pch=19, col=unique(factor(env.1.B$storage_groups)))
text(esv.clr.1.B.pca, labels=env.1.B$time, col="white")
ordispider(esv.clr.1.B.pca, env.1.B$time, label=T)

rda(esv.clr.1.C) -> esv.clr.1.C.pca
plot(esv.clr.1.C.pca, type='n') ## <--- fig 3
points(esv.clr.1.C.pca, pch=21, cex=2, bg=env.1.C$storage_groups)
legend("bottomright", legend=unique(factor(env.1.C$storage_groups)), pch=19, col=unique(factor(env.1.C$storage_groups)))
text(esv.clr.1.C.pca, labels=env.1.C$time, col="white")
ordispider(esv.clr.1.C.pca, env.1.C$time, label=T)
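## To write a panel to file, the svg() device has to be opened before the plotting
## commands and closed afterwards. A minimal sketch for the subject A panel
## (file name from the script, plotting commands repeated from above):
svg("esv.clr.1.A.pca.svg")
plot(esv.clr.1.A.pca, type='n')
points(esv.clr.1.A.pca, pch=21, cex=2, bg=env.1.A$storage_groups)
legend("bottomright", legend=unique(factor(env.1.A$storage_groups)), pch=19, col=unique(factor(env.1.A$storage_groups)))
text(esv.clr.1.A.pca, labels=env.1.A$time, col="white")
ordispider(esv.clr.1.A.pca, env.1.A$time, label=T)
dev.off()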
## Export of the three panels follows the pattern sketched above, writing
## "esv.clr.1.A.pca.svg", "esv.clr.1.B.pca.svg" and "esv.clr.1.C.pca.svg"
## (svg() before the plotting commands for that panel, dev.off() after).

# Which taxa are associated with Norgen use? Use a GLM: ## <--- Table S2
p_vals <- NULL
for (i in 1:ncol(esv.clr.1)){
  tmp2 <- summary(glm(esv.clr.1[,i] ~ env.1$norgen + env.1$subject, family="gaussian"))$coefficients[2,4]
  p_vals <- rbind(p_vals, tmp2)
}
p_vals
p.adjust(p_vals,"BH")

subset(esv, env$subject_visit=="1") -> esv.1
subset(esv.1, env.1$norgen=="n") -> esv.1.n
subset(env.1, env.1$norgen=="n") -> env.1.n
subset(esv.1, env.1$norgen=="y") -> esv.1.y
subset(env.1, env.1$norgen=="y") -> env.1.y
rownames(esv.1)==rownames(esv.clr.1)
colMeans(esv.1.n) ## OR colMeans(esv.1[env.1$norgen=="n",]) # ESV relative abundances in the NO-Norgen samples
colMeans(esv.1.y) ## OR colMeans(esv.1[env.1$norgen=="y",]) # ESV relative abundances in the WITH-Norgen samples

# Perform sPLSDA to corroborate the above GLM:
# Which taxa are associated with which storage condition, with Norgen?
library(mixOmics)
pca(esv.clr.1, ncomp=10, logratio='none') -> esv.clr.1.pca
plot(esv.clr.1.pca) # barplot of variance explained

plsda(esv.clr.1, env.1$storage_groups, ncomp = 5) -> plsda.fit
perf.plsda <- perf(plsda.fit, validation = 'Mfold', folds = 5, progressBar = TRUE, nrepeat = 20)
plot(perf.plsda, overlay = 'measure', sd=TRUE)
plotIndiv(plsda.fit, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, legend = TRUE, title = 'visit 1, PLSDA comp 1 - 2')

## pick five components based on the perf.plsda plot
splsda.tune = tune.splsda(esv.clr.1, env.1$storage_groups, ncomp = 5, logratio = 'none',
                          test.keepX = c(seq(5,150, 5)), validation = 'Mfold', folds = 5,
                          dist = 'max.dist', nrepeat = 50)
plot(splsda.tune) # lines indicate that components 1 to 5 are sufficient
select.keepX = splsda.tune$choice.keepX[1:5]
splsda.res <- splsda(esv.clr.1, env.1$storage_groups, ncomp = 5, keepX = select.keepX)
plotIndiv(splsda.res, comp = c(1,2), ind.names = FALSE, ellipse = TRUE, legend = TRUE, title = 'visit 1, sPLSDA comp 1 - 2')
splsda.perf = perf(splsda.res, validation = 'Mfold', folds = 20, progressBar = TRUE, nrepeat = 50)
splsda.perf$error.rate
plot(splsda.perf)
(selectVar(splsda.res, comp = 1)$value)
plotLoadings(splsda.res, comp = 1, method = 'mean', contrib = 'max')
plotLoadings(splsda.res, comp = 2, method = 'mean', contrib = 'max')
selectVar(splsda.res, comp = 1)$value

# GLM on each donor separately to identify Norgen-associated ESVs: ## <-- table s3
## donor A
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.A)){
  tmp2 <- summary(glm(esv.clr.1.A[,i] ~ env.1.A$norgen, family="gaussian"))$coefficients[2,4]
  p_vals <- rbind(p_vals, tmp2)
}
p_vals
p.adjust(p_vals,"BH") ## <- table s3
rownames(esv.1)==rownames(esv.clr.1)
colMeans(esv.1.n.A) ## OR colMeans(esv.1.n[env.1.n$subject=="A",]) # ESV relative abundances in the NO-Norgen samples
colMeans(esv.1.y.A) ## OR colMeans(esv.1.y[env.1.y$subject=="A",]) # ESV relative abundances in the WITH-Norgen samples

## donor B
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.B)){
  tmp2 <- summary(glm(esv.clr.1.B[,i] ~ env.1.B$norgen, family="gaussian"))$coefficients[2,4]
  p_vals <- rbind(p_vals, tmp2)
}
p_vals
p.adjust(p_vals,"BH") ## <- table s3
rownames(esv.1)==rownames(esv.clr.1)
colMeans(esv.1.n.B) ## OR colMeans(esv.1.n[env.1.n$subject=="B",]) # ESV relative abundances in the NO-Norgen samples
colMeans(esv.1.y.B) ## OR colMeans(esv.1.y[env.1.y$subject=="B",]) # ESV relative abundances in the WITH-Norgen samples

## donor C
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.C)){
  tmp2 <- summary(glm(esv.clr.1.C[,i] ~ env.1.C$norgen, family="gaussian"))$coefficients[2,4]
  p_vals <- rbind(p_vals, tmp2)
}
p_vals
p.adjust(p_vals,"BH") ## <- table s3
rownames(esv.1)==rownames(esv.clr.1)
colMeans(esv.1.n.C) ## OR colMeans(esv.1.n[env.1.n$subject=="C",]) # ESV relative abundances in the NO-Norgen samples
colMeans(esv.1.y.C) ## OR colMeans(esv.1.y[env.1.y$subject=="C",]) # ESV relative abundances in the WITH-Norgen samples
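## For table s3 it can help to collect each donor's output in a single data frame;
## a sketch for donor C (object and column names here are illustrative only):
tabS3.C <- data.frame(esv = colnames(esv.clr.1.C),
                      p_BH = p.adjust(as.vector(p_vals), "BH"),
                      relab_no_norgen   = unname(colMeans(esv.1.n.C)),
                      relab_with_norgen = unname(colMeans(esv.1.y.C)))
head(tabS3.C[order(tabS3.C$p_BH), ])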
## 4. In terms of real-world application, would a 24-hour delay make a difference
##    under both the ice-box and room-temperature conditions?
adonis(esv.clr.1.n ~ subject * storage_groups * time_h, data=env.1.n, strata=env.1.n$subject, method="euclidean", permu=5000)
rda(esv.clr.1.n) -> esv.clr.1.n.pca
plot(esv.clr.1.n.pca)

## the effect of the individual is overarching, so test storage conditions for each donor separately instead:
adonis(esv.clr.1.A ~ storage_groups * time_h, data=env.1.A, strata=env.1.A$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.B ~ storage_groups * time_h, data=env.1.B, strata=env.1.B$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.C ~ storage_groups * time_h, data=env.1.C, strata=env.1.C$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.A ~ storage_groups * time, data=env.1.A, strata=env.1.A$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.B ~ storage_groups * time, data=env.1.B, strata=env.1.B$storage_groups, method="euclidean", permu=5000)
adonis(esv.clr.1.C ~ storage_groups * time, data=env.1.C, strata=env.1.C$storage_groups, method="euclidean", permu=5000)

adonis(esv.clr.1.n.A ~ storage_groups * time, data=env.1.n.A, strata=env.1.n.A$storage_groups, method="euclidean", permu=5000) ## <--- table S4
adonis(esv.clr.1.n.B ~ storage_groups * time, data=env.1.n.B, strata=env.1.n.B$storage_groups, method="euclidean", permu=5000) ## <--- table S4
adonis(esv.clr.1.n.C ~ storage_groups * time, data=env.1.n.C, strata=env.1.n.C$storage_groups, method="euclidean", permu=5000) ## <--- table S4

## can differences be detected in post 2 hour samples?
adonis(esv.1.n.A.ab.02 ~ env.1.n.A.ab.02$time, permu=5000, method="euclidean") ## <-- table S5
adonis(esv.clr.1.n.A.ab.02 ~ env.1.n.A.ab.02$time, permu=5000, method="euclidean") ## room temp vs immediately frozen
adonis(esv.clr.1.n.B.ab.02 ~ env.1.n.B.ab.02$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.C.ab.02 ~ env.1.n.C.ab.02$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.A.ac.02 ~ env.1.n.A.ac.02$time, permu=5000, method="euclidean") ## chilled vs immediately frozen
adonis(esv.clr.1.n.B.ac.02 ~ env.1.n.B.ac.02$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.C.ac.02 ~ env.1.n.C.ac.02$time, permu=5000, method="euclidean")

## can differences be detected in post 5 hour samples?
adonis(esv.clr.1.n.A.ab.05 ~ env.1.n.A.ab.05$time, permu=5000, method="euclidean") ## room temp vs immediately frozen
adonis(esv.clr.1.n.B.ab.05 ~ env.1.n.B.ab.05$time, permu=5000, method="euclidean") # p<0.05
adonis(esv.clr.1.n.C.ab.05 ~ env.1.n.C.ab.05$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.A.ac.05 ~ env.1.n.A.ac.05$time, permu=5000, method="euclidean") ## chilled vs immediately frozen
adonis(esv.clr.1.n.B.ac.05 ~ env.1.n.B.ac.05$time, permu=5000, method="euclidean") # p<0.05
adonis(esv.clr.1.n.C.ac.05 ~ env.1.n.C.ac.05$time, permu=5000, method="euclidean")
## can differences be detected in post 24 hour samples?
adonis(esv.clr.1.n.A.ab.024 ~ env.1.n.A.ab.024$time, permu=5000, method="euclidean") ## room temp vs immediately frozen
adonis(esv.clr.1.n.B.ab.024 ~ env.1.n.B.ab.024$time, permu=5000, method="euclidean") # p<0.05
adonis(esv.clr.1.n.C.ab.024 ~ env.1.n.C.ab.024$time, permu=5000, method="euclidean")
adonis(esv.clr.1.n.A.ac.024 ~ env.1.n.A.ac.024$time, permu=5000, method="euclidean") ## chilled vs immediately frozen
adonis(esv.clr.1.n.B.ac.024 ~ env.1.n.B.ac.024$time, permu=5000, method="euclidean") # p<0.05
adonis(esv.clr.1.n.C.ac.024 ~ env.1.n.C.ac.024$time, permu=5000, method="euclidean")

# Which taxa are enriched at the later time points (2, 5 and 24 h) for the ice box and RT conditions?
# Subset to get immediately frozen + room temp (2, 5, 24 h) and immediately frozen + ice box (2, 5, 24 h) samples.

## GLM for room temp vs immediately frozen:
# subject A
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.A.ab)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.A.ab[,i] ~ env.1.n.A.ab$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
## BH adjustment is applied across all ESVs within each time contrast
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.A.ab[env.1.n.A.ab$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.A.ab[env.1.n.A.ab$time=="b",]) # proportions in the RT t=2 h samples
colMeans(esv.1.n.A.ab[env.1.n.A.ab$time=="c",]) # proportions in the RT t=5 h samples
colMeans(esv.1.n.A.ab[env.1.n.A.ab$time=="d",]) # proportions in the RT t=24 h samples

# subject B
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.B.ab)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.B.ab[,i] ~ env.1.n.B.ab$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.B.ab[env.1.n.B.ab$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.B.ab[env.1.n.B.ab$time=="b",]) # proportions in the RT t=2 h samples
colMeans(esv.1.n.B.ab[env.1.n.B.ab$time=="c",]) # proportions in the RT t=5 h samples
colMeans(esv.1.n.B.ab[env.1.n.B.ab$time=="d",]) # proportions in the RT t=24 h samples

# subject C
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.C.ab)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.C.ab[,i] ~ env.1.n.C.ab$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.C.ab[env.1.n.C.ab$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.C.ab[env.1.n.C.ab$time=="b",]) # proportions in the RT t=2 h samples
colMeans(esv.1.n.C.ab[env.1.n.C.ab$time=="c",]) # proportions in the RT t=5 h samples
colMeans(esv.1.n.C.ab[env.1.n.C.ab$time=="d",]) # proportions in the RT t=24 h samples
## GLM for ice box vs immediately frozen:
# subject A
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.A.ac)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.A.ac[,i] ~ env.1.n.A.ac$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.A.ac[env.1.n.A.ac$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.A.ac[env.1.n.A.ac$time=="b",]) # proportions in the icebox t=2 h samples
colMeans(esv.1.n.A.ac[env.1.n.A.ac$time=="c",]) # proportions in the icebox t=5 h samples
colMeans(esv.1.n.A.ac[env.1.n.A.ac$time=="d",]) # proportions in the icebox t=24 h samples

# subject B
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.B.ac)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.B.ac[,i] ~ env.1.n.B.ac$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.B.ac[env.1.n.B.ac$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.B.ac[env.1.n.B.ac$time=="b",]) # proportions in the icebox t=2 h samples
colMeans(esv.1.n.B.ac[env.1.n.B.ac$time=="c",]) # proportions in the icebox t=5 h samples
colMeans(esv.1.n.B.ac[env.1.n.B.ac$time=="d",]) # proportions in the icebox t=24 h samples

# subject C
p_vals <- NULL
for (i in 1:ncol(esv.clr.1.n.C.ac)){
  ## the frozen samples are the t=0 h group, so storage group need not be specified here
  x <- summary(glm(esv.clr.1.n.C.ac[,i] ~ env.1.n.C.ac$time, family="gaussian"))
  h2  <- x$coefficients[2,4]
  h5  <- x$coefficients[3,4]
  h24 <- x$coefficients[4,4]
  p_vals <- rbind(p_vals, c(h2=h2, h5=h5, h24=h24))
}
apply(p_vals, 2, p.adjust, method="BH") ## <- table s5
colMeans(esv.1.n.C.ac[env.1.n.C.ac$time=="a",]) # proportions in the frozen t=0 h samples
colMeans(esv.1.n.C.ac[env.1.n.C.ac$time=="b",]) # proportions in the icebox t=2 h samples
colMeans(esv.1.n.C.ac[env.1.n.C.ac$time=="c",]) # proportions in the icebox t=5 h samples
colMeans(esv.1.n.C.ac[env.1.n.C.ac$time=="d",]) # proportions in the icebox t=24 h samples
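## The six GLM loops above differ only in their input matrices; a small wrapper
## (hypothetical, not part of the original analysis) captures the shared pattern:
time_glm_pvals <- function(clr_mat, env_df){
  p_raw <- t(sapply(seq_len(ncol(clr_mat)), function(i){
    x <- summary(glm(clr_mat[, i] ~ env_df$time, family = "gaussian"))
    c(h2 = x$coefficients[2, 4], h5 = x$coefficients[3, 4], h24 = x$coefficients[4, 4])
  }))
  rownames(p_raw) <- colnames(clr_mat)
  apply(p_raw, 2, p.adjust, method = "BH")  # BH across ESVs within each contrast
}
## e.g. time_glm_pvals(esv.clr.1.n.A.ac, env.1.n.A.ac)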
## 5. Does collection from different sites of the same stool make a difference?
adonis(esv.clr.1.A.a ~ site, data=env.1.A.a, method="euclidean", permu=5000) ## <--- table S6
adonis(esv.clr.1.B.a ~ site, data=env.1.B.a, method="euclidean", permu=5000) ## <--- table S6
adonis(esv.clr.1.C.a ~ site, data=env.1.C.a, method="euclidean", permu=5000) ## <--- table S6
# yes! but only very slightly.

## If we combine all three donors, inter-donor variability overwhelms intra-sample variability:
rbind(esv.clr.1.A.a, esv.clr.1.B.a, esv.clr.1.C.a) -> esv.clr.1.a
rbind(env.1.A.a, env.1.B.a, env.1.C.a) -> env.1.a
adonis(esv.clr.1.a ~ subject*site, data=env.1.a, strata=env.1.a$subject, method="euclidean", permu=5000) ## <--- table S4

rda(esv.clr.1.A.a) -> esv.clr.1.A.a.pca
plot(esv.clr.1.A.a.pca, type='n') ## <--- Fig S4
points(esv.clr.1.A.a.pca, pch=21, cex=2, bg=env.1.A.a$site)
text(esv.clr.1.A.a.pca, labels=env.1.A.a$site, cex=3)
rda(esv.clr.1.A.a ~ env.1.A.a$site) -> esv.clr.1.A.a.rda
permutest(esv.clr.1.A.a.rda, permu=2000)

rda(esv.clr.1.B.a) -> esv.clr.1.B.a.pca
plot(esv.clr.1.B.a.pca, type='n') ## <--- Fig S4
points(esv.clr.1.B.a.pca, pch=21, cex=2, bg=env.1.B.a$site)
text(esv.clr.1.B.a.pca, labels=env.1.B.a$site, cex=3)
rda(esv.clr.1.B.a ~ env.1.B.a$site) -> esv.clr.1.B.a.rda
permutest(esv.clr.1.B.a.rda, permu=2000)

rda(esv.clr.1.C.a) -> esv.clr.1.C.a.pca
plot(esv.clr.1.C.a.pca, type='n') ## <--- Fig S4
points(esv.clr.1.C.a.pca, pch=21, cex=2, bg=env.1.C.a$site)
text(esv.clr.1.C.a.pca, labels=env.1.C.a$site, cex=3)
rda(esv.clr.1.C.a ~ env.1.C.a$site) -> esv.clr.1.C.a.rda
permutest(esv.clr.1.C.a.rda, permu=2000)

## 6. Power calculations
read.table("esv_table_counts_nochlor - power.txt", header=T, row.names=1, sep='\t') -> tmp
## the counts file above has been processed as follows:
# 1. chloroplast sequences removed
# 2. ESVs with fewer than 20 reads across all samples removed
tmp

ncol(env) ## read_count (column 11) was added above and is simply recomputed below for the new counts table
env[,11] <- NA
colSums(tmp[,-193]) -> env[,11]
names(env)[names(env)=="V11"] <- "read_count"
subset(t(tmp[,-193]), env$read_count > 999) -> tmp
env[env$read_count>999,] -> env ## filter away samples with less than 1000 reads
# and make sure rownames are the same in both ESV table and env files
rownames(tmp)==rownames(env)

## REMOVE THE ENRICHED SAMPLES!
tmp[-4:-6,] -> tmp
env[-4:-6,] -> env

subset(tmp, env$subject_visit=="1") -> counts ## just get the first-visit samples
subset(counts, env.1$norgen=="y") -> counts.y
subset(counts, env.1$norgen=="n") -> counts.n

library(HMP) ## DM.MoM and MC.Xdc.statistics come from the HMP package
DM.MoM(counts.n) -> fit.counts.n
DM.MoM(counts.y) -> fit.counts.y
numMC <- 1000
nrow(counts.n)
nrow(counts.y)
nrsGrp1 <- rep(2000, 81) ## 81 no-Norgen samples, 2000 reads each
nrsGrp2 <- rep(2000, 36) ## 36 Norgen samples, 2000 reads each
group.Nrs <- list(nrsGrp1, nrsGrp2)

### Computing Power of the test statistics (Type II error)
alphap <- rbind(fit.counts.n$gamma, fit.counts.y$gamma)
pwr <- MC.Xdc.statistics(group.Nrs, numMC, alphap, "ha")
pwr
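## Hypothetical extension (not in the original analysis): repeat the same power
## calculation over a range of per-sample read depths, using the identical
## MC.Xdc.statistics() call and only changing the library sizes:
depths <- c(1000, 2000, 5000)
pwr.by.depth <- sapply(depths, function(d){
  MC.Xdc.statistics(list(rep(d, nrow(counts.n)), rep(d, nrow(counts.y))), numMC, alphap, "ha")
})
names(pwr.by.depth) <- paste0(depths, " reads")
pwr.by.depth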