#------------------------------------------------------ #Supporting Information #Estimating and Comparing Microbial Diversity in the Presence of Sequencing Errors #Chun-Huo Chiu and Anne Chao #Institute of Statistics, National Tsing Hua University, Hsin-Chu, Taiwan, 30043 #Supplemental Text S1: R codes for obtaining estimators of Hill numbers #------------------------------------------------------ # R scripts for computing diversity (Hill numbers) profile using individual-based abundance data or sampling-unit-based incidence data. # In all functions, param x is a vector of species sample frequencies (for abundance data) or incidence-based sample frequencies (for incidence data). # For incidence data, the first entry of x must be the number of sampling units. # In all functions, param q is the diversity order; the suggested range for q is [0, 3]. #----------------------------------------------- # Diversity profile estimator (abundance data) #----------------------------------------------- #' Chao_Hill_abu(x, q) is a function of obtaining estimators of Hill numbers of order q based on abundance data. #' @param x a vector of species sample frequencies. #' @param q a numeric or a vector of diversity order; The suggested range for q is [0, 3]. #' @return a numerical vector of diversity. Chao_Hill_abu = function(x,q){ x = x[x>0] n = sum(x) f1 = sum(x==1) f2 = sum(x==2) p1 = ifelse(f2>0,2*f2/((n-1)*f1+2*f2),ifelse(f1>0,2/((n-1)*(f1-1)+2),1)) r <- 0:(n-1) Sub <- function(q){ if(q==0){ sum(x>0) + (n-1)/n*ifelse(f2>0, f1^2/2/f2, f1*(f1-1)/2) } else if(q==1){ #UE <- sum(x/n*(digamma(n)-digamma(x))) #A = 1 - ifelse(f2 > 0, (n-1)*f1/((n-1)*f1+2*f2), (n-1)*f1/((n-1)*f1+2)) #B = f1/n*(1-A)^(-n+1)*(-log(A)-sum(sapply(1:(n-1),function(k){1/k*(1-A)^k}))) #exp(UE+B) A <- sum(x/n*(digamma(n)-digamma(x))) B <- ifelse(f1==0||p1==1,0,f1/n*(1-p1)^(1-n)*(-log(p1)-sum(sapply(1:(n-1), function(r)(1-p1)^r/r)))) exp(A+B) }else if(abs(q-round(q))==0){ A <- sum(exp(lchoose(x,q)-lchoose(n,q))) ifelse(A==0,NA,A^(1/(1-q))) }else { sort.data = sort(unique(x)) tab = table(x) term = sapply(sort.data,function(z){ k=0:(n-z) sum(choose(k-q,k)*exp(lchoose(n-k-1,z-1)-lchoose(n,z))) }) A = sum(tab*term) B = ifelse(f1==0|p1==1,0,f1/n*(1-p1)^(1-n)*(p1^(q-1)-sum(choose(q-1,r)*(p1-1)^r))) (A+B)^(1/(1-q)) } } sapply(q, Sub) } #----------------------------------------------- # Diversity profile estimator (incidence data) #----------------------------------------------- #' Chao_Hill_inc(x, q) is a function of obtaining estimators of Hill numbers of order q based on incidence data. #' @param x a vector of species incidence-based sample frequencies. The first entry of x must be the number of sampling units. #' @param q a numeric or a vector of diversity order. #' @return a numerical vector. Chao_Hill_inc = function(x,q){ n = x[1] x = x[-1];x = x[x>0] U = sum(x) f1 = sum(x==1) f2 = sum(x==2) p1 = ifelse(f2>0,2*f2/((n-1)*f1+2*f2),ifelse(f1>0,2/((n-1)*(f1-1)+2),1)) r <- 0:(n-1) Sub <- function(q){ if(q==0){ sum(x>0) + (n-1)/n*ifelse(f2>0, f1^2/2/f2, f1*(f1-1)/2) } else if(q==1){ A <- sum(x/U*(digamma(n)-digamma(x))) B <- ifelse(f1==0|p1==1,0,f1/U*(1-p1)^(1-n)*(-log(p1)-sum(sapply(1:(n-1), function(r)(1-p1)^r/r)))) exp(A+B)*U/n }else if(abs(q-round(q))==0){ A <- sum(exp(lchoose(x,q)-lchoose(n,q))) ifelse(A==0,NA,((n/U)^q*A)^(1/(1-q))) }else { sort.data = sort(unique(x)) tab = table(x) term = sapply(sort.data,function(z){ k=0:(n-z) sum(choose(k-q,k)*exp(lchoose(n-k-1,z-1)-lchoose(n,z))) }) A = sum(tab*term) B = ifelse(f1==0|p1==1,0,f1/n*(1-p1)^(1-n)*(p1^(q-1)-sum(choose(q-1,r)*(p1-1)^r))) ((n/U)^q*(A+B))^(1/(1-q)) } } sapply(q, Sub) } #' Chao_Hill(x, q,datatype) combines Chao_Hill_abu and Chao_Hill_inc given a specified datatype (either abundance data or incidence data). #' @param x a vector of species sample frequencies (for abundance data), or incidence-based sample frequencies (1st entry must be the number of sampling unit). #' @param q a numeric or a vector of diversity order. #' @param datatype a character of data type,"abundance" or "incidence". #' @return a numerical vector. Chao_Hill = function(x,q,datatype = c("abundance","incidence")){ datatype = match.arg(datatype,c("abundance","incidence")) if(datatype == "abundance"){ est = Chao_Hill_abu(x,q) }else{ est = Chao_Hill_inc(x,q) } return(est) } #----------------------- # The empirical profile #----------------------- #' Hill(x, q, datatype) is a function of obtaining the empirical Hill numbers of order q based on abundance data or incidence data. #' @param x a vector of species sample frequencies (for abundance data), or incidence-based sample frequencies (1st entry must be the number of sampling unit). #' @param q a numeric or a vector of diversity order. #' @param datatype a character of data type,"abundance" or "incidence". #' @return a numerical vector. Hill <- function(x,q,datatype = c("abundance","incidence")){ if(datatype=="incidence"){x = x[-1]} p <- x[x>0]/sum(x) Sub <- function(q){ if(q==0) sum(p>0) else if(q==1) exp(-sum(p*log(p))) else exp(1/(1-q)*log(sum(p^q))) } sapply(q, Sub) } #----------------------------------------- # The bootstrap method for obtaining s.e. #----------------------------------------- #' Bt_prob_abu(x) is a function of estimating the species probabilities in the bootstrap assemblage based on abundance data. #' @param x a vector of species sample frequencies. #' @return a numeric vector. Bt_prob_abu = function(x){ x = x[x>0] n = sum(x) f1 = sum(x==1) f2 = sum(x==2) C = 1 - f1/n*ifelse(f2>0,(n-1)*f1/((n-1)*f1+2*f2),ifelse(f1>0,(n-1)*(f1-1)/((n-1)*(f1-1)+2),0)) W = (1-C)/sum(x/n*(1-x/n)^n) p.new = x/n*(1-W*(1-x/n)^n) f0 = ceiling(ifelse(f2>0,(n-1)/n*f1^2/(2*f2),(n-1)/n*f1*(f1-1)/2)) p0 = (1-C)/f0 p.new=c(p.new,rep(p0,f0)) return(p.new) } #' Bt_prob_inc(x) is a function of estimating the species incidence probabilities in the bootstrap assemblage based on incidence data. #' @param x a vector of incidence-based sample frequencies (1st entry must be the number of sampling unit). #' @return a numeric vector. Bt_prob_inc = function(x){ n = x[1] x = x[-1] U = sum(x) f1 = sum(x==1) f2 = sum(x==2) A = ifelse(f2>0,2*f2/((n-1)*f1+2*f2),ifelse(f1>0,2/((n-1)*(f1-1)+2),1)) C=1-f1/U*(1-A) W=U/n*(1-C)/sum(x/n*(1-x/n)^n) p.new=x/n*(1-W*(1-x/n)^n) f0 = ceiling(ifelse(f2>0,(n-1)/n*f1^2/(2*f2),(n-1)/n*f1*(f1-1)/2)) p0=U/n*(1-C)/f0 p.new=c(p.new,rep(p0,f0)) return(p.new) } #' Bt_prob(x,datatype) combines the two functions Bt_prob_abu and Bt_prob_inc for a specified datatype. #' @param x a vector of species sample frequencies (for abundance data), or incidence-based sample frequencies (1st entry must be the number of sampling unit). #' @param datatype a character of data type,"abundance" or "incidence". #' @return a numeric vector. Bt_prob = function(x,datatype = c("abundance","incidence")){ datatype = match.arg(datatype,c("abundance","incidence")) if(datatype == "abundance"){ prob = Bt_prob_abu(x) }else{ prob = Bt_prob_inc(x) } return(prob) } #' Bootstrap.CI(x,q,B,datatype,conf) is a function of calculating the bootsrapping standard error based on abundance data or incidence data. #' @param x a vector of species sample frequencies (for abundance data) or incidence-based sample frequencies (1st entry must be the number of sampling unit). #' @param q a numeric or a vector of diversity order. #' @param B an integer to specify the number of replications in the bootstrap procedure, B = 1000 is suggested for constructing confidence intervals; #' To save running time, use a smaller value (e.g. B = 200).. #' @param datatype a character of data type,"abundance" or "incidence". #' @param conf a confidence coefficient between 0 and 1. #' @return a list, consisting of 3 matrices including respectively the difference between the average and lower confidence bound of the B bootstrap estimates, #' the difference between the upper confidence bound and the average of the B bootstrap estimates, and the bootstrap standard error of the diversity estimate. #' In each matrix, the first row gives the results for the empirical diversity, and the second row gives the results for the proposed diversity estimates. #' Columns give the results for different orders of q. Bootstrap.CI = function(x,q,B = 1000,datatype = c("abundance","incidence"),conf = 0.95){ datatype = match.arg(datatype,c("abundance","incidence")) p.new = Bt_prob(x,datatype) n = ifelse(datatype=="abundance",sum(x),x[1]) # set.seed(456) if(datatype=="abundance"){ data.bt = rmultinom(B,n,p.new) I1=which(data.bt==1);data.bt[I1]=rep(1,length(I1)); F2=sapply(1:B,function(k) sum(data.bt[,k]==2)); F3=sapply(1:B,function(k) sum(data.bt[,k]==3)); F4=sapply(1:B,function(k) sum(data.bt[,k]==4)); F1=sapply(1:B,function(k) ifelse(F3[k]*F4[k]>0, 4*F2[k]^2/(3*F3[k])-F2[k]*F3[k]/(2*F4[k]), 4*F2[k]^2/(3*(F3[k]+1))-F2[k]*F3[k]/(2*(F4[k]+1)))); F1=round(F1);Max=max(F1);F1.Mat=matrix(0,ncol=B,nrow=Max); sapply(1:B,function(k) F1.Mat[,k][1:length(F1[k])]=rep(1,length(F1[k])) ) data.bt=rbind(data.bt,F1.Mat); }else{ data.bt = rbinom(length(p.new)*B,n,p.new) data.bt = matrix(data.bt,ncol=B) data.bt = rbind(rep(n,B),data.bt) } mle = apply(data.bt,2,function(x)Hill(x,q,datatype)) pro = apply(data.bt,2,function(x)Chao_Hill(x,q,datatype)) mle.mean = rowMeans(mle) pro.mean = rowMeans(pro) LCI.mle = -apply(mle,1,function(x)quantile(x,probs = (1-conf)/2)) + mle.mean UCI.mle = apply(mle,1,function(x)quantile(x,probs = 1-(1-conf)/2)) - mle.mean LCI.pro = -apply(pro,1,function(x)quantile(x,probs = (1-conf)/2)) + pro.mean UCI.pro = apply(pro,1,function(x)quantile(x,probs = 1-(1-conf)/2)) - pro.mean LCI = rbind(LCI.mle,LCI.pro) UCI = rbind(UCI.mle,UCI.pro) sd.mle = apply(mle,1,sd) sd.pro = apply(pro,1,function(x)sd(x,na.rm = T)) se = rbind(sd.mle,sd.pro) return(list(LCI=LCI,UCI=UCI,se=se)) } #---------------------- # singleton estimator #---------------------- #' singleton.Est(x) is a function of obtaining the estimator of singleton based on corrected data #' @param x a vector of species sample frequencies #' @return singleton estimate and a numerical vector of corrected data. Chao_Hill = function(x,q,datatype = c("abundance","incidence")){ datatype = match.arg(datatype,c("abundance","incidence")) if(datatype == "abundance"){ est = Chao_Hill_abu(x,q) }else{ est = Chao_Hill_inc(x,q) } return(est) } singleton.Est=function(dat,datatype = c("abundance","incidence")){ datatype = match.arg(datatype,c("abundance","incidence")) if (is.matrix(dat) == T || is.data.frame(dat) == T){ if (ncol(dat) != 1 & nrow(dat) != 1) stop("Error: The data format is wrong.") if (ncol(dat) == 1){ dat <- dat[, 1] } else { dat <- dat[1, ] } } dat <- as.numeric(dat) if(datatype == "abundance"){ f2 <- sum(dat == 2) #doubleton f3 <- sum(dat == 3) f4 <- sum(dat == 4) f1= ifelse(f3*f4>0, 4*f2^2/(3*f3)-f2*f3/(2*f4), 4*f2^2/(3*(f3+1))-f2*f3/(2*(f4+1))) I=which(dat==1);dat=dat[-I]; dat=c(dat,rep(1,round(f1))); dat=dat[dat>0]; }else{ N=dat[1];dat=dat[-1]; f2 <- sum(dat == 2) #doubleton f3 <- sum(dat == 3) f4 <- sum(dat == 4) f1= ifelse(f3*f4>0, 4*f2^2/(3*f3)-f2*f3/(2*f4), 4*f2^2/(3*(f3+1))-f2*f3/(2*(f4+1))) I=which(dat==1);dat=dat[-I]; dat=c(dat,rep(1,round(f1))); dat=dat[dat>0]; dat=c(N,dat); } return(list(singleton.est=f1,corrected.data=dat)) } #------------------------ # Main function ChaoHill #------------------------ #' ChaoHill(dat, datatype, from, to, interval, B, conf) is the function of calculating the empirical and the proposed diversity profile, #' their bootsrap standard errors and confidance intervals. #' @param dat a vector of species sample frequencies (for abundance data), or incidence-based sample frequencies (1st entry must be the number of sampling unit). #' @param datatype a character of data type,"abundance" or "incidence". #' @param from a numeric number of diversity order q (the start order of profile). #' @param to a numeric number of diversity order q (the end order of profile). #' @param interval a numeric number to specify each increment of q from the start to end order. #' @param B an integer to specify the number of bootstrap replications, B = 1000 is suggested for constructing confidence intervals; #' To save running time, use a smaller value (e.g. B = 200). #' @param conf a confidence coefficient between 0 and 1. #' @return a list, consisting of 4 matrices including respectively diversity estimates, bootstrap standard errors, lower confidence bounds, and upper confidence bounds. #' In each matrix, the first row gives the results for the empirical diversity, and the second row gives the results for the proposed diversity estimates. #' Columns give the results for different orders of q. singleton.ChaoHill <- function(dat, datatype=c("abundance", "incidence"), from=0, to=3, interval=0.1, B=1000, conf=0.95){ datatype = match.arg(datatype,c("abundance","incidence")) # for real data estimation if (is.matrix(dat) == T || is.data.frame(dat) == T){ if (ncol(dat) != 1 & nrow(dat) != 1) stop("Error: The data format is wrong.") if (ncol(dat) == 1){ dat <- dat[, 1] } else { dat <- dat[1, ] } } dat <- as.numeric(dat) #------------------ # correct singleton and raw data #------------------ temp=singleton.Est(dat,datatype); f1=temp$singleton.est; dat=temp$corrected.data #----------------------- q <- seq(from, to, by=interval) #------------- #Estimation #------------- MLE=Hill(dat,q,datatype) qD_pro=Chao_Hill(dat,q,datatype) qD012=Chao_Hill(dat,q=c(0,1,2),datatype); CI_bound = Bootstrap.CI(dat,q,B,datatype,conf) se = CI_bound$se #------------------- #Confidence interval #------------------- tab.est=data.frame(rbind(MLE,qD_pro)) LCI <- tab.est - CI_bound$LCI UCI <- tab.est + CI_bound$UCI colnames(tab.est) <- colnames(se) <- colnames(LCI) <- colnames(UCI) <- paste("q = ", q, sep="") rownames(tab.est) <- rownames(se) <- rownames(LCI) <- rownames(UCI) <- c("Observed", "Chao_2013") return(list(Singleton.est=f1, EST = tab.est, SD = se, LCI = LCI,UCI = UCI)) } ######Example: Swine-feces data of Table 1 in the main text # run all R functions above. fi=c(8025,605,129,41,16,8,4,2,1,1,1) # OTU frequency counts of swine-feces data data=rep(1:length(fi),fi) # transform frequency counts data to species abundances output=singleton.ChaoHill(data,"abundance",from=0,to=2,interval=1,B=200); #estimates of Hill numbers (order q= 0, 1, 2) and corresponding 95% confidence interval output $Singleton.est [1] 2831.436 $EST q = 0 q = 1 q = 2 Observed 3639.00 3250.379 2741.880 Chao_2013 10261.22 9081.061 6404.025 $SD q = 0 q = 1 q = 2 Observed 24.5368 33.72911 54.18838 Chao_2013 278.8550 178.56957 198.79723 $LCI q = 0 q = 1 q = 2 Observed 3591.840 3181.323 2630.539 Chao_2013 9696.206 8736.525 6006.244 $UCI q = 0 q = 1 q = 2 Observed 3682.89 3304.304 2837.815 Chao_2013 10814.14 9427.188 6768.070 ##### Rarefaction and Extrapolation of Hill numbers #First correct the raw singleton count by running singleton.Est function. data=singleton.Est(data,"abundance")$corrected.data; #Then, using R package "iNEXT" to plot the rarefaction and extrapolation curve as shown in Fig. 2 and Fig. 3.