rm(list=ls()) library(mvabund) library(vegan) library(boot) library(formula.tools) library(spdep) library(adespatial) set.seed(12345) reps <- 1 #calculate Moran's I for difference between true and estimated values of a quantitative variable across a set of sites #Arguments: #X is sites x sites array of all spatial explanatory variables: vector of 1s and all the MEMs #Best is estimated coefficients, with order matching X (zeros for those MEMs not selected) #Z.sim is "true" values on linear predictor scale (i.e. those used to generate the simulated data) #listw is spatial weight matrix #Value: vector of Moran's I for error in linear predictor for each taxon errorMoranIfromlperror <- function(X, Best, Z.sim, listw){ lperror <- X %*% Best - Z.sim nsites <- dim(lperror)[1] nspp <- dim(lperror)[2] moranI <- numeric(nspp) for(i in 1:nspp){ if(any(is.infinite(lperror[, i]))){ v <- numeric(nsites) v[lperror[, i] == Inf] <- 1 v[lperror[, i] == -Inf] <- -1 } else { v <- lperror[, i] } moranI[i] <- moran(x = v, n = nsites, listw = listw, S0 = Szero(listw))$I } return(moranI) } rda.func <- function(spe.mat, pcnm.scale, X, listw, Y.sim){ spe.hel <- decostand(spe.mat, method = "hellinger") Yhel <- decostand(Y.sim, method = "hellinger") #transform true values in same way as sampled values sim.rda2.null <- rda(spe.hel ~ 1, data = pcnm.scale) sim.rda2 <- rda(spe.hel ~ ., data = pcnm.scale) if(anova(sim.rda2)$Pr[1] > .05){ output.rda2 <- array(data=FALSE, dim = c(1, ncol(pcnm.scale))) adj.R2 <- 0 } else { rda.selec <- ordiR2step(object = sim.rda2.null, scope = formula(sim.rda2), direction = "forward") output.rda2 <- array(data=FALSE, dim = c(1, ncol(pcnm.scale))) output.rda2[which(names(pcnm.scale) %in% labels(terms(formula(rda.selec))))] <- TRUE if(sum(output.rda2) == 0) adj.R2 <- 0 #no explanatory variables selected else adj.R2 <- RsquareAdj(rda.selec)$adj.r.squared } B.est.rda <- matrix(data = 0, ncol = ncol(spe.mat), nrow =ncol(pcnm.scale)) if(anova(sim.rda2)$Pr[1] <= .05 & sum(output.rda2 > 0)){#model with all explanatory variables significant, and we selected at least one variable in forward selection myqr <- qr(rda.selec) myrdacoef <- qr.coef(myqr, y = spe.hel) #coefficients for multivariate regression of spe.hel on the explanatory variables, see ?qr in vegan selecvar.names <- rhs.vars(formula(rda.selec)) whichcoefs <- as.numeric(gsub("[^0-9]", "", selecvar.names)) B.est.rda[whichcoefs, ] <- myrdacoef } MoranI <- errorMoranIfromlperror(X = X[, -1], Best = B.est.rda, Z.sim = Yhel, listw = listw) #X[, -1] because RDA has no intercepts return(list(output.rda2= output.rda2, adj.R2= adj.R2, B.estim.rda = B.est.rda, MoranI = MoranI )) } glm.func <- function(spe.mat,pcnm.scale, listw, X, Z.sim){ glm.null <- manyglm(spe.mat~1,family="binomial") attach(pcnm.scale) #do this so we can use all the names in scope without hard coding glm.null <- manyglm(spe.mat ~ 1, family = "binomial") add.glm2 <- add1(glm.null, scope = names(pcnm.scale)) done <- FALSE current.glm2 <- glm.null while(done == FALSE){ add.glm2 <- add1(current.glm2, scope = names(pcnm.scale)) if(which.min(add.glm2$AIC) == 1){ done <- TRUE } else { select.var2 <- rownames(add.glm2)[which.min(add.glm2$AIC)] current.glm2 <- update(current.glm2,paste("~.+",select.var2)) if(length(coef(current.glm2)) == length(coef(glm.pcnm))) done <- TRUE } } detach(pcnm.scale) #calculating D-value for final model fit.final <- fitted.values(current.glm2) prob.pres.final <- mean(fit.final[spe.mat == 1]) prob.abs.final <- mean(fit.final[spe.mat == 0]) D.value <- prob.pres.final - prob.abs.final output.glm <- array(data=FALSE,dim=c(ncol(pcnm.scale))) selecvar.names <- rhs.vars(formula(current.glm2)) B.est.glm <- matrix(data = 0, ncol = ncol(spe.mat), nrow = ncol(pcnm.scale) + 1) whichcoefs <- as.numeric(gsub("[^0-9]", "", selecvar.names)) output.glm[whichcoefs] <- TRUE B.est.glm[c(1, whichcoefs + 1), ] <- coef(current.glm2) MoranI <- errorMoranIfromlperror(X = X, Best = B.est.glm, Z.sim = Z.sim, listw = listw) return(list(output.glm = output.glm, D.value = D.value, B_est_glm = B.est.glm[-1, ] , MoranI = MoranI)) #we don't need the intercepts later, hence B.est.glm[-1, ] } plot.simprob <- function(X,sim.spe,i){ plot(X[,i],sim.spe[,1],ylim=c(0,1)) for(j in 2:dim(sim.spe)[2]){ points(X[,i],sim.spe[,j]) } } getscores <- function(tru, obs){ counta <- sum(obs == FALSE & tru == FALSE) countb <- sum(obs == FALSE & tru == TRUE) countc <- sum(obs == TRUE & tru == FALSE) countd <- sum(obs == TRUE & tru == TRUE) countt <- length(obs) scorecorrect <- (counta + countd)/countt scoretypeI <- countc/countt scoretypeII <- countb/countt return(list(scorecorrect = scorecorrect, scoretypeI = scoretypeI, scoretypeII = scoretypeII)) } sites.xy <- read.table(file.choose()) #uploading latlong coordinates for MEMs algae <- read.table(file.choose()) #original species data (here we use dataset A as an example, see below) #MEM from Minimum Spanning Tree for dataset A (BIG marine macroalgae data) mintree_nb <- mst.nb(dist(sites.xy)) #object nb for minimum spanning tree of our sample sites dist_tree <- nbdists(mintree_nb,as.matrix(sites.xy)) fdist <- lapply(dist_tree, function(x) 1-x/max(dist(as.matrix(sites.xy)))) listwmst <- nb2listw(mintree_nb,glist = fdist,style = "B") mem.mst <- mem(listwmst) #selecting MEM variables with significant positive autocorrelation moran_tests <- moran.randtest(mem.mst,listwmst,999) select <- which(moran_tests$pvalue < .05) #in case you want to see what the selected spatial variables look like plot(mem.mst[,select], SpORcoords = sites.xy, nb = mintree_nb) sites.pcnm.pos.scale <- as.data.frame(scale(mem.mst[,select])) #Using manyglm (pckg mvabund) to gather real coefficients drawn from GLMs glm.pcnm <- manyglm(as.matrix(algae)~.,data=sites.pcnm.pos.scale,family="binomial") X <- glm.pcnm$x #Our explanatory variables B <- coef(glm.pcnm) #matrix B with real coefficients B.sim.temp <- matrix(0,nrow=dim(B)[1],ncol=dim(B)[2]) B.sim.temp[1,] <- B[1,]# intercepts #How many non-zero coefficients ns <- c(0,floor(0.17*length(sites.pcnm.pos.scale)),floor(0.34*length(sites.pcnm.pos.scale)),floor(0.5*length(sites.pcnm.pos.scale)),floor(0.75*length(sites.pcnm.pos.scale)),length(sites.pcnm.pos.scale)) ntotal = (3 * (length(ns) - 2) + 2) * reps real.mod.result <- array(data=NA,dim=c(ntotal,ncol(X)-1)) rda.mod.result <- real.mod.result rda.adjR2 <- numeric(ntotal) glm.mod.result <- real.mod.result glm.dvalue <- numeric(ntotal) conditions <- data.frame(ev=integer(ntotal),scaling=integer(ntotal)) rowcount <- 1 nrb <- nrow(B.sim.temp) B_est_rda_list <- list() B_est_glm_list <- list() errorglm <- list() errorrda <- list() MoranI.rda <- list() MoranI.glm <- list() for(n in ns){ indices <- list(2:(n+1),(nrb-n+1):nrb,c(2:(n/2+1),(nrb-n/2+1):nrb)) #this sets which rows will be filled with non-zero values during simulations if(n == 0) iset=1 else if(n == 1) iset=1:2 else if (n == length(sites.pcnm.pos.scale)) iset=1 else iset=1:3 for(i in iset){ #print(indices[[i]]) #print(beta) #print(iset[i]) for(r in 1:reps){ B.sim <- B.sim.temp #preparing new B matrix from template if( n == 0) B.sim <- B.sim.temp else if((n == 5) && (i == 3)) #when the number of non-zeros to be assigned to scaling 3 is odd, we arbitrarily chose to which position the remaining non-zero coeff would be assigned B.sim[c(2,3,4,16,17),] <- sample(B[2:nrow(B),],replace=TRUE,size=length(i)*ncol(algae)) else B.sim[indices[[i]],] <- sample(B[2:nrow(B),],replace=TRUE,size=length(i)*ncol(algae)) #filling rows of specific indices with non-zero values sampled from original matrix B #print(c(r,n,i)) #print(B.sim[,1]) Z.sim <- X %*% B.sim #estimating probabilities of occurrences from X * B* (lines 161-164) lambda = exp(Z.sim) Y.sim <- 1 - exp(-lambda) sim.binom.pcnm <- rbinom(Y.sim,size=1,prob= 1 - exp(-lambda)) mat.sim.spe.pcnm <- matrix(sim.binom.pcnm,nrow=nrow(algae),ncol=ncol(algae)) #arranging pre/abse in same size as original community rdaoutput <- rda.func(spe.mat=mat.sim.spe.pcnm,pcnm.scale=sites.pcnm.pos.scale, X = X, listw = listwmst, Y.sim = Y.sim) rda.mod.result[rowcount, ] <- rdaoutput$output.rda2 rda.adjR2[rowcount] <- rdaoutput$adj.R2 B_est_rda_list[[rowcount]] <- rdaoutput$B.estim.rda MoranI.rda[[rowcount]] <- rdaoutput$MoranI glmoutput <- glm.func(spe.mat=mat.sim.spe.pcnm,pcnm.scale=sites.pcnm.pos.scale, listw = listwmst, X = X, Z.sim = logit(Y.sim)) #logit(Y.sim) is on linear predictor scale glm.mod.result[rowcount,] <- glmoutput$output.glm glm.dvalue[rowcount] <- glmoutput$D.value B_est_glm_list[[rowcount]] <- glmoutput$B_est_glm MoranI.glm[[rowcount]] <- glmoutput$MoranI conditions[rowcount,] <- c(n,i) real.mod.result[rowcount,] <- abs(B.sim[-1,1])>0 rowcount <- rowcount + 1 } } } #scoring results glm.scores <- rowSums(real.mod.result == glm.mod.result) rda.scores <- rowSums(real.mod.result == rda.mod.result) mean.moran.glm <- unlist(lapply(MoranI.glm, function(x){mean(abs(x), na.rm = TRUE)})) mean.moran.rda <- unlist(lapply(MoranI.rda, function(x){mean(abs(x), na.rm = TRUE)})) table.results <- cbind(conditions,glm.scores,glm.dvalue,mean.moran.glm, rda.scores, rda.adjR2, mean.moran.rda) glmsummaryscores <- getscores(tru = real.mod.result, obs = glm.mod.result) rdasummaryscores <- getscores(tru = real.mod.result, obs = rda.mod.result) #write.table(table.results, sep="\t ",file="rda_glm_resultsalgae_simglm.txt",row.names=FALSE,col.names=TRUE) summaryscores <- cbind(glmsummaryscores, rdasummaryscores) #write.table(summaryscores, file = "summaryscoresalgae_simglm.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE) #write.table(glm.mod.result, file = "glm_mod_resultalgae_simglm.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) #write.table(rda.mod.result, file = "rda_mod_resultalgae_simglm.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE) #write.table(real.mod.result, file = "real_mod_resultalgae_simglm.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)