rm(list=ls())

library(mvabund)
library(vegan)
library(boot)
library(formula.tools)
library(spdep)
library(adespatial)

set.seed(12345)

reps <- 1
#calculate Moran's I for difference between true and estimated values of a quantitative variable across a set of sites
#Arguments:
#X is sites x sites array of all spatial explanatory variables: vector of 1s and all the MEMs
#Best is estimated coefficients, with order matching X (zeros for those MEMs not selected)
#Z.sim is "true" values on linear predictor scale (i.e. those used to generate the simulated data) 
#listw is spatial weight matrix
#Value: vector of Moran's I for error in linear predictor for each taxon
errorMoranIfromlperror <- function(X, Best, Z.sim, listw){ 
  lperror <- X %*% Best - Z.sim
  nsites <- dim(lperror)[1]
  nspp <- dim(lperror)[2]
  moranI <- numeric(nspp)
  for(i in 1:nspp){
    if(any(is.infinite(lperror[, i]))){
      v <- numeric(nsites)
      v[lperror[, i] == Inf] <- 1
      v[lperror[, i] == -Inf] <- -1
    } else {
      v <- lperror[, i]
    }
    moranI[i] <- moran(x = v, n = nsites, listw = listw, S0 = Szero(listw))$I
  }
  return(moranI)
}

rda.func <- function(spe.mat, pcnm.scale, X, listw, Y.sim){
  spe.hel <- decostand(spe.mat, method = "hellinger")
  Yhel <- decostand(Y.sim, method = "hellinger") #transform true values in same way as sampled values
  sim.rda2.null <- rda(spe.hel ~ 1, data = pcnm.scale)
  sim.rda2 <- rda(spe.hel ~ ., data = pcnm.scale)
  if(anova(sim.rda2)$Pr[1] > .05){
    output.rda2 <- array(data=FALSE, dim = c(1, ncol(pcnm.scale)))
    adj.R2 <- 0
  } else {
    rda.selec <- ordiR2step(object = sim.rda2.null, scope = formula(sim.rda2), direction = "forward")
    output.rda2 <- array(data=FALSE, dim = c(1, ncol(pcnm.scale)))
    output.rda2[which(names(pcnm.scale) %in% labels(terms(formula(rda.selec))))] <- TRUE
    if(sum(output.rda2) == 0) adj.R2 <- 0 #no explanatory variables selected
    else adj.R2 <- RsquareAdj(rda.selec)$adj.r.squared
  }
  B.est.rda <- matrix(data = 0, ncol = ncol(spe.mat), nrow =ncol(pcnm.scale))
  if(anova(sim.rda2)$Pr[1] <= .05 & sum(output.rda2 > 0)){#model with all explanatory variables significant, and we selected at least one variable in forward selection
    myqr <- qr(rda.selec)
    myrdacoef <- qr.coef(myqr, y = spe.hel) #coefficients for multivariate regression of spe.hel on the explanatory variables, see ?qr in vegan
    selecvar.names <- rhs.vars(formula(rda.selec))
    whichcoefs <- as.numeric(gsub("[^0-9]", "", selecvar.names))
    B.est.rda[whichcoefs, ] <- myrdacoef
  }
  MoranI <- errorMoranIfromlperror(X = X[, -1], Best = B.est.rda, Z.sim = Yhel, listw = listw) #X[, -1] because RDA has no intercepts
  return(list(output.rda2= output.rda2, adj.R2= adj.R2, B.estim.rda = B.est.rda, MoranI = MoranI ))
}


glm.func <- function(spe.mat,pcnm.scale, listw, X, Z.sim){
  glm.null <- manyglm(spe.mat~1,family="binomial")
  attach(pcnm.scale) #do this so we can use all the names in scope without hard coding
  glm.null <- manyglm(spe.mat ~ 1, family = "binomial")
  add.glm2 <- add1(glm.null, scope = names(pcnm.scale))
  done <- FALSE
  current.glm2 <- glm.null
  while(done == FALSE){
    add.glm2 <- add1(current.glm2, scope = names(pcnm.scale))
    if(which.min(add.glm2$AIC) == 1){
      done <- TRUE
    } else {
      select.var2 <- rownames(add.glm2)[which.min(add.glm2$AIC)]
      current.glm2 <- update(current.glm2,paste("~.+",select.var2))
      if(length(coef(current.glm2)) == length(coef(glm.pcnm)))
        done <- TRUE
    }
  }
  detach(pcnm.scale)
  #calculating D-value for final model
  fit.final <- fitted.values(current.glm2)
  prob.pres.final <- mean(fit.final[spe.mat == 1])
  prob.abs.final <- mean(fit.final[spe.mat == 0])
  D.value <- prob.pres.final - prob.abs.final
  output.glm <- array(data=FALSE,dim=c(ncol(pcnm.scale)))
  selecvar.names <- rhs.vars(formula(current.glm2))
  B.est.glm <- matrix(data = 0, ncol = ncol(spe.mat), nrow = ncol(pcnm.scale) + 1)
  whichcoefs <- as.numeric(gsub("[^0-9]", "", selecvar.names))
  output.glm[whichcoefs] <- TRUE
  B.est.glm[c(1, whichcoefs + 1), ] <- coef(current.glm2)
  MoranI <- errorMoranIfromlperror(X = X, Best = B.est.glm, Z.sim = Z.sim, listw = listw)
  return(list(output.glm = output.glm, D.value = D.value, B_est_glm = B.est.glm[-1, ] , MoranI = MoranI)) #we don't need the intercepts later, hence B.est.glm[-1, ]
}


plot.simprob <- function(X,sim.spe,i){
  plot(X[,i],sim.spe[,1],ylim=c(0,1))
  for(j in 2:dim(sim.spe)[2]){
    points(X[,i],sim.spe[,j])
  }
}


getscores <- function(tru, obs){
  counta <- sum(obs == FALSE & tru == FALSE)
  countb <- sum(obs == FALSE & tru == TRUE)
  countc <- sum(obs == TRUE & tru == FALSE)
  countd <- sum(obs == TRUE & tru == TRUE)
  countt <- length(obs)
  scorecorrect <- (counta + countd)/countt
  scoretypeI <- countc/countt
  scoretypeII <- countb/countt
  return(list(scorecorrect = scorecorrect, scoretypeI = scoretypeI, scoretypeII = scoretypeII))
}


sites.xy <- read.table(file.choose()) #uploading latlong coordinates for MEMs 

algae <- read.table(file.choose()) #original species data (here we use dataset A as an example, see below)


#MEM from Minimum Spanning Tree for dataset A (BIG marine macroalgae data)
mintree_nb <- mst.nb(dist(sites.xy)) #object nb for minimum spanning tree of our sample sites
dist_tree <- nbdists(mintree_nb,as.matrix(sites.xy))

fdist <- lapply(dist_tree, function(x) 1-x/max(dist(as.matrix(sites.xy)))) 

listwmst <- nb2listw(mintree_nb,glist = fdist,style = "B")
mem.mst <- mem(listwmst)

#selecting MEM variables with significant positive autocorrelation
moran_tests <- moran.randtest(mem.mst,listwmst,999)
select <- which(moran_tests$pvalue < .05)

#in case you want to see what the selected spatial variables look like
plot(mem.mst[,select], SpORcoords = sites.xy, nb = mintree_nb)

sites.pcnm.pos.scale <- as.data.frame(scale(mem.mst[,select]))

#Using manyglm (pckg mvabund) to gather real coefficients drawn from GLMs
glm.pcnm <- manyglm(as.matrix(algae)~.,data=sites.pcnm.pos.scale,family="binomial")


X <- glm.pcnm$x #Our explanatory variables
B <- coef(glm.pcnm) #matrix B with real coefficients
B.sim.temp <- matrix(0,nrow=dim(B)[1],ncol=dim(B)[2])
B.sim.temp[1,] <- B[1,]# intercepts 

#How many non-zero coefficients 
ns <- c(0,floor(0.17*length(sites.pcnm.pos.scale)),floor(0.34*length(sites.pcnm.pos.scale)),floor(0.5*length(sites.pcnm.pos.scale)),floor(0.75*length(sites.pcnm.pos.scale)),length(sites.pcnm.pos.scale))
                    
ntotal = (3 * (length(ns) - 2) + 2) * reps
real.mod.result <- array(data=NA,dim=c(ntotal,ncol(X)-1))
rda.mod.result <- real.mod.result
rda.adjR2 <- numeric(ntotal)
glm.mod.result <- real.mod.result
glm.dvalue <- numeric(ntotal)
conditions <- data.frame(ev=integer(ntotal),scaling=integer(ntotal))
rowcount <- 1
nrb <- nrow(B.sim.temp)


B_est_rda_list <- list()
B_est_glm_list <- list()

errorglm <- list()
errorrda <- list()

MoranI.rda <- list()
MoranI.glm <- list()

for(n in ns){
  indices <- list(2:(n+1),(nrb-n+1):nrb,c(2:(n/2+1),(nrb-n/2+1):nrb)) #this sets which rows will be filled with non-zero values during simulations
  if(n == 0)
    iset=1
  else if(n == 1)
    iset=1:2
  else if (n == length(sites.pcnm.pos.scale))
    iset=1
  else
    iset=1:3
    for(i in iset){
    #print(indices[[i]])
    #print(beta)
    #print(iset[i])
      for(r in 1:reps){
      B.sim <- B.sim.temp #preparing new B matrix from template
      if( n == 0)
        B.sim <- B.sim.temp
      else if((n == 5) && (i == 3)) #when the number of non-zeros to be assigned to scaling 3 is odd, we arbitrarily chose to which position the remaining non-zero coeff would be assigned
        B.sim[c(2,3,4,16,17),] <- sample(B[2:nrow(B),],replace=TRUE,size=length(i)*ncol(algae))
      else
       B.sim[indices[[i]],] <- sample(B[2:nrow(B),],replace=TRUE,size=length(i)*ncol(algae)) #filling rows of specific indices with non-zero values sampled from original matrix B
      #print(c(r,n,i))
      #print(B.sim[,1])
      Z.sim <- X %*% B.sim #estimating probabilities of occurrences from X * B* (lines 161-164)
      lambda = exp(Z.sim)
      Y.sim <- 1 - exp(-lambda)
      sim.binom.pcnm <- rbinom(Y.sim,size=1,prob= 1 - exp(-lambda))
      mat.sim.spe.pcnm <- matrix(sim.binom.pcnm,nrow=nrow(algae),ncol=ncol(algae)) #arranging pre/abse in same size as original community
 rdaoutput <- rda.func(spe.mat=mat.sim.spe.pcnm,pcnm.scale=sites.pcnm.pos.scale, X = X, listw = listwmst, Y.sim = Y.sim)
	rda.mod.result[rowcount, ] <- rdaoutput$output.rda2
	rda.adjR2[rowcount] <- rdaoutput$adj.R2
	B_est_rda_list[[rowcount]] <- rdaoutput$B.estim.rda
	MoranI.rda[[rowcount]] <- rdaoutput$MoranI
	glmoutput <- glm.func(spe.mat=mat.sim.spe.pcnm,pcnm.scale=sites.pcnm.pos.scale, listw = listwmst, X = X, Z.sim = logit(Y.sim)) #logit(Y.sim) is on linear predictor scale
      glm.mod.result[rowcount,] <- glmoutput$output.glm
      glm.dvalue[rowcount] <- glmoutput$D.value
      B_est_glm_list[[rowcount]] <- glmoutput$B_est_glm
      MoranI.glm[[rowcount]] <- glmoutput$MoranI
      conditions[rowcount,] <- c(n,i)
      real.mod.result[rowcount,] <- abs(B.sim[-1,1])>0
      rowcount <- rowcount + 1
    }
  }
  
}

#scoring results

glm.scores <- rowSums(real.mod.result == glm.mod.result)
rda.scores <- rowSums(real.mod.result == rda.mod.result)

mean.moran.glm <- unlist(lapply(MoranI.glm, function(x){mean(abs(x), na.rm = TRUE)}))
mean.moran.rda <- unlist(lapply(MoranI.rda, function(x){mean(abs(x), na.rm = TRUE)}))

table.results <- cbind(conditions,glm.scores,glm.dvalue,mean.moran.glm, rda.scores, rda.adjR2, mean.moran.rda)
glmsummaryscores <- getscores(tru = real.mod.result, obs = glm.mod.result)
rdasummaryscores <- getscores(tru = real.mod.result, obs = rda.mod.result)

#write.table(table.results, sep="\t ",file="rda_glm_resultsalgae_simglm.txt",row.names=FALSE,col.names=TRUE)

summaryscores <- cbind(glmsummaryscores, rdasummaryscores)
#write.table(summaryscores, file = "summaryscoresalgae_simglm.txt", sep = "\t", row.names = TRUE, col.names = TRUE, quote = FALSE)
#write.table(glm.mod.result, file = "glm_mod_resultalgae_simglm.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
#write.table(rda.mod.result, file = "rda_mod_resultalgae_simglm.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)
#write.table(real.mod.result, file = "real_mod_resultalgae_simglm.txt", sep = "\t", row.names = FALSE, col.names = FALSE, quote = FALSE)