#R code for Jill's analysis of carrion beetle vertical habitat partitioning

## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Test for differences between ground and canopy traps in number of species and number of beetles
## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

# Load data ---------------------------
Jilldatwz <- read.csv(file.choose()) # use dialog box to import data file Jill2.data.withzeros.csv and assign to Jilldatwz
names(Jilldatwz) # lists all variable names

###############################################
## Number of Beetles

# Plot and transform data to normalize ---------------------------
hist(Jilldatwz$numb.beetles)
hist(log(Jilldatwz$numb.beetles+3))
Jilldatwz$log.beetles <- log(Jilldatwz$numb.beetles+3)
Jilldatwz$log.beetles.int <- as.integer(Jilldatwz$log.beetles*3)

Jilldatwz$station <- as.factor(Jilldatwz$Station)
is.factor(Jilldatwz$station)

# Rescale and center continuous variable ---------------------------
library(arm)
Jilldatwz$zjulian.date <- rescale(Jilldatwz$Julian.Date)

# Statistical tests - Generalized linear mixed-effects models ---------------------------
library(lme4)
model_a <- glmer(log.beetles.int ~ zjulian.date * vert.location + (1|station), family = poisson(link = "log"), data = Jilldatwz)

# Check model fit ---------------------------
plot(model_a)  
plot(resid(model_a) ~ Jilldatwz$zjulian.date)  
plot(resid(model_a) ~ Jilldatwz$vert.location)
bartlett.test(resid(model_a), Jilldatwz$vert.location)
hist(resid(model_a))
shapiro.test(resid(model_a))
deviance(model_a) / df.residual(model_a)
# looks okay

# Compare models with different predictors using information-theoretic approach; rank models by delta AICc
library(MuMIn)
options(na.action = "na.fail")   #  prevent fitting models to different datasets
ma <- dredge(model_a, rank = "AICc")
subset(ma,delta < 2) #subset of models where delta AICc<2
# best model is Julian Date and vertical location without interaction

bestmodel_a <- glmer(log.beetles.int ~ zjulian.date + vert.location + (1|station), family = poisson(link = "log"), data = Jilldatwz)

# Check best model fit ---------------------------
plot(bestmodel_a)  
plot(resid(bestmodel_a) ~ Jilldatwz$zjulian.date)  
plot(resid(bestmodel_a) ~ Jilldatwz$vert.location)
bartlett.test(resid(bestmodel_a), Jilldatwz$vert.location)
hist(resid(bestmodel_a))
shapiro.test(resid(bestmodel_a))
deviance(bestmodel_a) / df.residual(bestmodel_a)

# Best model results ---------------------------
summary(bestmodel_a)

# Plot results ---------------------------
library(ggplot2)
p <- ggplot(Jilldatwz, aes(x = Julian.Date, y = log.beetles, shape = vert.location, colour = vert.location)) +
	  ggtitle(" ") +
	  xlab("Julian Date") +
	  ylab("Number of beetles (log transformed)") +
    geom_point(shape = 1) +    
    scale_colour_manual(values = c("red", "#00008B")) +
    geom_smooth(method = lm) 
p + theme_bw() 

theme_bw.2 <- theme_bw() + theme(legend.direction = "horizontal", legend.position = c(0.5,1.047), legend.title = element_blank())
p + theme_bw.2

###############################################
## Number of Species

# Plot data ---------------------------
hist(Jilldatwz$numb.spp)
# use Poisson distribution

# Statistical tests - Generalized linear mixed-effects models ---------------------------
library(lme4)
model_b <- glmer(numb.spp ~ zjulian.date * vert.location + (1|station), family = poisson(link = "log"), data = Jilldatwz)

# Check model fit ---------------------------
plot(model_b)  
plot(resid(model_b) ~ Jilldatwz$zjulian.date)  
plot(resid(model_b) ~ Jilldatwz$vert.location)
bartlett.test(resid(model_b), Jilldatwz$vert.location)
hist(resid(model_b))
shapiro.test(resid(model_b))
deviance(model_b) / df.residual(model_b)
# problems with residual distribution

library(nlme)
Jilldatwz$sqrt.numb.spp <- sqrt(Jilldatwz$numb.spp + 2)

model_b_2 <- lme(sqrt.numb.spp ~ zjulian.date * vert.location, random =~  1|station, data = Jilldatwz)

# Check model fit ---------------------------
plot(model_b_2)  
plot(resid(model_b_2) ~ Jilldatwz$zjulian.date)  
plot(resid(model_b_2) ~ Jilldatwz$vert.location)
bartlett.test(resid(model_b_2), Jilldatwz$vert.location)
hist(resid(model_b_2))
shapiro.test(resid(model_b_2))
# better

# Compare models with different predictors using information-theoretic approach; rank models by delta AICc
library(MuMIn)
options(na.action = "na.fail")   #  prevent fitting models to different datasets
model_b_2_ML <- lme(sqrt.numb.spp ~ zjulian.date * vert.location, random =~  1|station, method = "ML", data = Jilldatwz)
mb<-dredge(model_b_2_ML, rank="AICc")
subset(mb, delta<2) #subset of models where delta AICc<2
# best model has Julian date and vertical location, but no interaction

bestmodel_b <- lme(sqrt.numb.spp ~ zjulian.date + vert.location, random =~  1|station, data = Jilldatwz)
plot(bestmodel_b)  
plot(resid(bestmodel_b) ~ Jilldatwz$zjulian.date)  
plot(resid(bestmodel_b) ~ Jilldatwz$vert.location)
bartlett.test(resid(bestmodel_b), Jilldatwz$vert.location)
hist(resid(bestmodel_b))
shapiro.test(resid(bestmodel_b))

# Best model results ---------------------------
summary(bestmodel_b)

# Plot results ---------------------------
library(ggplot2)
r <- ggplot(Jilldatwz, aes(x = Julian.Date, y = numb.spp, shape = vert.location, colour = vert.location)) +
	  ggtitle(" ") +
	  xlab("Julian Date") +
	  ylab("Number of species") +
    geom_point(shape = 1) +    
    scale_colour_manual(values = c("red", "#00008B")) +
    geom_smooth(method = lm) 
r + theme_bw() 

theme_bw.2<-theme_bw() + theme(legend.direction="horizontal", legend.position=c(0.5,1.047), legend.title=element_blank())
r + theme_bw.2

####################################################

## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Test for differences between ground and canopy traps using Random Forest models
## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

library(randomForest) # load Random Forest package for analyses

# Load data ---------------------------
Jilldat <- read.csv(file.choose()) # use dialog box to import data file Jill2.data.averaged.csv and assign to Jilldat
names(Jilldat) # lists all variable names

Jilldat.rf <- Jilldat[c(1:11)]

names(Jilldat.rf) # lists all variable names
cor(Jilldat.rf[,unlist(lapply(Jilldat.rf, is.numeric))]) # correlates all non-categorical variables

set.seed(111) # this sets a random number (seed) for a starting point; by specifying this number, we obtain the same results each time [otherwise, a different random number is selected for starting for each run]
grdcanopy.rf <- randomForest(vert.location ~., data = Jilldat.rf, importance = TRUE, proximity = TRUE)
print(grdcanopy.rf)

# try different values of mtry (number of variables used for each classification)
set.seed(111) # this sets a random number (seed) for a starting point; by specifying this number, we obtain the same results each time [otherwise, a different random number is selected for starting for each run]
grdcanopy.rf.2 <- randomForest(vert.location ~., data = Jilldat.rf, importance = TRUE, proximity = TRUE, mtry = 2)
grdcanopy.rf.4 <- randomForest(vert.location ~., data = Jilldat.rf, importance = TRUE, proximity = TRUE, mtry = 4)
grdcanopy.rf.5 <- randomForest(vert.location ~., data = Jilldat.rf, importance = TRUE, proximity = TRUE, mtry = 5)
grdcanopy.rf.6 <- randomForest(vert.location ~., data = Jilldat.rf, importance = TRUE, proximity = TRUE, mtry = 6)
grdcanopy.rf.7 <- randomForest(vert.location ~., data = Jilldat.rf, importance = TRUE, proximity = TRUE, mtry = 7)
print(grdcanopy.rf) #first run had an mtry=3
print(grdcanopy.rf.2)
print(grdcanopy.rf.4)
print(grdcanopy.rf.5)
print(grdcanopy.rf.6)
print(grdcanopy.rf.7)
# mtry = 2 has lowest error rate

# run model again for final model with higher value for ntree
set.seed(111)
grdcanopy.rf<-randomForest(vert.location ~., data = Jilldat.rf,ntree = 10000, importance = TRUE, proximity = TRUE,mtry=2)
print(grdcanopy.rf)
varImpPlot(grdcanopy.rf, sort = TRUE, n.var = min(33,nrow(grdcanopy.rf$importance)), type = NULL, class = NULL, scale = TRUE)

#"Partial dependence is the dependence of the probability of presence on one predictor variable after averaging out the effects of the other predictor variables in the model." (Cutler et al. 2007, Ecology) 
partialPlot(grdcanopy.rf, Jilldat.rf, Nphla.americana, plot = TRUE, add = FALSE, rug = FALSE, xlab = deparse(substitute("Number of Necrophila americana")), ylab = " ", main = paste("Partial Dependence on Necrophila americana"), ylim = c(-1,1))
partialPlot(grdcanopy.rf, Jilldat.rf, N.pustulatus, plot = TRUE, add = FALSE, rug = FALSE, xlab = deparse(substitute("Number of Nicrophorus pustulatus")), ylab = " ", main = paste("Partial Dependence on Nicrophorus pustulatus"), ylim = c(-1,1))
partialPlot(grdcanopy.rf, Jilldat.rf, N.orbicollis, plot = TRUE, add = FALSE, rug = FALSE, xlab = deparse(substitute("Number of Nicrophorus orbicollis")), ylab = " ", main = paste("Partial Dependence on Nicrophorus orbicollis"), ylim = c(-1,1))

# rerun Random Forest with only N. pustulatus, N. orbicollis, and Necrophila americana
names(Jilldat)
Jilldat2 <- Jilldat[c(1,3,7,9)] 
names(Jilldat2)
set.seed(111)
grdcanopy.rf2 <- randomForest(vert.location ~., data = Jilldat2, ntree = 10000, importance = TRUE, proximity = TRUE, mtry = 3)
print(grdcanopy.rf2)

# rerun Random Forest with only N. pustulatus and Necrophila americana
names(Jilldat)
Jilldat3 <- Jilldat[c(1,7,9)] 
names(Jilldat3)
set.seed(111)
grdcanopy.rf3 <- randomForest(vert.location ~., data = Jilldat3, ntree = 10000, importance = TRUE, proximity = TRUE, mtry = 2)
print(grdcanopy.rf3)

## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Test for differences between ground and canopy traps using binomial glm models
## xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx

#convert vertical location into numbers 0 and 1 (necessary for later models)
Jilldat$vert.loc.bv[Jilldat$vert.location == "ground"] <- 0
Jilldat$vert.loc.bv[Jilldat$vert.location == "canopy"] <- 1
Jilldat$vert.loc.bv

model.grdcan <- glm(vert.loc.bv ~ Nphla.americana + N.pustulatus + N.orbicollis + N.sayi + N.defodiens + N.tomentosus + N.hebes + Ncrdes.surinamensus + O.inequale + O.noveboracense, data = Jilldat, family = binomial)
# warning message suggests perfect separation - will lead to inflated coefficient estimates - we will return to this

#check model fit
#don't need to worry about over- or under-dispersion (http://stats.stackexchange.com/questions/92156/how-to-handle-underdispersion-in-glmm-binomial-outcome-variable)
library(pscl)
library(heatmapFit)
pred<-predict(model.grdcan, type = "response")
heatmap.fit(model.grdcan$y, pred, reps = 1000)
# looks good

# compare models with different predictors using information-theoretic approach with MuMIn package; rank models by delta AICc
library(MuMIn)
options(na.action = "na.fail")   #  prevent fitting models to different datasets
model.grdcan <- glm(vert.loc.bv ~ Nphla.americana + N.pustulatus + N.orbicollis + N.sayi + N.defodiens + N.tomentosus + N.hebes + Ncrdes.surinamensus + O.inequale + O.noveboracense, data = Jilldat, family = binomial)
m1<-dredge(model.grdcan, rank="AICc")
subset(m1, delta<2) #subset of models where delta AICc<2

best.model2 <- glm(vert.loc.bv ~ Nphla.americana + N.pustulatus + N.orbicollis, data = Jilldat, family = binomial)
pred<-predict(best.model2, type = "response")
heatmap.fit(best.model2$y, pred, reps = 1000)
summary(best.model2)
# perfect separation can lead to uninformative (inflated) statistical results

# use Firth's penalized-likelihood logistic regression to estimate coefficients instead
library(logistf)
best.model2.Frth <- logistf(vert.loc.bv ~ Nphla.americana + N.pustulatus + N.orbicollis, data = Jilldat)
summary(best.model2.Frth)
#results support previous analysis - Nicrophorus pustlatus (canopy), Nicrophorus orbicollis (ground) and Necrophila americana (ground) best differentiate ground vs canopy traps
#other species also help to differentiate ground and canopy traps to a lesser extent
#Nicrophorus pustlatus coefficient estimates stand out as extremely different from the rest