# Script Mueller et al., 2023, PeerJ

# questions may be addressed to Agathe Toumoulin: agathe.toumoulin@gmail.com
# data used are available as supporting material to the paper. 

setwd("~/Desktop/Collaborations/Christian/Paper_11_08_21/")

# The different datasets mentioned in the paper are named as follow:
# dataset 3 = "Leaf"
# dataset 4 = "noNA_LMAobs"
# dataset 5 = "noNA_LMAall"

#### Packages loading and dataset importation ####
library(readxl) # to import Excel files
library(plyr) # to calculate cumulative mean
library(dplyr) # to calculate cumulative mean
library(ggplot2) # to illustrate results
library(FactoMineR) # this package run the Principal Component Analysis (PCA)
library(factoextra) # this package is to illustrate results from the PCA
library(MASS) # to run PCA with negative binomial distribution data
library(car) # to run type II anova
library(lmtest) # to run the Breusch-Pagan test (test homoscedasticity)
library(reshape) # to reshape data set before plotting multiple lines with ggplot
library(agricolae) # for kruskal tests 
library(goft) # for testing data fitting to gamma law (to choose a family when GLMing)
library(gridExtra) # Functions for "Grid" Graphics
library(forcats) # extra options for ggplot
library(nlme) # Nonlinear Mixed-Effects Models
library(ggpubr) # combine Ggplots together
library(performance) # check model assumptions
library(mice) # data imputation 
library(lm.beta) # standardized coefficients
library(ggsci) # for palette npg
library(ggExtra)
library(tidyr)


# [leaf] - Importation of the complete dataset 

leaf <- read_excel("Mueller_et_al_all-digitized-leaf-data_12-03-22.xlsx", 
                   sheet = "data", na = "na")

str(leaf)

#### I. Basic leaf area / LMA values and differences between localities ####

# Leaf area
summary(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) 
sd(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) # mean = 1262.40; sd = 963.0993 <<<<
summary(leaf$MAX_area[leaf$Locality=="Suletice-Berand"])
sd(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # mean = 766.73; sd = 786.242 <<<<<
shapiro.test(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) # W = 0.8576, p-value < 2.2e-16, data are not normal
shapiro.test(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # W = 0.75549, p-value < 2.2e-16, data are not normal
wilcox.test(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"],leaf$MAX_area[leaf$Locality=="Suletice-Berand"])
# W = 111396, p-value < 2.2e-16 >> leaf surface is significantly higher in Seifhennersdorf <<<<<

# LMA
summary(leaf$LMA[leaf$Locality=="Seifhennersdorf"]) 
sd(leaf$LMA[leaf$Locality=="Seifhennersdorf"],na.rm=T) # mean = 84.92; NAs = 225 // # sd = 23.34044 <<<<
summary(leaf$LMA[leaf$Locality=="Suletice-Berand"])
sd(leaf$LMA[leaf$Locality=="Suletice-Berand"],na.rm=T) # mean = 104.84; NAs = 257 // Sd =29.96921 <<<<<
shapiro.test(leaf$LMA[leaf$Locality=="Seifhennersdorf"]) # W = 0.95399, p-value = 1.756e-05, data are not normal
shapiro.test(leaf$LMA[leaf$Locality=="Suletice-Berand"]) # W = 0.96846, p-value = 0.002201, data are not normal
wilcox.test(leaf$LMA[leaf$Locality=="Seifhennersdorf"],leaf$LMA[leaf$Locality=="Suletice-Berand"])
# W = 7397, p-value = 3.582e-10 >> leaf LMA is significantly higher in Suletice-Berand <<<<

# barplot (number of leaf per family per locality)
figure1 <- ggplot(data=leaf, aes(x=Family, fill=Locality))+
  geom_bar(position="dodge")+
  theme_classic()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
  geom_text(stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+
  ggtitle("[all digitized leaves, n=800, leaf]")+
  theme(legend.position="none")
figure1
# since the number of leaf/family varies between Sf and SuBe, we may expect family to explain part of 
# leaf trait variation between the two localities. 

#### II. Leaf size and LMA outliers removal ####
leaf.Sf <- subset(leaf, leaf$Locality=="Seifhennersdorf") # n = 400
leaf.SuBe <- subset(leaf, leaf$Locality=="Suletice-Berand") # n = 400

##### ------ II.1. // Leaf size outliers ####

meansf <- mean(leaf.Sf$MAX_area) # average size in Sf = 1262.40
Fig2A <- ggplot(leaf.Sf, aes(x=Family,y=MAX_area)) +
  geom_hline(yintercept=meansf, colour="grey",linetype = "dashed")+
  geom_jitter(aes(x=Family,y=MAX_area,colour=factor(Locality)), width=0.25) +
  geom_boxplot(alpha=0.5, outlier.colour="black",outlier.shape=8)+ 
  xlab(label = "Family") + 
  ylab(label = "Max. leaf area (mm2)") + 
  theme_classic()+ 
  scale_y_continuous(limits=c(0,6000))+
  theme(legend.position="none")+
  ggtitle("Sf [n = 400; leaf]")+ 
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))

meanSuBe <- mean(leaf.SuBe$MAX_area) # average size in SuBe = 766.73
Fig2B <- ggplot(leaf.SuBe, aes(x=Family,y=MAX_area)) +
  geom_hline(yintercept=meanSuBe, colour="grey",linetype = "dashed")+
  geom_jitter(aes(x=Family,y=MAX_area),color="#00AFBB",width=0.25) +
  geom_boxplot(alpha=0.5, outlier.colour="black",outlier.shape=8)+ 
  xlab(label = "Family") + 
  ylab(label = "Max. leaf area (mm2)") + 
  theme_classic()+ 
  scale_y_continuous(limits=c(0,6000))+
  theme(legend.position="none")+
  ggtitle("SuBe [n = 400; leaf]")+ 
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))

figure2 <- ggarrange(Fig2A,Fig2B,
                     labels = c("A", "B"),
                     ncol = 2, nrow = 1)
figure2

# remove lines with leaf area > 1.5 interquartile distance for Seifhennersdorf and then Suletice-Berand
outlier_val_Sf <- boxplot.stats(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"])$out 
outlier_idx_Sf <-which(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"] %in% c(outlier_val_Sf))
outlier_val_SuBe <- boxplot.stats(leaf$MAX_area[leaf$Locality=="Suletice-Berand"])$out
outlier_idx_SuBe <-which(leaf$MAX_area[leaf$Locality=="Suletice-Berand"] %in% c(outlier_val_SuBe))

# temporary datasat without leaf size outliers  # minus size outliers -> n = 759
leaf_clean_a <- subset(leaf[-outlier_idx_Sf,])
leaf_clean_a <- subset(leaf_clean_a[-outlier_idx_SuBe,])

leaf_clean_a_Sf <- subset(leaf.Sf[-outlier_idx_Sf,]) # just for visualization purpose
leaf_clean_a_SuBe <- subset(leaf.SuBe[-outlier_idx_SuBe,]) # just for visualization purpose

meansfclean <- mean(leaf_clean_a_Sf$MAX_area) # 1144.33
Fig2C <- ggplot(leaf_clean_a_Sf, aes(x=Family,y=MAX_area)) +   # outliers removed
  geom_hline(yintercept=meansfclean, colour="grey",linetype = "dashed")+
  geom_jitter(aes(x=Family,y=MAX_area,colour=factor(Locality)), width=0.25) +
  geom_boxplot(alpha=0.5, outlier.shape=NA)+ 
  xlab(label = "Family") + 
  ylab(label = "Max. leaf area (mm2)") + 
  theme_classic()+ 
  scale_y_continuous(limits=c(0,6000))+
  annotate("text", x = 12, y = 5000, col="grey",label = "OUTLIERS REMOVED")+ 
  theme(legend.position="none")+
  ggtitle("Sf [n = 385; leaf_clean_a_Sf]")+ 
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))
  
meansubeclean <- mean(leaf_clean_a_SuBe$MAX_area) # 608.18
Fig2D <- ggplot(leaf_clean_a_SuBe, aes(x=Family,y=MAX_area)) +   # outliers removed
  geom_hline(yintercept=meansubeclean, colour="grey",linetype = "dashed")+
  geom_jitter(aes(x=Family,y=MAX_area),color="#00AFBB",width=0.25) +
  geom_boxplot(alpha=0.5, outlier.shape=NA)+ 
  xlab(label = "Family") + 
  ylab(label = "Max. leaf area (mm2)") + 
  theme_classic()+ 
  scale_y_continuous(limits=c(0,6000))+
  annotate("text", x = 12, y = 5000, col="grey",label = "OUTLIERS REMOVED")+ 
  theme(legend.position="none")+
  ggtitle("SuBe [n = 374; leaf_clean_a_SuBe]")+
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))

figure3 <- ggarrange(Fig2A,Fig2B,Fig2C,Fig2D,
                    labels = c("A", "B", "C","D"),
                    ncol = 2, nrow = 2)
figure3

##### ------ II.2. // Removing LMA outliers ####
outlier_val_SfLMA <- boxplot.stats(leaf_clean_a$LMA[leaf_clean_a$Locality=="Seifhennersdorf"])$out
outlier_idx_SfLMA <-which(leaf_clean_a$LMA[leaf_clean_a$Locality=="Seifhennersdorf"] %in% c(outlier_val_SfLMA))
outlier_val_SuBeLMA <- boxplot.stats(leaf_clean_a$LMA[leaf_clean_a$Locality=="Suletice-Berand"])$out
outlier_idx_SuBeLMA <-which(leaf_clean_a$LMA[leaf_clean_a$Locality=="Suletice-Berand"] %in% c(outlier_val_SuBeLMA))

leaf_clean_b <- subset(leaf_clean_a[-outlier_idx_SfLMA,]) 
leaf_clean_b <- subset(leaf_clean_b[-outlier_idx_SuBeLMA,]) # minus LMA outliers -> n = 753 

Fig3B <- ggplot(data=leaf_clean_b, aes(x=Family, fill=Locality))+
  geom_bar(position="dodge")+
  theme_classic()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
  geom_text(stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+
  ggtitle("[no size & LMA outliers, n=752, dataset=leaf_clean_b]")+
  theme(legend.position="none")

figure4 <- ggarrange(figure1, Fig3B,
                    labels = c("A", "B"),
                    ncol = 2, nrow = 1)
figure4

#### III. LMA imputation #####
##### ------ III.1. // Preparing the data ####

# We will simulate LMA for specimen belonging to species having >= 5 LMA observation per locality
# what are these species? 

ggplot(data=leaf_clean_b, aes(x=fct_infreq(Species)))+
  geom_bar(data=leaf_clean_b[which(leaf_clean_b$petiole_w>0),], aes(x=fct_infreq(Species)),fill="grey",stat = 'count') +
  theme_classic()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
  geom_text(data=leaf_clean_b[which(leaf_clean_b$petiole_w>0),], stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+
  ggtitle("Number of specimen / family with LMA")+
  facet_wrap(~ Locality, strip.position = "bottom") +
  theme(legend.position="none")

leaf_clean_c <- subset(leaf_clean_b, leaf_clean_b$LMA!="NA") # there are 298 observed LMA
leaf_clean_c_Sf <- subset(leaf_clean_c, leaf_clean_c$Locality=="Seifhennersdorf") # n = 160
leaf_clean_c_SuBe <- subset(leaf_clean_c, leaf_clean_c$Locality=="Suletice-Berand") # n = 138

by(leaf_clean_c_Sf, leaf_clean_c_Sf$Species,nrow)
# Families with with LMA observation > 5, for which we may simulate LMA in Sf
a <- c("Acer_angustilobum","Betula_alboides","Carpinus_grandis","Carpinus_roscheri",
       "Carya_fragiliformis","Laurophyllum_sp","Leguminosites_sp","Platanus_neptuni","Rosa_lignitum","Ulmus_fischeri")

by(leaf_clean_c_SuBe, leaf_clean_c_SuBe$Species,nrow)
# Families with with LMA observation > 5, for which we may simulate LMA in SuBe
b <- c("Daphnogene_cinnamomifolia","Engelhardia_orsbergensis","Laurophyllum_cfacutimontanum",
       "Leguminosae_indet","Platanus_neptuni","Symblocos_deichmuelleri","Zelkova_zelkovifolia")

leaf_clean_b_Sf <- subset(leaf_clean_b, leaf_clean_b$Locality=="Seifhennersdorf") 
leaf_clean_b_Sf_light <- subset(leaf_clean_b_Sf, Species %in% a)

leaf_clean_b_SuBe <- subset(leaf_clean_b, leaf_clean_b$Locality=="Suletice-Berand") 
leaf_clean_b_SuBe_light <- subset(leaf_clean_b_SuBe, Species %in% b) 

leaf_clean <- rbind(leaf_clean_b_Sf_light,leaf_clean_b_SuBe_light) # n = 530 
# We have created a reduced dataset of leaf_clean_b. Data contains "na" from these species for which we believe 
# it's possible to simulate LMA quite reliably this dataset will contain LMA obs and specimen for which LMA may be predicted

# cleaning workspace from temporary used datasets and values
rm(leaf_clean_a, leaf_clean_a_Sf, leaf_clean_a_SuBe, leaf_clean_b, leaf_clean_b_Sf, leaf_clean_b_Sf_light, 
   leaf_clean_b_SuBe, leaf_clean_b_SuBe_light, leaf_clean_c, leaf_clean_c_Sf, leaf_clean_c_SuBe)

##### ------ III.2. // Simulating missing LMA values ####
leaf_clean$Locality=as.factor(leaf_clean$Locality)
leaf_clean$Family=as.factor(leaf_clean$Family)
leaf_clean$Species=as.factor(leaf_clean$Species)
short <-c("ID","Locality","Species","MAX_area","MAX_w","LMA")
datamice <- subset(leaf_clean[,short])

# method "norm" (bayesian linear regression) should be used for Normal data. 
mod1 <- lm(LMA~-1+Locality+Species+MAX_area+MAX_w, data=leaf_clean, na.action=na.omit)
par(mfrow = c(2,2))
plot(mod1) # linearity, homoscedasticity and normality of the model residuals look ok
shapiro.test(residuals(mod1)) # p-value = 0.07256... it's not perfectly normal but acceptable (as suggested by the check_normality function below)
check_normality(mod1) # "OK: residuals appear as normally distributed (p = 0.087)" 
# we may use the method "norm"
# the data we will simulate may tend to be closer to a Gaussian distribution than they are in our dataset.

# setting up MICE
init = mice(datamice, maxit=0) 
meth = init$method
predM = init$predictorMatrix

predM[, c("ID")]=0 # here are listed the variables in datamice that won't be used to predict LMA
meth[c("LMA")]="norm"  # method used 
set.seed(103)
IMP <- mice(datamice, method=meth, predictorMatrix=predM, m=10)

# we test the reliability of the imputation by plotting LMA vs leaf size (same as Fig. 4D)
# to see whether the relation between both parameters with/without imputation values remains the same 
figure5 <- xyplot(IMP,LMA~MAX_area | as.factor(.imp), pch=18,cex=0.5) # ok
figure5

fit<-with(IMP,lm(LMA~-1+Locality+Species+MAX_area+MAX_w))
summary(pool(fit))
# the variables contributing significantly to the imputation are Locality and MAX_area

imputed_data <- complete(IMP) # we have a dataset with observation + imputed data (n= 530, 294 modelled LMA)

colimput <- imputed_data[,c("MAX_area","LMA")]
data <- left_join(leaf_clean, colimput, by = c("MAX_area" = "MAX_area")) # There are doublons in IDs...

names(data)[names(data) == "LMA.x"] <- "LMA_obs"
names(data)[names(data) == "LMA.y"] <- "LMA_all"

data$`LMA value`=as.character(data$`LMA value`) 
data$`LMA value` <- replace_na(data$`LMA value`,"pred")

write.csv2(data,file="data_LMA_imputed.csv") # to download the data. 

##### ------ III.3. // Visualizing imputed vs calculated values ####
# fig SI_1_A
p<- ggplot(data, aes(x=LMA_all, y=MAX_area, color=`LMA value`)) +
  geom_point(alpha = 0.4, size=1.5) +
  scale_color_npg()+
  ylab(label = "Leaf area") + 
  xlab(label = "LMA") + 
  theme_classic()+
  theme(legend.position="none")
fig_SI1_A <- ggMarginal(p, margins = "x", type="density", groupColour = T, groupFill = T, alpha=0.2)

# fig SI_1_B
data_sf <- subset(data, data$Locality=="Seifhennersdorf") 
data_sube <- subset(data, data$Locality=="Suletice-Berand") 

minobs <- min(data$LMA_all[data$`LMA value`=="obs"],na.rm="T")
meanobs <- mean(data$LMA_all[data$`LMA value`=="obs"],na.rm="T")
meanpred <- mean(data$LMA_all[data$`LMA value`=="pred"],na.rm="T")

fig_SI1_B <- ggplot(data, aes(x=Locality,y=LMA_all,color=`LMA value`)) +   
  geom_boxplot(aes(y=LMA_all,fct_reorder(Locality,LMA_all,.desc=T,.fun="median",na.rm=T),
                   colour=factor(`LMA value`)), alpha=1, outlier.shape=NA)+ 
  geom_point(position=position_jitterdodge(),alpha=0.2)+
  scale_color_npg()+
  annotate("text", x = 2.5, y = 200, hjust = 1, col="#F8766D",label = "calculated")+ 
  annotate("text", x = 2.5, y = 190, hjust = 1, col="#00BFC4",label = "predicted")+ 
  annotate("text", x = 0.5, y = 36, hjust = 0, col="#F8766D",label = "minimum calculated LMA")+ 
  geom_hline(yintercept=minobs, colour="#F8766D",linetype = "solid", alpha=1)+
  xlab(label = "Locality") + 
  ylab(label = "LMA") + 
  theme_classic()+ 
  scale_y_continuous(limits=c(25,200))+
  scale_x_discrete(labels=c("SuBe","Sf"))+
  theme(legend.position="none")+
  theme(axis.text.x = element_text(angle = 0))
fig_SI1_B

# fig SI_1_C - boxplots of LMA, calculated vs predicted values per species
sp <- c("Symb. dei", "Daph. cin", "Legu. ind", "Laur. sp", "Plat. nep", "Enge. ors", "Rosa lig.", "Legu. sp", "Laur. cfa", "Zelk. zel", "Ulmu. fis", "Carp. gra", "Acer ang", "Carp. ros", "Cary. fra", "Betu. alb")
fig_SI1_C <- ggplot(data, aes(x=Species,y=LMA_all,color=`LMA value`)) +   
  geom_boxplot(aes(y=LMA_all,fct_reorder(Species,LMA_all,.desc=T,.fun="median",na.rm=T),colour=factor(`LMA value`)), alpha=1, outlier.shape=NA)+ 
  geom_point(position=position_jitterdodge(),alpha=0.5,size=0.7)+
  scale_color_npg()+
  geom_hline(yintercept=minobs, colour="#F8766D",linetype = "solid",alpha=1)+
  xlab(label = "Species") + 
  ylab(label = "LMA") + 
  scale_x_discrete(labels=c(sp))+
  theme_classic()+ 
  scale_y_continuous(limits=c(25,200))+
  theme(legend.position="none")+
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))

fig_SI_1 <- ggdraw() +
  draw_plot(fig_SI1_A, x = 0, y = .6, width = .6, height = .4) +
  draw_plot(fig_SI1_B, x = .6, y = .6, width = .4, height = .4) +
  draw_plot(fig_SI1_C, x = 0.05, y = 0, width = 0.9, height = 0.6) +
  draw_plot_label(label = c("A", "B", "C"), size = 15,
                  x = c(0, 0.6, 0), y = c(1, 1, 0.6))
fig_SI_1

pdf(file="fig_SI_1.pdf",width=7.1, height=5)
fig_SI_1      
dev.off()

# removing the seven predictions of LMA < any observed LMA
data <- subset(data,LMA_all >= minobs)

#### IV. Principal component Analyses ####
##### ------ IV.1. // Preparing the data + checking size and LMA values ####
# Removing lines with NAs in one of the variable we will use in our multivariate analysis. 
x <- c("MAX_area","MAX_lw","MAX_l","MAX_w","LMA_obs","Locality","Family","Genus","specimen_TCT","taxon_TCT","combined_TCT","cat_TCT","PI","Pheno","Growth_form")
noNA_LMAobs <- subset(data, complete.cases(data[,x])) # n = 236
xx <- c("MAX_area","MAX_lw","MAX_l","MAX_w","LMA_all","Locality","Family","Genus","specimen_TCT","taxon_TCT","combined_TCT","cat_TCT","PI","Pheno","Growth_form")
noNA_LMAall <- subset(data, complete.cases(data[,xx])) # n = 506

# Suppressing / grouping rare TCTs
by(noNA_LMAobs,noNA_LMAobs$cat_TCT,nrow) 
noNA_LMAobs <- subset(noNA_LMAobs[!noNA_LMAobs$cat_TCT %in% c("CD"),]) # n=232
by(noNA_LMAobs,noNA_LMAobs$combined_TCT,nrow)
noNA_LMAobs <- subset(noNA_LMAobs[!noNA_LMAobs$combined_TCT %in% c("E,F"),]) # n=228
by(noNA_LMAobs,noNA_LMAobs$specimen_TCT,nrow) # no rare TCT with n<5

by(noNA_LMAall,noNA_LMAall$cat_TCT,nrow) # no cat_TCT with n<5
by(noNA_LMAall,noNA_LMAall$combined_TCT,nrow) # "C" and "D" are rare so we move them to "C or D" class
noNA_LMAall$combined_TCT <- revalue(noNA_LMAall$combined_TCT, c("C"="C,D"))
noNA_LMAall$combined_TCT <- revalue(noNA_LMAall$combined_TCT, c("D"="C,D"))
by(noNA_LMAall,noNA_LMAall$specimen_TCT,nrow) # same for specimen TCT
noNA_LMAall$specimen_TCT <- revalue(noNA_LMAall$specimen_TCT, c("C"="C or D"))
noNA_LMAall$specimen_TCT <- revalue(noNA_LMAall$specimen_TCT, c("D"="C or D"))

# check again that qualitative variables are factors
noNA_LMAobs$Locality=as.factor(noNA_LMAobs$Locality) 
noNA_LMAobs$Family=as.factor(noNA_LMAobs$Family)
noNA_LMAobs$Pheno=as.factor(noNA_LMAobs$Pheno) 
noNA_LMAobs$Interaction=as.factor(noNA_LMAobs$Interaction) 

noNA_LMAall$Locality=as.factor(noNA_LMAall$Locality) 
noNA_LMAall$Family=as.factor(noNA_LMAall$Family)
noNA_LMAall$Pheno=as.factor(noNA_LMAall$Pheno) 
noNA_LMAall$Interaction=as.factor(noNA_LMAall$Interaction) 

##### ------ IV.2. // Checking leaf size among datasets #####
summary(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) #mean = 1262.40; sd=963.0993
sd(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"])
summary(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Seifhennersdorf"]) # mean = 1104.4; sd=730.0568
sd(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Seifhennersdorf"]) 
summary(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Seifhennersdorf"]) # mean = 1064.28; sd = 704.9531
sd(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Seifhennersdorf"]) 

summary(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # mean = 766.73; sd = 786.242
sd(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) 
summary(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Suletice-Berand"]) # mean =452.90; sd = 476.2621
sd(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Suletice-Berand"]) 
summary(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Suletice-Berand"]) # mean = 526.27; sd = 437.1632
sd(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Suletice-Berand"]) 

shapiro.test(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) # W = 0.8576, p-value < 2.2e-16, data are not N
shapiro.test(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Seifhennersdorf"]) # W = 0.90357, p-value = 2.072e-07, data are not N
shapiro.test(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Seifhennersdorf"]) # W = 0.90875, p-value = 3.305e-11, data are not N
shapiro.test(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # W = 0.75549, p-value < 2.2e-16, data are not N
shapiro.test(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Suletice-Berand"]) # W = 0.73286, p-value = 1.822e-12, data are not N
shapiro.test(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Suletice-Berand"]) # W = 0.81888, p-value < 2.2e-16, data are not N

wilcox.test(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"],leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # W = 111396, p-value < 2.2e-16, significant differences 
wilcox.test(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Seifhennersdorf"],noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Suletice-Berand"]) # W = 10626, p-value < 2.2e-16, idem
wilcox.test(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Seifhennersdorf"],noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Suletice-Berand"]) # W = 50377, p-value < 2.2e-16, idem

##### ------ IV.3. // Checking LMA among datasets #####
summary(leaf$LMA[leaf$Locality=="Seifhennersdorf"]) # mean 84.92; sd = 23.34044
sd(leaf$LMA[leaf$Locality=="Seifhennersdorf"],na.rm=T) 
summary(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Seifhennersdorf"]) # mean = 82.92; sd =  19.63653
sd(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Seifhennersdorf"])  
summary(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Seifhennersdorf"]) # mean = 81.62; sd = 20.91337
sd(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Seifhennersdorf"]) 

summary(leaf$LMA[leaf$Locality=="Suletice-Berand"])# mean = 104.84; sd = 29.96921
sd(leaf$LMA[leaf$Locality=="Suletice-Berand"],na.rm=T)
summary(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Suletice-Berand"]) # mean = 102.70; sd = 24.93115 
sd(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Suletice-Berand"]) 
summary(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Suletice-Berand"]) # mean = 103.78; sd = 24.01588
sd(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Suletice-Berand"])

shapiro.test(leaf$LMA[leaf$Locality=="Seifhennersdorf"]) # W = 0.95399, p-value = 1.756e-05, data are not N
shapiro.test(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Seifhennersdorf"]) # W = 0.97124, p-value = 0.009463, data are not N
shapiro.test(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Seifhennersdorf"]) # W = 0.99002, p-value = 0.08377, data are N

shapiro.test(leaf$LMA[leaf$Locality=="Suletice-Berand"]) # W = 0.96846, p-value = 0.002201, data are not N
shapiro.test(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Suletice-Berand"]) # W = 0.97212, p-value = 0.02701, data are not N
shapiro.test(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Suletice-Berand"]) # W = 0.9908, p-value = 0.09721, data are N 

wilcox.test(leaf$LMA[leaf$Locality=="Seifhennersdorf"],leaf$LMA[leaf$Locality=="Suletice-Berand"]) # W = 7397, p-value = 3.582e-10, significant differences 
wilcox.test(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Seifhennersdorf"],noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Suletice-Berand"]) # W = 3425, p-value = 1.11e-09, idem
wilcox.test(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Seifhennersdorf"],noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Suletice-Berand"]) # W = 15979, p-value < 2.2e-16, idem
# all datasets reflect significant LMA differences between localities


##### ------ IV.4. // Running PCA for data with LMA observations only #####
quantivar <- c("MAX_area","MAX_lw","MAX_l","MAX_w","LMA_obs") # listing quantitative variables to be used
leaf.pca1 <- PCA(noNA_LMAobs[,quantivar],graph=F)
eig.val1 <- get_eigenvalue(leaf.pca1) 
head(eig.val1) # Dimensions 1 and 2 explain 85,03% of data ordination. 
fviz_screeplot(leaf.pca1,addlabels = TRUE) # same information with an histogram
var1 <- get_pca_var(leaf.pca1) # extract information on the variables (see bellow)
var1
fviz_pca_var (leaf.pca1, repel = TRUE) # show how variables affect data point location in space
head(var1$contrib) # give the contribution of the different variables to the dimensions 
fviz_contrib_a <- fviz_contrib (leaf.pca1, "var", axes = 1,title="PC1, without imputation") # Axes 1 --> MAX_area and MAX_w
fviz_contrib_b <- fviz_contrib (leaf.pca1, "var", axes = 2,title="PC2, without imputation") # Axes 2 --> MAX_lw
fviz_contrib_a
fviz_contrib_b
# Rq: The red dashed line indicates the expected mean value if the contributions were uniform. 

PCA1 <- fviz_pca_biplot(leaf.pca1, geom = c("point", "text"),
             habillage = as.factor(noNA_LMAobs$Locality), # color by Locality 
             label=c("var"),
             col.var = "black",
             pointsize = 0.5,
             addEllipses = TRUE, ellipse.level=0.95, ellipse.type="convex",
             repel = TRUE, 
             palette = "npg",
             title = "Specimens") 
PCA1 <- PCA1 + theme(legend.position="none")
PCA1 <- PCA1 + xlim(-5, 7.5) + ylim (-3.5, 5)
PCA1       

PCA2 <- fviz_pca_ind(leaf.pca1, geom = c("point"),
                addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                pointshape = 19,
                pointsize = 0.5,
                col.ind = noNA_LMAobs$combined_TCT,
                palette = "npg",
                title="Combined TCT",
                repel = TRUE)
PCA2 <- PCA2 +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none")
PCA2

PCA3 <- fviz_pca_ind(leaf.pca1, geom = c("point"),
                         addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                         pointshape = 19,
                     pointsize = 0.5,
                     palette = "npg",
                         col.ind = noNA_LMAobs$cat_TCT,
                         title="Categorial TCT",
                         repel = TRUE)
PCA3 <- PCA3 +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none")
PCA3

PCA4 <- fviz_pca_ind(leaf.pca1, geom = c("point"),
                        addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                         pointshape = 19,
                     pointsize = 0.5,
                     palette = "npg",
                         habillage = noNA_LMAobs$Family,
                         title="Family",
                         repel = TRUE)
PCA4 <- PCA4 +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none")
PCA4


PCA5 <- fviz_pca_ind(leaf.pca1, geom = c("point"),
                      addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                      pointshape = 19,
                     pointsize = 0.5,
                     palette = "npg",
                      habillage = noNA_LMAobs$Pheno,
                      title="Phenology",
                      repel = TRUE)
PCA5 <- PCA5 +xlim(-4, 7.5) + ylim (-3, 5) +theme(legend.position="none")
PCA5

PCA6 <- fviz_pca_ind(leaf.pca1, geom = c("point"),
                      addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                      pointshape = 19,
                     pointsize = 0.5,
                     palette = "npg",
                      habillage = noNA_LMAobs$Interaction,
                      title="Interaction",
                      repel = TRUE)
PCA6 <- PCA6 +xlim(-4, 7.5) + ylim (-3, 5) +theme(legend.position="none")
PCA6

figureSI_PCA <- ggarrange(PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,
                     labels = c("A", "B","C","D","E","F"),
                     ncol = 2, nrow = 3)

##### ------ IV.5. // Running PCA for data with LMA observations AND imputation #####
quantivar2 <- c("MAX_area","MAX_lw","MAX_l","MAX_w","LMA_all") 
leaf.pca2 <- PCA(noNA_LMAall[,quantivar2],graph=F)
eig.val2 <- get_eigenvalue(leaf.pca2) 
head(eig.val2) # Dimensions 1 and 2 explain 83.91% of data location 
fviz_screeplot(leaf.pca2,addlabels = TRUE) 
fviz_pca_var (leaf.pca2, repel = TRUE) 
var2 <- get_pca_var(leaf.pca2) 
head(var2$contrib)  
fviz_contrib_c <- fviz_contrib (leaf.pca2, "var", axes = 1,title="PC1, with imputation") # Axes 1 --> MAX_area and MAX_w
fviz_contrib_d <-fviz_contrib (leaf.pca2, "var", axes = 2,title="PC2, with imputation") # Axes 2 --> MAX_lw
figureSI_contrib <- ggarrange(fviz_contrib_a,fviz_contrib_b,fviz_contrib_c,fviz_contrib_d,
                          labels = c("A", "B","C","D"),
                          ncol = 2, nrow = 2)
figureSI_contrib

PCA1b <- fviz_pca_biplot(leaf.pca2, geom = c("point", "text"),
                        habillage = as.factor(noNA_LMAall$Locality), # color by Locality 
                        label=c("var"),
                        pointsize = 0.5,
                        col.var = "black",
                        addEllipses = TRUE, ellipse.level=0.95, ellipse.type="convex",
                        repel = TRUE, 
                        palette = "npg",
                        title = "Specimens") 
PCA1b <- PCA1b + theme(legend.position="none")
PCA1b <- PCA1b + xlim(-5, 7.5) + ylim (-3.5, 5)
PCA1b       

PCA2b <- fviz_pca_ind(leaf.pca2, geom = c("point"),
                     addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                     pointshape = 19,
                     pointsize = 0.5,
                     col.ind = noNA_LMAall$combined_TCT,
                     palette = "npg",
                     title="Combined TCT",
                     repel = TRUE)
PCA2b <- PCA2b +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none")
PCA2b

PCA3b <- fviz_pca_ind(leaf.pca2, geom = c("point"),
                     addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                     pointshape = 19,
                     pointsize = 0.5,       
                     palette = "npg",
                     col.ind = noNA_LMAall$cat_TCT,
                     title="Categorial TCT",
                     repel = TRUE)
PCA3b <- PCA3b +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none")
PCA3b

PCA4b <- fviz_pca_ind(leaf.pca2, geom = c("point"),
                     addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                     pointshape = 19,
                     pointsize = 0.5, 
                     palette = "npg",
                     habillage = noNA_LMAall$Family,
                     title="Family",
                     repel = TRUE)
PCA4b <- PCA4b +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none")
PCA4b


PCA5b <- fviz_pca_ind(leaf.pca2, geom = c("point"),
                     addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                     pointshape = 19,
                     pointsize = 0.5, 
                     palette = "npg",
                     habillage = noNA_LMAall$Pheno,
                     title="Phenology",
                     repel = TRUE)
PCA5b <- PCA5b +xlim(-4, 7.5) + ylim (-3, 5) +theme(legend.position="none")
PCA5b

PCA6b <- fviz_pca_ind(leaf.pca2, geom = c("point"),
                     addEllipses = T, ellipse.level=0.95,ellipse.type="convex",
                     pointshape = 19,
                     pointsize = 0.5, 
                     palette = "npg",
                     habillage = noNA_LMAall$Interaction,
                     title="Interaction",
                     repel = TRUE)
PCA6b <- PCA6b +xlim(-4, 7.5) + ylim (-3, 5) +theme(legend.position="none")
PCA6b

figure_PCA <- ggarrange(PCA1b,PCA2b,PCA3b,PCA4b,PCA5b,PCA6b,
                          labels = c("A", "B","C","D","E","F"),
                          ncol = 2, nrow = 3)
figure_PCA

figure_PCAlight <- ggarrange(PCA1b,PCA4b,PCA5b,PCA6b,
                        labels = c("A", "B","C","D"),
                        ncol = 2, nrow = 2)
figure_PCAlight


#### V. Generalized Linear Models ####
##### ------ V.1. // GLM 1/3 - for data with LMA observations only ####

# A - LMA 
shapiro.test(noNA_LMAobs$LMA_obs) #p-value = 2.585e-05
gamma_test(noNA_LMAobs$LMA_obs) # V = 1.5729, p-value = 0.266 --> data fit to gamma distribution
lm.null.a <- glm(LMA_obs ~ 1, data=noNA_LMAobs, family = Gamma(link = "identity"), na.action = na.omit)
model.aic.both.a <- step(lm.null.a, direction = "both", trace = 1, scope = ~ Locality + Family + Species + combined_TCT + cat_TCT + PI + Pheno + MAX_area + Growth_form)
formula(model.aic.both.a) # LMA_obs ~ MAX_area + Pheno + Species + Locality ; AIC=1930.31
anova.LMA <- Anova(model.aic.both.a) # Type II Anova --> the order of parameter in the model does not matter
anova.LMA # Locality isn't significant

par(mfrow = c(2,2))
plot(model.aic.both.a)
par(mfrow = c(1,1))
shapiro.test(residuals(model.aic.both.a)) # W = 0.99606, p-value = 0.8323, normality of residuals ok 
bptest(model.aic.both.a) #  p-value = 0.07368 --> homogeneity of residuals ok
check_model(model.aic.both.a)
r2(model.aic.both.a)

# B - leaf size
shapiro.test(noNA_LMAobs$MAX_area) #p-value = 1.109e-13
gamma_test(noNA_LMAobs$MAX_area) # V = -0.0014267, p-value = 0.9992 --> data fit to gamma distribution
lm.null.b <- glm(MAX_area ~ 1, data=noNA_LMAobs, family = Gamma(link="identity"), na.action = na.omit)
model.aic.both.size <- step(lm.null.b, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + combined_TCT + cat_TCT+ PI + Pheno + Growth_form)
# the model do not converge when using specimen_TCT or combined_TCT with cat_TCT. 
formula(model.aic.both.size) # MAX_area ~ Species + PI + Pheno;  AIC=3334.94
anova.size <- Anova(model.aic.both.size) # Type II Anova 
anova.size # all significant

cor.test(noNA_LMAobs$MAX_area,noNA_LMAobs$PI,method="spearman") # rho = -0.3814086 ; p-value = 2.615e-09
boxplot(noNA_LMAobs$MAX_area~noNA_LMAobs$Pheno) 

par(mfrow = c(2,2))
plot(model.aic.both.size) 
shapiro.test(residuals(model.aic.both.size)) # p-value = 0.2264 normality of residuals ok 
bptest(model.aic.both.size) # p-value = 0.002736 problem with residuals homogeneity, but it seems it only comes from 3 points
check_model(model.aic.both.size)
r2(model.aic.both.size)

# C - Interaction with insects 
100*sum(noNA_LMAobs$area_damaged == 0)/nrow(noNA_LMAobs) # 92.11% of data are zero!  
non_zero <- ifelse(noNA_LMAobs$area_damaged > 0, 1, 0)

ggplot(aes(x = as.numeric(row.names(noNA_LMAobs)), y = area_damaged),  data = noNA_LMAobs) + 
  geom_jitter(aes(color = area_damaged > 0))

# we have semi(continuous) and zero-inflated data 
# we have a lot of zeros, which mean either : 
# (1) no herbivore trace was found (e.g., damage on a missing part of the leaves or traces too small to be observed and measured); 
# or (2) the leaf wasn't eaten
# we will use a "two-stage model" process as suggested here: 
# (Ben Bolker (https://stats.stackexchange.com/users/2126/ben-bolker), How to model non-negative zero-inflated continuous data?, URL (version: 2021-10-03): https://stats.stackexchange.com/q/187843)

# step 1 : presence / absence of damages
# Is the probability to be eaten/non eaten for a leaf related to certain traits?
library(plyr)
noNA_LMAobs$Interaction <- revalue(noNA_LMAobs$Interaction, c("yes"=1))
noNA_LMAobs$Interaction <- revalue(noNA_LMAobs$Interaction, c("no"=0))
noNA_LMAobs$Interaction=as.factor(noNA_LMAobs$Interaction)

lm.null.herbi.1a <- glm(Interaction ~ 1, data=noNA_LMAobs, family = binomial (link="logit"))
model.herbi.1a <- step(lm.null.herbi.1a, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + LMA_obs + MAX_area + combined_TCT + PI + Pheno)
formula(model.herbi.1a) # nothing explains the presence / absence of herbivory traces (with the var "Interaction")

# Step 2: surface of damages
# Is the surface damaged by herbivory interaction associated to particular traits?
short_herbi <- subset(noNA_LMAobs, noNA_LMAobs$area_damaged!="0") # We focus on leaves for which non-zero values are observed, n=18
shapiro.test(short_herbi$area_damaged) # p-value = 9.506e-07
gamma_test(short_herbi$area_damaged) # V = 2.2726, p-value = 0.1081 --> data fit to gamma distribution

lm.null.herbi.1b <- glm(area_damaged ~ 1, data=short_herbi, family = Gamma(link="identity"), na.action = na.omit)
model.herbi.1b <- step(lm.null.herbi.1b, direction = "both", trace = 1, scope = ~ -1 + Locality + Species + LMA_obs + MAX_area + combined_TCT + Pheno)
formula(model.herbi.1b) # algo did not converge (leaf area could have an effect though), removing combined_TCT and pheno var doesnt help
anova.herbi2 <- Anova(model.herbi.1b) 
anova.herbi2 

# We also try to use the Herbivory Index instead of the area damaged
lm.null.herbi.1c <- glm(Herbivory_index ~ 1, data=short_herbi, family = Gamma(link="log"), na.action = na.omit)
model.herbi.1c <- step(lm.null.herbi.1c, direction = "both", trace = 1, scope = ~ -1 + Locality + Species + LMA_obs + MAX_area + combined_TCT + cat_TCT + Pheno)
formula(model.herbi.1c) # Herbivory_index ~ 1 -> No model found. 

##### ------ V.2. // GLM 2/3 - for data with LMA observations & imputations #####

# -- LMA dataset WITH LMA imputation

# A - LMA 
gamma_test(noNA_LMAall$LMA_all) # V = -1.8269, p-value = 0.1964 --> data fit to gamma distribution
lm.null.c <- glm(LMA_all ~ 1, data=noNA_LMAall, family = Gamma(link="identity"), na.action = na.omit)
modelLMA2 <- step(lm.null.c, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + combined_TCT + cat_TCT + PI + Pheno + MAX_area + Growth_form)
formula(modelLMA2) # LMA_all ~ Species + MAX_area + Pheno + PI + Locality, AIC=4431.38
anova.LMA <- Anova(modelLMA2) # Type II Anova --> the order of parameter in the model does not matter
anova.LMA # all variable are significant
summary(modelLMA2)

par(mfrow = c(2,2))
plot(modelLMA2)
shapiro.test(residuals(modelLMA2)) # p-value = 0.001568 --> not ok but it's because the number of specimen is important!
bptest(modelLMA2) # p-value = 0.1525 --> homogeneity ok
check_model(modelLMA2)
r2(modelLMA2)

# B - leaf size
gamma_test(noNA_LMAall$MAX_area) # V = 1.459, p-value = 0.3022 --> data fit to gamma distribution
lm.null.d <- glm(MAX_area ~ 1, data=noNA_LMAall, family = Gamma(link="identity"), na.action = na.omit)
model.size.2 <- step(lm.null.d, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + combined_TCT + cat_TCT + PI + Pheno + Growth_form)
formula(model.size.2) # MAX_area ~ Species + PI + Pheno + Locality;  AIC=7525.76
anova.size <- Anova(model.size.2) # Type II Anova 
anova.size # all significant but Locality, marginally significant
summary(model.size.2)

cor.test(noNA_LMAall$MAX_area,noNA_LMAall$PI,method="spearman") # rho = -0.2911195  ; p-value = 1.77e-11
boxplot(noNA_LMAall$MAX_area~noNA_LMAall$Pheno) 

shapiro.test(residuals(model.size.2)) # p-value = 0.0007707 --> N not ok 
bptest(model.size.2) # p-value = 4.192e-05 --> homogeneity not ok
par(mfrow = c(2,2))
plot(model.size.2) 
check_model(model.size.2)
# in this case, normality of the residuals doesn't seem that good, which is surprising because the number of data is quite important

r2(model.size.2)
summary(model.size.2)

# C - Interaction with insects 
100*sum(noNA_LMAall$area_damaged == 0)/nrow(noNA_LMAall) # 90.25% of data are zero!  
non_zero <- ifelse(noNA_LMAall$area_damaged > 0, 1, 0)

ggplot(aes(x = as.numeric(row.names(noNA_LMAall)), y = area_damaged),  data = noNA_LMAall) + 
  geom_jitter(aes(color = area_damaged > 0))

# step 1 : presence / absence of damages
# Is the probability to be eaten/non eaten for a leaf related to certain traits?
noNA_LMAall$Interaction <- revalue(noNA_LMAall$Interaction, c("yes"=1))
noNA_LMAall$Interaction <- revalue(noNA_LMAall$Interaction, c("no"=0))
noNA_LMAall$Interaction=as.factor(noNA_LMAall$Interaction)

lm.null.herbi.2a <- glm(Interaction ~ 1, data=noNA_LMAall, family = binomial (link="logit"))
model.herbi.2a <- step(lm.null.herbi.2a, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + LMA_all + MAX_area + combined_TCT + cat_TCT + PI + Pheno + Growth_form)
formula(model.herbi.2a) # nothing explains the presence / absence of herbivory traces (with the var "Interaction")
anova.size <- Anova(model.herbi.2a) # Type II Anova 
anova.size # all significant but Locality, marginally significant

r2(model.herbi.2a)
summary(model.herbi.2a)

# Step 2: surface of damages
# Is the surface damaged by herbivory interaction associated to particular traits?
short_herbi2 <- subset(noNA_LMAall, noNA_LMAall$area_damaged!="0") # We focus on leaves for which non-zero values are observed, n=50
gamma_test(short_herbi2$area_damaged) # V = 2.3293, p-value = 0.09955 --> data fit to gamma distribution

lm.null.herbi.2b <- glm(area_damaged ~ 1, data=short_herbi2, family = Gamma(link="identity"), na.action = na.omit)
model.herbi.2b <- step(lm.null.herbi.2b, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + LMA_all + MAX_area + cat_TCT + combined_TCT + Pheno + Growth_form)
formula(model.herbi.2b) # area_damaged ~ MAX_area + Pheno
anova.herbi.2b <- Anova(model.herbi.2b) 
anova.herbi.2b # both variable effects are significant

shapiro.test(residuals(model.herbi.2b)) # p-value = 0.3701 ok
bptest(model.herbi.2b) # p-value = 0.01653 --> homogeneity not ok
par(mfrow = c(2,2))
plot(model.herbi.2b)
check_model(model.herbi.2b)

summary(model.herbi.2b) # positive corr. to leaf surface, negative corr to leaf lifespan. 
r2(model.herbi.2b)

plot(short_herbi2$MAX_area,short_herbi2$area_damaged)
shapiro.test(short_herbi2$MAX_area) # data not N --> non parametric test
cor.test(short_herbi2$MAX_area,short_herbi2$area_damaged, method="spearman") # p-value = 0.004895 there is a correlation ; rho = 0.39

plot(short_herbi2$LMA_all,short_herbi2$area_damaged)
shapiro.test(short_herbi2$LMA_all) # data N
shapiro.test(short_herbi2$area_damaged) # data not N --> non parametric test
cor.test(short_herbi2$LMA_all,short_herbi2$area_damaged, method="spearman") # p-value = 0.1719; there is no correlation ; rho = -0.1960624 

# We also try using the Herbivory Index instead of the area damaged
lm.null.herbi.2c <- glm(Herbivory_index ~ 1, data=short_herbi2, family = Gamma(link="log"), na.action = na.omit)
model.herbi.2c <- step(lm.null.herbi.2c, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + LMA_all + cat_TCT + combined_TCT + Pheno + Growth_form)
formula(model.herbi.2c) # Herbivory_index ~ Pheno + LMA_all;  AIC=236.26. 
anova.herbi.2c <- Anova(model.herbi.2c) 
anova.herbi.2c # Only phenology is significant

shapiro.test(residuals(model.herbi.2c)) # p-value = 0.401 ok
bptest(model.herbi.2c) # p-value = 0.1955 --> homogeneity ok
par(mfrow = c(2,2))
plot(model.herbi.2c)
check_model(model.herbi.2c)

summary(model.herbi.2c)
r2(model.herbi.2c)


##### ------ V.2. // GLM 3/3 - A last attempt to understand plant-insect interactions (dataset n = 6284 leaves, no quantitative traits) #####

complete_data <- read_excel("Sf-SuBe_dataset_full_May_2022.xlsx", sheet = "Dataset complet", na = "na")
complete_data$Interaction <- revalue(complete_data$Interaction, c("yes"=1))
complete_data$Interaction <- revalue(complete_data$Interaction, c("no"=0))

complete_data$locality=as.factor(complete_data$locality)
complete_data$combined_TCT=as.factor(complete_data$combined_TCT)
complete_data$phenology=as.factor(complete_data$phenology)
complete_data$Interaction=as.factor(complete_data$Interaction)
complete_data$species=as.factor(complete_data$species)
complete_data$`growth form`=as.factor(complete_data$`growth form`)

lm.herbi.complete <- glm(Interaction ~ 1, data=complete_data, family = binomial (link="logit"))
model.herbi.complete1 <- step(lm.herbi.complete, direction = "both", trace = 1, scope = ~ -1 + combined_TCT + phenology + species + locality)
anova.herbi.comp1 <- Anova(model.herbi.complete1) # Step:  AIC=2838.79 ; Interaction ~ locality + combined_TCT
anova.herbi.comp1 # both effects are significant
summary(model.herbi.complete1) # More interactions in SuBe, F and O
r2(model.herbi.complete1)

##### Some of the figures for the paper (other are un the PCA section) ####

# boxplots for the MS
krsize<-with(noNA_LMAall,kruskal(MAX_area,Family,group=TRUE, console=TRUE))
krLMA<-with(noNA_LMAall,kruskal(LMA_all,Family,group=TRUE, console=TRUE))

bx1 <- ggplot(noNA_LMAall, aes(y=MAX_area, x=fct_reorder(Family,MAX_area,.fun="median",.desc = T))) +  
  geom_boxplot(alpha=0.5, outlier.shape=NA)+
  scale_color_npg()+
  ylim(0,3500)+
  geom_jitter(aes(x=Family,y=MAX_area,color=factor(Locality)),size=0.4, width=0.3)+
  xlab("Family")+
  ylab(label = "Leaf size") + 
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))+
  theme(legend.position="none")

bx2 <- ggplot(noNA_LMAall, aes(y=LMA_all, x=fct_reorder(Family,LMA_all,.fun="median",.desc = T))) +  
  geom_boxplot(alpha=0.5, outlier.shape=NA)+
  scale_color_npg()+
  ylim(0,200)+
  geom_jitter(aes(x=Family,y=LMA_all,color=factor(Locality)),size=0.4, width=0.3)+
  xlab("Family")+
  ylab(label = "LMA") + 
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))+
  theme(legend.position="none")

figure_bxpl <- ggarrange(bx1,bx2,
                             labels = c("A", "B"),
                             ncol = 2, nrow = 1)
figure_bxpl

# boxplots for the SI
krsize2<-with(noNA_LMAobs,kruskal(MAX_area,Family,group=TRUE, console=TRUE))
krLMA2<-with(noNA_LMAobs,kruskal(LMA_obs,Family,group=TRUE, console=TRUE))

bx3 <- ggplot(noNA_LMAobs, aes(y=MAX_area, x=fct_reorder(Family,MAX_area,.fun="median",.desc = T))) +  
  geom_boxplot(alpha=0.5, outlier.shape=NA)+
  scale_color_npg()+
  ylim(0,3500)+
  geom_jitter(aes(x=Family,y=MAX_area,color=factor(Locality)),size=0.4, width=0.3)+
  xlab("Family")+
  ylab(label = "Leaf size") + 
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))+
  theme(legend.position="none")

bx4 <- ggplot(noNA_LMAobs, aes(y=LMA_all, x=fct_reorder(Family,LMA_all,.fun="median",.desc = T))) +  
  geom_boxplot(alpha=0.5, outlier.shape=NA)+
  scale_color_npg()+
  ylim(0,200)+
  geom_jitter(aes(x=Family,y=LMA_obs,color=factor(Locality)),size=0.4, width=0.3)+
  xlab("Family")+
  ylab(label = "LMA") + 
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))+
  theme(legend.position="none")

figure_bxpl_SI <- ggarrange(bx3,bx4,
                         labels = c("A", "B"),
                         ncol = 2, nrow = 1)
figure_bxpl_SI

# Barplot TCT / Fam
brpl1 <- ggplot(data=noNA_LMAall, aes(x=combined_TCT, fill=Family)) +
  geom_bar(stat="count")+
  theme_minimal()+
  scale_fill_npg()+
  ylim(0,220)+
  xlab("Combined TCTs")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
  theme(legend.position="bottom")

hist1 <- ggplot(data=noNA_LMAall, aes(x=Family, fill=Locality))+
  geom_bar(position="dodge")+
  theme_minimal()+
  scale_fill_npg()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
  geom_text(stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+
  theme(legend.position="none")

figure_brpl <- ggarrange(hist1,brpl1,
                         labels = c("A", "B"),
                         ncol = 2, nrow = 1)
figure_brpl


brpl2 <- ggplot(data=noNA_LMAobs, aes(x=combined_TCT, fill=Family)) +
  geom_bar(stat="count")+
  theme_minimal()+
  scale_fill_npg()+
  ylim(0,220)+
  xlab("Combined TCTs")+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
  theme(legend.position="bottom")

hist2 <- ggplot(data=noNA_LMAobs, aes(x=Family, fill=Locality))+
  geom_bar(position="dodge")+
  theme_minimal()+
  scale_fill_npg()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+
  geom_text(stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+
  theme(legend.position="none")

figure_brpl2 <- ggarrange(hist2,brpl2,
                         labels = c("A", "B"),
                         ncol = 2, nrow = 1)
figure_brpl2