# Script Mueller et al., 2023, PeerJ # questions may be addressed to Agathe Toumoulin: agathe.toumoulin@gmail.com # data used are available as supporting material to the paper. setwd("~/Desktop/Collaborations/Christian/Paper_11_08_21/") # The different datasets mentioned in the paper are named as follow: # dataset 3 = "Leaf" # dataset 4 = "noNA_LMAobs" # dataset 5 = "noNA_LMAall" #### Packages loading and dataset importation #### library(readxl) # to import Excel files library(plyr) # to calculate cumulative mean library(dplyr) # to calculate cumulative mean library(ggplot2) # to illustrate results library(FactoMineR) # this package run the Principal Component Analysis (PCA) library(factoextra) # this package is to illustrate results from the PCA library(MASS) # to run PCA with negative binomial distribution data library(car) # to run type II anova library(lmtest) # to run the Breusch-Pagan test (test homoscedasticity) library(reshape) # to reshape data set before plotting multiple lines with ggplot library(agricolae) # for kruskal tests library(goft) # for testing data fitting to gamma law (to choose a family when GLMing) library(gridExtra) # Functions for "Grid" Graphics library(forcats) # extra options for ggplot library(nlme) # Nonlinear Mixed-Effects Models library(ggpubr) # combine Ggplots together library(performance) # check model assumptions library(mice) # data imputation library(lm.beta) # standardized coefficients library(ggsci) # for palette npg library(ggExtra) library(tidyr) # [leaf] - Importation of the complete dataset leaf <- read_excel("Mueller_et_al_all-digitized-leaf-data_12-03-22.xlsx", sheet = "data", na = "na") str(leaf) #### I. Basic leaf area / LMA values and differences between localities #### # Leaf area summary(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) sd(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) # mean = 1262.40; sd = 963.0993 <<<< summary(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) sd(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # mean = 766.73; sd = 786.242 <<<<< shapiro.test(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) # W = 0.8576, p-value < 2.2e-16, data are not normal shapiro.test(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # W = 0.75549, p-value < 2.2e-16, data are not normal wilcox.test(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"],leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # W = 111396, p-value < 2.2e-16 >> leaf surface is significantly higher in Seifhennersdorf <<<<< # LMA summary(leaf$LMA[leaf$Locality=="Seifhennersdorf"]) sd(leaf$LMA[leaf$Locality=="Seifhennersdorf"],na.rm=T) # mean = 84.92; NAs = 225 // # sd = 23.34044 <<<< summary(leaf$LMA[leaf$Locality=="Suletice-Berand"]) sd(leaf$LMA[leaf$Locality=="Suletice-Berand"],na.rm=T) # mean = 104.84; NAs = 257 // Sd =29.96921 <<<<< shapiro.test(leaf$LMA[leaf$Locality=="Seifhennersdorf"]) # W = 0.95399, p-value = 1.756e-05, data are not normal shapiro.test(leaf$LMA[leaf$Locality=="Suletice-Berand"]) # W = 0.96846, p-value = 0.002201, data are not normal wilcox.test(leaf$LMA[leaf$Locality=="Seifhennersdorf"],leaf$LMA[leaf$Locality=="Suletice-Berand"]) # W = 7397, p-value = 3.582e-10 >> leaf LMA is significantly higher in Suletice-Berand <<<< # barplot (number of leaf per family per locality) figure1 <- ggplot(data=leaf, aes(x=Family, fill=Locality))+ geom_bar(position="dodge")+ theme_classic()+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+ geom_text(stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+ ggtitle("[all digitized leaves, n=800, leaf]")+ theme(legend.position="none") figure1 # since the number of leaf/family varies between Sf and SuBe, we may expect family to explain part of # leaf trait variation between the two localities. #### II. Leaf size and LMA outliers removal #### leaf.Sf <- subset(leaf, leaf$Locality=="Seifhennersdorf") # n = 400 leaf.SuBe <- subset(leaf, leaf$Locality=="Suletice-Berand") # n = 400 ##### ------ II.1. // Leaf size outliers #### meansf <- mean(leaf.Sf$MAX_area) # average size in Sf = 1262.40 Fig2A <- ggplot(leaf.Sf, aes(x=Family,y=MAX_area)) + geom_hline(yintercept=meansf, colour="grey",linetype = "dashed")+ geom_jitter(aes(x=Family,y=MAX_area,colour=factor(Locality)), width=0.25) + geom_boxplot(alpha=0.5, outlier.colour="black",outlier.shape=8)+ xlab(label = "Family") + ylab(label = "Max. leaf area (mm2)") + theme_classic()+ scale_y_continuous(limits=c(0,6000))+ theme(legend.position="none")+ ggtitle("Sf [n = 400; leaf]")+ theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1)) meanSuBe <- mean(leaf.SuBe$MAX_area) # average size in SuBe = 766.73 Fig2B <- ggplot(leaf.SuBe, aes(x=Family,y=MAX_area)) + geom_hline(yintercept=meanSuBe, colour="grey",linetype = "dashed")+ geom_jitter(aes(x=Family,y=MAX_area),color="#00AFBB",width=0.25) + geom_boxplot(alpha=0.5, outlier.colour="black",outlier.shape=8)+ xlab(label = "Family") + ylab(label = "Max. leaf area (mm2)") + theme_classic()+ scale_y_continuous(limits=c(0,6000))+ theme(legend.position="none")+ ggtitle("SuBe [n = 400; leaf]")+ theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1)) figure2 <- ggarrange(Fig2A,Fig2B, labels = c("A", "B"), ncol = 2, nrow = 1) figure2 # remove lines with leaf area > 1.5 interquartile distance for Seifhennersdorf and then Suletice-Berand outlier_val_Sf <- boxplot.stats(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"])$out outlier_idx_Sf <-which(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"] %in% c(outlier_val_Sf)) outlier_val_SuBe <- boxplot.stats(leaf$MAX_area[leaf$Locality=="Suletice-Berand"])$out outlier_idx_SuBe <-which(leaf$MAX_area[leaf$Locality=="Suletice-Berand"] %in% c(outlier_val_SuBe)) # temporary datasat without leaf size outliers # minus size outliers -> n = 759 leaf_clean_a <- subset(leaf[-outlier_idx_Sf,]) leaf_clean_a <- subset(leaf_clean_a[-outlier_idx_SuBe,]) leaf_clean_a_Sf <- subset(leaf.Sf[-outlier_idx_Sf,]) # just for visualization purpose leaf_clean_a_SuBe <- subset(leaf.SuBe[-outlier_idx_SuBe,]) # just for visualization purpose meansfclean <- mean(leaf_clean_a_Sf$MAX_area) # 1144.33 Fig2C <- ggplot(leaf_clean_a_Sf, aes(x=Family,y=MAX_area)) + # outliers removed geom_hline(yintercept=meansfclean, colour="grey",linetype = "dashed")+ geom_jitter(aes(x=Family,y=MAX_area,colour=factor(Locality)), width=0.25) + geom_boxplot(alpha=0.5, outlier.shape=NA)+ xlab(label = "Family") + ylab(label = "Max. leaf area (mm2)") + theme_classic()+ scale_y_continuous(limits=c(0,6000))+ annotate("text", x = 12, y = 5000, col="grey",label = "OUTLIERS REMOVED")+ theme(legend.position="none")+ ggtitle("Sf [n = 385; leaf_clean_a_Sf]")+ theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1)) meansubeclean <- mean(leaf_clean_a_SuBe$MAX_area) # 608.18 Fig2D <- ggplot(leaf_clean_a_SuBe, aes(x=Family,y=MAX_area)) + # outliers removed geom_hline(yintercept=meansubeclean, colour="grey",linetype = "dashed")+ geom_jitter(aes(x=Family,y=MAX_area),color="#00AFBB",width=0.25) + geom_boxplot(alpha=0.5, outlier.shape=NA)+ xlab(label = "Family") + ylab(label = "Max. leaf area (mm2)") + theme_classic()+ scale_y_continuous(limits=c(0,6000))+ annotate("text", x = 12, y = 5000, col="grey",label = "OUTLIERS REMOVED")+ theme(legend.position="none")+ ggtitle("SuBe [n = 374; leaf_clean_a_SuBe]")+ theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1)) figure3 <- ggarrange(Fig2A,Fig2B,Fig2C,Fig2D, labels = c("A", "B", "C","D"), ncol = 2, nrow = 2) figure3 ##### ------ II.2. // Removing LMA outliers #### outlier_val_SfLMA <- boxplot.stats(leaf_clean_a$LMA[leaf_clean_a$Locality=="Seifhennersdorf"])$out outlier_idx_SfLMA <-which(leaf_clean_a$LMA[leaf_clean_a$Locality=="Seifhennersdorf"] %in% c(outlier_val_SfLMA)) outlier_val_SuBeLMA <- boxplot.stats(leaf_clean_a$LMA[leaf_clean_a$Locality=="Suletice-Berand"])$out outlier_idx_SuBeLMA <-which(leaf_clean_a$LMA[leaf_clean_a$Locality=="Suletice-Berand"] %in% c(outlier_val_SuBeLMA)) leaf_clean_b <- subset(leaf_clean_a[-outlier_idx_SfLMA,]) leaf_clean_b <- subset(leaf_clean_b[-outlier_idx_SuBeLMA,]) # minus LMA outliers -> n = 753 Fig3B <- ggplot(data=leaf_clean_b, aes(x=Family, fill=Locality))+ geom_bar(position="dodge")+ theme_classic()+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+ geom_text(stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+ ggtitle("[no size & LMA outliers, n=752, dataset=leaf_clean_b]")+ theme(legend.position="none") figure4 <- ggarrange(figure1, Fig3B, labels = c("A", "B"), ncol = 2, nrow = 1) figure4 #### III. LMA imputation ##### ##### ------ III.1. // Preparing the data #### # We will simulate LMA for specimen belonging to species having >= 5 LMA observation per locality # what are these species? ggplot(data=leaf_clean_b, aes(x=fct_infreq(Species)))+ geom_bar(data=leaf_clean_b[which(leaf_clean_b$petiole_w>0),], aes(x=fct_infreq(Species)),fill="grey",stat = 'count') + theme_classic()+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+ geom_text(data=leaf_clean_b[which(leaf_clean_b$petiole_w>0),], stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+ ggtitle("Number of specimen / family with LMA")+ facet_wrap(~ Locality, strip.position = "bottom") + theme(legend.position="none") leaf_clean_c <- subset(leaf_clean_b, leaf_clean_b$LMA!="NA") # there are 298 observed LMA leaf_clean_c_Sf <- subset(leaf_clean_c, leaf_clean_c$Locality=="Seifhennersdorf") # n = 160 leaf_clean_c_SuBe <- subset(leaf_clean_c, leaf_clean_c$Locality=="Suletice-Berand") # n = 138 by(leaf_clean_c_Sf, leaf_clean_c_Sf$Species,nrow) # Families with with LMA observation > 5, for which we may simulate LMA in Sf a <- c("Acer_angustilobum","Betula_alboides","Carpinus_grandis","Carpinus_roscheri", "Carya_fragiliformis","Laurophyllum_sp","Leguminosites_sp","Platanus_neptuni","Rosa_lignitum","Ulmus_fischeri") by(leaf_clean_c_SuBe, leaf_clean_c_SuBe$Species,nrow) # Families with with LMA observation > 5, for which we may simulate LMA in SuBe b <- c("Daphnogene_cinnamomifolia","Engelhardia_orsbergensis","Laurophyllum_cfacutimontanum", "Leguminosae_indet","Platanus_neptuni","Symblocos_deichmuelleri","Zelkova_zelkovifolia") leaf_clean_b_Sf <- subset(leaf_clean_b, leaf_clean_b$Locality=="Seifhennersdorf") leaf_clean_b_Sf_light <- subset(leaf_clean_b_Sf, Species %in% a) leaf_clean_b_SuBe <- subset(leaf_clean_b, leaf_clean_b$Locality=="Suletice-Berand") leaf_clean_b_SuBe_light <- subset(leaf_clean_b_SuBe, Species %in% b) leaf_clean <- rbind(leaf_clean_b_Sf_light,leaf_clean_b_SuBe_light) # n = 530 # We have created a reduced dataset of leaf_clean_b. Data contains "na" from these species for which we believe # it's possible to simulate LMA quite reliably this dataset will contain LMA obs and specimen for which LMA may be predicted # cleaning workspace from temporary used datasets and values rm(leaf_clean_a, leaf_clean_a_Sf, leaf_clean_a_SuBe, leaf_clean_b, leaf_clean_b_Sf, leaf_clean_b_Sf_light, leaf_clean_b_SuBe, leaf_clean_b_SuBe_light, leaf_clean_c, leaf_clean_c_Sf, leaf_clean_c_SuBe) ##### ------ III.2. // Simulating missing LMA values #### leaf_clean$Locality=as.factor(leaf_clean$Locality) leaf_clean$Family=as.factor(leaf_clean$Family) leaf_clean$Species=as.factor(leaf_clean$Species) short <-c("ID","Locality","Species","MAX_area","MAX_w","LMA") datamice <- subset(leaf_clean[,short]) # method "norm" (bayesian linear regression) should be used for Normal data. mod1 <- lm(LMA~-1+Locality+Species+MAX_area+MAX_w, data=leaf_clean, na.action=na.omit) par(mfrow = c(2,2)) plot(mod1) # linearity, homoscedasticity and normality of the model residuals look ok shapiro.test(residuals(mod1)) # p-value = 0.07256... it's not perfectly normal but acceptable (as suggested by the check_normality function below) check_normality(mod1) # "OK: residuals appear as normally distributed (p = 0.087)" # we may use the method "norm" # the data we will simulate may tend to be closer to a Gaussian distribution than they are in our dataset. # setting up MICE init = mice(datamice, maxit=0) meth = init$method predM = init$predictorMatrix predM[, c("ID")]=0 # here are listed the variables in datamice that won't be used to predict LMA meth[c("LMA")]="norm" # method used set.seed(103) IMP <- mice(datamice, method=meth, predictorMatrix=predM, m=10) # we test the reliability of the imputation by plotting LMA vs leaf size (same as Fig. 4D) # to see whether the relation between both parameters with/without imputation values remains the same figure5 <- xyplot(IMP,LMA~MAX_area | as.factor(.imp), pch=18,cex=0.5) # ok figure5 fit<-with(IMP,lm(LMA~-1+Locality+Species+MAX_area+MAX_w)) summary(pool(fit)) # the variables contributing significantly to the imputation are Locality and MAX_area imputed_data <- complete(IMP) # we have a dataset with observation + imputed data (n= 530, 294 modelled LMA) colimput <- imputed_data[,c("MAX_area","LMA")] data <- left_join(leaf_clean, colimput, by = c("MAX_area" = "MAX_area")) # There are doublons in IDs... names(data)[names(data) == "LMA.x"] <- "LMA_obs" names(data)[names(data) == "LMA.y"] <- "LMA_all" data$`LMA value`=as.character(data$`LMA value`) data$`LMA value` <- replace_na(data$`LMA value`,"pred") write.csv2(data,file="data_LMA_imputed.csv") # to download the data. ##### ------ III.3. // Visualizing imputed vs calculated values #### # fig SI_1_A p<- ggplot(data, aes(x=LMA_all, y=MAX_area, color=`LMA value`)) + geom_point(alpha = 0.4, size=1.5) + scale_color_npg()+ ylab(label = "Leaf area") + xlab(label = "LMA") + theme_classic()+ theme(legend.position="none") fig_SI1_A <- ggMarginal(p, margins = "x", type="density", groupColour = T, groupFill = T, alpha=0.2) # fig SI_1_B data_sf <- subset(data, data$Locality=="Seifhennersdorf") data_sube <- subset(data, data$Locality=="Suletice-Berand") minobs <- min(data$LMA_all[data$`LMA value`=="obs"],na.rm="T") meanobs <- mean(data$LMA_all[data$`LMA value`=="obs"],na.rm="T") meanpred <- mean(data$LMA_all[data$`LMA value`=="pred"],na.rm="T") fig_SI1_B <- ggplot(data, aes(x=Locality,y=LMA_all,color=`LMA value`)) + geom_boxplot(aes(y=LMA_all,fct_reorder(Locality,LMA_all,.desc=T,.fun="median",na.rm=T), colour=factor(`LMA value`)), alpha=1, outlier.shape=NA)+ geom_point(position=position_jitterdodge(),alpha=0.2)+ scale_color_npg()+ annotate("text", x = 2.5, y = 200, hjust = 1, col="#F8766D",label = "calculated")+ annotate("text", x = 2.5, y = 190, hjust = 1, col="#00BFC4",label = "predicted")+ annotate("text", x = 0.5, y = 36, hjust = 0, col="#F8766D",label = "minimum calculated LMA")+ geom_hline(yintercept=minobs, colour="#F8766D",linetype = "solid", alpha=1)+ xlab(label = "Locality") + ylab(label = "LMA") + theme_classic()+ scale_y_continuous(limits=c(25,200))+ scale_x_discrete(labels=c("SuBe","Sf"))+ theme(legend.position="none")+ theme(axis.text.x = element_text(angle = 0)) fig_SI1_B # fig SI_1_C - boxplots of LMA, calculated vs predicted values per species sp <- c("Symb. dei", "Daph. cin", "Legu. ind", "Laur. sp", "Plat. nep", "Enge. ors", "Rosa lig.", "Legu. sp", "Laur. cfa", "Zelk. zel", "Ulmu. fis", "Carp. gra", "Acer ang", "Carp. ros", "Cary. fra", "Betu. alb") fig_SI1_C <- ggplot(data, aes(x=Species,y=LMA_all,color=`LMA value`)) + geom_boxplot(aes(y=LMA_all,fct_reorder(Species,LMA_all,.desc=T,.fun="median",na.rm=T),colour=factor(`LMA value`)), alpha=1, outlier.shape=NA)+ geom_point(position=position_jitterdodge(),alpha=0.5,size=0.7)+ scale_color_npg()+ geom_hline(yintercept=minobs, colour="#F8766D",linetype = "solid",alpha=1)+ xlab(label = "Species") + ylab(label = "LMA") + scale_x_discrete(labels=c(sp))+ theme_classic()+ scale_y_continuous(limits=c(25,200))+ theme(legend.position="none")+ theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1)) fig_SI_1 <- ggdraw() + draw_plot(fig_SI1_A, x = 0, y = .6, width = .6, height = .4) + draw_plot(fig_SI1_B, x = .6, y = .6, width = .4, height = .4) + draw_plot(fig_SI1_C, x = 0.05, y = 0, width = 0.9, height = 0.6) + draw_plot_label(label = c("A", "B", "C"), size = 15, x = c(0, 0.6, 0), y = c(1, 1, 0.6)) fig_SI_1 pdf(file="fig_SI_1.pdf",width=7.1, height=5) fig_SI_1 dev.off() # removing the seven predictions of LMA < any observed LMA data <- subset(data,LMA_all >= minobs) #### IV. Principal component Analyses #### ##### ------ IV.1. // Preparing the data + checking size and LMA values #### # Removing lines with NAs in one of the variable we will use in our multivariate analysis. x <- c("MAX_area","MAX_lw","MAX_l","MAX_w","LMA_obs","Locality","Family","Genus","specimen_TCT","taxon_TCT","combined_TCT","cat_TCT","PI","Pheno","Growth_form") noNA_LMAobs <- subset(data, complete.cases(data[,x])) # n = 236 xx <- c("MAX_area","MAX_lw","MAX_l","MAX_w","LMA_all","Locality","Family","Genus","specimen_TCT","taxon_TCT","combined_TCT","cat_TCT","PI","Pheno","Growth_form") noNA_LMAall <- subset(data, complete.cases(data[,xx])) # n = 506 # Suppressing / grouping rare TCTs by(noNA_LMAobs,noNA_LMAobs$cat_TCT,nrow) noNA_LMAobs <- subset(noNA_LMAobs[!noNA_LMAobs$cat_TCT %in% c("CD"),]) # n=232 by(noNA_LMAobs,noNA_LMAobs$combined_TCT,nrow) noNA_LMAobs <- subset(noNA_LMAobs[!noNA_LMAobs$combined_TCT %in% c("E,F"),]) # n=228 by(noNA_LMAobs,noNA_LMAobs$specimen_TCT,nrow) # no rare TCT with n<5 by(noNA_LMAall,noNA_LMAall$cat_TCT,nrow) # no cat_TCT with n<5 by(noNA_LMAall,noNA_LMAall$combined_TCT,nrow) # "C" and "D" are rare so we move them to "C or D" class noNA_LMAall$combined_TCT <- revalue(noNA_LMAall$combined_TCT, c("C"="C,D")) noNA_LMAall$combined_TCT <- revalue(noNA_LMAall$combined_TCT, c("D"="C,D")) by(noNA_LMAall,noNA_LMAall$specimen_TCT,nrow) # same for specimen TCT noNA_LMAall$specimen_TCT <- revalue(noNA_LMAall$specimen_TCT, c("C"="C or D")) noNA_LMAall$specimen_TCT <- revalue(noNA_LMAall$specimen_TCT, c("D"="C or D")) # check again that qualitative variables are factors noNA_LMAobs$Locality=as.factor(noNA_LMAobs$Locality) noNA_LMAobs$Family=as.factor(noNA_LMAobs$Family) noNA_LMAobs$Pheno=as.factor(noNA_LMAobs$Pheno) noNA_LMAobs$Interaction=as.factor(noNA_LMAobs$Interaction) noNA_LMAall$Locality=as.factor(noNA_LMAall$Locality) noNA_LMAall$Family=as.factor(noNA_LMAall$Family) noNA_LMAall$Pheno=as.factor(noNA_LMAall$Pheno) noNA_LMAall$Interaction=as.factor(noNA_LMAall$Interaction) ##### ------ IV.2. // Checking leaf size among datasets ##### summary(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) #mean = 1262.40; sd=963.0993 sd(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) summary(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Seifhennersdorf"]) # mean = 1104.4; sd=730.0568 sd(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Seifhennersdorf"]) summary(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Seifhennersdorf"]) # mean = 1064.28; sd = 704.9531 sd(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Seifhennersdorf"]) summary(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # mean = 766.73; sd = 786.242 sd(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) summary(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Suletice-Berand"]) # mean =452.90; sd = 476.2621 sd(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Suletice-Berand"]) summary(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Suletice-Berand"]) # mean = 526.27; sd = 437.1632 sd(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Suletice-Berand"]) shapiro.test(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"]) # W = 0.8576, p-value < 2.2e-16, data are not N shapiro.test(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Seifhennersdorf"]) # W = 0.90357, p-value = 2.072e-07, data are not N shapiro.test(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Seifhennersdorf"]) # W = 0.90875, p-value = 3.305e-11, data are not N shapiro.test(leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # W = 0.75549, p-value < 2.2e-16, data are not N shapiro.test(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Suletice-Berand"]) # W = 0.73286, p-value = 1.822e-12, data are not N shapiro.test(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Suletice-Berand"]) # W = 0.81888, p-value < 2.2e-16, data are not N wilcox.test(leaf$MAX_area[leaf$Locality=="Seifhennersdorf"],leaf$MAX_area[leaf$Locality=="Suletice-Berand"]) # W = 111396, p-value < 2.2e-16, significant differences wilcox.test(noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Seifhennersdorf"],noNA_LMAobs$MAX_area[noNA_LMAobs$Locality=="Suletice-Berand"]) # W = 10626, p-value < 2.2e-16, idem wilcox.test(noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Seifhennersdorf"],noNA_LMAall$MAX_area[noNA_LMAall$Locality=="Suletice-Berand"]) # W = 50377, p-value < 2.2e-16, idem ##### ------ IV.3. // Checking LMA among datasets ##### summary(leaf$LMA[leaf$Locality=="Seifhennersdorf"]) # mean 84.92; sd = 23.34044 sd(leaf$LMA[leaf$Locality=="Seifhennersdorf"],na.rm=T) summary(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Seifhennersdorf"]) # mean = 82.92; sd = 19.63653 sd(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Seifhennersdorf"]) summary(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Seifhennersdorf"]) # mean = 81.62; sd = 20.91337 sd(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Seifhennersdorf"]) summary(leaf$LMA[leaf$Locality=="Suletice-Berand"])# mean = 104.84; sd = 29.96921 sd(leaf$LMA[leaf$Locality=="Suletice-Berand"],na.rm=T) summary(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Suletice-Berand"]) # mean = 102.70; sd = 24.93115 sd(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Suletice-Berand"]) summary(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Suletice-Berand"]) # mean = 103.78; sd = 24.01588 sd(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Suletice-Berand"]) shapiro.test(leaf$LMA[leaf$Locality=="Seifhennersdorf"]) # W = 0.95399, p-value = 1.756e-05, data are not N shapiro.test(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Seifhennersdorf"]) # W = 0.97124, p-value = 0.009463, data are not N shapiro.test(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Seifhennersdorf"]) # W = 0.99002, p-value = 0.08377, data are N shapiro.test(leaf$LMA[leaf$Locality=="Suletice-Berand"]) # W = 0.96846, p-value = 0.002201, data are not N shapiro.test(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Suletice-Berand"]) # W = 0.97212, p-value = 0.02701, data are not N shapiro.test(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Suletice-Berand"]) # W = 0.9908, p-value = 0.09721, data are N wilcox.test(leaf$LMA[leaf$Locality=="Seifhennersdorf"],leaf$LMA[leaf$Locality=="Suletice-Berand"]) # W = 7397, p-value = 3.582e-10, significant differences wilcox.test(noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Seifhennersdorf"],noNA_LMAobs$LMA_obs[noNA_LMAobs$Locality=="Suletice-Berand"]) # W = 3425, p-value = 1.11e-09, idem wilcox.test(noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Seifhennersdorf"],noNA_LMAall$LMA_all[noNA_LMAall$Locality=="Suletice-Berand"]) # W = 15979, p-value < 2.2e-16, idem # all datasets reflect significant LMA differences between localities ##### ------ IV.4. // Running PCA for data with LMA observations only ##### quantivar <- c("MAX_area","MAX_lw","MAX_l","MAX_w","LMA_obs") # listing quantitative variables to be used leaf.pca1 <- PCA(noNA_LMAobs[,quantivar],graph=F) eig.val1 <- get_eigenvalue(leaf.pca1) head(eig.val1) # Dimensions 1 and 2 explain 85,03% of data ordination. fviz_screeplot(leaf.pca1,addlabels = TRUE) # same information with an histogram var1 <- get_pca_var(leaf.pca1) # extract information on the variables (see bellow) var1 fviz_pca_var (leaf.pca1, repel = TRUE) # show how variables affect data point location in space head(var1$contrib) # give the contribution of the different variables to the dimensions fviz_contrib_a <- fviz_contrib (leaf.pca1, "var", axes = 1,title="PC1, without imputation") # Axes 1 --> MAX_area and MAX_w fviz_contrib_b <- fviz_contrib (leaf.pca1, "var", axes = 2,title="PC2, without imputation") # Axes 2 --> MAX_lw fviz_contrib_a fviz_contrib_b # Rq: The red dashed line indicates the expected mean value if the contributions were uniform. PCA1 <- fviz_pca_biplot(leaf.pca1, geom = c("point", "text"), habillage = as.factor(noNA_LMAobs$Locality), # color by Locality label=c("var"), col.var = "black", pointsize = 0.5, addEllipses = TRUE, ellipse.level=0.95, ellipse.type="convex", repel = TRUE, palette = "npg", title = "Specimens") PCA1 <- PCA1 + theme(legend.position="none") PCA1 <- PCA1 + xlim(-5, 7.5) + ylim (-3.5, 5) PCA1 PCA2 <- fviz_pca_ind(leaf.pca1, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, col.ind = noNA_LMAobs$combined_TCT, palette = "npg", title="Combined TCT", repel = TRUE) PCA2 <- PCA2 +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none") PCA2 PCA3 <- fviz_pca_ind(leaf.pca1, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, palette = "npg", col.ind = noNA_LMAobs$cat_TCT, title="Categorial TCT", repel = TRUE) PCA3 <- PCA3 +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none") PCA3 PCA4 <- fviz_pca_ind(leaf.pca1, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, palette = "npg", habillage = noNA_LMAobs$Family, title="Family", repel = TRUE) PCA4 <- PCA4 +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none") PCA4 PCA5 <- fviz_pca_ind(leaf.pca1, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, palette = "npg", habillage = noNA_LMAobs$Pheno, title="Phenology", repel = TRUE) PCA5 <- PCA5 +xlim(-4, 7.5) + ylim (-3, 5) +theme(legend.position="none") PCA5 PCA6 <- fviz_pca_ind(leaf.pca1, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, palette = "npg", habillage = noNA_LMAobs$Interaction, title="Interaction", repel = TRUE) PCA6 <- PCA6 +xlim(-4, 7.5) + ylim (-3, 5) +theme(legend.position="none") PCA6 figureSI_PCA <- ggarrange(PCA1,PCA2,PCA3,PCA4,PCA5,PCA6, labels = c("A", "B","C","D","E","F"), ncol = 2, nrow = 3) ##### ------ IV.5. // Running PCA for data with LMA observations AND imputation ##### quantivar2 <- c("MAX_area","MAX_lw","MAX_l","MAX_w","LMA_all") leaf.pca2 <- PCA(noNA_LMAall[,quantivar2],graph=F) eig.val2 <- get_eigenvalue(leaf.pca2) head(eig.val2) # Dimensions 1 and 2 explain 83.91% of data location fviz_screeplot(leaf.pca2,addlabels = TRUE) fviz_pca_var (leaf.pca2, repel = TRUE) var2 <- get_pca_var(leaf.pca2) head(var2$contrib) fviz_contrib_c <- fviz_contrib (leaf.pca2, "var", axes = 1,title="PC1, with imputation") # Axes 1 --> MAX_area and MAX_w fviz_contrib_d <-fviz_contrib (leaf.pca2, "var", axes = 2,title="PC2, with imputation") # Axes 2 --> MAX_lw figureSI_contrib <- ggarrange(fviz_contrib_a,fviz_contrib_b,fviz_contrib_c,fviz_contrib_d, labels = c("A", "B","C","D"), ncol = 2, nrow = 2) figureSI_contrib PCA1b <- fviz_pca_biplot(leaf.pca2, geom = c("point", "text"), habillage = as.factor(noNA_LMAall$Locality), # color by Locality label=c("var"), pointsize = 0.5, col.var = "black", addEllipses = TRUE, ellipse.level=0.95, ellipse.type="convex", repel = TRUE, palette = "npg", title = "Specimens") PCA1b <- PCA1b + theme(legend.position="none") PCA1b <- PCA1b + xlim(-5, 7.5) + ylim (-3.5, 5) PCA1b PCA2b <- fviz_pca_ind(leaf.pca2, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, col.ind = noNA_LMAall$combined_TCT, palette = "npg", title="Combined TCT", repel = TRUE) PCA2b <- PCA2b +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none") PCA2b PCA3b <- fviz_pca_ind(leaf.pca2, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, palette = "npg", col.ind = noNA_LMAall$cat_TCT, title="Categorial TCT", repel = TRUE) PCA3b <- PCA3b +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none") PCA3b PCA4b <- fviz_pca_ind(leaf.pca2, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, palette = "npg", habillage = noNA_LMAall$Family, title="Family", repel = TRUE) PCA4b <- PCA4b +xlim(-4, 7.5) + ylim (-3.5, 5) + theme(legend.position="none") PCA4b PCA5b <- fviz_pca_ind(leaf.pca2, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, palette = "npg", habillage = noNA_LMAall$Pheno, title="Phenology", repel = TRUE) PCA5b <- PCA5b +xlim(-4, 7.5) + ylim (-3, 5) +theme(legend.position="none") PCA5b PCA6b <- fviz_pca_ind(leaf.pca2, geom = c("point"), addEllipses = T, ellipse.level=0.95,ellipse.type="convex", pointshape = 19, pointsize = 0.5, palette = "npg", habillage = noNA_LMAall$Interaction, title="Interaction", repel = TRUE) PCA6b <- PCA6b +xlim(-4, 7.5) + ylim (-3, 5) +theme(legend.position="none") PCA6b figure_PCA <- ggarrange(PCA1b,PCA2b,PCA3b,PCA4b,PCA5b,PCA6b, labels = c("A", "B","C","D","E","F"), ncol = 2, nrow = 3) figure_PCA figure_PCAlight <- ggarrange(PCA1b,PCA4b,PCA5b,PCA6b, labels = c("A", "B","C","D"), ncol = 2, nrow = 2) figure_PCAlight #### V. Generalized Linear Models #### ##### ------ V.1. // GLM 1/3 - for data with LMA observations only #### # A - LMA shapiro.test(noNA_LMAobs$LMA_obs) #p-value = 2.585e-05 gamma_test(noNA_LMAobs$LMA_obs) # V = 1.5729, p-value = 0.266 --> data fit to gamma distribution lm.null.a <- glm(LMA_obs ~ 1, data=noNA_LMAobs, family = Gamma(link = "identity"), na.action = na.omit) model.aic.both.a <- step(lm.null.a, direction = "both", trace = 1, scope = ~ Locality + Family + Species + combined_TCT + cat_TCT + PI + Pheno + MAX_area + Growth_form) formula(model.aic.both.a) # LMA_obs ~ MAX_area + Pheno + Species + Locality ; AIC=1930.31 anova.LMA <- Anova(model.aic.both.a) # Type II Anova --> the order of parameter in the model does not matter anova.LMA # Locality isn't significant par(mfrow = c(2,2)) plot(model.aic.both.a) par(mfrow = c(1,1)) shapiro.test(residuals(model.aic.both.a)) # W = 0.99606, p-value = 0.8323, normality of residuals ok bptest(model.aic.both.a) # p-value = 0.07368 --> homogeneity of residuals ok check_model(model.aic.both.a) r2(model.aic.both.a) # B - leaf size shapiro.test(noNA_LMAobs$MAX_area) #p-value = 1.109e-13 gamma_test(noNA_LMAobs$MAX_area) # V = -0.0014267, p-value = 0.9992 --> data fit to gamma distribution lm.null.b <- glm(MAX_area ~ 1, data=noNA_LMAobs, family = Gamma(link="identity"), na.action = na.omit) model.aic.both.size <- step(lm.null.b, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + combined_TCT + cat_TCT+ PI + Pheno + Growth_form) # the model do not converge when using specimen_TCT or combined_TCT with cat_TCT. formula(model.aic.both.size) # MAX_area ~ Species + PI + Pheno; AIC=3334.94 anova.size <- Anova(model.aic.both.size) # Type II Anova anova.size # all significant cor.test(noNA_LMAobs$MAX_area,noNA_LMAobs$PI,method="spearman") # rho = -0.3814086 ; p-value = 2.615e-09 boxplot(noNA_LMAobs$MAX_area~noNA_LMAobs$Pheno) par(mfrow = c(2,2)) plot(model.aic.both.size) shapiro.test(residuals(model.aic.both.size)) # p-value = 0.2264 normality of residuals ok bptest(model.aic.both.size) # p-value = 0.002736 problem with residuals homogeneity, but it seems it only comes from 3 points check_model(model.aic.both.size) r2(model.aic.both.size) # C - Interaction with insects 100*sum(noNA_LMAobs$area_damaged == 0)/nrow(noNA_LMAobs) # 92.11% of data are zero! non_zero <- ifelse(noNA_LMAobs$area_damaged > 0, 1, 0) ggplot(aes(x = as.numeric(row.names(noNA_LMAobs)), y = area_damaged), data = noNA_LMAobs) + geom_jitter(aes(color = area_damaged > 0)) # we have semi(continuous) and zero-inflated data # we have a lot of zeros, which mean either : # (1) no herbivore trace was found (e.g., damage on a missing part of the leaves or traces too small to be observed and measured); # or (2) the leaf wasn't eaten # we will use a "two-stage model" process as suggested here: # (Ben Bolker (https://stats.stackexchange.com/users/2126/ben-bolker), How to model non-negative zero-inflated continuous data?, URL (version: 2021-10-03): https://stats.stackexchange.com/q/187843) # step 1 : presence / absence of damages # Is the probability to be eaten/non eaten for a leaf related to certain traits? library(plyr) noNA_LMAobs$Interaction <- revalue(noNA_LMAobs$Interaction, c("yes"=1)) noNA_LMAobs$Interaction <- revalue(noNA_LMAobs$Interaction, c("no"=0)) noNA_LMAobs$Interaction=as.factor(noNA_LMAobs$Interaction) lm.null.herbi.1a <- glm(Interaction ~ 1, data=noNA_LMAobs, family = binomial (link="logit")) model.herbi.1a <- step(lm.null.herbi.1a, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + LMA_obs + MAX_area + combined_TCT + PI + Pheno) formula(model.herbi.1a) # nothing explains the presence / absence of herbivory traces (with the var "Interaction") # Step 2: surface of damages # Is the surface damaged by herbivory interaction associated to particular traits? short_herbi <- subset(noNA_LMAobs, noNA_LMAobs$area_damaged!="0") # We focus on leaves for which non-zero values are observed, n=18 shapiro.test(short_herbi$area_damaged) # p-value = 9.506e-07 gamma_test(short_herbi$area_damaged) # V = 2.2726, p-value = 0.1081 --> data fit to gamma distribution lm.null.herbi.1b <- glm(area_damaged ~ 1, data=short_herbi, family = Gamma(link="identity"), na.action = na.omit) model.herbi.1b <- step(lm.null.herbi.1b, direction = "both", trace = 1, scope = ~ -1 + Locality + Species + LMA_obs + MAX_area + combined_TCT + Pheno) formula(model.herbi.1b) # algo did not converge (leaf area could have an effect though), removing combined_TCT and pheno var doesnt help anova.herbi2 <- Anova(model.herbi.1b) anova.herbi2 # We also try to use the Herbivory Index instead of the area damaged lm.null.herbi.1c <- glm(Herbivory_index ~ 1, data=short_herbi, family = Gamma(link="log"), na.action = na.omit) model.herbi.1c <- step(lm.null.herbi.1c, direction = "both", trace = 1, scope = ~ -1 + Locality + Species + LMA_obs + MAX_area + combined_TCT + cat_TCT + Pheno) formula(model.herbi.1c) # Herbivory_index ~ 1 -> No model found. ##### ------ V.2. // GLM 2/3 - for data with LMA observations & imputations ##### # -- LMA dataset WITH LMA imputation # A - LMA gamma_test(noNA_LMAall$LMA_all) # V = -1.8269, p-value = 0.1964 --> data fit to gamma distribution lm.null.c <- glm(LMA_all ~ 1, data=noNA_LMAall, family = Gamma(link="identity"), na.action = na.omit) modelLMA2 <- step(lm.null.c, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + combined_TCT + cat_TCT + PI + Pheno + MAX_area + Growth_form) formula(modelLMA2) # LMA_all ~ Species + MAX_area + Pheno + PI + Locality, AIC=4431.38 anova.LMA <- Anova(modelLMA2) # Type II Anova --> the order of parameter in the model does not matter anova.LMA # all variable are significant summary(modelLMA2) par(mfrow = c(2,2)) plot(modelLMA2) shapiro.test(residuals(modelLMA2)) # p-value = 0.001568 --> not ok but it's because the number of specimen is important! bptest(modelLMA2) # p-value = 0.1525 --> homogeneity ok check_model(modelLMA2) r2(modelLMA2) # B - leaf size gamma_test(noNA_LMAall$MAX_area) # V = 1.459, p-value = 0.3022 --> data fit to gamma distribution lm.null.d <- glm(MAX_area ~ 1, data=noNA_LMAall, family = Gamma(link="identity"), na.action = na.omit) model.size.2 <- step(lm.null.d, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + combined_TCT + cat_TCT + PI + Pheno + Growth_form) formula(model.size.2) # MAX_area ~ Species + PI + Pheno + Locality; AIC=7525.76 anova.size <- Anova(model.size.2) # Type II Anova anova.size # all significant but Locality, marginally significant summary(model.size.2) cor.test(noNA_LMAall$MAX_area,noNA_LMAall$PI,method="spearman") # rho = -0.2911195 ; p-value = 1.77e-11 boxplot(noNA_LMAall$MAX_area~noNA_LMAall$Pheno) shapiro.test(residuals(model.size.2)) # p-value = 0.0007707 --> N not ok bptest(model.size.2) # p-value = 4.192e-05 --> homogeneity not ok par(mfrow = c(2,2)) plot(model.size.2) check_model(model.size.2) # in this case, normality of the residuals doesn't seem that good, which is surprising because the number of data is quite important r2(model.size.2) summary(model.size.2) # C - Interaction with insects 100*sum(noNA_LMAall$area_damaged == 0)/nrow(noNA_LMAall) # 90.25% of data are zero! non_zero <- ifelse(noNA_LMAall$area_damaged > 0, 1, 0) ggplot(aes(x = as.numeric(row.names(noNA_LMAall)), y = area_damaged), data = noNA_LMAall) + geom_jitter(aes(color = area_damaged > 0)) # step 1 : presence / absence of damages # Is the probability to be eaten/non eaten for a leaf related to certain traits? noNA_LMAall$Interaction <- revalue(noNA_LMAall$Interaction, c("yes"=1)) noNA_LMAall$Interaction <- revalue(noNA_LMAall$Interaction, c("no"=0)) noNA_LMAall$Interaction=as.factor(noNA_LMAall$Interaction) lm.null.herbi.2a <- glm(Interaction ~ 1, data=noNA_LMAall, family = binomial (link="logit")) model.herbi.2a <- step(lm.null.herbi.2a, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + LMA_all + MAX_area + combined_TCT + cat_TCT + PI + Pheno + Growth_form) formula(model.herbi.2a) # nothing explains the presence / absence of herbivory traces (with the var "Interaction") anova.size <- Anova(model.herbi.2a) # Type II Anova anova.size # all significant but Locality, marginally significant r2(model.herbi.2a) summary(model.herbi.2a) # Step 2: surface of damages # Is the surface damaged by herbivory interaction associated to particular traits? short_herbi2 <- subset(noNA_LMAall, noNA_LMAall$area_damaged!="0") # We focus on leaves for which non-zero values are observed, n=50 gamma_test(short_herbi2$area_damaged) # V = 2.3293, p-value = 0.09955 --> data fit to gamma distribution lm.null.herbi.2b <- glm(area_damaged ~ 1, data=short_herbi2, family = Gamma(link="identity"), na.action = na.omit) model.herbi.2b <- step(lm.null.herbi.2b, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + LMA_all + MAX_area + cat_TCT + combined_TCT + Pheno + Growth_form) formula(model.herbi.2b) # area_damaged ~ MAX_area + Pheno anova.herbi.2b <- Anova(model.herbi.2b) anova.herbi.2b # both variable effects are significant shapiro.test(residuals(model.herbi.2b)) # p-value = 0.3701 ok bptest(model.herbi.2b) # p-value = 0.01653 --> homogeneity not ok par(mfrow = c(2,2)) plot(model.herbi.2b) check_model(model.herbi.2b) summary(model.herbi.2b) # positive corr. to leaf surface, negative corr to leaf lifespan. r2(model.herbi.2b) plot(short_herbi2$MAX_area,short_herbi2$area_damaged) shapiro.test(short_herbi2$MAX_area) # data not N --> non parametric test cor.test(short_herbi2$MAX_area,short_herbi2$area_damaged, method="spearman") # p-value = 0.004895 there is a correlation ; rho = 0.39 plot(short_herbi2$LMA_all,short_herbi2$area_damaged) shapiro.test(short_herbi2$LMA_all) # data N shapiro.test(short_herbi2$area_damaged) # data not N --> non parametric test cor.test(short_herbi2$LMA_all,short_herbi2$area_damaged, method="spearman") # p-value = 0.1719; there is no correlation ; rho = -0.1960624 # We also try using the Herbivory Index instead of the area damaged lm.null.herbi.2c <- glm(Herbivory_index ~ 1, data=short_herbi2, family = Gamma(link="log"), na.action = na.omit) model.herbi.2c <- step(lm.null.herbi.2c, direction = "both", trace = 1, scope = ~ -1 + Locality + Family + Species + LMA_all + cat_TCT + combined_TCT + Pheno + Growth_form) formula(model.herbi.2c) # Herbivory_index ~ Pheno + LMA_all; AIC=236.26. anova.herbi.2c <- Anova(model.herbi.2c) anova.herbi.2c # Only phenology is significant shapiro.test(residuals(model.herbi.2c)) # p-value = 0.401 ok bptest(model.herbi.2c) # p-value = 0.1955 --> homogeneity ok par(mfrow = c(2,2)) plot(model.herbi.2c) check_model(model.herbi.2c) summary(model.herbi.2c) r2(model.herbi.2c) ##### ------ V.2. // GLM 3/3 - A last attempt to understand plant-insect interactions (dataset n = 6284 leaves, no quantitative traits) ##### complete_data <- read_excel("Sf-SuBe_dataset_full_May_2022.xlsx", sheet = "Dataset complet", na = "na") complete_data$Interaction <- revalue(complete_data$Interaction, c("yes"=1)) complete_data$Interaction <- revalue(complete_data$Interaction, c("no"=0)) complete_data$locality=as.factor(complete_data$locality) complete_data$combined_TCT=as.factor(complete_data$combined_TCT) complete_data$phenology=as.factor(complete_data$phenology) complete_data$Interaction=as.factor(complete_data$Interaction) complete_data$species=as.factor(complete_data$species) complete_data$`growth form`=as.factor(complete_data$`growth form`) lm.herbi.complete <- glm(Interaction ~ 1, data=complete_data, family = binomial (link="logit")) model.herbi.complete1 <- step(lm.herbi.complete, direction = "both", trace = 1, scope = ~ -1 + combined_TCT + phenology + species + locality) anova.herbi.comp1 <- Anova(model.herbi.complete1) # Step: AIC=2838.79 ; Interaction ~ locality + combined_TCT anova.herbi.comp1 # both effects are significant summary(model.herbi.complete1) # More interactions in SuBe, F and O r2(model.herbi.complete1) ##### Some of the figures for the paper (other are un the PCA section) #### # boxplots for the MS krsize<-with(noNA_LMAall,kruskal(MAX_area,Family,group=TRUE, console=TRUE)) krLMA<-with(noNA_LMAall,kruskal(LMA_all,Family,group=TRUE, console=TRUE)) bx1 <- ggplot(noNA_LMAall, aes(y=MAX_area, x=fct_reorder(Family,MAX_area,.fun="median",.desc = T))) + geom_boxplot(alpha=0.5, outlier.shape=NA)+ scale_color_npg()+ ylim(0,3500)+ geom_jitter(aes(x=Family,y=MAX_area,color=factor(Locality)),size=0.4, width=0.3)+ xlab("Family")+ ylab(label = "Leaf size") + theme_minimal()+ theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))+ theme(legend.position="none") bx2 <- ggplot(noNA_LMAall, aes(y=LMA_all, x=fct_reorder(Family,LMA_all,.fun="median",.desc = T))) + geom_boxplot(alpha=0.5, outlier.shape=NA)+ scale_color_npg()+ ylim(0,200)+ geom_jitter(aes(x=Family,y=LMA_all,color=factor(Locality)),size=0.4, width=0.3)+ xlab("Family")+ ylab(label = "LMA") + theme_minimal()+ theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))+ theme(legend.position="none") figure_bxpl <- ggarrange(bx1,bx2, labels = c("A", "B"), ncol = 2, nrow = 1) figure_bxpl # boxplots for the SI krsize2<-with(noNA_LMAobs,kruskal(MAX_area,Family,group=TRUE, console=TRUE)) krLMA2<-with(noNA_LMAobs,kruskal(LMA_obs,Family,group=TRUE, console=TRUE)) bx3 <- ggplot(noNA_LMAobs, aes(y=MAX_area, x=fct_reorder(Family,MAX_area,.fun="median",.desc = T))) + geom_boxplot(alpha=0.5, outlier.shape=NA)+ scale_color_npg()+ ylim(0,3500)+ geom_jitter(aes(x=Family,y=MAX_area,color=factor(Locality)),size=0.4, width=0.3)+ xlab("Family")+ ylab(label = "Leaf size") + theme_minimal()+ theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))+ theme(legend.position="none") bx4 <- ggplot(noNA_LMAobs, aes(y=LMA_all, x=fct_reorder(Family,LMA_all,.fun="median",.desc = T))) + geom_boxplot(alpha=0.5, outlier.shape=NA)+ scale_color_npg()+ ylim(0,200)+ geom_jitter(aes(x=Family,y=LMA_obs,color=factor(Locality)),size=0.4, width=0.3)+ xlab("Family")+ ylab(label = "LMA") + theme_minimal()+ theme(axis.text.x = element_text(angle = 90,vjust = 0.5, hjust=1))+ theme(legend.position="none") figure_bxpl_SI <- ggarrange(bx3,bx4, labels = c("A", "B"), ncol = 2, nrow = 1) figure_bxpl_SI # Barplot TCT / Fam brpl1 <- ggplot(data=noNA_LMAall, aes(x=combined_TCT, fill=Family)) + geom_bar(stat="count")+ theme_minimal()+ scale_fill_npg()+ ylim(0,220)+ xlab("Combined TCTs")+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+ theme(legend.position="bottom") hist1 <- ggplot(data=noNA_LMAall, aes(x=Family, fill=Locality))+ geom_bar(position="dodge")+ theme_minimal()+ scale_fill_npg()+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+ geom_text(stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+ theme(legend.position="none") figure_brpl <- ggarrange(hist1,brpl1, labels = c("A", "B"), ncol = 2, nrow = 1) figure_brpl brpl2 <- ggplot(data=noNA_LMAobs, aes(x=combined_TCT, fill=Family)) + geom_bar(stat="count")+ theme_minimal()+ scale_fill_npg()+ ylim(0,220)+ xlab("Combined TCTs")+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+ theme(legend.position="bottom") hist2 <- ggplot(data=noNA_LMAobs, aes(x=Family, fill=Locality))+ geom_bar(position="dodge")+ theme_minimal()+ scale_fill_npg()+ theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5))+ geom_text(stat='count', aes(label=..count..), angle = 90, position=position_dodge(width=0.9))+ theme(legend.position="none") figure_brpl2 <- ggarrange(hist2,brpl2, labels = c("A", "B"), ncol = 2, nrow = 1) figure_brpl2