# Code for some analysis in the paper: # Downloading the database. setwd("~/") setwd("D://Gustavo/CICESE - Doctorado/Análisis de datos/Analisis datos con GLMs en R") Dolphins_data <- read.table(file = "Dolphins_clicks_database_2011_2015_Final.txt", header = TRUE, dec = ".") # Variables as factors Dolphins_data$Time_day <- as.factor(Dolphins_data$Time_day) Dolphins_data$Day_night <- as.factor(Dolphins_data$Day_night) Dolphins_data$Site <- as.factor(Dolphins_data$Site) Dolphins_data$Month_Cat <- as.factor(Dolphins_data$Month_Cat) Dolphins_data$Year_Cat <- as.factor(Dolphins_data$Year_Cat) # Year of sampling season str(Dolphins_data) # Checking the kind of variables (factors, integers or numerical) names(Dolphins_data) # Checking the names of variables. # Downloading of the packages. library(MASS) # GLMs binomial negative distribution library(ggplot2) # Plots library(car) # Correlations of predictor variables. library(corrplot) # We used to created a "Correlogram" library(FSA) # Dunn´s test library(cowplot) # Two graphs in one page # Diel patterns analysis ######### Checking for statistical differences in counts of dolphin clicks # related to daytime $ nighttime periods ######### # Mean and SD of nigthtime and daytime condition mean(Dolphins_data$Dolphin_clicks,na.rm = TRUE) sd(Dolphins_data$Dolphin_clicks,na.rm = TRUE) mean(Dolphins_data$Dolphin_clicks[Dolphins_data$Day_night=="Day"],na.rm = TRUE) sd(Dolphins_data$Dolphin_clicks[Dolphins_data$Day_night=="Day"],na.rm = TRUE) mean(Dolphins_data$Dolphin_clicks[Dolphins_data$Day_night=="Night"],na.rm = TRUE) sd(Dolphins_data$Dolphin_clicks[Dolphins_data$Day_night=="Night"],na.rm = TRUE) # Kruskal-Wallis test. Evaluation of differences in mean by light condition of day kruskal.test(Dolphins_data$Dolphin_clicks~as.factor(Dolphins_data$Day_night)) # Kruskal-Wallis test. Evaluation of differences in mean by time of day kruskal.test(Dolphins_data$Dolphin_clicks~as.factor(Dolphins_data$Time_day)) # Dunn´s test Timeday <- dunnTest(Dolphins_data$Dolphin_clicks ~ as.factor(Dolphins_data$Time_day), method="bonferroni") print(Timeday,dunn.test.results=TRUE) # summary of results # Violin graph. Time of the day vs dolphin clicks. Part 1. Y axis = 0 to 20. g1 <- ggplot(Dolphins_data, aes(x = factor(Dolphins_data$Time_day), y = (Dolphins_data$Dolphin_clicks))) + geom_violin(draw_quantiles = c(.25, .5, .75)) + coord_cartesian(ylim = c(0,20)) + stat_summary(fun.y = "mean", geom = "point", shape = 15, size = 4, color = "midnightblue") + stat_summary(fun.y = "median", geom = "point", shape = 2, size = 3, color = "red") + labs(title="", x="Time of day", y="Dolphin clicks") + geom_point() + theme( axis.title.x = element_text(color="black", size=12, face="bold"), axis.title.y = element_text(color="black", size=12, face="bold") ) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) # Graph. Part 2. Y axis = 20 to 7,700. I split the graph into two because of the excess of zeroes. # More easy to see the distribution of the response variable g2 <- ggplot(Dolphins_data, aes(x = factor(Dolphins_data$Time_day), y = (Dolphins_data$Dolphin_clicks))) + geom_line() + coord_cartesian(ylim = c(20, 8000)) + scale_y_continuous(breaks = c(20,2000,4000,6000,8000), limits = c(20,8000)) + labs(title="", x="", y="Dolphin clicks") + geom_point() + theme( axis.title.x = element_text(color="black", size=12, face="bold"), axis.title.y = element_text(color="black", size=12, face="bold") ) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + theme(plot.margin = unit(c(0, 0, 0, 0), "cm")) # Two graphs in one page png("Figure2.png", units="in", width=5, height=5, res=600) plot_grid(g2, g1, ncol=1, align="v") dev.off() ############ Spatial patterns of the observed acoustic occurrence of dolphins ########## # Kruskal-Wallis test. Evaluation of differences in mean by sampling season (Year_Cat) kruskal.test(Dolphins_data$Dolphin_clicks~as.factor(Dolphins_data$Year_Cat)) SamplingSeason <- dunnTest(Dolphins_data$Dolphin_clicks ~ as.factor(Dolphins_data$Year_Cat), method="bonferroni") print(SamplingSeason,dunn.test.results=TRUE) # ########## Data exploration ########## # Basic exploration in the response variable (counts of dolphin clicks per hour) plot(table(Dolphins_data$Dolphin_clicks)) summary(Dolphins_data$Dolphin_clicks) # NA´s 34,218. No NA´s = 293,137 count(Dolphins_data, vars=Dolphin_clicks,wt_var = "0") # Results with 0 data (290,328). 99% of response variable dara are zeros. # Checking for outliers in respose variable boxplot(Dolphins_data$Dolphin_clicks, main = "Dolphin clicks outliers", boxwex = 0.5,col="blue") # Examples of box plots with predictor and respose variable boxplot(Dolphin_clicks ~ Year_Cat, data = Dolphins_data, xlab = "Sampling season", ylab = "Dolphin clicks") # Pearson correlation coefficiente to evalute association between two or more variables. Mypredvars <- data.frame(Dolphins_data$Vaquita_clicks, Dolphins_data$UTM_north, Dolphins_data$UTM_east, Dolphins_data$Sonar_clicks, Dolphins_data$Sea_temperature, Dolphins_data$Depth) Mydata.cor = cor(Mypredvars, method = c("pearson"), use = "pairwise.complete.obs") corrplot(Mydata.cor) # Graph matrix of correlations # The Depth and UTM_east variables had a value of 0.67 coefficient of Pearson. # We checked in the value was sifinicant Mydata.cor2 = cor.test(Dolphins_data$UTM_east, Dolphins_data$Depth, method = c("pearson"), use = "pairwise.complete.obs") # The correlation coefficient is significant (p-value < 2.2e-16). # We eliminated UTM_east of the formulation of variables ######### GLMs with binomial negative distribution ########## # Formulations of explanatory variables: # All the variables Mod1 <- glm.nb(Dolphin_clicks ~ Site + Day_night + Sonar_clicks + Month_Cat + Time_day Year_Cat + Speed_tide + Depth + UTM_north + Vaquita_clicks + Sea_temperature, data = Dolphins_data) # Result: glm.fit: algorithm did not converge # Only temporal variables Mod2 <- glm.nb(Dolphin_clicks ~ Day_night + Time_day + Month_Cat + Year_Cat,data = Dolphins_data) summary(Mod2) # Only space variables Mod3 <- glm.nb(Dolphin_clicks ~ Site + UTM_north + UTM_east, data = Dolphins_data) # Result: glm.fit: algorithm did not converge # Only anthropogenic variable Mod4 <- glm.nb(Dolphin_clicks ~ Sonar_clicks, data = Dolphins_data) summary(Mod4) # All biological and oceanographic variables Mod5 <- glm.nb(Dolphin_clicks ~ Speed_tide + Depth + Vaquita_clicks + Sea_temperature, data = Dolphins_data) # Depth+ Site + Day-night + Sonar + Tide + Vaquita+ Temperature Mod6 <- glm.nb(Dolphin_clicks ~ Depth + Site + Day_night + Sonar_clicks + Speed_tide + Vaquita_clicks + Sea_temperature, data = Dolphins_data) # Result: glm.fit: algorithm did not converge # Day-night + Month + Year + Tide + UTM_north + Vaquita Mod7 <- glm.nb(Dolphin_clicks ~ Day_night + Month_Cat + Year_Cat + Speed_tide + UTM_north + Vaquita_clicks, data = Dolphins_data) # Result: glm.fit: algorithm did not converge # Day-night + Month + Year + Tide + Site + Vaquita + Sea temperature Mod8 <- glm.nb(Dolphin_clicks ~ Day_night + Month_Cat + Year_Cat + Speed_tide + Site + Vaquita_clicks + Sea_temperature, data = Dolphins_data) summary(Mod8) # glm.fit: algorithm did not converge # Site + Day-night + Vaquita + Sea temperature Mod9 <- glm.nb(Dolphin_clicks ~ Site + Day_night + Vaquita_clicks + Sea_temperature, data = Dolphins_data) summary(Mod9) # glm.fit: algorithm did not converge # Only site variable Mod10 <- glm.nb(Dolphin_clicks ~ Site, data = Dolphins_data) summary(Mod10) # Sea temperature + Site + Day-night Mod11 <- glm.nb(Dolphin_clicks ~ Sea_temperature + Site + Day_night, data = Dolphins_data) # glm.fit: algorithm did not converge # Site + Day-night Mod12 <- glm.nb(Dolphin_clicks ~ Site + Day_night, data = Dolphins_data) summary(Mod12) # Day-night Mod13 <- glm.nb(Dolphin_clicks ~ Day_night, data = Dolphins_data) summary(Mod13) # Site + Day-night + Tide + Vaquita + Sea temperature Mod14 <- glm.nb(Dolphin_clicks ~ Site + Day_night + Speed_tide + Vaquita_clicks + Sea_temperature, data = Dolphins_data) summary(Mod14) # glm.fit: algorithm did not converge # Site * Day-night + Tide + Vaquita + Sea temperature Mod15 <- glm.nb(Dolphin_clicks ~ Site * Day_night + Speed_tide + Vaquita_clicks + Sea_temperature, data = Dolphins_data) summary(Mod15) # glm.fit: algorithm did not converge # Site * Day-night + Tide Mod16 <- glm.nb(Dolphin_clicks ~ Site * Day_night + Speed_tide, data = Dolphins_data) summary(Mod16) # We tested for goodness-of-fit of the models with a chi-square test based on # the degrees of freedom and residual deviance. Chi_testM16 = 1- pchisq(summary(Mod16)$deviance, summary(Mod16)$df.residual) # Day-night * Tide Mod17 <- glm.nb(Dolphin_clicks ~ Day_night * Speed_tide, data = Dolphins_data) summary(Mod17) # Tide * Site Mod18 <- glm.nb(Dolphin_clicks ~ Speed_tide * Site, data = Dolphins_data) summary(Mod18) # glm.fit: algorithm did not converge # Site * Tide + Day-night Mod19 <- glm.nb(Dolphin_clicks ~ Site * Speed_tide + Day_night, data = Dolphins_data) summary(Mod19) # glm.fit: algorithm did not converge # Site * Day-night + Tide * Depth Mod20 <- glm.nb(Dolphin_clicks ~ Site * Day_night + Speed_tide * Depth, data = Dolphins_data) summary(Mod20) # Day-night + Vaquita + Tide + Site Mod21 <- glm.nb(Dolphin_clicks ~ Day_night + Vaquita_clicks + Speed_tide + Site, data = Dolphins_data) summary(Mod21) # Site + Day-night + Tide Mod22 <- glm.nb(Dolphin_clicks ~ Site + Day_night + Speed_tide, data = Dolphins_data) summary(Mod22) # Site + Speed tide Mod23 <- glm.nb(Dolphin_clicks ~ Site + Speed_tide, data = Dolphins_data) summary(Mod23) # Site + Time of day + Tide Mod24 <- glm.nb(Dolphin_clicks ~ Site + Time_day + Speed_tide, data = Dolphins_data) summary(Mod24) 1- pchisq(summary(Mod24)$deviance, summary(Mod24)$df.residual, lower.tail=FALSE) # Site + Time of day + Depth Mod25 <- glm.nb(Dolphin_clicks ~ Site + Time_day + Depth, data = Dolphins_data) summary(Mod25) 1- pchisq(summary(Mod25)$deviance, summary(Mod25)$df.residual, lower.tail=FALSE) # Site + Time of day + Depth + Day_night Mod26 <- glm.nb(Dolphin_clicks ~ Site + Time_day + Depth + Day_night, data = Dolphins_data) summary(Mod26) ############### PREDICTION OF THE BEST GLM MODEL (Mod16) ####################### # Values prediction of final model. We created a data frame to obtain predicted values. Dolphins_data1 <- read.table(file = "Dolphins_clicks_database_2011_2015.txt", header = TRUE, dec = ".") probDolphins <- predict(Mod16, newdata = Dolphins_data1, type = "response") View(probDolphins) write.csv(data.frame(probDolphins), "PredictionDolphins.csv") # We applied the best GLM model to the data for every sampling season # to get predictions per season (Figure 4).