# Code for some analysis in the paper: 

# Downloading the database.  
setwd("~/")
setwd("D://Gustavo/CICESE - Doctorado/Anįlisis de datos/Analisis datos con GLMs en R")
Dolphins_data <- read.table(file = "Dolphins_clicks_database_2011_2015_Final.txt", header = TRUE, dec = ".")

# Variables as factors
Dolphins_data$Time_day      <- as.factor(Dolphins_data$Time_day)
Dolphins_data$Day_night      <- as.factor(Dolphins_data$Day_night)
Dolphins_data$Site      <- as.factor(Dolphins_data$Site)
Dolphins_data$Month_Cat      <- as.factor(Dolphins_data$Month_Cat)                              
Dolphins_data$Year_Cat      <- as.factor(Dolphins_data$Year_Cat) # Year of sampling season

str(Dolphins_data) # Checking the kind of variables (factors, integers or numerical)
names(Dolphins_data) # Checking the names of variables. 


# Downloading of the packages.  
library(MASS) # GLMs binomial negative distribution
library(ggplot2) # Plots
library(car)  # Correlations of predictor variables. 
library(corrplot) # We used to created a "Correlogram"
library(FSA) # Dunn“s test
library(cowplot) # Two graphs in one page


# Diel patterns analysis

######### Checking for statistical differences in counts of dolphin clicks 
# related to daytime $ nighttime periods #########

# Mean and SD of nigthtime and daytime condition
mean(Dolphins_data$Dolphin_clicks,na.rm = TRUE)
sd(Dolphins_data$Dolphin_clicks,na.rm = TRUE)

mean(Dolphins_data$Dolphin_clicks[Dolphins_data$Day_night=="Day"],na.rm = TRUE)
sd(Dolphins_data$Dolphin_clicks[Dolphins_data$Day_night=="Day"],na.rm = TRUE)

mean(Dolphins_data$Dolphin_clicks[Dolphins_data$Day_night=="Night"],na.rm = TRUE)
sd(Dolphins_data$Dolphin_clicks[Dolphins_data$Day_night=="Night"],na.rm = TRUE)

# Kruskal-Wallis test. Evaluation of differences in mean by light condition of day
kruskal.test(Dolphins_data$Dolphin_clicks~as.factor(Dolphins_data$Day_night))

# Kruskal-Wallis test. Evaluation of differences in mean by time of day 
kruskal.test(Dolphins_data$Dolphin_clicks~as.factor(Dolphins_data$Time_day))

# Dunn“s test
Timeday <- dunnTest(Dolphins_data$Dolphin_clicks ~ as.factor(Dolphins_data$Time_day), method="bonferroni")

print(Timeday,dunn.test.results=TRUE) # summary of results


# Violin graph. Time of the day vs dolphin clicks. Part 1. Y axis = 0 to 20. 
g1 <- ggplot(Dolphins_data, aes(x = factor(Dolphins_data$Time_day), y = (Dolphins_data$Dolphin_clicks))) +
  geom_violin(draw_quantiles = c(.25, .5, .75)) + coord_cartesian(ylim = c(0,20)) +
  stat_summary(fun.y = "mean", geom = "point", shape = 15, size = 4, color = "midnightblue") +
  stat_summary(fun.y = "median", geom = "point", shape = 2, size = 3, color = "red") +
  labs(title="", x="Time of day", y="Dolphin clicks") + geom_point() +
  theme(
    axis.title.x = element_text(color="black", size=12, face="bold"),
    axis.title.y = element_text(color="black", size=12, face="bold")
  ) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                   panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) +
 theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))

# Graph. Part 2. Y axis = 20 to 7,700. I split the graph into two because of the excess of zeroes. 
# More easy to see the distribution of the response variable 
g2 <- ggplot(Dolphins_data, aes(x = factor(Dolphins_data$Time_day), y = (Dolphins_data$Dolphin_clicks))) +
  geom_line() + coord_cartesian(ylim = c(20, 8000)) +
  scale_y_continuous(breaks = c(20,2000,4000,6000,8000), limits = c(20,8000)) +
  labs(title="", x="", y="Dolphin clicks") + geom_point() +
  theme(
    axis.title.x = element_text(color="black", size=12, face="bold"),
    axis.title.y = element_text(color="black", size=12, face="bold")
  ) + theme_bw() + theme(panel.border = element_blank(), panel.grid.major = element_blank(),
                           panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) +
  theme(plot.margin = unit(c(0, 0, 0, 0), "cm"))

# Two graphs in one page
png("Figure2.png", units="in", width=5, height=5, res=600)
plot_grid(g2, g1, ncol=1, align="v")
dev.off()

############ Spatial patterns of the observed acoustic occurrence of dolphins ##########

# Kruskal-Wallis test. Evaluation of differences in mean by sampling season (Year_Cat)

kruskal.test(Dolphins_data$Dolphin_clicks~as.factor(Dolphins_data$Year_Cat))

SamplingSeason <- dunnTest(Dolphins_data$Dolphin_clicks ~ as.factor(Dolphins_data$Year_Cat), method="bonferroni")

print(SamplingSeason,dunn.test.results=TRUE) # 


########## Data exploration ##########

# Basic exploration in the response variable (counts of dolphin clicks per hour) 
plot(table(Dolphins_data$Dolphin_clicks)) 
summary(Dolphins_data$Dolphin_clicks) # NA“s 34,218. No NA“s = 293,137
count(Dolphins_data, vars=Dolphin_clicks,wt_var = "0")
# Results with 0 data (290,328). 99% of response variable dara are zeros. 


# Checking for outliers in respose variable

boxplot(Dolphins_data$Dolphin_clicks, 
        main = "Dolphin clicks outliers",
        boxwex = 0.5,col="blue")


# Examples of box plots with predictor and respose variable
boxplot(Dolphin_clicks ~ Year_Cat, data = Dolphins_data,
        xlab = "Sampling season", ylab = "Dolphin clicks")


# Pearson correlation coefficiente to evalute association between two or more variables.

Mypredvars <- data.frame(Dolphins_data$Vaquita_clicks, Dolphins_data$UTM_north, Dolphins_data$UTM_east,
                         Dolphins_data$Sonar_clicks, Dolphins_data$Sea_temperature, Dolphins_data$Depth)

Mydata.cor = cor(Mypredvars, method = c("pearson"), use = "pairwise.complete.obs")

corrplot(Mydata.cor) # Graph matrix of correlations

# The Depth and UTM_east variables had a value of 0.67 coefficient of Pearson. 
# We checked in the value was sifinicant

Mydata.cor2 = cor.test(Dolphins_data$UTM_east, Dolphins_data$Depth, method = c("pearson"), use = "pairwise.complete.obs")
# The correlation coefficient is significant (p-value < 2.2e-16). 
# We eliminated UTM_east of the formulation of variables


#########  GLMs with binomial negative distribution ##########  

# Formulations of explanatory variables:

# All the variables 
Mod1 <- glm.nb(Dolphin_clicks ~ Site + Day_night + Sonar_clicks + Month_Cat + Time_day  
                 Year_Cat + Speed_tide + Depth + UTM_north + Vaquita_clicks + Sea_temperature, 
               data = Dolphins_data)
# Result: glm.fit: algorithm did not converge 

# Only temporal variables
Mod2 <- glm.nb(Dolphin_clicks ~  Day_night + Time_day + Month_Cat + Year_Cat,data = Dolphins_data)
summary(Mod2)

# Only space variables
Mod3 <- glm.nb(Dolphin_clicks ~ Site + UTM_north + UTM_east, data = Dolphins_data)
# Result: glm.fit: algorithm did not converge 

# Only anthropogenic variable
Mod4 <- glm.nb(Dolphin_clicks ~  Sonar_clicks, data = Dolphins_data)
summary(Mod4)

# All biological and oceanographic variables
Mod5 <- glm.nb(Dolphin_clicks ~ Speed_tide + Depth + Vaquita_clicks + Sea_temperature, 
               data = Dolphins_data)

# Depth+ Site + Day-night + Sonar + Tide + Vaquita+ Temperature
Mod6 <- glm.nb(Dolphin_clicks ~ Depth + Site + Day_night + Sonar_clicks + Speed_tide + Vaquita_clicks + Sea_temperature, 
               data = Dolphins_data)
# Result: glm.fit: algorithm did not converge 

# Day-night  + Month + Year + Tide + UTM_north + Vaquita
Mod7 <- glm.nb(Dolphin_clicks ~ Day_night + Month_Cat + Year_Cat + Speed_tide + UTM_north + Vaquita_clicks, 
               data = Dolphins_data)
# Result: glm.fit: algorithm did not converge 

# Day-night + Month + Year + Tide + Site + Vaquita + Sea temperature
Mod8 <- glm.nb(Dolphin_clicks ~ Day_night +  Month_Cat +  Year_Cat + Speed_tide + Site + Vaquita_clicks + Sea_temperature, 
               data = Dolphins_data)
summary(Mod8) # glm.fit: algorithm did not converge

# Site + Day-night + Vaquita + Sea temperature
Mod9 <- glm.nb(Dolphin_clicks ~ Site + Day_night +  Vaquita_clicks + Sea_temperature, 
               data = Dolphins_data)
summary(Mod9) # glm.fit: algorithm did not converge

# Only site variable
Mod10 <- glm.nb(Dolphin_clicks ~ Site, data = Dolphins_data)
summary(Mod10)

# Sea temperature + Site + Day-night
Mod11 <- glm.nb(Dolphin_clicks ~ Sea_temperature + Site + Day_night,
                data = Dolphins_data)
# glm.fit: algorithm did not converge

# Site + Day-night
Mod12 <- glm.nb(Dolphin_clicks ~ Site + Day_night, data = Dolphins_data)
summary(Mod12)

# Day-night
Mod13 <- glm.nb(Dolphin_clicks ~ Day_night, data = Dolphins_data)
summary(Mod13)

# Site + Day-night + Tide + Vaquita + Sea temperature
Mod14 <- glm.nb(Dolphin_clicks ~ Site + Day_night + Speed_tide + Vaquita_clicks + Sea_temperature, data = Dolphins_data)
summary(Mod14) # glm.fit: algorithm did not converge

# Site * Day-night + Tide + Vaquita + Sea temperature
Mod15 <- glm.nb(Dolphin_clicks ~ Site * Day_night + Speed_tide + Vaquita_clicks + Sea_temperature, data = Dolphins_data)
summary(Mod15) # glm.fit: algorithm did not converge

# Site * Day-night + Tide   
Mod16 <- glm.nb(Dolphin_clicks ~ Site * Day_night + Speed_tide, data = Dolphins_data)
summary(Mod16)
# We tested for goodness-of-fit of the models with a chi-square test based on 
# the degrees of freedom and residual deviance.

Chi_testM16 = 1- pchisq(summary(Mod16)$deviance,
                summary(Mod16)$df.residual)

# Day-night * Tide
Mod17 <- glm.nb(Dolphin_clicks ~ Day_night * Speed_tide, data = Dolphins_data)
summary(Mod17)

# Tide * Site
Mod18 <- glm.nb(Dolphin_clicks ~ Speed_tide * Site, data = Dolphins_data)
summary(Mod18) # glm.fit: algorithm did not converge

# Site * Tide + Day-night
Mod19 <- glm.nb(Dolphin_clicks ~ Site *  Speed_tide + Day_night, data = Dolphins_data)
summary(Mod19) # glm.fit: algorithm did not converge

# Site * Day-night + Tide * Depth
Mod20 <- glm.nb(Dolphin_clicks ~ Site * Day_night + Speed_tide * Depth, data = Dolphins_data)
summary(Mod20)

# Day-night + Vaquita + Tide + Site
Mod21 <- glm.nb(Dolphin_clicks ~ Day_night + Vaquita_clicks + Speed_tide + Site, data = Dolphins_data)
summary(Mod21)

# Site + Day-night + Tide   
Mod22 <- glm.nb(Dolphin_clicks ~ Site + Day_night + Speed_tide, data = Dolphins_data)
summary(Mod22)

# Site + Speed tide
Mod23 <- glm.nb(Dolphin_clicks ~ Site + Speed_tide, data = Dolphins_data)
summary(Mod23)

# Site + Time of day + Tide   
Mod24 <- glm.nb(Dolphin_clicks ~ Site + Time_day + Speed_tide, data = Dolphins_data)
summary(Mod24)
1- pchisq(summary(Mod24)$deviance,
          summary(Mod24)$df.residual, lower.tail=FALSE)

# Site + Time of day + Depth   
Mod25 <- glm.nb(Dolphin_clicks ~ Site + Time_day + Depth, data = Dolphins_data)
summary(Mod25)
1- pchisq(summary(Mod25)$deviance,
          summary(Mod25)$df.residual, lower.tail=FALSE)

# Site + Time of day + Depth + Day_night  
Mod26 <- glm.nb(Dolphin_clicks ~ Site + Time_day + Depth + Day_night, data = Dolphins_data)
summary(Mod26)


############### PREDICTION OF THE BEST GLM MODEL (Mod16) #######################

# Values prediction of final model. We created a data frame to obtain predicted values.  
Dolphins_data1 <- read.table(file = "Dolphins_clicks_database_2011_2015.txt", header = TRUE, dec = ".")

probDolphins <- predict(Mod16, newdata = Dolphins_data1, type = "response")

View(probDolphins)

write.csv(data.frame(probDolphins), "PredictionDolphins.csv") 

# We applied the best GLM model to the data for every sampling season 
# to get predictions per season (Figure 4).