##############################
#
#   Script for preparing Observation data
#   Need to have duration of behaviour per individual per observation
#   Including individuals that were nobserved but not recorded working
#     12/12/19
#
###############################

# Load packages
if (!require(data.table)) install.packages("data.table")
  library(data.table)
if (!require(ggplot2))  install.packages("ggplot2")
  library(ggplot2)
if (!require(lme4)) install.packages("lme4")
  library(lme4)
if (!require(gridExtra)) install.packages("gridExtra")
  library(gridExtra)
if (!require(dplyr))  install.packages("dplyr")
  library(dplyr)
if (!require(DHARMa)) install.packages("DHARMa")
  library(DHARMa)
if (!require(LMERConvenienceFunctions))  install.packages("LMERConvenienceFunctions")
  library(LMERConvenienceFunctions)
if (!require(AICcmodavg))  install.packages("AICcmodavg")
  library(AICcmodavg)
if (!require(SummarizedExperiment)) install.packages("SummarizedExperiment")
library(SummarizedExperiment)

DIR <- # filepath to project directory
DATADIR <- # filepath to data directory
DurationsObserved <- read.csv(file.path(paste(DATADIR,"BEHAVIOUR_DURATIONS_INPUT.ext", sep="")), sep="\t")
Ind <- read.csv(file.path(paste(DATADIR,"INDIVIDUAL_INFO_INPUT.ext", sep="")), sep="\t")


# Create dataframe and add the duration of observation per session per individual. Each individual was observed 6 times.
# If there are fewer than 6 durations, corresponding to observations when working was not recorded, fill in with 0.
Number_Individuals <- (length(unique(Ind$TagNo))) * 6 
DurationsAll <- data.frame(matrix("", nrow=Number_Individuals, ncol=1)) # create blank dataframe with one column and 6 rows per individual
DurationsAll$Obs_Per_Ind <- c(1:6) # Repeat 1-6, observation per individual
DurationsAll$TagNo <- rep(unique(Ind$TagNo), each=6) # Add individual (tag number) to 6 rows
DurationsAll <- merge(Ind[,c("TagNo","Colony")], DurationsAll) # Bring Colony from Ind and merge by TagNo
DurationsAll$matrix.....nrow...Number_Individuals..ncol...1. <- NULL # remove variable

# to number the individual's observations (first time observed, second time observed etc)
DurationsObserved$Obs_Per_Ind <- rowid(DurationsObserved$TagNo)

# merge the data frames leaving each individual's observations split by observation session, filling in NAs with zero
DurationsAll <- (merge(DurationsAll, DurationsObserved, all.x=T))
DurationsAll$BehaviourDuration_seconds <- as.numeric(DurationsAll$BehaviourDuration_seconds)
row.index <- (which(is.na(DurationsAll$BehaviourDuration_seconds)))
DurationsAll[row.index,"BehaviourDuration_seconds"] <- 0

# merge individual characteristics
DurationsAll <- merge(DurationsAll, Ind[, c("TagNo", "AgeMonths","GroupSize","MeanWeight", "Rank", "ScaledRank", "Sex",
                                            "Status", "TotalDurationCooperation", "ScaledRandom_Rank")], by = "TagNo")
DurationsAll$Sex <- as.factor(DurationsAll$Sex)

# remove question mark symbols from input files
DurationsAll$Sex <- gsub("?", "", DurationsAll$Sex, fixed=TRUE)
DurationsAll$Status <- gsub("B?", "B", DurationsAll$Status, fixed=TRUE) # B? = red vagina. May indicate reproductive activation but not evidence of full reproductive status
DurationsAll$Status <- as.factor(DurationsAll$Status)

# binary cooperation for models
DurationsAll$Coop_Binary <- factor(ifelse(DurationsAll$BehaviourDuration_seconds >0, 1,0))

# create subsets for breeders and non-breeders
NonBreeders <- subset(DurationsAll, DurationsAll$Status=="NB" & !is.na(DurationsAll$AgeMonths) &
                        !is.na(DurationsAll$TagNo) & !is.na(DurationsAll$ScaledRank))
Breeders <- subset(DurationsAll, DurationsAll$Status=="B")

# Create logged variables
NonBreeders$Log_Coop <- log(NonBreeders$BehaviourDuration_seconds)
NonBreeders$Log_Age <- log(NonBreeders$AgeMonths)
NonBreeders$Log_Weight <- log(NonBreeders$MeanWeight)
NonBreeders$Log_Rank  <- log(NonBreeders$ScaledRank)
# Create subset of observations of non-breeders that had non-zero work durations (working behaviour was observed)
NBNZ <- NonBreeders[NonBreeders$BehaviourDuration_seconds != 0, ] # Non-Breeder Non-Zero
NBNZ$Sex <- as.factor(NBNZ$Sex)


# Create regressions and look at distributions of residuals

# Starting with logistic regression: do variables predict whether observation session is non-zero?

# Combine functions of the DHARMa package that assess residuals from Logistic regressions

LogRegResiduals <- function(model) {
  par(mfrow=c(2,2))
  fittedModel <- model
  # plot(resid(fittedModel, type="deviance"))
  simulationOutput <- simulateResiduals(fittedModel = fittedModel, refit=F) 
  plot(simulationOutput, quantreg = T)
  testDispersion(simulationOutput, alternative="two.sided", plot=T)
  testZeroInflation(simulationOutput)
  testOutliers(simulationOutput, alternative = "two.sided", plot = T)
}

# Define models
LogReg_Null <- glmer(Coop_Binary ~ (1|Colony/TagNo), data = NonBreeders, family = "binomial")
LogReg_Sex <- glmer(Coop_Binary ~ Sex + (1|Colony/TagNo), data = NonBreeders, family = "binomial")
LogReg_Age <- glmer(Coop_Binary ~ AgeMonths + (1|Colony/TagNo), data = NonBreeders, family = "binomial")
LogReg_Weight <- glmer(Coop_Binary ~ MeanWeight + (1|Colony/TagNo), data = NonBreeders, family = "binomial")
LogReg_Rank <- glmer(Coop_Binary ~ ScaledRank + (1|Colony/TagNo), data = NonBreeders, family = "binomial")
LogReg_Random_Rank <- glmer(Coop_Binary ~ ScaledRandom_Rank + (1|Colony/TagNo), data = NonBreeders, family = "binomial")
LogReg_Sq_Weight <- glmer(Coop_Binary ~ poly(MeanWeight,2) + (1|Colony/TagNo), data = NonBreeders, family = "binomial")
LogReg_Sq_Age <- glmer(Coop_Binary ~ poly(AgeMonths,2) + (1|Colony/TagNo), data = NonBreeders, family = "binomial")
LogReg_Cu_Age <- glmer(Coop_Binary ~ poly(AgeMonths,3) + (1|Colony/TagNo), data = NonBreeders, family = "binomial")

LogRegList <- c(LogReg_Null,LogReg_Sex,LogReg_Age,LogReg_Sq_Age,LogReg_Cu_Age, LogReg_Weight,LogReg_Sq_Weight,LogReg_Rank, LogReg_Random_Rank)

# run and inspect residual checks on Logistic regressions
for (i in LogRegList){
  LogRegResiduals(i)
  }

# compare AIC values
aictab(LogRegList, sort=T,
       modnames=c("LogReg_Null","LogReg_Sex","LogReg_Age","LogReg_Sq_Age", "LogReg_Cu_Age","LogReg_Weight","LogReg_Sq_Weight","LogReg_Rank", "LogReg_Random_Rank"))

# To print full model summaries or co-efficients
for (i in LogRegList){
  print(i)
  print(coef(summary(i)))
}

### Second part of the 2-part models for semi-continuous data
# running linear mixed-effects models on the non-zero data
# define the models
glmnull <- lmer(BehaviourDuration_seconds ~ (1|Colony/TagNo),
                data=NBNZ, REML=F)
glmsex <- lmer(BehaviourDuration_seconds ~ Sex +
                 (1|Colony/TagNo), data=NBNZ, REML=F)
glmage <- lmer(BehaviourDuration_seconds ~ AgeMonths +
                 (1|Colony/TagNo), data=NBNZ, REML=F)
glmage_sq <- lmer(BehaviourDuration_seconds ~ poly(AgeMonths,2) +
                    (1|Colony/TagNo), data=NBNZ, REML=F)
glmweight <- lmer(BehaviourDuration_seconds ~ MeanWeight +
                    (1|Colony/TagNo), data=NBNZ, REML=F)

glmrank <- lmer(BehaviourDuration_seconds ~ ScaledRank +
                  (1|Colony/TagNo), data=NBNZ, REML=F)

GLMlist <- c(glmnull, glmsex, glmage, glmage_sq, glmweight, glmrank)

# for each model, display residual information and model summaries
for (i in GLMlist) {
  qqplot(resid(i), fitted(i))
  mcp.fnc(i)
  mean(resid(i))
  hist(resid(i))
  print(summary(i))
}

# compare AIC values
aictab(GLMlist, sort=T, c.hat=T,
       modnames=c("glmnull","glmsex","glmage", "glmage_sq","glmweight","glmrank"))

### Colony-level data
# Create new dataframe and add information - ColonyData
# Number of observations per colony
ObservationsPerColony <- as.data.frame(table(NonBreeders$Colony))
colnames(ObservationsPerColony) <- c("Colony", "NumberOfObservations")

# Total durations per colony
DurationPerColony <- as.data.frame(tapply(NonBreeders$BehaviourDuration_seconds, NonBreeders$Colony, sum))
setDT(DurationPerColony, keep.rownames = TRUE)[]
colnames(DurationPerColony) <- c("Colony", "ColonyDuration_seconds")

# Define ColonyData from file with basic info
ColonyData <- read.csv(file.path(DATADIR,paste("COLONY_INPUT.EXT")), sep=",")

# Merge and add variables for how many sessions recorded or did not record working behaviour
ColonyData <- merge(ColonyData, ObservationsPerColony, all.x=T, all.y=T)
ColonyData <- merge(ColonyData, DurationPerColony, all.x=T, all.y=T)
BehaviourPresentPerColony <- t(table(NonBreeders$Coop_Binary, NonBreeders$Colony))
BehaviourPresentPerColony <- setDT(as.data.table(BehaviourPresentPerColony), keep.rownames = TRUE)[]
BehaviourPresentPerColony <- cbind(BehaviourPresentPerColony, unstack(BehaviourPresentPerColony, form= N ~ V2))
BehaviourPresentPerColony <- BehaviourPresentPerColony[(1:8), c(1,3,5)]
colnames(BehaviourPresentPerColony) <- c("Colony", "Behaviour_Absent", "Behaviour_Present")
ColonyData <- merge(ColonyData, BehaviourPresentPerColony)

# Add more data to the dataframe
ColonyData$MeanCoopPerObservation <- with(ColonyData, ColonyData$ColonyDuration_seconds/ColonyData$Behaviour_Present)
ColonyData$Percent_Present <- 100*ColonyData$Behaviour_Present/(ColonyData$Behaviour_Present+ColonyData$Behaviour_Absent)
ColonyData$MeanColonyWeight <- tapply(NonBreeders$MeanWeight, NonBreeders$Colony, mean)
ColonyData$MeanAgePerColony <- tapply(NonBreeders$AgeMonths, NonBreeders$Colony, mean)
ColonyData$LengthPerIndividual <- with(ColonyData, ColonyData$Length/ColonyData$NumberofIndividualsAtFirstObs)
ColonyData$AreaPerIndividual <- with(ColonyData, ColonyData$Area/ColonyData$NumberofIndividualsAtFirstObs)
ColonyData$NumberObserved <- table(NonBreeders$Colony)/6
Females <- subset(NonBreeders, NonBreeders$Sex=="F")
ColonyData$NumberFemales <- table(Females$Colony)/6
ColonyData$ProportionFemale <- (ColonyData$NumberFemales / ColonyData$NumberObserved)
ColonyData$Weight_SD <- tapply(NonBreeders$MeanWeight, NonBreeders$Colony, sd)
ColonyData$Age_SD <- tapply(NonBreeders$AgeMonths, NonBreeders$Colony, sd)
ColonyData$Age_Range <- tapply(NonBreeders$AgeMonths, NonBreeders$Colony, range)
ColonyData$Weight_Range <- tapply(NonBreeders$MeanWeight, NonBreeders$Colony, range)
ColonyData$Behaviour_mean <- tapply(NonBreeders$BehaviourDuration_seconds, NonBreeders$Colony, mean)
ColonyData$Behaviour_sd <- tapply(NonBreeders$BehaviourDuration_seconds, NonBreeders$Colony, sd)
ColonyData <- ColonyData %>% 
  mutate_if(is.numeric, round, digits = 1)

### General summary stats
mean(NonBreeders$AgeMonths)
range(NonBreeders$AgeMonths)
mean(NonBreeders$MeanWeight)
range(NonBreeders$MeanWeight)
mean(NonBreeders$BehaviourDuration_seconds)
sd(NonBreeders$BehaviourDuration_seconds)

# average (sd) and percentage time spent working
tapply(NonBreeders$BehaviourDuration_seconds, NonBreeders$Sex, mean)
tapply(NonBreeders$BehaviourDuration_seconds, NonBreeders$Sex, sd)
(100*tapply(NonBreeders$BehaviourDuration_seconds, NonBreeders$Sex, mean))/600
mean(NonBreeders$BehaviourDuration_seconds)
sd(NonBreeders$BehaviourDuration_seconds)
(100*(mean(NonBreeders$BehaviourDuration_seconds)))/600


### Creating figures

# define theme for graphs of logistic regressions
Logistic_Regression_theme <- theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
  plot.title = element_text(size=18, hjust=0.5), 
  text = element_text(size=16, family="serif"),
  axis.line = element_line(colour = "black"),
  panel.border = element_rect(colour = "black", fill=NA, size=2))
  # panel.border=element_blank())

Logistic_Regression_Line <- stat_smooth(method = 'glm', method.args = list(family = "binomial"), formula=y~x, 
              alpha=0.2, size=1, fill="black", colour="black")

Points <- geom_point(position=position_jitter(height=0.03, width=0))
Counts <- geom_count(col="black",show.legend = F)

# Plot observed working by age
LogisticByAge_GlmLine <- ggplot(NonBreeders, aes(AgeMonths, as.numeric(Coop_Binary)-1)) +
  Logistic_Regression_theme +
  Logistic_Regression_Line +
  Counts +
  scale_y_continuous(breaks=c(0,0.5,1)) +
  labs(title="A. Age",
       x="Age (Months)",
       y="Observed Working (1 = Yes)")

LogisticByWeight <- ggplot(NonBreeders, aes(MeanWeight, as.numeric(Coop_Binary)-1)) +
  Logistic_Regression_theme +
  Logistic_Regression_Line +
  Counts +
  scale_y_continuous(breaks=c(0,0.5,1)) +
  labs(title="B. Weight",
       x="Weight (Grams)",
       y="Observed Working (1 = Yes)")

LogisticByRank <- ggplot(NonBreeders, aes(ScaledRank, as.numeric(Coop_Binary)-1)) +
  Logistic_Regression_theme +
  Logistic_Regression_Line +
  Counts +
  scale_y_continuous(breaks=c(0,0.5,1)) +
  labs(title="C. Rank",
       x="Rank (0-1, Scaled)",
       y="Observed Working (1 = Yes)")

# define plots to save and save to PDF file
LogPlots <- list(LogisticByAge_GlmLine, LogisticByWeight, LogisticByRank)
pdf(paste0(file.path(DIR,"Figures", sep=""), "Figure1_WorkPresent.pdf",sep=""), onefile = TRUE)
LogPlots
dev.off()

# Define theme for graphs of linear regressions 
LinearModel_theme <- theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        plot.title = element_text(size=18, hjust=0.5), 
        text = element_text(size=16, family="serif"),
        axis.line = element_line(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA, size=2))

LinearModel_Line <- stat_smooth(method = 'lm', formula=y~x, 
                                        alpha=0.2, size=1, fill="black", colour="black")

CoopByAge <- ggplot(NBNZ, aes(x = NBNZ$AgeMonths, y = NBNZ$BehaviourDuration_seconds)) +
  LinearModel_theme +
  LinearModel_Line +
  Points + 
  labs(title="A. Age",
       x="Age (Months)",
       y="Duration Working (Seconds)")

CoopByWeight <- ggplot(NBNZ, aes(x = NBNZ$MeanWeight, y = NBNZ$BehaviourDuration_seconds)) +
  LinearModel_theme +
  LinearModel_Line +
  Points + 
  labs(title="B. Weight",
       x="Weight (Grams)",
       y="Duration Working (Seconds)")

CoopByRank <- ggplot(NBNZ, aes(x = NBNZ$ScaledRank, y = NBNZ$BehaviourDuration_seconds)) + 
  LinearModel_theme +
  LinearModel_Line +
  Points + 
  labs(title="C. Rank",
       x="Rank (0-1, Scaled)",
       y="Duration Working (Seconds)")

# define plots to save and save to PDF file
GLMPlots <- list(CoopByAge, CoopByWeight, CoopByRank)
pdf(paste0(file.path(DIR,"Figures", sep=""), "Figure2_WorkDuration.pdf",sep=""),onefile = TRUE)
GLMPlots
dev.off()

# Colony Plots

# Plot of work duration per observation by colony
Colony_Coop <- ggplot(NonBreeders, aes(x=Colony, y=BehaviourDuration_seconds)) +
  geom_boxplot() +
  theme_bw() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.position = "none",
        plot.title = element_text(hjust = 0.5),
        text = element_text(size=16, family="serif"),
        axis.line = element_line(colour = "black"),
        panel.border = element_rect(colour = "black", fill=NA, size=2)) +
        labs(title="\n\n\n\n\n",
        x="Colony",
        y="Duration of Working Behaviour (Seconds)")

pdf(paste0(file.path(DIR,"Figures", sep=""), "Figure3_MeanWorkByColony.pdf",sep=""), onefile = TRUE)
Colony_Coop
dev.off()

# Plots of observed working (binary) by age, weight and rank for each colony
Colony_LogAge <- LogisticByAge_GlmLine +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
  facet_grid(cols=vars(NonBreeders$Colony))

Colony_LogWeight <- LogisticByWeight +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
  facet_grid(cols=vars(NonBreeders$Colony))

Colony_LogRank <- LogisticByRank + 
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
  scale_x_continuous(breaks = c(0, 0.25, 0.50, 0.75, 1),
  label = c("", ".25", ".50", ".75", "")) +
  facet_grid(cols=vars(NonBreeders$Colony))

# Define list and save outputs to PDF
LogWorkByColony <- list(Colony_LogAge, Colony_LogWeight, Colony_LogRank)
pdf(paste0(file.path(DIR,"Figures", sep=""), "Figure4_WorkPresenceByColony.pdf",sep=""),
    width = 25, height = 4, paper= "a4r",  onefile = TRUE)
LogWorkByColony
dev.off()

# Plots of duration of working by age, weight and rank for each colony
Colony_LinAge <- CoopByAge +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
  facet_grid(cols=vars(NBNZ$Colony))

Colony_LinWeight <- CoopByWeight +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
  facet_grid(cols=vars(NBNZ$Colony))

Colony_LinRank <- CoopByRank +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1)) +
  scale_x_continuous(breaks = c(0, 0.25, 0.50, 0.75, 1),
                     label = c("", ".25", ".50", ".75", "")) +
  facet_grid(cols=vars(NBNZ$Colony))

# Define list and save outputs to PDF
LinWorkByColony <- list(Colony_LinAge, Colony_LinWeight, Colony_LinRank)

pdf(paste0(file.path(DIR,"Figures", sep=""), "Figure5_WorkDurationByColony.pdf",sep=""),
    width = 25, height = 4, paper= "a4r",  onefile = TRUE)
LinWorkByColony
dev.off()