#Content: Data analysis for Spider study
#Author: Guillaume Dezecache (guillaume.dezecache@gmail.com)
#Last update: Fri Oct 23 19:48:11 2020
#Checks: [OK] next to the corresponding code bit

###USE THE CHECK_OUTLIERS FUNCTION!!


#Set working directory [OK]
setwd('/Users/dtaylor/Desktop/pan_spider/')

#Libraries + script needed (clean it for some are not used)
library(tseries)
library(e1071) 
library(ez)
library(car)
library(readxl)
library(effects)
library(lme4)
library(MASS)
library(DescTools)
library(irr)
library(lsr)
library(emmeans)
library(dplyr)
library(lattice)
library(ggplot2)
source("RogerMundrysdiagnostic_fcns.R")
library(performance)

#Load dataset
data = read_excel("data.xlsx", sheet = "data")

#Load dataset with wide structure
datawide = read_excel("data.xlsx", sheet = "datawide")

#Check structure and transform some variables as factors
str(datawide)
datawide$ID=as.factor(datawide$ID)
datawide$GROUP=as.factor(datawide$GROUP)
datawide$P.CALL_1=as.factor(datawide$P.CALL_1)
datawide$P.CALL_2=as.factor(datawide$P.CALL_2)

#Data structure
##Check balance wrt predictors
table(datawide$ID, datawide$GROUP)
##Inspection of predictors
hist(datawide$SPIDER.MOVE_2)
skewness(datawide$SPIDER.MOVE_2) ##not so bad
datawide$SPIDER.MOVE_2z=as.vector(scale(datawide$SPIDER.MOVE_2))
hist(datawide$DELAY) ##seriously right skewed
skewness(datawide$DELAY) 
hist(log(datawide$DELAY)) ##try log: looks better
skewness(log(datawide$DELAY)) ##try log: looks better indeed
datawide$DELAYlog=log(datawide$DELAY) ##apply log transformation
datawide$DELAYlogz=as.vector(scale(datawide$DELAYlog))

#Hypothesis 1: More likely to call in Social group
##Call model specification
Hyp1FullModel=glm(P.CALL_2~GROUP+SPIDER.MOVE_2z+DELAYlogz+P.CALL_1,data=datawide, family=binomial)
##Model diagnostics using package 'performance'
check_model(Hyp1FullModel)
model_performance(Hyp1FullModel) 
##Model diagnostics using Roger Mundry's tools & guidelines
vif(Hyp1FullModel) ##VIF, relatively OK
max(influence(Hyp1FullModel)$hat)-lev.thresh(Hyp1FullModel) ##max influence does not go beyond the leverage's threshold
beta=cbind(coefficients(Hyp1FullModel), coefficients(Hyp1FullModel)+t(apply(X=dfbeta(Hyp1FullModel), MARGIN=2, FUN=range))) ##actual in column 1, min in column 2 & max in column 3
m.stab.plot(beta) ##not very good
##Inference
r2(Hyp1FullModel)
Hyp1NullModel=glm(P.CALL_2~1,data=datawide, family=binomial)
anova(Hyp1FullModel,Hyp1NullModel, test='Chisq')


#Hypothesis 2: More gazing in Social Group
##Calculation of dependent variable
datawide$proportionoflookingtrial1=datawide$D.LOOK.30.R_1/datawide$CODINGTIMEMAX30_1 ##for a linear model
datawide$proportionoflookingtrial2=datawide$D.LOOK.30.R_2/datawide$CODINGTIMEMAX30_2 ##for a linear model
datawide$look=datawide$D.LOOK.30.R_2 ##for a binomial model
datawide$nolook=datawide$CODINGTIMEMAX30_2-datawide$D.LOOK.30.R_2 ##for a binomial model
datawide$lookvsnolook=round(cbind(datawide$look,datawide$nolook),0) ##for a binomial model
##Transformation of predictors?
hist(datawide$D.LOOK.PROP_1) ##quite left skewed...
skewness(datawide$D.LOOK.PROP_1) ##indeed...
hist(datawide$D.LOOK.PROP_1^2) ##try ^2 transformation
hist(log(datawide$D.LOOK.PROP_1)) ##try log; it makes things worse...
hist(datawide$D.LOOK.PROP_1^(1/3)) ##try cuberoot; makes thing worse...
datawide$D.LOOK.PROP_1_t=datawide$D.LOOK.PROP_1^2 ##apply ^2
skewness(datawide$D.LOOK.PROP_1_t) ##less skewing...
datawide$D.LOOK.PROP_1_tz=as.vector(scale(datawide$D.LOOK.PROP_1_t)) ##zscore it

##Model specification with binomial family
Hyp2FullModel=glm(lookvsnolook~GROUP+SPIDER.MOVE_2+DELAYlogz+D.LOOK.PROP_1_tz,data=datawide, family=binomial)
###Model diagnostics using package 'performance'
check_model(Hyp2FullModel)
model_performance(Hyp2FullModel) 
###Model diagnostics using Roger Mundry's tools & guidelines
vif(Hyp2FullModel) ##VIF, relatively OK
max(influence(Hyp2FullModel)$hat)-lev.thresh(Hyp2FullModel) ##max influence does not go beyond the leverage's threshold
beta=cbind(coefficients(Hyp2FullModel), coefficients(Hyp2FullModel)+t(apply(X=dfbeta(Hyp2FullModel), MARGIN=2, FUN=range))) ##actual in column 1, min in column 2 & max in column 3
m.stab.plot(beta) ##looks OK?
###Inference
Hyp2NullModel=glm(lookvsnolook~1,data=datawide, family=binomial)
r2(Hyp2FullModel)
anova(Hyp2NullModel, Hyp2FullModel, test="Chisq")
summary(Hyp2FullModel)$coefficients
Hyp2table=cbind(coefficients(Hyp2FullModel), confint(object=Hyp2FullModel))
plot(allEffects(Hyp2FullModel))

##Model specification with gaussian family
Hyp2FullModellinear=lm(D.LOOK.PROP_2~GROUP+SPIDER.MOVE_2+DELAYlogz+D.LOOK.PROP_1_tz,data=datawide)
###Model diagnostics using package 'performance'
check_model(Hyp2FullModellinear)
###Model diagnostics following R. Mundry's
diagnostics.plot(Hyp2FullModellinear) ###all look relatively ok besides residuals against fitted values, where there seems to be sort of a pattern maybe?
cor.test(fitted(Hyp2FullModellinear), abs(residuals(Hyp2FullModellinear)))
dffits(Hyp2FullModellinear)
max(abs(dffits(Hyp2FullModellinear))) ###Two problematic cases here, 5th and 9th
dfbeta(Hyp2FullModellinear)
round(cbind(coefficients(Hyp2FullModellinear), coefficients(Hyp2FullModellinear)+
              t(apply(X=dfbeta(Hyp2FullModellinear), MARGIN=2, FUN=range))), 3) ##does not look so problematic?
dfbetas(Hyp2FullModellinear) ##5th and 9th values are problematic
max(influence(Hyp2FullModellinear)$hat)-lev.thresh(Hyp2FullModellinear) ##looks OK
plot(as.vector(influence(Hyp2FullModellinear)$hat))
max(cooks.distance(Hyp2FullModellinear))
plot(cooks.distance(Hyp2FullModellinear)) ##5th and 9th look quite problematic
###Run model without likely influential cases
Hyp2FullModellinearNOINFL=lm(D.LOOK.PROP_2~GROUP+SPIDER.MOVE_2+DELAYlogz+D.LOOK.PROP_1_tz,data=datawide[-c(5, 9),])
plot(cooks.distance(Hyp2FullModellinear)) ##but it reveals other influential cases...
###Inference
summary(Hyp2FullModellinear)
r2(Hyp2FullModellinear)
Hyp2NullModellinear=lm(D.LOOK.PROP_2~1,data=datawide)
anova(Hyp2NullModellinear,Hyp2FullModellinear, test="F")
Hyp2RedModellinear=lm(D.LOOK.PROP_2~SPIDER.MOVE_2+DELAY+D.LOOK.PROP_1,data=datawide)
anova(Hyp2RedModellinear, Hyp2FullModellinear, test="F") 
plot(allEffects(Hyp2FullModellinear))
Anova(Hyp2FullModellinear, type=3)
summary(Hyp1FullModel)$coefficients
Hyp2table=cbind(coefficients(Hyp2FullModel), confint(object=Hyp2FullModel))
###Explore models without primary influential cases?
summary(Hyp2FullModellinearNOINFL)
r2(Hyp2FullModellinearNOINFL)
Hyp2NullModellinearNOINFL=lm(D.LOOK.PROP_2~1,data=datawide[-c(5, 9),])
anova(Hyp2NullModellinearNOINFL,Hyp2FullModellinearNOINFL, test="F")