# ExPra SoSe 2019
# data analysis

# clean workspace
rm(list = ls())

# install packages
install.packages("Rmisc")
install.packages("plyr")
install.packages("dplyr")
install.packages("data.table")
install.packages("ggplot2")
install.packages("tidyr")
install.packages("BayesFactor")

# load packages
# note: Rmisc re-attaches plyr, which masks some dplyr verbs -- hence the
# explicit dplyr:: calls throughout the script
library(plyr)
library(dplyr)
library(data.table)
library(ggplot2)
library(Rmisc)
library(tidyr)
library(BayesFactor)

################ functions ##############################################################################################

# IQR (Tukey) outlier detection; also flags anticipatory responses (< 100 ms)
identifyOutliers <- function(x) {
  qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE, type = 5)
  H <- 1.5 * (qnt[2] - qnt[1])
  outlier <- (x < (qnt[1] - H)) | (x > (qnt[2] + H)) | (x < .10)
  outlier
}
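# quick illustrative check of identifyOutliers on made-up RTs (arbitrary
# values, not from the experiment): 0.05 s is flagged as anticipatory,
# 5 s as an IQR outlier
# identifyOutliers(c(0.05, 0.4, 0.5, 0.6, 0.55, 5.0))
# > TRUE FALSE FALSE FALSE FALSE TRUE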
################ load *Rdat file ##########################################################################################
# when reading Windows *csv files: encoding="UTF-8"
# when reading MAC *csv files: encoding="latin1"
setwd("C:/BACKUP13072018/PHB 2018/LEHRE/Experimentelles Praktikum/SoSe 2019/Data & Code")
load("ExPra_Data_17062019.RData")
dat.long = data.long
rm(data.long)
# the rest of the code works with "dat.long"

################ add word frequency ##########################################################################################
### source (German corpus): http://www.dlexdb.de/query/kern/typposlem/list/
### we chose the nn output of the PoS tag (exceptions: Berlin as ne, App as ne, Edeka as xy, Chrome as ne)
dat.long$word_freq <- revalue(dat.long$word, c(
  "Kaufland" = "1", "Apfel" = "578", "Website" = "43", "Straße" = "14429",
  "Deutsch" = "1775", "Datenvolumen" = "5", "Klavier" = "1383", "Email" = "91",
  "Brille" = "1329", "Seife" = "841", "Tisch" = "13807", "Suchmaschine" = "37",
  "Wikipedia" = "NA", "Sonne" = "9570", "Sport" = "2011", "Berlin" = "36560",
  "WLAN" = "NA", "Schild" = "1288", "Gedicht" = "1958", "App" = "10",
  "Google" = "NA", "Flasche" = "2800", "Auto" = "4440", "Nagel" = "229",
  "Bambus" = "60", "Smartphone" = "NA", "Mütze" = "1350", "Boot" = "2456",
  "Rahmen" = "11867", "Hotspot" = "1", "Online" = "1", "Küche" = "4870",
  "Bürste" = "293", "Lampe" = "1699", "Blog" = "NA", "Spiegel" = "3620",
  "Leiter" = "7653", "Skulptur" = "236", "Schale" = "1524", "Spotify" = "NA",
  "Firefox" = "NA", "Kante" = "584", "WhatsApp" = "NA", "Box" = "184",
  "Edeka" = "6", "Chrome" = "1", "Gleise" = "204", "Storch" = "188"
))
dat.long$word_freq = as.numeric(as.character(dat.long$word_freq)) # "NA" strings become NA (with a coercion warning)

# translations (added August 2020)
# unrelated words
# Kaufland = a German supermarket chain
# Apfel = apple
# Straße = street
# Deutsch = German
# Klavier = piano
# Brille = glasses
# Seife = soap
# Tisch = table
# Sonne = sun
# Sport = sport
# Berlin = Berlin
# Schild = sign
# Gedicht = poem
# Flasche = bottle
# Auto = car
# Nagel = nail
# Bambus = bamboo
# Mütze = hat
# Boot = boat
# Rahmen = frame
# Küche = kitchen
# Bürste = brush
# Lampe = lamp
# Spiegel = mirror
# Leiter = ladder
# Skulptur = sculpture
# Schale = bowl
# Kante = edge
# Box = box
# Edeka = a German supermarket chain
# Gleise = rail tracks
# Storch = stork
# related words
# Website
# Datenvolumen = data volume
# Email
# Suchmaschine = search engine
# Wikipedia
# WLAN
# App
# Google
# Smartphone
# Hotspot
# Online
# Blog
# Spotify
# Firefox
# WhatsApp
# Chrome
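# a named numeric lookup vector would avoid the string round-trip through
# revalue() + as.numeric(as.character()); minimal sketch with a subset of the
# words above ("freq_lookup" is a name introduced here -- extend the vector
# with the full list before using it):
# freq_lookup <- c("Kaufland" = 1, "Apfel" = 578, "Website" = 43, "Wikipedia" = NA)
# dat.long$word_freq <- unname(freq_lookup[as.character(dat.long$word)])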
############################################################# code for analysis ##########################

# distribution of age
dat.age <- dplyr::summarise(group_by(dat.long, VPcode), age = mean(age, na.rm = TRUE)) # not elegant but does the job (i.e., one age value per participant)
ggplot(dat.age, aes(x = 1, y = age)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.shape = NA) + # avoid plotting outliers twice
  stat_boxplot(geom = "errorbar", width = 0.5, colour = "#3366FF") +
  geom_jitter(position = position_jitter(width = .1, height = 0))

# exclude 28 participants older than the 75% quantile (44 years)
dat.age = filter(dat.age, age > quantile(dat.age$age, .75, na.rm = TRUE) & !is.na(age))
dat.long = filter(dat.long, !(VPcode %in% dat.age$VPcode))
rm(dat.age)

# correct responses in the easy questionnaire
dat.correct <- dplyr::summarise(group_by(dat.long, VPcode), correct = mean(ETQ_acc, na.rm = T)) # we need a summary function here; mean is fine because all values within a participant are identical
Easy_Questions_Performance = summarySE(dat.correct, measurevar = 'correct') # Rmisc

# correct responses in the hard questionnaire
dat.correct <- dplyr::summarise(group_by(dat.long, VPcode), correct = mean(HTQ_acc, na.rm = T)) # we need a summary function here; mean is fine because all values within a participant are identical
Hard_Questions_Performance = summarySE(dat.correct, measurevar = 'correct') # Rmisc

# correct responses in the number task
# will be a rather conservative estimate of performance
dat.correct <- dplyr::summarise(group_by(dat.long, VPcode), correct = sum(CLaccuracy)/length(CLaccuracy))
Numbers_Performance = summarySE(dat.correct, measurevar = 'correct') # Rmisc

# correct responses in the color task
dat.correct <- dplyr::summarise(group_by(dat.long, VPcode), correct = sum(accuracy)/length(accuracy))
Colors_Performance = summarySE(dat.correct, measurevar = 'correct') # Rmisc

# exclude incorrect trials (not done by Sparrow et al., 2011)
# dat.long = filter(dat.long, accuracy == 1)

# identify and remove anticipatory responses (RT < 100 ms; there were none)
dat.long$toofast <- ifelse(dat.long$RT < 0.1, 1, 0)
dat.long = filter(dat.long, toofast == 0)

# visualize RT distribution and IQR outliers (per participant)
ggplot(dat.long, aes(x = VPcode, y = RT)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.shape = NA) + # avoid plotting outliers twice
  stat_boxplot(geom = "errorbar", width = 0.5, colour = "#3366FF") +
  geom_jitter(position = position_jitter(width = .1, height = 0))

# identify outliers (Tukey, IQR), per participant
dat.long <- dat.long %>% group_by(VPcode) %>% mutate(outlier = identifyOutliers(RT))
dat.long <- dat.long %>% mutate(outlier = as.numeric(outlier))
dat.outlier <- dplyr::summarise(group_by(dat.long, VPcode), outlier = sum(outlier)/length(outlier))
Outliers = summarySE(dat.outlier, measurevar = 'outlier')

# remove outliers
dat.long = filter(dat.long, outlier == 0)

# calculate overall mean RT per participant
dat.RT = dplyr::summarise(group_by(dat.long, VPcode), RT = mean(RT))
ResponseTime = summarySE(dat.RT, measurevar = 'RT')

# visualize distribution of RTs across participants
ggplot(dat.RT, aes(x = 1, y = RT)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.shape = NA) + # avoid plotting outliers twice
  stat_boxplot(geom = "errorbar", width = 0.5, colour = "#3366FF") +
  geom_jitter(position = position_jitter(width = .1, height = 0))

# calculate mean RT per condition (per participant)
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str, word_cat_str), RT = mean(RT))
RTs = summarySE(dat.RT, measurevar = 'RT', groupvars = c('tq_trial_str', 'word_cat_str'))

######### plot the RT data (2x2 design), export (400 x 400)
# note: a good resource for nice graphs:
# http://shinyapps.org/apps/RGraphCompendium/index.php#bar-plots
p = ggplot() +
  geom_bar(data = RTs, aes(x = word_cat_str, y = RT, fill = as.factor(tq_trial_str)),
           position = position_dodge(), colour = "black", stat = "identity", width = 0.75) +
  geom_errorbar(data = RTs,
                aes(x = word_cat_str, y = RT, ymin = RT - se, ymax = RT + se,
                    fill = as.factor(tq_trial_str)),
                width = .10, # width of the error bars
                position = position_dodge(0.75))
p = p + theme_bw() + scale_fill_brewer(palette = "Blues") #+ theme(legend.position = "topright")
p = p + scale_fill_manual("legend", values = c("easy" = "dark grey", "hard" = "white")) # overrides the Brewer palette above
p = p + coord_cartesian(ylim = c(400, 900)) + theme(text = element_text(size = 16))
p

###########################################################################
## plot answers to the "do you think of the internet" question
dat.quest <- filter(dat.long, round == "2" & trial == 1)
table(dat.quest$internet) # note: 11 participants did not provide a response (NaN)
pie(table(dat.quest$internet), col = c("blue", "deepskyblue", "cyan"),
    main = "Internetverhalten") # "Internetverhalten" = internet behavior

###########################################################################
## some checks

# start time for each experimenter
# https://deanattali.com/blog/visualize-git-commits-time/
dat.long$starttime_short = format(as.POSIXct(strptime(dat.long$starttime, "%Y-%m-%d %H:%M", tz = "")), format = "%H:%M")
dat.plot = filter(dat.long, round == "2" & trial == 1)
dat.plot$starttime_plot = as.POSIXct(dat.plot$starttime_short, format = "%H:%M", tz = "UTC")
p <- ggplot(dat.plot, aes(x = VL, y = starttime_short, label = starttime_short)) +
  geom_point(aes(fill = as.factor(VL)), col = "#555555", size = 5, shape = 21,
             position = position_jitter()) +
  theme_bw(12) + xlab(NULL) + ylab(NULL)
p

## calculate SDs across participants' RTs (for each student experimenter)
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, VL), RT = mean(RT))
dat.SD = dplyr::summarise(group_by(dat.RT, VL), SD = sd(RT))
ggplot(dat.SD, aes(x = 1, y = SD)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.shape = NA) + # avoid plotting outliers twice
  stat_boxplot(geom = "errorbar", width = 0.5, colour = "#3366FF") +
  geom_jitter(position = position_jitter(width = .1, height = 0))

## calculate mean RTs across participants (per session and experimenter)
dat.RT = dplyr::summarise(group_by(dat.long, starttime, VL), RT = mean(RT))

###########################################################################
# calculate BF for the effect of word frequency
# https://richarddmorey.github.io/BayesFactor/#regression
# we first remove word_freq == NaN
# note that this mainly affects computer terms,
# so testing for the effect of interest (word type x question type) is no longer meaningful here
dat.BF = filter(dat.long, !is.na(word_freq)) # remove NAs (word frequency)
full <- lmBF(RT ~ VPcode + tq_trial + word_freq, whichRandom = "VPcode", data = as.data.frame(dat.BF))
reduced <- lmBF(RT ~ VPcode + tq_trial, whichRandom = "VPcode", data = as.data.frame(dat.BF))
null <- lmBF(RT ~ VPcode, whichRandom = "VPcode", data = as.data.frame(dat.BF))
reduced/full # BF = 16 in favor of the model without word frequency

###########################################################################
# calculate BF-LMM for the effect of interest
dat.long$word_numeric = as.factor(as.numeric(dat.long$word))

# calculate BF
bf1 <- generalTestBF(RT ~ tq_trial * word_cat + VPcode + word_numeric,
                     whichRandom = c("VPcode", "word_numeric"),
                     whichModels = "all",
                     data = as.data.frame(dat.long))

# output BF table
x <- extractBF(head(bf1) / max(bf1))
x$bf <- 1/x$bf
x <- x[x$bf < 100, ]
knitr::kable(x[1:2])

###########################################################################
# calculate the main effect "related/unrelated" and export to *csv
# note: this is bad R programming, please change if possible (using reshape)
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, word_cat_str), RT = mean(RT))
dat.RT = spread(dat.RT, word_cat_str, RT) # long to wide
dat.RT = dplyr::mutate(group_by(dat.RT, VPcode), diff = related - unrelated)
# write.csv(dat.RT$diff, "ExPra_maineffect_relatedness_N89_09072019.csv")

###########################################################################
# calculate the main effect "hard/easy question" and export to *csv
# note: this is bad R programming, please change if possible (using reshape);
# a tidier version follows below
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str), RT = mean(RT))
dat.RT$diff = dat.RT$RT[dat.RT$tq_trial_str == "hard"] - dat.RT$RT[dat.RT$tq_trial_str == "easy"]
dat.RT = dat.RT[1:length(unique(dat.RT$VPcode)), ] # keep one row per participant
# write.csv(dat.RT$diff, "ExPra_maineffect_question_N89_09072019.csv")
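# the same hard-minus-easy difference without positional indexing, following
# the spread() pattern used for the relatedness effect above ("dat.RT.wide" is
# a name introduced here; assumes every participant has both conditions):
dat.RT.wide = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str), RT = mean(RT))
dat.RT.wide = spread(dat.RT.wide, tq_trial_str, RT) # long to wide
dat.RT.wide = dplyr::mutate(dat.RT.wide, diff = hard - easy) # one row per participant by construction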
###########################################################################
# calculate the interaction "related/unrelated" x "hard/easy question" and export to *csv
# note: this is bad R programming, please change if possible (using reshape);
# a tidier version follows below
# related-unrelated (hard) MINUS related-unrelated (easy); diff should be > 0 (interaction)
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str, word_cat_str), RT = mean(RT))
# dat.diff = group_by(dat.RT, VPcode) # unused
dat.RT$diff = (dat.RT$RT[dat.RT$word_cat_str == "related" & dat.RT$tq_trial_str == "hard"] -
                 dat.RT$RT[dat.RT$word_cat_str == "unrelated" & dat.RT$tq_trial_str == "hard"]) -
  (dat.RT$RT[dat.RT$word_cat_str == "related" & dat.RT$tq_trial_str == "easy"] -
     dat.RT$RT[dat.RT$word_cat_str == "unrelated" & dat.RT$tq_trial_str == "easy"])
dat.RT = dat.RT[1:length(unique(dat.RT$VPcode)), ] # keep one row per participant
# write.csv(dat.RT$diff, "ExPra_interaction_N89_22082019_incloutlier.csv")
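# the same interaction score without positional indexing, again following the
# spread() pattern from the relatedness section ("dat.int" is a name introduced
# here; assumes every participant has all four condition means):
dat.int = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str, word_cat_str), RT = mean(RT))
dat.int = tidyr::unite(dplyr::ungroup(dat.int), condition, tq_trial_str, word_cat_str) # e.g. "hard_related"
dat.int = spread(dat.int, condition, RT) # long to wide, one row per participant
dat.int = dplyr::mutate(dat.int, diff = (hard_related - hard_unrelated) - (easy_related - easy_unrelated))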