# ExPra SoSe 2019
# data analysis

# clean workspace
rm(list = ls())

# install packages
install.packages("Rmisc")
install.packages("plyr")
install.packages("dplyr")
install.packages("data.table")
install.packages("ggplot2")
install.packages("tidyr")
install.packages("BayesFactor")

# load packages
# note: Rmisc re-attaches plyr, which masks some dplyr verbs -- hence the
# explicit dplyr:: calls throughout the script
library(plyr)
library(dplyr)
library(data.table)
library(ggplot2)
library(Rmisc)
library(tidyr)
library(BayesFactor)

################ functions ##############################################################################################

# IQR (Tukey) outlier detection; also flags anticipatory responses (< 100 ms)
identifyOutliers <- function(x) {
  qnt <- quantile(x, probs = c(.25, .75), na.rm = TRUE, type = 5)
  H <- 1.5 * (qnt[2] - qnt[1])
  outlier <- (x < (qnt[1] - H)) | (x > (qnt[2] + H)) | (x < .10)
  outlier
}
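# quick illustrative check of identifyOutliers on made-up RTs (arbitrary
# values, not from the experiment): 0.05 s is flagged as anticipatory,
# 5 s as an IQR outlier
# identifyOutliers(c(0.05, 0.4, 0.5, 0.6, 0.55, 5.0))
# > TRUE FALSE FALSE FALSE FALSE TRUE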
################ load *Rdat file ##########################################################################################
# when reading Windows *csv files: encoding="UTF-8"
# when reading MAC *csv files: encoding="latin1"
setwd("C:/BACKUP13072018/PHB 2018/LEHRE/Experimentelles Praktikum/SoSe 2019/Data & Code")
load("ExPra_Data_17062019.RData")
dat.long = data.long
rm(data.long)
# the rest of the code works with "dat.long"

################ add word frequency ##########################################################################################
### source (German corpus): http://www.dlexdb.de/query/kern/typposlem/list/
### we chose the nn output of the PoS tag (exceptions: Berlin as ne, App as ne, Edeka as xy, Chrome as ne)
dat.long$word_freq <- revalue(dat.long$word, c(
  "Kaufland" = "1", "Apfel" = "578", "Website" = "43", "Straße" = "14429",
  "Deutsch" = "1775", "Datenvolumen" = "5", "Klavier" = "1383", "Email" = "91",
  "Brille" = "1329", "Seife" = "841", "Tisch" = "13807", "Suchmaschine" = "37",
  "Wikipedia" = "NA", "Sonne" = "9570", "Sport" = "2011", "Berlin" = "36560",
  "WLAN" = "NA", "Schild" = "1288", "Gedicht" = "1958", "App" = "10",
  "Google" = "NA", "Flasche" = "2800", "Auto" = "4440", "Nagel" = "229",
  "Bambus" = "60", "Smartphone" = "NA", "Mütze" = "1350", "Boot" = "2456",
  "Rahmen" = "11867", "Hotspot" = "1", "Online" = "1", "Küche" = "4870",
  "Bürste" = "293", "Lampe" = "1699", "Blog" = "NA", "Spiegel" = "3620",
  "Leiter" = "7653", "Skulptur" = "236", "Schale" = "1524", "Spotify" = "NA",
  "Firefox" = "NA", "Kante" = "584", "WhatsApp" = "NA", "Box" = "184",
  "Edeka" = "6", "Chrome" = "1", "Gleise" = "204", "Storch" = "188"
))
dat.long$word_freq = as.numeric(as.character(dat.long$word_freq)) # "NA" strings become NA (with a coercion warning)

# translations (added August 2020)
# unrelated words
# Kaufland = a German supermarket chain
# Apfel = apple
# Straße = street
# Deutsch = German
# Klavier = piano
# Brille = glasses
# Seife = soap
# Tisch = table
# Sonne = sun
# Sport = sport
# Berlin = Berlin
# Schild = sign
# Gedicht = poem
# Flasche = bottle
# Auto = car
# Nagel = nail
# Bambus = bamboo
# Mütze = hat
# Boot = boat
# Rahmen = frame
# Küche = kitchen
# Bürste = brush
# Lampe = lamp
# Spiegel = mirror
# Leiter = ladder
# Skulptur = sculpture
# Schale = bowl
# Kante = edge
# Box = box
# Edeka = a German supermarket chain
# Gleise = rail tracks
# Storch = stork
# related words
# Website
# Datenvolumen = data volume
# Email
# Suchmaschine = search engine
# Wikipedia
# WLAN
# App
# Google
# Smartphone
# Hotspot
# Online
# Blog
# Spotify
# Firefox
# WhatsApp
# Chrome
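# a named numeric lookup vector would avoid the string round-trip through
# revalue() + as.numeric(as.character()); minimal sketch with a subset of the
# words above ("freq_lookup" is a name introduced here -- extend the vector
# with the full list before using it):
# freq_lookup <- c("Kaufland" = 1, "Apfel" = 578, "Website" = 43, "Wikipedia" = NA)
# dat.long$word_freq <- unname(freq_lookup[as.character(dat.long$word)])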
############################################################# code for analysis ##########################

# distribution of age
dat.age <- dplyr::summarise(group_by(dat.long, VPcode), age = mean(age, na.rm = TRUE)) # not elegant but does the job (i.e., one age value per participant)
ggplot(dat.age, aes(x = 1, y = age)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.shape = NA) + # avoid plotting outliers twice
  stat_boxplot(geom = "errorbar", width = 0.5, colour = "#3366FF") +
  geom_jitter(position = position_jitter(width = .1, height = 0))

# exclude 28 participants older than the 75% quantile (44 years)
dat.age = filter(dat.age, age > quantile(dat.age$age, .75, na.rm = TRUE) & !is.na(age))
dat.long = filter(dat.long, !(VPcode %in% dat.age$VPcode))
rm(dat.age)

# correct responses in the easy questionnaire
dat.correct <- dplyr::summarise(group_by(dat.long, VPcode), correct = mean(ETQ_acc, na.rm = T)) # we need a summary function here; mean is fine because all values within a participant are identical
Easy_Questions_Performance = summarySE(dat.correct, measurevar = 'correct') # Rmisc

# correct responses in the hard questionnaire
dat.correct <- dplyr::summarise(group_by(dat.long, VPcode), correct = mean(HTQ_acc, na.rm = T)) # we need a summary function here; mean is fine because all values within a participant are identical
Hard_Questions_Performance = summarySE(dat.correct, measurevar = 'correct') # Rmisc

# correct responses in the number task
# will be a rather conservative estimate of performance
dat.correct <- dplyr::summarise(group_by(dat.long, VPcode), correct = sum(CLaccuracy)/length(CLaccuracy))
Numbers_Performance = summarySE(dat.correct, measurevar = 'correct') # Rmisc

# correct responses in the color task
dat.correct <- dplyr::summarise(group_by(dat.long, VPcode), correct = sum(accuracy)/length(accuracy))
Colors_Performance = summarySE(dat.correct, measurevar = 'correct') # Rmisc

# exclude incorrect trials (not done by Sparrow et al., 2011)
# dat.long = filter(dat.long, accuracy == 1)

# identify and remove anticipatory responses (RT < 100 ms; there were none)
dat.long$toofast <- ifelse(dat.long$RT < 0.1, 1, 0)
dat.long = filter(dat.long, toofast == 0)

# visualize RT distribution and IQR outliers (per participant)
ggplot(dat.long, aes(x = VPcode, y = RT)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.shape = NA) + # avoid plotting outliers twice
  stat_boxplot(geom = "errorbar", width = 0.5, colour = "#3366FF") +
  geom_jitter(position = position_jitter(width = .1, height = 0))

# identify outliers (Tukey, IQR), per participant
dat.long <- dat.long %>% group_by(VPcode) %>% mutate(outlier = identifyOutliers(RT))
dat.long <- dat.long %>% mutate(outlier = as.numeric(outlier))
dat.outlier <- dplyr::summarise(group_by(dat.long, VPcode), outlier = sum(outlier)/length(outlier))
Outliers = summarySE(dat.outlier, measurevar = 'outlier')

# remove outliers
dat.long = filter(dat.long, outlier == 0)

# calculate overall mean RT per participant
dat.RT = dplyr::summarise(group_by(dat.long, VPcode), RT = mean(RT))
ResponseTime = summarySE(dat.RT, measurevar = 'RT')

# visualize distribution of RTs across participants
ggplot(dat.RT, aes(x = 1, y = RT)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.shape = NA) + # avoid plotting outliers twice
  stat_boxplot(geom = "errorbar", width = 0.5, colour = "#3366FF") +
  geom_jitter(position = position_jitter(width = .1, height = 0))

# calculate mean RT per condition (per participant)
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str, word_cat_str), RT = mean(RT))
RTs = summarySE(dat.RT, measurevar = 'RT', groupvars = c('tq_trial_str', 'word_cat_str'))

######### plot the RT data (2x2 design), export (400 x 400)
# note: a good resource for nice graphs:
# http://shinyapps.org/apps/RGraphCompendium/index.php#bar-plots
p = ggplot() +
  geom_bar(data = RTs, aes(x = word_cat_str, y = RT, fill = as.factor(tq_trial_str)),
           position = position_dodge(), colour = "black", stat = "identity", width = 0.75) +
  geom_errorbar(data = RTs,
                aes(x = word_cat_str, y = RT, ymin = RT - se, ymax = RT + se,
                    fill = as.factor(tq_trial_str)),
                width = .10, # width of the error bars
                position = position_dodge(0.75))
p = p + theme_bw() + scale_fill_brewer(palette = "Blues") #+ theme(legend.position = "topright")
p = p + scale_fill_manual("legend", values = c("easy" = "dark grey", "hard" = "white")) # overrides the Brewer palette above
p = p + coord_cartesian(ylim = c(400, 900)) + theme(text = element_text(size = 16))
p

###########################################################################
## plot answers to the "do you think of the internet" question
dat.quest <- filter(dat.long, round == "2" & trial == 1)
table(dat.quest$internet) # note: 11 participants did not provide a response (NaN)
pie(table(dat.quest$internet), col = c("blue", "deepskyblue", "cyan"),
    main = "Internetverhalten") # "Internetverhalten" = internet behavior

###########################################################################
## some checks

# start time for each experimenter
# https://deanattali.com/blog/visualize-git-commits-time/
dat.long$starttime_short = format(as.POSIXct(strptime(dat.long$starttime, "%Y-%m-%d %H:%M", tz = "")), format = "%H:%M")
dat.plot = filter(dat.long, round == "2" & trial == 1)
dat.plot$starttime_plot = as.POSIXct(dat.plot$starttime_short, format = "%H:%M", tz = "UTC")
p <- ggplot(dat.plot, aes(x = VL, y = starttime_short, label = starttime_short)) +
  geom_point(aes(fill = as.factor(VL)), col = "#555555", size = 5, shape = 21,
             position = position_jitter()) +
  theme_bw(12) + xlab(NULL) + ylab(NULL)
p

## calculate SDs across participants' RTs (for each student experimenter)
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, VL), RT = mean(RT))
dat.SD = dplyr::summarise(group_by(dat.RT, VL), SD = sd(RT))
ggplot(dat.SD, aes(x = 1, y = SD)) +
  geom_boxplot(fill = "white", colour = "#3366FF", outlier.shape = NA) + # avoid plotting outliers twice
  stat_boxplot(geom = "errorbar", width = 0.5, colour = "#3366FF") +
  geom_jitter(position = position_jitter(width = .1, height = 0))

## calculate mean RTs across participants (per session and experimenter)
dat.RT = dplyr::summarise(group_by(dat.long, starttime, VL), RT = mean(RT))

###########################################################################
# calculate BF for the effect of word frequency
# https://richarddmorey.github.io/BayesFactor/#regression
# we first remove word_freq == NaN
# note that this mainly affects computer terms,
# so testing for the effect of interest (word type x question type) is no longer meaningful here
dat.BF = filter(dat.long, !is.na(word_freq)) # remove NAs (word frequency)
full <- lmBF(RT ~ VPcode + tq_trial + word_freq, whichRandom = "VPcode", data = as.data.frame(dat.BF))
reduced <- lmBF(RT ~ VPcode + tq_trial, whichRandom = "VPcode", data = as.data.frame(dat.BF))
null <- lmBF(RT ~ VPcode, whichRandom = "VPcode", data = as.data.frame(dat.BF))
reduced/full # BF = 16 in favor of the model without word frequency

###########################################################################
# calculate BF-LMM for the effect of interest
dat.long$word_numeric = as.factor(as.numeric(dat.long$word))

# calculate BF
bf1 <- generalTestBF(RT ~ tq_trial * word_cat + VPcode + word_numeric,
                     whichRandom = c("VPcode", "word_numeric"),
                     whichModels = "all",
                     data = as.data.frame(dat.long))

# output BF table
x <- extractBF(head(bf1) / max(bf1))
x$bf <- 1/x$bf
x <- x[x$bf < 100, ]
knitr::kable(x[1:2])

###########################################################################
# calculate the main effect "related/unrelated" and export to *csv
# note: this is bad R programming, please change if possible (using reshape)
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, word_cat_str), RT = mean(RT))
dat.RT = spread(dat.RT, word_cat_str, RT) # long to wide
dat.RT = dplyr::mutate(group_by(dat.RT, VPcode), diff = related - unrelated)
# write.csv(dat.RT$diff, "ExPra_maineffect_relatedness_N89_09072019.csv")

###########################################################################
# calculate the main effect "hard/easy question" and export to *csv
# note: this is bad R programming, please change if possible (using reshape);
# a tidier version follows below
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str), RT = mean(RT))
dat.RT$diff = dat.RT$RT[dat.RT$tq_trial_str == "hard"] - dat.RT$RT[dat.RT$tq_trial_str == "easy"]
dat.RT = dat.RT[1:length(unique(dat.RT$VPcode)), ] # keep one row per participant
# write.csv(dat.RT$diff, "ExPra_maineffect_question_N89_09072019.csv")
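# the same hard-minus-easy difference without positional indexing, following
# the spread() pattern used for the relatedness effect above ("dat.RT.wide" is
# a name introduced here; assumes every participant has both conditions):
dat.RT.wide = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str), RT = mean(RT))
dat.RT.wide = spread(dat.RT.wide, tq_trial_str, RT) # long to wide
dat.RT.wide = dplyr::mutate(dat.RT.wide, diff = hard - easy) # one row per participant by construction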
###########################################################################
# calculate the interaction "related/unrelated" x "hard/easy question" and export to *csv
# note: this is bad R programming, please change if possible (using reshape);
# a tidier version follows below
# related-unrelated (hard) MINUS related-unrelated (easy); diff should be > 0 (interaction)
dat.RT = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str, word_cat_str), RT = mean(RT))
# dat.diff = group_by(dat.RT, VPcode) # unused
dat.RT$diff = (dat.RT$RT[dat.RT$word_cat_str == "related" & dat.RT$tq_trial_str == "hard"] -
                 dat.RT$RT[dat.RT$word_cat_str == "unrelated" & dat.RT$tq_trial_str == "hard"]) -
  (dat.RT$RT[dat.RT$word_cat_str == "related" & dat.RT$tq_trial_str == "easy"] -
     dat.RT$RT[dat.RT$word_cat_str == "unrelated" & dat.RT$tq_trial_str == "easy"])
dat.RT = dat.RT[1:length(unique(dat.RT$VPcode)), ] # keep one row per participant
# write.csv(dat.RT$diff, "ExPra_interaction_N89_22082019_incloutlier.csv")
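# the same interaction score without positional indexing, again following the
# spread() pattern from the relatedness section ("dat.int" is a name introduced
# here; assumes every participant has all four condition means):
dat.int = dplyr::summarise(group_by(dat.long, VPcode, tq_trial_str, word_cat_str), RT = mean(RT))
dat.int = tidyr::unite(dplyr::ungroup(dat.int), condition, tq_trial_str, word_cat_str) # e.g. "hard_related"
dat.int = spread(dat.int, condition, RT) # long to wide, one row per participant
dat.int = dplyr::mutate(dat.int, diff = (hard_related - hard_unrelated) - (easy_related - easy_unrelated))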