Helper Functions
logodds
This function takes a percentage and returns the log odds.
logodds <- function(p){log(p/(100-p))}
SummarySE
This function can be found on the website “Cookbook for R”.
http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/#Helper
It summarizes data, giving count, mean, standard deviation, standard error of the mean, and confidence interval (default 95%).
- data: a data frame
- measurevar: the name of a column that contains the variable to be summarized
- groupvars: a vector containing names of columns that contain grouping variables
- na.rm: a boolean that indicates whether to ignore NA’s
- conf.interval: the percent range of the confidence interval (default is 95%)
summarySE <- function(data = NULL, measurevar, groupvars = NULL, na.rm = FALSE, conf.interval = .95, .drop = TRUE) {require(plyr)
# New version of length which can handle NA's: if na.rm==T, don't count them
length2 <- function (x, na.rm = FALSE) {
if (na.rm) sum(!is.na(x))
else length(x)
}
# This does the summary. For each group's data frame, return a vector with
# N, mean, and sd
datac <- ddply(data, groupvars, .drop = .drop,
.fun = function(xx, col) {
c(N = length2(xx[[col]], na.rm = na.rm),
mean = mean (xx[[col]], na.rm = na.rm),
sd = sd (xx[[col]], na.rm = na.rm)
)
},
measurevar
)
# Rename the "mean" column
datac <- rename(datac, c("mean" = measurevar))
datac$se <- datac$sd / sqrt(datac$N) # Calculate standard error of the mean
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
ciMult <- qt(conf.interval/2 + .5, datac$N-1)
datac$ci <- datac$se * ciMult
return(datac)
}
SummarySEwithin
This function can be found on the website “Cookbook for R”.
http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/#Helper
From that website:
Summarizes data, handling within-subject variables by removing inter-subject variability. It will still work if there are no within-subject variables. Gives count, un-normed mean, normed mean (with same between-group mean), standard deviation, standard error of the mean, and confidence interval. If there are within-subject variables, it calculates adjusted values using the method from Morey (2008).
- data: a data frame
- measurevar: the name of a column that contains the variable to be summarized
- betweenvars: a vector containing names of columns that are between-subjects variables
- withinvars: a vector containing names of columns that are within-subjects variables
- idvar: the name of a column that identifies each subject (or matched subjects)
- na.rm: a boolean that indicates whether to ignore NA’s
- conf.interval: the percent range of the confidence interval (default is 95%)
summarySEwithin <- function(data = NULL, measurevar, betweenvars = NULL, withinvars = NULL,
idvar = NULL, na.rm = FALSE, conf.interval = .95, .drop = TRUE) {
# Ensure that the betweenvars and withinvars are factors
factorvars <- vapply(data[, c(betweenvars, withinvars), drop = FALSE],
FUN = is.factor, FUN.VALUE = logical(1))
if (!all(factorvars)) {
nonfactorvars <- names(factorvars)[!factorvars]
message("Automatically converting the following non-factors to factors: ",
paste(nonfactorvars, collapse = ", "))
data[nonfactorvars] <- lapply(data[nonfactorvars], factor)
}
# Get the means from the un-normed data
datac <- summarySE(data, measurevar, groupvars = c(betweenvars, withinvars),
na.rm = na.rm, conf.interval = conf.interval, .drop = .drop)
# Drop all the unused columns (these will be calculated with normed data)
datac$sd <- NULL
datac$se <- NULL
datac$ci <- NULL
# Norm each subject's data
ndata <- normDataWithin(data, idvar, measurevar, betweenvars, na.rm, .drop = .drop)
# This is the name of the new column
measurevar_n <- paste(measurevar, "_norm", sep ="")
# Collapse the normed data - now we can treat between and within vars the same
ndatac <- summarySE(ndata, measurevar_n, groupvars = c(betweenvars, withinvars),
na.rm = na.rm, conf.interval = conf.interval, .drop = .drop)
# Apply correction from Morey (2008) to the standard error and confidence interval
# Get the product of the number of conditions of within-S variables
nWithinGroups <- prod(vapply(ndatac[,withinvars, drop = FALSE], FUN = nlevels,
FUN.VALUE = numeric(1)))
correctionFactor <- sqrt( nWithinGroups / (nWithinGroups-1) )
# Apply the correction factor
ndatac$sd <- ndatac$sd * correctionFactor
ndatac$se <- ndatac$se * correctionFactor
ndatac$ci <- ndatac$ci * correctionFactor
# Combine the un-normed means with the normed results
merge(datac, ndatac)
}
normDataWithin
This function is used by the SummarySEWithin function above. It can be found on the website “Cookbook for R”.
http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/#Helper
From that website:
Norms the data within specified groups in a data frame; it normalizes each subject (identified by idvar) so that they have the same mean, within each group specified by betweenvars.
- data: a data frame
- idvar: the name of a column that identifies each subject (or matched subjects)
- measurevar: the name of a column that contains the variable to be summarized
- betweenvars: a vector containing names of columns that are between-subjects variables
- na.rm: a boolean that indicates whether to ignore NAs
normDataWithin <- function(data = NULL, idvar, measurevar, betweenvars = NULL,
na.rm = FALSE, .drop = TRUE) {
#library(plyr)
# Measure var on left, idvar + between vars on right of formula.
data.subjMean <- ddply(data, c(idvar, betweenvars), .drop = .drop,
.fun = function(xx, col, na.rm) {
c(subjMean = mean(xx[,col], na.rm = na.rm))
},
measurevar,
na.rm
)
# Put the subject means with original data
data <- merge(data, data.subjMean)
# Get the normalized data in a new column
measureNormedVar <- paste(measurevar, "_norm", sep = "")
data[,measureNormedVar] <- data[,measurevar] - data[,"subjMean"] +
mean(data[,measurevar], na.rm = na.rm)
# Remove this subject mean column
data$subjMean <- NULL
return(data)
}
myCenter
This function outputs the centered values of a variable, which can be a numeric variable, a factor, or a data frame. It was taken from Florian Jaeger’s blog - https://hlplab.wordpress.com/2009/04/27/centering-several-variables/.
From his blog:
If the input is a numeric variable, the output is the centered variable
If the input is a factor, the output is a numeric variable with centered factor level values. That is, the factor’s levels are converted into numerical values in their inherent order (if not specified otherwise, R defaults to alphanumerical order). More specifically, this centers any binary factor so that the value below 0 will be the 1st level of the original factor, and the value above 0 will be the 2nd level.
If the input is a data frame or matrix, the output is a new matrix of the same dimension and with the centered values and column names that correspond to the colnames() of the input preceded by “c” (e.g. “Variable1” will be “cVariable1”).
myCenter = function(x) {
if (is.numeric(x)) { return(x - mean(x, na.rm = T)) }
if (is.factor(x)) {
x= as.numeric(x)
return(x - mean(x, na.rm = T))
}
if (is.data.frame(x) || is.matrix(x)) {
m= matrix(nrow = nrow(x), ncol = ncol(x))
colnames(m)= paste("c", colnames(x), sep = "")
for (i in 1:ncol(x)) {
m[,i] = myCenter(x[,i])
}
return(as.data.frame(m))
}
}
lizCenter
This function provides a wrapper around myCenter allowing you to center a specific list of variables from a data frame.
- x: data frame
- listfname: a list of the variables to be centered (e.g. list(variable1, variable2))
The output is a copy of the data frame with a column (always a numeric variable) added for each of the centered variables. These columns are labelled with the each column’s previous name, but with “.ct” appended (e.g., “variable1” will become “variable1.ct”).
lizCenter = function(x, listfname)
{
for (i in 1:length(listfname))
{
fname = as.character(listfname[i])
x[paste(fname,".ct", sep="")] = myCenter(x[fname])
}
return(x)
}
lizContrasts
This function can be used to create two centered dummy variables which stand in place of a three-way factor (condition). This allows us to inspect each contrast separately, as well as their interactions with other factors. Other fixed effects in the model can be evaluated as the average effects across all levels of the factor.
The function takes a data frame (d), a factor from that database (condition), which must have three levels, and the name of the level of the factor which is to be used as the baseline for the contrasts (base level).
For example, if d is a data frame with a factor “condition” with three levels “(”lex_skew“,”lex_noskew“, and”mixed“) then lizContrasts(d, d$condition,”lex_no_skew“) returns a data frame with two (numeric) columns added labelled”lex_noskew_VERSUS_lex_mixed" and “lex_noskew_VERSUS_lex_skew”.
Wherever you would normally use “condition”in a formula in an LME, it can be replaced by (lex_noskew_VERSUS_lex_mixed + “lex_noskew_VERSUS_lex_skew) e.g. ~ (a * condition) becomes ~ (a * (lex_noskew_VERSUS_lex_mixed + lex_noskew_VERSUS_lex_skew)).
lizContrasts = function(d, condition, baselevel)
{
condition = factor(condition)
condition = relevel(condition, baselevel)
a = (contrasts(condition)-apply(contrasts(condition),2,mean))
d$dummy1[condition == rownames(a)[1]] <- a[1]
d$dummy1[condition == rownames(a)[2]] <- a[2]
d$dummy1[condition == rownames(a)[3]] <- a[3]
d$dummy2[condition == rownames(a)[1]] <- a[4]
d$dummy2[condition == rownames(a)[2]] <- a[5]
d$dummy2[condition == rownames(a)[3]] <- a[6]
name1 = paste(baselevel, rownames(a)[2],sep ="_VERSUS_")
name2 = paste(baselevel, rownames(a)[3],sep ="_VERSUS_")
d[name1] = d$dummy1
d[name2] = d$dummy2
d$dummy1 <-NULL
d$dummy2 <-NULL
return(d)
}
filter2
This is a function which filters a column of data removing values which a certain number of standard deviations above/below the mean for that participant, possibly in some condition/sub-condition.
- im: the input matrix (a data frame)
- svn: a list of the names of factors to be group by (subject name + one or more conditions)
- fn: the name of the column containing the data to be filtered
- lim: how many standard deviations above/below the mean to filter
The function returns an input matrix identical to the original input matrix but with additional columns giving the group means and the “filtered” data
filter2 = function(im, svn, fn, lim)
{
## work out means for each subject for each word
x = list()
y = ""
for(n in svn) x = append(im[n],x)
for(n in svn) y = paste(y,n,sep = "_")
means = aggregate(im[fn], by = x, mean, na.rm = T)
head(means)
nocols = dim(means)[2]
colnames(means)[nocols] = "means"
sds = aggregate(im[fn], by = x, sd, na.rm = T)
head(sds)
nocols = dim(sds)[2]
colnames(sds)[nocols] = "sds"
gs = merge(means,sds)
## because if there is just one value it does not have a standard deviation and we do not want to just disregard all of these
gs$sds[is.na(gs$sds)] = 0
gs$max = gs$means + lim*gs$sds
gs$min = gs$means - lim*gs$sds
im2 = merge(im, gs, sort = F)
im2[paste(fn, "filt", sep = "")] = im2[fn]
cn = dim(im2)[2] ## get colnumber (last one added)
im2[,cn][im2[,fn] > im2$max] = ""
im2[,cn][im2[,fn] < im2$min] = ""
im2[,cn] = as.numeric(im2[,cn])
names(im2)[names(im2) == "means"] = paste("mean", y, sep = "_")
names(im2)[names(im2) =="sds"] = paste("sd", y, sep = "_")
names(im2)[names(im2) =="max"] = paste("max", y, sep = "_")
names(im2)[names(im2) =="min"] = paste("min", y, sep = "_")
return(im2)
}
get_coeffs
This function allows us to inspect particular coefficients from the output of an LME model by putting them in table.
- x: the output returned when running lmer or glmer (i.e. an object of type lmerMod or glmerMod)
- list: a list of the names of the coefficients to be extracted (e.g. c(“variable1”, “variable1:variable2”))
get_coeffs <- function(x,list){(as.data.frame(summary(x)$coefficients)[list,])}