# 00_Stats_Computation_v4.R # PeerJ-CS, "Personality-based pair programming" # # This script: # 1) Reads raw experiment data from the specified Excel file(s). # 2) Preprocesses and factorizes Big Five and Role columns. # 3) Creates per-session data files and aggregated Big Five averages. # 4) Writes out three key files: # - Stats_..._Session_Data.xlsx # - Stats_..._B5Avg.xlsx # - Stats_..._Ready.xlsx # ------------------------------------------------ # Install and load required packages # ------------------------------------------------ required_packages <- c("openxlsx", "dplyr", "digest") new_packages <- required_packages[!(required_packages %in% installed.packages()[,"Package"])] if(length(new_packages)) { install.packages(new_packages) } lapply(required_packages, require, character.only = TRUE) # ------------------------------------------------ # Define dataset filenames (adjust as needed) # ------------------------------------------------ datasets <- c( # "Experiment_SS2022.xlsx", "Experiment_WS2021+SS2022.xlsx" ) # ------------------------------------------------ # Process each dataset in a loop # ------------------------------------------------ for (file in datasets) { # ------------------------------------------------ # Step 1: Preprocessing # ------------------------------------------------ cat("\nReading file:", file, "\n") data <- read.xlsx(file, sheet = "Sheet1") # Rename columns for readability colnames(data) <- c( 'ID', 'Start_time', 'Completion_time', 'Email', 'Name', 'Consent_read', 'Experience_yrs', 'Gender', '01_01', '01_02', '01_03', '01_04', '01_05', '01_06', '01_07', '01_08', '01_09', '01_10', '03_01', '03_02', '03_03', '03_04', '03_05', '03_06', '03_07', '03_Role', '04_01', '04_02', '04_03', '04_04', '04_05', '04_06', '04_07', '04_Role', '05_01', '05_02', '05_03', '05_04', '05_05', '05_06', '05_07', '05_Role', '06_01', '06_02', '06_03', '06_04', '06_05', '06_06', '06_07', '06_Role', '07_01', '07_02', '07_03', '07_04', '07_05', '07_06', '07_07', '07_Role', '08_01', '08_02', '08_03', '08_04', '08_05', '08_06', '08_07', '08_Role', 'Ps_results_flag', 'Initials' ) # Factorize Likert values (1–5) likert_levels <- c("Disagree strongly", "Disagree a little", "Neither agree nor disagree", "Agree a little", "Agree strongly") data[9:66] <- lapply(colnames(data)[9:66], function(col_name) { if (grepl("Role", col_name)) { # Skip factorization for Role columns return(data[[col_name]]) } else { # Convert to numeric factor (1–5) factor(data[[col_name]], levels = likert_levels, labels = 1:5) } }) # ------------------------------------------------ # Role mapping # ------------------------------------------------ # Special "disagree/agree" statements come from a custom question that # determined the user’s chosen role (Pilot, Solo, Navigator). role_mapping <- function(x) { x <- trimws(tolower(x)) ifelse(x == "disagree strongly", "Pilot", ifelse(x == "neither agree nor disagree", "Solo", ifelse(x == "agree strongly", "Navigator", NA) ) ) } role_columns <- c('03_Role', '04_Role', '05_Role', '06_Role', '07_Role', '08_Role') data[role_columns] <- lapply(data[role_columns], function(x) { factor(role_mapping(x), levels = c("Pilot", "Solo", "Navigator")) }) # Convert factor columns to numeric where appropriate data[9:66] <- lapply(data[9:66], as.numeric) # Clean 'Initials' and generate Student_ID via hashing data$Initials <- tolower(substring(data$Initials, nchar(data$Initials)-5)) data$Student_ID <- sapply(data$Initials, function(x) digest::digest(x, algo = "md5")) # Create a variable for the experiment round (Exc_round) data$Start_time <- as.Date(data$Start_time, origin = "1899-12-30") data$Exc_round <- ceiling((as.numeric(data$Start_time) - 44652) / 7) data$Exc_round <- ifelse(data$Exc_round == 7, 3, data$Exc_round) # ------------------------------------------------ # Step 2: Create Stats_Session_Data.xlsx # ------------------------------------------------ # Reverse-coded items: 01_01, 01_03, 01_04, 01_05, 01_07 are reversed, # so we do (6 - data$...) Stats_Session_Data <- data.frame( Start_time = data$Start_time, Completion_time = data$Completion_time, Experience_yrs = data$Experience_yrs, Gender = data$Gender, Student_ID = data$Student_ID, Initials = data$Initials, Exc_round = data$Exc_round, B5_O = ((6 - data$`01_05`) + data$`01_10`) / 2, B5_C = ((6 - data$`01_03`) + data$`01_08`) / 2, B5_E = ((6 - data$`01_01`) + data$`01_06`) / 2, B5_A = (data$`01_02` + (6 - data$`01_07`)) / 2, B5_N = ((6 - data$`01_04`) + data$`01_09`) / 2, INNER_R1 = rowSums(data[, c('03_01','03_02','03_03','03_04','03_05','03_06','03_07')]), INNER_R2 = rowSums(data[, c('04_01','04_02','04_03','04_04','04_05','04_06','04_07')]), INNER_R3 = rowSums(data[, c('05_01','05_02','05_03','05_04','05_05','05_06','05_07')]), INNER_R4 = rowSums(data[, c('06_01','06_02','06_03','06_04','06_05','06_06','06_07')]), INNER_R5 = rowSums(data[, c('07_01','07_02','07_03','07_04','07_05','07_06','07_07')]), INNER_R6 = rowSums(data[, c('08_01','08_02','08_03','08_04','08_05','08_06','08_07')]), Role_01 = data$`03_Role`, Role_02 = data$`04_Role`, Role_03 = data$`05_Role`, Role_04 = data$`06_Role`, Role_05 = data$`07_Role`, Role_06 = data$`08_Role` ) # Write out the session-level data output_prefix <- gsub("Experiment_", "Stats_", sub("\\.xlsx", "", file)) session_file <- paste0(output_prefix, "_Session_Data.xlsx") write.xlsx(Stats_Session_Data, file = session_file, colNames = TRUE, overwrite = TRUE) cat("Wrote:", session_file, "\n") # ------------------------------------------------ # Step 3: Create Stats_B5Avg.xlsx # ------------------------------------------------ library(dplyr) # ensure dplyr is loaded for group_by, summarize Stats_B5Avg <- Stats_Session_Data %>% group_by(Student_ID, Initials) %>% summarize( B5_O = round(mean(B5_O, na.rm = TRUE), 4), B5_C = round(mean(B5_C, na.rm = TRUE), 4), B5_E = round(mean(B5_E, na.rm = TRUE), 4), B5_A = round(mean(B5_A, na.rm = TRUE), 4), B5_N = round(mean(B5_N, na.rm = TRUE), 4), .groups = "drop" ) b5avg_file <- paste0(output_prefix, "_B5Avg.xlsx") write.xlsx(Stats_B5Avg, file = b5avg_file, colNames = TRUE, overwrite = TRUE) cat("Wrote:", b5avg_file, "\n") # ------------------------------------------------ # Step 4: Create Stats_Ready.xlsx # ------------------------------------------------ Stats_Ready <- Stats_Session_Data for (i in 1:nrow(Stats_B5Avg)) { current_initials <- Stats_B5Avg$Initials[i] avg_B5 <- Stats_B5Avg[i, c("B5_O", "B5_C", "B5_E", "B5_A", "B5_N")] # Replace the B5 trait scores in Stats_Ready with the average, for each participant Stats_Ready[Stats_Ready$Initials == current_initials, c("B5_O", "B5_C", "B5_E", "B5_A", "B5_N")] <- avg_B5 } ready_file <- paste0(output_prefix, "_Ready.xlsx") write.xlsx(Stats_Ready, file = ready_file, colNames = TRUE, overwrite = TRUE) cat("Wrote:", ready_file, "\n") cat("\nProcessing complete for:", file, "\n") } cat("\nAll datasets processed.\n")