# 03_Descriptives_BigFive.R
# PeerJ-CS, "Personality-based pair programming"

# This script:
#   1) Reads the Big Five averages dataset (Stats_B5Avg).
#   2) Computes descriptive statistics and a Shapiro–Wilk normality test for
#      each Big Five trait, and writes them to "Descriptives_BigFive.txt".
#   3) Loads the main "Ready" dataset for demographic info.
#   4) Summarizes demographics.

# ------------------------------------------------
# Install and load required libraries
# ------------------------------------------------
required_packages <- c("openxlsx", "psych", "dplyr")

# Detect missing packages with requireNamespace(), which tests each name
# directly instead of building the full installed.packages() matrix.
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
new_packages <- required_packages[!is_installed]
if (length(new_packages) > 0) {
  install.packages(new_packages)
}

# Attach with library() so that a package which is still unavailable stops the
# script right here; require() would merely return FALSE and let a later call
# fail with an unrelated "could not find function" error.
# A for loop is used instead of a top-level lapply() so the list of return
# values is not auto-printed to the console.
for (pkg in required_packages) {
  library(pkg, character.only = TRUE)
}

# ------------------------------------------------
# Read the Big Five averages per participant
# (Created in "00_Stats_Computation_v4.R", step 3)
# ------------------------------------------------
# Path of the workbook with the Big Five averages
b5avg_file <- "Stats_WS2021+SS2022_B5Avg.xlsx"

# Load the workbook into a data frame
Stats_B5Avg <- read.xlsx(xlsxFile = b5avg_file)

# Report how many rows were read
n_b5avg_rows <- nrow(Stats_B5Avg)
cat("Loaded Stats_B5Avg with", n_b5avg_rows, "rows.\n")

# ------------------------------------------------
# Big Five Descriptives & Normality Tests
# ------------------------------------------------
# Trait columns to analyse and the text file that collects the results
b5_vars <- c("B5_O", "B5_C", "B5_E", "B5_A", "B5_N")
descriptives_file <- "Descriptives_BigFive.txt"

# Validate the input BEFORE clearing the output file: a missing column would
# otherwise yield NULL from `[[`, wipe the previous results, and then fail
# with an obscure error inside psych::describe() / shapiro.test().
missing_vars <- setdiff(b5_vars, names(Stats_B5Avg))
if (length(missing_vars) > 0) {
  stop(
    "Stats_B5Avg is missing expected column(s): ",
    paste(missing_vars, collapse = ", "),
    call. = FALSE
  )
}

# shapiro.test() requires numeric input; report offending columns by name
is_numeric_var <- vapply(Stats_B5Avg[b5_vars], is.numeric, logical(1))
if (!all(is_numeric_var)) {
  stop(
    "Stats_B5Avg column(s) are not numeric: ",
    paste(b5_vars[!is_numeric_var], collapse = ", "),
    call. = FALSE
  )
}

cat("", file = descriptives_file, append = FALSE)  # Clear previous content

for (v in b5_vars) {

  # Extract the vector of trait scores
  trait_scores <- Stats_B5Avg[[v]]

  # Descriptive statistics (psych::describe gives n, mean, sd, median, min,
  # max, skew, kurtosis)
  desc_results <- psych::describe(trait_scores)

  # Shapiro–Wilk normality test
  sw_results <- shapiro.test(trait_scores)

  # Print results to console
  cat("\n=== ", v, " ===\n", sep = "")
  cat("Descriptive statistics:\n")
  print(desc_results)
  cat("Shapiro–Wilk normality test:\n")
  print(sw_results)

  # Append the same results to the output file; capture.output() prints the
  # list (trait name, descriptives, test) and redirects that text to the file
  capture.output(
    list(
      Trait = v,
      DescriptiveStatistics = desc_results,
      ShapiroWilkTest = sw_results
    ),
    file = descriptives_file,
    append = TRUE
  )
}

cat("\nDescriptive statistics and Shapiro–Wilk tests completed.\n")
cat("Results saved in:", descriptives_file, "\n")

# ------------------------------------------------
# DEMOGRAPHICS
# ------------------------------------------------
# Here we read the "Ready" dataset created in "00_Stats_Computation_v4.R" step 4,
# which contains Student_ID, Experience_yrs, Gender, etc.

# Load the "Ready" workbook used for the demographic summary
df_file <- "Stats_WS2021+SS2022_Ready.xlsx"
df <- read.xlsx(df_file)
cat("\nLoaded df with", nrow(df), "rows for demographics.\n")

# 1) Keep only unique participants, ignoring duplicates
#    (.keep_all = TRUE retains the other columns of the first row per ID)
df_unique <- df %>%
  distinct(Student_ID, .keep_all = TRUE)

# 2) Report the total number of unique participants
n_participants <- nrow(df_unique)
cat("Number of unique participants:", n_participants, "\n")

# 3) Compute mean (and SD) of experience in years, ignoring missing values
mean_experience <- mean(df_unique$Experience_yrs, na.rm = TRUE)
sd_experience <- sd(df_unique$Experience_yrs, na.rm = TRUE)
cat("Avg. years of experience:", mean_experience, "SD:", sd_experience, "\n")

# 4) Count gender distribution
#    useNA = "ifany" adds an <NA> category only when Gender has missing
#    values, so the counts always sum to the number of unique participants;
#    table() would otherwise drop those rows silently.
gender_counts <- table(df_unique$Gender, useNA = "ifany")
cat("Gender distribution:\n")
print(gender_counts)

cat("\nDemographic summary completed.\n")