# Point the session at the folder that holds the input file.
# NOTE(review): a hard-coded setwd() ties the script to one machine; the path
# is kept exactly as it was.
setwd("C:/Users/…")

# Load the study data. read.csv() already defaults to header = TRUE and
# sep = ",", so only the file name needs to be given.
R1024 <- read.csv("XXX.csv")

# Load necessary packages
library(survival)
library(survcomp)
library(ggplot2)
library(gridExtra)

# Set a random seed for reproducibility
set.seed(2000)

# Number of cross-validation folds and the candidate variables; each entry of
# `variables` is used as the single predictor of a Cox model further down.
k <- 5
variables <- c("NLR", "PLR", "SII", "MLR")

# Empty accumulator for the per-fold C-index values. Column names and order
# match the per-fold frames appended in the cross-validation loop
# (Fold, C_Index, Type, variable). The previous schema declared a lower-case
# `fold` column that never matched the appended `Fold` column and only worked
# because rbind() drops zero-row data frames.
all_results <- data.frame(
  Fold = integer(),
  C_Index = numeric(),
  Type = character(),
  variable = character(),
  stringsAsFactors = FALSE
)

# Empty accumulator for the per-variable summary rows (mean C-index per set
# plus the mean squared train/validation gap stored as `MSE`).
annotations <- data.frame(
  variable = character(),
  Type = character(),
  C_Index = numeric(),
  MSE = numeric(),
  stringsAsFactors = FALSE
)

# Perform K-fold cross-validation for each variable.
#
# For every entry of `variables` a univariable Cox model
# `Surv(OS, CENSOR) ~ <variable>` is fitted on the rows outside fold i and
# scored with the C-index on both the training rows and the held-out fold.
stopifnot(k >= 2, nrow(R1024) >= k)

# Balanced fold labels: shuffling a repeated 1..k sequence gives every fold
# floor(n / k) or ceiling(n / k) rows, so no fold can be empty (independent
# draws with replacement allowed that). The same partition is reused for every
# variable so that differences between variables are not confounded by
# differences between random splits.
folds <- sample(rep(seq_len(k), length.out = nrow(R1024)))

# Per-variable results are collected in lists and bound once after the loop
# instead of growing the data frames with rbind() on every iteration.
fold_results <- vector("list", length(variables))
summary_rows <- vector("list", length(variables))

for (v in seq_along(variables)) {
  var <- variables[[v]]
  train_cindex_list <- numeric(k)
  test_cindex_list <- numeric(k)

  # The model formula does not depend on the fold, so build it once
  cox_formula <- as.formula(paste("Surv(OS, CENSOR) ~", var))

  for (i in seq_len(k)) {
    # Split the data and train the model
    is_test <- folds == i
    train_data <- R1024[!is_test, ]
    test_data <- R1024[is_test, ]
    cox_model <- coxph(cox_formula, data = train_data)

    # Calculate the C-index for training and validation sets.
    # na.rm = TRUE: rows with a missing prediction, time or event indicator
    # are left out of the concordance calculation, in line with coxph()
    # omitting incomplete rows when fitting.
    train_pred <- predict(cox_model, newdata = train_data, type = "risk")
    test_pred <- predict(cox_model, newdata = test_data, type = "risk")
    train_cindex_list[i] <- concordance.index(
      train_pred,
      surv.time = train_data$OS,
      surv.event = train_data$CENSOR,
      na.rm = TRUE
    )$c.index
    test_cindex_list[i] <- concordance.index(
      test_pred,
      surv.time = test_data$OS,
      surv.event = test_data$CENSOR,
      na.rm = TRUE
    )$c.index
  }

  # NOTE: despite the name, this is the squared difference between the
  # training and validation C-index of each fold, not an error of predictions.
  mse_list <- (train_cindex_list - test_cindex_list)^2

  # Summarize the results for the current variable
  mean_train_cindex <- mean(train_cindex_list, na.rm = TRUE)
  mean_test_cindex <- mean(test_cindex_list, na.rm = TRUE)
  mean_mse <- mean(mse_list, na.rm = TRUE)

  # Create a simplified variable name (digits and dots stripped)
  short_var_name <- gsub("[0-9.]+", "", var)

  # Per-fold values in long format for plotting
  cindex_df <- data.frame(
    Fold = rep(seq_len(k), 2),
    C_Index = c(train_cindex_list, test_cindex_list),
    Type = rep(c("Training set", "Validation set"), each = k)
  )
  cindex_df$variable <- short_var_name
  fold_results[[v]] <- cindex_df

  # Annotation rows for the current variable: validation mean first, then
  # training mean, both carrying the same mean squared gap.
  summary_rows[[v]] <- data.frame(
    variable = short_var_name,
    Type = c("Validation set mean C-index", "Training set mean C-index"),
    C_Index = c(mean_test_cindex, mean_train_cindex),
    MSE = mean_mse
  )
}

# Bind everything once; the (empty) accumulators are kept as the first
# argument so previously stored rows, if any, are preserved.
all_results <- rbind(all_results, do.call(rbind, fold_results))
annotations <- rbind(annotations, do.call(rbind, summary_rows))

# Box plots of the per-fold C-index: one panel per variable, training and
# validation sets side by side.
ggplot(data = all_results, mapping = aes(Type, C_Index, fill = Type)) +
  geom_boxplot(width = 0.2) +
  # All panels share the same axis scales
  facet_wrap(~ variable, scales = "fixed") +
  scale_fill_manual(values = c("lightblue", "salmon")) +
  ggtitle("Distribution of C-Index for different Systemic Inflammation Indices") +
  xlab(NULL) +
  ylab("C-index") +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 12),
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    legend.position = "bottom"
  )