# K-fold cross-validated C-index comparison ----
#
# For each candidate variable this script fits a univariable Cox model
# (Surv(OS, CENSOR) ~ variable) on the training folds, computes the
# concordance index on both the training and the held-out fold, and then
# draws box plots of the per-fold C-index values, faceted by variable.
#
# Objects left in the workspace:
#   all_results - one row per (variable, fold, set) with columns
#                 Fold, C_Index, Type, variable
#   annotations - per-variable mean C-index for each set, plus the mean
#                 squared train/validation C-index gap (column MSE)

# Load necessary packages
library(survival)
library(survcomp)
library(ggplot2)
library(gridExtra)

# Set the working directory to the specified path.
# NOTE(review): a hard-coded setwd() makes the script machine-specific;
# consider running it from the project directory instead.
setwd("C:/Users/…")

# Read the CSV file into a data frame
R1024 <- read.csv(file = "XXX.csv", header = TRUE, sep = ",")

# Set a random seed for reproducibility
set.seed(2000)

# Number of cross-validation folds and the variables to evaluate
k <- 5
variables <- c("NLR", "PLR", "SII", "MLR")

# Fail early with a clear message instead of deep inside coxph()/predict()
n_obs <- nrow(R1024)
stopifnot(
  "Input data is missing required columns" =
    all(c("OS", "CENSOR", variables) %in% names(R1024)),
  "Need at least as many rows as folds" = n_obs >= k
)

# Per-variable pieces are collected in lists and bound once after the loop,
# which avoids growing data frames with rbind() on every iteration.
results_by_var <- vector("list", length(variables))
annotations_by_var <- vector("list", length(variables))

# Perform K-fold cross-validation for each variable
for (v in seq_along(variables)) {
  var <- variables[[v]]

  # Balanced fold labels: every fold gets floor(n/k) or ceiling(n/k) rows.
  # Drawing labels independently with replacement (the previous approach)
  # can leave folds badly unbalanced or even empty.
  folds <- sample(rep(seq_len(k), length.out = n_obs))

  train_cindex_list <- numeric(k)
  test_cindex_list <- numeric(k)

  # The model formula does not depend on the fold, so build it once
  cox_formula <- as.formula(paste("Surv(OS, CENSOR) ~", var))

  for (i in seq_len(k)) {
    # Split the data and train the model
    in_test <- folds == i
    train_data <- R1024[!in_test, ]
    test_data <- R1024[in_test, ]
    cox_model <- coxph(cox_formula, data = train_data)

    # Relative-risk predictions for the training and validation sets
    train_pred <- predict(cox_model, newdata = train_data, type = "risk")
    test_pred <- predict(cox_model, newdata = test_data, type = "risk")

    # C-index for the training and validation sets
    train_cindex_list[i] <- concordance.index(
      train_pred,
      surv.time = train_data$OS,
      surv.event = train_data$CENSOR
    )$c.index
    test_cindex_list[i] <- concordance.index(
      test_pred,
      surv.time = test_data$OS,
      surv.event = test_data$CENSOR
    )$c.index
  }

  # Squared gap between training and validation C-index, per fold
  mse_list <- (train_cindex_list - test_cindex_list)^2

  # Summarize the results for the current variable
  mean_train_cindex <- mean(train_cindex_list, na.rm = TRUE)
  mean_test_cindex <- mean(test_cindex_list, na.rm = TRUE)
  mean_mse <- mean(mse_list, na.rm = TRUE)

  # Simplified variable name: strip digits and dots
  short_var_name <- gsub("[0-9.]+", "", var)

  # Per-fold values for plotting
  cindex_df <- data.frame(
    Fold = rep(seq_len(k), 2),
    C_Index = c(train_cindex_list, test_cindex_list),
    Type = rep(c("Training set", "Validation set"), each = k)
  )
  cindex_df$variable <- short_var_name
  results_by_var[[v]] <- cindex_df

  # Annotation rows for the current variable (validation first, then
  # training); both rows carry the same mean squared gap.
  annotations_by_var[[v]] <- data.frame(
    variable = short_var_name,
    Type = c("Validation set mean C-index", "Training set mean C-index"),
    C_Index = c(mean_test_cindex, mean_train_cindex),
    MSE = mean_mse
  )
}

all_results <- do.call(rbind, results_by_var)
annotations <- do.call(rbind, annotations_by_var)

# Plot boxplots for all variables
cindex_plot <- ggplot(all_results, aes(x = Type, y = C_Index, fill = Type)) +
  geom_boxplot(width = 0.2) +
  facet_wrap(~ variable, scales = "fixed") + # same y-scale in every facet
  labs(
    title = "Distribution of C-Index for different Systemic Inflammation Indices",
    x = NULL,
    y = "C-index"
  ) +
  theme_minimal() +
  theme(
    axis.title = element_text(size = 14),
    axis.text = element_text(size = 12),
    plot.title = element_text(size = 16, face = "bold"),
    strip.text = element_text(size = 12),
    legend.position = "bottom"
  ) +
  scale_fill_manual(values = c("lightblue", "salmon"))

# Print explicitly so the figure is also drawn when the script is source()d
print(cindex_plot)