# Load the dataset. Expected layout: first column is the binary outcome
# `group` (0/1), columns 2-22 are categorical predictors, columns 23-29
# are continuous predictors.
# NOTE(review): hard-coded setwd() kept for compatibility with the original
# project layout; prefer running the script from the project directory.
setwd("F:\\project\\0-20240112\\code\\1 ")
data <- read.table("data.txt", header = TRUE, row.names = 1)

# 1. Split the dataset into training and validation groups (1:1 random split).
set.seed(123)
index <- sample(seq_len(nrow(data)), floor(nrow(data) * 0.5))
train_data <- data[index, ]
test_data <- data[-index, ]

# 2. Baseline tests in both cohorts.
# Use lapply() over the data.frame columns instead of apply(), which would
# coerce the whole sub-frame to a single-type matrix and lose column classes.
chi_square_train_results <- lapply(
  train_data[, 2:22],
  function(x) chisq.test(table(x, train_data[, 1]))
)
chi_square_test_results <- lapply(
  test_data[, 2:22],
  function(x) chisq.test(table(x, test_data[, 1]))
)

# Continuous variables (columns 23-29): Shapiro-Wilk normality, Bartlett
# homogeneity of variance, and Wilcoxon rank-sum group comparison.
normality_train_results <- lapply(train_data[, 23:29], shapiro.test)
variance_homogeneity_train_results <- lapply(
  train_data[, 23:29],
  function(x) bartlett.test(x ~ train_data[, 1])
)
wilcox_train_results <- lapply(
  train_data[, 23:29],
  function(x) wilcox.test(x ~ train_data[, 1])
)
normality_test_results <- lapply(test_data[, 23:29], shapiro.test)
variance_homogeneity_test_results <- lapply(
  test_data[, 23:29],
  function(x) bartlett.test(x ~ test_data[, 1])
)
wilcox_test_results <- lapply(
  test_data[, 23:29],
  function(x) wilcox.test(x ~ test_data[, 1])
)

# 3. Within-training-group comparison by `group` (first column, levels 0/1).
print("Chi-square test results:")
chi_square_results <- lapply(
  train_data[, 2:22],
  function(x) chisq.test(table(x, train_data$group))
)
print(chi_square_results)
# For continuous variables (columns 23-29), test normality and choose the
# appropriate two-group difference test (t-test vs Wilcoxon rank-sum).
normality_results <- lapply(train_data[, 23:29], shapiro.test)

for (i in 23:29) {
  var_name <- colnames(train_data)[i]
  print(paste("Normality test result for variable", var_name, ":"))
  print(normality_results[[i - 22]])
  # Shapiro-Wilk p > 0.05: no evidence against normality -> t-test;
  # otherwise fall back to the distribution-free Wilcoxon test.
  if (normality_results[[i - 22]]$p.value > 0.05) {
    print(paste("Performing t-test for", var_name))
    t_test_result <- t.test(train_data[, i] ~ train_data$group)
    print(t_test_result)
  } else {
    print(paste("Performing Wilcoxon test for", var_name))
    wilcox_test_result <- wilcox.test(train_data[, i] ~ train_data$group)
    print(wilcox_test_result)
  }
}

# 4. LASSO regression on the significant training-set features.
library(glmnet)

selected_features <- c("Histological_type", "T_stage", "M_stage",
                       "Vascular_tumor_thrombus", "Nerve_involvement",
                       "PMS2", "MSH2", "KRAS", "BRAF", "PIK3CA",
                       "Leukocyte", "Neutrophils")

# Fail fast if any selected feature name does not exist in train_data
# (the original only print()-ed the column names for a manual check).
print(colnames(train_data))
stopifnot(all(selected_features %in% colnames(train_data)))

x_train_selected <- as.matrix(train_data[, selected_features])
y_train <- as.numeric(train_data[, "group"])

# Cross-validated LASSO (alpha = 1) for a binary outcome.
lasso_model <- cv.glmnet(x_train_selected, y_train,
                         family = "binomial", alpha = 1)

# Plot 1: coefficient path. FIX: plot.cv.glmnet() ignores `xvar`/`label`;
# the path must be drawn from the underlying glmnet fit.
plot(lasso_model$glmnet.fit, xvar = "lambda", label = TRUE)
# Plot 2: cross-validation deviance vs log(lambda).
plot(lasso_model)
# Construct a multivariate logistic regression model using the
# LASSO-selected features.
logistic_model <- glm(
  as.formula(paste("group ~", paste(selected_features, collapse = " + "))),
  data = train_data, family = "binomial"
)
summary(logistic_model)

# 5. Final model and nomogram from the LASSO-retained factors.
# FIX: the factor names must match the actual column names used above — the
# original listed lowercase variants ("vascular_tumor_thrombus",
# "nerve_involvement", "leukocyte_count", "neutrophil_count") that do not
# exist in train_data and would make the subsetting below error out.
selected_factors <- c("T_stage", "M_stage", "Vascular_tumor_thrombus",
                      "Nerve_involvement", "PMS2", "MSH2", "KRAS", "BRAF",
                      "PIK3CA", "Leukocyte", "Neutrophils")

x_train_selected <- as.matrix(train_data[, selected_factors])
y_train <- as.numeric(train_data[, 1])
lasso_model <- cv.glmnet(x_train_selected, y_train,
                         family = "binomial", alpha = 1)

# Refit the logistic model on the reduced factor set.
logistic_model <- glm(
  as.formula(paste("group ~", paste(selected_factors, collapse = " + "))),
  data = train_data, family = "binomial"
)

# Nomogram via rms::lrm. FIX: x = TRUE / y = TRUE store the design matrix
# and response inside the fit, which rms::calibrate() requires later.
library(rms)
dd <- datadist(train_data)
options(datadist = "dd")
nomogram_model <- lrm(
  as.formula(paste("group ~", paste(selected_factors, collapse = " + "))),
  data = train_data, x = TRUE, y = TRUE
)
# fun = plogis maps the linear predictor to predicted probability.
nomogram_plot <- nomogram(nomogram_model, fun = plogis)
plot(nomogram_plot)
# 6. Calibration curve, ROC curve, and DCA analysis.
library(pROC)
library(rmda)

# Calibration (bootstrap, training set). rms::calibrate() needs the design
# matrix and response stored in the fit; refit defensively with x/y = TRUE.
nomogram_model <- update(nomogram_model, x = TRUE, y = TRUE)
calibration <- calibrate(nomogram_model, method = "boot", B = 1000)
plot(calibration)

# ROC curve on the training set.
train_pred <- predict(logistic_model, train_data, type = "response")
roc_curve <- roc(train_data[, 1], train_pred)
plot(roc_curve, print.auc = TRUE)

# DCA analysis. FIX: rmda exposes decision_curve()/plot_decision_curve(),
# not dca(). fitted.risk = TRUE tells rmda the predictor is already a
# predicted probability, so it is not refit internally.
train_dca_data <- data.frame(group = train_data[, 1], pred = train_pred)
dca_results <- decision_curve(group ~ pred, data = train_dca_data,
                              fitted.risk = TRUE)
plot_decision_curve(dca_results, curve.names = "Nomogram (train)")

# Validate in the validation group.
test_pred <- predict(logistic_model, test_data, type = "response")
roc_curve_test <- roc(test_data[, 1], test_pred)
plot(roc_curve_test, print.auc = TRUE)

# FIX: rms::calibrate() has no `newdata` argument; external calibration of
# held-out predictions is assessed with rms::val.prob() instead.
val.prob(test_pred, as.numeric(test_data[, 1]))

dca_data_test <- data.frame(group = test_data[, 1], pred = test_pred)
dca_results_test <- decision_curve(group ~ pred, data = dca_data_test,
                                   fitted.risk = TRUE)
plot_decision_curve(dca_results_test, curve.names = "Nomogram (test)")