# Load the dataset. Expected layout: first column is the binary outcome
# `group` (0/1), columns 2-22 are categorical predictors, columns 23-29
# are continuous predictors.
# NOTE(review): hard-coded setwd() kept for compatibility with the original
# project layout; prefer running the script from the project directory.
setwd("F:\\project\\0-20240112\\code\\1 ")
data <- read.table("data.txt", header = TRUE, row.names = 1)

# 1. Split the dataset into training and validation groups (1:1 random split).
set.seed(123)
index <- sample(seq_len(nrow(data)), floor(nrow(data) * 0.5))
train_data <- data[index, ]
test_data <- data[-index, ]

# 2. Baseline tests in both cohorts.
# Use lapply() over the data.frame columns instead of apply(), which would
# coerce the whole sub-frame to a single-type matrix and lose column classes.
chi_square_train_results <- lapply(
  train_data[, 2:22],
  function(x) chisq.test(table(x, train_data[, 1]))
)
chi_square_test_results <- lapply(
  test_data[, 2:22],
  function(x) chisq.test(table(x, test_data[, 1]))
)

# Continuous variables (columns 23-29): Shapiro-Wilk normality, Bartlett
# homogeneity of variance, and Wilcoxon rank-sum group comparison.
normality_train_results <- lapply(train_data[, 23:29], shapiro.test)
variance_homogeneity_train_results <- lapply(
  train_data[, 23:29],
  function(x) bartlett.test(x ~ train_data[, 1])
)
wilcox_train_results <- lapply(
  train_data[, 23:29],
  function(x) wilcox.test(x ~ train_data[, 1])
)
normality_test_results <- lapply(test_data[, 23:29], shapiro.test)
variance_homogeneity_test_results <- lapply(
  test_data[, 23:29],
  function(x) bartlett.test(x ~ test_data[, 1])
)
wilcox_test_results <- lapply(
  test_data[, 23:29],
  function(x) wilcox.test(x ~ test_data[, 1])
)

# 3. Within-training-group comparison by `group` (first column, levels 0/1).
print("Chi-square test results:")
chi_square_results <- lapply(
  train_data[, 2:22],
  function(x) chisq.test(table(x, train_data$group))
)
print(chi_square_results)
# For continuous variables (columns 23-29), test normality and choose the
# appropriate two-group difference test (t-test vs Wilcoxon rank-sum).
normality_results <- lapply(train_data[, 23:29], shapiro.test)

for (i in 23:29) {
  var_name <- colnames(train_data)[i]
  print(paste("Normality test result for variable", var_name, ":"))
  print(normality_results[[i - 22]])
  # Shapiro-Wilk p > 0.05: no evidence against normality -> t-test;
  # otherwise fall back to the distribution-free Wilcoxon test.
  if (normality_results[[i - 22]]$p.value > 0.05) {
    print(paste("Performing t-test for", var_name))
    t_test_result <- t.test(train_data[, i] ~ train_data$group)
    print(t_test_result)
  } else {
    print(paste("Performing Wilcoxon test for", var_name))
    wilcox_test_result <- wilcox.test(train_data[, i] ~ train_data$group)
    print(wilcox_test_result)
  }
}

# 4. LASSO regression on the significant training-set features.
library(glmnet)

selected_features <- c("Histological_type", "T_stage", "M_stage",
                       "Vascular_tumor_thrombus", "Nerve_involvement",
                       "PMS2", "MSH2", "KRAS", "BRAF", "PIK3CA",
                       "Leukocyte", "Neutrophils")

# Fail fast if any selected feature name does not exist in train_data
# (the original only print()-ed the column names for a manual check).
print(colnames(train_data))
stopifnot(all(selected_features %in% colnames(train_data)))

x_train_selected <- as.matrix(train_data[, selected_features])
y_train <- as.numeric(train_data[, "group"])

# Cross-validated LASSO (alpha = 1) for a binary outcome.
lasso_model <- cv.glmnet(x_train_selected, y_train,
                         family = "binomial", alpha = 1)

# Plot 1: coefficient path. FIX: plot.cv.glmnet() ignores `xvar`/`label`;
# the path must be drawn from the underlying glmnet fit.
plot(lasso_model$glmnet.fit, xvar = "lambda", label = TRUE)
# Plot 2: cross-validation deviance vs log(lambda).
plot(lasso_model)
# Construct a multivariate logistic regression model using the
# LASSO-selected features.
logistic_model <- glm(
  as.formula(paste("group ~", paste(selected_features, collapse = " + "))),
  data = train_data, family = "binomial"
)
summary(logistic_model)

# 5. Final model and nomogram from the LASSO-retained factors.
# FIX: the factor names must match the actual column names used above — the
# original listed lowercase variants ("vascular_tumor_thrombus",
# "nerve_involvement", "leukocyte_count", "neutrophil_count") that do not
# exist in train_data and would make the subsetting below error out.
selected_factors <- c("T_stage", "M_stage", "Vascular_tumor_thrombus",
                      "Nerve_involvement", "PMS2", "MSH2", "KRAS", "BRAF",
                      "PIK3CA", "Leukocyte", "Neutrophils")

x_train_selected <- as.matrix(train_data[, selected_factors])
y_train <- as.numeric(train_data[, 1])
lasso_model <- cv.glmnet(x_train_selected, y_train,
                         family = "binomial", alpha = 1)

# Refit the logistic model on the reduced factor set.
logistic_model <- glm(
  as.formula(paste("group ~", paste(selected_factors, collapse = " + "))),
  data = train_data, family = "binomial"
)

# Nomogram via rms::lrm. FIX: x = TRUE / y = TRUE store the design matrix
# and response inside the fit, which rms::calibrate() requires later.
library(rms)
dd <- datadist(train_data)
options(datadist = "dd")
nomogram_model <- lrm(
  as.formula(paste("group ~", paste(selected_factors, collapse = " + "))),
  data = train_data, x = TRUE, y = TRUE
)
# fun = plogis maps the linear predictor to predicted probability.
nomogram_plot <- nomogram(nomogram_model, fun = plogis)
plot(nomogram_plot)
# 6. Calibration curve, ROC curve, and DCA analysis.
library(pROC)
library(rmda)

# Calibration (bootstrap, training set). rms::calibrate() needs the design
# matrix and response stored in the fit; refit defensively with x/y = TRUE.
nomogram_model <- update(nomogram_model, x = TRUE, y = TRUE)
calibration <- calibrate(nomogram_model, method = "boot", B = 1000)
plot(calibration)

# ROC curve on the training set.
train_pred <- predict(logistic_model, train_data, type = "response")
roc_curve <- roc(train_data[, 1], train_pred)
plot(roc_curve, print.auc = TRUE)

# DCA analysis. FIX: rmda exposes decision_curve()/plot_decision_curve(),
# not dca(). fitted.risk = TRUE tells rmda the predictor is already a
# predicted probability, so it is not refit internally.
train_dca_data <- data.frame(group = train_data[, 1], pred = train_pred)
dca_results <- decision_curve(group ~ pred, data = train_dca_data,
                              fitted.risk = TRUE)
plot_decision_curve(dca_results, curve.names = "Nomogram (train)")

# Validate in the validation group.
test_pred <- predict(logistic_model, test_data, type = "response")
roc_curve_test <- roc(test_data[, 1], test_pred)
plot(roc_curve_test, print.auc = TRUE)

# FIX: rms::calibrate() has no `newdata` argument; external calibration of
# held-out predictions is assessed with rms::val.prob() instead.
val.prob(test_pred, as.numeric(test_data[, 1]))

dca_data_test <- data.frame(group = test_data[, 1], pred = test_pred)
dca_results_test <- decision_curve(group ~ pred, data = dca_data_test,
                                   fitted.risk = TRUE)
plot_decision_curve(dca_results_test, curve.names = "Nomogram (test)")