## ---- Train/test split (70/30) ----
mydata <- read.csv("D:/mydata.csv")

set.seed(888)  # fix the seed so the shuffle, and therefore the split, is reproducible
mydata_shuffled <- mydata[sample(nrow(mydata)), ]
split_point <- round(nrow(mydata_shuffled) * 0.7)
train_data <- mydata_shuffled[1:split_point, ]
test_data  <- mydata_shuffled[(split_point + 1):nrow(mydata_shuffled), ]
write.csv(train_data, "D:/train_data.csv", row.names = FALSE)
write.csv(test_data,  "D:/test_data.csv",  row.names = FALSE)

## ---- Z-score standardization ----
library(caret)
library(dplyr)

df  <- read.csv("D:/train_data.csv")
df1 <- read.csv("D:/test_data.csv")

# Standardize every numeric column except the first (label) column.
# Note: train and test are standardized independently here; see the
# leakage-free sketch after this section for the alternative.
Zscore <- function(df, filename) {
  dfC <- df[, 1]
  dfZ <- df[, 2:ncol(df)]
  numeric_cols <- sapply(dfZ, is.numeric)
  df_scaled <- dfZ
  df_scaled[, numeric_cols] <- scale(dfZ[, numeric_cols])
  write.csv(cbind(dfC, df_scaled), filename, row.names = FALSE)
}
Zscore(df,  "D:/Z_train_data.csv")
Zscore(df1, "D:/Z_test_data.csv")

## ---- Univariate logistic regression screening ----
df_train <- read.csv("C:/Users/lswww/Desktop/train_data.csv")
df_test  <- read.csv("C:/Users/lswww/Desktop/test_data.csv")
train_data      <- df_train[, 1:26]
validation_data <- df_test[, 1:26]
train_predictors <- train_data[, -1]   # column 1 is the binary Label

# Fit one logistic model per predictor and collect OR, 95% CI and p-value.
# (The same loop can be run on validation_data if validation ORs are needed.)
regression_results <- list()
for (predictor_name in names(train_predictors)) {
  train_model <- glm(reformulate(predictor_name, response = "Label"),
                     data = train_data, family = binomial(link = "logit"))
  train_conf_int <- confint(train_model, level = 0.95)  # profile-likelihood CI
  regression_results[[predictor_name]] <- data.frame(
    Variable = predictor_name,
    OR       = exp(coef(train_model)[2]),
    Lower_CI = exp(train_conf_int[2, 1]),
    Upper_CI = exp(train_conf_int[2, 2]),
    P_value  = summary(train_model)$coefficients[2, 4]
  )
}
final_results <- do.call(rbind, regression_results)
write.csv(final_results, "D:/univariate_logistic_results.csv", row.names = FALSE)

# Keep predictors with p <= 0.05 in both sets.
selected_features <- final_results %>% filter(P_value <= 0.05) %>% pull(Variable)
train_data_selected <- train_data %>% select(Label, all_of(selected_features))
test_data_selected  <- validation_data %>% select(Label, all_of(selected_features))
write.csv(train_data_selected, "D:/train_univariate_selected.csv", row.names = FALSE)
write.csv(test_data_selected,  "D:/test_univariate_selected.csv",  row.names = FALSE)
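## ---- (Sketch) leakage-free standardization ----
# A caveat on the Zscore helper above: it standardizes the training and test
# files independently, so the test features are scaled with their own means
# and SDs. A minimal, hypothetical alternative (scale_with_train is not part
# of the original pipeline) reuses the training parameters for the test set.
# Assumes column 1 is the label and all remaining columns are numeric.
scale_with_train <- function(train_df, test_df) {
  feats <- 2:ncol(train_df)
  scaled_train <- scale(train_df[, feats])             # stores center/scale as attributes
  centers <- attr(scaled_train, "scaled:center")
  sds     <- attr(scaled_train, "scaled:scale")
  scaled_test <- scale(test_df[, feats], center = centers, scale = sds)
  list(train = cbind(train_df[1], scaled_train),
       test  = cbind(test_df[1],  scaled_test))
}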
## ---- Multivariate logistic regression screening ----
library(DMwR)   # archived on CRAN; see the smotefamily sketch below if it will not install
library(tidyverse)
library(pROC)
library(caret)
library(e1071)
library(rmda)

train_data <- read.csv("D:/train_univariate_selected.csv")
test_data  <- read.csv("D:/test_univariate_selected.csv")
response    <- train_data[[1]]
predictors  <- train_data[, -1]
response1   <- test_data[[1]]
predictors1 <- test_data[, -1]

# Multivariable model on all univariately selected features.
model <- glm(response ~ ., data = predictors, family = binomial(link = "logit"))
summary_model <- summary(model)
coef_table <- summary_model$coefficients

# Wald ORs and 95% CIs from the coefficient table.
or_values <- exp(coef_table[, "Estimate"])
se        <- coef_table[, "Std. Error"]
lower_ci  <- exp(coef_table[, "Estimate"] - 1.96 * se)
upper_ci  <- exp(coef_table[, "Estimate"] + 1.96 * se)
p_values  <- coef_table[, "Pr(>|z|)"]
results_df <- data.frame(
  Term     = rownames(coef_table),
  OR       = or_values,
  Lower_CI = lower_ci,
  Upper_CI = upper_ci,
  P_value  = p_values
)
write.csv(results_df, "D:/multivariate_logistic_results.csv", row.names = FALSE)

# Keep features significant in the multivariable model (intercept excluded).
selected_features <- setdiff(names(p_values)[p_values <= 0.05], "(Intercept)")
train_data_selected <- train_data %>% select(Label, all_of(selected_features))
test_data_selected  <- test_data %>% select(Label, all_of(selected_features))
write.csv(train_data_selected, "D:/train_multivariate_selected.csv", row.names = FALSE)
write.csv(test_data_selected,  "D:/test_multivariate_selected.csv",  row.names = FALSE)

## ---- Model building: nomogram (clinics + radiomics), clinics-only, radiomics-only ----
library(writexl)
library(ResourceSelection)

df  <- read.csv("D:/output_train_feature.csv")
df1 <- read.csv("D:/output_validation_feature.csv")
df$Label  <- factor(df$Label)
df1$Label <- factor(df1$Label)

train_data  <- df[, 1:5]       # Label + clinical features + Radscore
test_data   <- df1[, 1:5]
train_data1 <- df[, 1:4]       # Label + clinical features only
test_data1  <- df1[, 1:4]
train_data2 <- df[, c(1, 5)]   # Label + Radscore only
test_data2  <- df1[, c(1, 5)]

response    <- train_data[[1]];  predictors  <- train_data[, -1]
response1   <- test_data[[1]];   predictors1 <- test_data[, -1]
response2   <- train_data1[[1]]; predictors2 <- train_data1[, -1]
response3   <- test_data1[[1]];  predictors3 <- test_data1[, -1]

# Oversample the minority class with SMOTE before fitting the combined and
# clinics-only models. (A SMOTE variant of the radiomics-only model was tried
# and left disabled; model2 is fit on the raw training data.)
train_data_combined <- cbind(response, predictors)
train_data_combined$response <- factor(train_data_combined$response)
train_data_combined1 <- cbind(response2, predictors2)
train_data_combined1$response2 <- factor(train_data_combined1$response2)
smote_data  <- SMOTE(response ~ .,  data = train_data_combined,  perc.over = 100, perc.under = 150)
smote_data1 <- SMOTE(response2 ~ ., data = train_data_combined1, perc.over = 100, perc.under = 150)
response_smote    <- smote_data$response
predictors_smote  <- smote_data[, -1]
response_smote1   <- smote_data1$response2
predictors_smote1 <- smote_data1[, -1]

model  <- glm(response_smote ~ .,  data = predictors_smote,  family = binomial(link = "logit"))
model1 <- glm(response_smote1 ~ ., data = predictors_smote1, family = binomial(link = "logit"))
model2 <- glm(Label ~ Radscore, data = train_data2, family = binomial(link = "logit"))
summary_model  <- summary(model)
summary_model1 <- summary(model1)
summary_model2 <- summary(model2)
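## ---- (Sketch) SMOTE without DMwR ----
# DMwR has been archived on CRAN, so it may not install on current R versions.
# A hedged alternative with the smotefamily package (assumed installed; all
# predictors must be numeric, which holds after z-scoring). smotefamily's
# SMOTE() takes features and target separately and returns the balanced data
# in $data, with the label in a column named "class"; dup_size = 1 adds one
# synthetic minority case per original case, roughly in the spirit of
# perc.over = 100 above. Kept commented, like the script's other alternatives:
# library(smotefamily)
# sm <- smotefamily::SMOTE(X = predictors, target = response, K = 5, dup_size = 1)
# balanced <- sm$data
# balanced$class <- factor(balanced$class)
# model_alt <- glm(class ~ ., data = balanced, family = binomial(link = "logit"))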
## ---- ROC / AUC on train and test ----
train_preds <- predict(model,  newdata = predictors,  type = "response")
test_preds  <- predict(model,  newdata = predictors1, type = "response")
train_roc <- roc(response, train_preds)
train_auc <- auc(train_roc)
train_ci  <- ci.auc(train_roc)
test_roc <- roc(response1, test_preds)
test_auc <- auc(test_roc)
test_ci  <- ci.auc(test_roc)

train_preds1 <- predict(model1, newdata = predictors2, type = "response")
test_preds1  <- predict(model1, newdata = predictors3, type = "response")
train_roc1 <- roc(response2, train_preds1)
train_auc1 <- auc(train_roc1)
train_ci1  <- ci.auc(train_roc1)
test_roc1 <- roc(response3, test_preds1)
test_auc1 <- auc(test_roc1)
test_ci1  <- ci.auc(test_roc1)

train_preds2 <- predict(model2, newdata = train_data2, type = "response")
test_preds2  <- predict(model2, newdata = test_data2,  type = "response")
train_roc2 <- roc(train_data2$Label, train_preds2)
train_auc2 <- auc(train_roc2)
train_ci2  <- ci.auc(train_roc2)
test_roc2 <- roc(test_data2$Label, test_preds2)
test_auc2 <- auc(test_roc2)
test_ci2  <- ci.auc(test_roc2)

## ---- Decision curve analysis (training set) ----
response_numeric <- as.numeric(as.character(response))
dca_data <- data.frame(
  response = response_numeric,
  model  = train_preds,
  model1 = train_preds1,
  model2 = train_preds2
)
dca_model  <- decision_curve(response ~ model,  data = dca_data, family = binomial(link = "logit"))
dca_model1 <- decision_curve(response ~ model1, data = dca_data, family = binomial(link = "logit"))
dca_model2 <- decision_curve(response ~ model2, data = dca_data, family = binomial(link = "logit"))
pdf("D:/decision.pdf", width = 7, height = 5)
plot_decision_curve(list(dca_model, dca_model1, dca_model2),
                    confidence.intervals = FALSE,
                    col = c("#FF6A6A", "#90EE90", "#FFa500", "black"),
                    curve.names = c("Nomogram", "Clinics", "Radiomics"),
                    legend.position = "topright",
                    cost.benefits = FALSE)
dev.off()

## ---- DeLong tests between the three models ----
# RC = combined nomogram model, C = clinics-only, R/N = radiomics-only.
# The ROC objects above are reused directly; roc.test() defaults to the
# paired DeLong test when the responses match.
result_train_RC_C <- roc.test(train_roc,  train_roc1)
result_train_RC_R <- roc.test(train_roc,  train_roc2)
result_train_C_R  <- roc.test(train_roc1, train_roc2)
result_test_RC_C <- roc.test(test_roc,  test_roc1)
result_test_RC_R <- roc.test(test_roc,  test_roc2)
result_test_C_R  <- roc.test(test_roc1, test_roc2)
results000 <- data.frame(
  Comparison    = c("RC vs C", "RC vs R", "C vs R"),
  Train_p_Value = c(result_train_RC_C$p.value, result_train_RC_R$p.value, result_train_C_R$p.value),
  Test_p_Value  = c(result_test_RC_C$p.value,  result_test_RC_R$p.value,  result_test_C_R$p.value)
)
write_xlsx(results000, "D:/DeLong_Test_Results.xlsx")
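## ---- (Sketch) decision curves on the test set ----
# The decision curves above use training predictions only. The same rmda call
# pattern applies to the held-out set; this block mirrors the training-set
# code (the file name decision_test.pdf is hypothetical).
dca_data_test <- data.frame(
  response = as.numeric(as.character(response1)),
  model  = test_preds,
  model1 = test_preds1,
  model2 = test_preds2
)
dca_test  <- decision_curve(response ~ model,  data = dca_data_test, family = binomial(link = "logit"))
dca_test1 <- decision_curve(response ~ model1, data = dca_data_test, family = binomial(link = "logit"))
dca_test2 <- decision_curve(response ~ model2, data = dca_data_test, family = binomial(link = "logit"))
pdf("D:/decision_test.pdf", width = 7, height = 5)
plot_decision_curve(list(dca_test, dca_test1, dca_test2),
                    confidence.intervals = FALSE,
                    col = c("#FF6A6A", "#90EE90", "#FFa500", "black"),
                    curve.names = c("Nomogram", "Clinics", "Radiomics"),
                    legend.position = "topright")
dev.off()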
## ---- ROC curve plots ----
pdf("D:/trainroc_curves.pdf", width = 7, height = 7)
# Plot the three training ROC curves; print.auc = FALSE so the AUC is not
# printed on the curves (it goes in the legend instead).
plot(train_roc,  col = "#FF6A6A", percent = TRUE, print.auc = FALSE, lty = 1, cex.axis = 1.1)
plot(train_roc1, col = "#90EE90", add = TRUE, percent = TRUE, print.auc = FALSE, lty = 1, cex.axis = 1.1)
plot(train_roc2, col = "#FFa500", add = TRUE, percent = TRUE, print.auc = FALSE, lty = 1, cex.axis = 1.1)
train_legend_text  <- paste("Nomogram AUC: ",  round(train_auc, 3),  " (CI: ", round(train_ci[1], 3),  "-", round(train_ci[3], 3),  ")")
train_legend_text1 <- paste("Clinics AUC: ",   round(train_auc1, 3), " (CI: ", round(train_ci1[1], 3), "-", round(train_ci1[3], 3), ")")
train_legend_text2 <- paste("Radiomics AUC: ", round(train_auc2, 3), " (CI: ", round(train_ci2[1], 3), "-", round(train_ci2[3], 3), ")")
legend("bottomright",
       legend = c(train_legend_text, train_legend_text1, train_legend_text2),
       col = c("#FF6A6A", "#90EE90", "#FFa500"), lty = c(1, 1, 1), cex = 0.9)
dev.off()

pdf("D:/testroc_curves.pdf", width = 7, height = 7)
plot(test_roc,  col = "#FF6A6A", percent = TRUE, print.auc = FALSE, lty = 1, cex.axis = 1.1)
plot(test_roc1, col = "#90EE90", add = TRUE, percent = TRUE, print.auc = FALSE, lty = 1, cex.axis = 1.1)
plot(test_roc2, col = "#FFa500", add = TRUE, percent = TRUE, print.auc = FALSE, lty = 1, cex.axis = 1.1)
test_legend_text  <- paste("Nomogram AUC: ",  round(test_auc, 3),  " (CI: ", round(test_ci[1], 3),  "-", round(test_ci[3], 3),  ")")
test_legend_text1 <- paste("Clinics AUC: ",   round(test_auc1, 3), " (CI: ", round(test_ci1[1], 3), "-", round(test_ci1[3], 3), ")")
test_legend_text2 <- paste("Radiomics AUC: ", round(test_auc2, 3), " (CI: ", round(test_ci2[1], 3), "-", round(test_ci2[3], 3), ")")
legend("bottomright",
       legend = c(test_legend_text, test_legend_text1, test_legend_text2),
       col = c("#FF6A6A", "#90EE90", "#FFa500"), lty = c(1, 1, 1), cex = 0.9)
dev.off()

## ---- Hosmer-Lemeshow calibration tests ----
# hoslem.test() needs a numeric 0/1 outcome, so factors are converted via
# as.character() first (as.numeric() on a factor would give 1/2, not 0/1).
response_smote  <- as.numeric(as.character(response_smote))
response_smote1 <- as.numeric(as.character(response_smote1))
hl_test_train  <- hoslem.test(response_smote,  fitted(model),  g = 10)
hl_test_train1 <- hoslem.test(response_smote1, fitted(model1), g = 10)
hl_test_train2 <- hoslem.test(as.numeric(as.character(train_data$Label)), fitted(model2), g = 10)
test_data$Label <- as.numeric(as.character(test_data$Label))
hl_test_test  <- hoslem.test(test_data$Label, test_preds,  g = 10)
hl_test_test1 <- hoslem.test(test_data$Label, test_preds1, g = 10)
hl_test_test2 <- hoslem.test(test_data$Label, test_preds2, g = 10)
results <- data.frame(
  Model = c("model", "model1", "model2"),
  HL_Train_Chi_Square = c(hl_test_train$statistic, hl_test_train1$statistic, hl_test_train2$statistic),
  HL_Train_p_Value    = c(hl_test_train$p.value,   hl_test_train1$p.value,   hl_test_train2$p.value),
  HL_Test_Chi_Square  = c(hl_test_test$statistic,  hl_test_test1$statistic,  hl_test_test2$statistic),
  HL_Test_p_Value     = c(hl_test_test$p.value,    hl_test_test1$p.value,    hl_test_test2$p.value)
)
write_xlsx(results, "D:/HL_Test_Results.xlsx")
## ---- Diagnostic metrics at the optimal cutoff ----
youden_index <- function(sens, spec) {
  sens + spec - 1
}
# (Not called below: coords(..., "best") already applies the Youden criterion
# by default when picking the threshold.)

# One helper covers all three models; the former variant that took data-frame
# responses is subsumed by passing the Label column as a vector.
compute_and_save_metrics <- function(model, train_data, test_data,
                                     train_response, test_response, file_name) {
  train_preds <- predict(model, newdata = train_data, type = "response")
  test_preds  <- predict(model, newdata = test_data,  type = "response")
  train_roc <- roc(train_response, train_preds)
  test_roc  <- roc(test_response,  test_preds)
  # Youden-optimal cutoffs. Note that the test set gets its own cutoff here,
  # as in the original code; applying train_cutoff to the test predictions
  # would be the stricter convention.
  train_cutoff <- coords(train_roc, "best", ret = "threshold")$threshold
  test_cutoff  <- coords(test_roc,  "best", ret = "threshold")$threshold

  train_class <- ifelse(train_preds > train_cutoff, 1, 0)
  train_confusion_matrix <- table(Prediction = train_class, Actual = train_response)
  train_accuracy    <- sum(diag(train_confusion_matrix)) / sum(train_confusion_matrix)
  train_sensitivity <- train_confusion_matrix[2, 2] / sum(train_confusion_matrix[, 2])
  train_specificity <- train_confusion_matrix[1, 1] / sum(train_confusion_matrix[, 1])
  train_ppv <- train_confusion_matrix[2, 2] / sum(train_confusion_matrix[2, ])
  train_npv <- train_confusion_matrix[1, 1] / sum(train_confusion_matrix[1, ])

  test_class <- ifelse(test_preds > test_cutoff, 1, 0)
  test_confusion_matrix <- table(Prediction = test_class, Actual = test_response)
  test_accuracy    <- sum(diag(test_confusion_matrix)) / sum(test_confusion_matrix)
  test_sensitivity <- test_confusion_matrix[2, 2] / sum(test_confusion_matrix[, 2])
  test_specificity <- test_confusion_matrix[1, 1] / sum(test_confusion_matrix[, 1])
  test_ppv <- test_confusion_matrix[2, 2] / sum(test_confusion_matrix[2, ])
  test_npv <- test_confusion_matrix[1, 1] / sum(test_confusion_matrix[1, ])

  results <- data.frame(
    Set         = c("Train", "Test"),
    Accuracy    = c(train_accuracy, test_accuracy),
    Sensitivity = c(train_sensitivity, test_sensitivity),
    Specificity = c(train_specificity, test_specificity),
    PPV         = c(train_ppv, test_ppv),
    NPV         = c(train_npv, test_npv)
  )
  write.csv(results, file = file_name, row.names = FALSE)
}
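## ---- (Sketch) cross-checking the metrics with caret ----
# The helper above assembles the confusion matrix by hand. Since caret is
# already loaded, confusionMatrix() can verify the same quantities; this
# hypothetical check recomputes the training classes at the Youden cutoff
# (positive = "1" assumes that 1 codes the event).
cutoff     <- coords(train_roc, "best", ret = "threshold")$threshold
pred_class <- factor(ifelse(train_preds > cutoff, 1, 0), levels = c(0, 1))
actual     <- factor(response, levels = c(0, 1))
cm <- caret::confusionMatrix(pred_class, actual, positive = "1")
cm$overall["Accuracy"]
cm$byClass[c("Sensitivity", "Specificity", "Pos Pred Value", "Neg Pred Value")]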
compute_and_save_metrics(model,  predictors,  predictors1, response,  response1, "D:/model1_metrics.csv")
compute_and_save_metrics(model1, predictors2, predictors3, response2, response3, "D:/model2_metrics.csv")
compute_and_save_metrics(model2, train_data2, test_data2,
                         train_data2$Label, test_data2$Label, "D:/model3_metrics.csv")

## ---- Nomogram and calibration curves (rms) ----
library(rms)
library(ggplot2)

df  <- read.csv("D:/code-nomogram/train_data_Nom.csv")
df1 <- read.csv("D:/code-nomogram/test_data_Nom.csv")
df$Label  <- factor(df$Label)
df1$Label <- factor(df1$Label)
train_data <- df[, 1:6]
test_data  <- df1[, 1:6]
dim(train_data); str(train_data); summary(train_data)  # quick sanity checks

dd <- datadist(train_data)
options(datadist = "dd")
response    <- train_data[[1]]
predictors  <- train_data[, -1]
response1   <- test_data[[1]]
predictors1 <- test_data[, -1]

# Equivalent explicit form:
# fit1 <- lrm(Label ~ G + pleural.effusions + interiobular.septal.thickening + Radscore,
#             data = train_data, x = TRUE, y = TRUE)
fit1 <- lrm(response ~ ., data = predictors, x = TRUE, y = TRUE)

# Bootstrap-corrected calibration curve. calibrate() works from the stored
# design (x = TRUE, y = TRUE), so no predictions need to be passed in.
calibration_curve_train <- calibrate(fit1, method = "boot", bw = TRUE, groups = 10)
calibration_data_train_df <- as.data.frame(
  calibration_curve_train[, c("predy", "calibrated.corrected")]
)
ggplot(calibration_data_train_df, aes(x = predy, y = calibrated.corrected)) +
  geom_line(color = "#FF6A6A", size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray", size = 0.8) +
  labs(x = "Predicted Probability", y = "Calibrated Probability") +
  theme_minimal() +
  theme(axis.line = element_line(color = "black"),
        panel.background = element_rect(fill = "white"),
        axis.title = element_text(size = 13),
        axis.text = element_text(size = 10),
        aspect.ratio = 1,
        plot.background = element_rect(fill = "white"),
        plot.title = element_blank())
ggsave("D:/code-nomogram/calibration_curve_train.pdf", width = 6, height = 6)

# Caveat: rms::calibrate() has no `data` argument -- it always resamples the
# data fit1 was fitted on, so the original test-set call did not yield a true
# external-validation curve (see the val.prob() sketch after this section).
# The call is kept for completeness:
test_pred <- predict(fit1, test_data, type = "fitted")
calibration_curve_test <- calibrate(fit1, method = "boot", bw = TRUE, groups = 10)
calibration_data_test_df <- as.data.frame(
  calibration_curve_test[, c("predy", "calibrated.corrected")]
)
ggplot(calibration_data_test_df, aes(x = predy, y = calibrated.corrected)) +
  geom_line(color = "#FF6A6A", size = 1) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray", size = 0.8) +
  labs(x = "Predicted Probability", y = "Calibrated Probability") +
  theme_minimal() +
  theme(axis.line = element_line(color = "black"),
        panel.background = element_rect(fill = "white"),
        axis.title = element_text(size = 13),
        axis.text = element_text(size = 10),
        aspect.ratio = 1,
        plot.background = element_rect(fill = "white"),
        plot.title = element_blank())
ggsave("D:/code-nomogram/calibration_curve_test.pdf", width = 6, height = 6)
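## ---- (Sketch) true external calibration with val.prob ----
# Because calibrate() ignores external data, rms::val.prob() is the usual way
# to assess calibration on the held-out set: it takes predicted probabilities
# and observed 0/1 outcomes directly, plots the curve and reports indices such
# as the calibration intercept, slope and Brier score. Uses test_pred from
# above; converting the factor label back to numeric 0/1 is the only extra step.
val.prob(test_pred, as.numeric(as.character(test_data$Label)))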
summary_model <- summary(fit1)

nom <- nomogram(fit1,
                fun = plogis,   # map the linear predictor to probability
                # fun.at = c(0.001, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99),
                fun.at = c(0.001, 0.5, 0.99),
                lp = TRUE,
                funlabel = "Risk")
# Alternative layout without the linear-predictor axis:
nom1 <- nomogram(fit1,
                 lp = FALSE,
                 fun = plogis,
                 fun.at = c(0.1, 0.4, 0.99),
                 funlabel = "Risk")
pdf("D:/nomogram.pdf", width = 15, height = 7)
plot(nom)
dev.off()
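## ---- (Sketch) scoring new cases from the fitted model ----
# The nomogram is a graphical read-out of fit1; predict() returns the same
# risk numerically. A hypothetical check on the first few test cases:
head(predict(fit1, newdata = test_data, type = "fitted"), 3)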