install.packages("randomForest") library(data.table) library(readxl) library(stringr) library(tableone) library(naniar) #获取当前文件的路径 getwd() #设置读取文件的路径 setwd("C:/Users/Administrator/Desktop/桌面") stroke <- read_excel("脑卒中复发原始数据.xls", col_types = "text") #变量名称 colnames(stroke) stroke dim(stroke) #[1] 1179 86 stroke <- data.table(apply(stroke, 2, function(x) ifelse(trimws(x)=="/", NA, trimws(x)))) #dt[i, j, by=k] stroke[, date.in:=as.Date(as.numeric(入院日期), origin = "1899-12-30")] stroke[, date.fu:=as.Date(as.numeric(随访时间), origin = "1899-12-30")] stroke[, date.out:=as.Date(as.numeric(出院日期), origin = "1899-12-30")] stroke[, date.onset:=as.Date(as.numeric(发病日期), origin = "1899-12-30")] stroke[, diff_on_in:=as.numeric(difftime(date.in, date.onset, units = "days"))] summary(stroke[!is.na(diff_on_in)]$diff_on_in) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.000 0.000 0.000 0.279 1.000 3.000 #stroke[diff_on_in>10] #Empty data.table (0 rows and 91 cols): 编号,新编号,姓名,性别,年龄,院区(总院1/东院2)... stroke[, diff_fu_in:=as.numeric(difftime(date.fu, date.in, units = "days"))/365.25*12] summary(stroke[!is.na(diff_fu_in)]$diff_fu_in) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.0000 0.6242 9.1499 12.1445 20.9692 48.9856 #quantile(stroke[!is.na(diff_fu_in)]$diff_fu_in, probs = seq(0, 1, 0.1)) #stroke[diff_fu_in>=60, date.fu:=as.Date("2023-03-31")] #stroke[, diff_fu_in:=as.numeric(difftime(date.fu, date.in, units = "days"))/365.25*12] #删除上述测试变量 stroke[, diff_on_in:=NULL] stroke[, diff_fu_in:=NULL] table(stroke$`首发1/再发2/对照3`) # 1 2 3 # 649 179 351 myVars <- colnames(stroke)[c(5,13,29:31, 39:86)][!grepl("时间|日期|随访状态", colnames(stroke)[c(5,13,29:31, 39:86)])] #catVars <- c("性别", "院区(总院1/东院2)", "首发1/再发2/对照3", "发病日期", "发病时间", "入院日期", # "入院时间", "出院日期", "结局(存活1/死亡2)", "发病季节", "发病时辰", # "亚型", "诊断(梗死1/出血2/TIA3)", "+随访时间", "随访状态", "高血压", "糖尿病", "冠心病", "房颤", "脑梗死", # "脑出血", "烟(是1/否2)", "酒(是1/否2)", "脑狭窄", "脑闭塞", "脑硬化", "颈狭窄", "颈硬化", "颈闭塞", "颈斑块") #numVars <- setdiff(myVars, catVars) vars <- data.table(read_excel("整理后卒中变量.xls")) myVars[!myVars %in% vars$变量] vars[!变量 %in% myVars]$变量 #vars <- vars[!grepl("随访状态|时间|日期|诊断(梗死1", 变量)] numVars <- c(myVars[myVars %in% vars[`分类1/数值2`=="2"]$变量]) catVars <- c(myVars[myVars %in% vars[`分类1/数值2`=="1"]$变量]) stroke2 <- data.table(cbind(stroke[, -numVars, with = FALSE], apply(stroke[, numVars, with = FALSE], 2, as.numeric))) stroke3 <- stroke2[`首发1/再发2/对照3`=="1"] length(unique(stroke3$新编号)) #649 nrow(stroke3) #649 #ncol setorder(stroke3, 新编号, date.in) stroke3[, nn:=1:.N, by="新编号"][nn>1] #4 stroke3 <- stroke3[nn==1] stroke3[, nn:=NULL] #649 #筛选所有急性脑梗死患者 stroke3 <- stroke3[`诊断(梗死1/出血2/TIA3)`=="1"] stroke3[, `诊断(梗死1/出血2/TIA3)`:=NULL] stroke3[, `首发1/再发2/对照3`:=NULL] miss <- data.table(miss_var_summary(stroke3)) #变量缺失80%以上,直接删除 miss[pct_miss>=80] #Empty data.table (0 rows and 3 cols): variable,n_miss,pct_miss #变量缺失20%-80%,转化为缺失类别 miss[pct_miss>=20 & pct_miss<80] #1 #较一般步骤:可利用四分位数将连续变量转化为分类变量,但无太大实际意义 #quantile(stroke3[!is.na(Essen)]$Essen) #stroke3[!is.na(Essen), 血氧饱和度_cat:=ifelse(Essen<=2, "low", ifelse(Essen<=4, "normal", "high"))] #stroke3[is.na(Essen), 血氧饱和度_cat:="Unknown"] #vars[变量=="Essen"] #for (i in miss[pct_miss>=20 & pct_miss<80]$variable) { # if (i %in% catVars) { # #stroke3[[paste0(i, "_cat")]] <- ifelse(is.na(stroke3[[i]]), "Unknown", ifelse(stroke3[[i]]<=quantile(na.omit(stroke3[[i]]))[2], "low", ifelse(stroke3[[i]]<=quantile(na.omit(stroke3[[i]]))[4], "normal", "high"))) #} else { # stroke3[[i]] <- ifelse(is.na(stroke3[[i]]), "Unknown", stroke3[[i]]) # } #} #stroke3 <- stroke3[, -miss[pct_miss>=20 
& pct_miss<80 & variable %in% numVars]$variable, with = FALSE] #变量缺失小于20%,随机森林插补 miss[pct_miss<20 & pct_miss>0] #58 stroke3[, status:=ifelse(新编号 %in% stroke2[`首发1/再发2/对照3`=="2"]$新编号, 1, 0)] table(stroke3$status) # 0 1 #426 114 stroke3[status==0, time:=as.numeric(difftime(date.fu, date.in, units = "days"))/365.25*12] recur <- stroke2[`首发1/再发2/对照3`=="2", .(date.in), by="新编号"] setorder(recur, 新编号, date.in)[] recur.new <- merge(stroke3[, .(新编号, date.in.origin=date.in)], recur, by="新编号")[date.in>date.in.origin] recur.new[, date.in.again:=min(date.in), by="新编号"] recur.new <- unique(recur.new[, .(新编号, date.in.again)]) stroke3 <- merge(stroke3, recur.new, by="新编号", all.x=T) stroke3[!is.na(date.in.again), time:=as.numeric(difftime(date.in.again, date.in, units = "days"))/365.25*12] stroke3[, date.in.again:=NULL] summary(stroke3$time) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.03285 0.39425 3.76181 9.12136 15.90965 41.75770 stroke3 <- stroke3[`结局(存活1/死亡2)`!=2] table(stroke3$status) # 0 1 #409 114 stroke3[, `结局(存活1/死亡2)`:=NULL] library(randomForest) for (i in setdiff(colnames(stroke3)[!grepl("编号|时间|日期|date|time|随访", colnames(stroke3))], numVars)) { stroke3[[i]] <- as.factor(stroke3[[i]]) } set.seed(30) stroke3.imputed <- rfImpute(status ~ ., stroke3[, -colnames(stroke3)[grepl("编号|姓名|时间|日期|date|time|随访", colnames(stroke3))], with = FALSE]) class(stroke3.imputed$性别) #factor #write.csv(stroke3.imputed, file = "stroke3.impute.csv", row.names = F) #stroke3.imputed <- read.csv("stroke3.impute.csv") #检查是否还有缺失 data.table(miss_var_summary(stroke3.imputed)) stroke3.imputed <- cbind(stroke3[, .(新编号, 姓名, time)], stroke3.imputed) library(survival) library(survminer) library(glmnet) library(ggplot2) table(stroke3.imputed) write.csv(stroke3.imputed,file="mytable.csv") # 0 1 #409 114 stroke3.imputed[, status:=as.numeric(status)] table(stroke3.imputed$status) stroke3.imputed[, status:=ifelse(status==2, 1, 0)] table(stroke3.imputed$status) #绘制KM曲线 fit <- survfit(Surv(time, status) ~ 性别, data = stroke3.imputed) #拟合生存函数 ggsurvplot(fit, pval=TRUE,#添加P值 conf.int = TRUE,#增加置信区间 risk.tabel.col="strata", #change risk tabel color by groups linetype = "strata", #change line type by groups surv.median.line = "hv", #specify median survival 增加中位生存时间 ggtheme = theme_bw(), #change ggplots theme palette = c("hue"), risk.table = TRUE,#add risk tabel绘制累计风险曲线 xlab = "Follow up time(m)", # 指定x轴标签 legend.title = "KM曲线", # 设置图例标题 legend.labs = c("Male", "Female"), # 指定图例分组标签 ) #单因素cox myVars <- colnames(stroke3.imputed)[5:77]#c(numVars, catVars) #分别对每一个变量,构建生存分析的公式 #coxph(Surv(time, status) ~ v1, data = d1) uni_formulas <- sapply(myVars[!grepl("出血量|结局|首发1|诊断(梗死1", myVars)], function(x) as.formula(paste("Surv(time, status) ~ ", paste0("`", x, "`")))) stroke3.imputed2 <- stroke3.imputed[(time<=12) | (status==0 & time>12)] stroke3.imputed2[status==0 & time>12, time:=12] table(stroke3.imputed2$status) #一年内复发与没复发人群占比 uni_models <- lapply(uni_formulas, function(x) {coxph(x, data = stroke3.imputed)}) uni_models2 <- lapply(uni_formulas, function(x) {coxph(x, data = stroke3.imputed2)}) #提取p值 uni_results <- lapply(uni_models, function(x){ x <- summary(x) #获取p值 p.value <- signif(x$logtest["pvalue"], digits=2) return(p.value)}) uni_results2 <- lapply(uni_models2, function(x){ x <- summary(x) #获取p值 p.value <- signif(x$logtest["pvalue"], digits=2) return(p.value)}) #转换成数据集 res <- data.table(vars = names(uni_results), p.value = unlist(uni_results)) res2 <- data.table(vars = names(uni_results2), p.value = unlist(uni_results2)) res[p.value<0.2] #25 
res2[p.value < 0.2] #29

# Multivariable Cox model 1: variables screened at p < 0.2 (plus age and sex)
res.formula <- as.formula(paste("Surv(time, status) ~ ",
                                paste(paste0("`", c("性别", "年龄", res[p.value < 0.2 & vars != "亚型"]$vars), "`"),
                                      collapse = " + ")))
res.cox <- coxph(res.formula, data = stroke3.imputed)
res.surv <- summary(res.cox)
c_index <- res.surv$concordance
c_index
#          C      se(C)
# 0.71073741 0.02713465

# Bootstrapping + lasso variable selection
library(Hmisc)
library(rms)
k <- 100
vars_set <- NULL
for (i in 1:k) {
  # Stratified bootstrap: resample events and non-events separately
  s1 <- stroke3.imputed[status == "1"]
  s2 <- stroke3.imputed[status == "0"]
  dat1 <- s1[sample(1:nrow(s1), nrow(s1), replace = TRUE), ]
  dat2 <- s2[sample(1:nrow(s2), nrow(s2), replace = TRUE), ]
  boot_set <- rbind(dat1, dat2)
  # Lasso-penalised Cox regression
  mx <- data.matrix(boot_set[, 5:77])
  my <- as.matrix(Surv(boot_set$time, boot_set$status))
  #set.seed(1)
  lasso.cox <- cv.glmnet(x = mx, y = my, family = "cox", type.measure = "C")
  #plot(lasso.cox)
  #print(lasso.cox)
  coef.cox <- coef(lasso.cox, s = "lambda.min") #lambda.1se
  # Record which variables were selected (non-zero coefficient) in this resample
  tt <- data.table(t(ifelse(coef.cox != 0, 1, 0)))
  colnames(tt) <- rownames(coef.cox)
  vars_set <- rbind(vars_set, tt)
}
# Keep the 35 variables selected most often across the bootstrap resamples
lasso.vars <- names(sort(colSums(vars_set) / nrow(vars_set), decreasing = TRUE)[1:35])
lasso.vars <- lasso.vars[!lasso.vars %in% c("亚型", "院区(总院1/东院2)")]

# Cox model on the lasso-selected variables (stratified by hospital campus)
table(stroke3.imputed$status)
lasso.for <- as.formula(paste("Surv(time, status) ~ strata(`院区(总院1/东院2)`) + 年龄 + 性别 + ",
                              paste(paste0("`", colnames(stroke3.imputed)[colnames(stroke3.imputed) %in% lasso.vars], "`"),
                                    collapse = " + ")))
lasso.cox <- coxph(lasso.for, data = stroke3.imputed)
lasso.result <- summary(lasso.cox)
lasso.result$concordance
#          C      se(C)
# 0.76300698 0.02399951
0.76300698 + c(-1.96, 1.96) * 0.02399951 # 95% CI for the C-index
anova(lasso.cox)

# Random survival forest
library(randomForestSRC)
#set.seed(2)
rf.cox <- rfsrc(Surv(time, status) ~ ., data = stroke3.imputed[, 3:77], ntree = 500,
                na.action = "na.impute", tree.err = TRUE, importance = TRUE, block.size = 1)
print(rf.cox)
# C-index
1 - rf.cox$err.rate[rf.cox$ntree] #0.5615545

## plot results of trained forest
plot(rf.cox)
rf.result <- plot(rf.cox)
# Top variables by random forest importance
rf.vars <- c("收缩压", "脑梗死病史", "HCT", "ALT ", "PDW ", "HDL", "UA/HDL", "LYMPH% ", "NIHSS评分", "MCV",
             "中性粒细胞/淋巴细胞", "PT", "DDL ", "MCHC", "B2MG", "HGB", "GLU", "RBC", "脑出血", "CRP",
             "WBC", "亚型", "TG/HDL ", "MPV ", "CREA", "高血压病史", "年龄", "AST ", "冠心病", "NEUT%",
             "PLT", "Fg", "TG/CYC", "BUN", "URCA")

# Final variable set: union of lasso- and forest-selected variables
final.vars <- union(lasso.vars, rf.vars)
final.vars <- final.vars[!final.vars %in% c("亚型", "院区(总院1/东院2)")]

# Cox model on the final variable set (stratified by hospital campus)
table(stroke3.imputed$status)
final.for <- as.formula(paste("Surv(time, status) ~ strata(`院区(总院1/东院2)`) + 年龄 + 性别 + ",
                              paste(paste0("`", colnames(stroke3.imputed)[colnames(stroke3.imputed) %in% final.vars], "`"),
                                    collapse = " + ")))
final.cox <- coxph(final.for, data = stroke3.imputed)
final.result <- summary(final.cox)
final.result$concordance
anova(final.cox)
#rf.cox <- rfsrc(Surv(time, status) ~ `血压(高)` + 脑梗死 + GLU + PT + ALT + MCHC + HGB + `LYMPH%` + `UA/HDL`,
#                data = stroke3.imputed[, 3:77], ntree = 500,
#                na.action = "na.impute", tree.err = TRUE, importance = TRUE, block.size = 1)
#print(rf.cox)
# C-index
#1-rf.cox$err.rate[rf.cox$ntree] #0.6137697

## Package the data for the nomogram
library(Hmisc)
library(rms)
coxm <- cph(Surv(time, status) ~ `History of hypertension` + `History of CI` + `Cerebral artery stenosis` +
              CAS + SBP + URCA + `RDW-CV` + MPV,
            data = stroke3.imputed, x = TRUE, y = TRUE, surv = TRUE)
# Proportional hazards assumption
#cox.zph(coxm)
ddist <- datadist(stroke3.imputed)
options(datadist = 'ddist')
# Draw the Cox nomogram
med <- Quantile(coxm)
surv <- Survival(coxm) # build the survival function
plot(nomogram(coxm,
              fun = list(function(x) surv(12, x),
                         function(x) surv(12 * 2, x),
                         function(x) surv(12 * 3, x)),
              funlabel = c("1-year Survival Probability", "2-year Survival Probability", "3-year Survival Probability")))
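# Optional sketch (not in the original script): with x = TRUE and y = TRUE in cph(),
# rms::validate() can bootstrap an optimism-corrected Dxy for the nomogram model;
# the corrected C-index is then Dxy / 2 + 0.5. B = 200 resamples is an arbitrary choice.
#set.seed(123)
val <- validate(coxm, method = "boot", B = 200)
val["Dxy", "index.corrected"] / 2 + 0.5 # optimism-corrected C-index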
# In-hospital death after readmission: too few patients for a separate model
length(unique(stroke3.imputed[status == 1]$新编号)) #112
re <- stroke2[paste(新编号, date.in) %in% paste(recur.new$新编号, recur.new$date.in.again)]
re[, nn := 1:.N, by = "新编号"][nn > 1]
re <- re[nn == 1][, nn := NULL]
re[, status := ifelse(`结局(存活1/死亡2)` == "1", 0, 1)]
table(re$status)
re[, time := as.numeric(difftime(date.out, date.in, units = "days"))] #/365.25*12
summary(re$time)

# Model comparison metrics
install.packages("DescTools")
library(pROC)
library(caret)
library(Metrics)
library(DescTools)
library(MLmetrics) # for F1_Score()

# Template: assumes the predictions of two models are stored in pred_A and pred_B,
# and the observed outcomes are stored in actual.

# AUC
auc_A <- roc(actual, pred_A)$auc
auc_B <- roc(actual, pred_B)$auc

# F1 score
f1_A <- F1_Score(actual, pred_A)
f1_B <- F1_Score(actual, pred_B)

# Accuracy (Metrics::accuracy takes the observed values first)
acc_A <- accuracy(actual, pred_A)
acc_B <- accuracy(actual, pred_B)

# Sensitivity (caret::sensitivity expects factors: predicted classes first, reference second)
sens_A <- sensitivity(factor(pred_A), factor(actual))
sens_B <- sensitivity(factor(pred_B), factor(actual))

# Specificity
spec_A <- specificity(factor(pred_A), factor(actual))
spec_B <- specificity(factor(pred_B), factor(actual))

# Brier score: mean squared difference between predicted probability and 0/1 outcome
brier_A <- mean((pred_A - actual)^2)
brier_B <- mean((pred_B - actual)^2)

# 90% confidence intervals for accuracy
ci_A <- BinomCI(sum(pred_A == actual), length(pred_A), conf.level = 0.9)
ci_B <- BinomCI(sum(pred_B == actual), length(pred_B), conf.level = 0.9)

# Print the results
cat("Model A: AUC =", auc_A, "F1 Score =", f1_A, "Accuracy =", acc_A,
    "Sensitivity =", sens_A, "Specificity =", spec_A, "Brier Score =", brier_A, "90% CI =", ci_A)
cat("Model B: AUC =", auc_B, "F1 Score =", f1_B, "Accuracy =", acc_B,
    "Sensitivity =", sens_B, "Specificity =", spec_B, "Brier Score =", brier_B, "90% CI =", ci_B)

# Testing whether the two models differ significantly:
# a chi-square or t test can be used, depending on the data and models.
# Example using a chi-square test on the counts of correct predictions:
chisq_test <- chisq.test(matrix(c(sum(pred_A == actual), sum(pred_B == actual),
                                  length(pred_A) - sum(pred_A == actual),
                                  length(pred_B) - sum(pred_B == actual)),
                                nrow = 2))
print(chisq_test)
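# Optional sketch (not in the original script): since pred_A and pred_B are predictions
# for the same subjects, pROC's paired DeLong test compares the two correlated AUCs
# directly, which is usually more informative than the chi-square test on counts above.
roc_A <- roc(actual, pred_A)
roc_B <- roc(actual, pred_B)
roc.test(roc_A, roc_B, method = "delong", paired = TRUE)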