install.packages("randomForest") library(data.table) library(readxl) library(stringr) library(tableone) library(naniar) #Get the current file path getwd() #Set the path to read the file setwd("C:/Users/Administrator/Desktop/Deskto") stroke <- read_excel("Stroke Recurrence Original Data.xls", col_types = "text") #Variable names colnames(stroke) stroke dim(stroke) #[1] 1179 86 stroke <- data.table(apply(stroke, 2, function(x) ifelse(trimws(x)=="/", NA, trimws(x)))) #dt[i, j, by=k] stroke[, date.in:=as.Date(as.numeric(admission_date), origin = "1899-12-30")] stroke[, date.fu:=as.Date(as.numeric(follow_up_time), origin = "1899-12-30")] stroke[, date.out:=as.Date(as.numeric(discharge_date), origin = "1899-12-30")] stroke[, date.onset:=as.Date(as.numeric(onset_date), origin = "1899-12-30")] stroke[, diff_on_in:=as.numeric(difftime(date.in, date.onset, units = "days"))] summary(stroke[!is.na(diff_on_in)]$diff_on_in) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.000 0.000 0.000 0.279 1.000 3.000 #stroke[diff_on_in>10] #Empty data.table (0 rows and 91 cols): ID, new_id, Name, Gender, Age, Hospital Area (Main Hospital 1/East Hospital 2)... stroke[, diff_fu_in:=as.numeric(difftime(date.fu, date.in, units = "days"))/365.25*12] summary(stroke[!is.na(diff_fu_in)]$diff_fu_in) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.0000 0.6242 9.1499 12.1445 20.9692 48.9856 #quantile(stroke[!is.na(diff_fu_in)]$diff_fu_in, probs = seq(0, 1, 0.1)) #stroke[diff_fu_in>=60, date.fu:=as.Date("2023-03-31")] #stroke[, diff_fu_in:=as.numeric(difftime(date.fu, date.in, units = "days"))/365.25*12] #Delete the above test variables stroke[, diff_on_in:=NULL] stroke[, diff_fu_in:=NULL] table(stroke$`First occurrence 1/ Recurrence 2/ Control 3`) # 1 2 3 # 649 179 351 myVars <- colnames(stroke)[c(5,13,29:31, 39:86)][!grepl("time|date|follow-up status", colnames(stroke)[c(5,13,29:31, 39:86)])] #catVars <- c("gender", "hospital area (main hospital 1/east hospital 2)", "first occurrence 1/recurrence 2/control 3", "onset date", "onset time", "admission date", # "admission time", "discharge date", "outcome (survived 1/death 2)", "onset season", "onset hour", # "subtype", "diagnosis (infarction 1/hemorrhage 2/TIA 3)", "follow-up time", "follow-up status", "hypertension", "diabetes", "coronary heart disease", "atrial fibrillation", "cerebral infarction", # "cerebral hemorrhage", "smoking (yes 1/no 2)", "alcohol (yes 1/no 2)", "cerebral stenosis", "cerebral occlusion", "cerebral sclerosis", "carotid stenosis", "carotid sclerosis", "carotid occlusion", "carotid plaque") #numVars <- setdiff(myVars, catVars) vars <- data.table(read_excel("Organized Stroke Variables.xls")) myVars[!myVars %in% vars$Variable] vars[!Variable %in% myVars]$Variable #vars <- vars[!grepl("Follow-up status|Time|Date|Diagnosis (Infarction 1", variables)] numVars <- c(myVars[myVars %in% vars[`Category1/Numeric2`=="2"]$Variable]) catVars <- c(myVars[myVars %in% vars[`Category1/Numeric2`=="1"]$Variable]) stroke2 <- data.table(cbind(stroke[, -numVars, with = FALSE], apply(stroke[, numVars, with = FALSE], 2, as.numeric))) stroke3 <- stroke2[`First occurrence 1/Recurrence 2/Control 3`=="1"] length(unique(stroke3$new_id)) #649 nrow(stroke3) #649 #ncol setorder(stroke3, new_id, date.in) stroke3[, nn:=1:.N, by="new_id"][nn>1] #4 stroke3 <- stroke3[nn==1] stroke3[, nn:=NULL] #649 #Filter all acute cerebral infarction patients stroke3 <- stroke3[`diagnosis (infarction 1/hemorrhage 2/TIA 3)`=="1"] stroke3[, `diagnosis (infarction 1/hemorrhage 2/TIA 3)`:=NULL] stroke3[, `first occurrence 1/recurrence 2/control 3`:=NULL] miss <- data.table(miss_var_summary(stroke3)) #Variables with more than 80% missing, directly delete miss[pct_miss>=80] #Empty data.table (0 rows and 3 cols): variable,n_miss,pct_miss #Variables with 20%-80% missing, convert to missing category miss[pct_miss>=20 & pct_miss<80] #1 #General steps: Continuous variables can be converted into categorical variables using quartiles, but it has little practical significance. #quantile(stroke3[!is.na(Essen)]$Essen) #stroke3[!is.na(Essen), oxygen_saturation_cat:=ifelse(Essen<=2, "low", ifelse(Essen<=4, "normal", "high"))] #stroke3[is.na(Essen), oxygen_saturation_cat:="Unknown"] #vars[variable=="Essen"] #for (i in miss[pct_miss>=20 & pct_miss<80]$variable) { # if (i %in% catVars) { # #stroke3[[paste0(i, "_cat")]] <- ifelse(is.na(stroke3[[i]]), "Unknown", ifelse(stroke3[[i]]<=quantile(na.omit(stroke3[[i]]))[2], "low", ifelse(stroke3[[i]]<=quantile(na.omit(stroke3[[i]]))[4], "normal", "high"))) #} else { # stroke3[[i]] <- ifelse(is.na(stroke3[[i]]), "Unknown", stroke3[[i]]) # } #} #stroke3 <- stroke3[, -miss[pct_miss>=20 & pct_miss<80 & variable %in% numVars]$variable, with = FALSE] #Missing variables less than 20%, random forest imputation miss[pct_miss<20 & pct_miss>0] #58 stroke3[, status:=ifelse(new_id %in% stroke2[`first occurrence 1/recurrence 2/control 3`=="2"]$new_id, 1, 0)] table(stroke3$status) # 0 1 #426 114 stroke3[status==0, time:=as.numeric(difftime(date.fu, date.in, units = "days"))/365.25*12] recur <- stroke2[`first occurrence 1/recurrent 2/control 3`=="2", .(date.in), by="new_id"] setorder(recur, new_id, date.in)[] recur.new <- merge(stroke3[, .(new_id, date.in.origin=date.in)], recur, by="new_id")[date.in>date.in.origin] recur.new[, date.in.again:=min(date.in), by="new_id"] recur.new <- unique(recur.new[, .(new_id, date.in.again)]) stroke3 <- merge(stroke3, recur.new, by="new_id", all.x=T) stroke3[!is.na(date.in.again), time:=as.numeric(difftime(date.in.again, date.in, units = "days"))/365.25*12] stroke3[, date.in.again:=NULL] summary(stroke3$time) # Min. 1st Qu. Median Mean 3rd Qu. Max. #0.03285 0.39425 3.76181 9.12136 15.90965 41.75770 stroke3 <- stroke3[`outcome (survival 1/death 2)`!=2] table(stroke3$status) # 0 1 #409 114 stroke3[, `outcome (survival 1/death 2)`:=NULL] library(randomForest) for (i in setdiff(colnames(stroke3)[!grepl("number|time|date|time|follow-up", colnames(stroke3))], numVars)) { stroke3[[i]] <- as.factor(stroke3[[i]]) } set.seed(30) stroke3.imputed <- rfImpute(status ~ ., stroke3[, -colnames(stroke3)[grepl("ID|time|date|date|time|follow-up", colnames(stroke3))], with = FALSE]) class(stroke3.imputed$gender) #factor #write.csv(stroke3.imputed, file = "stroke3.impute.csv", row.names = F) #stroke3.imputed <- read.csv("stroke3.impute.csv") #Check for any missing values data.table(miss_var_summary(stroke3.imputed)) stroke3.imputed <- cbind(stroke3[, .(new_id, name, time)], stroke3.imputed) library(survival) library(survminer) library(glmnet) library(ggplot2) table(stroke3.imputed) write.csv(stroke3.imputed,file="mytable.csv") # 0 1 #409 114 stroke3.imputed[, status:=as.numeric(status)] table(stroke3.imputed$status) stroke3.imputed[, status:=ifelse(status==2, 1, 0)] table(stroke3.imputed$status) #Draw KM curve fit <- survfit(Surv(time, status) ~ gender, data = stroke3.imputed) #Fit survival function ggsurvplot(fit, pval=TRUE,#Add P value conf.int = TRUE,# Increase confidence interval risk.tabel.col="strata", #change risk tabel color by groups linetype = "strata", #change line type by groups surv.median.line = "hv", #specify median survival ggtheme = theme_bw(), #change ggplots theme palette = c("hue"), risk.table = TRUE,#add risk table to plot cumulative risk curve xlab = "Follow up time(m)", # specify x-axis label legend.title = "KM Curve", # set legend title legend.labs = c("Male", "Female"), # specify legend group labels ) #Univariate Cox myVars <- colnames(stroke3.imputed)[5:77]#c(numVars, catVars) #Construct survival analysis formulas for each variable #coxph(Surv(time, status) ~ v1, data = d1) uni_formulas <- sapply(myVars[!grepl("bleeding amount|outcome|first onset 1|diagnosis(infarction 1", myVars)], function(x) as.formula(paste("Surv(time, status) ~ ", paste0("`", x, "`")))) stroke3.imputed2 <- stroke3.imputed[(time<=12) | (status==0 & time>12)] stroke3.imputed2[status==0 & time>12, time:=12] table(stroke3.imputed2$status) #Proportion of recurrence and non-recurrence within one year uni_models <- lapply(uni_formulas, function(x) {coxph(x, data = stroke3.imputed)}) uni_models2 <- lapply(uni_formulas, function(x) {coxph(x, data = stroke3.imputed2)}) #Extract p-values uni_results <- lapply(uni_models, function(x){ x <- summary(x) #Get p-value p.value <- signif(x$logtest["pvalue"], digits=2) return(p.value)}) uni_results2 <- lapply(uni_models2, function(x){ x <- summary(x) #Get p-value p.value <- signif(x$logtest["pvalue"], digits=2) return(p.value)}) #Convert to dataset res <- data.table(vars = names(uni_results), p.value = unlist(uni_results)) res2 <- data.table(vars = names(uni_results2), p.value = unlist(uni_results2)) res[p.value<0.2] #25 res2[p.value<0.2] #29 #Multifactor cox1 res.formula <- as.formula(paste("Surv(time, status) ~ ", paste(paste0("`", c("gender", "age", res[p.value<0.2 & vars!="subtype"]$vars), "`"), collapse = " + "))) res.cox <- coxph(res.formula, data = stroke3.imputed) res.surv <- summary(res.cox) c_index <- res.surv$concordance c_index # C se(C) #0.71073741 0.02713465 #bootstrapping + lasso library(Hmisc) library(rms) k <- 100 vars_set <- NULL for (i in 1:k) { s1 <- stroke3.imputed[status=="1"] s2 <- stroke3.imputed[status=="0"] dat1 <- s1[sample(1:nrow(s1), nrow(s1), replace = T), ] dat2 <- s2[sample(1:nrow(s2), nrow(s2), replace = T), ] boot_set <- rbind(dat1, dat2) #Lasso + Cox regression mx <- data.matrix(boot_set[, 5:77]) my <- as.matrix(Surv(boot_set$time, boot_set$status)) #set.seed(1) lasso.cox <- cv.glmnet(x = mx, y = my, family = "cox", type.measure = "C") #plot(lasso.cox) #print(lasso.cox) coef.cox <- coef(lasso.cox, s="lambda.min") #lambda.1se tt <- data.table(t(ifelse(coef.cox!=0, 1, 0))) colnames(tt) <- rownames(coef.cox) vars_set <- rbind(vars_set, tt) } lasso.vars <- names(sort(colSums(vars_set)/nrow(vars_set), decreasing = T)[1:35]) lasso.vars <- lasso.vars[!lasso.vars %in% c("Subtype", "Hospital Area (Main Hospital 1/East Hospital 2)")] #Fit Cox regression table(stroke3.imputed$status) lasso.for <- as.formula(paste("Surv(time, status) ~ strata(`Hospital Area (Main Hospital 1/East Hospital 2)`) + Age + Gender + ", paste(paste0("`", colnames(stroke3.imputed)[colnames(stroke3.imputed) %in% lasso.vars], "`"), collapse = " + "))) lasso.cox <- coxph(lasso.for, data = stroke3.imputed) lasso.result <- summary(lasso.cox) lasso.result$concordance # C se(C) #0.76300698 0.02399951 0.76300698+-1.96*0.02399951 anova(lasso.cox) #Random Forest Survival Analysis library(randomForestSRC) #set.seed(2) rf.cox <- rfsrc(Surv(time, status) ~ ., data = stroke3.imputed[, 3:77], ntree = 500, na.action = "na.impute", tree.err = TRUE, importance = TRUE, block.size = 1) print(rf.cox) #C-index 1-rf.cox$err.rate[rf.cox$ntree] #0.5615545 ## plot results of trained forest plot(rf.cox) rf.result <- plot(rf.cox) rf.vars <- c("Systolic Blood Pressure", "History of Cerebral Infarction", "HCT", "ALT ", "PDW ", "HDL", "UA/HDL", "LYMPH% ", "NIHSS score", "MCV", "Neutrophil/Lymphocyte", "PT", "DDL ", "MCHC", "B2MG", "HGB", "GLU", "RBC", "Cerebral Hemorrhage", "CRP", "WBC", "Subtype", "TG/HDL ", "MPV ", "CREA", "History of Hypertension", "Age", "AST ", "Coronary Heart Disease", "NEUT%", "PLT", "Fg", "TG/CYC", "BUN", "URCA") final.vars <- union(lasso.vars,rf.vars) final.vars <- final.vars[!final.vars %in% c("subtype", "hospital area (main hospital 1/east hospital 2)")] #Fitting Cox regression table(stroke3.imputed$status) final.for <- as.formula(paste("Surv(time, status) ~ strata(`hospital area (main hospital 1/east hospital 2)`) + age + gender + ", paste(paste0("`", colnames(stroke3.imputed)[colnames(stroke3.imputed) %in% final.vars], "`"), collapse = " + "))) final.cox <- coxph(final.for, data = stroke3.imputed) final.result <- summary(final.cox) final.result$concordance anova(final.cox) #rf.cox <- rfsrc(Surv(time, status) ~ `Blood Pressure (High)` + Cerebral Infarction + GLU + PT + ALT + MCHC + HGB + `LYMPH%` + `UA/HDL`, data = stroke3.imputed[, 3:77], ntree = 500, # na.action = "na.impute", tree.err = TRUE, importance = TRUE, block.size = 1) #print(rf.cox) #C-index #1-rf.cox$err.rate[rf.cox$ntree] #0.6137697 ##Package datahttp://127.0.0.1:39569/graphics/63bb6826-a509-4063-b2ca-2d4390054b1d.png library(Hmisc) library(rms) coxm <- cph(Surv(time, status) ~ `History of hypertension` + `History of CI` + `Cerebral artery stenosis` + CAS + SBP + URCA + RDW-CV + MPV, data = stroke3.imputed, x = T, y = T, surv=T) #Proportional Hazards Assumption #cox.zph(coxm) ddist <- datadist(stroke3.imputed) options(datadist='ddist') #Cox model nomogram plotting med <- Quantile(coxm) surv <- Survival(coxm) # Establish survival function plot(nomogram(coxm, fun=list(function(x) surv(12, x), function(x) surv(12*2, x), function(x) surv(12*3, x)), funlabel=c("1-year Survival Probability", "2-year Survival Probability", "3-year Survival Probability"))) #Readmission Death Outcome: Too Few Cases length(unique(stroke3.imputed[status==1]$new_id)) #112 re <- stroke2[paste(new_id, date.in) %in% paste(recur.new$new_id, recur.new$date.in.again)] re[, nn:=1:.N, by="new_id"][nn>1] re <- re[nn==1][, nn:=NULL] re[, status:=ifelse(`Outcome (Survived 1/Dead 2)`=="1", 0, 1)] table(re$status) re[, time:=as.numeric(difftime(date.out, date.in, units = "days"))] #/365.25*12 summary(re$time)