clinical <- read.delim('nom1.txt') index_X <- which(clinical$status == 1) index_Y <- which(clinical$status == 0) # 从这些索引中随机选择10个 random_X <- sample(index_X, 10) random_Y <- sample(index_Y, 10) # 把这20个随机选择的索引合并 to_delete <- c(random_X, random_Y) # 删除这些行 clinical <- clinical[-to_delete, ] table(clinical$status) library(tidyverse) set.seed(456) # 为了可复现性 clinical[,3:26] = lapply(clinical[, 3:26], as.character) # 根据3:7的比例随机分配行 index_30pct <- sample(1:nrow(clinical), size = round(0.3 * nrow(clinical ))) subset_30pct <- clinical [index_30pct, ] subset_70pct <- clinical [-index_30pct, ] results <- list() # 用于存储每个变量的模型结果 for (var in names(subset_70pct)[-c(1:2)]) { # 不包括响应变量 formula <- as.formula(paste("status ~", var)) model <- glm(formula, data = subset_70pct, family = binomial) results[[var]] <- summary(model) } results[["Profession"]]$coefficients lapply(results, class) summary_list <- lapply(results, function(x) { coef_summary <- x$coefficients if (is.matrix(coef_summary) && nrow(coef_summary) > 1) { data.frame( Estimate = coef_summary[, "Estimate"], Std.Error = coef_summary[, "Std. Error"], z_value = coef_summary[, "z value"], Pr = coef_summary[, "Pr(>|z|)"] ) } else { return(NULL) } }) # 过滤出NULL的结果 summary_list <- Filter(Negate(is.null), summary_list) # 合并结果 summary_table <- do.call(rbind, summary_list) summary_table$OR = exp(summary_table$Estimate) summary_table$lci95 = exp(summary_table$Estimate - summary_table$Std.Error*1.96) summary_table$uci95 = exp(summary_table$Estimate + summary_table$Std.Error*1.96) summary_table = summary_table %>% mutate(significance = case_when( Pr < 0.05 ~ '*', Pr>0.05 ~ 'ns' )) summary_table[c(5:7)] <- lapply(summary_table[c(5:7)], round, 2) summary_table$OR1 = paste0(summary_table$OR, '(', summary_table$lci95, '-', summary_table$uci95, ')') write.csv(summary_table, 'univariat_logistic.csv') ######################lasso lasso_dat = subset_70pct %>% select(1:3,5,8,16,18:23) #install.packages("glmnet") BiocManager::install('glmnet') library(glmnet) library(glmnet) lasso_dat[,3:12] = lapply(lasso_dat[,3:12], as.numeric) v1<-as.matrix(lasso_dat[,c(3:12)]) v2 <-lasso_dat[,2] mod <- glmnet(v1, v2, family = "binomial") plot(mod, xvar = "lambda", label = TRUE) cvmod <- cv.glmnet(v1, v2, family="binomial") plot(cvmod) cvmod$lambda.min coe <- coef(mod, s = cvmod$lambda.1se) act_index <- which(coe != 0) act_coe <- coe[act_index] row.names(coe)[act_index] library(broom) # 提取数据,就是这么简单! tidy_df <- broom::tidy(mod) tidy_cvdf <- broom::tidy(cvmod) head(tidy_df) head(tidy_cvdf) library(ggplot2) library(RColorBrewer) ?brewer.pal tidy_df$term tidy_df=tidy_df%>% filter(term != '(Intercept)') #随便定义几个颜色,多找几个,防止不够用 mypalette <- c(brewer.pal(11,"BrBG"),brewer.pal(5,"Spectral"))#,brewer.pal(8,"Accent"),brewer.pal(11,"RdYlGn"),brewer.pal(7,"RdYlBu"),brewer.pal(5,"RdGy"))#,brewer.pal(11,"RdBu"),brewer.pal(11,"PuOr")) ggplot(tidy_df, aes(step, estimate, group = term,color=term)) + geom_line(size=1.2)+ geom_hline(yintercept = 0)+ ylab("Coefficients")+ scale_color_manual(name="variable",values = mypalette)+ theme_bw() p2 <- ggplot(tidy_df, aes(lambda, estimate, group = term, color = term)) + geom_line(size=1.2)+ geom_hline(yintercept = 0)+ scale_x_log10(name = "Log Lambda")+ ylab("Coefficients")+ scale_color_manual(name="variable",values = mypalette)+ theme_bw() p2 p3 <- ggplot()+ geom_point(data=tidy_cvdf, aes(lambda,estimate))+ geom_errorbar(data = tidy_cvdf, aes(x=lambda,ymin=conf.low,ymax=conf.high))+ scale_x_log10(name = "Log Lambda")+ ylab("Coefficients")+ theme_bw() p3 library(patchwork) p2 / p3 coef=coef(mod, s = cvmod$lambda.min) index=which(coef != 0) actCoef=coef[index] lassoGene=row.names(coef)[index] geneCoef=cbind(Gene=lassoGene,Coef=actCoef) write.table(geneCoef,file="geneCoef2.txt",sep="\t",quote=F,row.names=F) ##############nomogram library(regplot) library(rms) library(rmda) #转为因子 lasso_dat$gender<-factor(lasso_dat$gender,labels=c("0","1")) lasso_dat$Rankin<-factor(lasso_dat$Rankin,labels=c("0","1","2")) lasso_dat$BI<-factor(lasso_dat$BI,labels=c("0","1","2","3")) lasso_dat$history_of_hyperlipidemia<-factor(lasso_dat$history_of_hyperlipidemia,labels=c("0","1","2","3")) lasso_dat$Time_from_onset_to_hospitalization<-factor(lasso_dat$Time_from_onset_to_hospitalization,labels=c("0","1","2")) lasso_dat$cg02550950<-factor(lasso_dat$cg02550950,labels=c("0","1","2")) lasso_dat$NIHSS<-factor(lasso_dat$NIHSS,labels=c("0","1","2")) lasso_dat$Nature_of_the_work<-factor(lasso_dat$Nature_of_the_work,labels=c("0","1","2")) lasso_dat$cultural_level<-factor(lasso_dat$cultural_level,labels=c("0","1","2")) lasso_dat$location<-factor(lasso_dat$location,labels=c("0","1","2")) ddist <- datadist(lasso_dat) options(datadist="ddist") mylog<- glm(status~ history_of_hyperlipidemia + NIHSS + Time_from_onset_to_hospitalization + gender +Rankin + cg02550950 + location,family=binomial(link = "logit"),data = lasso_dat) summary(mylog) coefficients(mylog) exp(coefficients(mylog)) exp(confint(mylog)) library(tidyverse) mylog <-glm(status ~ history_of_hyperlipidemia + NIHSS + Time_from_onset_to_hospitalization + cultural_level + gender +Rankin + BI + cg02550950 + Nature_of_the_work + location, data =lasso_dat,family=binomial(link = "logit")) coef_summary = summary(mylog) coefficients <- coef_summary$coefficients data <- data.frame( Term = rownames(coefficients), Estimate = coefficients[, "Estimate"], StdError = coefficients[, "Std. Error"], zValue = coefficients[, "z value"], Pr = coefficients[, "Pr(>|z|)"] ) summary_table = data summary_table$OR = exp(summary_table$Estimate) summary_table$lci95 = summary_table$Estimate - summary_table$StdError*1.96 summary_table$uci95 = summary_table$Estimate + summary_table$StdError*1.96 summary_table = summary_table %>% mutate(significance = case_when( Pr < 0.05 ~ '*', Pr>0.05 ~ 'ns' )) summary_table[c(2,7:8)] <- lapply(summary_table[c(2,7:8)], round, 2) summary_table$OR1 = paste0(summary_table$Estimate, '(', summary_table$lci95, ', ', summary_table$uci95, ')') write.csv(summary_table, 'multivariat_logistic.csv') #status中1代表活着,2代表死了. 默认数值较小者为结局,表明该模型以生存为结局事件 mylog summary(mylog) coefficients(mylog) exp(coefficients(mylog)) exp(confint(mylog)) #绘制列线图, #因子化分类变量每个协变量用方框表示,数值型变量以连续性坐标轴表示。 nom1<-regplot(mylog, clickable=TRUE, points=TRUE, rank="sd",prfail = T) #指定标记的样本行 nom2<-regplot(mylog,observation=lasso_dat[36,], clickable=TRUE, points=TRUE, rank="sd",droplines=T,prfail = T, other=(list(bvcol="red",sq="green",obscol="blue"))) patients = subset(subset_70pct, id == 46) ##############Cindex library(Hmisc) Cindex <- rcorrcens(lasso_dat$status~predict(mylog)) Cindex 0.963- (0.038/sqrt(62))*1.96 ##############bootstrat mylog<-lrm(status~ history_of_hyperlipidemia + NIHSS + Time_from_onset_to_hospitalization + cultural_level + gender +Rankin + BI + cg02550950 + Nature_of_the_work + location,data=lasso_dat,x=T,y=T) set.seed(300) myc<-validate(mylog,method="b",B = 1000,pr=T,dxy=T) c_index<-(myc[1,5]+1)/2 c_index ##############Calibration mycal<-calibrate(mylog,method="boot",B=1000) pdf("Calibration_2.pdf") plot(mycal,xlab="Nomogram-predicted probability of PSD",ylab="Actual diagnosed PSD (proportion)",sub=T) dev.off() library(rms) library(rmda) modul1<- decision_curve(status~ history_of_hyperlipidemia + NIHSS + Time_from_onset_to_hospitalization + cultural_level + gender +Rankin + BI + cg02550950 + Nature_of_the_work + location,data=lasso_dat, family = binomial(link ='logit'), thresholds= seq(0,1, by = 0.01), confidence.intervals = 0.95) modul2<- decision_curve(status~history_of_hyperlipidemia + NIHSS + Time_from_onset_to_hospitalization + cultural_level + gender +Rankin + BI + Nature_of_the_work + location,data=lasso_dat, family = binomial(link ='logit'), thresholds= seq(0,1, by = 0.01), confidence.intervals = 0.95) modul3<- decision_curve(status~ cg02550950,data=lasso_dat, family = binomial(link ='logit'), thresholds= seq(0,1, by = 0.01), confidence.intervals = 0.95) plot_decision_curve(list(modul1,modul2,modul3), curve.names= c("clinical + cg02550950","only clinical", "only cg02550950"), xlab="Threshold probability", cost.benefit.axis =FALSE,col=c( "#8c6bb1","#8c0000","#ffcc80"), confidence.intervals=FALSE, standardize = FALSE) library(ROCR) pre_rate<-predict(mylog) ROC1<- prediction(pre_rate,lasso_dat$status) ROC2<- performance(ROC1,"tpr","fpr") AUC <- performance(ROC1,"auc") AUC = 0.913 plot(ROC2,col="gray", xlab="False positive rate",ylab="True positive rate",lty=1,lwd=3,main=paste("AUC=",AUC)) abline(0,1,lty=2,lwd=3) ###############################validation validation = subset_30pct %>% select(1:3,5,8,16,18:23) #转为因子 validation$gender<-factor(validation$gender,labels=c("0","1")) validation$Rankin<-factor(validation$Rankin,labels=c("0","1","2")) validation$BI<-factor(validation$BI,labels=c("0","1","2","3")) validation$history_of_hyperlipidemia<-factor(validation$history_of_hyperlipidemia,labels=c("0","1","2","3")) validation$Time_from_onset_to_hospitalization<-factor(validation$Time_from_onset_to_hospitalization,labels=c("0","1","2")) validation$cg02550950<-factor(validation$cg02550950,labels=c("0","1","2")) validation$NIHSS<-factor(validation$NIHSS,labels=c("0","1","2")) validation$Nature_of_the_work<-factor(validation$Nature_of_the_work,labels=c("0","1","2")) validation$cultural_level<-factor(validation$cultural_level,labels=c("0","1","2")) validation$location<-factor(validation$location,labels=c("0","1","2")) ddist <- datadist(validation) options(datadist="ddist") mylog<- glm(status~ history_of_hyperlipidemia + NIHSS + Time_from_onset_to_hospitalization + gender +Rankin + cg02550950 + location,family=binomial(link = "logit"),data = validation) summary(mylog) ##############Cindex library(Hmisc) Cindex_validation <- rcorrcens(validation$status~predict(mylog)) Cindex_validation ##############Calibration mylog<-lrm(status~ history_of_hyperlipidemia + NIHSS + Time_from_onset_to_hospitalization + cultural_level + gender +Rankin + BI + cg02550950 + Nature_of_the_work + location,data=validation,x=T,y=T) mycal<-calibrate(mylog,method="boot",B=1000) plot(mycal,xlab="Nomogram-predicted probability of PSD",ylab="Actual diagnosed PSD (proportion)",sub=T) library(rms) library(rmda) modul1<- decision_curve(status~ history_of_hyperlipidemia + NIHSS + Time_from_onset_to_hospitalization + cultural_level + gender +Rankin + BI + cg02550950 + Nature_of_the_work + location,data=validation, family = binomial(link ='logit'), thresholds= seq(0,1, by = 0.01), confidence.intervals = 0.95) modul2<- decision_curve(status~history_of_hyperlipidemia + NIHSS + Time_from_onset_to_hospitalization + cultural_level + gender +Rankin + BI + Nature_of_the_work + location,data=validation, family = binomial(link ='logit'), thresholds= seq(0,1, by = 0.01), confidence.intervals = 0.95) modul3<- decision_curve(status~ cg02550950,data=validation, family = binomial(link ='logit'), thresholds= seq(0,1, by = 0.01), confidence.intervals = 0.95) plot_decision_curve(list(modul1,modul2,modul3), curve.names= c("clinical + cg02550950","only clinical", "only cg02550950"), xlab="Threshold probability", cost.benefit.axis =FALSE,col=c( "#8c6bb1","#8c0000","#ffcc80"), confidence.intervals=FALSE, standardize = FALSE) library(ROCR) pre_rate<-predict(mylog) ROC1<- prediction(pre_rate,validation$status) ROC2<- performance(ROC1,"tpr","fpr") AUC <- performance(ROC1,"auc") AUC = 0.953 plot(ROC2,col="gray", xlab="False positive rate",ylab="True positive rate",lty=1,lwd=3,main=paste("AUC=",AUC)) abline(0,1,lty=2,lwd=3)