clinical <- read.delim('nom1.txt')
index_X <- which(clinical$status == 1)
index_Y <- which(clinical$status == 0)

# 从这些索引中随机选择10个
random_X <- sample(index_X, 10)
random_Y <- sample(index_Y, 10)

# 把这20个随机选择的索引合并
to_delete <- c(random_X, random_Y)

# 删除这些行
clinical <- clinical[-to_delete, ]

table(clinical$status)
library(tidyverse)
set.seed(456) # 为了可复现性
clinical[,3:26] = lapply(clinical[, 3:26], as.character)
  

# 根据3:7的比例随机分配行
index_30pct <- sample(1:nrow(clinical), size = round(0.3 * nrow(clinical )))

subset_30pct <- clinical [index_30pct, ]
subset_70pct <- clinical [-index_30pct, ]

results <- list()  # 用于存储每个变量的模型结果

for (var in names(subset_70pct)[-c(1:2)]) {  # 不包括响应变量
  formula <- as.formula(paste("status ~", var))
  model <- glm(formula, data = subset_70pct, family = binomial)
  results[[var]] <- summary(model)
}
results[["Profession"]]$coefficients

lapply(results, class)

summary_list <- lapply(results, function(x) {
  coef_summary <- x$coefficients
  
  if (is.matrix(coef_summary) && nrow(coef_summary) > 1) {
    data.frame(
      Estimate = coef_summary[, "Estimate"],
      Std.Error = coef_summary[, "Std. Error"],
      z_value = coef_summary[, "z value"],
      Pr = coef_summary[, "Pr(>|z|)"]
    )
  } else {
    return(NULL)
  }
})

# 过滤出NULL的结果
summary_list <- Filter(Negate(is.null), summary_list)

# 合并结果
summary_table <- do.call(rbind, summary_list)

summary_table$OR = exp(summary_table$Estimate)
summary_table$lci95 = exp(summary_table$Estimate - summary_table$Std.Error*1.96)
summary_table$uci95 = exp(summary_table$Estimate + summary_table$Std.Error*1.96)
summary_table = summary_table %>%
  mutate(significance = 
           case_when(
             Pr < 0.05 ~ '*',
             Pr>0.05 ~ 'ns'
           ))

summary_table[c(5:7)] <- lapply(summary_table[c(5:7)], round, 2)
summary_table$OR1 = paste0(summary_table$OR, '(', summary_table$lci95, '-', summary_table$uci95, ')')
write.csv(summary_table, 'univariat_logistic.csv')





######################lasso
lasso_dat = subset_70pct %>%
  select(1:3,5,8,16,18:23)


#install.packages("glmnet")
BiocManager::install('glmnet')
library(glmnet)


library(glmnet)
lasso_dat[,3:12] = lapply(lasso_dat[,3:12], as.numeric)
v1<-as.matrix(lasso_dat[,c(3:12)])
v2 <-lasso_dat[,2]
mod <- glmnet(v1, v2, family = "binomial")
plot(mod, xvar = "lambda", label = TRUE)



cvmod <- cv.glmnet(v1, v2, family="binomial")
plot(cvmod)

cvmod$lambda.min
coe <- coef(mod, s = cvmod$lambda.1se)
act_index <- which(coe != 0)
act_coe <- coe[act_index]
row.names(coe)[act_index]

library(broom)

# 提取数据，就是这么简单！
tidy_df <- broom::tidy(mod)
tidy_cvdf <- broom::tidy(cvmod)
head(tidy_df)
head(tidy_cvdf)
library(ggplot2)
library(RColorBrewer)
?brewer.pal
tidy_df$term
tidy_df=tidy_df%>%
  filter(term != '(Intercept)')

#随便定义几个颜色，多找几个，防止不够用
mypalette <- c(brewer.pal(11,"BrBG"),brewer.pal(5,"Spectral"))#,brewer.pal(8,"Accent"),brewer.pal(11,"RdYlGn"),brewer.pal(7,"RdYlBu"),brewer.pal(5,"RdGy"))#,brewer.pal(11,"RdBu"),brewer.pal(11,"PuOr"))

ggplot(tidy_df, aes(step, estimate, group = term,color=term)) +
  geom_line(size=1.2)+
  geom_hline(yintercept = 0)+
  ylab("Coefficients")+
  scale_color_manual(name="variable",values = mypalette)+
  theme_bw()

p2 <- ggplot(tidy_df, aes(lambda, estimate, group = term, color = term)) +
  geom_line(size=1.2)+
  geom_hline(yintercept = 0)+
  scale_x_log10(name = "Log Lambda")+
  ylab("Coefficients")+
  scale_color_manual(name="variable",values = mypalette)+
  theme_bw()
p2  

p3 <- ggplot()+
  geom_point(data=tidy_cvdf, aes(lambda,estimate))+
  geom_errorbar(data = tidy_cvdf, aes(x=lambda,ymin=conf.low,ymax=conf.high))+
  scale_x_log10(name = "Log Lambda")+
  ylab("Coefficients")+
  theme_bw()
p3
library(patchwork)

p2 / p3


coef=coef(mod, s = cvmod$lambda.min)
index=which(coef != 0)
actCoef=coef[index]
lassoGene=row.names(coef)[index]
geneCoef=cbind(Gene=lassoGene,Coef=actCoef)
write.table(geneCoef,file="geneCoef2.txt",sep="\t",quote=F,row.names=F)












##############nomogram
library(regplot)
library(rms)
library(rmda)


#转为因子
lasso_dat$gender<-factor(lasso_dat$gender,labels=c("0","1"))
lasso_dat$Rankin<-factor(lasso_dat$Rankin,labels=c("0","1","2"))
lasso_dat$BI<-factor(lasso_dat$BI,labels=c("0","1","2","3"))
lasso_dat$history_of_hyperlipidemia<-factor(lasso_dat$history_of_hyperlipidemia,labels=c("0","1","2","3"))
lasso_dat$Time_from_onset_to_hospitalization<-factor(lasso_dat$Time_from_onset_to_hospitalization,labels=c("0","1","2"))
lasso_dat$cg02550950<-factor(lasso_dat$cg02550950,labels=c("0","1","2"))
lasso_dat$NIHSS<-factor(lasso_dat$NIHSS,labels=c("0","1","2"))
lasso_dat$Nature_of_the_work<-factor(lasso_dat$Nature_of_the_work,labels=c("0","1","2"))
lasso_dat$cultural_level<-factor(lasso_dat$cultural_level,labels=c("0","1","2"))
lasso_dat$location<-factor(lasso_dat$location,labels=c("0","1","2"))

ddist <- datadist(lasso_dat)
options(datadist="ddist") 
mylog<- glm(status~ history_of_hyperlipidemia +  NIHSS +  Time_from_onset_to_hospitalization + 
            gender +Rankin +  cg02550950 + location,family=binomial(link = "logit"),data = lasso_dat)  


summary(mylog)
coefficients(mylog)
exp(coefficients(mylog))
exp(confint(mylog))

library(tidyverse)
mylog <-glm(status ~ history_of_hyperlipidemia +  NIHSS +  Time_from_onset_to_hospitalization + cultural_level + 
              gender +Rankin + BI + cg02550950 + Nature_of_the_work + location, data =lasso_dat,family=binomial(link = "logit"))
coef_summary = summary(mylog)
coefficients <- coef_summary$coefficients
data <- data.frame(
  Term = rownames(coefficients),
  Estimate = coefficients[, "Estimate"],
  StdError = coefficients[, "Std. Error"],
  zValue = coefficients[, "z value"],
  Pr = coefficients[, "Pr(>|z|)"]
)
summary_table = data
summary_table$OR = exp(summary_table$Estimate)
summary_table$lci95 = summary_table$Estimate - summary_table$StdError*1.96
summary_table$uci95 = summary_table$Estimate + summary_table$StdError*1.96
summary_table = summary_table %>%
  mutate(significance = 
           case_when(
             Pr < 0.05 ~ '*',
             Pr>0.05 ~ 'ns'
           ))

summary_table[c(2,7:8)] <- lapply(summary_table[c(2,7:8)], round, 2)
summary_table$OR1 = paste0(summary_table$Estimate, '(', summary_table$lci95, ', ', summary_table$uci95, ')')
write.csv(summary_table, 'multivariat_logistic.csv')

#status中1代表活着,2代表死了. 默认数值较小者为结局，表明该模型以生存为结局事件
mylog
summary(mylog)
coefficients(mylog)
exp(coefficients(mylog))
exp(confint(mylog))

#绘制列线图, #因子化分类变量每个协变量用方框表示，数值型变量以连续性坐标轴表示。
nom1<-regplot(mylog, clickable=TRUE, 
              points=TRUE, rank="sd",prfail = T)

#指定标记的样本行
nom2<-regplot(mylog,observation=lasso_dat[36,], clickable=TRUE, 
              points=TRUE, rank="sd",droplines=T,prfail = T,
              other=(list(bvcol="red",sq="green",obscol="blue")))


patients = subset(subset_70pct, id == 46)


##############Cindex
library(Hmisc)

Cindex <- rcorrcens(lasso_dat$status~predict(mylog))
Cindex

0.963- (0.038/sqrt(62))*1.96


##############bootstrat
mylog<-lrm(status~ history_of_hyperlipidemia +  NIHSS +  Time_from_onset_to_hospitalization + cultural_level + 
             gender +Rankin + BI + cg02550950 + Nature_of_the_work + location,data=lasso_dat,x=T,y=T)
set.seed(300)
myc<-validate(mylog,method="b",B = 1000,pr=T,dxy=T)
c_index<-(myc[1,5]+1)/2
c_index


##############Calibration
mycal<-calibrate(mylog,method="boot",B=1000)

pdf("Calibration_2.pdf")
plot(mycal,xlab="Nomogram-predicted probability of PSD",ylab="Actual diagnosed PSD (proportion)",sub=T)
dev.off()




library(rms)
library(rmda)


modul1<- decision_curve(status~ history_of_hyperlipidemia +  NIHSS +  Time_from_onset_to_hospitalization + cultural_level + 
                          gender +Rankin + BI + cg02550950 + Nature_of_the_work + location,data=lasso_dat, 
                        family = binomial(link ='logit'),
                        thresholds= seq(0,1, by = 0.01),
                        confidence.intervals = 0.95)

modul2<- decision_curve(status~history_of_hyperlipidemia +  NIHSS +  Time_from_onset_to_hospitalization + cultural_level + 
                          gender +Rankin + BI + Nature_of_the_work + location,data=lasso_dat, 
                        family = binomial(link ='logit'),
                        thresholds= seq(0,1, by = 0.01),
                        confidence.intervals = 0.95)

modul3<- decision_curve(status~ cg02550950,data=lasso_dat,
                        family = binomial(link ='logit'),
                        thresholds= seq(0,1, by = 0.01),
                        confidence.intervals = 0.95)




plot_decision_curve(list(modul1,modul2,modul3),
                    curve.names= c("clinical + cg02550950","only clinical", "only cg02550950"), xlab="Threshold probability",
                    cost.benefit.axis =FALSE,col=c( "#8c6bb1","#8c0000","#ffcc80"),
                    confidence.intervals=FALSE,
                    standardize = FALSE)




library(ROCR)

pre_rate<-predict(mylog)
ROC1<- prediction(pre_rate,lasso_dat$status) 
ROC2<- performance(ROC1,"tpr","fpr")
AUC <- performance(ROC1,"auc")

AUC = 0.913

plot(ROC2,col="gray", xlab="False positive rate",ylab="True positive rate",lty=1,lwd=3,main=paste("AUC=",AUC))
abline(0,1,lty=2,lwd=3)






###############################validation
validation = subset_30pct %>%
  select(1:3,5,8,16,18:23)
#转为因子
validation$gender<-factor(validation$gender,labels=c("0","1"))
validation$Rankin<-factor(validation$Rankin,labels=c("0","1","2"))
validation$BI<-factor(validation$BI,labels=c("0","1","2","3"))
validation$history_of_hyperlipidemia<-factor(validation$history_of_hyperlipidemia,labels=c("0","1","2","3"))
validation$Time_from_onset_to_hospitalization<-factor(validation$Time_from_onset_to_hospitalization,labels=c("0","1","2"))
validation$cg02550950<-factor(validation$cg02550950,labels=c("0","1","2"))
validation$NIHSS<-factor(validation$NIHSS,labels=c("0","1","2"))
validation$Nature_of_the_work<-factor(validation$Nature_of_the_work,labels=c("0","1","2"))
validation$cultural_level<-factor(validation$cultural_level,labels=c("0","1","2"))
validation$location<-factor(validation$location,labels=c("0","1","2"))

ddist <- datadist(validation)
options(datadist="ddist") 
mylog<- glm(status~ history_of_hyperlipidemia +  NIHSS +  Time_from_onset_to_hospitalization + 
              gender +Rankin +  cg02550950 + location,family=binomial(link = "logit"),data = validation)  


summary(mylog)
##############Cindex
library(Hmisc)

Cindex_validation <- rcorrcens(validation$status~predict(mylog))
Cindex_validation


##############Calibration
mylog<-lrm(status~ history_of_hyperlipidemia +  NIHSS +  Time_from_onset_to_hospitalization + cultural_level + 
             gender +Rankin + BI + cg02550950 + Nature_of_the_work + location,data=validation,x=T,y=T)
mycal<-calibrate(mylog,method="boot",B=1000)

plot(mycal,xlab="Nomogram-predicted probability of PSD",ylab="Actual diagnosed PSD (proportion)",sub=T)


library(rms)
library(rmda)


modul1<- decision_curve(status~ history_of_hyperlipidemia +  NIHSS +  Time_from_onset_to_hospitalization + cultural_level + 
                          gender +Rankin + BI + cg02550950 + Nature_of_the_work + location,data=validation, 
                        family = binomial(link ='logit'),
                        thresholds= seq(0,1, by = 0.01),
                        confidence.intervals = 0.95)

modul2<- decision_curve(status~history_of_hyperlipidemia +  NIHSS +  Time_from_onset_to_hospitalization + cultural_level + 
                          gender +Rankin + BI + Nature_of_the_work + location,data=validation, 
                        family = binomial(link ='logit'),
                        thresholds= seq(0,1, by = 0.01),
                        confidence.intervals = 0.95)

modul3<- decision_curve(status~ cg02550950,data=validation,
                        family = binomial(link ='logit'),
                        thresholds= seq(0,1, by = 0.01),
                        confidence.intervals = 0.95)




plot_decision_curve(list(modul1,modul2,modul3),
                    curve.names= c("clinical + cg02550950","only clinical", "only cg02550950"), xlab="Threshold probability",
                    cost.benefit.axis =FALSE,col=c( "#8c6bb1","#8c0000","#ffcc80"),
                    confidence.intervals=FALSE,
                    standardize = FALSE)




library(ROCR)

pre_rate<-predict(mylog)
ROC1<- prediction(pre_rate,validation$status) 
ROC2<- performance(ROC1,"tpr","fpr")
AUC <- performance(ROC1,"auc")

AUC = 0.953

plot(ROC2,col="gray", xlab="False positive rate",ylab="True positive rate",lty=1,lwd=3,main=paste("AUC=",AUC))
abline(0,1,lty=2,lwd=3)