--- title: "Model_construction" author: "SL" date: "2023/10/18" output: html_document --- ```{r} rm(list = ls()) gc() getwd() ``` ```{r} library(openxlsx) library(data.table) library(tidyverse) library(dplyr) library(plyr) library(survival) library(survminer) ``` # data processing-TCGA autophagy genes ```{r} Autophagy <- fread("Autophagy_genes/Autophagy.txt") ``` ```{r} exp <- fread("TCGA/TCGA_exp.txt",data.table = F) head(exp) exp <- exp %>% dplyr::select(Patients_ID,Autophagy$Symbol) head(exp) ``` ```{r} clinical <- fread("TCGA/TCGA_clinical.txt") head(clinical) clinical <- clinical %>% dplyr::select(Patients_ID,PFI,PFI.time) head(clinical) ``` merge PFI data and autophagy genes ```{r} data <- clinical %>% left_join(exp) head(data) ``` # Unicox regression ```{r} colnames(data) <- gsub("-","_",colnames(data)) # 2. Survival analysis y<- Surv(time = data$PFI.time,event = data$PFI==1) variable.names<- dput(names(data[,4:ncol(data)])) univ_formulas <- sapply(variable.names, function(x) as.formula(paste('y~', x))) univ_models <- lapply( univ_formulas, function(x){coxph(x, data = data)}) univ_results <- lapply(univ_models, function(x){ x <- summary(x) p.value<-signif(x$wald["pvalue"], digits=2) HR <-signif(x$coef[2], digits=2);#exp(beta) HR.confint.lower <- signif(x$conf.int[,"lower .95"],2) HR.confint.upper <- signif(x$conf.int[,"upper .95"],2) HR_CI <- paste0(HR, " (", HR.confint.lower, "-", HR.confint.upper, ")") res<-c(HR_CI,HR,HR.confint.lower,HR.confint.upper,p.value) names(res)<-c("HR (95%CI)", "HR","HR.confint.lower","HR.confint.upper","p.value") return(res) }) res <- t(as.data.frame(univ_results, check.names = FALSE)) res <- res %>% as.data.frame() %>% na.omit() res$HR <- as.numeric(res$HR) res$p.value <- as.numeric(res$p.value) res <- data.frame(Symbol=rownames(res),res) fwrite(res,"TCGA/Unicox.csv") head(res) ``` Genes with p<0.05 ```{r} sig_res <- res %>% filter(p.value<0.05) dim(sig_res) sig_res ``` # LASSO regression ```{r} library(glmnet) ``` select genes for Lasso regression ```{r} head(data) data_lasso <- data %>% dplyr::select(Patients_ID,PFI,PFI.time,rownames(sig_res)) head(data_lasso) ``` Lasso regression ```{r} # data processing data_lasso$PFI.time <- as.double(data_lasso$PFI.time) data_lasso$PFI <- as.double(data_lasso$PFI) y <- data.matrix(Surv(time = data_lasso$PFI.time, event = data_lasso$PFI==1)) x <- data_lasso %>% dplyr::select(-c(PFI.time,PFI)) %>% as.matrix() # LASSO regression fit = glmnet(x, y, data=data_lasso, family = "cox", alpha=1, nlambda = 50, standardize = T) plot(fit) # cross validation cvfit = cv.glmnet(x, y, family = "cox", alpha=1, nlambda = 50, standardize = T) pdf("TCGA/LASSO_2.pdf") plot(cvfit) # results coef.min = coef(cvfit, s = "lambda.min") coef.min active.min = which(coef.min != 0) active.min index.min = coef.min[active.min] index.min sig_gene_multi_cox <- row.names(coef.min)[active.min] sig_gene_multi_cox choose_gene <- data.frame(Gene=sig_gene_multi_cox, Coef=index.min) choose_gene <- arrange(choose_gene,Coef) head(choose_gene) choose_gene <- choose_gene[-1,] choose_gene$Role <- ifelse(choose_gene$Coef>0,"Risk","Protective") choose_gene ``` output results ```{r} fwrite(choose_gene,"TCGA/LASSO_coef.csv") ``` Coefficients of LASSO regression ```{r} choose_gene <- fread("TCGA/LASSO_coef.csv",data.table = F) choose_gene rownames(choose_gene) <- choose_gene$Gene coef <- choose_gene %>% dplyr::select(Coef) coef ``` expression matrix for model construction ```{r} data <- fread("TCGA/TCGA_exp.txt",data.table = F) head(data) ``` genes for model construction ```{r} variables <- data %>% dplyr::select(choose_gene$Gene) head(variables) names(variables) ``` patients for model construction ```{r} data_train_prediction <- data %>% dplyr::select(Patients_ID) head(data_train_prediction) ``` AutophagyScores ```{r} for (i in 1:nrow(coef)) { data_train_prediction <- data_train_prediction %>% mutate(y=variables[,i]*coef[i,]) colnames(data_train_prediction)[i+1] <- rownames(coef)[i] } head(data_train_prediction) ``` ```{r} data_train_prediction$prediction <- rowSums(data_train_prediction[,2:ncol(data_train_prediction)]) data_train_prediction <- data_train_prediction %>% dplyr::select(Patients_ID,prediction) head(data_train_prediction) colnames(data_train_prediction)[2] <- "AutophagyScores" fwrite(data_train_prediction,"TCGA/AutophagyScores.csv") ``` merge data ```{r} clinical <- fread("TCGA/TCGA_clinical.txt",data.table = F) data_merge <- data_train_prediction %>% left_join(clinical) head(data_merge) ``` # model validation ROC curve ```{r} library(timeROC) ``` ```{r} cutoff <- 365 time_roc_res <- timeROC( T = data_merge$PFI.time, delta = data_merge$PFI, marker = data_merge$AutophagyScores, cause = 1, weighting="marginal", times = c(1 * 365, 3 * 365, 5*365), ROC = TRUE, iid = TRUE ) time_roc_res$AUC confint(time_roc_res, level = 0.95)$CI_AUC plot(time_roc_res, time=1 * 365, col = "red", title = FALSE) plot(time_roc_res, time=3 * 365, add=TRUE, col="blue") plot(time_roc_res, time=5 * 365, add=TRUE, col="green") legend("bottomright",c("1 Years" ,"3 Years", "5 Years"), col=c("red", "blue", "green"), lty=1, lwd=2) time_ROC_df <- data.frame( TP_1year = time_roc_res$TP[, 1], FP_1year = time_roc_res$FP[, 1], TP_3year = time_roc_res$TP[, 2], FP_3year = time_roc_res$FP[, 2], TP_5year = time_roc_res$TP[, 3], FP_5year = time_roc_res$FP[, 3] ) p <- ggplot(data = time_ROC_df) + geom_line(aes(x = FP_1year, y = TP_1year), linewidth = 1, color = "#BC3C29FF") + geom_line(aes(x = FP_3year, y = TP_3year), linewidth = 1, color = "#0072B5FF") + geom_line(aes(x = FP_5year, y = TP_5year), linewidth = 1, color = "#E18727FF") + geom_abline(slope = 1, intercept = 0, color = "grey", size = 1, linetype = 2) + theme_bw() + annotate("text", x = 0.75, y = 0.25, size = 4.5, label = paste0("AUC at 1 years = ", sprintf("%.3f", time_roc_res$AUC[[1]])), color = "#BC3C29FF" ) + annotate("text", x = 0.75, y = 0.15, size = 4.5, label = paste0("AUC at 3 years = ", sprintf("%.3f", time_roc_res$AUC[[2]])), color = "#0072B5FF" ) + annotate("text", x = 0.75, y = 0.05, size = 4.5, label = paste0("AUC at 5 years = ", sprintf("%.3f", time_roc_res$AUC[[3]])), color = "#E18727FF" ) + labs(x = "False positive rate", y = "True positive rate") + theme( axis.text = element_text(face = "bold", size = 11, color = "black"), axis.title.x = element_text(face = "bold", size = 14, color = "black", margin = margin(c(15, 0, 0, 0))), axis.title.y = element_text(face = "bold", size = 14, color = "black", margin = margin(c(0, 15, 0, 0))) )+ ggtitle("") plot(p) ggsave("TCGA/ROC.pdf",p,width=6,height = 6) ```