rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./00_rawdata")){ dir.create("./00_rawdata") } setwd("./00_rawdata") library(dplyr) library(tidyr) library(tinyarray) library(tidyverse) library(GEOquery) library(Biobase) library(AnnoProbe) #idmap library(clusterProfiler) library(enrichplot) # GSE20685 setwd("/data/nas3///29_YQ953-11/00_rawdata/") if (!dir.exists("GSE20685")) {dir.create("GSE20685")} setwd("GSE20685") # library(Biobase) library(GEOquery) # # GSE20685 gset<-getGEO("GSE20685", destdir = '.', GSEMatrix = T, getGPL = F) gset$GSE20685_series_matrix.txt.gz@annotation expr<-as.data.frame(exprs(gset[[1]])) gpl<-getGEO("GPL570",destdir = '../../../../pipeline/GPL/') a=gset[[1]] #gpl<-Table(gpl) gpl <- idmap(gpl = 'GPL570') colnames(gpl) library(clusterProfiler) probe2symobl<-gpl # probe2symobl<-gpl %>% # dplyr::select('ID','Symbol')%>% # filter('Symbol'!='')%>% # separate('Symbol',c('symbol','drop'),sep = '///')%>% # dplyr::select(-drop) probe2symobl=probe2symobl[probe2symobl$symbol!='',] colnames(probe2symobl)<-c('ID','symbol') dat.va<-expr dat.va$ID<-rownames(dat.va) dat.va$ID<-as.character(dat.va$ID) probe2symobl$ID<-as.character(probe2symobl$ID) dat.va<-dat.va %>% inner_join(probe2symobl,by='ID')%>% dplyr::select(-ID)%>% ## dplyr::select(symbol,everything())%>% ## mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>% ## arrange(desc(rowMean))%>% ## distinct(symbol,.keep_all = T)%>% ## symbol dplyr::select(-rowMean)%>% ## rowMean tibble::column_to_rownames(colnames(.)[1]) ## ## pd <- pData(a) colnames(pd) # table(pd$source_name_ch1) # pd <- subset(pd,source_name_ch1 == 'Breast tissue, cancer') survival <- data.frame(sample=pd$geo_accession,OS=pd$characteristics_ch1.3,OS.time=pd$characteristics_ch1.4) table(survival$OS) survival$OS <- ifelse(survival$OS=='event_death: 0',0,1) table(survival$OS.time) survival$OS.time <- gsub('follow_up_duration (years): ','',survival$OS.time,fixed = T) survival$OS.time <- as.numeric(survival$OS.time)*365 dat <- dat.va[,survival$sample] write.csv(dat,file = 'dat(GSE20685).csv') write.csv(survival,file = 'survival(GSE20685).csv') # # GSE20685 # if (!dir.exists("GSE42568")) # {dir.create("GSE42568")} # setwd("GSE42568") # # # # GSE42568 # gset<-getGEO("GSE42568", # destdir = '.', # GSEMatrix = T, # getGPL = F) # gset$GSE42568_series_matrix.txt.gz@annotation # expr<-as.data.frame(exprs(gset[[1]])) # gpl<-getGEO("GPL570",destdir = '../../../../pipeline/GPL/') # a=gset[[1]] # #gpl<-Table(gpl) # gpl <- idmap(gpl = 'GPL570') # colnames(gpl) # library(clusterProfiler) # probe2symobl<-gpl # # probe2symobl<-gpl %>% # # dplyr::select('ID','Symbol')%>% # # filter('Symbol'!='')%>% # # separate('Symbol',c('symbol','drop'),sep = '///')%>% # # dplyr::select(-drop) # probe2symobl=probe2symobl[probe2symobl$symbol!='',] # colnames(probe2symobl)<-c('ID','symbol') # dat.va<-expr # dat.va$ID<-rownames(dat.va) # dat.va$ID<-as.character(dat.va$ID) # probe2symobl$ID<-as.character(probe2symobl$ID) # dat.va<-dat.va %>% # inner_join(probe2symobl,by='ID')%>% # dplyr::select(-ID)%>% ## # dplyr::select(symbol,everything())%>% ## # mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>% ## # arrange(desc(rowMean))%>% ## # distinct(symbol,.keep_all = T)%>% ## symbol # dplyr::select(-rowMean)%>% ## rowMean # tibble::column_to_rownames(colnames(.)[1]) ## # # ## # pd <- pData(a) # table(pd$source_name_ch1) # pd <- subset(pd,source_name_ch1 == 'Breast tissue, cancer') # survival <- data.frame(sample=pd$geo_accession,OS=pd$characteristics_ch1.9,OS.time=pd$characteristics_ch1.8) # # # table(survival$OS) # survival$OS <- ifelse(survival$OS=='overall survival event: 0',0,1) # table(survival$OS.time) # survival$OS.time <- gsub('overall survival time_days: ','',survival$OS.time,fixed = T) # #survival$OS.time <- as.numeric(survival$OS.time)*30 # dat <- dat.va[,survival$sample] # # write.csv(dat,file = 'dat(GSE42568).csv') # write.csv(survival,file = 'survival(GSE42568).csv') # # # # GSE7390 # if (!dir.exists("GSE7390")) # {dir.create("GSE7390")} # setwd("GSE7390") # # # # GSE7390 # gset<-getGEO("GSE7390", # destdir = '.', # GSEMatrix = T, # getGPL = F) # gset$GSE7390_series_matrix.txt.gz@annotation # expr<-as.data.frame(exprs(gset[[1]])) # gpl<-getGEO("GPL96",destdir = '../../../../pipeline/GPL/') # a=gset[[1]] # #gpl<-Table(gpl) # gpl <- idmap(gpl = 'GPL96') # colnames(gpl) # library(clusterProfiler) # probe2symobl<-gpl # # probe2symobl<-gpl %>% # # dplyr::select('ID','Symbol')%>% # # filter('Symbol'!='')%>% # # separate('Symbol',c('symbol','drop'),sep = '///')%>% # # dplyr::select(-drop) # probe2symobl=probe2symobl[probe2symobl$symbol!='',] # colnames(probe2symobl)<-c('ID','symbol') # dat.va<-expr # dat.va$ID<-rownames(dat.va) # dat.va$ID<-as.character(dat.va$ID) # probe2symobl$ID<-as.character(probe2symobl$ID) # dat.va<-dat.va %>% # inner_join(probe2symobl,by='ID')%>% # dplyr::select(-ID)%>% ## # dplyr::select(symbol,everything())%>% ## # mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>% ## # arrange(desc(rowMean))%>% ## # distinct(symbol,.keep_all = T)%>% ## symbol # dplyr::select(-rowMean)%>% ## rowMean # tibble::column_to_rownames(colnames(.)[1]) ## # # ## # pd <- pData(a) # table(pd$source_name_ch1) # #pd <- subset(pd,source_name_ch1 == 'Breast tissue, cancer') # survival <- data.frame(sample=pd$geo_accession,OS=pd$characteristics_ch1.16,OS.time=pd$characteristics_ch1.15) # # # table(survival$OS) # survival$OS <- ifelse(survival$OS=='e.os: 0',0,1) # table(survival$OS.time) # survival$OS.time <- gsub('t.os: ','',survival$OS.time,fixed = T) # #survival$OS.time <- as.numeric(survival$OS.time)*30 # dat <- dat.va[,survival$sample] # # write.csv(dat,file = 'dat(GSE7390).csv') # write.csv(survival,file = 'survival(GSE7390).csv') ## GSE58812 #if (!dir.exists("GSE58812")) #{dir.create("GSE58812")} #setwd("GSE58812") # # ## GSE58812 #gset<-getGEO("GSE58812", # destdir = '.', # GSEMatrix = T, # getGPL = F) #gset$GSE58812_series_matrix.txt.gz@annotation #expr<-as.data.frame(exprs(gset[[1]])) #gpl<-getGEO("GPL570",destdir = '../../../../pipeline/GPL/') #a=gset[[1]] ##gpl<-Table(gpl) #gpl <- idmap(gpl = 'GPL570') #colnames(gpl) #library(clusterProfiler) #probe2symobl<-gpl ## probe2symobl<-gpl %>% ## dplyr::select('ID','Symbol')%>% ## filter('Symbol'!='')%>% ## separate('Symbol',c('symbol','drop'),sep = '///')%>% ## dplyr::select(-drop) #probe2symobl=probe2symobl[probe2symobl$symbol!='',] #colnames(probe2symobl)<-c('ID','symbol') #dat.va<-expr #dat.va$ID<-rownames(dat.va) #dat.va$ID<-as.character(dat.va$ID) #probe2symobl$ID<-as.character(probe2symobl$ID) #dat.va<-dat.va %>% # inner_join(probe2symobl,by='ID')%>% # dplyr::select(-ID)%>% ## # dplyr::select(symbol,everything())%>% ## # mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>% ## # arrange(desc(rowMean))%>% ## # distinct(symbol,.keep_all = T)%>% ## symbol # dplyr::select(-rowMean)%>% ## rowMean # tibble::column_to_rownames(colnames(.)[1]) ## # ### #pd <- pData(a) #colnames(pd) #table(pd$source_name_ch1) #pd <- subset(pd,source_name_ch1 == 'Breast tumor biospsie') #survival <- data.frame(sample=pd$geo_accession,OS=pd$characteristics_ch1.4,OS.time=pd$characteristics_ch1.5) # # #table(survival$OS) #survival$OS <- ifelse(survival$OS=='death: 0',0,1) #table(survival$OS.time) #survival$OS.time <- gsub('os (days): ','',survival$OS.time,fixed = T) ##survival$OS.time <- as.numeric(survival$OS.time)*30 #dat <- dat.va[,survival$sample] #dat <- log2(dat+1) #write.csv(dat,file = 'dat(GSE58812).csv') #write.csv(survival,file = 'survival(GSE58812).csv') rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./00_rawdata")){ dir.create("./00_rawdata") } setwd("./00_rawdata") library(readr) library(readxl) library(tidyverse) library(tibble) ## annotation <- read.table(file = './gencode.v36.annotation.gtf.gene.probemap') probe2symbol <- annotation[,(1:2)] colnames(probe2symbol) <- c('ID', 'symbol') probe2symbol <- probe2symbol[-1,] probe2symbol$ID <- as.character(probe2symbol$ID) #count count.log <- read_tsv(file = './TCGA-BRCA.star_counts.tsv.gz') %>% column_to_rownames(., var = 'Ensembl_ID') #count.log <- tinyarray::trans_exp(count.log,mrna_only = T)# count <- 2^count.log-1 #xenalog2+1, dat_count <- count dat_count$ID <- rownames(dat_count) %>% as.character() ##id dat_count <- dat_count %>% inner_join(probe2symbol, by = 'ID') %>% #ID dplyr::select(-ID) %>% #ID, dplyr::select(symbol, everything()) %>% #symbol mutate(rowMean = rowMeans(.[grep('TCGA', names(.))])) %>% #TCGA,mutateDataFrame,。 arrange(desc(rowMean)) %>% # distinct(symbol, .keep_all = T) %>% #symbol,.keep_all dplyr::select(-rowMean) %>% #rowMean tibble::column_to_rownames(colnames(.)[1]) #symbol,column_to_rownamestibble dim(dat_count) ##59427 589 # ## survival <- read_tsv('./TCGA-BRCA.survival.tsv.gz') dat_count <- dat_count[,colnames(dat_count) %in% survival$sample] dim(dat_count) # 59427 576 ## phenotype <- read_tsv('./TCGA-BRCA.clinical.tsv.gz') %>% data.frame(.) colnames(phenotype) # oral_cancer_sites <- c("Base of tongue, NOS", "Cheek mucosa", "Floor of mouth, NOS", # "Gum, NOS", "Hard palate", "Lip, NOS", "Mouth, NOS", # "Overlapping lesion of lip, oral cavity and pharynx", # "Palate, NOS", "Tongue, NOS", "Tonsil, NOS") # # # # phenotype <- phenotype[phenotype$tissue_or_organ_of_origin.diagnoses %in% oral_cancer_sites, ] table(phenotype$sample_type.samples) table(phenotype$sample_id.samples) group <- subset(phenotype, sample_type.samples == 'Primary Tumor' | sample_type.samples == 'Solid Tissue Normal') group <- group[group$sample %in% colnames(dat_count),] group <- group[order(group$sample_type.samples, decreasing = T),] group <- data.frame(sample = group$sample, group = group$sample_type.samples) group$group <- ifelse(group$group == 'Solid Tissue Normal', 'Normal', 'Tumor') group <- data.frame(row.names = group$sample, group = group$group )####Typegroup # dat_count <- subset(dat_count, select=rownames(group)) identical(colnames(dat_count), rownames(group)) dim(group) #574 1 table(group$group) # Normal Tumor # 59 515 write.csv(group, file = '03.group_TCGA.csv', quote=F) ### mRNA library("rtracklayer") gtf_data = import('./gencode.v36.annotation.gtf.gz') %>%as.data.frame() head(gtf_data) table(gtf_data$gene_type) protein_coding=gtf_data%>% dplyr::filter(type=="gene",gene_type=="protein_coding")%>% dplyr::select(gene_id,gene_type,gene_name) mRNA.count<-dat_count[rownames(dat_count)%in%protein_coding$gene_name,] dim(mRNA.count) # 19938 574 identical(colnames(mRNA.count), rownames(group)) write.csv(mRNA.count,file = '01.expr_count_TCGA.csv', quote = F, row.names = T) ##tpm fpkm.log <- read_tsv(file = './TCGA-BRCA.star_tpm.tsv.gz') %>% as.data.frame() %>% column_to_rownames(var = 'Ensembl_ID') fpkm <- 2^fpkm.log-1 #xenalog2+1, dat_fpkm <- fpkm dat_fpkm$ID <- rownames(dat_fpkm) dat_fpkm$ID <- as.character(dat_fpkm$ID) ##id dat_fpkm <- dat_fpkm %>% inner_join(probe2symbol, by = 'ID') %>% #ID dplyr::select(-ID) %>% #ID, dplyr::select(symbol, everything()) %>% #symbol mutate(rowMean = rowMeans(.[grep('TCGA', names(.))])) %>% #,mutateDataFrame,。 arrange(desc(rowMean)) %>% # distinct(symbol, .keep_all = T) %>% #symbol,.keep_all dplyr::select(-rowMean) %>% #rowMean tibble::column_to_rownames(colnames(.)[1]) #,column_to_rownamestibble dim(dat_fpkm) ## 59427 589 dat_fpkm <- dat_fpkm[,colnames(mRNA.count)] dat_fpkm <- dat_fpkm[rownames(mRNA.count),] dat_fpkm <- na.omit(dat_fpkm) identical(colnames(dat_fpkm), rownames(group)) identical(colnames(dat_fpkm), colnames(mRNA.count)) identical(rownames(dat_fpkm), rownames(mRNA.count)) dim(dat_fpkm) # 19938 574 dat_fpkm_log2 <- log2(dat_fpkm + 1) write.csv(dat_fpkm_log2, file = '02.expr_tpm_TCGA.csv', row.names = T, quote = F) # survival <- read_tsv(file = './TCGA-BRCA.survival.tsv.gz') ## group1 <- read.csv("./03.group_TCGA.csv") table(group1$group) group2 <- group1[group1$group == 'Tumor',] survival <- survival[survival$sample %in% group2$X,] survival$`_PATIENT` survival <- survival %>% select(-`_PATIENT`) dim(survival) # 515 3 write.csv(survival, file = '04.survival_TCGA.csv', quote=F) #phe- phe1 <- read.delim2("./TCGA-BRCA.clinical.tsv.gz", sep = "\t") colnames(phe1) table(phe1$race.demographic) cli_dat1 <- cbind(sample = phe1$sample, gender = phe1$gender.demographic, age = phe1$age_at_index.demographic, histological=phe1$neoplasm_histologic_grade, #smoking=phe1$tobacco_smoking_history, #race=phe1$race.demographic, stage = phe1$ajcc_pathologic_stage.diagnoses, T_stage = phe1$ajcc_pathologic_t.diagnoses, N_stage = phe1$ajcc_pathologic_n.diagnoses, M_stage = phe1$ajcc_pathologic_m.diagnoses ) cli_dat <- as.data.frame(cli_dat1) #table(cli_dat$M_stage)# cli_dat[cli_dat == ""] <- NA cli_dat <- na.omit(cli_dat) #cli_dat$age <- as.numeric(cli_dat$age) / 365 write.csv(cli_dat,file = "05.phenotype_TCGA.csv",quote=F,row.names = F) rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./01_DEG_TCGA")){ dir.create("./01_DEG_TCGA") } setwd("./01_DEG_TCGA") #DEG1-TCGA----------------------------------------------------------------------------------------- library(tidyverse) library(lance) library(DESeq2) dat <- read.csv('../00_rawdata/01.expr_count_TCGA.csv',row.names = 1,check.names = F) dat <- round(dat,0) group <- read.csv('../00_rawdata/03.group_TCGA.csv') colnames(group) <- c('sample','group') table(group$group) # Normal Tumor # 32 375 colnames(group) condition = factor(group$group) coldata <- data.frame(row.names = colnames(dat), condition) dds <- DESeqDataSetFromMatrix(countData = dat,colData=coldata,design = ~condition) dds = dds[rowSums(counts(dds)) > 1,] dds <- estimateSizeFactors(dds) # normalized_counts <- counts(dds,normalized=T) #write.table(normalized_counts,file = 'normalized.counts.csv',sep = ',',row.names = T,quote = F) # dds <- DESeq(dds) # res = results(dds, contrast = c("condition","Tumor","Normal")) res = res[order(res$pvalue),] head(res) summary(res) table(res$pvalue<0.05) DEG <- as.data.frame(res) DEG <- na.omit(DEG) dim(DEG) head(DEG) ## change logFC_cutoff <- 1 DEG$change = as.factor( ifelse(DEG$pvalue<0.05&abs(DEG$log2FoldChange)>logFC_cutoff, ifelse(DEG$log2FoldChange>logFC_cutoff,'UP','DOWN'),'NOT')) table(DEG$change) sig_diff <- subset(DEG, DEG$pvalue < 0.05 & abs(DEG$log2FoldChange) >= logFC_cutoff) table(sig_diff$change) #1 DOWN NOT UP # 1982 0 3095 #0.5 DOWN NOT UP # 3600 0 5977 write.csv(DEG,file = "01.DEG1_all.csv",quote = F,row.names = T) write.csv(sig_diff,file = "02.DEG1_sig1.csv",quote = F,row.names = T) saveRDS(res,file = 'res.rds') #----- library(dplyr) library(ggfun) library(grid) library(ggplot2) library(ggthemes) library(Ipaper) library(scales) library(ggrepel) library(pheatmap) library(RColorBrewer) library(gplots) library(tidyverse) library(lance) dat_rep<-DEG[rownames(DEG)%in% rownames(rbind(head(sig_diff[order(sig_diff$log2FoldChange,decreasing = T),],10), head(sig_diff[order(sig_diff$log2FoldChange,decreasing = F),],10))),] volcano_plot<- ggplot(data = DEG, aes(x = log2FoldChange, y = -log10(pvalue), color =change)) + scale_color_manual(values = c("#29AEB1", "darkgray","#B72230")) + # scale_x_continuous(breaks = c(-10,-6,-3,-1,0,1,3,6,10)) + #scale_y_continuous(#trans = "log1p",breaks = c(0,50,100,150,200,500)) + geom_point(size = 2.4, alpha = 0.4, na.rm=T) + theme_bw(base_size = 12, base_family = "Times") + geom_vline(xintercept = c(-1,1),#--------------------------------------------#########FC####### lty = 4, col = "gray40", lwd = 0.6)+ geom_hline(yintercept = -log10(0.05), lty = 4, col = "gray40", lwd = 0.6)+ theme(legend.position = "right", panel.grid = element_blank(), legend.title = element_blank(), legend.text = element_text(face="bold", color="black", family = "Times", size=13), plot.title = element_text(hjust = 0.5), axis.text.x = element_text(face = "bold", color = "black", size = 15), axis.text.y = element_text(face = "bold", color = "black", size = 15), axis.title.x = element_text(face = "bold", color = "black", size = 15), axis.title.y = element_text(face = "bold", color = "black", size = 15)) + geom_label_repel( data = dat_rep, aes(label = rownames(dat_rep)), max.overlaps = 1000, size = 4, box.padding = unit(0.5, "lines"), min.segment.length = 0, point.padding = unit(0.8, "lines"), segment.color = "black", show.legend = FALSE )+ labs(x = "log2(Fold Change)", y = "-log10(P.Value)") volcano_plot pdf('03.volcano_2.pdf', w=8,h=6,family='Times') volcano_plot dev.off() png('03.volcano_2.png', w=8,h=6,family='Times', units='in',res=600) volcano_plot dev.off() # ------ library(ComplexHeatmap) library(circlize) dat_rep<-sig_diff[rownames(sig_diff)%in% rownames(rbind(head(sig_diff[order(sig_diff$log2FoldChange,decreasing = T),],10), head(sig_diff[order(sig_diff$log2FoldChange,decreasing = F),],10))),] dat_rep <- dat_rep[order(dat_rep$change,decreasing = T),] rt <- dat rt <- rt[rownames(dat_rep),] group <- group[order(group$group),] table(group$group) group$group <- ifelse(group$group == 'Tumor','BC','Control') rt <- rt[,group$sample] x<-log2(rt+1) mat <- t(scale(t(x)))# mat[mat < (-2)] <- (-2) mat[mat > 2] <- 2 pdf('06.heatmap.pdf', w=6,h=6,family='Times') densityHeatmap(mat ,title = "Distribution as heatmap", ylab = " ",height = unit(3, "cm")) %v% HeatmapAnnotation(Group = group$group, col = list(Group = c("BC" = "#B72230", "Control" = "#104680"))) %v% Heatmap(mat, row_names_gp = gpar(fontsize = 9), show_column_names = F, show_row_names = T, ###show_colnames = FALSE, name = "expression", ###cluster_cols = F, cluster_rows = F, height = unit(6, "cm"), #cluster_columns = FALSE, ###cluster_rows = FALSE, col = colorRampPalette(c("darkgreen", "white","orange"))(100)) dev.off() png('06.heatmap.png',w=6,h=6,units='in',res=600,family='Times') densityHeatmap(mat ,title = "Distribution as heatmap", ylab = " ",height = unit(3, "cm")) %v% HeatmapAnnotation(Group = group$group, col = list(Group = c("BC" = "#B72230", "Control" = "#104680"))) %v% Heatmap(mat, row_names_gp = gpar(fontsize = 9), show_column_names = F, show_row_names = T, ###show_colnames = FALSE, name = "expression", ###cluster_cols = F, cluster_rows = F, height = unit(6, "cm"), #cluster_columns = FALSE, ###cluster_rows = FALSE, col = colorRampPalette(c("darkgreen", "white","orange"))(100)) dev.off() rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./02_intersect")){ dir.create("./02_intersect") } setwd("./02_intersect") gene1 <- read.csv('../01_DEG_TCGA/02.DEG1_sig.csv') #5077 gene2 <- read.csv("../00_rawdata/L-RGs.csv") #179 library(VennDiagram) venn_list<-list(gene1=gene1$X, gene2=gene2$symbol ) venn.plot <-venn.diagram(venn_list, filename = NULL , force.unique = T, # print.mode = c("percent", "raw"), fill = c("#E41A1C","#377EB8"), alpha = 0.6, col = FALSE, cex = 1.2, # cat.fontface = "bold", # cat.col = c("#E41A1C","#377EB8"), # category.names = c('DEGs','LRGs'), # #fontfamily = 'serif', cat.pos = c(-20,20), cat.cex = 1.3, ## cat.fontfamily = 'serif', margin = 0.1, scaled =FALSE ) inter <- get.venn.partitions(venn_list) #venn.plotgrid.drawpdf pdf("01.DEGs_LRGs.pdf") grid.draw(venn.plot) dev.off() venn.diagram(venn_list, filename = '01.DEGs_LRGs.png', imagetype = 'png', fill = c("#E41A1C","#377EB8"), alpha = 0.6, col = FALSE, cat.fontface = "bold", cex = 1.2, force.unique = T, cat.col = c("#E41A1C","#377EB8"), category.names = c('DEGs','LRGs'),# print.mode = c("percent", "raw"), fontfamily = 'serif', cat.pos = c(-20,20), cat.cex = 1.3, #cat.fontfamily = 'serif', margin = 0.1, scaled =FALSE) inter <- get.venn.partitions(venn_list) for (i in 1:nrow(inter)) inter[i,'values'] <- paste(inter[[i,'..values..']], collapse = ', ') #paste;collapse Candidate_genes<-inter[1,]$..values.. $`1` write.csv(Candidate_genes,'01.DEGs_LRGs.csv',quote = F,row.names = F) # ############------------------------ # # rm(list = ls()) # setwd("/data/nas3///13_YQ835-2/") # if (! dir.exists("./02_intersect")){ # dir.create("./02_intersect") # } # setwd("./02_intersect") # # gene1 <- read.csv('../01_DEG_TCGA/02.DEG1_sig.csv') #2152 # gene2 <- read.csv("../00_rawdata/NETs-RGs.csv") #136 # # # # library(VennDiagram) # venn_list<-list(gene1=gene1$X, # gene2=gene2$symbol # # ) # venn.plot <-venn.diagram(venn_list, filename = NULL , # force.unique = T, # # print.mode = c("percent", "raw"), # fill = c("#E41A1C","#377EB8"), alpha = 0.6, # col = FALSE, # cex = 1.2, # # cat.fontface = "bold", # # cat.col = c("#E41A1C","#377EB8"), # # category.names = c('DEGs','NETs-RGs'), # # #fontfamily = 'serif', # #cat.pos = c(20,-20), # cat.cex = 1.3, ## # cat.fontfamily = 'serif', # margin = 0.1, # scaled =FALSE # ) # # inter <- get.venn.partitions(venn_list) # #venn.plotgrid.drawpdf # pdf("02.DEGs_NETs-RGs.pdf") # grid.draw(venn.plot) # dev.off() # # venn.diagram(venn_list, filename = '02.DEGs_NETs-RGs.png', imagetype = 'png', # fill = c("#E41A1C","#377EB8"), alpha = 0.6, # col = FALSE, # cat.fontface = "bold", # cex = 1.2, # force.unique = T, # cat.col = c("#E41A1C","#377EB8"), # category.names = c('DEGs','NETs-RGs'),# # print.mode = c("percent", "raw"), # fontfamily = 'serif', # #cat.pos = c(20,-20), # cat.cex = 1.3, # #cat.fontfamily = 'serif', # margin = 0.1, # scaled =FALSE) # inter <- get.venn.partitions(venn_list) # for (i in 1:nrow(inter)) inter[i,'values'] <- paste(inter[[i,'..values..']], collapse = ', ') # #paste;collapse # Candidate_genes<-inter[1,]$..values.. $`1` # write.csv(Candidate_genes,'02.DEGs_NETs-RGs.csv',quote = F,row.names = F)rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (!dir.exists("03_enrichment")) {dir.create("03_enrichment")} setwd("03_enrichment") .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(clusterProfiler) library(org.Hs.eg.db) library(enrichplot) library(ggplot2) library(GOplot) library(Rgraphviz) library(ggnewscale) library(DOSE) library(dplyr) library(tidyverse) library(data.table) library(ggraph) library(tidygraph) library(stringr) library(ComplexHeatmap) # charset ----------------------------------------------------------------- go.circ = 1 kegg.circ = 1 kegg.buble = 1 #kegg.chord = 1 # load('../00_rawdat/00_temp/enrich.Rdata') # dat prepare ------------------------------------------------------------- enrich_ID <- read.csv("../02_intersect/01.DEGs_LRGs.csv") # enrich_ID <- readRDS("../00_rawdat/00_temp/rsult.sig1.rds")[[20]] select_DERNA <- read.csv("../01_DEG_TCGA/02.DEG1_sig.csv",row.names = 1)# # grep('',colnames(select_DERNA)) if(str_detect(colnames(select_DERNA)[1],'logFC')){ colnames(select_DERNA)[1] <- 'log2FoldChange' } gene_symbol <- clusterProfiler::bitr( geneID = enrich_ID$x, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db") library(clusterProfiler) enrich_go <- enrichGO(gene = gene_symbol[,2], OrgDb = org.Hs.eg.db, keyType = "ENTREZID", ont = "ALL", pAdjustMethod = "fdr", pvalueCutoff = 1, qvalueCutoff = 1, readable = TRUE) enrich_go <- mutate(enrich_go, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio))) GO_ALL <- enrich_go@result if(median(GO_ALL$richFactor) < 0.1){GO_ALL$richFactor<- GO_ALL$richFactor*10} GO_ALL <- GO_ALL[GO_ALL$pvalue < 0.05,] BP <- GO_ALL[GO_ALL$ONTOLOGY=='BP', ] BP <- BP[order(BP$pvalue,decreasing = F),] CC <- GO_ALL[GO_ALL$ONTOLOGY=='CC', ] CC <- CC[order(CC$pvalue,decreasing = F),] MF <- GO_ALL[GO_ALL$ONTOLOGY=='MF', ] MF <- MF[order(MF$pvalue,decreasing = F),] paste0("",dim(GO_ALL)[[1]],",",dim(BP)[[1]],",",dim(CC)[[1]],",",dim(MF)[[1]],"") #"231,179,14,38" write.csv(GO_ALL,file = "00.go_ALL.csv",row.names = F) write.csv(as.data.frame(BP), '00.GO_BP.csv', row.names = FALSE) write.csv(as.data.frame(CC), '00.GO_CC.csv', row.names = FALSE) write.csv(as.data.frame(MF), '00.GO_MF.csv', row.names = FALSE) display_number = c(5, 5, 5) ego_result_BP <- as.data.frame(BP)[1:display_number[1], ] ego_result_CC <- as.data.frame(CC)[1:display_number[2], ] ego_result_MF <- as.data.frame(MF)[1:display_number[3], ] paste0(ego_result_BP$Description,collapse = "();") paste0(ego_result_CC$Description,collapse = "();") paste0(ego_result_MF$Description,collapse = "();") col1 <- c("#8DA0CB","#E78AC3","#A6D854","#FFD92F","#E5C494","#8DD3C7") col2 <- c("#BC80BD","#CCEBC5","#FFED6F","#1B9E77","#D95F02","#7570B3") col3 <- c("#E4C755", "#9FA3A8",'#D6E7A3','#F4A371FF','#F9C187FF','#FACF94FF') Col1 <- sample(col1,1) Col2 <- sample(col2,1) Col3 <- sample(col3,1) #GO- if(T){ go_enrich_df <- data.frame( ID=c(ego_result_BP$ID, ego_result_CC$ID, ego_result_MF$ID), Description=c(ego_result_BP$Description,ego_result_CC$Description,ego_result_MF$Description), GeneNumber=c(ego_result_BP$Count, ego_result_CC$Count, ego_result_MF$Count), type=factor(c(rep("BP:biological process", display_number[1]), rep("CC:cellular component", display_number[2]), rep("MF:molecular function", display_number[3])), levels=c("BP:biological process", "CC:cellular component","MF:molecular function" ))) go_enrich_df$type_order=factor(rev(as.integer(rownames(go_enrich_df))),labels=rev(go_enrich_df$Description)) COLS <- c(Col1, Col2,Col3) p <- ggplot(data=go_enrich_df, aes(x=type_order,y=GeneNumber, fill=type)) + # geom_bar(stat="identity", width=0.8) + #, scale_fill_manual(values = COLS) + ### coord_flip() + ##, xlab("GO term") + ylab("Gene Number") + labs(title = "GO Terms")+ theme_bw()+theme( legend.background = element_rect(fill = "white", color = "black", size = 0.2), legend.text = element_text(face="bold",color="black",family = "Times",size=12), plot.title = element_text(hjust = 0.5, face = "bold",color = "black",family = "Times",size = 18), axis.text.x = element_text(face = "bold",color = "black",size = 14), axis.text.y = element_text(face = "bold",color = "black",size = 14), axis.title.x = element_text(face = "bold",color = "black",family = "Times",size = 18), axis.title.y = element_text(face = "bold",color = "black",family = "Times",size = 18), plot.subtitle = element_text(hjust = 0.5,family = "Times", size = 14, face = "italic", colour = "black") )+scale_x_discrete(labels=function(x) str_wrap(x, width=40)) pdf("01.go_barplot.pdf", width = 12, height = 8,family = "Times") print(p) dev.off() png("01.go_barplot.png", width = 12, height = 8,res = 600,units = 'in',family = "Times") print(p) dev.off() } # circ if(go.circ == 1){ go_tem <- rbind(ego_result_BP,ego_result_CC,ego_result_MF) go <- data.frame(Category = go_tem[,'ONTOLOGY'],ID = go_tem[,'ID'],Term = go_tem[,'Description'], Genes = gsub("/", ", ", go_tem[,'geneID']), adj_pval = go_tem[,'pvalue']) genelist <- data.frame(ID = rownames(select_DERNA), logFC = select_DERNA$log2FoldChange) row.names(genelist)=genelist[,1] library(GOplot) circ <- circle_dat(go, genelist) go_tem$pvalue <- -log10(go_tem$pvalue) go_enrich_df <- data.frame( category=go_tem$ONTOLOGY, gene_num.min = 0, gene_num.max = max(go_tem$Count), gene_num.rich=go_tem$Count, "-log10.p" = go_tem$pvalue, up.regulated = 0, down.regulated = 0, rich.factor = go_tem$richFactor, Description=go_tem$ID) circ$sig <- ifelse(circ$logFC > 0,"up","down") tem1 <- go_enrich_df$Description i =1 for(i in 1:15){ x = tem1[i] circ1 <- circ[grep(x,circ$ID),] num1 <- nrow(circ1) tem2 <- as.data.frame(table(circ1$sig)) if(nrow(tem2) == 2){ go_enrich_df[match(x,go_enrich_df$Description),7] = table(circ1$sig)[[1]] go_enrich_df[match(x,go_enrich_df$Description),6] = table(circ1$sig)[[2]] }else if(tem2[1,1] == "up"){ go_enrich_df[match(x,go_enrich_df$Description),6] = table(circ1$sig)[[1]] }else{go_enrich_df[match(x,go_enrich_df$Description),7] = table(circ1$sig)[[1]]} } go_enrich_df$Description <- factor(go_enrich_df$Description, levels = go_enrich_df$Description) rownames(go_enrich_df) <- go_enrich_df$Description pdf(file = paste0("02.GOcirclize_plot.pdf"),width = 12,height = 6,family = "Times") a <- dev.cur() png(file = paste0("02.GOcirclize_plot.png"),width= 12, height= 6, units="in", res=300,family = "Times") dev.control("enable") par(mar = c(2,2,2,2),cex=1,family="Times") circle_size = unit(1, 'snpc') # circlize library(circlize) ## circos.par(gap.degree = 2, start.degree = 90) ##, ko plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.max')] #, ko ko_color <- c(rep(Col1, 5), rep(Col2, 5), rep(Col3, 5)) # circos.genomicInitialize(plot_data, plotType = NULL, major.by = 1) # circos.track( ylim = c(0, 1), track.height = 0.05, bg.border = NA, bg.col = ko_color, #、 panel.fun = function(x, y) { ylim = get.cell.meta.data('ycenter') #ylim、xlim ko id xlim = get.cell.meta.data('xcenter') sector.name = get.cell.meta.data('sector.index') #sector.name ko id circos.axis(h = 'top', labels.cex = 0.4, major.tick.percentage = 0.4, labels.niceFacing = FALSE) # circos.text(xlim, ylim, sector.name, cex = 0.8, niceFacing = FALSE) # ko id } ) colnames(go_enrich_df)[5] <- c("-log10(pvalue)") ##, p plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.rich', '-log10(pvalue)')] label_data <- go_enrich_df['gene_num.rich'] p_max <- round(max(go_enrich_df$`-log10(pvalue)`)) + 1 colorsChoice <- colorRampPalette(c('white','#4393C3')) color_assign <- colorRamp2(breaks = 0:p_max, col = colorsChoice(p_max + 1)) circos.genomicTrackPlotRegion( plot_data, track.height = 0.08, bg.border = NA, stack = TRUE, #、 panel.fun = function(region, value, ...) { circos.genomicRect(region, value, col = color_assign(value[[1]]), border = NA, ...) #, p ylim = get.cell.meta.data('ycenter') #,ylim、xlim、sector.name () xlim = label_data[get.cell.meta.data('sector.index'),1] / 2 sector.name = label_data[get.cell.meta.data('sector.index'),1] circos.text(xlim, ylim, sector.name, cex = 0.3, niceFacing = FALSE) #() } ) go_enrich_df$all.regulated <- go_enrich_df$up.regulated + go_enrich_df$down.regulated go_enrich_df$up.proportion <- go_enrich_df$up.regulated / go_enrich_df$all.regulated go_enrich_df$down.proportion <- go_enrich_df$down.regulated / go_enrich_df$all.regulated go_enrich_df$up <- go_enrich_df$up.proportion * go_enrich_df$gene_num.max plot_data_up <- go_enrich_df[c('Description', 'gene_num.min', 'up')] names(plot_data_up) <- c('Description', 'start', 'end') plot_data_up$type <- 1 go_enrich_df$down <- go_enrich_df$down.proportion * go_enrich_df$gene_num.max + go_enrich_df$up plot_data_down <- go_enrich_df[c('Description', 'up', 'down')] names(plot_data_down) <- c('Description', 'start', 'end') plot_data_down$type <- 2 plot_data <- rbind(plot_data_up, plot_data_down) label_data <- go_enrich_df[c('up', 'down', 'up.regulated', 'down.regulated')] color_assign <- colorRamp2(breaks = c(1, 2), col = c('#FDDBC7', '#9ACD32')) circos.genomicTrackPlotRegion( plot_data, track.height = 0.08, bg.border = NA, stack = TRUE, #、 panel.fun = function(region, value, ...) { circos.genomicRect(region, value, col = color_assign(value[[1]]), border = NA, ...) #,, ylim = get.cell.meta.data('cell.bottom.radius') - 0.5 #,ylim、xlim、sector.name () xlim = label_data[get.cell.meta.data('sector.index'),1] / 2 sector.name = label_data[get.cell.meta.data('sector.index'),3] circos.text(xlim, ylim, sector.name, cex = 0.4, niceFacing = FALSE) #() xlim = (label_data[get.cell.meta.data('sector.index'),2]+label_data[get.cell.meta.data('sector.index'),1]) / 2 sector.name = label_data[get.cell.meta.data('sector.index'),4] circos.text(xlim, ylim, sector.name, cex = 0.4, niceFacing = FALSE) #, } ) plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.max', 'rich.factor')] label_data <- go_enrich_df['category'] color_assign <- c('BP' = Col1, 'CC' = Col2, 'MF' = Col3) circos.genomicTrack( plot_data, ylim = c(0, max(plot_data$rich.factor)), track.height = 0.3, bg.col = 'gray95', bg.border = NA, #、 panel.fun = function(region, value, ...) { sector.name = get.cell.meta.data('sector.index') circos.genomicRect(region, value, col = color_assign[label_data[sector.name,1]], border = NA, ytop.column = 1, ybottom = 0, ...) #,, ko circos.lines(c(0, max(region)), c(0.05, 0.05), col = 'gray', lwd = 0.3) } ) circos.clear() category_legend <- Legend( labels = c("BP:biological process", "CC:cellular component","MF:molecular function" ), type = 'points', pch = NA, background = c(Col1,Col2,Col3), labels_gp = gpar(fontsize = 8), grid_height = unit(0.5, 'cm'), grid_width = unit(0.5, 'cm')) updown_legend <- Legend( labels = c('Up-regulated', 'Down-regulated'), type = 'points', pch = NA, background = c('#FDDBC7', '#9ACD32'), labels_gp = gpar(fontsize = 8), grid_height = unit(0.5, 'cm'), grid_width = unit(0.5, 'cm')) pvalue_legend <- Legend( col_fun = colorRamp2(round(seq(0, p_max, length.out = 6), 0), colorRampPalette(c('white','#4393C3'))(6)), legend_height = unit(3, 'cm'), labels_gp = gpar(fontsize = 8), title_gp = gpar(fontsize = 9), title_position = 'topleft', title = '-Log10(pvalue)') lgd_list_vertical <- packLegend(updown_legend, pvalue_legend) pushViewport(viewport(x = 0.85, y = 0.5)) grid.draw(lgd_list_vertical) upViewport() lgd_list_vertical <- packLegend(category_legend) pushViewport(viewport(x = 0.5, y = 0.5)) grid.draw(lgd_list_vertical) upViewport() dev.copy(which = a) dev.off() dev.off() } go_tem <- rbind(ego_result_BP,ego_result_CC,ego_result_MF) library(tidyverse) df <- go_tem[,c('ONTOLOGY','Description','geneID','Count','pvalue')] %>% filter(ONTOLOGY %in% c("BP","CC","MF")) %>% mutate(`-log10(pvalue)`=-log10(pvalue)) %>% arrange(ONTOLOGY,desc(`-log10(pvalue)`)) df1 <- df new_df <- data.frame() columns <- names(df1) for (i in 1:nrow(df1)) { current_row <- df1[i, ] new_row <- as.data.frame(matrix(NA, nrow = 1, ncol = length(columns))) colnames(new_row) <- columns new_df <- rbind(new_df, current_row, new_row) } new_df <- new_df %>% mutate(id=as.character(row_number()), col=case_when(ONTOLOGY=="BP" ~ "#7294D4", ONTOLOGY=="CC" ~ "#00A08A", ONTOLOGY=="MF" ~ "#F98400", TRUE ~ "white")) new_df$id <- factor(new_df$id,levels = new_df$id %>% rev()) col_mapping <- new_df[,c('id','col')] %>% deframe() replace_fifth_slash <- function(text) { parts <- str_split(text, "/")[[1]] if (length(parts) > 13) { parts[14] <- paste0("\n", parts[14]) } return(paste(parts, collapse = "/")) } new_df <- new_df %>% mutate(geneID = sapply(geneID, replace_fifth_slash)) colnames(new_df) p <- ggplot(new_df,aes(`-log10(pvalue)`,id))+ geom_bar(stat = "identity",aes(fill=ONTOLOGY), show.legend = F)+ geom_text(data=new_df %>% drop_na(), aes(label=Count),vjust=0.5,hjust=0,size=3)+ geom_text(data=new_df %>% drop_na() %>% filter(Count <= 13), aes(x=0.01,label=geneID,color=ONTOLOGY), vjust=0.5,hjust=0,size=3,nudge_y =-1,show.legend = F)+ geom_text(data=new_df %>% drop_na() %>% filter(Count > 13), aes(x=0.01,label=geneID,color=ONTOLOGY), vjust=0.5,hjust=0,size=3,nudge_y =-1.5 ,show.legend = F)+ scale_x_continuous(expand= expansion(mult = c(0,0.05)))+ scale_y_discrete(label=new_df$Description %>% rev())+ scale_fill_manual(values = c("#7294D4","#00A08A","#F98400"), na.translate = FALSE)+ scale_color_manual(values = c("#7294D4","#00A08A","#F98400"))+ labs(y=NULL)+ coord_cartesian(clip = "off")+ theme(axis.text.y=element_text(color=rev(col_mapping)), axis.ticks.y=element_blank(), panel.background = element_blank(), axis.line.x=element_line(color="black"), plot.margin=unit(c(0.5,0.5,0.5,0.5),unit="cm"))+ annotate(geom="rect",xmin=-0.12,xmax=-0.01,ymin=0.2,ymax=11, fill="#F98400",alpha=0.6)+ annotate(geom="rect",xmin=-0.12,xmax=-0.01,ymin=11,ymax=21, fill="#00A08A",alpha=0.6)+ annotate(geom="rect",xmin=-0.12,xmax=-0.01,ymin=21,ymax=31, fill="#7294D4",alpha=0.6)+ annotate(geom="text",x=-0.065,y=25.5,label="BP",size=3,color="black", angle=90,fontface="bold")+ annotate(geom="text",x=-0.065,y=15.5,label="CC",size=3,color="black", angle=90,fontface="bold")+ annotate(geom="text",x=-0.065,y=5.5,label="MF",size=3,color="black", angle=90,fontface="bold") pdf("06.go_barplot.pdf", width = 10, height = 6,family = "Times") p dev.off() png("06.go_barplot.png", width = 10, height = 6,res = 600,units = 'in',family = "Times") p dev.off() library(tidyverse) library(RColorBrewer) library(MetBrewer) library(ggnewscale) colnames(df1) df1$GeneRatio <- df1$Count / 109 pp <- ggplot(df1,aes(GeneRatio,Description))+ geom_segment(aes(x=0,xend=GeneRatio,y=Description,yend=Description,color=ONTOLOGY), size=6,show.legend = F,alpha=0.5)+ geom_text(aes(x=0,y=Description,label=Description),vjust=0.5,hjust=0,size=3.5,color="black")+ scale_color_manual(values=c("#abd0a7","#3B9AB2","#00A08A"))+ new_scale_color()+ geom_point(aes(size=GeneRatio,color=pvalue,fill=pvalue),shape=19)+ scale_fill_gradientn(colors=met.brewer("Cassatt1")) + scale_color_gradientn(colors=met.brewer("Cassatt1")) + guides(size=guide_legend(title="GeneRatio"))+ facet_grid(ONTOLOGY~.,scale="free_y")+ scale_x_continuous(expand = c(0,0.005))+ theme_classic()+ coord_cartesian(clip="off")+ theme(axis.title = element_blank(), axis.text.y=element_blank(), axis.text.x=element_text(color="black",face="bold"), axis.ticks.y=element_blank(), panel.background = element_blank(), strip.background = element_rect(fill = "grey85", colour = NA), panel.spacing.y = unit(0, "cm"), legend.key=element_blank(), legend.title = element_text(color="black",size=9), legend.text = element_text(color="black",size=8), legend.spacing.x=unit(0.1,'cm'), legend.key.width=unit(0.5,'cm'), legend.key.height=unit(0.5,'cm'), legend.background=element_blank(), legend.box.margin = margin(1,1,1,1), #legend.position = c(0.92,0.78), plot.margin=margin(0.5,0.5,0.5,0.5,"cm") ) pdf("07.go_barplot.pdf", width = 7, height = 6,family = "Times") print(pp) dev.off() png("07.go_barplot.png", width = 7, height = 6,res = 600,units = 'in',family = "Times") print(pp) dev.off() # _KEGG library(clusterProfiler) library(stringr) library(ComplexHeatmap) options(stringsAsFactors = F) gene_symbol <- clusterProfiler::bitr( geneID = enrich_ID$x, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db") KEGG <- enrichKEGG( gene = gene_symbol$ENTREZID,# organism = "hsa", # keyType = "kegg", # minGSSize = 1, maxGSSize = 500, pvalueCutoff = 1, pAdjustMethod = "BH", qvalueCutoff = 0.9 ) KEGG <- mutate(KEGG, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio))) kk <- setReadable(KEGG, OrgDb = org.Hs.eg.db, keyType="ENTREZID") kegg_result <- kk@result if(median(kegg_result$richFactor) < 0.1){kegg_result$richFactor<- kegg_result$richFactor*10} hh <- as.data.frame(kegg_result) rownames(hh) <- 1:nrow(hh) hh <- hh[hh$pvalue <= 0.05,] dim(hh)#31 hh <- hh[order(hh$pvalue,decreasing = F),] write.csv(hh, file = "00.KEGG.csv") rownames(hh) <- 1:nrow(hh) hh$order=factor(rev(as.integer(rownames(hh))),labels = rev(hh$Description)) kk <- hh[c(1:10),] paste0(kk$Description,collapse = "();") if(kegg.circ == 1){ go <- data.frame(Category = "KEGG",ID = kk[,'ID'],Term = kk[,'Description'], Genes = gsub("/", ", ", kk[,'geneID']), adj_pval = kk[,'pvalue']) genelist <- data.frame(ID = rownames(select_DERNA), logFC = select_DERNA$log2FoldChange) row.names(genelist)=genelist[,1] library(GOplot) circ <- circle_dat(go, genelist) kk$pvalue <- -log10(kk$pvalue) go_enrich_df <- data.frame( category="KEGG", gene_num.min = 0, gene_num.max = max(kegg_result$Count), gene_num.rich=kk$Count, "-log10.p" = kk$pvalue, up.regulated = 0, down.regulated = 0, rich.factor = kk$richFactor, Description=kk$ID) circ$sig <- ifelse(circ$logFC > 0,"up","down") tem1 <- go_enrich_df$Description for(i in 1:10){ x = tem1[i] circ1 <- circ[grep(x,circ$ID),] num1 <- nrow(circ1) tem2 <- as.data.frame(table(circ1$sig)) if(nrow(tem2) == 2){ go_enrich_df[match(x,go_enrich_df$Description),7] = table(circ1$sig)[[1]] go_enrich_df[match(x,go_enrich_df$Description),6] = table(circ1$sig)[[2]] }else if(tem2[1,1] == "up"){ go_enrich_df[match(x,go_enrich_df$Description),6] = table(circ1$sig)[[1]] }else{ go_enrich_df[match(x,go_enrich_df$Description),7] = table(circ1$sig)[[1]]} } go_enrich_df$Description <- factor(go_enrich_df$Description, levels = go_enrich_df$Description) rownames(go_enrich_df) <- go_enrich_df$Description library(ggplot2) pdf(file = paste0("06.KEGG_cic.pdf"),width = 13,height = 6,family = "Times") a <- dev.cur() png(file = paste0("06.KEGG_cic.png"),width= 13, height= 6, units="in", res=300,family = "Times") dev.control("enable") par(family="Times") circle_size = unit(1, 'snpc') library(circlize) kegg.color <- sample(col1,1) circos.par(gap.degree = 2, start.degree = 90) plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.max')] #, ko ko_color <- c(rep(kegg.color, 15)) # circos.genomicInitialize(plot_data, plotType = NULL, major.by = 1) # circos.track( ylim = c(0, 1), track.height = 0.05, bg.border = NA, bg.col = ko_color, #、 panel.fun = function(x, y) { ylim = get.cell.meta.data('ycenter') #ylim、xlim ko id xlim = get.cell.meta.data('xcenter') sector.name = get.cell.meta.data('sector.index') #sector.name ko id circos.axis(h = 'top', labels.cex = 0.6, major.tick.percentage = 0.4, labels.niceFacing = FALSE) # circos.text(xlim, ylim, sector.name, cex = 0.8, niceFacing = FALSE) # ko id } ) colnames(go_enrich_df)[5] <- c("-log10(pvalue)") plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.rich', '-log10(pvalue)')] #, p label_data <- go_enrich_df['gene_num.rich'] #, p_max <- round(max(go_enrich_df$`-log10(pvalue)`)) + 1 # p , colorsChoice <- colorRampPalette(c('white','#4393C3')) # p color_assign <- colorRamp2(breaks = 0:p_max, col = colorsChoice(p_max + 1)) circos.genomicTrackPlotRegion( plot_data, track.height = 0.08, bg.border = NA, stack = TRUE, #、 panel.fun = function(region, value, ...) { circos.genomicRect(region, value, col = color_assign(value[[1]]), border = NA, ...) #, p ylim = get.cell.meta.data('ycenter') #,ylim、xlim、sector.name () xlim = label_data[get.cell.meta.data('sector.index'),1] / 2 sector.name = label_data[get.cell.meta.data('sector.index'),1] circos.text(xlim, ylim, sector.name, cex = 0.6, niceFacing = FALSE) #() } ) go_enrich_df$all.regulated <- go_enrich_df$up.regulated + go_enrich_df$down.regulated go_enrich_df$up.proportion <- go_enrich_df$up.regulated / go_enrich_df$all.regulated go_enrich_df$down.proportion <- go_enrich_df$down.regulated / go_enrich_df$all.regulated go_enrich_df$up <- go_enrich_df$up.proportion * go_enrich_df$gene_num.max plot_data_up <- go_enrich_df[c('Description', 'gene_num.min', 'up')] names(plot_data_up) <- c('Description', 'start', 'end') plot_data_up$type <- 1 # 1 go_enrich_df$down <- go_enrich_df$down.proportion * go_enrich_df$gene_num.max + go_enrich_df$up plot_data_down <- go_enrich_df[c('Description', 'up', 'down')] names(plot_data_down) <- c('Description', 'start', 'end') plot_data_down$type <- 2 # 2 plot_data <- rbind(plot_data_up, plot_data_down) label_data <- go_enrich_df[c('up', 'down', 'up.regulated', 'down.regulated')] color_assign <- colorRamp2(breaks = c(1, 2), col = c('#FDDBC7', '#9ACD32')) circos.genomicTrackPlotRegion( plot_data, track.height = 0.08, bg.border = NA, stack = TRUE, #、 panel.fun = function(region, value, ...) { circos.genomicRect(region, value, col = color_assign(value[[1]]), border = NA, ...) #,, ylim = get.cell.meta.data('cell.bottom.radius') - 0.5 #,ylim、xlim、sector.name () xlim = label_data[get.cell.meta.data('sector.index'),1] / 2 sector.name = label_data[get.cell.meta.data('sector.index'),3] circos.text(xlim, ylim, sector.name, cex = 0.6, niceFacing = FALSE) #() xlim = (label_data[get.cell.meta.data('sector.index'),2]+label_data[get.cell.meta.data('sector.index'),1]) / 2 sector.name = label_data[get.cell.meta.data('sector.index'),4] circos.text(xlim, ylim, sector.name, cex = 0.6, niceFacing = FALSE) #, } ) plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.max', 'rich.factor')] #, label_data <- go_enrich_df['category'] #,, color_assign <- c('KEGG' = kegg.color) circos.genomicTrack( plot_data, ylim = c(0, max(plot_data$rich.factor)), track.height = 0.3, bg.col = 'gray95', bg.border = NA, #、 panel.fun = function(region, value, ...) { sector.name = get.cell.meta.data('sector.index') #sector.name ko id , ko , circos.genomicRect(region, value,col = color_assign[label_data[sector.name,1]], border = NA, ytop.column = 1, ybottom = 0, ...) #,, ko circos.lines(c(0, max(region)), c(0.05, 0.05), col = 'gray', lwd = 0.3) # 0.5 }) circos.clear() category_legend <- Legend( labels = c("KEGG Pathway"), type = 'points', pch = NA, background = c(kegg.color), labels_gp = gpar(fontsize = 8), grid_height = unit(0.5, 'cm'), grid_width = unit(0.5, 'cm')) updown_legend <- Legend( labels = c('Up-regulated', 'Down-regulated'), type = 'points', pch = NA, background =c('#FDDBC7', '#9ACD32'), labels_gp = gpar(fontsize = 8), grid_height = unit(0.5, 'cm'), grid_width = unit(0.5, 'cm')) pvalue_legend <- Legend( col_fun = colorRamp2(round(seq(0, p_max, length.out = 6), 0), colorRampPalette(c('white','#4393C3'))(6)), legend_height = unit(3, 'cm'), labels_gp = gpar(fontsize = 8), title_gp = gpar(fontsize = 9), title_position = 'topleft', title = '-Log10(pvalue)') lgd_list_vertical <- packLegend(updown_legend, pvalue_legend) pushViewport(viewport(x = 0.85, y = 0.5)) grid.draw(lgd_list_vertical) upViewport() lgd_list_vertical <- packLegend(category_legend) pushViewport(viewport(x = 0.5, y = 0.5)) grid.draw(lgd_list_vertical) upViewport() dev.copy(which = a) dev.off() dev.off() } if(kegg.buble == 1){ kk <- hh[c(1:10),] p2 <- ggplot(kk,aes(y=order,x=Count))+ geom_point(aes(size=Count,color= pvalue))+# scale_x_continuous(breaks = seq(min(kk$Count), max(kk$Count), by = 1), # x limits = c(min(kk$Count) - 0.5, max(kk$Count) + 0.5))+ #x scale_color_gradient(high="#EE0000B2",low = "#008B45B2")+ labs(color=expression(pvalue,size="Count"), x="Gene Number",y="Pathways",title="KEGG Enrichment")+ theme_bw()+ theme( legend.background = element_rect(fill = "white", color = "black", size = 0.2), legend.text = element_text(face="bold",color="black",family = "Times",size=14), plot.title = element_text(hjust = 0.5, face = "bold",color = "black",family = "Times",size = 18), axis.text.x = element_text(face = "bold",color = "black",size = 14), axis.text.y = element_text(face = "bold",color = "black",size = 14), axis.title.x = element_text(face = "bold",color = "black",family = "Times",size = 18), axis.title.y = element_text(face = "bold",color = "black",family = "Times",size = 18), plot.subtitle = element_text(hjust = 0.5,family = "Times", size = 14, face = "bold", colour = "black") )+scale_y_discrete(labels=function(x) str_wrap(x, width=40)) pdf("04.KEGG_dot.pdf", width = 9, height = 8,family = "Times") print(p2) dev.off() png("04.KEGG_dot.png", width = 9, height = 8,res = 600,units = 'in',family = "Times") print(p2) dev.off() } save.image('enrich.Rdata') # important segment ------------------------------------------------------- # load('enrich.Rdata') # paste0("",dim(GO_ALL)[[1]],",",dim(BP)[[1]],",",dim(CC)[[1]],",",dim(MF)[[1]],"") # paste0(ego_result_BP$Description,collapse = "()、") # paste0(ego_result_CC$Description,collapse = "()、") # paste0(ego_result_MF$Description,collapse = "()、") # hh2 <- subset(hh,pvalue<0.05) # dim(hh2) # paste0(hh$Description[1:15],collapse = "()、") # # # > paste0("",dim(GO_ALL)[[1]],",",dim(BP)[[1]],",",dim(CC)[[1]],",",dim(MF)[[1]],"") # [1] "2090,1955,37,98" # > paste0(ego_result_BP$Description,collapse = "()、") # [1] "lymph vessel development()、lymph vessel morphogenesis()、epithelial cell proliferation()、leukocyte migration()、ameboidal-type cell migration" # > paste0(ego_result_CC$Description,collapse = "()、") # [1] "platelet alpha granule()、external side of plasma membrane()、platelet alpha granule lumen()、collagen-containing extracellular matrix()、integrin complex" # > paste0(ego_result_MF$Description,collapse = "()、") # [1] "receptor ligand activity()、signaling receptor activator activity()、integrin binding()、transmembrane receptor protein kinase activity()、growth factor binding" # > hh2 <- subset(hh,pvalue<0.05) # > dim(hh2) # [1] 137 11 # > paste0(hh$Description[1:15],collapse = "()、") # [1] "PI3K-Akt signaling pathway()、Focal adhesion()、Proteoglycans in cancer()、Ras signaling pathway()、MAPK signaling pathway()、AGE-RAGE signaling pathway in diabetic complications()、Rap1 signaling pathway()、EGFR tyrosine kinase inhibitor resistance()、Relaxin signaling pathway()、Human cytomegalovirus infection()、Renal cell carcinoma()、TNF signaling pathway()、Calcium signaling pathway()、HIF-1 signaling pathway()、Yersinia infection" # #prognostic model #06_single_factor_cox------------------------------------------------ rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./05_single_factor_cox")){ dir.create("./05_single_factor_cox") } setwd("./05_single_factor_cox") .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(lance) library(readxl) library(readr) library(tidyverse) survival<-read.csv('../00_rawdata/04.survival_TCGA.csv',row.names = 1) #survival$sample<-rownames(survival) dat.tcga<-read.csv("../00_rawdata/02.expr_tpm_TCGA.csv", row.names = 1,check.names = F) #colnames(dat.tcga)<-gsub('.','-',colnames(dat.tcga),fixed = T) hubgene<-read.csv('../02_intersect/01.DEGs_LRGs.csv') #write.csv(survival,file = '01.survival.csv',quote = F,row.names = F) train_dat<-dat.tcga[,survival$sample] train_dat<-t(train_dat) train_dat<-as.data.frame(train_dat) train_dat<-train_dat[,colnames(train_dat)%in%hubgene$x,] train_dat$sample<-rownames(train_dat) train_dat<-merge(survival,train_dat,by='sample') train_dat<-column_to_rownames(train_dat,var = 'sample') train_data<-train_dat write.csv(train_data,"02.single_factor_cox_data.csv") .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(survival) library(survminer) colnames_sum <- colnames(train_data) colnames_sum <- gsub("-","_",colnames_sum) colnames_sum <- gsub(" ","_",colnames_sum) colnames(train_data) <- colnames_sum covariates <- colnames_sum[-which(colnames_sum %in% c("OS", "OS.time"))] #Surv() univ_formulas <- sapply(covariates, function(x) as.formula(paste("Surv(OS.time, OS)~", x))) #as.formula(). 。formula # coxphcox cox univ_models <- lapply(univ_formulas, function(x) {coxph(x, data = train_data)}) # ph univ_cox_zph <- lapply(univ_models, function(x) { y <- cox.zph(x) #p p<-y$table[-nrow(y$table),][3] chisq <- y$table[-nrow(y$table),][1] df <- y$table[-nrow(y$table),][2] res<-c(chisq,df,p) names(res)<-c("chisq","df","p") return(res) }) univ_cox_zph <- t(as.data.frame(univ_cox_zph, check.names = FALSE)) univ_cox_zph <- as.data.frame(univ_cox_zph) write.csv(univ_cox_zph,"PH_Unifactor_all.csv") univ_cox_zph2 <- subset(univ_cox_zph,p>0.05) write.csv(univ_cox_zph2,"PH_Unifactor.csv") univ_cox_zph <- rownames(univ_cox_zph)[univ_cox_zph$p>0.05] #98 train_dat<-train_dat[,colnames(train_dat)%in%univ_cox_zph,] train_dat$sample<-rownames(train_dat) train_dat<-merge(survival,train_dat,by='sample') train_dat<-column_to_rownames(train_dat,var = 'sample') train_data<-train_dat write.csv(train_data,"02.single_factor_cox_data_PH_after.csv") #cox library(survival) library(survminer) colnames_sum <- colnames(train_data) colnames_sum <- gsub("-","_",colnames_sum) colnames_sum <- gsub(" ","_",colnames_sum) colnames(train_data) <- colnames_sum covariates <- colnames_sum[-which(colnames_sum %in% c("OS", "OS.time"))] #Surv() univ_formulas <- sapply(covariates, function(x) as.formula(paste("Surv(OS.time, OS)~", x))) #as.formula(). 。formula # coxphcox cox univ_models <- lapply(univ_formulas, function(x) {coxph(x, data = train_data)}) univ_results <- lapply(univ_models, function(x){ x <- summary(x) #p p.value<-signif(x$wald["pvalue"], digits=3) #HR HR <-signif(x$coef[2], digits=4); #95% HR.confint.lower <- signif(x$conf.int[,"lower .95"],4) HR.confint.upper <- signif(x$conf.int[,"upper .95"],4) HR <- paste0(HR, " (", HR.confint.lower, "-", HR.confint.upper, ")") res<-c(p.value,HR) names(res)<-c("p.value","HR (95% CI for HR)") return(res) }) ## coefb(beta)。exp(coef)cox(HR) ## zwald,coefse(coef)。ower .95 upper .95exp(coef)95%,,,,。 res_mod <- t(as.data.frame(univ_results, check.names = FALSE)) res_mod <- as.data.frame(res_mod) res_results_0.01 <- res_mod[which(as.numeric(res_mod$p.value) < 0.01),] res_results_0.01 <- na.omit(res_results_0.01) # 4 write.csv(res_results_0.01,file = "03.single_factor_cox_results_0.05.csv",quote = F,row.names = T) dim(res_results_0.01) ## library(tidyr) res_results_0.01_2 <- separate(res_results_0.01, "HR (95% CI for HR)", into = c("HR", "HR.95L", "HR.95H"), sep = " ") res_results_0.01_2 <- separate(res_results_0.01_2, "HR.95L", into = c("HR.95L", "HR.95H"), sep = "\\-") res_results_0.01_2$HR.95L <- gsub("\\(", "", res_results_0.01_2$HR.95L) res_results_0.01_2$HR.95H <- gsub("\\)", "", res_results_0.01_2$HR.95H) res_results_0.01_2[,1:ncol(res_results_0.01_2)] <- as.numeric(unlist(res_results_0.01_2[,1:ncol(res_results_0.01_2)])) res_results_0.01_2 <- res_results_0.01_2[order(res_results_0.01_2$HR),] hz <- paste(round(res_results_0.01_2$HR,4), "(",round(res_results_0.01_2$HR.95L,4), "-",round(res_results_0.01_2$HR.95H,4),")",sep = "") tabletext <- cbind(c(NA,"Gene",rownames(res_results_0.01_2)), c(NA,"P value",ifelse(res_results_0.01_2$p.value<0.001, "< 0.001", round(res_results_0.01_2$p.value,3))), c(NA,"Hazard Ratio(95% CI)",hz)) nrow(tabletext) + 1 .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(forestplot) # pdf(file = "04.univariate_cox_forest.pdf", height = 9, width = 10, onefile = F,family = 'Times') forestplot(labeltext=tabletext, graph.pos=4, #Pvalue is.summary = c(TRUE, TRUE,rep(FALSE, 70)), col=fpColors(box="darkred", lines="darkblue", zero = "gray50"), mean=c(NA,NA,res_results_0.01_2$HR), lower=c(NA,NA,res_results_0.01_2$HR.95L), #95% upper=c(NA,NA,res_results_0.01_2$HR.95H), #95% boxsize=0.1,lwd.ci=3, #, ci.vertices.height = 0.08,ci.vertices=TRUE, #、、 zero=1,lwd.zero=0.5, #zero colgap=unit(5,"mm"), # xticks = c(0.2,0.4,0.6,0.8,1,1.2,1.4,1.6,1.8,2,2.2,2.4,2.6), # lwd.xaxis=2, #X lineheight = unit(1.2,"cm"), # graphwidth = unit(.5,"npc"), # cex=0.9, fn.ci_norm = fpDrawCircleCI, # hrzl_lines = list("3" = gpar(col = "black", lty = 1, lwd = 2)), # hrzl_lines=list("2" = gpar(lwd=2, col="black"), # "3" = gpar(lwd=2, col="black"), #, # "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1 # mar=unit(rep(0.01, times = 4), "cm"),# #fpTxtGpcex txt_gp=fpTxtGp(label=gpar(cex=1), ticks=gpar(cex=0.8, fontface = "bold"), xlab=gpar(cex = 1, fontface = "bold"), title=gpar(cex = 1.25, fontface = "bold")), xlab="Hazard Ratio", grid = T, clip = c(0,2.2)) # x, dev.off() png(filename = "04.univariate_cox_forest.png", height = 9, width = 10,units='in',res=600,family='Times') forestplot(labeltext=tabletext, graph.pos=4, #Pvalue is.summary = c(TRUE, TRUE,rep(FALSE, 70)), col=fpColors(box="darkred", lines="darkblue", zero = "gray50"), mean=c(NA,NA,res_results_0.01_2$HR), lower=c(NA,NA,res_results_0.01_2$HR.95L), #95% upper=c(NA,NA,res_results_0.01_2$HR.95H), #95% boxsize=0.1,lwd.ci=3, #, ci.vertices.height = 0.08,ci.vertices=TRUE, #、、 zero=1,lwd.zero=0.5, #zero colgap=unit(5,"mm"), # xticks = c(0.2,0.4,0.6,0.8,1,1.2,1.4,1.6,1.8,2,2.2,2.4,2.6), # lwd.xaxis=2, #X lineheight = unit(1.2,"cm"), # graphwidth = unit(.5,"npc"), # cex=0.9, fn.ci_norm = fpDrawCircleCI, # hrzl_lines = list("3" = gpar(col = "black", lty = 1, lwd = 2)), # hrzl_lines=list("2" = gpar(lwd=2, col="black"), # "3" = gpar(lwd=2, col="black"), #, # "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1 # mar=unit(rep(0.01, times = 4), "cm"),# #fpTxtGpcex txt_gp=fpTxtGp(label=gpar(cex=1), ticks=gpar(cex=0.8, fontface = "bold"), xlab=gpar(cex = 1, fontface = "bold"), title=gpar(cex = 1.25, fontface = "bold")), xlab="Hazard Ratio", grid = T, clip = c(0,2.2)) # x, dev.off() # gene_list <- rownames(res_results_0.01) # #gene_list <- lasso_geneids # cox_data <- as.formula(paste0('Surv(OS.time, OS)~', # paste(gene_list, # sep = '', # collapse = '+'))) # # cox_more <- coxph(cox_data, # data = train_data) # cox_zph <- cox.zph(cox_more) # cox_table <- cox_zph$table[-nrow(cox_zph$table),] # cox_table2 <- data.frame(cox_table) # write.csv(cox_table2,'PH.csv') # ph <- subset(cox_table2,p>0.05) # write.csv(ph,'05.ph_test.csv') #--------------------------------------------------------------------------------------- #07_Lasso-------------------------------------------------------------------------------------- rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (!dir.exists("06_Lasso")){dir.create("06_Lasso")} setwd("06_Lasso") .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(glmnet) library(dplyr) library(ggplot2) train_data <- read.csv('../05_single_factor_cox/02.single_factor_cox_data_PH_after.csv',check.names = F,row.names = 1) res_results_0.01 <- read.csv('../05_single_factor_cox/03.single_factor_cox_results_0.05.csv',check.names = F,row.names = 1) x <- subset(train_data, select = -c(OS, OS.time)) x <- x[,rownames(res_results_0.01)] %>% as.matrix() y <- subset(train_data, select = c(OS, OS.time)) y <- data.matrix(Surv(y$OS.time,y$OS)) .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) # #if(lasso == 'lasso') library("glmnet") library("survival") # info <- data.frame(seed = "",num = "") # info <- info[-1,] # if(T){ # for(i in 1:1000){ # set.seed(i) # fit <- glmnet(x,y,family="cox") #Cox, # lasso_fit <- cv.glmnet(x=x, y=y, family='cox', type.measure='deviance',maxit = 1000,nfolds = 10) ## ,maxit = 1000 # coefficient <- coef(lasso_fit, s=lasso_fit$lambda.min) # Active.Index <- which(as.numeric(coefficient)!=0) # astive.coefficients <- as.numeric(coefficient)[Active.Index] # lasso_geneids <- rownames(coefficient)[Active.Index] # lasso<-data.frame(gene=lasso_geneids ,coef=astive.coefficients ) # info <- rbind(info,data.frame(seed = i,num = nrow(lasso))) # } # # } # table(info$num) # write.csv(info,'info.csv') # info <- read.csv('info.csv',row.names = 1) # a <- rstatix::filter(info,!duplicated(info$num)) x_all <- subset(train_data, select = -c(OS, OS.time)) x_all <- x_all[,rownames(res_results_0.01)] y_all <- subset(train_data, select = c(OS, OS.time)) fit <- glmnet(as.matrix(x_all), Surv(y_all$OS.time,y_all$OS),family = "cox") set.seed(68) # setwd("/data/nas3///29_YQ953-11/06_Lasso/") { cvfit = cv.glmnet(as.matrix(x_all), Surv(y_all$OS.time,y_all$OS),nfold=10, family = "cox") # pdf(file = "01.lasso_model.pdf", height = 5) # plot(fit, xvar = "lambda", cex.lab = 1) + abline(v = c(log(cvfit$lambda.min), # log(cvfit$lambda.1se)), lty = 2) + text(x = log(cvfit$lambda.min), # y = -5, paste("Lambda.min\n", round(cvfit$lambda.min, 4)), cex = 1.6, # adj = 0.9) + text(x = log(cvfit$lambda.1se), y = 5, paste("Lambda.lse\n", # round(cvfit$lambda.1se, 4)), cex = 1.6, adj = 0.9) # dev.off() # png(filename = "01.lasso_model.png", height = 400, width = 500) # plot(fit, xvar = "lambda", cex.lab = 1) + abline(v = c(log(cvfit$lambda.min), # log(cvfit$lambda.1se)), lty = 2) + text(x = log(cvfit$lambda.min), # y = -5, paste("Lambda.min\n", round(cvfit$lambda.min, 4)), cex = 1.6, # adj = 0.9) + text(x = log(cvfit$lambda.1se), y = 5, paste("Lambda.lse\n", # round(cvfit$lambda.1se, 4)), cex = 1.6, adj = 0.9) # # dev.off() # # png(filename = "02.lasso_verify.png", height = 400, width = 500) # plot(cvfit, las =1) # dev.off() # pdf(file = "02.lasso_verify.pdf", height = 5) # plot(cvfit, las =1) # dev.off() coef.min = coef(cvfit, s = "lambda.min") # lambda.min & lambda.1se cvfit$lambda.min #0.01362608 # 0 active.min = which(coef.min@i != 0) coef.min # lasso_geneids <- coef.min@Dimnames[[1]][coef.min@i+1] lasso_geneids } #"IGF2BP2" "PCF11" "RPUSD4" "NUDT11" lasso_geneids <- data.frame(lasso_geneids) colnames(lasso_geneids) <- 'symbol' write.csv(lasso_geneids, "03.lasso_genes.csv",row.names = T) coef.min # 1 # CD24 0.1758641 # CEBPD -0.1753270 # CCL19 -0.0829096 # ZIC2 0.1191970 saveRDS(cvfit,file = '04.cvfit.rds') saveRDS(x_all,file = '05.x_all.rds') saveRDS(coef.min,file = '06.coef.min.rds') cvfit$lambda.min # 0.001690118 log(cvfit$lambda.min) #-6.382957 # x <- coef(fit) tmp <- as.data.frame(as.matrix(x)) tmp$coef <- row.names(tmp) tmp <- reshape::melt(tmp, id = "coef") tmp$variable <- as.numeric(gsub("s", "", tmp$variable)) tmp$coef <- gsub('_','-',tmp$coef) tmp$lambda <- fit$lambda[tmp$variable+1] # extract the lambda values tmp$norm <- apply(abs(x[-1,]), 2, sum)[tmp$variable+1] # compute L1 norm # head(tmp) p <- ggplot(tmp,aes(log(lambda),value,color = coef)) + geom_vline(xintercept = log(cvfit$lambda.min), size=0.8,color='grey60', alpha=0.8,linetype=2)+ geom_line(size=1) + #ylim(-10,-2)+ xlab("Log(Lambda)") + ylab('Coefficients')+ theme_bw(base_rect_size = 2)+ scale_color_manual(values= c('#287eb7','#ff8c33','#ff9898','#dedb8e', '#c49d93',"#E78AC3","#A6D854","#FFD92F","#E5C494","#4682B4","#CD3700",'seagreen',"darkblue","#6a3d9a","#e5486e",'#A73030FF'))+ scale_x_continuous(expand = c(0.01,0.01))+ scale_y_continuous(expand = c(0.01,0.01))+ theme(panel.grid = element_blank(), axis.title = element_text(size=15,color='black'), axis.text = element_text(size=12,color='black'), legend.title = element_blank(), legend.text = element_text(size=12,color='black'), legend.position = 'right')+ annotate('text',x = -5.5,y=0.25, label='Optimal Lambda = 0.001690118', color='black')+ guides(col=guide_legend(ncol = 1)) xx <- data.frame(lambda=cvfit[["lambda"]], cvm=cvfit[["cvm"]], cvsd=cvfit[["cvsd"]], cvup=cvfit[["cvup"]], cvlo=cvfit[["cvlo"]], nozezo=cvfit[["nzero"]]) xx$ll<- log(xx$lambda) xx$NZERO<- paste0(xx$nozezo,' vars') p pdf('03.lasso_model.pdf',w=7,h=5,family='Times') print(p) dev.off() png('03.lasso_model.png',w=7,h=5,units='in',res=600,family='Times') print(p) dev.off() p <- ggplot(xx,aes(ll,cvm,color=NZERO))+ geom_errorbar(aes(x=ll,ymin=cvlo,ymax=cvup), width=0.05,size=1)+ geom_vline(xintercept = xx$ll[which.min(xx$cvm)], size=0.8,color='grey60',alpha=0.8, linetype=2)+ geom_point(size=2)+ xlab("Log(Lambda)") + ylab('Partial Likelihood Deviance')+ theme_bw(base_rect_size = 1.5)+ scale_color_manual(values= c('#287eb7','#ff8c33','#ff9898','#dedb8e', '#c49d93',"#E78AC3","#A6D854","#FFD92F","#E5C494","#4682B4","#CD3700",'seagreen',"darkblue","#6a3d9a","#e5486e",'#A73030FF'))+ scale_x_continuous(expand = c(0.02,0.02))+ scale_y_continuous(expand = c(0.02,0.02))+ theme(panel.grid = element_blank(), axis.title = element_text(size=15, color='black'), axis.text = element_text(size=12, color='black'), legend.title = element_blank(), legend.text = element_text(size=12, color='black'), legend.position = 'bottom' )+ annotate('text',x= -5.8,y=13.5, label='Optimal Lambda = 0.001690118', color='black')+ guides(col=guide_legend(ncol = 6)) p pdf('03.lasso_verify.pdf',w=6,h=5,family='Times') print(p) dev.off() png('03.lasso_verify.png',w=6,h=5,units='in',res=600,family='Times') print(p) dev.off() #08_Train_estimate-------------------------------------------------------------------------------------- rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (!dir.exists("07_Train_estimate")){dir.create("07_Train_estimate")} setwd("07_Train_estimate") .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) train_data <- read.csv('../05_single_factor_cox/02.single_factor_cox_data_PH_after.csv',check.names = F,row.names = 1) #res_results_0.01 <- read.csv('../06_single_factor_cox/03.single_factor_cox_results_0.01.csv',check.names = F,row.names = 1) coxGene <- read.csv('../06_Lasso/03.lasso_genes.csv',row.names = 1,check.names = F) cvfit <- readRDS('../06_Lasso/04.cvfit.rds') x_all <- readRDS('../06_Lasso/05.x_all.rds') coef.min <- readRDS('../06_Lasso/06.coef.min.rds') cox_gene_rt <- train_data[ ,c('OS.time','OS', coxGene$symbol)] # lasso riskScore <- predict(cvfit, newx = as.matrix(x_all),s=cvfit$lambda.min) riskScore<-as.numeric(riskScore) class(riskScore) coxGene <- coxGene$symbol outCol=c("OS","OS.time",coxGene) # GC risk=as.vector(ifelse(riskScore>median(riskScore),'High','Low')) risk <- as.data.frame(c(cbind(id=rownames(cbind(train_data[,outCol],riskScore,risk)), cbind(train_data[,outCol],riskScore,risk)))) table(risk$risk) riskScore <- data.frame(risk) #write.csv(riskScore, file = "01.train_riskScore.csv", quote = F) cox_gene_rt<-riskScore median(cox_gene_rt$riskScore) # 0.4308912 .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(survival) library(survminer) OS_dat <- risk gene<- colnames(OS_dat) #surv_cutpoint res.cut <- surv_cutpoint(OS_dat,time = 'OS.time',event = 'OS',variables = 'riskScore',minprop = 0.4)# cutpoint <- res.cut$cutpoint[1]#0.329995 OS_dat$group <- factor(ifelse(OS_dat$riskScore>cutpoint['riskScore',],'High','Low'),levels = c('High','Low')) table(OS_dat$group) # kmfit_out <- survfit(Surv(OS.time, OS) ~ group, data = OS_dat) surv_pvalue(kmfit_out) # # High Low # 522 562 # kmfit <- survfit(Surv(OS.time, OS) ~ risk, data = cox_gene_rt) # surv_pvalue(kmfit) # obj_is_vector <- function(obj) { # is.vector(obj) && length(dim(obj)) == 1 # } .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(ggplot2) #trace(ggsurvplot,edit=T) pdf('02.Train_survival_KM.pdf',w=7,h=7,onefile=F,family='Times') train_survival_median <- ggsurvplot(kmfit_out, pval = TRUE, conf.int = F, legend.labs=c("High risk","Low risk" ), legend.title="Risk score", title="Train KM", font.main = c(15,"bold"), risk.table = TRUE, risk.table.col = "strata", linetype = "strata", surv.median.line = "hv", ggtheme = theme_bw(), palette = c("#A73030FF", "#0073C2FF")) print(train_survival_median) dev.off() png('02.Train_survival_KM.png',w=7,h=7,family='Times',units='in',res=600) train_survival_median <- ggsurvplot(kmfit_out, pval = TRUE, conf.int = F, legend.labs=c("High risk","Low risk" ), legend.title="Risk score", title="Train KM", font.main = c(15,"bold"), risk.table = TRUE, risk.table.col = "strata", linetype = "strata", surv.median.line = "hv", ggtheme = theme_bw(), palette = c("#A73030FF", "#0073C2FF")) print(train_survival_median) dev.off() colnames(OS_dat) OS_dat <- OS_dat[,-9] colnames(OS_dat)[9] <- 'risk' write.csv(OS_dat, file = "01.train_OS_dat.csv", quote = F) table(OS_dat$risk) risk2 <- risk risk <- OS_dat #4 ROC .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(survivalROC) ROC <- risk multi_ROC <- function(time_vector, risk_score_table){ library(survivalROC) single_ROC <- function(single_time){ for_ROC <- survivalROC(Stime=risk_score_table$OS.time, status=risk_score_table$OS, marker=risk_score_table$riskScore, predict.time=single_time, method = 'KM') data.frame('True_positive'=for_ROC$TP, 'False_positive'=for_ROC$FP, 'Cut_values'=for_ROC$cut.values, 'Time'=rep(single_time, length(for_ROC$TP)), 'AUC'=rep(for_ROC$AUC, length(for_ROC$TP))) } multi_ROC_list <- lapply(time_vector, single_ROC) do.call(rbind, multi_ROC_list) } for_multi_ROC <- multi_ROC(time_vector = c(365*seq(1,5,2)), risk_score_table = risk) table(for_multi_ROC$AUC) for_multi_ROC$Time <- factor(for_multi_ROC$Time) # ROC library(geomROC) library(plotROC) library(ggthemes) auc_y1 <- round(for_multi_ROC[which(for_multi_ROC$Time==365),5][1],3) auc_y3 <- round(for_multi_ROC[which(for_multi_ROC$Time==1095),5][1],3) #auc_y4 <- round(for_multi_ROC[which(for_multi_ROC$Time==1460),5][1],2) auc_y5 <- round(for_multi_ROC[which(for_multi_ROC$Time==1825),5][1],3) #auc_y6 <- round(for_multi_ROC[which(for_multi_ROC$Time==2190),5][1],2) #auc_y7 <- round(for_multi_ROC[which(for_multi_ROC$Time==2555),5][1],3) ROC <- ggplot(for_multi_ROC, aes(x=False_positive, y=True_positive, label=Cut_values, color=Time)) + scale_color_manual(breaks = c('365',"1095",'1825'), labels = c("1 year","3 years","5 years"), values = c("#4682B4", "#FF7F00", "#984EA3")) + plotROC::geom_roc(labels = F, stat = 'identity') + style_roc() + geom_abline(slope = 1, intercept = 0, color = 'gray', linetype=2) + theme_bw() + labs(title = "Train ROC") + theme(panel.background = element_rect(colour = 'black', size = 1, fill = 'white'), panel.grid = element_blank(), plot.title = element_text(size = 15, hjust = 0.5, face = "bold")) + annotate("text", x=0.75, y=c(0.25, 0.15, 0.05), label = c(paste('AUC of 1 year =', format(auc_y1,nsmall=3)), paste('AUC of 3 years =', format(auc_y3,nsmall=3)), paste('AUC of 5 years =', format(auc_y5,nsmall=3)) )) ROC pdf("03.Train_ROC.pdf", width = 6, height = 5,family = "Times") ROC dev.off() png("03.Train_ROC.png", width = 6, height = 5,res = 600,units = 'in',family = "Times") ROC dev.off() riskScore <- risk$riskScore risk_dis <- ggplot(risk, aes(x=reorder(id, riskScore), y=riskScore, color = factor(risk, levels = c('High', 'Low'), labels = c("High Risk", "Low Risk")))) + geom_point() + scale_color_manual(values = c("#A73030FF", "#0073C2FF")) + scale_x_discrete(breaks = risk[order(risk$riskScore),]$id[c(1,100,200,300,400,500,600,700,800,900,1000,1100)], labels = c(1,100,200,300,400,500,600,700,800,900,1000,1100), expand = c(0.02,0)) + geom_vline(xintercept = nrow(risk[which(risk$risk=='Low'),]) + 0.5, lty = 2) + geom_hline(#yintercept = median(risk$riskScore), yintercept = cutpoint['riskScore',], lty =2) + labs(x = "Patients(increasing risk score)", y = "Risk Score", title = "Train Risk Score Distribution") + theme_base() + theme(legend.title = element_blank(), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), # legend.margin = margin(c(-5,4,4,3)), legend.background = element_rect(color = "black", size = .3), plot.title = element_text(size = 15, hjust = 0.5)) risk_dis pdf("04.Train_Risk_Score_Distribution.pdf", width = 7, height = 5,family = "Times") risk_dis dev.off() png("04.Train_Risk_Score_Distribution.png", width = 7, height = 5,res = 600,units = 'in',family = "Times") risk_dis dev.off() surv_stat <- ggplot(risk, aes(x=reorder(id, riskScore), y=OS.time/365, color = factor(OS, levels = c(0,1), labels = c("Alive", "Dead")))) + geom_point() + scale_color_manual(values = c("#0073C2FF","#A73030FF")) + scale_x_discrete(breaks = risk[order(risk$riskScore),]$id[c(1,100,200,300,400,500,600,700,800,900,1000,1100)], labels = c(1,100,200,300,400,500,600,700,800,900,1000,1100), expand = c(0.02,0)) + ylim(c(0,10))+ geom_vline(xintercept = nrow(risk[which(risk$risk=='Low'),]) + 0.5, lty = 2) + labs(x = "Patients(increasing risk score)", y = "Survival time(Years)", title = "Train survival state distribution") + theme_base() + theme(legend.title = element_blank(), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), # legend.margin = margin(c(-5,4,4,3)), legend.background = element_rect(color = "black", size = .3), plot.title = element_text(size = 15, hjust = 0.5)) surv_stat pdf("05.Train_survival_state_Distribution.pdf", width = 7, height = 5,family = "Times") surv_stat dev.off() png("05.Train_survival_state_Distribution.png", width = 7, height = 5,res = 600,units = 'in',family = "Times") surv_stat dev.off() library(ComplexHeatmap) risk <- risk[order(risk$risk, decreasing = T),] risk2 <- data.frame(row.names=risk$id,Risk = risk$risk) risk2$Risk <- as.factor(risk2$Risk) rownames(cox_gene_rt)<-cox_gene_rt[,1] cox_gene_rt2 <- cox_gene_rt[, coxGene] fpkm_case_rt2 <- t(cox_gene_rt2) %>% as.data.frame(.) select.expr <- fpkm_case_rt2[coxGene,rownames(risk2)] select.expr <- log2(select.expr+1) matrix <- t(scale(t(select.expr)))# matrix[matrix< (-1)] <- -1 matrix[matrix>1] <- 1 identical(colnames(matrix), rownames(risk2)) ann.color <- list(Risk = c(High="#dc143c",Low = "#6495ed")) group_rat <- data.frame(row.names=rownames(risk2),risk = risk2$Risk) group_rat$risk <- as.factor(group_rat$risk) cellwidth = 300/nrow(group_rat) cellheight = 200/length(coxGene) pheatmap::pheatmap(matrix,cellwidth = cellwidth,cellheight = cellheight, main = 'Train', fontfamily = "Times", fontsize = 10, annotation_colors = ann.color, annotation_col= risk2, annotation_names_col = F, annotation_names_row = F, color = colorRampPalette(c("navy", "white","firebrick1"))(100), cluster_rows = F, cluster_cols = F, show_colnames = F, width=8,height=5, file='06.heatmap.png') pheatmap::pheatmap(matrix,cellwidth = cellwidth,cellheight = cellheight, main = 'Train', fontfamily = "Times", fontsize = 10, annotation_colors = ann.color, annotation_col= risk2, annotation_names_col = F, annotation_names_row = F, color = colorRampPalette(c("navy", "white","firebrick1"))(100), cluster_rows = F, cluster_cols = F, show_colnames = F, width=8,height=5, file='06.heatmap.pdf') #09_Validation_estimate---------------------------------------------------------------------------------------- rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./08_Validation_estimate")){ dir.create("./08_Validation_estimate") } setwd("./08_Validation_estimate") .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(glmnet) library(readr) library(tidyverse) dat <- read.csv("../00_rawdata/GSE20685/dat(GSE20685).csv",row.names = 1,check.names = F) # "FAM129A""NIBAN1" #rownames(dat)[which(rownames(dat) == "FAM129A")] <- "NIBAN1" #dat <- log2(dat+1) survival.va <- read.csv('../00_rawdata/GSE20685/survival(GSE20685).csv',header = T) #survival.va$sample <- rownames(survival.va) #survival.va <- subset(survival.va,OS.time>59) dat <- dat[,survival.va$sample] lasso_geneids <- read.csv('../06_Lasso/03.lasso_genes.csv',row.names = 1,check.names = F) coef.min <- readRDS('../06_Lasso/06.coef.min.rds') dat <- t(dat)%>%data.frame() dat$sample <- rownames(dat) test_data <- merge(survival.va,dat,by='sample') test_data <- column_to_rownames(test_data,var='sample') lasso_geneids test_data2<-test_data[,lasso_geneids$symbol] #test_data2 <- test_data2[,-2] risk_out<-data.frame(test_data2) risk_out$risk_out<-NA risk_out$riskScore_out<-NA cnt<-1 coef.min<-coef.min[lasso_geneids$symbol,] while (cnt < length(rownames(test_data2))+1) { risk_out$riskScore_out[cnt]<-sum(coef.min*test_data2[cnt,]) cnt = cnt + 1 } dim(test_data2) cnt<-1 while (cnt < length(rownames(test_data2))+1) { risk_out$risk_out[cnt]=as.vector(ifelse(risk_out$riskScore_out[cnt]>median(risk_out$riskScore_out),0,1)) cnt = cnt + 1 } riskScore_out<-as.numeric(risk_out$riskScore_out) coxGene <- lasso_geneids$symbol outCol=c("OS","OS.time",coxGene) risk_out=as.vector(ifelse(riskScore_out>median(riskScore_out),'High','Low')) median(riskScore_out) #2.762742 risk_out <- as.data.frame(c(cbind(id=rownames(cbind(test_data[,outCol], riskScore_out, risk_out)), cbind(test_data[,outCol], riskScore_out, risk_out)))) table(risk_out$risk_out) #write.csv(risk_out,file = '01.risk_va.csv',row.names = T) .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) # # K-M library(survival) library(survminer) OS_dat <- risk_out gene<- colnames(OS_dat) #surv_cutpoint res.cut <- surv_cutpoint(OS_dat,time = 'OS.time',event = 'OS',variables = 'riskScore_out',minprop = 0.4)# cutpoint <- res.cut$cutpoint[1] #0.1854183 OS_dat$group <- factor(ifelse(OS_dat$riskScore_out>cutpoint['riskScore_out',],'High','Low'),levels = c('High','Low')) table(OS_dat$group) # # High Low # 188 139 kmfit_out <- survfit(Surv(OS.time, OS) ~ group, data = OS_dat) surv_pvalue(kmfit_out) #pval=0.01 # obj_is_vector <- function(obj) { # is.vector(obj) && length(dim(obj)) == 0 # } # kmfit <- survfit(Surv(OS.time, OS) ~ risk_out, data = risk_out) # surv_pvalue(kmfit) pdf('02.Verify_survival_KM.pdf',w=7,h=7,onefile=F,family='Times') train_survival_median <- ggsurvplot(kmfit_out, pval = T, conf.int = F, legend.labs=c("High risk","Low risk" ), legend.title="Risk score", title="Verify KM", font.main = c(15,"bold"), risk.table = TRUE, risk.table.col = "strata", linetype = "strata", surv.median.line = "hv", ggtheme = theme_bw(), palette = c("#A73030FF", "#0073C2FF")) print(train_survival_median) dev.off() png('02.Verify_survival_KM.png',w=7,h=7,family='Times',units='in',res=600) train_survival_median <- ggsurvplot(kmfit_out, pval = T, conf.int = F, legend.labs=c("High risk","Low risk" ), legend.title="Risk score", title="Verify KM", font.main = c(15,"bold"), risk.table = TRUE, risk.table.col = "strata", linetype = "strata", surv.median.line = "hv", ggtheme = theme_bw(), palette = c("#A73030FF", "#0073C2FF")) print(train_survival_median) dev.off() colnames(OS_dat) OS_dat <- OS_dat[,-9] colnames(OS_dat)[9] <- 'risk_out' write.csv(OS_dat, file = "01.verify_OS_dat.csv", quote = F) risk2 <- risk_out risk <- OS_dat #4 ROC library(survivalROC) ROC <- risk multi_ROC <- function(time_vector, risk_score_table){ library(survivalROC) single_ROC <- function(single_time){ for_ROC <- survivalROC(Stime=risk_score_table$OS.time, status=risk_score_table$OS, marker=risk_score_table$riskScore, predict.time=single_time, method = 'KM') data.frame('True_positive'=for_ROC$TP, 'False_positive'=for_ROC$FP, 'Cut_values'=for_ROC$cut.values, 'Time'=rep(single_time, length(for_ROC$TP)), 'AUC'=rep(for_ROC$AUC, length(for_ROC$TP))) } multi_ROC_list <- lapply(time_vector, single_ROC) do.call(rbind, multi_ROC_list) } for_multi_ROC <- multi_ROC(time_vector = c(365*seq(1,5,2)), risk_score_table = risk_out) table(for_multi_ROC$AUC) for_multi_ROC$Time <- factor(for_multi_ROC$Time) # ROC library(geomROC) library(plotROC) auc_y1 <- round(for_multi_ROC[which(for_multi_ROC$Time==365),5][1],3) #auc_y2 <- round(for_multi_ROC[which(for_multi_ROC$Time==730),5][1],2) auc_y3 <- round(for_multi_ROC[which(for_multi_ROC$Time==1095),5][1],3) #auc_y4 <- round(for_multi_ROC[which(for_multi_ROC$Time==1460),5][1],2) auc_y5 <- round(for_multi_ROC[which(for_multi_ROC$Time==1825),5][1],3) #auc_y6 <- round(for_multi_ROC[which(for_multi_ROC$Time==2190),5][1],2) #auc_y7 <- round(for_multi_ROC[which(for_multi_ROC$Time==2555),5][1],2) ROC <- ggplot(for_multi_ROC, aes(x=False_positive, y=True_positive, label=Cut_values, color=Time)) + scale_color_manual(breaks = c('365',"1095",'1825'), labels = c("1 year","3 years","5 years"), values = c("#4682B4", "#FF7F00", "#984EA3")) + plotROC::geom_roc(labels = F, stat = 'identity') + style_roc() + geom_abline(slope = 1, intercept = 0, color = 'gray', linetype=2) + theme_bw() + labs(title = "Verify ROC") + theme(panel.background = element_rect(colour = 'black', size = 1, fill = 'white'), panel.grid = element_blank(), plot.title = element_text(size = 15, hjust = 0.5, face = "bold")) + annotate("text", x=0.75, y=c(0.25, 0.15, 0.05), label = c(paste('AUC of 1 year =', format(auc_y1,nsmall=2)), paste('AUC of 3 years =', format(auc_y3,nsmall=2)), paste('AUC of 5 years =', format(auc_y5,nsmall=2)) )) ROC pdf("03.Verify_ROC.pdf", width = 6, height = 5,family = "Times") ROC dev.off() png("03.Verify_ROC.png", width = 6, height = 5,res = 600,units = 'in',family = "Times") ROC dev.off() library(ggthemes) #risk <- risk_out riskScore_out <- risk$riskScore_out risk_dis <- ggplot(risk, aes(x=reorder(id, riskScore_out), y=riskScore_out, color = factor(risk_out, levels = c('High', 'Low'), labels = c("High Risk", "Low Risk")))) + geom_point() + scale_color_manual(values = c("#A73030FF", "#0073C2FF")) + scale_x_discrete(breaks = risk[order(risk$riskScore_out),]$id[c(1,30,60,90,120,150,180,210,240,270,300,330)], labels = c(1,30,60,90,120,150,180,210,240,270,300,330), expand = c(0.02,0)) + geom_vline(xintercept = nrow(risk[which(risk$risk_out=='Low')+0.5,]) , lty = 2) + geom_hline(#yintercept = median(risk_out$riskScore_out), yintercept = cutpoint['riskScore',], lty =2) + labs(x = "Patients(increasing risk score)", y = "Risk Score", title = "Verify Risk Score Distribution") + theme_base() + theme(legend.title = element_blank(), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), # legend.margin = margin(c(-5,4,4,3)), legend.background = element_rect(color = "black", size = .3), plot.title = element_text(size = 15, hjust = 0.5)) risk_dis pdf("04.Verify_Risk_Score_Distribution.pdf", width = 7, height = 5,family = "Times") risk_dis dev.off() png("04.Verify_Risk_Score_Distribution.png", width = 7, height = 5,res = 600,units = 'in',family = "Times") risk_dis dev.off() surv_stat <- ggplot(risk, aes(x=reorder(id, riskScore_out), y=OS.time/365, color = factor(OS, levels = c(0,1), labels = c("Alive", "Dead")))) + geom_point() + scale_color_manual(values = c("#0073C2FF","#A73030FF")) + scale_x_discrete(breaks = risk[order(risk$riskScore_out),]$id[c(1,30,60,90,120,150,180,210,240,270,300,330)], labels = c(1,30,60,90,120,150,180,210,240,270,300,330), expand = c(0.02,0)) + ylim(c(0,20))+ geom_vline(xintercept = nrow(risk[which(risk$risk_out=='Low'),]) + 0.5, lty = 2) + labs(x = "Patients(increasing risk score)", y = "Survival time(Years)", title = "Verify survival state distribution") + theme_base() + theme(legend.title = element_blank(), legend.key = element_blank(), legend.justification = c(0,1), legend.position = c(0,1), # legend.margin = margin(c(-5,4,4,3)), legend.background = element_rect(color = "black", size = .3), plot.title = element_text(size = 15, hjust = 0.5)) surv_stat pdf("05.Verify_survival_state_Distribution.pdf", width = 7, height = 5,family = "Times") surv_stat dev.off() png("05.Verify_survival_state_Distribution.png", width = 7, height = 5,res = 600,units = 'in',family = "Times") surv_stat dev.off() library(ComplexHeatmap) risk <- risk[order(risk$risk_out, decreasing = T),] risk2 <- data.frame(row.names=risk$id,Risk = risk$risk_out) risk2$Risk <- as.factor(risk2$Risk) cox_gene_rt2 <- dat[, coxGene] fpkm_case_rt2 <- t(cox_gene_rt2) %>% as.data.frame(.) select.expr <- fpkm_case_rt2[coxGene,rownames(risk2)] select.expr <- log2(select.expr+1) matrix <- t(scale(t(select.expr)))# matrix[matrix< (-1)] <- -1 matrix[matrix>1] <- 1 identical(colnames(matrix), rownames(risk2)) ann.color <- list(Risk = c(High="#dc143c",Low = "#6495ed")) group_rat <- data.frame(row.names=rownames(risk2),risk = risk2$Risk) group_rat$risk <- as.factor(group_rat$risk) cellwidth = 300/nrow(group_rat) cellheight = 200/length(coxGene) pheatmap::pheatmap(matrix,cellwidth = cellwidth,cellheight = cellheight, main = 'Verify', fontfamily = "Times", fontsize = 10, annotation_colors = ann.color, annotation_col= risk2, annotation_names_col = F, annotation_names_row = F, color = colorRampPalette(c("navy", "white","firebrick1"))(100), cluster_rows = F, cluster_cols = F, show_colnames = F, width=8,height=5, file='06.heatmap.png') pheatmap::pheatmap(matrix,cellwidth = cellwidth,cellheight = cellheight, main = 'Verify', fontfamily = "Times", fontsize = 10, annotation_colors = ann.color, annotation_col= risk2, annotation_names_col = F, annotation_names_row = F, color = colorRampPalette(c("navy", "white","firebrick1"))(100), cluster_rows = F, cluster_cols = F, show_colnames = F, width=8,height=5, file='06.heatmap.pdf') rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./09_progmodel")){ dir.create("./09_progmodel") } setwd("./09_progmodel") ## 07-1 Cox---------- #.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) survival <- read.csv('../00_rawdata/04.survival_TCGA.csv',row.names = 1) phenotype<-read.csv('../00_rawdata/05.phenotype_TCGA.csv') colnames(phenotype) train_phenotype<-data.frame(sample=phenotype$sample, age=phenotype$age, gender=phenotype$gender, stage=phenotype$stage, T.stage=phenotype$T_stage, N.stage=phenotype$N_stage, M.stage=phenotype$M_stage) write.csv(train_phenotype,'01.train_phenotype.csv') # Lifelong Non-smoker (less than 100 cigarettes smoked in Lifetime) = 1 # Current smoker (includes daily smokers and non-daily smokers or occasional smokers) = 2 # Current reformed smoker for > 15 years (greater than 15 years) = 3 # Current reformed smoker for ≤15 years (less than or equal to 15 years) = 4 # Current reformed smoker, duration not specified = 5 # Smoking History not documented = 7 train_phenotype<-merge(train_phenotype,survival,by='sample') train_phenotype$OS<-as.numeric(train_phenotype$OS) train_phenotype$OS.time<-as.numeric(train_phenotype$OS.time) train_phenotype2<-train_phenotype table(train_phenotype2$stage) train_phenotype2$stage <- gsub('A','',train_phenotype2$stage) train_phenotype2$stage <- gsub('B','',train_phenotype2$stage) train_phenotype2$stage <- gsub('C','',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage III','3/4',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage IV','3/4',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage II','2',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage I','1',train_phenotype2$stage) #train_phenotype2$stage <- gsub('not reported',NA,train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage X',NA,train_phenotype2$stage) table(train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('a','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('b','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('c','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('d','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('TX',NA,train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T3','3/4',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T4','3/4',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T2','2',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T1','1',train_phenotype2$T.stage) table(train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('a','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('b','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('c','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('d','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub(' (i-)','',train_phenotype2$N.stage,fixed = T) train_phenotype2$N.stage<-gsub(' (i+)','',train_phenotype2$N.stage,fixed = T) train_phenotype2$N.stage<-gsub(' (mol+)','',train_phenotype2$N.stage,fixed = T) train_phenotype2$N.stage<-gsub('mi','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N3','3',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N2','2',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N1','1',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N0','0',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('NX',NA,train_phenotype2$N.stage) table(train_phenotype2$M.stage) train_phenotype2$M.stage<-gsub('MX',NA,train_phenotype2$M.stage) train_phenotype2$M.stage<-gsub(' (i+)','',train_phenotype2$M.stage,fixed = T) train_phenotype2$M.stage<-gsub('c','',train_phenotype2$M.stage,fixed = T) train_phenotype2$M.stage<-gsub('M0','0',train_phenotype2$M.stage) train_phenotype2$M.stage<-gsub('M1','1',train_phenotype2$M.stage) train_phenotype2 <- na.omit(train_phenotype2) table(train_phenotype2$age) train_phenotype2$age <- as.numeric(train_phenotype2$age) train_phenotype2$age <- ifelse(train_phenotype2$age>60,1,0) median(train_phenotype2$age) table(train_phenotype2$gender) train_phenotype2$gender <- ifelse(train_phenotype2$gender=='female',1,0) colnames(train_phenotype2) colnames(train_phenotype2)<-c('id','Age','Gender','Stage','T.stage','N.stage',"M.stage",'OS.time','OS') risk<-read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1) sub_risk <- subset(risk, select = c(id, riskScore)) train_risk_clinical <- merge(train_phenotype2, sub_risk, by = "id") rownames(train_risk_clinical) <- train_risk_clinical$id train_risk_clinical = subset(train_risk_clinical, select = -c(id)) dim(train_risk_clinical) os_risk_clinical <- train_risk_clinical colnames_train <- colnames(train_risk_clinical) covariates_train <- colnames_train[-which(colnames_train %in% c("OS", "OS.time"))] covariates <- covariates_train train_risk_clinical$T.stage<-factor(train_risk_clinical$T.stage) train_risk_clinical$N.stage<-factor(train_risk_clinical$N.stage) train_risk_clinical$M.stage<-factor(train_risk_clinical$M.stage) train_risk_clinical$Stage <- factor(train_risk_clinical$Stage) library(survival) library(dplyr) res.risk = coxph(Surv(time = OS.time, event = OS) ~ riskScore, data = train_risk_clinical) %>% summary res.risk = c(res.risk$conf.int[-2], res.risk$coefficients[5]) res.age = coxph(Surv(time = OS.time, event = OS) ~ Age, data = train_risk_clinical) %>% summary res.age = c(res.age$conf.int[-2], res.age$coefficients[5]) res.gender = coxph(Surv(time = OS.time, event = OS) ~ Gender, data = train_risk_clinical) %>% summary res.gender = c(res.gender$conf.int[-2], res.gender$coefficients[5]) # res.smoke = coxph(Surv(time = OS.time, event = OS) ~ Smoking, data = train_risk_clinical) %>% summary # res.smoke = cbind(res.smoke$conf.int[,-2], res.smoke$coefficients[,5]) res.stage = coxph(Surv(time = OS.time, event = OS) ~ Stage, data = train_risk_clinical) %>% summary res.stage = cbind(res.stage$conf.int[,-2], res.stage$coefficients[,5]) res.T_stage = coxph(Surv(time = OS.time, event = OS) ~ T.stage, data = train_risk_clinical) %>% summary res.T_stage = cbind(res.T_stage$conf.int[,-2], res.T_stage$coefficients[,5]) res.M_stage = coxph(Surv(time = OS.time, event = OS) ~ M.stage, data = train_risk_clinical) %>% summary res.M_stage = c(res.M_stage$conf.int[-2], res.M_stage$coefficients[5]) res.N_stage = coxph(Surv(time = OS.time, event = OS) ~ N.stage, data = train_risk_clinical) %>% summary res.N_stage = cbind(res.N_stage$conf.int[,-2], res.N_stage$coefficients[,5]) res.ref = c(1,1,1,NA) res = rbind(res.risk,res.age,res.gender,res.ref,res.stage,res.ref,res.T_stage,res.ref,res.N_stage,res.M_stage) %>% as.data.frame() rownames(res) res$Indicators = c("riskScore","Age","Gender","Stage1(Reference)",'Stage2','Stage3/4',"T1(Reference)",'T2','T3/4','N0(Reference)','N1','N2','N3','M1 vs M0') colnames(res) = c("hr","low","up","pv","Indicator") res$p = signif(res$pv, 2) %>% paste0("p = ", .) res$p[is.na(res$pv)] = NA res$Indicator = factor(res$Indicator, levels = rev(res$Indicator)) rownames(res) <- res$Indicator res2 <- data.frame(p.value=res$pv, HR=res$hr, HR.95L=res$low, HR.95H=res$up, Indicator=res$Indicator) rownames(res2) <- res2$Indicator write.csv(res2, file = "univariate_cox_prog_forest.csv",quote = F) res2 <- subset(res2, select = -c(Indicator)) library(tidyr) hz <- paste(round(res2$HR,3), "(",round(res2$HR.95L,3), "-",round(res2$HR.95H,3),")",sep = "") hz hz[c(4,7,10)] <- "" tabletext <- cbind(c(NA,rownames(res2)), c("P value",ifelse(res2$p.value<0.0001, "< 0.0001", round(res2$p.value,4))), c("Hazard Ratio(95% CI)",hz)) nrow(tabletext) + 1 library(forestplot) pdf(file = '01.uncoxforest.pdf',height = 6, width = 12, onefile = F) forestplot(labeltext=tabletext, graph.pos=4, #Pvalue is.summary = c(TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,rep(FALSE, 7)), col=fpColors(box="darkred", lines="darkblue", zero = "gray50"), mean=c(NA,res2$HR), lower=c(NA,res2$HR.95L), #95% upper=c(NA,res2$HR.95H), #95% boxsize=0.2,lwd.ci=3, #, ci.vertices.height = 0.08,ci.vertices=TRUE, #、、 zero=1,lwd.zero=0.5, #zero colgap=unit(5,"mm"), # xticks = c(0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9), # lwd.xaxis=2, #X lineheight = unit(1.2,"cm"), # graphwidth = unit(.6,"npc"), # cex=0.9, fn.ci_norm = fpDrawCircleCI, # hrzl_lines = list("2" = gpar(col = "black", lty = 1, lwd = 2)), # hrzl_lines=list("2" = gpar(lwd=2, col="black"), # "3" = gpar(lwd=2, col="black"), #, # "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1 # mar=unit(rep(0.01, times = 4), "cm"),# #fpTxtGpcex txt_gp=fpTxtGp(label=gpar(cex=1), ticks=gpar(cex=0.8, fontface = "bold"), xlab=gpar(cex = 1, fontface = "bold"), title=gpar(cex = 1.25, fontface = "bold")), xlab="Hazard Ratio", grid = T, title = "Univariate", clip = c(0,9)) # x, dev.off() png(file = '01.uncoxforest.png',height = 400, width = 900) forestplot(labeltext=tabletext, graph.pos=4, #Pvalue is.summary = c(TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,rep(FALSE, 7)), col=fpColors(box="darkred", lines="darkblue", zero = "gray50"), mean=c(NA,res2$HR), lower=c(NA,res2$HR.95L), #95% upper=c(NA,res2$HR.95H), #95% boxsize=0.2,lwd.ci=3, #, ci.vertices.height = 0.08,ci.vertices=TRUE, #、、 zero=1,lwd.zero=0.5, #zero colgap=unit(5,"mm"), # xticks = c(0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9), # lwd.xaxis=2, #X lineheight = unit(1.2,"cm"), # graphwidth = unit(.6,"npc"), # cex=0.9, fn.ci_norm = fpDrawCircleCI, # hrzl_lines = list("2" = gpar(col = "black", lty = 1, lwd = 2)), # hrzl_lines=list("2" = gpar(lwd=2, col="black"), # "3" = gpar(lwd=2, col="black"), #, # "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1 # mar=unit(rep(0.01, times = 4), "cm"),# #fpTxtGpcex txt_gp=fpTxtGp(label=gpar(cex=1), ticks=gpar(cex=0.8, fontface = "bold"), xlab=gpar(cex = 1, fontface = "bold"), title=gpar(cex = 1.25, fontface = "bold")), xlab="Hazard Ratio", grid = T, title = "Univariate", clip = c(0,9)) # x, dev.off() colnames(train_risk_clinical) #PH------------- library(survival) library(survminer) res.all <- data.frame() for (i in 1:length(covariates)) { #i <- 1 gene_list <- covariates[i] cox_data <- as.formula(paste0('Surv(OS.time, OS)~', paste(gene_list, sep = '', collapse = '+'))) cox_more <- coxph(cox_data, data = os_risk_clinical) cox_zph <- cox.zph(cox_more) cox_table <- cox_zph$table[-nrow(cox_zph$table),] cox_table res <- data.frame(gene=covariates[i],PH.p=cox_table[3]) res.all <- rbind(res.all, res) p <- ggcoxzph(cox_zph) # pdf(paste0('01.ph.test_',gene_list,'.pdf'), width = 6,height = 4,onefile = F,family='Times') # print(p) # dev.off() # # png(paste0('01.ph.test_',gene_list,'.png'), width = 6,height = 4,family='Times',units='in',res=600) # print(p) # dev.off() } colnames(res.all)[1] <- 'phenotype' write.csv(res.all, file = "01.uniCox_PH.csv", quote = F, row.names = F) ph.gene <- res.all[as.numeric(res.all$PH.p)>0.05,] dim(ph.gene) ph.gene$phenotype # "Age" "Gender" "N.stage" "M.stage" "riskScore" ## 07-2 Cox---------- cox_more = coxph(Surv(time = OS.time, event = OS) ~ riskScore+Age+Stage+T.stage+N.stage+M.stage, data = train_risk_clinical) cox_zph <- cox.zph(cox_more) cox_table <- cox_zph$table[-nrow(cox_zph$table),] #PH cox_table2 <- data.frame(cox_table) write.csv(cox_table2,'PH_test.csv') #T stage res.mul = coxph(Surv(time = OS.time, event = OS) ~ riskScore+Age+N.stage+M.stage, data = train_risk_clinical)%>% summary res.mul = cbind(res.mul$conf.int[,-2], res.mul$coefficients[,5]) %>% as.data.frame() rownames(res.mul) res.mul = rbind(res.mul[c(1:2),], res.ref, res.mul[c(3:6),]) res.mul$Indicators = c("riskScore",'Age','N0(Reference)','N1','N2','N3','M1 vs M0') colnames(res.mul) = c("hr","low","up","pv","Indicator") res.mul$p = signif(res.mul$pv, 2) %>% paste0("p = ", .) res.mul$p[is.na(res.mul$pv)] = NA res.mul$Indicator = factor(res.mul$Indicator, levels = rev(res.mul$Indicator)) rownames(res.mul) <- res.mul$Indicator #res.mul <- res.mul[1,] multi_res <- data.frame(p.value=res.mul$pv, HR=res.mul$hr, HR.95L=res.mul$low, HR.95H=res.mul$up, Indicator=res.mul$Indicator) rownames(multi_res) <- multi_res$Indicator multi_res write.csv(multi_res, file = "multivariate_cox_prog_result.csv", quote = F, row.names = T) multi_res <- subset(multi_res, select = -c(Indicator)) library(tidyr) hz <- paste(round(multi_res$HR,3), "(",round(multi_res$HR.95L,3), "-",round(multi_res$HR.95H,3),")",sep = "") hz hz[c(3)] <- "" tabletext <- cbind(c(NA,rownames(multi_res)), c("P value",ifelse(multi_res$p.value<0.0001, "< 0.0001", round(multi_res$p.value,4))), c("Hazard Ratio(95% CI)",hz)) nrow(tabletext) + 1 pdf(file = '02.mulcoxforest.pdf',height = 6, width = 12, onefile = F) forestplot(labeltext=tabletext, graph.pos=4, #Pvalue is.summary = c(TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE), col=fpColors(box="darkred", lines="darkblue", zero = "gray50"), mean=c(NA,multi_res$HR), lower=c(NA,multi_res$HR.95L), #95% upper=c(NA,multi_res$HR.95H), #95% boxsize=0.2,lwd.ci=3, #, ci.vertices.height = 0.08,ci.vertices=TRUE, #、、 zero=1,lwd.zero=0.5, #zero colgap=unit(5,"mm"), # xticks = c(0,1,2,3,4,5,6,7), # lwd.xaxis=2, #X lineheight = unit(1.2,"cm"), # graphwidth = unit(.6,"npc"), # cex=0.9, fn.ci_norm = fpDrawCircleCI, # hrzl_lines = list("2" = gpar(col = "black", lty = 1, lwd = 2)), # hrzl_lines=list("2" = gpar(lwd=2, col="black"), # "3" = gpar(lwd=2, col="black"), #, # "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1 # mar=unit(rep(0.01, times = 4), "cm"),# #fpTxtGpcex txt_gp=fpTxtGp(label=gpar(cex=1), ticks=gpar(cex=0.8, fontface = "bold"), xlab=gpar(cex = 1, fontface = "bold"), title=gpar(cex = 1.25, fontface = "bold")), xlab="Hazard Ratio", grid = T, title = "Multivariate", clip = c(0,7)) # x, dev.off() png(file = '02.mulcoxforest.png',height = 6, width = 12,units = 'in',res = 600) forestplot(labeltext=tabletext, graph.pos=4, #Pvalue is.summary = c(TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE), col=fpColors(box="darkred", lines="darkblue", zero = "gray50"), mean=c(NA,multi_res$HR), lower=c(NA,multi_res$HR.95L), #95% upper=c(NA,multi_res$HR.95H), #95% boxsize=0.2,lwd.ci=3, #, ci.vertices.height = 0.08,ci.vertices=TRUE, #、、 zero=1,lwd.zero=0.5, #zero colgap=unit(5,"mm"), # xticks = c(0,1,2,3,4,5,6,7), # lwd.xaxis=2, #X lineheight = unit(1.2,"cm"), # graphwidth = unit(.6,"npc"), # cex=0.9, fn.ci_norm = fpDrawCircleCI, # hrzl_lines = list("2" = gpar(col = "black", lty = 1, lwd = 2)), # hrzl_lines=list("2" = gpar(lwd=2, col="black"), # "3" = gpar(lwd=2, col="black"), #, # "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1 # mar=unit(rep(0.01, times = 4), "cm"),# #fpTxtGpcex txt_gp=fpTxtGp(label=gpar(cex=1), ticks=gpar(cex=0.8, fontface = "bold"), xlab=gpar(cex = 1, fontface = "bold"), title=gpar(cex = 1.25, fontface = "bold")), xlab="Hazard Ratio", grid = T, title = "Multivariate", clip = c(0,7)) # x, dev.off() ##07-3 COX,--------- #train_risk_clinical2 <- train_risk_clinical multi_cov<-c('riskScore','Age','N.stage','M.stage') cox_data_prog <- as.formula(paste0('Surv(OS.time, OS)~', paste(multi_cov, sep = '', collapse = '+'))) cox_more_prog <- coxph(cox_data_prog, data = as.data.frame(train_risk_clinical)) # Nomogram library(rms) ddist <- datadist(train_risk_clinical) options(datadist='ddist') # COX, res.cox <- psm(cox_data_prog, data = train_risk_clinical, dist = 'lognormal') surv <- Survival(res.cox) # function(x) surv(365, x) # 1 # function(x) surv(730, x) # 2 function(x) surv(1095, x) # 3 function(x) surv(1825, x) # 5 #function(x) surv(2555, x) # 5 nom.cox <- nomogram(res.cox, fun = list(function(x) surv(365, x), function(x)surv(1095, x), function(x)surv(1825, x)), funlabel=c("1-year Survival Probability", "3-year Survival Probability", "5-year Survival Probability"), maxscale = 10, fun.at = c(0.01,seq(0.1,0.9,by=0.2),0.95,0.99), lp=F) plot(nom.cox, cex.axis = 1.5, cex.var = 1.7) png(filename = "03.nomogram_line_points.png", height = 10, width = 17,units = 'in',res = 300) plot(nom.cox, cex.axis = 1.5, cex.var = 1.7) dev.off() pdf(file = "03.nomogram_line_points.pdf", height = 10, width = 17) plot(nom.cox, cex.axis = 1.5, cex.var = 1.7) dev.off() library(rms) library(regplot) ddist <- datadist(train_risk_clinical) options(datadist = "ddist") y3 <- 1 y5 <- 3 y7 <- 5 LR1 <- survival::coxph(Surv(OS.time,OS) ~ riskScore+Age+N.stage+M.stage,data = train_risk_clinical) p <- regplot::regplot(#res.cox, LR1, observation = train_risk_clinical[200,], interval = "confidence", points=TRUE,## title = "Nomogram", clickable = F,## subticks = T, failtime = c(365*y3,365*y5,365*y7), prfail = F,##T1、23,F1、23 plots=c("density","boxes"), dencol="#95A2BF", boxcol="#ED7474", droplines=TRUE ) ## # 03.nomogram_line_points_after.png # 03.nomogram_line_points_after.pdf ##07-4 --------- coxm_3 <- cph(cox_data_prog, data=train_risk_clinical, surv=T, x=T,y=T, time.inc = 1*365) cal_3 <-calibrate(coxm_3,u=1*365,cmethod='KM',m=100,B=100) coxm_5 <- cph(cox_data_prog, data=train_risk_clinical, surv=T, x=T,y=T, time.inc = 3*365) cal_5 <-calibrate(coxm_5,u=3*365,cmethod='KM',m=100,B=100) coxm_7 <- cph(cox_data_prog, data=train_risk_clinical, surv=T, x=T,y=T, time.inc = 5*365) cal_7 <-calibrate(coxm_7,u=5*365,cmethod='KM',m=100,B=100) pdf(file = '04.calibrate.pdf',w=8,h=8) par(mar=c(7,4,4,3),cex=1.5) plot(cal_3, subtitles = F, lwd=2,lty=1, ## errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ## xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',# ylab='Actual 1-5 year Survival Probability',# col="#00468b",# xlim = c(0,1),ylim = c(0,1)) ##xy plot(cal_5, add = T, subtitles = F, lwd=2,lty=1, ## errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ## xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',# ylab='Actual 1-5 year Survival Probability',# col="#ed0000",# xlim = c(0,1),ylim = c(0,1)) ##xy plot(cal_7, add = T, subtitles = F, lwd=2,lty=1, ## errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ## xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',# ylab='Actual 1-5 year Survival Probability',# col="#42b540",# xlim = c(0,1),ylim = c(0,1)) ##xy # legend("bottomright", legend=c("1-year", "3-years", "5-years"), col=c("#00468b", "#ed0000", "#42b540"), lwd=2) # abline(0,1,lty=5,lwd=2,col="grey") dev.off() png(file = '04.calibrate.png',w=8,h=8,units = 'in',res = 300) par(mar=c(7,4,4,3),cex=1.5) plot(cal_3, subtitles = F, lwd=2,lty=1, ## errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ## xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',# ylab='Actual 1-5 year Survival Probability',# col="#00468b",# xlim = c(0,1),ylim = c(0,1)) ##xy plot(cal_5, add = T, subtitles = F, lwd=2,lty=1, ## errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ## xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',# ylab='Actual 1-5 year Survival Probability',# col="#ed0000",# xlim = c(0,1),ylim = c(0,1)) ##xy plot(cal_7, add = T, subtitles = F, lwd=2,lty=1, ## errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ## xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',# ylab='Actual 1-5 year Survival Probability',# col="#42b540",# xlim = c(0,1),ylim = c(0,1)) ##xy # legend("bottomright", legend=c("1-year", "3-years", "5-years"), col=c("#00468b", "#ed0000", "#42b540"), lwd=2) # abline(0,1,lty=5,lwd=2,col="grey") dev.off() # # ## DCA----- # library(rmda) # dca.dat <- train_risk_clinical # complex<-decision_curve(OS ~ riskScore+Age+N.stage+M.stage,data =dca.dat,family = binomial(link ='logit'), # thresholds = seq(0,1, by = 0.01), # confidence.intervals= 0.95, # study.design = 'case-control', # population.prevalence= 0.3 # ) # riskScore.dca <- decision_curve(OS ~riskScore,data =dca.dat,family = binomial(link ='logit'), # thresholds = seq(0,1, by = 0.01), # confidence.intervals= 0.95, # study.design = 'case-control', # population.prevalence= 0.3 # ) # Age.dca <- decision_curve(OS ~Age,data =dca.dat,family = binomial(link ='logit'), # thresholds = seq(0,1, by = 0.01), # confidence.intervals= 0.95, # study.design = 'case-control', # population.prevalence= 0.3 # ) # N.stage.dca <- decision_curve(OS ~N.stage,data =dca.dat,family = binomial(link ='logit'), # thresholds = seq(0,1, by = 0.01), # confidence.intervals= 0.95, # study.design = 'case-control', # population.prevalence= 0.3 # ) # M.stage.dca <- decision_curve(OS ~M.stage,data =dca.dat,family = binomial(link ='logit'), # thresholds = seq(0,1, by = 0.01), # confidence.intervals= 0.95, # study.design = 'case-control', # population.prevalence= 0.3 # ) # dca.list <- list(riskScore.dca,Age.dca,N.stage.dca,M.stage.dca,complex) # # pdf("05.DCA.pdf", width = 5, height = 5) # plot_decision_curve(dca.list, # curve.names=c('riskScore','Age','N.stage','M.stage','Nomogram'), # cost.benefit.axis =FALSE,col= c("#FDC086","#7FC97F","#BEAED4","#74CCBE","#ED7474"), # confidence.intervals=FALSE, # standardize = FALSE) # dev.off() # # png("05.DCA.png", width = 5, height = 5,units = 'in',res = 300) # plot_decision_curve(dca.list, # curve.names=c('riskScore','Age','N.stage','M.stage','Nomogram'), # cost.benefit.axis =FALSE,col= c("#FDC086","#7FC97F","#BEAED4","#74CCBE","#ED7474"), # confidence.intervals=FALSE, # standardize = FALSE) # dev.off() # ##roc res.mul = coxph(Surv(time = OS.time, event = OS) ~ riskScore+Age+N.stage+M.stage, data = train_risk_clinical) train_risk_clinical$riskscore_prog = predict(res.mul, newdata = train_risk_clinical, type = "lp") train_risk_clinical$risk_prog = ifelse(train_risk_clinical$riskscore_prog > median(train_risk_clinical$riskscore_prog, na.rm = T), "high", "low") train_risk_clinical$risk_prog = factor(train_risk_clinical$risk_prog, levels = c("high", "low"), labels = c("High risk", "Low risk")) train_risk_clinical2 <- train_risk_clinical library(survival) library(survminer) multi_ROC_out <- function(time_vector, risk_score_table){ library(survivalROC) single_ROC <- function(single_time){ for_ROC <- survivalROC(Stime=risk_score_table$OS.time, status=risk_score_table$OS, marker=risk_score_table$riskscore_prog, predict.time=single_time,method = 'KM') data.frame('True_positive'=for_ROC$TP, 'False_positive'=for_ROC$FP, 'Cut_values'=for_ROC$cut.values, 'Time'=rep(single_time, length(for_ROC$TP)), 'AUC'=rep(for_ROC$AUC, length(for_ROC$TP))) } multi_ROC_list <- lapply(time_vector, single_ROC) do.call(rbind, multi_ROC_list) } for_multi_ROC <- multi_ROC_out(time_vector = c(1*365,3*365,5*365), risk_score_table = train_risk_clinical2) for_multi_ROC$Time <- factor(for_multi_ROC$Time) # ROC library(scales) library(geomROC) library(plotROC) auc_y3 <- round(for_multi_ROC[which(for_multi_ROC$Time==1*365),5][1],2) auc_y5 <- round(for_multi_ROC[which(for_multi_ROC$Time==3*365),5][1],2) auc_y7 <- round(for_multi_ROC[which(for_multi_ROC$Time==5*365),5][1],2) ROC <- ggplot(for_multi_ROC, aes(x=False_positive, y=True_positive, label=Cut_values, color=Time)) + scale_color_manual(breaks = c("365",'1095','1825'), labels = c("1 year","3 year","5 year"), values = c("#4682B4", "#FF7F00", "#984EA3")) + plotROC::geom_roc(labels = F, stat = 'identity') + style_roc() + geom_abline(slope = 1, intercept = 0, color = 'gray', linetype=2) + theme_bw() + labs(title = "Independent prognosis ROC") + theme(panel.background = element_rect(colour = 'black', size = 1, fill = 'white'), panel.grid = element_blank(), plot.title = element_text(size = 15, hjust = 0.5, face = "bold")) + # annotate('text', x=.75, y=.25, label=paste('AUC of 1 years =', round(auc_y1,2))) + # annotate('text', x=.75, y=.15, label=paste('AUC of 3 years =', round(auc_y3,2))) + # annotate('text', x=.75, y=.05, label=paste('AUC of 5 years =', round(auc_y5,2))) + annotate("text", x=0.75, y=c(0.25, 0.15, 0.05), label = c(paste('AUC of 1 year =', format(auc_y3,nsmall=2)), paste('AUC of 3 years =', format(auc_y5,nsmall=2)), paste('AUC of 5 years =', format(auc_y7,nsmall=2)))) ROC ggsave('06.Prog_ROC.png', ROC,width = 6, height = 5) ggsave('06.Prog_ROC.pdf', ROC,width = 6, height = 5) # # DCA--------------------------------------------------------------------- # Nomogram<- coxph(cox_data_prog,data = train_risk_clinical) # library(ggDCA) # # fig3<-ggDCA::dca(Nomogram, # new.data = NULL, # times=365*1 # ) # fig5<-ggDCA::dca(Nomogram, # new.data = NULL, # times=365*3 # ) # fig7<-ggDCA::dca(Nomogram, # new.data = NULL, # times=5*365 # ) # # library(ggprism) # p3<-ggplot(fig3,linetype =F,lwd = 1.2)+ # theme_classic()+ # theme_prism(base_size =17)+ # theme(legend.position="top")+ # theme(axis.title.x = element_text(size = 22, face = "bold", family = "Times"), # axis.title.y = element_text(size = 22, face = "bold", family = "Times"), # axis.text.x = element_text(size = 17, color='black',face = "bold", family = "Times"), # axis.text.y = element_text(size = 17, face = "bold", family = "Times"), # legend.text = element_text(size = 17, family = "Times",face = "bold"), # legend.title = element_blank(), # plot.title = element_text(size = 22, face = "bold", family = "Times",hjust=0.5), # #plot.margin = ggplot2::margin(t=.3,b=0,l=2,r=.5, unit = "cm"), # text = element_text(family = "Times"), # panel.grid.major=element_blank(),panel.grid.minor=element_blank())+ # scale_x_continuous( # #limits = c(0, 1), #x # guide = "prism_minor") + # scale_colour_prism( # palette = "candy_bright", # name = "Cylinders", # label = c("3 Year", "ALL", "None")) # # p3 # # p5<-ggplot(fig5,linetype =F,lwd = 1.2)+ # theme_classic()+ # theme_prism(base_size =17)+ # theme(legend.position="top")+ # theme(axis.title.x = element_text(size = 22, face = "bold", family = "Times"), # axis.title.y = element_text(size = 22, face = "bold", family = "Times"), # axis.text.x = element_text(size = 17, color='black',face = "bold", family = "Times"), # axis.text.y = element_text(size = 17, face = "bold", family = "Times"), # legend.text = element_text(size = 17, family = "Times",face = "bold"), # legend.title = element_blank(), # plot.title = element_text(size = 22, face = "bold", family = "Times",hjust=0.5), # #plot.margin = ggplot2::margin(t=.3,b=0,l=2,r=.5, unit = "cm"), # text = element_text(family = "Times"), # panel.grid.major=element_blank(),panel.grid.minor=element_blank())+ # scale_x_continuous( # #limits = c(0, 1), #x # guide = "prism_minor") + # scale_colour_prism( # palette = "candy_bright", # name = "Cylinders", # label = c("5 Year", "ALL", "None")) # # p5 # # p7 <- ggplot(fig7,linetype =F,lwd = 1.2)+ # theme_classic()+ # theme_prism(base_size =17)+ # theme(legend.position="top")+ # theme(axis.title.x = element_text(size = 22, face = "bold", family = "Times"), # axis.title.y = element_text(size = 22, face = "bold", family = "Times"), # axis.text.x = element_text(size = 17, color='black',face = "bold", family = "Times"), # axis.text.y = element_text(size = 17, face = "bold", family = "Times"), # legend.text = element_text(size = 17, family = "Times",face = "bold"), # legend.title = element_blank(), # plot.title = element_text(size = 22, face = "bold", family = "Times",hjust=0.5), # #plot.margin = ggplot2::margin(t=.3,b=0,l=2,r=.5, unit = "cm"), # text = element_text(family = "Times"), # panel.grid.major=element_blank(),panel.grid.minor=element_blank())+ # scale_x_continuous( # #limits = c(0, 1), #x # guide = "prism_minor") + # scale_colour_prism( # palette = "candy_bright", # name = "Cylinders", # label = c("7 Year", "ALL", "None")) # # p7 # # # library(cowplot) # pdf('07.DCA_year.pdf',w=15,h=5,bg='white') # plot_grid(p3,p5,p7,ncol=3) # dev.off() # # png('07.DCA_year.png',w=15,h=5,bg='white',units='in',res=600) # plot_grid(p3,p5,p7,ncol=3) # dev.off() rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./10_clinical_gene")){ dir.create("./10_clinical_gene") } setwd("./10_clinical_gene") ## library(lance) library(tidyverse) hubgene <- read.csv('../06_Lasso/03.lasso_genes.csv') hubgene <- hubgene$symbol dat<-read.csv('../00_rawdata/02.expr_tpm_TCGA.csv',row.names = 1)%>%lc.tableToNum() colnames(dat)<-gsub('.','-',colnames(dat),fixed = T) ## tumor group <- read.csv('../00_rawdata/03.group_TCGA.csv') tumor.sample <- group$X[which(group$group=='Tumor')] dat <- dat[,tumor.sample] dat <- t(dat[hubgene,])%>%as.data.frame()%>%rownames_to_column(var = 'sample') survival<-read.csv("../00_rawdata/04.survival_TCGA.csv",row.names = 1) phenotype<-read.csv('../00_rawdata/05.phenotype_TCGA.csv') #dat<-read.csv('../00_rawdata/02.expr_tpm(log)_TCGA.csv') colnames(phenotype) train_phenotype<-data.frame(sample=phenotype$sample, age=phenotype$age, gender=phenotype$gender, stage=phenotype$stage, T.stage=phenotype$T_stage, N.stage=phenotype$N_stage, M.stage=phenotype$M_stage) write.csv(train_phenotype,'01.train_phenotype.csv') # Lifelong Non-smoker (less than 100 cigarettes smoked in Lifetime) = 1 # Current smoker (includes daily smokers and non-daily smokers or occasional smokers) = 2 # Current reformed smoker for > 15 years (greater than 15 years) = 3 # Current reformed smoker for ≤15 years (less than or equal to 15 years) = 4 # Current reformed smoker, duration not specified = 5 # Smoking History not documented = 7 train_phenotype<-merge(train_phenotype,survival,by='sample') train_phenotype$OS<-as.numeric(train_phenotype$OS) train_phenotype$OS.time<-as.numeric(train_phenotype$OS.time) train_phenotype2<-train_phenotype table(train_phenotype2$stage) train_phenotype2$stage <- gsub('A','',train_phenotype2$stage) train_phenotype2$stage <- gsub('B','',train_phenotype2$stage) train_phenotype2$stage <- gsub('C','',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage III','3/4',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage IV','3/4',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage II','2',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage I','1',train_phenotype2$stage) #train_phenotype2$stage <- gsub('not reported',NA,train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage X',NA,train_phenotype2$stage) table(train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('a','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('b','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('c','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('d','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('TX',NA,train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T3','3/4',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T4','3/4',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T2','2',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T1','1',train_phenotype2$T.stage) table(train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('a','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('b','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('c','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('d','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub(' (i-)','',train_phenotype2$N.stage,fixed = T) train_phenotype2$N.stage<-gsub(' (i+)','',train_phenotype2$N.stage,fixed = T) train_phenotype2$N.stage<-gsub(' (mol+)','',train_phenotype2$N.stage,fixed = T) train_phenotype2$N.stage<-gsub('mi','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N3','3',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N2','2',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N1','1',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N0','0',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('NX',NA,train_phenotype2$N.stage) table(train_phenotype2$M.stage) train_phenotype2$M.stage<-gsub('MX',NA,train_phenotype2$M.stage) train_phenotype2$M.stage<-gsub(' (i+)','',train_phenotype2$M.stage,fixed = T) train_phenotype2$M.stage<-gsub('c','',train_phenotype2$M.stage,fixed = T) train_phenotype2$M.stage<-gsub('M0','0',train_phenotype2$M.stage) train_phenotype2$M.stage<-gsub('M1','1',train_phenotype2$M.stage) table(train_phenotype2$age) train_phenotype2$age <- as.numeric(train_phenotype2$age) train_phenotype2$age <- ifelse(train_phenotype2$age>60,">60",'<=60') colnames(train_phenotype2) train_phenotype2<-na.omit(train_phenotype2) train_phenotype3 <- merge(train_phenotype2,dat,by = 'sample') write.csv(train_phenotype3,'train_phenotype.csv') library(tidyverse) library(lance) library(ggpubr) library(Ipaper) library(ggthemes) ##Age----- table(train_phenotype3$age) my_comparisons <- list(c("<=60",">60")) age_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame() age.group <- train_phenotype3[,c('sample','age')] age.group <- na.omit(age.group) age_data <- age_data[,colnames(age_data)%in%age.group$sample] age.sample <- age.group$sample[which(age.group$age=='>60')] age_data$Symbol <- rownames(age_data) age_data2<-gather(age_data, key = sample, value = expr, -c("Symbol")) age_data2$Group <- ifelse(age_data2$sample%in%age.sample,'>60','<=60') library(rstatix) stat.test<-age_data2%>% group_by(Symbol)%>% wilcox_test(expr ~ Group)%>% adjust_pvalue(method = 'fdr') write.csv(stat.test,file = 'age.wilcox.csv',row.names = F,quote = F) stat.test$p<-ifelse(stat.test$p<0.001,"***", ifelse(stat.test$p<0.05,"**", ifelse(stat.test$p<0.05,"*",'ns'))) age_plot <- ggplot(age_data2,aes(x = Symbol, y = expr, fill = Group)) + geom_violin(trim=F,color="black",aes(fill=Group)) + #geom_violin(trim=F,color="black") + #, “color=”(#white,) #"trim"TRUE(),。FALSE,。 stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(width=0.7, position=position_dodge(0.9), outlier.shape = NA)+ #,width=0.1 scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ labs(title="Age", x="", y = "",size=20) + stat_compare_means(data = age_data2, mapping = aes(group = Group), label ="p.signif", method = 'wilcox.test', paired = F) + theme_bw()+ theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), axis.title.x=element_text(size=16,face="bold"), axis.title.y=element_text(size=16,face="bold"), legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), legend.title = element_text(face = "bold", size = 12), legend.position = "top", panel.grid.major = element_blank(), panel.grid.minor = element_blank()) #+ # guides(fill='none')#+ # facet_wrap(~Symbol,scales = "free",nrow = 3) age_plot ##Gender----- table(train_phenotype3$gender) gender_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame() gender.group <- train_phenotype3[,c('sample','gender')] gender.group <- na.omit(gender.group) gender_data <- gender_data[,colnames(gender_data)%in%gender.group$sample] gender.sample <- gender.group$sample[which(gender.group$gender=='male')] gender_data$Symbol <- rownames(gender_data) gender_data2<-gather(gender_data, key = sample, value = expr, -c("Symbol")) gender_data2$Group <- ifelse(gender_data2$sample%in%gender.sample,'male','female') library(rstatix) stat.test<-gender_data2%>% group_by(Symbol)%>% wilcox_test(expr ~ Group)%>% adjust_pvalue(method = 'fdr') write.csv(stat.test,file = 'gender.wilcox.csv',row.names = F,quote = F) stat.test$p<-ifelse(stat.test$p<0.001,"***", ifelse(stat.test$p<0.05,"**", ifelse(stat.test$p<0.05,"*",'ns'))) gender_plot <- ggplot(gender_data2,aes(x = Symbol, y = expr, fill = Group)) + #geom_violin(trim=F,color="black") + #, “color=”(#white,) #"trim"TRUE(),。FALSE,。 geom_violin(trim=F,color="black",aes(fill=Group)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(width=0.7, position=position_dodge(0.9), outlier.shape = NA)+ #,width=0.1 scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ labs(title="Gender", x="", y = "",size=20) + stat_compare_means(data = gender_data2, mapping = aes(group = Group), label ="p.signif", method = 'wilcox.test', paired = F) + theme_bw()+ theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), axis.title.x=element_text(size=16,face="bold"), axis.title.y=element_text(size=16,face="bold"), legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), legend.title = element_text(face = "bold", size = 12), legend.position = "top", panel.grid.major = element_blank(), panel.grid.minor = element_blank()) gender_plot ##T.stage----- table(train_phenotype3$T.stage) T.stage_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame() T.stage.group <- train_phenotype3[,c('sample','T.stage')] T.stage.group <- na.omit(T.stage.group) table(T.stage.group$T.stage) T.stage_data <- T.stage_data[,colnames(T.stage_data)%in%T.stage.group$sample] T.stage.sample <- T.stage.group$sample[which(T.stage.group$T.stage=='1'|T.stage.group$T.stage=='2')] T.stage_data$Symbol <- rownames(T.stage_data) T.stage_data2<-gather(T.stage_data, key = sample, value = expr, -c("Symbol")) T.stage_data2$Group <- ifelse(T.stage_data2$sample%in%T.stage.sample,'1/2','3/4') library(rstatix) stat.test<-T.stage_data2%>% group_by(Symbol)%>% wilcox_test(expr ~ Group)%>% adjust_pvalue(method = 'fdr') write.csv(stat.test,file = 'T.stage.wilcox.csv',row.names = F,quote = F) stat.test$p<-ifelse(stat.test$p<0.001,"***", ifelse(stat.test$p<0.05,"**", ifelse(stat.test$p<0.05,"*",'ns'))) T.stage_plot <- ggplot(T.stage_data2,aes(x = Symbol, y = expr, fill = Group)) + #geom_violin(trim=F,color="black") + #, “color=”(#white,) #"trim"TRUE(),。FALSE,。 geom_violin(trim=F,color="black",aes(fill=Group)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(width=0.7, position=position_dodge(0.9), outlier.shape = NA)+ #,width=0.1 scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ labs(title="T.stage", x="", y = "",size=20) + stat_compare_means(data = T.stage_data2, mapping = aes(group = Group), label ="p.signif", method = 'wilcox.test', paired = F) + theme_bw()+ theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), axis.title.x=element_text(size=16,face="bold"), axis.title.y=element_text(size=16,face="bold"), legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), legend.title = element_text(face = "bold", size = 12), legend.position = "top", panel.grid.major = element_blank(), panel.grid.minor = element_blank()) # facet_wrap(~Symbol,scales = "free",nrow = 3) T.stage_plot ##N.stage----- table(train_phenotype3$N.stage) N.stage_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame() N.stage.group <- train_phenotype3[,c('sample','N.stage')] N.stage.group <- na.omit(N.stage.group) table(N.stage.group$N.stage) N.stage_data <- N.stage_data[,colnames(N.stage_data)%in%N.stage.group$sample] N.stage.sample <- N.stage.group$sample[which(N.stage.group$N.stage=='0'|N.stage.group$N.stage=='1')] N.stage_data$Symbol <- rownames(N.stage_data) N.stage_data2<-gather(N.stage_data, key = sample, value = expr, -c("Symbol")) N.stage_data2$Group <- ifelse(N.stage_data2$sample%in%N.stage.sample,'0/1','2/3') library(rstatix) stat.test<-N.stage_data2%>% group_by(Symbol)%>% wilcox_test(expr ~ Group)%>% adjust_pvalue(method = 'fdr') write.csv(stat.test,file = 'N.stage.wilcox.csv',row.names = F,quote = F) stat.test$p<-ifelse(stat.test$p<0.001,"***", ifelse(stat.test$p<0.05,"**", ifelse(stat.test$p<0.05,"*",'ns'))) N.stage_plot <- ggplot(N.stage_data2,aes(x = Symbol, y = expr, fill = Group)) + #geom_violin(trim=F,color="black") + #, “color=”(#white,) #"trim"TRUE(),。FALSE,。 geom_violin(trim=F,color="black",aes(fill=Group)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(width=0.7, position=position_dodge(0.9), outlier.shape = NA)+ #,width=0.1 scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ labs(title="N.stage", x="", y = "",size=20) + stat_compare_means(data = N.stage_data2, mapping = aes(group = Group), label ="p.signif", method = 'wilcox.test', paired = F) + theme_bw()+ theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), axis.title.x=element_text(size=16,face="bold"), axis.title.y=element_text(size=16,face="bold"), legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), legend.title = element_text(face = "bold", size = 12), legend.position = "top", panel.grid.major = element_blank(), panel.grid.minor = element_blank()) N.stage_plot ##M.stage----- table(train_phenotype3$M.stage) M.stage_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame() M.stage.group <- train_phenotype3[,c('sample','M.stage')] M.stage.group <- na.omit(M.stage.group) table(M.stage.group$M.stage) M.stage_data <- M.stage_data[,colnames(M.stage_data)%in%M.stage.group$sample] M.stage.sample <- M.stage.group$sample[which(M.stage.group$M.stage=='0')] M.stage_data$Symbol <- rownames(M.stage_data) M.stage_data2<-gather(M.stage_data, key = sample, value = expr, -c("Symbol")) M.stage_data2$Group <- ifelse(M.stage_data2$sample%in%M.stage.sample,'0','1') library(rstatix) stat.test<-M.stage_data2%>% group_by(Symbol)%>% wilcox_test(expr ~ Group)%>% adjust_pvalue(method = 'fdr') write.csv(stat.test,file = 'M.stage.wilcox.csv',row.names = F,quote = F) stat.test$p<-ifelse(stat.test$p<0.001,"***", ifelse(stat.test$p<0.05,"**", ifelse(stat.test$p<0.05,"*",'ns'))) M.stage_plot <- ggplot(M.stage_data2,aes(x = Symbol, y = expr, fill = Group)) + #geom_violin(trim=F,color="black") + #, “color=”(#white,) #"trim"TRUE(),。FALSE,。 geom_violin(trim=F,color="black",aes(fill=Group)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(width=0.7, position=position_dodge(0.9), outlier.shape = NA)+ #,width=0.1 scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ labs(title="M.stage", x="", y = "",size=20) + stat_compare_means(data = M.stage_data2, mapping = aes(group = Group), label ="p.signif", method = 'wilcox.test', paired = F) + theme_bw()+ theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), axis.title.x=element_text(size=16,face="bold"), axis.title.y=element_text(size=16,face="bold"), legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), legend.title = element_text(face = "bold", size = 12), legend.position = "top", panel.grid.major = element_blank(), panel.grid.minor = element_blank()) M.stage_plot ##stage----- table(train_phenotype3$stage) stage_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame() stage.group <- train_phenotype3[,c('sample','stage')] stage.group <- na.omit(stage.group) table(stage.group$stage) stage_data <- stage_data[,colnames(stage_data)%in%stage.group$sample] stage.sample <- stage.group$sample[which(stage.group$stage=='1'|stage.group=='2')] stage_data$Symbol <- rownames(stage_data) stage_data2<-gather(stage_data, key = sample, value = expr, -c("Symbol")) stage_data2$Group <- ifelse(stage_data2$sample%in%stage.sample,'1/2','3/4') library(rstatix) stat.test<-stage_data2%>% group_by(Symbol)%>% wilcox_test(expr ~ Group)%>% adjust_pvalue(method = 'fdr') write.csv(stat.test,file = 'stage.wilcox.csv',row.names = F,quote = F) stat.test$p<-ifelse(stat.test$p<0.001,"***", ifelse(stat.test$p<0.05,"**", ifelse(stat.test$p<0.05,"*",'ns'))) stage_plot <- ggplot(stage_data2,aes(x = Symbol, y = expr, fill = Group)) + #geom_violin(trim=F,color="black") + #, “color=”(#white,) #"trim"TRUE(),。FALSE,。 geom_violin(trim=F,color="black",aes(fill=Group)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(width=0.7, position=position_dodge(0.9), outlier.shape = NA)+ #,width=0.1 scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ labs(title="Stage", x="", y = "",size=20) + stat_compare_means(data = stage_data2, mapping = aes(group = Group), label ="p.signif", method = 'wilcox.test', paired = F) + theme_bw()+ theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), axis.title.x=element_text(size=16,face="bold"), axis.title.y=element_text(size=16,face="bold"), legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), legend.title = element_text(face = "bold", size = 12), legend.position = "top", panel.grid.major = element_blank(), panel.grid.minor = element_blank()) stage_plot ##histological-------- #table(train_phenotype3$histological) #histological_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame() #histological.group <- train_phenotype3[,c('sample','histological')] #histological.group <- na.omit(histological.group) #table(histological.group$histological) #histological_data <- histological_data[,colnames(histological_data)%in%histological.group$sample] #histological.sample <- histological.group$sample[which(histological.group$histological=='G1'|histological.group$histological=='G2')] #histological_data$Symbol <- rownames(histological_data) # #histological_data2<-gather(histological_data, # key = sample, # value = expr, # -c("Symbol")) # #histological_data2$Group <- ifelse(histological_data2$sample%in%histological.sample,'G1/2','G3') #library(rstatix) # #stat.test<-histological_data2%>% # group_by(Symbol)%>% # wilcox_test(expr ~ Group)%>% # adjust_pvalue(method = 'fdr') #write.csv(stat.test,file = 'histological.wilcox.csv',row.names = F,quote = F) # #stat.test$p<-ifelse(stat.test$p<0.001,"***", # ifelse(stat.test$p<0.05,"**", # ifelse(stat.test$p<0.05,"*",'ns'))) #histological_plot <- ggplot(histological_data2,aes(x = Symbol, y = expr, fill = Group)) + # #geom_violin(trim=F,color="black") + #, “color=”(#white,) # #"trim"TRUE(),。FALSE,。 # geom_violin(trim=F,color="black",aes(fill=Group)) + # stat_boxplot(geom="errorbar", # width=0.1, # position = position_dodge(0.9)) + # geom_boxplot(width=0.7, # position=position_dodge(0.9), # outlier.shape = NA)+ #,width=0.1 # scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ # labs(title="Histological", x="", y = "",size=20) + # stat_compare_means(data = histological_data2, # mapping = aes(group = Group), # label ="p.signif", # method = 'wilcox.test', # paired = F) + # theme_bw()+ # theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), # axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), # axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), # axis.title.x=element_text(size=16,face="bold"), # axis.title.y=element_text(size=16,face="bold"), # legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), # legend.title = element_text(face = "bold", size = 12), # legend.position = "top", # panel.grid.major = element_blank(), # panel.grid.minor = element_blank()) #histological_plot library(patchwork) all_clinical_index <- age_plot +gender_plot + T.stage_plot + N.stage_plot+ M.stage_plot+ stage_plot + plot_layout(ncol = 3) & theme_bw() & theme(legend.title = element_blank(), legend.position = "top", legend.text = element_text(face = "bold"), plot.title = element_text(hjust = 0.5, face = "bold"), # plot.title = element_blank(), panel.grid = element_blank(), axis.title = element_text(face = "bold"), axis.text.y = element_text(face = 'bold',size = 12), axis.text.x = element_text(angle = 45,hjust = 1,face = "bold",size = 12)) all_clinical_index cairo_pdf("01.clinical.pdf", width = 12, height = 10,family = "Arial") all_clinical_index dev.off() #all_clinical_index <- age_plot +gender_plot + stage_plot + histological_plot + # plot_layout(ncol = 4) & # theme_bw() & # theme(legend.title = element_blank(), # legend.position = "top", # legend.text = element_text(face = "bold"), # plot.title = element_text(hjust = 0.5, face = "bold"), # # plot.title = element_blank(), # panel.grid = element_blank(), # axis.title = element_text(face = "bold"), # axis.text.y = element_text(face = 'bold',size = 12), # axis.text.x = element_text(angle = 45,hjust = 1,face = "bold",size = 12)) #all_clinical_index #cairo_pdf("01.clinical2.pdf", width = 12, height = 5,family = "Arial") #all_clinical_index #dev.off() #rm(list = ls()) #08 ------------- setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./10_clinical")){ dir.create("./10_clinical") } setwd("./10_clinical") library(readr) library(readxl) survival <- read.csv('../00_rawdata/04.survival_TCGA.csv',row.names = 1) phenotype<-read.csv('../00_rawdata/05.phenotype_TCGA.csv') colnames(phenotype) train_phenotype<-data.frame(sample=phenotype$sample, age=phenotype$age, gender=phenotype$gender, stage=phenotype$stage, T.stage=phenotype$T_stage, N.stage=phenotype$N_stage, M.stage=phenotype$M_stage) write.csv(train_phenotype,'01.train_phenotype.csv') # Lifelong Non-smoker (less than 100 cigarettes smoked in Lifetime) = 1 # Current smoker (includes daily smokers and non-daily smokers or occasional smokers) = 2 # Current reformed smoker for > 15 years (greater than 15 years) = 3 # Current reformed smoker for ≤15 years (less than or equal to 15 years) = 4 # Current reformed smoker, duration not specified = 5 # Smoking History not documented = 7 train_phenotype<-merge(train_phenotype,survival,by='sample') train_phenotype$OS<-as.numeric(train_phenotype$OS) train_phenotype$OS.time<-as.numeric(train_phenotype$OS.time) train_phenotype2<-train_phenotype table(train_phenotype2$stage) train_phenotype2$stage <- gsub('A','',train_phenotype2$stage) train_phenotype2$stage <- gsub('B','',train_phenotype2$stage) train_phenotype2$stage <- gsub('C','',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage III','3/4',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage IV','3/4',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage II','2',train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage I','1',train_phenotype2$stage) #train_phenotype2$stage <- gsub('not reported',NA,train_phenotype2$stage) train_phenotype2$stage <- gsub('Stage X',NA,train_phenotype2$stage) table(train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('a','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('b','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('c','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('d','',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('TX',NA,train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T3','3/4',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T4','3/4',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T2','2',train_phenotype2$T.stage) train_phenotype2$T.stage<-gsub('T1','1',train_phenotype2$T.stage) table(train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('a','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('b','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('c','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('d','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub(' (i-)','',train_phenotype2$N.stage,fixed = T) train_phenotype2$N.stage<-gsub(' (i+)','',train_phenotype2$N.stage,fixed = T) train_phenotype2$N.stage<-gsub(' (mol+)','',train_phenotype2$N.stage,fixed = T) train_phenotype2$N.stage<-gsub('mi','',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N3','3',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N2','2',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N1','1',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('N0','0',train_phenotype2$N.stage) train_phenotype2$N.stage<-gsub('NX',NA,train_phenotype2$N.stage) table(train_phenotype2$M.stage) train_phenotype2$M.stage<-gsub('MX',NA,train_phenotype2$M.stage) train_phenotype2$M.stage<-gsub(' (i+)','',train_phenotype2$M.stage,fixed = T) train_phenotype2$M.stage<-gsub('c','',train_phenotype2$M.stage,fixed = T) train_phenotype2$M.stage<-gsub('M0','0',train_phenotype2$M.stage) train_phenotype2$M.stage<-gsub('M1','1',train_phenotype2$M.stage) table(train_phenotype2$age) train_phenotype2$age <- as.numeric(train_phenotype2$age) train_phenotype2$age <- ifelse(train_phenotype2$age>60,">60",'<=60') #median(train_phenotype2$age) #table(train_phenotype2$gender) #train_phenotype2$gender <- ifelse(train_phenotype2$gender=='female',1,0) colnames(train_phenotype2) colnames(train_phenotype2)<-c('id','Age','Gender','Stage','T.stage','N.stage',"M.stage",'OS.time','OS') train_phenotype2<-na.omit(train_phenotype2) risk<-read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1) sub_risk <- subset(risk, select = c(id, riskScore)) train_risk_clinical <- merge(train_phenotype2, sub_risk, by = "id") rownames(train_risk_clinical) <- train_risk_clinical$id train_risk_clinical = subset(train_risk_clinical, select = -c(id)) dim(train_risk_clinical) write.csv(train_risk_clinical,'02.train_clinical.csv') train_phenotype3 <- train_risk_clinical library(ggpubr) library(Ipaper) library(ggthemes) ##Age----- table(train_phenotype3$Age) my_comparisons <- list(c("<=60",">60")) age_data <- data.frame(riskScore = train_phenotype3$riskScore, stage = factor(train_phenotype3$Age, levels = c("<=60",">60"))) age_data <- na.omit(age_data) age<-ggplot(age_data,aes(x = stage, y = riskScore, fill = stage)) + geom_violin(trim=F,color="black",aes(fill=stage)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(alpha=0.7) + scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+ scale_x_discrete(name = "") + ggtitle("Age") + theme_bw() + geom_signif(comparisons = my_comparisons, test = wilcox.test, map_signif_level = T, y_position = c(2,3,4))+ theme(plot.title = element_text(size = 16, face = "bold"), text = element_text(size = 20), axis.title = element_text(face="bold"), axis.text.x=element_text(size = 20))+ guides(fill='none') age #gender table(train_phenotype3$Gender) my_comparisons <- list(c("female","male")) gender_data <- data.frame(riskScore = train_phenotype3$riskScore, stage = factor(train_phenotype3$Gender, levels = c("female","male"))) gender_data <- na.omit(gender_data) gender<-ggplot(gender_data,aes(x = stage, y = riskScore, fill = stage)) + geom_violin(trim=F,color="black",aes(fill=stage)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(alpha=0.7) + scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+ scale_x_discrete(name = "") + ggtitle("Gender") + theme_bw() + geom_signif(comparisons = my_comparisons, test = wilcox.test, map_signif_level = T, y_position = c(2,3,4))+ theme(plot.title = element_text(size = 16, face = "bold"), text = element_text(size = 20), axis.title = element_text(face="bold"), axis.text.x=element_text(size = 20))+ guides(fill='none') gender ## T.stage----- table(train_phenotype3$T.stage) my_comparisons <- list(c("1","2"),c("2","3/4"),c('1','3/4')) T.stage_data <- data.frame(riskScore = train_phenotype3$riskScore, stage = factor(train_phenotype3$T.stage, levels = c("1","2", "3/4"))) T.stage_data <- na.omit(T.stage_data) T.stage<-ggplot(T.stage_data,aes(x = stage, y = riskScore, fill = stage)) + geom_violin(trim=F,color="black",aes(fill=stage)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(alpha=0.7) + scale_fill_manual(values= c("#E56F5E","#427AB2","#F6C957","#F19685"), name = "Group")+ scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+ scale_x_discrete(name = "") + ggtitle("T Stage") + theme_bw() + geom_signif(comparisons = my_comparisons, test = wilcox.test, map_signif_level = T, y_position = c(2,2.8,3.6,3.4,2,4.8))+ theme(plot.title = element_text(size = 16, face = "bold"), text = element_text(size = 20), axis.title = element_text(face="bold"), axis.text.x=element_text(size = 20))+ guides(fill='none') T.stage ## N stage----- table(train_phenotype3$N.stage) my_comparisons <- list(c("0","1"),c("1","2"),c("2","3"),c('0','2'),c('0','3'),c('1','3')) N.stage_data <- data.frame(riskScore = train_phenotype3$riskScore, stage = factor(train_phenotype3$N.stage, levels = c("0","1", "2",'3'))) N.stage_data <- na.omit(N.stage_data) N.stage<-ggplot(N.stage_data,aes(x = stage, y = riskScore, fill = stage)) + geom_violin(trim=F,color="black",aes(fill=stage)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(alpha=0.7) + scale_fill_manual(values= c("#E56F5E","#427AB2","#F6C957","#F19685"), name = "Group")+ scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+ scale_x_discrete(name = "") + ggtitle("N Stage") + theme_bw() + geom_signif(comparisons = my_comparisons, test = wilcox.test, map_signif_level = T, y_position = c(2,3,4,5,6,7))+ theme(plot.title = element_text(size = 16, face = "bold"), text = element_text(size = 20), axis.title = element_text(face="bold"), axis.text.x=element_text(size = 20))+ guides(fill='none') N.stage # M stage----- table(train_phenotype3$M.stage) my_comparisons <- list(c("0","1")) M.stage_data <- data.frame(riskScore = train_phenotype3$riskScore, stage = factor(train_phenotype3$M.stage, levels = c("0","1"))) M.stage_data <- na.omit(M.stage_data) M.stage<-ggplot(M.stage_data,aes(x = stage, y = riskScore, fill = stage)) + geom_violin(trim=F,color="black",aes(fill=stage)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(alpha=0.7) + scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+ scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+ scale_x_discrete(name = "") + ggtitle("M Stage") + theme_bw() + geom_signif(comparisons = my_comparisons, test = wilcox.test, map_signif_level = T, y_position = c(2))+ theme(plot.title = element_text(size = 16, face = "bold"), text = element_text(size = 20), axis.title = element_text(face="bold"), axis.text.x=element_text(size = 20))+ guides(fill='none') M.stage ## Histological---- #table(train_phenotype3$Histological) #my_comparisons <- list(c("G1","G2"),c('G2','G3'),c('G1','G3')) #Histological_data <- data.frame(riskScore = train_phenotype3$riskScore, # stage = factor(train_phenotype3$Histological, # levels = c("G1","G2",'G3'))) #Histological_data <- na.omit(Histological_data) #Histological<-ggplot(Histological_data,aes(x = stage, y = riskScore, fill = stage)) + # geom_violin(trim=F,color="black",aes(fill=stage)) + # stat_boxplot(geom="errorbar", # width=0.1, # position = position_dodge(0.9)) + # geom_boxplot(alpha=0.7) + # scale_fill_manual(values= c("#E56F5E","#427AB2","#F6C957"), name = "Group")+ # scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+ # scale_x_discrete(name = "") + # ggtitle("Histological") + # theme_bw() + # geom_signif(comparisons = my_comparisons, # test = wilcox.test, # map_signif_level = T, # y_position = c(2,3,4))+ # theme(plot.title = element_text(size = 16, face = "bold"), # text = element_text(size = 20), # axis.title = element_text(face="bold"), # axis.text.x=element_text(size = 20))+ # guides(fill='none') #Histological # Stage---- table(train_phenotype3$Stage) my_comparisons <- list(c("1","2"),c('2','3/4'),c("1","3/4")) Stage_data <- data.frame(riskScore = train_phenotype3$riskScore, stage = factor(train_phenotype3$Stage, levels = c("1","2",'3/4'))) Stage_data <- na.omit(Stage_data) Stage<-ggplot(Stage_data,aes(x = stage, y = riskScore, fill = stage)) + geom_violin(trim=F,color="black",aes(fill=stage)) + stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(alpha=0.7) + scale_fill_manual(values= c("#E56F5E","#427AB2","#F6C957","#F19685"), name = "Group")+ scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+ scale_x_discrete(name = "") + ggtitle("Stage") + theme_bw() + geom_signif(comparisons = my_comparisons, test = wilcox.test, map_signif_level = T, y_position = c(2,2.5,3,3.5,4,4.5))+ theme(plot.title = element_text(size = 16, face = "bold"), text = element_text(size = 20), axis.title = element_text(face="bold"), axis.text.x=element_text(size = 20))+ guides(fill='none') Stage library(patchwork) all_clinical_index <- age + gender + T.stage + N.stage + M.stage + Stage+ plot_layout(ncol = 3) & theme_bw() & theme(legend.title = element_blank(), legend.position = "top", legend.text = element_text(face = "bold"), plot.title = element_text(hjust = 0.5, face = "bold"), panel.grid = element_blank(), axis.title = element_text(face = "bold"), axis.text = element_text(face = "bold",size = 10)) all_clinical_index cairo_pdf("01.clinical.pdf", width = 8, height = 6,family = "Arial") all_clinical_index dev.off() #library(patchwork) #all_clinical_index2 <- age + gender + Histological + Stage+ # plot_layout(ncol = 4) & # theme_bw() & # theme(legend.title = element_blank(), # legend.position = "top", # legend.text = element_text(face = "bold"), # plot.title = element_text(hjust = 0.5, face = "bold"), # panel.grid = element_blank(), # axis.title = element_text(face = "bold"), # axis.text = element_text(face = "bold",size = 10)) #all_clinical_index2 # #cairo_pdf("01.clinical2.pdf", width = 10, height = 4,family = "Arial") #all_clinical_index2 #dev.off() rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (!dir.exists("11_GSEA")){dir.create("11_GSEA")} setwd("11_GSEA") .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(data.table) library(org.Hs.eg.db) library(clusterProfiler) library(biomaRt) library(enrichplot) library(limma) library(tidyverse) library(lance) ## kegggmt<-read.gmt("/data/nas3//pipeline/MSigDB/h.all.v2023.2.Hs.symbols.gmt") #gmt dat<-read.csv("../00_rawdata/01.expr_count_TCGA.csv", row.names = 1,check.names = F) colnames(dat)<-gsub('.','-',colnames(dat),fixed = T) dat <- round(dat,digits = 0) #hubgene <- read.csv('../07_Lasso/03.lasso_genes.csv') hub.exp<-dat#[hubgene$symbol,] hub.exp<-hub.exp[order(rownames(hub.exp),decreasing = F),] ## risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1) rownames(risk) <- risk$id train<-t(hub.exp) colData=data.frame(sample=risk$id,group=ifelse(risk$risk=='High','High risk','Low risk')) ##TNFRSF1A colData$group <- factor(colData$group,levels = c("High risk","Low risk")) dat <- dat[,colData$sample] gsea_exp<-dat library(DESeq2) dds<-DESeqDataSetFromMatrix(countData = gsea_exp,colData=colData,design = ~group) dds = dds[rowSums(counts(dds)) > 1,] dds<-DESeq(dds) res =results(dds, contrast = c("group","High risk","Low risk")) res =res[order(res$padj),] head(res) summary(res) table(res$padj<0.05) allGeneSets<-as.data.frame(res) allGeneSets<-na.omit(allGeneSets) logFCcutoff <- 0 allGeneSets$change = as.factor( ifelse(allGeneSets$padj < 0.05 & abs(allGeneSets$log2FoldChange) > logFCcutoff, ifelse(allGeneSets$log2FoldChange > logFCcutoff,'UP','DOWN'),'NOT') ) table(allGeneSets$change) DEGeneSets <- allGeneSets allGeneSets <- allGeneSets[order(allGeneSets$log2FoldChange,decreasing = T),] genelist <- allGeneSets$log2FoldChange names(genelist) <- rownames(allGeneSets) geneList <- sort(genelist, decreasing = T) DEGeneSets <- DEGeneSets[order(DEGeneSets$padj),] KEGG<-GSEA(geneList,TERM2GENE = kegggmt,pvalueCutoff = 1) #GSEA sortKEGG<-data.frame(KEGG) sortKEGG <- subset(sortKEGG,pvalue < 0.05 & abs(NES) > 1) sortKEGG<-sortKEGG[order(sortKEGG$pvalue, decreasing = F),]#enrichment score write.csv(sortKEGG,file = '01.hall.result.csv',row.names = F) my_colors <- c("#E41A1C","#377EB8","#4DAF4A","#984EA3","#FF7F00") if(T){ gsea.plot = function(res.kegg, top.hall,name){ gsdata <- do.call(rbind, lapply(top.hall, enrichplot:::gsInfo, object = res.kegg)) gsdata$Description = factor(gsdata$Description, levels = top.hall) p1 = ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) + # theme_bw() + theme_classic(14) + theme(panel.grid.major = element_line(colour = "grey92"), panel.grid.minor = element_line(colour = "grey92"), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) + scale_x_continuous(expand = c(0, 0)) + scale_color_manual(values = my_colors) + #scale_color_brewer(palette = "Paired") + ggtitle(paste0("Gene Set Enrichment Analysis: ",name)) + # geom_hline(yintercept = 0, color = "black", size = 0.8) + geom_line(aes_(y = ~runningScore, color = ~Description), size = 1) + theme(legend.position = "top", legend.justification = c(0,1), legend.title = element_blank(), legend.background = element_rect(fill = "transparent")) + guides( color = guide_legend( ncol = 1, byrow = TRUE, reverse = T) )+ ylab("Running Enrichment Score") + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.line.x = element_blank(), text = element_text(face = "bold", family = "Times"))+ theme( axis.title.y =element_text(size=15,family = "Times", face = "bold"), axis.text.y=element_text(size=15,family = "Times", face = "bold"), panel.grid.major=element_blank(),panel.grid.minor=element_blank() ) i = 0 for (term in unique(gsdata$Description)) { idx <- which(gsdata$ymin != 0 & gsdata$Description == term) gsdata[idx, "ymin"] <- i gsdata[idx, "ymax"] <- i + 1 i <- i + 1 } p2 = ggplot(gsdata, aes_(x = ~x)) + geom_linerange(aes_(ymin = ~ymin, ymax = ~ymax, color = ~Description)) + xlab(NULL) + ylab(NULL) + # theme_bw() + theme_classic(14) + theme(legend.position = "none", axis.ticks = element_blank(), axis.text = element_blank(), axis.line.x = element_blank(), text = element_text(face = "bold", family = "Times")) + scale_x_continuous(expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + scale_color_manual(values = my_colors) + #scale_color_brewer(palette = "Paired")+ theme( panel.grid.major=element_blank(), panel.grid.minor=element_blank(), # legend.title=element_text(size=15) , legend.text=element_text(size=14)) p3 <- gseaplot2(res.kegg,top.hall,subplots = 3)+ theme(axis.title.x =element_text(size=15,family = "Times", face = "bold"), axis.text.x =element_text(angle=0,size=15,hjust = 0,family = "Times", face = "bold"), axis.title.y =element_text(size=15,family = "Times", face = "bold"), axis.text.y=element_text(size=15,family = "Times", face = "bold"))+ theme( panel.grid.major=element_blank(),panel.grid.minor=element_blank(), legend.title=element_text(size=15) , legend.text=element_text(size=14)) # p = aplot::insert_bottom(p1, p2, height = 0.15) p <- p1/p2/p3+plot_layout(ncol = 1, height = c(4,1,2)) return(p) } } # sortKEGG2 <- sortKEGG[order(sortKEGG$NES, decreasing = F),] #enrichment score # paths <- rbind(head(sortKEGG2,3),tail(sortKEGG2,3)) sortKEGG2 <- sortKEGG[order(sortKEGG$pvalue, decreasing = F),]#p paths <- sortKEGG2[1:5,] select_path <- paths$Description library(patchwork) pal <- gsea.plot(KEGG, select_path,"Results") pdf("01.GSEA_HALL.pdf", width = 10, height = 9,family = 'Times') print(pal) dev.off() png("01.GSEA_HALL.png", width = 10, height = 9,units = 'in',res = 300,family = 'Times') print(pal) dev.off() # # #############KEGG################## # rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (!dir.exists("11_GSEA")){dir.create("11_GSEA")} setwd("11_GSEA") .libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(data.table) library(org.Hs.eg.db) library(clusterProfiler) library(biomaRt) library(enrichplot) library(limma) library(tidyverse) library(lance) ## kegggmt<-read.gmt("/data/nas3//pipeline/MSigDB/c2.cp.kegg.v2023.1.Hs.symbols.gmt") #gmt dat<-read.csv("../00_rawdata/01.expr_count_TCGA.csv", row.names = 1,check.names = F) colnames(dat)<-gsub('.','-',colnames(dat),fixed = T) dat <- round(dat,digits = 0) #hubgene <- read.csv('../07_Lasso/03.lasso_genes.csv') hub.exp<-dat#[hubgene$symbol,] hub.exp<-hub.exp[order(rownames(hub.exp),decreasing = F),] ## risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1) rownames(risk) <- risk$id train<-t(hub.exp) colData=data.frame(sample=risk$id,group=ifelse(risk$risk=='High','High risk','Low risk')) ##TNFRSF1A colData$group <- factor(colData$group,levels = c("High risk","Low risk")) dat <- dat[,colData$sample] gsea_exp<-dat library(DESeq2) dds<-DESeqDataSetFromMatrix(countData = gsea_exp,colData=colData,design = ~group) dds = dds[rowSums(counts(dds)) > 1,] dds<-DESeq(dds) res =results(dds, contrast = c("group","High risk","Low risk")) res =res[order(res$padj),] head(res) summary(res) table(res$padj<0.05) allGeneSets<-as.data.frame(res) allGeneSets<-na.omit(allGeneSets) logFCcutoff <- 0 allGeneSets$change = as.factor( ifelse(allGeneSets$padj < 0.05 & abs(allGeneSets$log2FoldChange) > logFCcutoff, ifelse(allGeneSets$log2FoldChange > logFCcutoff,'UP','DOWN'),'NOT') ) table(allGeneSets$change) DEGeneSets <- allGeneSets allGeneSets <- allGeneSets[order(allGeneSets$log2FoldChange,decreasing = T),] genelist <- allGeneSets$log2FoldChange names(genelist) <- rownames(allGeneSets) geneList <- sort(genelist, decreasing = T) DEGeneSets <- DEGeneSets[order(DEGeneSets$padj),] KEGG<-GSEA(geneList,TERM2GENE = kegggmt,pvalueCutoff = 1) #GSEA sortKEGG<-data.frame(KEGG) sortKEGG <- subset(sortKEGG,pvalue < 0.05 & abs(NES) > 1) sortKEGG<-sortKEGG[order(sortKEGG$pvalue, decreasing = F),]#enrichment score write.csv(sortKEGG,file = '02.kegg.result.csv',row.names = F) my_colors <- c("#E41A1C","#377EB8","#4DAF4A","#984EA3","#FF7F00","#FFFF33") if(T){ gsea.plot = function(res.kegg, top.hall,name){ gsdata <- do.call(rbind, lapply(top.hall, enrichplot:::gsInfo, object = res.kegg)) gsdata$Description = factor(gsdata$Description, levels = top.hall) p1 = ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) + # theme_bw() + theme_classic(14) + theme(panel.grid.major = element_line(colour = "grey92"), panel.grid.minor = element_line(colour = "grey92"), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) + scale_x_continuous(expand = c(0, 0)) + scale_color_manual(values = my_colors) + #scale_color_brewer(palette = "Paired") + ggtitle(paste0("Gene Set Enrichment Analysis: ",name)) + # geom_hline(yintercept = 0, color = "black", size = 0.8) + geom_line(aes_(y = ~runningScore, color = ~Description), size = 1) + theme(legend.position = "top", legend.justification = c(0,1), legend.title = element_blank(), legend.background = element_rect(fill = "transparent")) + guides( color = guide_legend( ncol = 1, byrow = TRUE, reverse = T) )+ ylab("Running Enrichment Score") + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.line.x = element_blank(), text = element_text(face = "bold", family = "Times"))+ theme( axis.title.y =element_text(size=15,family = "Times", face = "bold"), axis.text.y=element_text(size=15,family = "Times", face = "bold"), panel.grid.major=element_blank(),panel.grid.minor=element_blank() ) i = 0 for (term in unique(gsdata$Description)) { idx <- which(gsdata$ymin != 0 & gsdata$Description == term) gsdata[idx, "ymin"] <- i gsdata[idx, "ymax"] <- i + 1 i <- i + 1 } p2 = ggplot(gsdata, aes_(x = ~x)) + geom_linerange(aes_(ymin = ~ymin, ymax = ~ymax, color = ~Description)) + xlab(NULL) + ylab(NULL) + # theme_bw() + theme_classic(14) + theme(legend.position = "none", axis.ticks = element_blank(), axis.text = element_blank(), axis.line.x = element_blank(), text = element_text(face = "bold", family = "Times")) + scale_x_continuous(expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + scale_color_manual(values = my_colors) + #scale_color_brewer(palette = "Paired")+ theme( panel.grid.major=element_blank(), panel.grid.minor=element_blank(), # legend.title=element_text(size=15) , legend.text=element_text(size=14)) p3 <- gseaplot2(res.kegg,top.hall,subplots = 3)+ theme(axis.title.x =element_text(size=15,family = "Times", face = "bold"), axis.text.x =element_text(angle=0,size=15,hjust = 0,family = "Times", face = "bold"), axis.title.y =element_text(size=15,family = "Times", face = "bold"), axis.text.y=element_text(size=15,family = "Times", face = "bold"))+ theme( panel.grid.major=element_blank(),panel.grid.minor=element_blank(), legend.title=element_text(size=15) , legend.text=element_text(size=14)) # p = aplot::insert_bottom(p1, p2, height = 0.15) p <- p1/p2/p3+plot_layout(ncol = 1, height = c(4,1,2)) return(p) } } #paths <- rownames(kkTab[c(1:5),]) #sortKEGG2 <- sortKEGG[order(sortKEGG$NES, decreasing = F),]#enrichment score #paths <- rbind(head(sortKEGG2,3),tail(sortKEGG2,3))3 #sortKEGG2 <- sortKEGG[order(sortKEGG$pvalue, decreasing = F),] sortKEGG2 <- sortKEGG[order(sortKEGG$pvalue, decreasing = F),]#p paths <- sortKEGG2[1:5,] select_path <- paths$Description library(patchwork) pal <- gsea.plot(KEGG, select_path,"Results") pdf("02.GSEA_KEGG.pdf", width = 10, height = 9,family = 'Times') print(pal) dev.off() png("02.GSEA_KEGG.png", width = 10, height = 9,units = 'in',res = 300,family = 'Times') print(pal) dev.off() # # # # # # # # # # ######################### # rm(list = ls()) setwd("/data/nas3///29_YQ953-11/11_GSEA/") if (!dir.exists("gene")){dir.create("gene")} setwd("gene") #GSEA--------------------------------------------------------------------- library(data.table) library(org.Hs.eg.db) library(clusterProfiler) library(biomaRt) library(enrichplot) library(limma) library(tidyverse) library(lance) library(tibble) library(ggplot2) library(patchwork) data <- read.csv("../../00_rawdata/02.expr_tpm_TCGA.csv",row.names = 1) colnames(data)<-gsub('.','-',colnames(data),fixed = T) group <- read.csv("../../07_Train_estimate/01.train_OS_dat.csv",row.names = 1) gene <- read.csv('../../06_Lasso/03.lasso_genes.csv') geneList_survival <- gene$symbol rownames(group) <- group$id data_train<-data<-data[,rownames(group)] library(corrplot) library(org.Hs.eg.db) library(clusterProfiler) library(enrichplot) library(GseaVis) library(RColorBrewer) library(psych) library(reshape2) library(plyr) library(RColorBrewer) library(ggplot2) my_colors <- c("#FC8D62","#8DA0CB","#E78AC3","#A6D854","#FFD92F") if(T){ gsea.plot = function(res.kegg, top.hall,name){ gsdata <- do.call(rbind, lapply(top.hall, enrichplot:::gsInfo, object = res.kegg)) gsdata$Description = factor(gsdata$Description, levels = top.hall) p1 = ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) + # theme_bw() + theme_classic(14) + theme(panel.grid.major = element_line(colour = "grey92"), panel.grid.minor = element_line(colour = "grey92"), panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) + scale_x_continuous(expand = c(0, 0)) + scale_color_manual(values = my_colors) + #scale_color_brewer(palette = "Paired") + ggtitle(paste0("Gene Set Enrichment Analysis: ",name)) + # geom_hline(yintercept = 0, color = "black", size = 0.8) + geom_line(aes_(y = ~runningScore, color = ~Description), size = 1) + theme(legend.position = "top", legend.justification = c(0,1), legend.title = element_blank(), legend.background = element_rect(fill = "transparent")) + guides( color = guide_legend( ncol = 1, byrow = TRUE, reverse = T) )+ ylab("Running Enrichment Score") + theme(axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.line.x = element_blank(), text = element_text(face = "bold", family = "Times"))+ theme( axis.title.y =element_text(size=13,family = "Times", face = "bold"), axis.text.y=element_text(size=13,family = "Times", face = "bold"), panel.grid.major=element_blank(),panel.grid.minor=element_blank() ) i = 0 for (term in unique(gsdata$Description)) { idx <- which(gsdata$ymin != 0 & gsdata$Description == term) gsdata[idx, "ymin"] <- i gsdata[idx, "ymax"] <- i + 1 i <- i + 1 } p2 = ggplot(gsdata, aes_(x = ~x)) + geom_linerange(aes_(ymin = ~ymin, ymax = ~ymax, color = ~Description)) + xlab(NULL) + ylab(NULL) + # theme_bw() + theme_classic(14) + theme(legend.position = "none", axis.ticks = element_blank(), axis.text = element_blank(), axis.line.x = element_blank(), text = element_text(face = "bold", family = "Times")) + scale_x_continuous(expand = c(0,0)) + scale_y_continuous(expand = c(0,0)) + scale_color_manual(values = my_colors) + #scale_color_brewer(palette = "Paired")+ theme( panel.grid.major=element_blank(), panel.grid.minor=element_blank(), # legend.title=element_text(size=15) , legend.text=element_text(size=15)) p3 <- gseaplot2(res.kegg,top.hall,subplots = 3)+ theme(axis.title.x =element_text(size=0,family = "Times", face = "bold"), axis.text.x =element_text(angle=0,size=0,hjust = 0,family = "Times", face = "bold"), axis.title.y =element_text(size=13,family = "Times", face = "bold"), axis.text.y=element_text(size=13,family = "Times", face = "bold"))+ theme( panel.grid.major=element_blank(),panel.grid.minor=element_blank(), legend.title=element_text(size=15) , legend.text=element_text(size=15)) # p = aplot::insert_bottom(p1, p2, height = 0.15) p <- p1/p2/p3+plot_layout(ncol = 1, height = c(4,1,2)) return(p) } } rt<-data for (ii in geneList_survival){ tryCatch({ set.seed(1) #ii="DECR1" tar.exp <- rt[ii,] y <- as.numeric(tar.exp) data1 <- data.frame() for (i in rownames(rt)) { dd <- corr.test(as.numeric(rt[i,]), y , method="spearman",adjust = "fdr") data1 = rbind(data1,data.frame(gene=i,cor=dd$r,p.value=dd$p)) } data1 <- data1[order(data1$cor,decreasing = T),] gene1 <- data1$cor names(gene1) <- mapIds(org.Hs.eg.db,keys = data1$gene,column = 'ENTREZID', keytype = 'SYMBOL',multiVals='filter') names(gene1)<-data1$gene gene1 <- na.omit(gene1) # gmt=read.gmt('/data/nas3//pipeline/MSigDB/c5.go.v7.4.symbols.gmt') kk=GSEA(gene1, TERM2GENE=gmt, pvalueCutoff = 1) kkTab=as.data.frame(kk) kkTab=kkTab[kkTab$pvalue<0.05,] write.csv(kkTab,paste0(ii,'.kegg_gsea.csv'),quote = F,row.names = F) paths <- rownames(kkTab[c(1:5),]) pal <- gsea.plot(kk,paths,ii) pdf(paste0(ii,'.GSEA.KEGG.pdf'),width = 9,height = 7,family = "Times") print(pal) dev.off() png(paste0(ii,'.GSEA.KEGG.png'),width = 800,height = 500,family = "Times") print(pal) dev.off() },error=function(e){}) } # # # # a <- read.csv('./CCDC80.kegg_gsea.csv',header = TRUE) # # b <- read.csv('./CSPG4.kegg_gsea.csv') # # c <- read.csv('./MAP1A.kegg_gsea.csv') # # d <- read.csv('./NIBAN1.kegg_gsea.csv') # # e <- read.csv('./PCOLCE2.kegg_gsea.csv') # # f <- read.csv('./PDGFRA.kegg_gsea.csv') # #-ssGSEA -------- rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./12_immunity")){ dir.create("./12_immunity") } setwd("./12_immunity") library(lance) library(magrittr) expr<-read.csv("../00_rawdata/02.expr_tpm_TCGA.csv", row.names = 1,check.names = F) %>% lance::lc.tableToNum() colnames(expr)<-gsub('.','-',colnames(expr),fixed = T) risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1,check.names = F) #risk$id <- rownames(risk) group <- risk[,c("id",'risk')] expr <- expr[,group$id] # high.sample<-group$id[which(group$risk=='High')] low.sample<-group$id[which(group$risk=='Low')] expr <- expr[,group$id] library(GSVA) gene_set <- read.table("/data/nas3//pipeline/ssGSEA/mmc3.txt", header = T, sep ="\t") dat.final <- as.matrix(expr) gene_list <- split(as.matrix(gene_set)[,1], gene_set[,2]) ssgsea_score = gsva(dat.final, gene_list, method = "ssgsea", ssgsea.norm = TRUE, verbose = TRUE) ssgsea_score1 <- as.data.frame(ssgsea_score) write.csv(ssgsea_score1, file = "01.ssgsea_result_cell.csv", quote = F) colnames(group) <- c('sample', 'group') group<-group[order(group$group),] ssgsea_score<-read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) ssgsea_score<-ssgsea_score[,group$sample] annotation_col<-as.data.frame(group$group) colnames(annotation_col)='Group' rownames(annotation_col)=colnames(ssgsea_score) ssgsea_score <- as.matrix(ssgsea_score) library(pheatmap) color.key<-c("#3777ac","#FF7F00") ann_colors<-list(Group = c('Low'="#3777ac",'High'="#FF7F00")) p <- pheatmap( ssgsea_score, color = colorRampPalette(color.key)(50), border_color = 'darkgrey', annotation_col = annotation_col, annotation_colors = ann_colors, labels_row = NULL, clustering_method = 'ward.D2', show_rownames = T, show_colnames = F, fontsize_col = 5, cluster_cols = F, cluster_rows = T) png(file = "01.heatmap.png", height = 5, width = 8, units = 'in',res = 600,family='Times') p dev.off() pdf(file = "01.heatmap.pdf", height = 5,width = 8,family='Times') p dev.off() table(group$group) group_case<-group[group$group=='High',] group_case<-as.character(group_case$sample) group_control<-group[group$group=='Low',] group_control<-as.character(group_control$sample) library(dplyr) tiics_result <- read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) tiics_result <- tiics_result[, group$sample] %>% as.matrix() pvalue = padj <- matrix(0, nrow(tiics_result), 1) for (i in 1:nrow(tiics_result)){ pvalue[i, 1] = p.value = wilcox.test(tiics_result[i, group_case], tiics_result[i, group_control])$p.value } padj <- p.adjust(as.vector(pvalue), "fdr", n = length(pvalue)) rTable <- data.frame(#log2FoldChange, pvalue, padj, row.names = rownames(tiics_result)) rTable$immune_cell <- rownames(rTable) rTable$sig <- ifelse(rTable$pvalue < 0.05, ifelse(rTable$pvalue < 0.01, ifelse(rTable$pvalue < 0.001, ifelse(rTable$pvalue < 0.0001, paste(rTable$immune_cell, "****", sep = ""), paste(rTable$immune_cell, "***", sep = "")), paste(rTable$immune_cell, "**", sep = "")), paste(rTable$immune_cell, "*", sep = "")), rTable$immune_cell) diff_Table<-rTable[which(rTable$pvalue<0.05),] dim(diff_Table) # 16 write.csv(rTable, file = "02.cell_all_wilcox_test.csv", quote = F, row.names = F) write.csv(diff_Table, file = "02.cell_diff_wilcox_test.csv", quote = F, row.names = F) paste0(diff_Table$immune_cell,collapse = "()、") # "Activated CD4 T cell()、CD56bright natural killer cell()、Central memory CD8 T cell()、Effector memory CD4 T cell()、Gamma delta T cell()、Immature dendritic cell()、Mast cell()、MDSC()、Memory B cell()、Monocyte()、Natural killer T cell()、Plasmacytoid dendritic cell()、Regulatory T cell()、T follicular helper cell()、Type 17 T helper cell()、Type 2 T helper cell" library(tidyr) library(ggplot2) library(ggpubr) library(Ipaper) cell.data <- data.frame(Immune_Cell=rownames(tiics_result), tiics_result, pvalue=rTable$pvalue) plot.cell <- cell.data[which(cell.data$pvalue<0.05),] diff_tiics <- rownames(plot.cell) violin_dat <- gather(plot.cell, key=Group, value=score, -c("Immune_Cell","pvalue")) violin_dat$Group <- ifelse(gsub("\\.","-",violin_dat$Group) %in% group_control, "Low", "High") violin_dat$Group <- factor(violin_dat$Group, levels = c("Low", "High")) head(violin_dat) boxplot_diff_TIICs <- ggplot(violin_dat, aes(x=Immune_Cell, y=score, fill=Group)) + #geom_violin(trim=T,color='black') + #, “color=”(#white,)#"trim"TRUE(),。FALSE,。 stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(width=0.7, position=position_dodge(0.9), outlier.shape = NA, color='black')+ #,width=0.1 scale_fill_manual(values= c("#3777ac","#FF7F00"))+ # labs(title="Cell Infiltration", x="", y = "Score",size=20) + stat_compare_means(data = violin_dat, method = "wilcox.test", #, mapping = aes(group = Group), label ="p.signif", hide.ns = F) + theme_bw()+# ylim(-0.25,0.8) + theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=16), # axis.text.x=element_text(angle=90,hjust=1,colour="black",face="bold",size=12), #x45,1(hjust = 1),14 axis.text.y=element_text(hjust=0.5,colour="black",size=12), #y,,plain axis.title.x=element_text(size=16,face="bold"),#x axis.title.y=element_text(size=14,face="bold"), #y legend.text=element_text(face="bold", hjust = 0.5,colour="black", size=11), # legend.title=element_text(face="bold", colour="black", size=11),# text=element_text(family = 'Times'), #legend.justification=c(-0.1,1.2), #。##(1,1)1(),1。 #legend.position=c(0, 1.04), #legend.position=c(0,1),(1,1)。 panel.grid.major = element_blank(), # panel.grid.minor = element_blank()) # boxplot_diff_TIICs pdf("02.ssgsea_Box.pdf", width = 10, height = 6,family = 'Times') print(boxplot_diff_TIICs) dev.off() png("02.ssgsea_Box.png", width = 10, height = 6,units = 'in',res = 300,family = 'Times') boxplot_diff_TIICs dev.off() res3 <- read.csv('01.ssgsea_result_cell.csv',row.names = 1) dif_cell <- read.csv('02.cell_diff_wilcox_test.csv') res3 <- res3[rownames(res3) %in% dif_cell$immune_cell,] res3 <- t(res3) %>% as.data.frame() library(ggcorrplot) cor_data <- cor(res3,method="spearman") corp <- cor_pmat(res3) dat.r <- data.frame(cor_data) dat.r$cell <- rownames(dat.r) dat.r <- gather(dat.r,'Gene','r',-c(cell)) dat.p <- data.frame(corp) dat.p$cell <- rownames(dat.p) dat.p <- gather(dat.p,'Gene','p',-c(cell)) corrp <- (cbind(dat.r, dat.p))[,c("cell","Gene",'r',"p")] colnames(corrp) <- c("cell1","cell2",'Correlation',"Pvalue") corrp$cell2 <- gsub("\\.", " ",corrp$cell2) write.csv(corrp,file="03.cell_cor.csv",quote=F) library(ggcorrplot) library(corrplot) #--------------------- # # pdf("03.cell_corHeatmap.pdf",height=14,width=14,family='Times') # col1 <- colorRampPalette(c("#3777ac", "white","#fbbab6")) # corr_plot<-corrplot(cor_data, # tl.cex=1.5, # method = "square", #square,circle,method = 'shade' # is.corr = T, # type = "lower", #lower,upper,full # p.mat = corp, # insig = "blank", # outline = "white", # addCoef.col ="black", # col = col1(200), # tl.col = 'black', # tl.offset = 0.4, # number.font = 2, # cl.cex = 0.8, # number.cex = 1.2 # ) # dev.off() # # png("03.cell_corHeatmap.png",height=14,width=14,family='Times',units='in',res=600) # col1 <- colorRampPalette(c("#3777ac", "white","#fbbab6")) # corr_plot<-corrplot(cor_data, # tl.cex=1.5, # method = "square", #square,circle,method = 'shade' # is.corr = T, # type = "lower", #lower,upper,full # p.mat = corp, # insig = "blank", # outline = "white", # addCoef.col ="black", # col = col1(200), # tl.col = 'black', # tl.offset = 0.4, # number.font = 2, # cl.cex = 0.8, # number.cex = 1.2 # ) # dev.off() #------------------------------------ library (dplyr) data_expr<-read.csv("../00_rawdata/02.expr_tpm_TCGA.csv", row.names = 1,check.names = F) gene <- read.csv('../06_Lasso/03.lasso_genes.csv') expr <- t(data_expr) %>% as.data.frame() expr <- expr[,gene$symbol] expr <- expr[rownames(expr) %in% group$sample,] tiics_result <- read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) %>% lc.tableToNum tiics_result <- tiics_result[, group$sample] %>% as.matrix() tiics_result <- t(tiics_result) %>% as.data.frame() tiics_result <- tiics_result[rownames(expr),] diff <- read.csv('02.cell_diff_wilcox_test.csv') tiics_result <- tiics_result[,diff$immune_cell] identical(rownames(expr), rownames(tiics_result)) cor_r <- cor(expr,tiics_result,method = "spearman") library(psych) d <- corr.test(expr,tiics_result,use="complete",method = 'spearman') cor_p <- d$p cor_r2 <- cor_r %>% as.data.frame %>% tibble::rownames_to_column(var = "gene") %>% tidyr::gather(., cell,Correlation,-gene)# cor cor_p2 <- cor_p %>% as.data.frame %>% tibble::rownames_to_column(var = "gene") %>% tidyr::gather(., cell, Pvalue, -gene)# p cor_dat <- cbind(cor_r2, cor_p2)[,c("gene","cell","Correlation","Pvalue")] write.csv(cor_dat,"04.correlation_cor.csv",row.names = F) cor_dat$gene <- factor(cor_dat$gene) cor_dat$Pvalue2 <- ifelse(cor_dat$Pvalue<0.05,"P<0.05","ns") cor_dat$Correlation2 <- ifelse(abs(cor_dat$Correlation)<=0.3,"No correlation", ifelse(cor_dat$Correlation>0.3,"Medium or strong+","Medium or strong-")) library(ggplot2) library(tidyverse) library(ggpubr) library(ggsci) library(rstatix) library(ggthemes) bub_plot <- ggplot(data = cor_dat, mapping = aes(x =gene , y = cell)) + geom_point(aes(size = Pvalue2, color = Correlation)) + scale_color_gradient2(high = "#FF7F00", mid = "white", low = "#3777ac", limits = c(-1, 1), breaks = c(-1, -0.5, 0, 0.5, 1)) + theme_few() + theme(text = element_text(family = "Times"), axis.text.y = element_text(size = 15, face = "bold",color = "black"), axis.text.x = element_text(size = 15, face = "bold", angle = 45, hjust = 1, vjust = 1, color = "black"), axis.title.x = element_text(size = 15, face = "bold"), legend.title = element_text(face = "bold", size = 20), legend.text = element_text(face = "bold", size = 12), strip.text = element_text(size = 15, family = "Times")) + labs(x = "", y = "", size = "Pvalue", color = "Correlation") + # P 0.05 geom_text(aes(label = ifelse(Pvalue < 0.05, round(Correlation, 2), NA)), vjust = 0, size = 3, angle = 0) pdf("04.correlation111.pdf", width = 10, height = 10,family = 'Times') bub_plot dev.off() png("04.correlation111.png", width = 10, height = 10,units = 'in',res = 300,family = 'Times') bub_plot dev.off() # # # #############riskscore######## # setwd("/data/nas3///20_KMZK-40407-8/12_immunity/") # if (! dir.exists("./05.riskscore")){ # dir.create("./05.riskscore") # } # setwd("./05.riskscore") # # # # group_corr <- read.csv("../../10_Train_estimate/01.train_OS_dat.csv",sep=",",row.names = 1,header = T)%>% dplyr::select(id,risk)%>% as.data.frame() # # group_corr <- group_corr[order(group_corr$risk),] # # riskscore <- read.csv("../../07_Train_estimate/01.train_OS_dat.csv",sep=",",row.names = 1,header = T)%>% dplyr::select(id,riskScore) # rownames(riskscore) <- riskscore$id # # # , # riskscore$id <- NULL # rownames(riskscore)<-gsub('-','.',rownames(riskscore),fixed = T) # #riskscore1 <- riskscore[(group_corr$id),] ### # # # riskscore <- riskscore[,-1]%>% as.data.frame()#riskScore # # #riskscore <- read.csv('./riskscore.csv',row.names = 1) # re3 <- res3 # # train<-t(riskscore) # # cnt<-1 # while(cnt < 2){ #56 # library(psych) # x <-as.numeric(train[cnt,]) # y <-data.frame(re3,check.names = F) # library(psych) # d <- corr.test(y,x,use="complete",method = 'spearman') # r <- data.frame(d$r) # p <- data.frame(d$p) # write.csv(p,file=paste(rownames(train)[cnt], "p.csv",sep='_')) # write.csv(r,file=paste(rownames(train)[cnt], "r.csv",sep='_')) # correlation<-data.frame(rownames(p),r$d.r,p$d.p) # colnames(correlation) <- c("cell","Correlation","p.value") # # correlation<-correlation[correlation$p.value < 0.05 & abs(correlation$Correlation) > 0.3,] # # correlation$sig[correlation$p.value>0.05] <- "ns" # correlation$sig[correlation$p.value<0.05&correlation$p.value>0.01] <- "*" # correlation$sig[correlation$p.value<0.01&correlation$p.value>0.001] <- "**" # correlation$sig[correlation$p.value<0.001&correlation$p.value>0.0001] <- "***" # correlation$sig[correlation$p.value<0.0001] <- "****" # correlation$'correlation'<-abs(correlation$Correlation) # #correlation_sig<-correlation[c(3,9,10,11,23,29,7,12,18,31,1,14,15,25,28),] # library("viridis") # correlation <- na.omit(correlation) # #trace(ggdotchart,edit=T) # p0<-ggdotchart(correlation, x = "cell", y = "Correlation", # dot.size ='correlation', # color ='p.value', # sorting = "descending", # add = "segments", # # rotate = TRUE, # ggtheme = theme_pubr(), # # ylab="Correlation Coefficient (r)", # xlab='', # title=rownames(train)[cnt] # )+scale_colour_gradient( high = "#631879FF",low = "#00A087FF") # p10<-p0+theme(legend.position = "right", # panel.background = element_blank())+geom_hline(aes(yintercept=0),linetype="dashed",lwd = 0.2)+ # theme(axis.title.x =element_text(size=20,family = "Times", face = "bold"), # axis.text.x =element_text(size=16,family = "Times", face = "bold"), # axis.title.y =element_text(size=20,family = "Times", face = "bold"), # axis.text.y=element_text(size=16,family = "Times", face = "bold"), # plot.title=element_text(size=22,family = "Times", face = "bold",hjust=0.5), # legend.text = element_text(size = 16, family = "Times"), # legend.title = element_text(size = 20, family = "Times",face = "bold"))+ # theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank()) # p10 # ggsave(paste(rownames(train)[cnt],'pdf',sep='.'),w=8,h=9) # ggsave(paste(rownames(train)[cnt],'png',sep='.'),w=8,h=9) # cnt<-cnt+1 # } #----------------------------------- # rm(list = ls()) # setwd("/data/nas3///20_KMZK-40407-8/") # if (! dir.exists("./09_TMB_immunecycle")){ # dir.create("./09_TMB_immunecycle") # } # setwd("./09_TMB_immunecycle") # # library(TCGAmutations) # library(maftools) # library(grid) # # maf <- TCGAmutations::tcga_load(study = "GBM") # risk <- read.csv("../04_Riskmodel/01_risk.csv") # risk1 <- risk[,c('id','risk')] # High.sample <- risk1$id[which(risk1$risk==0)] # Low.sample <- risk1$id[which(risk1$risk==1)] # # ##----------------------------------------------- # maf_sample <- data.frame(barcode = maf@clinical.data$Tumor_Sample_Barcode) # maf_sample$sample <- stringr::str_sub(maf_sample$barcode, 1, 16) # maf_sample <- merge(maf_sample, risk, by.x = "sample",by.y = 'id') # sample <- subset(maf_sample)$barcode # maf <- subsetMaf(maf,tsb = sample) # # ##HIGH LOW # high <- maf_sample[(maf_sample$sample) %in% High.sample,] #high # high <- subset(high)$barcode #highbarcode # maf.high <- subsetMaf(maf,tsb = high) # # pdf(file = "01.oncoplot.high.pdf", height = 10, width = 15,paper = "a4r") # par(mar = c(5, 5, 5, 5) + 0.1) # # oncoplot(maf = maf.high, top = 20) # dev.off() # # png(file = "01.oncoplot.high.png", family = "Times", height = 10, width = 15, # units = "in", res = 600) # par(mar = c(5, 5, 5, 5) + 0.1) # # oncoplot(maf = maf.high, top = 20) # dev.off() # # low <- maf_sample[(maf_sample$sample)%in%Low.sample,] # low <- subset(low)$barcode # maf.low <- subsetMaf(maf,tsb = low) # pdf(file = "02.oncoplot.low.pdf", height = 10, width = 15,paper = "a4r") # par(mar = c(5, 5, 5, 5) + 0.1) # # oncoplot(maf = maf.low, top = 20) # dev.off() # # png(file = "02.oncoplot.low.png", family = "Times", height = 10, width = 15, # units = "in", res = 600) # par(mar = c(5, 5, 5, 5) + 0.1) # # oncoplot(maf = maf.low, top = 20) # dev.off() # # ##wilcox-TMB,TMB----------- # library(TCGAmutations) # library(ggplot2) # library(ggpubr) # library(dplyr) # library(ggsci) # # tmp<-as.data.frame(tcga_available()) # dt<-maf # dt<-dt@data # dt1<-as.data.frame(table(dt$Tumor_Sample_Barcode)) # names(dt1)<-c('Barcode','Freq') # dt1$tmb<-dt1$Freq/38 # tmb<-dt1 # tmb$Barcode <- substr(tmb$Barcode,1,16) # tmb <- tmb[tmb$Barcode%in%risk1$id,] # colnames(tmb) <- c('sample','Freq','tmb') # # dat <- merge(risk, tmb, by.x = 'id', by.y = 'sample') # dat$risk <- factor(dat$risk) # write.csv(dat,"01.TMB-score.csv") # # p1<-ggplot(dat,aes(x=riskScore,y=tmb))+ # geom_point(aes(x=riskScore,y=tmb,color =risk), size=2,alpha=0.9)+ # geom_smooth(method = 'lm', formula = y ~ x, se = T,color='black')+ # stat_cor(data=dat, method = "spearman")+ # scale_fill_manual(values =c("black")) + # theme_bw()+ # scale_color_manual(values = c("#6495ed","#dc143c"), name = "risk") + # theme(axis.title = element_text(size = 20, face = "bold", family = "Times",color='black'), # axis.text.x = element_text(size = 16, face = "bold", family = "Times",color='black'), # axis.text.y = element_text(size = 16, face = "bold", family = "Times",color='black'), # legend.text = element_text(size = 14, face = "bold", family = "Times",color='black'), # legend.title= element_text(size =16, face = "bold", family = "Times",color='black'), # legend.position = "bottom")+ # labs(x="Risk score",y='TMB')+ # theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank()) # p1 # my_comparisons = list(c('Low_Risk','High_Risk')) #'Low_Risk','High_Risk' # p2<-ggplot(dat, aes(x=risk, y=tmb,fill=risk)) + # geom_violin()+ # stat_boxplot(geom="errorbar", # width=0.1, # position = position_dodge(0.9)) + # geom_boxplot(width=0.4, # position=position_dodge(0.9), # outlier.shape = NA, # fill='white')+ #,width=0.1 # geom_point(aes(fill = risk), # size = 0.5, # position = position_dodge(0.9))+ # scale_fill_manual(values = c("#FF6347","#6495ED"), name = "Group") + # stat_compare_means(qcomparisons = my_comparisons, # label = "p.signif", # method = "wilcox.test", # color='black',family="Times",face = "bold")+ # theme_bw()+ # # ylim(-50,160) + # theme(panel.background = element_rect(colour = 'black', size = 1, fill = 'white'), # panel.grid = element_blank()) + # theme(axis.text.x=element_text(family="Times",size=12,face="bold"), #x15,1(hjust = 1),Times20 # axis.text.y=element_text(family="Times",size=12,face="bold"), #y,,plain # axis.title.y=element_text(family="Times",size = 15,face="bold"), #y # axis.title.x=element_text(family="Times",size = 10,face="bold"), # plot.title = element_text(hjust = 0.5,family="Times",size = 15,face="bold"), # legend.text = element_text(face = "bold"), # legend.title = element_text(face = "bold"))+ # labs(title="", x="", y="")+ # guides(fill='none') # p2 # library(patchwork) # p<-p1+p2+plot_layout(widths = c(3, 2)) # p # pdf('03.tmb.pdf',width=12,height=6,family='Times') # print(p) # dev.off() # # png('03.tmb.png',width=12,height=6,family='Times',units='in',res=600) # print(p) # dev.off() # # ##------------------------------------ # library(psych) # library(ggExtra) # # dat_riskScore <- dplyr::select(dat, id, riskScore) # dat_riskScore <- tibble::column_to_rownames(dat_riskScore, var = "id") # dat_tmb <- dplyr::select(dat, id, tmb) # dat_tmb <- tibble::column_to_rownames(dat_tmb, var = "id") # obj <- corr.test(dat_tmb, dat_riskScore, use = "complete", method = "spearman") # cor.r <- obj$r # cor.p <- obj$p # # dat4plot <- cbind(dat_riskScore, dat_tmb) # p1 <- ggplot(dat4plot, aes(riskScore, tmb))+ # xlab("riskScore")+ylab("Tumor Burden Mutation")+ # geom_point()+geom_smooth(method = 'lm',formula = y~x)+ # theme_bw()+ # stat_cor(method = 'spearman',aes(x=riskScore,y=tmb))+ # ylim(0, 4) # p1 # p2 <- ggMarginal(p1,type = 'density',xparams = list(fill='seagreen'), # yparams = list(fill='#A73030FF')) # p2 # ggsave("04.cor_plot.pdf", p2, w=5, h=5) # ggsave("04.cor_plot.png", p2, w=5, h=5, dpi=600) rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./13_checkpoint")){ dir.create("./13_checkpoint") } setwd("./13_checkpoint") gene <- read.csv('../00_rawdata/immune.csv') dat<-read.csv('../00_rawdata/02.expr_tpm_TCGA.csv',row.names = 1) colnames(dat)<-gsub('.','-',colnames(dat),fixed = T) group <- read.csv('../07_Train_estimate/01.train_OS_dat.csv')%>%dplyr::select(c('id','risk')) colnames(group) <- c('sample','group') table(group$group) hub_exp<-dat[gene$symbol,] hub_exp <- na.omit(hub_exp) # rownames(hub_exp) hub_exp2<-hub_exp hub_exp2$Symbol<-rownames(hub_exp2) hub_exp2<-gather(hub_exp2, key = sample, value = expr, -c("Symbol")) hub_exp2 <- na.omit(hub_exp2) High.sample <- group$sample[which(group$group=='High')] ## hub_exp2$Group<-ifelse(hub_exp2$sample%in%High.sample,'High','Low') ## library(ggplot2) library(tidyverse) library(ggpubr) library(ggsci) library(rstatix) stat.test<-hub_exp2%>% group_by(Symbol)%>% wilcox_test(expr ~ Group)%>% adjust_pvalue(method = 'fdr') de.test <- subset(stat.test,p<0.05) de.test$Symbol#29 write.csv(de.test,"00.de.test.csv") # [1] "CD276" "CD28" "CD40" "CD40LG" "CD44" "CD48" "CTLA4" "CXCL9" "GZMA" "HLA-A" "HLA-DPA1" "HLA-DPB1" # [13] "HLA-DQA1" "HLA-DRA" "HMGB1" "IL1B" "ITGB2" "LGALS9" "MICB" "PDCD1" "PRF1" "SELP" "SLAMF7" "TGFB1" # [25] "TIGIT" "TNFSF15" "TNFSF9" "VEGFA" "VEGFB" de.test$p<-ifelse(de.test$p<0.001,"***", ifelse(de.test$p<0.01,"**", ifelse(de.test$p<0.05,"*",'ns'))) datcheck <- dat[de.test$Symbol,group$sample] ciber.res <- t(datcheck) violin_dat <-data.frame(cbind(ciber.res,group$group)) colnames(violin_dat)<-c(colnames(ciber.res),'group') violin_dat$id <- rownames(violin_dat) violin_dat <- reshape2::melt(violin_dat,id = c("id","group")) violin_dat$value <-as.numeric(violin_dat$value) colnames(violin_dat)<-c('id','group','variable','value') violin_dat$group<- factor(violin_dat$group) colnames(violin_dat) p <- ggplot(violin_dat, aes(x=variable,y=value,fill=group)) + geom_boxplot() + #stat_compare_means(aes(group = condition),method = 'wilcox.test',label = "p.signif",cex=4)+ annotate(geom = "text", x = de.test$Symbol, y = 14, size = 5, #family = "Times", label =as.character(de.test$p,face = "bold")) + theme_bw()+ # scale_fill_npg()+ scale_fill_manual(values=c("#CD3700","#4682B4"))+ theme(axis.title.x =element_text(size=20, face = "bold"), axis.text.x =element_text(size=14, face = "bold",angle=45,hjust = 1), axis.title.y =element_text(size=27, face = "bold"), axis.text.y=element_text(size=16, face = "bold"))+ theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank(), legend.title=element_text(size=15) , legend.text=element_text(size=14))+ xlab("") + ylab("checkpoint expression") + theme(axis.title.y = element_text(size = 15, vjust = 0.5, hjust = 0.5)) p pdf("01.checkpoint_boxplot.pdf", width = 12, height = 6,family = 'Times') p dev.off() png("01.checkpoint_boxplot.png", width = 12, height = 6,units = 'in',res = 300,family = 'Times') p dev.off() # gene <- read.csv("../09_Lasso/03.lasso_genes.csv",row.names = 1) # datch<-dat[gene$symbol,] %>% t() %>% as.data.frame() # # library(psych) # hub_exp <- dat[de.test$Symbol,] %>% t()%>% as.data.frame() # # cor_r <- WGCNA::cor(hub_exp,datch,method = "spearman") # cor_p <- WGCNA::corPvalueStudent(cor_r,length(rownames(hub_exp))) # cor_r2 <- cor_r %>% as.data.frame %>% tibble::rownames_to_column(var = "cell") %>% # tidyr::gather(., gene,Correlation,-cell)# cor # cor_p2 <- cor_p %>% as.data.frame %>% tibble::rownames_to_column(var = "cell") %>% # tidyr::gather(., gene, Pvalue, -cell)# p # # cor_dat <- cbind(cor_r2, cor_p2)[,c("cell","gene","Correlation","Pvalue")] # # cor_dat$cell <- factor(cor_dat$cell) # cor_dat$Pvalue2 <- ifelse(cor_dat$Pvalue<0.05,"P<0.05","ns") # cor_dat$Correlation2 <- ifelse(abs(cor_dat$Correlation)<=0.3,"No correlation", # ifelse(cor_dat$Correlation>0.3,"Medium or strong+","Medium or strong-")) # write.csv(cor_dat,"02.correlation_cor.csv") # cor_dat2 <- subset(cor_dat,cor_dat$Pvalue < 0.05 & abs(cor_dat$Correlation) > 0.3) # cor_dat2 <- cor_dat2[order(cor_dat2$cell),] # cor_dat2$gene # write.csv(cor_dat2,"03.correlation_degcor.csv") # # # # library(ggplot2) # library(ggthemes) # bub_plot <- ggplot(data = cor_dat, mapping = aes(x=cell,y=gene)) + # geom_point(aes(size=Pvalue2,color=Correlation))+ # scale_color_gradient2(high = "#EF767A",mid = "white",low = "#456990",limits=c(-1, 1),breaks = c(-1,-0.6,-0.2,0.2,0.6,1))+ # theme_few()+ # theme(text = element_text(family = "Times"), # axis.text.y = element_text(size = 15,face = "bold",color = "black"), # axis.text.x = element_text(size = 15, face = "bold",angle = 45,hjust = 1,vjust = 1,color = "black"), # axis.title.x = element_text(size = 15, face = "bold"), # legend.title = element_text(face = "bold",size = 20), # legend.text = element_text(face = "bold",size = 12), # strip.text = element_text(size = 15, family = "Times"))+ # labs(x="",y="",size="Pvalue",color="Correlation")+ # geom_text(aes(label = round(Correlation, 2)),vjust = -1, size = 3) # bub_plot # pdf("04.correlation.pdf", width = 18, height = 6,family = 'Times') # bub_plot # dev.off() # png("04.correlation.png", width = 18, height = 6,units = 'in',res = 300,family = 'Times') # bub_plot # dev.off() # immunecycle------------------------------------------------ rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./14_immunecycle")){ dir.create("./14_immunecycle") } setwd("./14_immunecycle") train_data <- read.csv("../00_rawdata/02.expr_tpm_TCGA.csv",header=TRUE, row.names=1, check.names=FALSE) riskScore<-read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names=1) group <- riskScore[,c("id","risk")] colnames(group) <- c("sample","group") dat <- train_data[,group$sample] identical(colnames(dat),group$sample) library(GSVA) library(tidyverse) gene_set <- read.table("../00_rawdata/cancer_immunity_cycle.txt",header = T,sep ="\t") gene_set$Steps <- c(rep('Step 1:Relase of cancer cell antigenes',40), rep('Step 2:Cancer antigen presentation',36), rep('Step 3:Priming and activation',62), rep('Step 4:Trafficking of immune cells to tumors',76), rep('Step 5:Infiltration of immune cells into tumors',9), rep('Step 6:Recongnition of cancer cells by T cells',25), rep('Step 7:Killing of cancer cells',25)) gene_set$group <- gene_set$Steps %>% substr(.,8,60) gene_set$step <- gene_set$Steps %>% substr(.,1,6) table(gene_set$group[grep("Step 4",gene_set$Steps)] ) gene_set$group[grep("Step 4",gene_set$Steps)] <- gene_set$ImmuneCellType[grep("Step 4",gene_set$Steps)] gene_set$group[grep("Step 4",gene_set$Steps)] <- paste0(gene_set$group[grep("Step 4",gene_set$Steps)]," recruiting") gene_set$group <- paste0(gene_set$step,' ',gene_set$group) dat.final <- as.matrix(dat) gene_list <- split(as.matrix(gene_set)[,1],gene_set[,5]) ssgsea_score <- gsva(dat.final, gene_list,method = "ssgsea",ssgsea.norm = TRUE,verbose = TRUE) write.csv(ssgsea_score,"05.immune_cycle.csv",row.names = T,quote = F) #7--------------------- library(magrittr) library(rstatix) library(tidyverse) tiic_fpkm <- ssgsea_score %>% t %>% as.data.frame %>% rownames_to_column(var="sample") dat <- merge(group, tiic_fpkm, by = "sample")#cibersort dat2 <- tidyr::gather(dat, step, score, -c("sample", "group"))#,,Cells, score stat_res <- dat2 %>% group_by(step) %>% wilcox_test(score ~ group) %>% # adjust_pvalue(method = "BH") %>% add_significance("p") stat_res Pathways_sig <- stat_res[stat_res$p<0.05,] paste0(Pathways_sig$step,collapse = ",") stat_res$p2 <- signif(stat_res$p, 3)#p, stat_res$p2 <- ifelse(stat_res$p <= 0.05, paste0("~italic(p)==", signif(stat_res$p,2)), stat_res$p.signif) write.csv(stat_res,"06.res_wicox_step.csv",row.names = F,quote = F) library(ggpubr) set.seed(1) dat2$group <- ifelse(dat2$group=="High","High risk","Low risk") dat2$group <- factor(dat2$group,levels = c("High risk","Low risk")) # col <- readRDS("mypalette.rds")[1:7] library(ggplot2) library(ggsci) p <- ggplot(dat2, aes(x=step, y=score, fill=group)) + geom_boxplot(width=0.8, alpha=0.8, position=position_dodge(0.9)) + scale_fill_manual(values = c("#EE2C2C", "#114889"), name = "Group") + # “ns” annotate(geom = "text", x = stat_res$step, y = 0.8, size = 6, family = "Times", label = ifelse(stat_res$p.signif != "ns", as.character(stat_res$p.signif), "")) + theme_bw() + theme(legend.position = "top") + theme(axis.title.x = element_text(size=20, family = "Times", face = "bold"), axis.text.x = element_text(size=13, hjust=1, family = "Times", face = "bold"), axis.title.y = element_text(size=20, family = "Times", face = "bold"), axis.text.y = element_text(size=16, family = "Times", face = "bold")) + theme(legend.title = element_text(size=15,family = "Times"), legend.text = element_text(size=14,family = "Times")) + theme(axis.line = element_line(color = "black"), # axis.ticks = element_line(color = "black")) + # labs(x="", y="Cancer Immunity Cycle") + coord_flip() # print(p) ggsave(width=10,height=10,'05.immuCycle.pdf') ggsave(width=10,height=10,'05.immuCycle.png') #--------------- dat2$step<- substr(dat2$step ,1,6) dat2$High<-ifelse(dat2$group =="High risk",'High',NA) dat2$Low<-ifelse(dat2$group =="Low risk",'Low',NA) ## step1<-filter(dat2, group=="Low risk" & step=="Step 1") step1$rowmedians<-median(step1$score) low1<-median(step1$score) step2<-filter(dat2, group=="Low risk" & step=="Step 2") step2$rowmedians<-median(step2$score) low2<-median(step2$score) step3<-filter(dat2, group=="Low risk" & step=="Step 3") step3$rowmedians<-median(step3$score) low3<-median(step3$score) step4<-filter(dat2, group=="Low risk" & step=="Step 4") step4$rowmedians<-median(step4$score) low4<-median(step4$score) step5<-filter(dat2, group=="Low risk" & step=="Step 5") step5$rowmedians<-median(step5$score) low5<-median(step5$score) step6<-filter(dat2, group=="Low risk" & step=="Step 6") step6$rowmedians<-median(step6$score) low6<-median(step6$score) step7<-filter(dat2, group=="Low risk" & step=="Step 7") step7$rowmedians<-median(step7$score) low7<-median(step7$score) ## step1<-filter(dat2, group=="High risk" & step=="Step 1") step1$rowmedians<-median(step1$score) High1<-median(step1$score) step2<-filter(dat2, group=="High risk" & step=="Step 2") step2$rowmedians<-median(step2$score) High2<-median(step2$score) step3<-filter(dat2, group=="High risk" & step=="Step 3") step3$rowmedians<-median(step3$score) High3<-median(step3$score) step4<-filter(dat2, group=="High risk" & step=="Step 4") step4$rowmedians<-median(step4$score) High4<-median(step4$score) step5<-filter(dat2, group=="High risk" & step=="Step 5") step5$rowmedians<-median(step5$score) High5<-median(step5$score) step6<-filter(dat2, group=="High risk" & step=="Step 6") step6$rowmedians<-median(step6$score) High6<-median(step6$score) step7<-filter(dat2, group=="High risk" & step=="Step 7") step7$rowmedians<-median(step7$score) High7<-median(step7$score) ## # library(ggradar) library(tidyverse) #df<-c(High1,High2,High3,High4,High5,High6,High7) #dn<-c(low1,low2,low3,low4,low5,low6,low7) exam_scores <-data.frame(step1=c(High1,low1), step2=c(High2,low2), step3=c(High3,low3), step4=c(High4,low4), step5=c(High5,low5), step6=c(High6,low6), step7=c(High7,low7), row.names = c("High","Low")) # max_min <- data.frame( step1 = c(0.6, -0.6), step2 = c(0.6, -0.6), step3 = c(0.6, -0.6), step4 = c(0.6, -0.6),step5 = c(0.6, -0.6), step6 = c(0.6, -0.6), step7 = c(0.6, -0.6) ) rownames(max_min) <- c("Max", "Min") df <- rbind(max_min, exam_scores) png('06.ssgsea.png',w=8,h=6,family='Times',units='in',res=600) pdf('06.ssgsea.pdf',w=8,h=6,family='Times') #install.packages("fmsb") library(fmsb) radarchart( df, axistype = 1, # Customize the polygon pcol = c("#DB3587", "#35B3B2"), pfcol = scales::alpha(c("#DB3587","#35B3B2"),0.5), plwd = 2, plty = 1, # Customize the grid cglcol = "grey", cglty = 1, cglwd = 0.8, # Customize the axis axislabcol = "grey", # Variable labels vlcex = 0.7, vlabels = colnames(df), caxislabels = c(-0.3,-0.2,-0.1,0,0.1,0.2,0.3)) # Add an horizontal legend legend( x = "bottom", legend = rownames(df[-c(1,2),]), horiz = TRUE, bty = "n", pch = 20 , col = c("#DB3587", "#35B3B2"), text.col = "black", cex = 1, pt.cex = 1.5 ) dev.off() dev.off() # ##############7########### # ## # library(curl) # curl_download("http://biocc.hrbmu.edu.cn/TIP/download/signature%20annotation.txt", # destfile = paste0(dest_dir, "/geneSet_TIP.txt")) # geneInfo_TIP = read.table("./8、/ESCA/geneSet_TIP.txt", header = TRUE, sep = "\t") # geneInfo_TIP$step_cell = paste0("step ", geneInfo_TIP$Steps, " ", geneInfo_TIP$ImmuneCellType) # geneInfo_TIP$Description = paste0(geneInfo_TIP$step_cell, " ", geneInfo_TIP$Direction) # # pathInfo_TIP = unique(geneInfo_TIP[,-1]) # geneSet_TIP = list() # for(i in 1:length(pathInfo_TIP$Description)){ # geneSet_TIP[[i]] = geneInfo_TIP$GeneSymbol[which(geneInfo_TIP$Description==pathInfo_TIP$Description[i])] # } # names(geneSet_TIP) = pathInfo_TIP$Description # # ## ssGSEA # library(GSVA) # data_analysis = read.csv("./data/data_log2(tpm+1)_ESCA.csv", row.names = 1, check.names = FALSE) # data_analysis = data_analysis[,survRisk_train$sample_id] # score_TIP1 = gsva(as.matrix(data_analysis), geneSet_TIP, method = 'ssgsea', kcdf = 'Gaussian', abs.ranking = TRUE) # # ## permutation 100 # ## # score_TIP = list() # for(j in 1:length(colnames(score_TIP1))){ # score_TIP[[j]] = as.data.frame(score_TIP1[,j]) # colnames(score_TIP[[j]])[1] = "permutation1" # } # names(score_TIP) = colnames(score_TIP1) # # for(i in 2:101){ # data_random = data_analysis # rownames(data_random) = sample(rownames(data_random), size = nrow(data_random), replace = FALSE) # score_tmp = gsva(as.matrix(data_random), geneSet_TIP, method = 'ssgsea', kcdf = 'Gaussian', abs.ranking = TRUE) # for(j in 1:length(colnames(score_tmp))){ # score_TIP[[j]] = cbind(score_TIP[[j]], as.data.frame(score_tmp[,j])) # colnames(score_TIP[[j]])[i] = paste0("permutation", i) # } # } # # ## z score # zscore_TIP = matrix(0, nrow = length(pathInfo_TIP$Description), ncol = ncol(data_analysis)) # rownames(zscore_TIP) = pathInfo_TIP$Description # colnames(zscore_TIP) = colnames(data_analysis) # for(j in 1:ncol(zscore_TIP)){ # for(i in 1:length(pathInfo_TIP$Description)){ # zscore_TIP[i, j] = (score_TIP[[j]][i, 1]-mean(as.numeric(score_TIP[[j]][i,])))/sd(as.numeric(score_TIP[[j]][i,])) # } # } # # activeScore_TIP = matrix(0, nrow = length(unique(pathInfo_TIP$step_cell)), ncol = ncol(data_analysis)) # rownames(activeScore_TIP) = unique(pathInfo_TIP$step_cell) # colnames(activeScore_TIP) = colnames(data_analysis) # # ## positivenegativeNES(positive)-NES(negative) # for(i in 1:nrow(activeScore_TIP)){ # path_name = pathInfo_TIP$Description[which(pathInfo_TIP$step_cell==rownames(activeScore_TIP)[i])] # if(length(path_name)==2){ # activeScore_TIP[i,] = as.numeric(zscore_TIP[paste0(rownames(activeScore_TIP)[i], " positive"),]) - as.numeric(zscore_TIP[paste0(rownames(activeScore_TIP)[i], " negative"),]) # }else{ # activeScore_TIP[i,] = as.numeric(zscore_TIP[path_name,]) # } # } # write.csv(activeScore_TIP, paste0(dest_dir, "/activeScore_TIP.csv")) # # # library(IOBR) # library(ggcorrplot) # library(corrplot) # pdf("03.cell_corHeatmap.pdf",height=14,width=14,family='Times') # col1 <- colorRampPalette(c("#3777ac", "white","#FF7F00")) # corr_plot<-corrplot(cor_data, # tl.cex=1.5, # method = "square", #square,circle,method = 'shade' # is.corr = T, # type = "lower", #lower,upper,full # p.mat = corp, # insig = "blank", # outline = "white", # addCoef.col ="black", # col = col1(200), # tl.col = 'black', # tl.offset = 0.4, # number.font = 2, # cl.cex = 0.8, # number.cex = 1.2 # ) # dev.off() # # png("03.cell_corHeatmap.png",height=14,width=14,family='Times',units='in',res=600) # col1 <- colorRampPalette(c("#3777ac", "white","#FF7F00")) # corr_plot<-corrplot(cor_data, # tl.cex=1.5, # method = "square", #square,circle,method = 'shade' # is.corr = T, # type = "lower", #lower,upper,full # p.mat = corp, # insig = "blank", # outline = "white", # addCoef.col ="black", # col = col1(200), # tl.col = 'black', # tl.offset = 0.4, # number.font = 2, # cl.cex = 0.8, # number.cex = 1.2 # ) # dev.off() # library (dplyr) # data_expr<-read.csv("../00_rawdata/02.expr_fpkm_TCGA.csv", row.names = 1,check.names = F) # gene <- read.csv('../09_Lasso/03.lasso_genes.csv') # # expr <- t(data_expr) %>% as.data.frame() # expr <- expr[,gene$symbol] # expr <- expr[rownames(expr) %in% group$sample,] # # tiics_result <- read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) %>% lc.tableToNum # tiics_result <- tiics_result[, group$sample] %>% as.matrix() # tiics_result <- t(tiics_result) %>% as.data.frame() # tiics_result <- tiics_result[rownames(expr),] # # diff <- read.csv('02.cell_diff_wilcox_test.csv') # tiics_result <- tiics_result[,diff$immune_cell] # # identical(rownames(expr), rownames(tiics_result)) # cor_r <- cor(expr,tiics_result,method = "spearman") # library(psych) # d <- corr.test(expr,tiics_result,use="complete",method = 'spearman') # cor_p <- d$p # cor_r2 <- cor_r %>% as.data.frame %>% tibble::rownames_to_column(var = "gene") %>% # tidyr::gather(., cell,Correlation,-gene)# cor # cor_p2 <- cor_p %>% as.data.frame %>% tibble::rownames_to_column(var = "gene") %>% # tidyr::gather(., cell, Pvalue, -gene)# p # # cor_dat <- cbind(cor_r2, cor_p2)[,c("gene","cell","Correlation","Pvalue")] # write.csv(cor_dat,"04.correlation_cor.csv",row.names = F) # # cor_dat$gene <- factor(cor_dat$gene) # cor_dat$Pvalue2 <- ifelse(cor_dat$Pvalue<0.05,"P<0.05","ns") # cor_dat$Correlation2 <- ifelse(abs(cor_dat$Correlation)<=0.3,"No correlation", # ifelse(cor_dat$Correlation>0.3,"Medium or strong+","Medium or strong-")) # library(ggplot2) # library(ggthemes) # bub_plot <- ggplot(data = cor_dat, mapping = aes(x=gene,y=cell)) + # geom_point(aes(size=Pvalue2,color=Correlation))+ # scale_color_gradient2(high = "#EF767A",mid = "white",low = "#456990",limits=c(-1,1),breaks = c(-1,-0.5,0,0.5,1))+ # theme_few()+ # theme(text = element_text(family = "Times"), # axis.text.y = element_text(size = 15,color = "black"), # axis.text.x = element_text(size = 15,face = "bold",angle = 90,hjust = 1,vjust = 1,color = "black"), # axis.title.x = element_text(size = 25, face = "bold"), # legend.title = element_text(face = "bold",size = 20), # legend.text = element_text(face = "bold",size = 12), # strip.text = element_text(size = 20))+ # labs(x="",y="",size="Pvalue",color="Correlation")+ # geom_text(aes(label = round(Correlation, 2)),vjust = -1, size = 3.5) # bub_plot # pdf("04.correlation.pdf", width = 9, height = 10,family = 'Times') # bub_plot # dev.off() # # png("04.correlation.png", width = 9, height = 10,units = 'in',res = 300,family = 'Times') # bub_plot # dev.off() rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./15_Immune_pathway")){ dir.create("./15_Immune_pathway") } setwd("./15_Immune_pathway") path <- read.csv('../00_rawdata/pathway.csv') path2 <- path %>% separate_rows(Genes, sep = ", ") path2 <- path2 %>% separate_rows(Genes, sep = ",") library(lance) library(magrittr) expr<-read.csv("../00_rawdata/02.expr_tpm_TCGA.csv", row.names = 1,check.names = F) %>% lance::lc.tableToNum() colnames(expr)<-gsub('.','-',colnames(expr),fixed = T) risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1,check.names = F) #risk$id <- rownames(risk) group <- risk[,c("id",'risk')] expr <- expr[,group$id] # high.sample<-group$id[which(group$risk=='High')] low.sample<-group$id[which(group$risk=='Low')] expr <- expr[,group$id] library(GSVA) gene_set <- path2 length(table(gene_set$Pathway)) dat.final <- as.matrix(expr) gene_list <- split(as.matrix(gene_set)[,2], gene_set[,1]) ssgsea_score = gsva(dat.final, gene_list, method = "ssgsea", ssgsea.norm = TRUE, verbose = TRUE) ssgsea_score1 <- as.data.frame(ssgsea_score) write.csv(ssgsea_score1, file = "01.ssgsea_result_cell.csv", quote = F) library(writexl) library(openxlsx) wb <- createWorkbook() addWorksheet(wb,sheetName = 'Data') writeData(wb,sheet = 'Data',x=ssgsea_score,row.names = T) saveWorkbook(wb,file = '01.ssgsea_result_cell.xlsx',overwrite = T) #a <- rownames(expr)%>% as.data.frame() #diff_vec1 <- setdiff(gene_set$Pathway, rownames(ssgsea_score1)) colnames(group) <- c('sample', 'group') group<-group[order(group$group),] ssgsea_score<-read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) ssgsea_score<-ssgsea_score[,group$sample] annotation_col<-as.data.frame(group$group) colnames(annotation_col)='Group' rownames(annotation_col)=colnames(ssgsea_score) ssgsea_score <- as.matrix(ssgsea_score) # library(pheatmap) # color.key<-c("#3777ac","#FF7F00") # ann_colors<-list(Group = c('Low'="#3777ac",'High'="#FF7F00")) # p <- pheatmap( # ssgsea_score, # color = colorRampPalette(color.key)(50), # border_color = 'darkgrey', # annotation_col = annotation_col, # annotation_colors = ann_colors, # labels_row = NULL, # clustering_method = 'ward.D2', # show_rownames = T, # show_colnames = F, # fontsize_col = 5, # cluster_cols = F, # cluster_rows = T) # p # png(file = "01.heatmap.png", height = 5, width = 8, units = 'in',res = 600,family='Times') # p # dev.off() # pdf(file = "01.heatmap.pdf", height = 5,width = 8,family='Times') # p # dev.off() table(group$group) group_case<-group[group$group=='High',] group_case<-as.character(group_case$sample) group_control<-group[group$group=='Low',] group_control<-as.character(group_control$sample) library(dplyr) tiics_result <- read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) tiics_result <- tiics_result[, group$sample] %>% as.matrix() pvalue = padj <- matrix(0, nrow(tiics_result), 1) for (i in 1:nrow(tiics_result)){ pvalue[i, 1] = p.value = wilcox.test(tiics_result[i, group_case], tiics_result[i, group_control])$p.value } padj <- p.adjust(as.vector(pvalue), "fdr", n = length(pvalue)) rTable <- data.frame(#log2FoldChange, pvalue, padj, row.names = rownames(tiics_result)) rTable$panthway <- rownames(rTable) rTable$sig <- ifelse(rTable$pvalue < 0.05, ifelse(rTable$pvalue < 0.01, ifelse(rTable$pvalue < 0.001, ifelse(rTable$pvalue < 0.0001, paste(rTable$panthway, "****", sep = ""), paste(rTable$panthway, "***", sep = "")), paste(rTable$panthway, "**", sep = "")), paste(rTable$panthway, "*", sep = "")), rTable$panthway) diff_Table<-rTable[which(rTable$pvalue<0.05),] dim(diff_Table) # 18 write.csv(rTable, file = "02.all_wilcox_test.csv", quote = F, row.names = F) rTable <- read.csv('02.all_wilcox_test.csv') library(writexl) library(openxlsx) wb <- createWorkbook() addWorksheet(wb,sheetName = 'Data') writeData(wb,sheet = 'Data',x=rTable,row.names = T) saveWorkbook(wb,file = '02.all_wilcox_test.xlsx',overwrite = T) write.csv(diff_Table, file = "02.diff_wilcox_test.csv", quote = F, row.names = F) diff_Table <- read.csv('02.diff_wilcox_test.csv') library(writexl) library(openxlsx) wb <- createWorkbook() addWorksheet(wb,sheetName = 'Data') writeData(wb,sheet = 'Data',x=diff_Table,row.names = T) saveWorkbook(wb,file = '02.diff_wilcox_test.xlsx',overwrite = T) paste0(diff_Table$panthway,collapse = "()、") #"Base_excision_repair()、Cell_cycle()、DNA_replication()、Fanconi_anemia_pathway()、FGFR3-coexpressed_genes()、Homologous_recombination()、Hypoxia()、MicroRNAs_in_cancer()、Mismatch_repair()、Nucleotide_excision_repair()、Oocyte_meiosis()、p53_signaling_pathway()、PPARG_network()、Progesterone-mediated_oocyte_maturation()、Pyrimidine_metabolism()、Spliceosome()、Viral_carcinogenesis()、WNT-β-catenin_network" library(tidyr) library(ggplot2) library(ggpubr) library(Ipaper) cell.data <- data.frame(Immune_Cell=rownames(tiics_result), tiics_result, pvalue=rTable$pvalue) #plot.cell <- cell.data[which(cell.data$pvalue<0.05),] diff_tiics <- rownames(cell.data) violin_dat <- gather(cell.data, key=Group, value=score, -c("Immune_Cell","pvalue")) violin_dat$Group <- ifelse(gsub("\\.","-",violin_dat$Group) %in% group_control, "Low", "High") violin_dat$Group <- factor(violin_dat$Group, levels = c("Low", "High")) head(violin_dat) boxplot_diff_TIICs <- ggplot(violin_dat, aes(x=Immune_Cell, y=score, fill=Group)) + #geom_violin(trim=T,color='black') + #, “color=”(#white,)#"trim"TRUE(),。FALSE,。 stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(width=0.7, position=position_dodge(0.9), outlier.shape = NA, color='black')+ #,width=0.1 scale_fill_manual(values= c("#3777ac","#FF7F00"))+ # labs(title="Cell Infiltration", x="", y = "Score",size=20) + stat_compare_means(data = violin_dat, method = "wilcox.test", #, mapping = aes(group = Group), label ="p.signif", hide.ns = F) + theme_bw()+# ylim(-0.3,1) + theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=16), # axis.text.x=element_text(angle=90,hjust=1,colour="black",face="bold",size=12), #x45,1(hjust = 1),14 axis.text.y=element_text(hjust=0.5,colour="black",size=12), #y,,plain axis.title.x=element_text(size=16,face="bold"),#x axis.title.y=element_text(size=14,face="bold"), #y legend.text=element_text(face="bold", hjust = 0.5,colour="black", size=11), # legend.title=element_text(face="bold", colour="black", size=11),# text=element_text(family = 'Times'), #legend.justification=c(-0.1,1.2), #。##(1,1)1(),1。 #legend.position=c(0, 1.04), #legend.position=c(0,1),(1,1)。 panel.grid.major = element_blank(), # panel.grid.minor = element_blank()) # boxplot_diff_TIICs pdf("02.ssgsea_Box.pdf", width = 12, height = 7,family = 'Times') print(boxplot_diff_TIICs) dev.off() png("02.ssgsea_Box.png", width = 12, height = 6,units = 'in',res = 300,family = 'Times') boxplot_diff_TIICs dev.off() rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (! dir.exists("./16_IC50")){ dir.create("./16_IC50") } setwd("./16_IC50") #.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths())) library(pRRophetic) library(ggplot2) set.seed(12345) drug<-read.table(file = '/data/nas3//pipeline/IC50_drug/drugs.txt',sep='\t',header=F) fpkm.log <- read.csv('../00_rawdata/02.expr_tpm_TCGA.csv', row.names = 1,check.names = F) risk<-read.csv('../07_Train_estimate/01.train_OS_dat.csv') #colnames(risk)[1] <- 'id' fpkm.log <- fpkm.log[,risk$id] identical(colnames(fpkm.log), risk$id) dat <- fpkm.log ###IC50 a <- data.frame(row.names=colnames(fpkm.log),colnames(fpkm.log)) # cnt<-1 while (cnt < 139) { print(cnt) predictedPtype <- pRRopheticPredict(as.matrix(dat), drug[cnt,],selection=1) Tipifarnib<-data.frame(predictedPtype) colnames(Tipifarnib)<-drug[cnt,] a<-cbind(a,Tipifarnib) cnt = cnt + 1 } save.image('IC50.RData') write.csv(a,'01.IC50.csv') load('IC50.RData') #----------------------------------------------------------------- risk<-read.csv('../07_Train_estimate/01.train_OS_dat.csv') risk1 <- data.frame(riskScore = risk$riskScore) rownames(risk1) <- risk$id risk <- risk1 median(risk$riskScore) #-0.1099384 #IC50.diff <- read.csv('./01.IC50_diff_wilcox.csv') IC50 <- read.csv('./01.IC50.csv',row.names = 1) IC50 <- IC50[,-1] #IC50 <- t(IC50) %>% as.data.frame() IC50 <- IC50[rownames(risk),] library(tidyr) library(psych) y <-IC50 x <-risk identical(rownames(x),rownames(y)) library(psych) d <- corr.test(x,y,use="complete",method = 'spearman') r <- t(data.frame(d$r)) p <- t(data.frame(d$p)) identical(rownames(r),rownames(p)) dat.r <- data.frame(r) dat.r$drug <- rownames(dat.r) dat.r <- gather(dat.r,'riskScore','r',-c(drug)) dat.p <- data.frame(p) dat.p$drug <- rownames(dat.p) dat.p <- gather(dat.p,'riskScore','p',-c(drug)) corrp <- (cbind(dat.r, dat.p))[,c("drug","riskScore",'r',"p")] corrp$change <- as.factor(ifelse(corrp$p < 0.05 & abs(corrp$r) > 0.3, ifelse(corrp$r > 0.3, 'Positive correlation', 'Negative correlation'), 'NOT')) write.csv(corrp,file="01.risk_IC50_cor.csv",quote=F) corrp.0.05.0.3 <- corrp %>% as.data.frame() corrp.0.05.0.3 <- corrp.0.05.0.3[corrp.0.05.0.3$p<0.05,] corrp.0.05.0.3 <- corrp.0.05.0.3[abs(corrp.0.05.0.3$r)>0.3,] table(corrp.0.05.0.3$change) write.csv(corrp.0.05.0.3,file="01.cor_0.05_0.3.csv",quote=F) #50 top <- rbind(head(corrp.0.05.0.3[order(corrp.0.05.0.3$r,decreasing = T),],5), head(corrp.0.05.0.3[order(corrp.0.05.0.3$r,decreasing = F),],5) ) # #devtools::install_github("kongdd/Ipaper") library(ggplot2) library(ggthemes) library(RColorBrewer) library(Ipaper) library(scales) library(ggrepel) volcano_plot <- ggplot(data = corrp, aes(x = r, #aesggplot()xy, y = -log10(p), color = change)) + scale_color_manual(values = c("#377EB8", "darkgray","#F0027F")) + scale_x_continuous(breaks = c(-1,-0.3,0,0.3,1)) + scale_y_continuous(#trans = "log1p", #y trans = "identity", breaks = c(0,1,5,10,20,50,100,200)) + geom_point(size = 1.2, alpha = 0.4, na.rm=T) + # #theme_bw(base_size = 12, base_family = "Times") + #,。 theme_classic()+ geom_vline(xintercept = c(-0.3,0.3), # lty = 4, col = "darkgray", lwd = 0.6)+ geom_hline(yintercept = -log10(0.05), lty = 4, col = "darkgray", lwd = 0.6)+ theme(legend.position = "right", # panel.grid = element_blank(), # legend.title = element_text(face = "bold",color = "black",size = 15), legend.title=element_blank(), legend.text = element_text(face = "bold",color = "black",size = 13), plot.title = element_text(hjust = 0.5), axis.text.x = element_text(face = "bold",color = "black",size = 15), axis.text.y = element_text(face = "bold",color = "black",size = 15), axis.title.x = element_text(face = "bold",color = "black",size = 15), axis.title.y = element_text(face = "bold",color = "black",size = 15), text = element_text(family = "Times")) + #geom_label_repel( # geom_text_repel( data = top, aes(label = top$drug), max.overlaps = 30, size = 4, box.padding = unit(0.5, "lines"), min.segment.length = 0, point.padding = unit(0.8, "lines"), force= 2, # segment.color = "black", show.legend = FALSE, family = "Times" )+ annotate("text", x = 0.3 , y = 0,label = "total = 138 drugs",colour="black",family='Times') + labs(x = "Correlation coefficient(risk-IC50)", y = "-log10 (p value)", legend = '') volcano_plot pdf('01.risk_IC50_cor.pdf', w=8,h=6,family='Times') volcano_plot dev.off() png('01.risk_IC50_cor.png', w=8,h=6,family='Times', units='in',res=600) volcano_plot dev.off() #------------------ corrp.0.05.0.3 <- read.csv('./01.cor_0.05_0.3.csv') IC50 <- read.csv('./01.IC50.csv',row.names = 1) IC50 <- IC50[,-1] #IC50 <- t(IC50) %>% as.data.frame() IC50 <- IC50[,corrp.0.05.0.3$drug] IC50$sample <- rownames(IC50) risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv') risk <- risk[,c('id','risk')] colnames(risk) <- c('sample','risk') data <- merge(risk, IC50, by = "sample") data2 <- gather(data, key = drug, value = IC50, -c('sample', 'risk')) data2 <- data.frame(data2) ## library(ggplot2) library(tidyverse) library(ggpubr) library(ggsci) library(rstatix) stat.test<-data2%>% group_by(drug)%>% wilcox_test(IC50 ~ risk)%>% adjust_pvalue(method = 'fdr') stat.test$p1<-ifelse(stat.test$p<0.0001,"****", ifelse(stat.test$p<0.001,"***", ifelse(stat.test$p<0.01,"**", ifelse(stat.test$p<0.05,"*",'ns')))) write.csv(stat.test,'02.IC50_diff_wilcox.csv') top <- head(stat.test[order(stat.test$p,decreasing = F),],10) top.ic50 <- data2[data2$drug %in% top$drug,] length(unique(top.ic50$drug)) #15 unique(top.ic50$drug) #------------ library(ggplot2) library(ggpubr) top.ic50$risk <- factor(top.ic50$risk,levels = c('Low','High')) top <- top[order(top$drug,decreasing = F),] top$drug <- factor(top$drug,levels = top$drug) exp_plot <- ggplot(top.ic50,aes(x = risk, y = IC50, color = risk)) + geom_violin(trim=F,color='black',aes(fill=risk)) + #, “color=”(#white,) #"trim"TRUE(),。FALSE,。 stat_boxplot(geom="errorbar", width=0.1, color="black", position = position_dodge(0.9)) + geom_boxplot(width=0.2, color="black", position=position_dodge(0.9), outlier.shape = NA)+ #,width=0.1 scale_fill_manual(values= c("#0373c9","#eac001"), name = "risk")+ labs(title="", x="", y = "",size=20) + scale_colour_manual(values = c("#0373c9","#eac001"))+ # scale_color_brewer(palette = 'Set2')+ stat_compare_means(aes(group = risk,label = "p.format"), method = "wilcox.test",label = 'p.signif',label.x = 1.4)+ # stat_pvalue_manual(top, # y.position = c(rep(1,15)), # size = 3.2, # family = "Times", # label = "p1", #w # #parse = T, # face = "bold")+ theme_bw()+ theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=8), axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), axis.title.x=element_text(size=16,face="bold"), axis.title.y=element_text(size=16,face="bold"), legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), legend.title = element_text(face = "bold", size = 12), legend.position = "top", panel.grid.major = element_blank(), panel.grid.minor = element_blank(), text=element_text(family='Times'))+ facet_wrap(~drug,scales = "free",nrow = 2) exp_plot ggsave(filename = '02.IC50_diff.pdf',exp_plot,w=8,h=6) ggsave(filename = '02.IC50_diff.png',exp_plot,w=8,h=6,dpi = 600) rm(list = ls()) setwd("/data/nas3///29_YQ953-11/") if (!dir.exists("17_Expression_val")){dir.create("17_Expression_val")} setwd("17_Expression_val") library(ggpubr) gene <- read.csv('../06_Lasso/03.lasso_genes.csv') ## 1-------- dat<-read.csv('../00_rawdata/02.expr_tpm_TCGA.csv',row.names = 1) colnames(dat)<-gsub('.','-',colnames(dat),fixed = T) group <- read.csv('../00_rawdata/03.group_TCGA.csv') table(group$group) #dat <- dat[group$X] library(tidyr) hub_exp<-dat[gene$symbol,] hub_exp2<-hub_exp hub_exp2$Symbol<-rownames(hub_exp2) hub_exp2<-gather(hub_exp2, key = sample, value = expr, -c("Symbol")) control.sample <- group$X[which(group$group=='Normal')] ## hub_exp2$Group<-ifelse(hub_exp2$sample%in%control.sample,'Normal','Tumor') ## library(ggplot2) library(tidyverse) library(ggpubr) library(ggsci) library(rstatix) stat.test<-hub_exp2%>% group_by(Symbol)%>% wilcox_test(expr ~ Group)%>% adjust_pvalue(method = 'fdr') diff <- read.csv('../01_DEG_TCGA/02.DEG1_sig.csv',row.names = 1) df<-diff[stat.test$Symbol,] stat.test$p<-df$pvalue%>%as.numeric()%>%round(digits = 3) stat.test$p<-ifelse(stat.test$p<0.001,"***", ifelse(stat.test$p<0.01,"**", ifelse(stat.test$p<0.05,"*",'ns'))) library(ggplot2) library(ggpubr) hub_exp2$Group <- factor(hub_exp2$Group,levels = c('Normal','Tumor')) exp_plot <- ggplot(hub_exp2,aes(x = Group, y = expr, color = Group)) + geom_violin(trim=F,color="black",aes(fill=Group)) + #, “color=”(#white,) #"trim"TRUE(),。FALSE,。 stat_boxplot(geom="errorbar", width=0.1, position = position_dodge(0.9)) + geom_boxplot(width=0.4, position=position_dodge(0.9), outlier.shape = NA)+ #,width=0.1 scale_fill_manual(values= c("#FF7F00","#e5486e"), name = "Group")+ labs(title="Expression in Train", x="", y = "",size=20) + scale_colour_manual(values = c("#FF7F00","#e5486e"))+ stat_pvalue_manual(stat.test, #y.position = c(rep(1,5)), y.position = c(15,18,15,7.5), size = 3.2, family = "Times", label = "p", #parse = T, face = "bold")+ theme_bw()+ theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), axis.text.x=element_text(angle=45,hjust=1,colour="black",face="bold",size=12), axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), axis.title.x=element_text(size=16,face="bold"), axis.title.y=element_text(size=16,face="bold"), legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), legend.title = element_text(face = "bold", size = 12), legend.position = "top", panel.grid.major = element_blank(), panel.grid.minor = element_blank())+ facet_wrap(~Symbol,scales = "free",nrow = 1) exp_plot ggsave(filename = '01.expression.pdf',exp_plot,w=8,h=4,dpi = 300) ggsave(filename = '01.expression.png',exp_plot,w=8,h=4,units = 'in',dpi = 300) ###GEO #dat<-read.csv('../00_rawdata/dat(GSE30219).csv',row.names = 1) #group <- read.csv('../00_rawdata/group(GSE30219).csv') #table(group$group) # #hub_exp<-dat[gene$gene,] #hub_exp2<-hub_exp #hub_exp2$Symbol<-rownames(hub_exp2) #hub_exp2<-gather(hub_exp2, # key = sample, # value = expr, # -c("Symbol")) # #control.sample <- group$sample[which(group$group=='Normal')] ### #hub_exp2$Group<-ifelse(hub_exp2$sample%in%control.sample,'control','LUAD') ### #library(ggplot2) #library(tidyverse) #library(ggpubr) #library(ggsci) #library(rstatix) #stat.test<-hub_exp2%>% # group_by(Symbol)%>% # wilcox_test(expr ~ Group)%>% # adjust_pvalue(method = 'fdr') ## diff <- read.csv('../01_DEG_TCGA/02.DEG_sig.csv',row.names = 1) ## df<-diff[stat.test$Symbol,] ##stat.test$p<-df$pvalue%>%as.numeric()%>%round(digits = 3) #stat.test$p<-ifelse(stat.test$p<0.001,"***", # ifelse(stat.test$p<0.01,"**", # ifelse(stat.test$p<0.05,"*",'ns'))) #library(ggplot2) #library(ggpubr) #hub_exp2$Group <- factor(hub_exp2$Group,levels = c('control','LUAD')) #exp_plot <- ggplot(hub_exp2,aes(x = Group, y = expr, color = Group)) + # geom_violin(trim=F,color="black",aes(fill=Group)) + #, “color=”(#white,) # #"trim"TRUE(),。FALSE,。 # stat_boxplot(geom="errorbar", # width=0.1, # position = position_dodge(0.9)) + # geom_boxplot(width=0.4, # position=position_dodge(0.9), # outlier.shape = NA)+ #,width=0.1 # scale_fill_manual(values= c("#FF7F00","#e5486e"), name = "Group")+ # labs(title="Expression in Verify", x="", y = "",size=20) + # scale_colour_manual(values = c("#FF7F00","#e5486e"))+ # stat_pvalue_manual(stat.test, # #y.position = c(rep(1,5)), # y.position = c(11,11,12,10), # size = 3.2, # family = "Times", # label = "p", # #parse = T, # face = "bold")+ # theme_bw()+ # theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15), # axis.text.x=element_text(angle=45,hjust=1,colour="black",face="bold",size=12), # axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), # axis.title.x=element_text(size=16,face="bold"), # axis.title.y=element_text(size=16,face="bold"), # legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12), # legend.title = element_text(face = "bold", size = 12), # legend.position = "top", # panel.grid.major = element_blank(), # panel.grid.minor = element_blank())+ # facet_wrap(~Symbol,scales = "free",nrow = 1) #exp_plot #ggsave(filename = '02.expression-verify.pdf',exp_plot,w=8,h=4,dpi = 300) #ggsave(filename = '02.expression-verify.png',exp_plot,w=8,h=4,units = 'in',dpi = 300)