#CIMP in gastric cancer rm(list = ls()) options( stringsAsFactors = F ) setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad") ###Processing methylation data Data<-read_tsv("./TCGA-STAD.methylation450.tsv") ###Load methylation data load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/Illumina450k_annotation.RData") ###load Illumina450k_annotation class(Data) View(head(Data)) Data<-as.data.frame(Data) class(Data) View(head(Data)) dim(Data) # 485577 398 row.names(Data)<-Data$`Composite Element REF` View(head(row.names(Data))) Data<-Data[,-1] View(head(Data)) dim(Data) #485577 397 Data1 <- na.omit(Data) ##Remove missing values dim(Data1) sum(duplicated(rownames(Data1))) #0 matData1 = Data1 load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/Illumina450k_annotation.RData") keep <- !(row.names(matData1) %in% ann450k2$Name[ann450k2$chr %in% c("chrX","chrY")]) table(keep) # FALSE TRUE #9078 364255 matData1 <- matData1[keep,] ###Remove probes from sex chromosomes library(minfi) matData1<-as.matrix(matData1) View(head(matData1)) grset=makeGenomicRatioSetFromMatrix(matData1,what="Beta") class(grset) mSetSqFlt <- dropLociWithSnps(grset) ##remove probes with SNPs at CpG site dim(mSetSqFlt) library(methylationArrayAnalysis) dataDirectory <- system.file("extdata", package = "methylationArrayAnalysis") xReactiveProbes <- read.csv(file=paste(dataDirectory, "48639-non-specific-probes-Illumina450k.csv", sep="/"), stringsAsFactors=FALSE) keep <- !(featureNames(mSetSqFlt) %in% xReactiveProbes$TargetID) ###exclude cross reactive probes table(keep) # FALSE TRUE #14161 348497 mSetSqFlt <- mSetSqFlt[keep,] dim(mSetSqFlt) bVals <- getBeta(mSetSqFlt) ####Get beta value dim(bVals) View(bVals) save(bVals,file = "bVals.Rdata") BiocManager::install("lumi") require(lumi) mVals<-beta2m(bVals) ####Get M value dim(mVals) #348497 397 View(head(mVals)) save(mVals,file = "mVals.Rdata") ###Get Clinical Information Matrix metadata <- data.frame(colnames(mVals)) for (i in 1:length(metadata[,1])) { num <- as.numeric(substring(metadata[i,1],14,15)) if (num %in% seq(1,9)) {metadata[i,2] <- "Tumor"} if (num %in% seq(10,29)) {metadata[i,2] <- "Normal"} } names(metadata) <- c("TCGA_id","sample") View(metadata) metadata$sample <- as.factor(metadata$sample) require(dplyr) metadata %>% dplyr::group_by(sample) %>% summarise(n()) save(metadata,file = "metadata.Rdata") sum(metadata$TCGA_id == colnames(mVals)) View(head(mVals)) Normal_barcode <- colnames(mVals)[metadata$sample =="Normal"] mVals_df <- as.data.frame(mVals) Normal_mVals <- mVals_df %>% dplyr::select(Normal_barcode) dim(Normal_mVals) save(Normal_mVals,file = "Normal_mVals.Rdata") Tumor_barcode <- colnames(mVals)[metadata$sample =="Tumor"] Tumor_mVals <- mVals_df %>% dplyr::select(Tumor_barcode) dim(Tumor_mVals) colnames(Tumor_mVals) <- substr(colnames(Tumor_mVals),1,15) Tumor_mVals = Tumor_mVals[,!duplicated(colnames(Tumor_mVals))] dim(Tumor_mVals) Tumor_mVals <- Tumor_mVals[,substr(colnames(Tumor_mVals),14,15) == "01"] dim(Tumor_mVals) sum(metadata$TCGA_id == colnames(bVals)) View(metadata) View(bVals) Normal_barcode <- colnames(bVals)[metadata$sample =="Normal"] bVals_df <- as.data.frame(bVals) Normal_bVals <- bVals_df %>%dplyr::select(Normal_barcode) ###Get the beta value of normal tissue dim(Normal_bVals) save(Normal_bVals,file = "Normal_bVals.Rdata") Tumor_barcode <- colnames(bVals)[metadata$sample =="Tumor"] ###Get the beta value of tumor tissue Tumor_bVals <- bVals_df %>% dplyr::select(Tumor_barcode) dim(Tumor_bVals) colnames(Tumor_bVals) <- substr(colnames(Tumor_bVals),1,15) ###Reduce duplication in tumor samples Tumor_bVals = Tumor_bVals[,!duplicated(colnames(Tumor_bVals))] dim(Tumor_bVals) Tumor_bVals <- Tumor_bVals[,substr(colnames(Tumor_bVals),14,15) == "01"] dim(Tumor_bVals) ###Unsupervised clustering Tumor_bVals$SD <- apply(Tumor_bVals,1,sd) ###Screen for methylation sites with variability greater than 0.2 sum(Tumor_bVals$SD>0.20) inclusion_Tumor <- (Tumor_bVals$SD>0.20) View(inclusion_Tumor) final_Tumor <- row.names(Tumor_bVals)[inclusion_Tumor] length(final_Tumor) Normal_bVals$average <- apply(Normal_bVals,1,mean) #relatively low β-values in normal samples inclusion_Normal <- (Normal_bVals$average < 0.05) final_Normal <- row.names(Normal_bVals)[inclusion_Normal ] length(final_Normal ) clust_probe <- intersect(final_Tumor,final_Normal) ##Screening probes length(clust_probe) save(clust_probe,file = "clust_probe.Rdata") ann450k2_CIMP <- ann450k2 %>% dplyr::filter(Name %in% clust_probe) dim(ann450k2_CIMP) save(ann450k2_CIMP,file = "ann450k2_CIMP.Rdata") load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/clust_probe.Rdata") length(clust_probe) Tumor_mVals3 <- Tumor_mVals[row.names(Tumor_mVals) %in% clust_probe,] dim(Tumor_mVals3) Tumor_mVals4 <- as.matrix(Tumor_mVals3) require(ConsensusClusterPlus) set.seed(2019) results<-ConsensusClusterPlus(Tumor_mVals4,maxK=10,pItem=0.8, ####ConsensusCluster clusterAlg = 'km',distance='euclidean', plot="pdf",seed= 200000) View(results) clust_result<- results[[3]][["consensusClass"]] clust_result <- as.data.frame(clust_result) View(clust_result) clust_result$clust_result[clust_result$clust_result ==1] <- "CIMP-L" clust_result$clust_result[clust_result$clust_result ==2] <- "CIMP-M" clust_result$clust_result[clust_result$clust_result ==3] <- "CIMP-H" clust_result$patient_ID<-row.names(clust_result) clust_result<-clust_result[,c(2,1)] names(clust_result)[2]<-"cluster_result" View(clust_result) write.csv(clust_result,file = "clust_result.csv",row.names = F) save(clust_result,file = "clust_result.Rdata") ###read clinical information stad_tcga_clinical_data <- read.delim("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/stad_tcga_clinical_data.tsv") stad_tcga_clinical_data<-stad_tcga_clinical_data[,c(3,4,5,6,7,9,12,23,24,28,31,32,35,44,45,53,54,61,64,67,73)] names(stad_tcga_clinical_data) View(stad_tcga_clinical_data) names(stad_tcga_clinical_data)<-c("ID","Age","Pathologic_M","Pathologic_N","Pathologic_Stage", "Pathologic_T","Type","DFS_time","DFS","Family_History", "Grade","Type.name","H.P","Positive.N","Examined.N","OS_time", "OS","Primary.site","Race","Margin","Gender") save(stad_tcga_clinical_data,file = "stad_tcga_clinical_data.Rdata") TCGA_clinical <- stad_tcga_clinical_data final1 <- inner_join(TCGA_clinical,clust_result) dim(final1) View(final1) final1$OS[final1$OS == "DECEASED"] <- 1 final1$OS[final1$OS == "LIVING"] <- 0 final1$OS <- as.numeric(final1$OS) table(final1$DFS) final1$DFS[final1$DFS == "Recurred/Progressed"] <- 1 final1$DFS[final1$DFS == "DiseaseFree"] <- 0 final1$DFS <- as.numeric(final1$DFS) ### draw the Heatmap clust_result1 <- clust_result[order(clust_result$clust_result),] View(clust_result1) Tumor_bVals5 <- Tumor_bVals3 %>% dplyr::select(clust_result1$ID) load("./final10.Rdata") lowname<-final10$ID[final10$clust_result=="CIMP-L"] middlename<-final10$ID[final10$clust_result=="CIMP-M"] highname<-final10$ID[final10$clust_result=="CIMP-H"] allname<-c(highname,lowname,middlename) Tumor_bVals5<-select(Tumor_bVals5,c(lowname,middlename,highname)) sum(colnames(Tumor_bVals5)==allname2) library(ComplexHeatmap) library(matrixStats) library(circlize) library(RColorBrewer) require(ggplot2) require(ggpubr) library(tableone) library(ggtern) ha = HeatmapAnnotation(Cluster = final10$clust_result, Age = final10$Age, Gender = final10$Gender, Grade = final10$Grade, Pathologic_M = final10$Pathologic_M, Pathologic_N = final10$Pathologic_N, Pathologic_T = final10$Pathologic_T, Stage = final10$Pathologic_Stage, Pathologic_Location = final10$Primary.site, DEAD = final10$OS, HP.infection = final10$H.P, Reccurence = final10$DFS, Family_History = final10$Family_History, MSI.status=final10$MSI.status, col = list(Cluster = structure(names = c("CIMP-L", "CIMP-M", "CIMP-H"), c("#4DAF4A","#377EB8","#E41A1C")), Age = structure(names = c("<60 years", ">60 years"),c("#8DD3C7","#FFFFB3")), Gender = structure(names = c("Female", "Male"),c("#BEBADA","#FB8072")), Grade = structure(names = c("G1/G2", "G3/G4"),c("#80B1D3","#FDB462")), Pathologic_M = structure(names = c("M0", "M1/MX"), c("#B3DE69","#FCCDE5")), Pathologic_N = structure(names = c("N0", "N1/2/3"),c("#D9D9D9","#BC80BD")), Pathologic_T = structure(names = c("T1+T2", "T3+T4"),c("#CCEBC5","#FFED6F")), Stage = structure(names = c("Stage I","Stage II", "Stage III","Stage IV"),brewer.pal(4, "Paired")), Pathologic_Location= structure(names = c("Antrum/Distal", "Cardia/Proximal","Fundus/Body"),brewer.pal(3, "Dark2")), DEAD = structure(names = c("0", "1"),c("white","#66C2A5")), HP.infection= structure(names = c("No", "Yes"),c("white","#FC8D62")), Reccurence = structure(names = c("0", "1"),c("white","#8DA0CB")), Family_History= structure(names = c("NO", "YES"),c("white","#E78AC3")), MSI.status=structure(names = c("MSI", "MSS"),c("white","#A6D854")) ), na_col = "grey", show_legend = c(TRUE, TRUE, TRUE,TRUE,TRUE,TRUE,TRUE,TRUE,TRUE,T,T,T,T,T), annotation_height = unit(rep(4,13), "mm"), annotation_legend_param = list( Cluster = list(title = "Cluster"), Stage = list(title = "Stage"), Gender = list(title = "Gender"), Grade = list(title = "Grade"), Pathologic_M = list(title = "Pathologic_M"), Pathologic_N = list(title = "Pathologic_N"), Pathologic_T = list(title = "Pathologic_T"), Pathologic_Location = list(title = "Pathologic_Location") )) col_fun = colorRamp2(c(0, 0.5, 1), c("#377EB8", "white", "#E41A1C")) ht_list= Heatmap(as.matrix(Tumor_bVals5), col = col_fun, name = "Methylation", clustering_distance_rows = "euclidean", row_dend_reorder = TRUE, cluster_columns = FALSE, cluster_rows = FALSE, show_row_dend = FALSE, show_column_dend = FALSE, show_row_names = FALSE, show_column_names = FALSE, bottom_annotation = ha, column_title = qq("STAD samples (n = @{ncol(Tumor_bVals5)})"), column_title_gp = gpar(font = 2), row_title_gp = gpar(col = "#FFFFFF00"))+ Heatmap(Normal_bVals1, col = col_fun, cluster_rows = F,show_row_names = FALSE, show_column_names = FALSE, show_column_dend = FALSE, column_title = "Normal", column_title_gp = gpar(font = 2), show_heatmap_legend = FALSE, width = unit(1, "cm"))+ Heatmap(ann450k2_CIMP$Relation_to_Island, name = "Relation to CGI", cluster_rows = F, show_row_names = FALSE, show_column_names = FALSE, width = unit(5, "mm"), col = structure(names = c("Island", "N_Shelf", "N_Shore", "OpenSea","S_Shelf","S_Shore"), c("#377EB8","#4DAF4A","#984EA3","#FF7F00","#E41A1C","#56B4E9")))+ Heatmap(ann450k2_CIMP$island_location, name = "Relation to TSS", cluster_rows = F, show_row_names = FALSE, show_column_names = FALSE, width = unit(5, "mm"),structure(names = c("Body","TSS200","5'UTR", "TSS1500","Intergenic","1stExon","3'UTR"), brewer.pal(7, "Set1"))) draw(ht_list, annotation_legend_side = "left", heatmap_legend_side = "left") annotation_titles = c(Cluster = "Cluster", Age = "Age", Gender = "Gender", Grade = "Grade", Pathologic_M = "Pathologic_M", Pathologic_N = "Pathologic_N", Pathologic_T = "Pathologic_T", Stage = "Stage", Pathologic_Location = 'Pathologic_Location', DEAD = "DEAD", HP.infection = "HP.infection", Reccurence = "Reccurence", Family_History = "Family_History", MSI.status="MSI.status" ) for(an in names(annotation_titles)) { decorate_annotation(an, { grid.text(annotation_titles[an], unit(-2, "mm"), just = "right",gp = gpar(font = 2,cex=0.8)) grid.rect(gp = gpar(fill = NA, col = "black")) }) } ###draw the OS curve require(survival) fit <- survfit(Surv(OS_time, OS) ~ clust_result, data = final1) require(survminer) ggsurv <- ggsurvplot( fit, # survfit object with calculated statistics. data = final1, #palette= c("#E41A1C","#4DAF4A"), # data used to fit survival curves. risk.table = TRUE, # show risk table. pval = TRUE, # show p-value of log-rank test. conf.int = F, xlim = c(0,60), # present narrower X axis, but not affect # survival estimates. xlab = "Time in months", # customize X axis label. ylab = "Overall Survival Rate", break.time.by = 6, # break X axis in time intervals by 500. ggtheme = theme_classic2(), # customize plot and risk table with a theme. risk.table.y.text.col = T,# colour risk table text annotations. risk.table.height = 0.25, # the height of the risk table risk.table.y.text = F,# show bars instead of names in text annotations # in legend of risk table. ncensor.plot = F, conf.int.style = "step", # add the median survival pointer. legend.labs = c("CIMP-H","CIMP-L","CIMP-M")# change legend labels. ) ggsurv$plot <- ggsurv$plot + labs(title = "TCGA Gastric Cancer Cohort") + theme(axis.title.x = element_blank())+ theme(plot.title = element_text(hjust = 0.5),)+ theme(legend.text = element_text(size = 12,face = 'bold')) ggsurv$plot <- ggpar( ggsurv$plot, font.title = c(16, "bold", "darkblue"), font.y = c(14, "bold.italic", "darkred"), font.x = c(14, "bold.italic", "blue") , font.xtickslab = c(12, "plain", "darkgreen","bold"), font.ytickslab = c(10,"darkgreen","bold"), legend = "top",legend.title = "", ) ggsurv$table <- ggpar( ggsurv$table, font.title = c(13, "bold.italic", "#E41A1C"))+ theme(axis.line = element_blank(),axis.ticks = element_blank(), axis.title = element_blank(),axis.text.x = element_blank()) ggsurv$table <- ggsurv$table + theme_cleantable() ggsurv ##draw the PFS curve fit <- survfit(Surv(DFS_time, DFS) ~ clust_result, data = final2) ggsurv <- ggsurvplot( fit, # survfit object with calculated statistics. data = final1, #palette= c("#E41A1C","#4DAF4A"), # data used to fit survival curves. risk.table = TRUE, # show risk table. pval = TRUE, # show p-value of log-rank test. conf.int = F, xlim = c(0,60), # present narrower X axis, but not affect # survival estimates. xlab = "Time in months", # customize X axis label. ylab = "Recurrence-Free Survival Rate", break.time.by = 6, # break X axis in time intervals by 500. ggtheme = theme_classic2(), # customize plot and risk table with a theme. risk.table.y.text.col = T,# colour risk table text annotations. risk.table.height = 0.25, # the height of the risk table risk.table.y.text = F,# show bars instead of names in text annotations # in legend of risk table. ncensor.plot = F, conf.int.style = "step", # add the median survival pointer. legend.labs = c("CIMP-H","CIMP-L")# change legend labels. ) ggsurv$plot <- ggsurv$plot + labs(title = "TCGA Stomach Cancer cohort") + theme(axis.title.x = element_blank())+ theme(plot.title = element_text(hjust = 0.5),)+ theme(legend.text = element_text(size = 12,face = 'bold')) ggsurv$plot <- ggpar( ggsurv$plot, font.title = c(16, "bold", "darkblue"), font.y = c(14, "bold.italic", "darkred"), font.x = c(14, "bold.italic", "blue") , font.xtickslab = c(12, "plain", "darkgreen","bold"), font.ytickslab = c(10,"darkgreen","bold"), legend = "top",legend.title = "", ) ggsurv$table <- ggpar( ggsurv$table, font.title = c(13, "bold.italic", "#E41A1C"))+ theme(axis.line = element_blank(),axis.ticks = element_blank(), axis.title = element_blank(),axis.text.x = element_blank()) ggsurv$table <- ggsurv$table + theme_cleantable() ggsurv ####The abundance of tumor-infiltrating immune cells setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad") immune<-read.table("./Immnue-cibersort.csv",header = T,sep=",",stringsAsFactors = F) View(immune) names(immune)[1]<-"ID" load("./final7.Rdata") View(final7) require(dplyr) final8<-dplyr::select(final7,ID,clust_result) View(final8) immune_score<-inner_join(final8,immune) View(immune_score) immune_score<-immune_score[,c(-25,-26,-27)] summary(immune_score) names(immune_score) class(immune_score$clust_result) immune_score$clust_result<-factor(immune_score$clust_result,levels=c("CIMP-L","CIMP-M","CIMP-H")) row.names(immune_score)<-immune_score$ID names(immune_score)[11] <- "T.cells.regulatory.Tregs" CIMP_H <- immune_score$ID[immune_score$clust_result =="CIMP-H"] CIMP_M <- immune_score$ID[immune_score$clust_result =="CIMP-M"] CIMP_L <- immune_score$ID[immune_score$clust_result =="CIMP-L"] length(CIMP_H) #68 length(CIMP_M) #112 length(CIMP_L) #191 View(immune_score) class(immune_score) im1<-immune_score[,-1] view(im1) im1$clust_result <- factor(im1$clust_result,levels = c("CIMP-L","CIMP-M","CIMP-H")) im1$ID<-row.names(im1) im1<-dplyr::select(im1,ID,everything()) require(reshape2) im3<- melt(im1[,-1],id.vars = c("clust_result")) view(im3) class(im3$value) im3$value<-im3$value*100 pvalue_im3 <- im3 %>% group_by(variable) %>% summarize(p.value = kruskal.test(value ~ clust_result)$p.value) View(pvalue_im3) sig <- rep("",22) for (i in seq(22)) { if (pvalue_im3$p.value[i] < 0.001) sig[i] <- "***" else if (pvalue_im3$p.value[i] < 0.01) sig[i] <- "**" else if (pvalue_im3$p.value[i] < 0.05) sig[i] <- "*" } #pdf("vioplot.pdf", width=15, height=7) ggplot(im3)+ geom_boxplot(aes(x=variable,y=value,fill=clust_result),width=0.6,position = position_dodge(0.6),outlier.color = NA, outlier.size = 0.01)+ scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"), breaks=c("CIMP-L","CIMP-M","CIMP-H"), labels=c("CIMP-L","CIMP-M","CIMP-H"))+ # geom_smooth(data=c3,aes(x=variable2,y=value,color=group),size=1,linetype = "dashed")+ #stat_summary(fun.y = mean, geom = "errorbar", aes(x=variable,y=value,ymax = ..y.., ymin = ..y..,color=group),width = .75, linetype = "dashed") xlab("")+ ylab("Fraction of immune cells (%)")+ theme_bw()+ coord_cartesian(ylim = c(0,45))+ theme( legend.position = "top", legend.background=element_blank(), legend.key = element_blank(), legend.margin=margin(0,0,0,0,"mm"), axis.text.x = element_text(angle = 60,hjust=1,size=12), axis.line.x = element_line(size = 0.5, colour = "black"), axis.title.y=element_text(size = 14), axis.line.y = element_line(size = 0.5, colour = "black"), legend.text=element_text(size=rel(1.1)), legend.title=element_blank() )+ guides(color=T)+ annotate("text",x=seq(1,22),y = 45,label=sig) #dev.off() ###draw the mutation profiles setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad") library(maftools) #BiocManager::install("maftools") stad.mutect2.maf <- GDCquery_Maf("STAD", pipelines = "mutect2") class(stad.mutect2.maf) save(stad.mutect2.maf,file = "stad.mutect2.maf.Rdata") load("./stad.mutect2.maf.Rdata") write.csv(stad.mutect2.maf,file = "Mutation profile of TCGA cohort.csv",row.names = F) View(head(stad.mutect2.maf)) getwd() load("./final5.Rdata") load("./stad.mutect2.maf.Rdata") #names(final5)[4]<-"Tumor_Sample_Barcode" #names(final5)[17]<-"OS.time" options(stringsAsFactors = F) require(dplyr) names(final5) final_maf<-dplyr::select(final5,ID,clust_result,OS_time,OS) View(final_maf) names(final_maf)[1]<-"Tumor_Sample_Barcode" names(final_maf)[3]<-"OS.time" require(stringr) final_maf$Tumor_Sample_Barcode<-str_sub(final_maf$Tumor_Sample_Barcode,1,12) save(final_maf,file = "final_maf.Rdata") maf = read.maf(maf = stad.mutect2.maf ,clinicalData = final_maf,isTCGA = T) stad.maf<- subsetMaf(maf =maf, tsb = final_maf$Tumor_Sample_Barcode, mafObj = TRUE) oncoplot(maf = stad.maf, top = 30, fontSize =0.5 ,showTumorSampleBarcodes = F ) #getSampleSummary(stad.maf) #getClinicalData(stad.maf) #View(stad.maf@data) #id=as.character(unique(stad.maf@data$Tumor_Sample_Barcode)) View(Tumor_Sample_Barcode) require(dplyr) final5<-final4 require(stringr) final5$match_id<-str_sub(final5$ID,1,12) View(final5) final5<-inner_join(Tumor_Sample_Barcode,final5,by="match_id") final5<-final5[,-2] final5 <- final5 %>% dplyr::rename(Overall_Survival_Status = OS, time = OS_time) write.table(final5,'stad_clinical_final5.tsv',sep = '\t',row.names = F,quote = F) maf_clinical <- subsetMaf(maf = stad.maf, tsb = final5$Tumor_Sample_Barcode, mafObj = TRUE) #### plotmafSummary(maf = stad.maf, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE) col = RColorBrewer::brewer.pal(n =8, name = 'Paired') names(col) = c("Missense_Mutation","Frame_Shift_Del", 'Nonsense_Mutation',"Splice_Site", "In_Frame_Del","Frame_Shift_Ins","In_Frame_Ins","Multi_Hit") fabcolors = RColorBrewer::brewer.pal(n = 3,name = 'Set1') names(fabcolors) = c("CIMP-H", "CIMP-M", "CIMP-L") fabcolors = list(clust_result = fabcolors) oncoplot(maf = stad.maf,colors = col, genes =c("TTN","TP53","MUC16","LRP1B","SYNE1","ARID1A","CSMD3","FLG","FAT4", "HMCN1","ZFHX4","PIK3CA"), legendFontSize = 1.5, clinicalFeatures = 'clust_result', sortByAnnotation = TRUE, annotationColor = fabcolors) require(ComplexHeatmap) #decorate_annotation("clust_result", { # grid.text("clust_result", x = unit(-0.5, "mm"), just = "right") #}) #tumor mutation burden final_L<-filter(final_maf,clust_result=="CIMP-L") View(final_M) final_M<-filter(final_maf,clust_result=="CIMP-M") final_H<-filter(final_maf,clust_result=="CIMP-H") maf_L<-subsetMaf(maf = stad.maf, tsb = final_L$Tumor_Sample_Barcode, mafObj = TRUE) maf_M<-subsetMaf(maf = stad.maf, tsb = final_M$Tumor_Sample_Barcode, mafObj = TRUE) maf_H<-subsetMaf(maf = stad.maf, tsb = final_H$Tumor_Sample_Barcode, mafObj = TRUE) TMB_L <- as.data.frame(maf_L@variants.per.sample) TMB_M<- as.data.frame(maf_M@variants.per.sample) TMB_H<- as.data.frame(maf_H@variants.per.sample) View(maf_L@variants.per.sample) View(TMB_L) TMB_L$TMB <- TMB_L$Variants%/%35 TMB_M$TMB <- TMB_M$Variants%/%35 TMB_H$TMB <- TMB_H$Variants%/%35 View(TMB_L$TMB) View(TMB_H$TMB) View(TMB_L) TMB_L$Cluster <- "CIMP-L" TMB_M$Cluster <- "CIMP-M" TMB_H$Cluster <- "CIMP-H" TMB_total <- rbind(TMB_L,TMB_M,TMB_H) TMB_total$Cluster <- factor(TMB_total$Cluster,levels = c("CIMP-L","CIMP-M","CIMP-H")) TMB_total$TMB my_comparisons <- list(c("CIMP-L", "CIMP-M"), c("CIMP-L", "CIMP-H"), c("CIMP-M","CIMP-H")) require(ggpubr) result_TMB<- as.data.frame(compare_means(TMB~Cluster, data=TMB_total,method = "wilcox.test")) save(result_TMB,file = "result_TMB.Rdata") write.table(result,"TMB_CLUSTER.csv",sep=",") write.table(TMB_total,"TMB_result.csv",sep=",") p <- ggplot(TMB_total, aes(x = Cluster, y = TMB,color=Cluster))+ geom_boxplot(outlier.color = NA)+ stat_compare_means(method = "kruskal.test", label.x = 2,label.y = 50) + #label的位置 stat_compare_means(label = "p.signif", method = "wilcox.test")+ theme_bw() + #去除背景色 theme(panel.grid =element_blank()) + #去除网格线 theme(panel.border = element_blank()) + #去除外层边框 theme(axis.line = element_line(colour = "black")) + #沿坐标轴显示直线 xlab("Cluster") + theme(axis.title.x=element_text(size=30,face="bold"), axis.title.y=element_text(size=30,face="bold"), axis.text=element_text(size=15,face="bold"), legend.text = element_text( size = 15, face = 'bold'))+ guides(color=FALSE) p + ylim(0,60)+geom_dotplot(binaxis = "y", #沿y轴堆积,并沿着x轴分组 binwidth = 0.5, #最大组距 dotsize = 0.6, #点的大小 #如果点太多,两组叠在一起,就需要运行下面这行把它们分开 #stackgroups = T, binpositions="all", aes(colour =Cluster), stackdir = "center")+ scale_color_manual(values = c("#4DAF4A", "#377EB8","#E41A1C")) dev.off() #### DEGs analysis setwd("/Users/zengzhuo/Downloads/R/TCGA_STA") metadata <- jsonlite::fromJSON("metadata.cart.2019-08-12.json") View(metadata) metadata_id <- metadata %>% dplyr::select(c(file_name,associated_entities)) naid_df <- data.frame() for (i in 1:373){ naid_df[i,1] <- substr(metadata_id$file_name[i],1,nchar(metadata_id$file_name[i])-3) naid_df[i,2] <- metadata_id$associated_entities[i][[1]]$entity_submitter_id } #读入所有解压的文件 373个 setwd("/Users/zengzhuo/Downloads/R/TCGA_STA/00_data_read_in_one_file") nameList <- list.files("data_unzip/") length(nameList) location <- which(naid_df==nameList[1],arr.ind = TRUE) ##which函数有一个已知value返回坐标的功能 TCGA_id <- as.character(naid_df[location[1],2]) ##通过坐标,获取TCGA_id expr_df<- read.table(paste0("data_unzip/",nameList[1]),stringsAsFactors = F, header = F) #读入第一个文件,保存为data.frame names(expr_df) <- c("gene_id",TCGA_id) #给刚才数据库命名 for (i in 2:length(nameList)){ location <- which(naid_df==nameList[i],arr.ind = TRUE) TCGA_id <- as.character(naid_df[location[1],2]) dfnew <- read.table(paste0("data_unzip/",nameList[i]),stringsAsFactors = F,header = F) names(dfnew) <- c("gene_id",TCGA_id) expr_df <- inner_join(expr_df,dfnew,by="gene_id") } tail(expr_df$gene_id,10) expr_df <- expr_df[1:(length(expr_df$gene_id)-5),] save(expr_df,file = "expr_df.Rdata") expr_df_nopoint <- expr_df %>% tidyr::separate(gene_id,into = c("gene_id","drop"),sep="\\.") %>% dplyr::select(-drop) save(expr_df_nopoint,file = "expr_df_nopoint.Rdata") gtf1 <- rtracklayer::import('Homo_sapiens.GRCh38.97.chr.gtf') gtf_df <- as.data.frame(gtf1) save(gtf_df,file = "gtf_df.Rdata") test <- gtf_df[1:5,] View(test) mRNA_exprSet <- gtf_df %>% dplyr::filter(type=="gene",gene_biotype=="protein_coding") %>% #筛选gene,和编码指标 dplyr::select(c(gene_name,gene_id,gene_biotype)) %>% dplyr::inner_join(expr_df_nopoint,by ="gene_id") %>% tidyr::unite(gene_id,gene_name,gene_id,gene_biotype,sep = " | ") dim(mRNA_exprSet ) #19631 374 save(mRNA_exprSet,file = "mRNA_exprSet.Rdata") load("/Users/zengzhuo/Downloads/R/TCGA_STA/00_data_read_in_one_file/expr_df_nopoint.Rdata") View(head(expr_df_nopoint)) write.csv(expr_df_nopoint,file = "Expression profile of TCGA cohort.csv",row.names = F) View(mRNA_exprSet) names(mRNA_exprSet) metadata <- data.frame(names(mRNA_exprSet)[-1]) View(metadata) View(final1) names(metadata)<-"ID" nchar(metadata$ID[1]) metadata$ID<-str_sub(metadata$ID,1,15) metadata<-inner_join(metadata,final1) View(metadata) metadata<-select(metadata,ID,clust_result) table(metadata$clust_result) #CIMP-H CIMP-L CIMP-M # 52 162 92 metadata1<-metadata metadata<-filter(metadata,clust_result!="CIMP-M") View(metadata) View(mRNA_exprSet) mRNA_exprSet1<-mRNA_exprSet length(names(mRNA_exprSet)) length(metadata$ID) b<-vector() for(i in 1:length(metadata$ID)) { b[i]<-which(str_sub(names(mRNA_exprSet),1,15)==metadata$ID[i]) } View(b) mRNA_exprSet<-mRNA_exprSet[,c(1,b)] View(mRNA_exprSet) View(metadata1) metadata<-metadata1 table(metadata$clust_result) metadata<-filter(metadata,clust_result!="CIMP-M") View(metadata) metadata$clust_result[metadata$clust_result=="CIMP-L"]<-"L" names(metadata) <- c("TCGA_id","sample") metadata$sample <- as.factor(metadata$sample) metadata$sample<-relevel(metadata$sample,ref = "L") metadata$sample View(metadata) mycounts <- mRNA_exprSet keepGene=rowSums(edgeR::cpm(mycounts[-1])>0) >=2 table(keepGene) dim(mycounts) mycounts <-mycounts[keepGene,] dim(mycounts) #19514 215 library(DESeq) dds <-DESeqDataSetFromMatrix(countData=mycounts, colData=metadata, design=~sample, tidy=TRUE) dds$sample dds <- DESeq(dds) class(dds) metadata$sample save(dds,file = "dds_DEseq.RData") res <- results(dds, tidy=TRUE) #获取结果 res <- as_tibble(res) require(dplyr) res <- res %>% separate(row,c("symbol","ensemble","genetype"),sep = " \\| ") %>% dplyr::select(- c(ensemble,genetype)) %>% arrange(desc(abs(log2FoldChange))) %>% #排序。为了去重 distinct(symbol,.keep_all = TRUE) %>% arrange(desc(log2FoldChange))#再次按照log2FoldChange从大到小排序 View(res) save(res,file = "result.Rda") load("./result.Rda") View(res) P <- 0.05 foldChange <- 2 diffSig = res[(res$padj < P & (res$log2FoldChange>foldChange | res$log2FoldChange<(-foldChange))),] View(diffSig) ## LASSO analysis #rm(list = ls()) load("./diffSig.Rdata") View(diffSig) write.csv(diffSig,file = "DEGs between CIMP-H and CIMP-L gruops.csv",row.names = F) gene.diff<-diffSig$symbol save(gene.diff,file = "/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/gene.diff.Rdata") length(gene.diff) mRNA_diff<-mRNA_exprSet_vst[gene.diff,] dim(mRNA_diff) View(mRNA_diff) names(mRNA_diff) require(stringr) names(mRNA_diff)<-str_sub(names(mRNA_diff),1,15) save(mRNA_diff,file = "mRNA_diff.Rdata") load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/final_c2.Rdata") View(final_c2) cross1<-rownames(final_c2)%in%names(mRNA_diff) sum(cross1) final2<-final_c2[cross1,] View(final2) final2<-final2[,c(1,2)] View(final2) names(final2) save(final2,file = "final2.Rdata") e<-vector() for(i in 1:length(rownames(final2))) { e[i]<-which(names(mRNA_diff)==rownames(final2)[i]) } View(e) mRNA_diff2<-mRNA_diff[,e] load("./final_d1.Rdata") View(final_d1) table(final_d1$OS) require(survival) Coxoutput=data.frame() dim(final_d1) View(head(final_d1)) for(i in colnames(final_d1[,4:ncol(final_d1)])){ cox <- coxph(Surv(OS_time, OS) ~ final_d1[,i], data = final_d1) coxSummary = summary(cox) Coxoutput=rbind(Coxoutput,cbind(gene=i,HR=coxSummary$coefficients[,"exp(coef)"], z=coxSummary$coefficients[,"z"], pvalue=coxSummary$coefficients[,"Pr(>|z|)"], lower=coxSummary$conf.int[,3], upper=coxSummary$conf.int[,4])) } class(Coxoutput) View(Coxoutput) dim(Coxoutput) for(i in c(2:6)){ Coxoutput[,i] <- as.numeric(as.vector(Coxoutput[,i])) } View(Coxoutput) Coxoutput <- arrange(Coxoutput,pvalue) %>% #按照p值排序 filter(pvalue < 0.05) TCGA_HR_UP <- Coxoutput$gene[Coxoutput$HR>1] TCGA_HR_DOWN<-Coxoutput$gene[Coxoutput$HR<1] View(TCGA_HR_UP) class(TCGA_HR_UP) library(glmnet) library(survival) mRNA_diff4<-as.matrix(mRNA_diff4) row.names(final_d1)==row.names(mRNA_diff4) cvfit = cv.glmnet(mRNA_diff4, Surv(mysurv$OS_time,mysurv$OS), nfold=1000, family = "cox" ) plot(cvfit) fit <- glmnet(mRNA_diff4, Surv(mysurv$OS_time,mysurv$OS), family = "cox") plot(fit, label = TRUE) coef.min = coef(cvfit, s = "lambda.min") coef.min active.min = which(coef.min != 0) View(active.min) geneids <- colnames(mRNA_diff4)[active.min] geneids ### develop a CIMP-related prognostic gene signature load("/Users/zengzhuo/Downloads/R2/TCGA_STA/00_data_read_in_one_file/mRNA_exprSet.Rdata") View(mRNA_exprSet) mRNA_exprSet_all<-mRNA_exprSet View(mRNA_exprSet_all) names(mRNA_exprSet_all)[-1]<-str_sub(names(mRNA_exprSet_all)[-1],1,15) metadata_all <- data.frame(names(mRNA_exprSet_all)[-1]) View(metadata_all) names(metadata_all)<-"ID" class(metadata_all$ID) metadata_all$ID<-as.character(metadata_all$ID) metadata_all<-inner_join(metadata_all,final1) View(metadata_all) dim(metadata_all) metadata_all<-dplyr::select(metadata_all,ID,cluster_result) table(metadata_all$cluster_result) b<-vector() for(i in 1:length(metadata_all$ID)) { b[i]<-which(names(mRNA_exprSet_all)==metadata_all$ID[i]) } View(b) mRNA_exprSet_all<-mRNA_exprSet_all[,c(1,b)] View(mRNA_exprSet_all) View(metadata_all) names(metadata_all) <- c("TCGA_id","sample") table(metadata_all$sample) metadata_all$sample[metadata_all$sample=="CIMP-L"]<-"L" metadata_all$sample[metadata_all$sample=="CIMP-H"]<-"H" metadata_all$sample[metadata_all$sample=="CIMP-M"]<-"M" table(metadata_all$sample) metadata_all$sample <- factor(metadata_all$sample,levels = c("H","M","L")) View(metadata_all) mycounts <- mRNA_exprSet_all keepGene=rowSums(edgeR::cpm(mycounts[-1])>0) >=2 table(keepGene) #FALSE TRUE #108 19523 mycounts <-mycounts[keepGene,] dim(mycounts) #19513 307 dds <-DESeqDataSetFromMatrix(countData=mycounts, colData=metadata_all, design=~sample, tidy=TRUE) vsd <- vst(dds, blind = FALSE) mRNA_exprSet_all_vst <- as.data.frame(assay(vsd)) View(mRNA_exprSet_all_vst) mRNA_exprSet_all_vst$row<-row.names(mRNA_exprSet_all_vst) mRNA_exprSet_all_vst <- mRNA_exprSet_all_vst %>% separate(row,c("symbol","ensemble","genetype"),sep = " \\| ") %>% dplyr::select(- c(ensemble,genetype)) mRNA_exprSet_all_vst <- mRNA_exprSet_all_vst %>% dplyr::select(symbol,everything()) mRNA_exprSet_all_vst <- mRNA_exprSet_all_vst %>% distinct(symbol,.keep_all = TRUE) View(mRNA_exprSet_all_vst) row.names(mRNA_exprSet_all_vst)<-mRNA_exprSet_all_vst$symbol mRNA_exprSet_all_vst<-mRNA_exprSet_all_vst[,-1] View(mRNA_exprSet_all_vst) save(mRNA_exprSet_all_vst,file = "mRNA_exprSet_all_vst.Rdata") mRNA_g1<-mRNA_exprSet_all_vst[geneids,] View(mRNA_g1) mRNA_g2<-t(mRNA_g1) View(mRNA_g2) mRNA_g3<-scale(mRNA_g2,scale = T,center = T) View(mRNA_g3) mRNA_g3 <- as.data.frame(mRNA_g3) mRNA_g3$ID <- row.names(mRNA_g3) mRNA_g3<- inner_join(final1,mRNA_g3) View(mRNA_g3) row.names(mRNA_g3)<-mRNA_g3$ID save(mRNA_g3,file = "mRNA_g3.Rdata") fmla <- as.formula(paste("Surv(OS_time, OS) ~", paste0(geneids, collapse = "+"))) cox <- coxph(fmla, data = mRNA_g3) coef <- cox$coefficients View(coef) ##calculate the cutoff value View(mRNA_g3) names(mRNA_g3) View(as.matrix(mRNA_g3[,c(23:28)])) riskScore_test <- as.matrix(mRNA_g3[,c(23:28)]) %*% coef View(riskScore_test) riskScore_test <- as.data.frame(riskScore_test) riskScore_test$ID <- row.names(riskScore_test) riskScore_test$score <- riskScore_test$V1 riskScore_test <- riskScore_test[,-1] View(riskScore_test) View(mRNA_g3) sta_risk_test <- inner_join(riskScore_test,mRNA_g3) View(sta_risk_test) row.names(sta_risk_test) <- sta_risk_test$ID sta_risk_test <- sta_risk_test[,-1] sta_risk_test$times <- sta_risk_test$OS_time/12 save(sta_risk_test,file = "sta_risk_test.Rdata") require(survminer) sur.cut <- surv_cutpoint(sta_risk_test, time = "OS_time", event = "OS",variables ="score") summary(sur.cut) # cutpoint statistic #score 0.2351643 5.288044 ##distribution of risk score View(sta_risk_test) fp<- sta_risk_test$score View(fp) names(fp)<-row.names(sta_risk_test) View(fp) fp_dat<-data.frame(s=1:length(fp),v=as.numeric(sort(fp))) View(fp_dat) fp_dat$Risk<-ifelse(fp_dat$v>=res.cut,"high","low") sur_dat<-data.frame(s=1:length(fp), t=sta_risk_test[names(sort(fp)),'OS_time'], e=sta_risk_test[names(sort(fp)),'OS']) sur_dat$Status<-as.factor(ifelse(sur_dat$e==0,'Alive','Death')) View(sur_dat) exp_dat<-t_survdiff_cox_exprset[names(sort(fp)), which(colnames(t_survdiff_cox_exprset) %in% genenames)] res.cut<-0.2351643 require(ggplot2) plot.point<-ggplot(fp_dat,aes(x=s,y=v))+ geom_point(aes(col=Risk),size=1)+ geom_segment(aes(x = sum(fp_dat$Risk=="low"), y = min(fp_dat$v), xend = sum(fp_dat$Risk=="low"), yend = res.cut),linetype="dashed")+ geom_segment(aes(x=0,y=res.cut, xend=sum(fp_dat$Risk=="low"), yend=res.cut),linetype="dashed")+ geom_text(aes(x=sum(fp_dat$Risk=="low")/2, y=res.cut+0.25, label=paste0("Cutoff: ",round(res.cut,3))), col ="black",size = 10,alpha=0.8)+ theme(axis.title.x=element_blank())+ scale_x_continuous(limits = c(0,NA),expand = c(0,0))+ labs(y="Risk score",fill="Risk")+ scale_colour_manual(values = c( "firebrick3","navy"))+ theme_bw()##去除背景 plot.point<-plot.point+theme(panel.grid.major = element_blank(),panel.grid.minor = element_blank()) plot.point plot.sur<-ggplot(sur_dat,aes(x=s,y=t))+ geom_point(aes(col=Status),size=1.2)+ geom_vline(aes(xintercept=sum(fp_dat$Risk=="low")),linetype="dashed")+ scale_colour_manual(values = c( "navy","firebrick3"))+ theme(axis.title.x=element_blank())+ scale_x_continuous(limits = c(0,NA),expand = c(0,0))+ scale_fill_discrete(labels=c("Alive", "Dead"))+ labs(y="Survival time(months)")+ theme_bw()##去除背景 plot.sur<-plot.sur+theme(panel.grid.major = element_blank(),panel.grid.minor = element_blank()) plot.sur tcga_expr<-mRNA_exprSet_all_vst[c("CST6","SLC7A2","RAB3B","IGFBP1","VSTM2L","EVX2"),] View(head(tcga_expr)) tcga_expr2<-t(tcga_expr) View(head(tcga_expr2)) tcga_expr3<-scale(tcga_expr2,scale = T,center = T) tcga_expr3 <- as.data.frame(tcga_expr3) View(head(tcga_expr3)) tcga_expr3$ID <- row.names(tcga_expr3) View(head(tcga_expr3)) class(tcga_expr3) tcga_expr6<-tcga_expr3 tcga_expr7<-tcga_expr6[names(sort(fp)),] View(tcga_expr7) tmp<-t(tcga_expr7) View(head(tmp)) tmp[tmp > 1] = 1 tmp[tmp < -1] = -1 View(tmp) library(pheatmap) plot.h<-pheatmap(tmp,show_colnames = F,cluster_cols = F,cluster_rows = F,cellheight=30, color =colorRampPalette(c("navy", "white","firebrick3"))(200)) ?pheatmap View(plot.h) grid(plot.point, plot.sur,plot.h$gtable, label_x=0, label_y=1, align = 'v',ncol = 1,axis="t") ## OS curve between high- and low-risk groups View(sta_risk_test) sta_risk_tcga<-sta_risk_test View(sta_risk_tcga) sta_risk_tcga$risk<-ifelse(sta_risk_tcga$score>0.2351643,"high","low") table(sta_risk_tcga$risk) #high low #91 215 fit <- survfit(Surv(OS_time, OS) ~ risk, data = sta_risk_tcga) ggsurv <- ggsurvplot( fit, # survfit object with calculated statistics. data = sta_risk_tcga, #palette= c("#E41A1C","#4DAF4A"), # data used to fit survival curves. risk.table = TRUE, # show risk table. pval = TRUE, # show p-value of log-rank test. conf.int = F, xlim = c(0,60), # present narrower X axis, but not affect # survival estimates. xlab = "Time in months", # customize X axis label. ylab = "Overall Survival Rate", break.time.by = 6, # break X axis in time intervals by 500. ggtheme = theme_classic2(), # customize plot and risk table with a theme. risk.table.y.text.col = T,# colour risk table text annotations. risk.table.height = 0.25, # the height of the risk table risk.table.y.text = F,# show bars instead of names in text annotations # in legend of risk table. ncensor.plot = F, conf.int.style = "step", # add the median survival pointer. legend.labs = c("Highrisk","Lowrisk")# change legend labels. ) ggsurv$plot <- ggsurv$plot + labs(title = "TCGA Gastric Cancer Cohort") + theme(axis.title.x = element_blank())+ theme(plot.title = element_text(hjust = 0.5),)+ theme(legend.text = element_text(size = 12,face = 'bold')) ggsurv$plot <- ggpar( ggsurv$plot, font.title = c(16, "bold", "darkblue"), font.y = c(14, "bold.italic", "darkred"), font.x = c(14, "bold.italic", "blue") , font.xtickslab = c(12, "plain", "darkgreen","bold"), font.ytickslab = c(10,"darkgreen","bold"), legend = "top",legend.title = "", ) ggsurv$table <- ggpar( ggsurv$table, font.title = c(13, "bold.italic", "#E41A1C"))+ theme(axis.line = element_blank(),axis.ticks = element_blank(), axis.title = element_blank(),axis.text.x = element_blank()) ggsurv$table <- ggsurv$table + theme_cleantable() ggsurv ## plot the ROC library(survivalROC) roc_1=survivalROC(Stime=sta_risk_test$times, status=sta_risk_test$OS, marker = sta_risk_test$score, predict.time =1, method="KM") roc_2=survivalROC(Stime=sta_risk_test$times, status=sta_risk_test$OS, marker = sta_risk_test$score, predict.time =2, method="KM") roc_5=survivalROC(Stime=sta_risk_test$times, status=sta_risk_test$OS, marker = sta_risk_test$score, predict.time =5, method="KM") x <- unlist(roc_1$FP) ##提取x值 y <- unlist(roc_1$TP) plotdata_1 <- data.frame(x,y) plotdata_1$group <- "1" x <- unlist(roc_3$FP) ##提取x值 y <- unlist(roc_3$TP) plotdata_3 <- data.frame(x,y) plotdata_3$group <- "3" x <- unlist(roc_5$FP) ##提取x值 y <- unlist(roc_5$TP) plotdata_5 <- data.frame(x,y) plotdata_5$group <- "5" plotdata <- rbind(plotdata_1,plotdata_3,plotdata_5) g <- ggplot(plotdata) + geom_path(aes(x = x, y = y,colour = group), size=1) + labs(x="1 - Specificity", y = "Ture positive rate") + ggpubr::theme_classic2()+ scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+ theme(axis.text=element_text(size=14,face="bold"),axis.title=element_text(size=16,face="bold"))+ geom_abline(intercept=0, slope=1,linetype="dashed")+ scale_colour_hue(name="my legend", labels=c("AUC at 1 year: 0.664", "AUC at 3 years: 0.704","AUC at 5 years: 0.667")+ theme(legend.title=element_blank())+theme(legend.text = element_text(size = 14,face = "bold"))+ theme(legend.position=c(.766,.25)) g ## Univariate and multivariate analysis load("./final9.Rdata") View(final9) univarcox<- function(x){ formu<-as.formula(paste0('sur~',x)) unicox<-coxph(formu,data=final9) unisum<-summary(unicox)#汇总数据 HR<-round(unisum$coefficients[,2],3)# HR风险比 Pvalue<-unisum$coefficients[,5]#p值 CI95<-paste0(round(unisum$conf.int[,c(3,4)],3),collapse = '-') #95%置信区间 univarcox<-data.frame('characteristics'=x, 'Hazard Ration'=HR, 'CI95'=CI95, 'pvalue'=ifelse(Pvalue < 0.001, "< 0.001", round(Pvalue,3))) return(univarcox)#返回数据框 } require(survival) sur<-Surv(time=final9$OS_time, event = final9$OS) names(final9) variable_names<-colnames(final9)[c(17,15)] univar<-lapply(variable_names,univarcox) univartable<-do.call(rbind,lapply(univar,data.frame)) univartable$`HR(95%CI)`<-paste0(univartable$Hazard.Ration,'(',univartable$CI95,')') univartable1<-dplyr::select(univartable,characteristics,`HR(95%CI)`,pvalue,-CI95,-Hazard.Ration) univartable1 (names<-univartable1$characteristics[univartable1$pvalue<0.05]) foresst_uni_os_risk<-read.csv("./risk cimp univariate.csv",stringsAsFactors=FALSE,header = T,sep = ",") View(foresst_uni_os_risk) colnames(foresst_uni_os_risk) tabletext <- cbind(c("\nUnivariate analysis",NA, foresst_uni_os_risk$Univariate.analysis), c("Hazard Ratio\n(95% CI)", NA, foresst_uni_os_risk$Hazard.Ratio..95.CI.), c("P-value", NA, foresst_uni_os_risk$P.value)) #BiocManager::install("forestplot") require(forestplot) forestplot(labeltext=tabletext, #图中的文本 mean=c(NA,1,foresst_uni_os_risk$mean),#HR lower=c(NA,1,foresst_uni_os_risk$lower), #95%置信区间下限 upper=c(NA,1,foresst_uni_os_risk$upper),#95%置信区间上限 #title="Hazard Ratio", graph.pos=3,#图在表中的列位置 graphwidth = unit(.4,"npc"),#图在表中的宽度比例 fn.ci_norm="fpDrawDiamondCI",#box类型选择钻石 col=fpColors(box="steelblue", lines="black", zero = "black"),#box颜色 lwd.ci=2,ci.vertices.height = 0.1,ci.vertices=TRUE,#置信区间用线宽、高、型 zero=1,#zero线横坐标 lwd.zero=2,#zero线宽 #虚线(可多条)及其横坐标、颜色、线宽 #xticks = c(round(min(data$mean),1), 1,round((max(data$mean)-min(data$mean)/2),1), round(max(data$mean),1)), #横坐标刻度 lwd.xaxis=2, #X轴线宽 xlab="Hazard Ratio",#X轴标题 hrzl_lines=list("3" = gpar(lwd=2, col="black"),#第三行顶部加黑线,引号内数字标记行位置 #"4" = gpar(lwd=60,lineend="butt", columns=c(1:4), col="#99999922"),#加阴影,弱项不建议使用 "5" = gpar(lwd=2, col="black")),#最后一行底部加黑线,""中数字为nrow(data)+5 txt_gp=fpTxtGp(label=gpar(cex=1.25),#各种字体大小设置 ticks=gpar(cex=1.25), xlab=gpar(cex = 1.25), title=gpar(cex = 1.25)),clip =c(0, 3), #is.summary = c(T,rep(F,27)),#首行字体类型设置 lineheight = unit(2,"cm"),#固定行高 #align=c("l","c","c"),#每列文字的对齐方式,偶尔会用到 #cex=10, colgap = unit(1,"cm"),#列间隙 mar=unit(rep(1.25, times = 4), "cm"),#图形页边距 new_page = F#是否新页 ) (form<-as.formula(paste0('sur~',paste0(names,collapse = '+')))) multicox<-coxph(formula = form,data = final9) multisum<-summary(multicox)##汇总 muHR<-round(multisum$coefficients[,2],3)# muPvalue<-multisum$coefficients[,5]#p值 muCIdown<-round(multisum$conf.int[,3],3)#下 muCIup<-round(multisum$conf.int[,4],3)#上 muCI<-paste0(muCIdown,'-',muCIup)##95%置信区间 multicox<-data.frame('characteristics'=names, 'muHazard Ration'=muHR, 'muCI95'=muCI, 'mupvalue'=ifelse(muPvalue < 0.001, "< 0.001", round(muPvalue,3))) rownames(multicox)<-NULL multicox$`HR(95%CI)`<-paste0(multicox$muHazard.Ration,'(',multicox$muCI95,')') multicox<-dplyr::select(multicox,characteristics,`HR(95%CI)`,mupvalue,-muCI95,-muHazard.Ration) View(multicox) #合并单因素多因素表格 uni_multi<-dplyr::full_join(univartable1,multicox,by="characteristics") uni_multi$characteristics <- as.character(uni_multi$characteristics) forest_mul_os_risk<-read.csv("./risk cimp mulvariate.csv",stringsAsFactors=FALSE,header = T,sep = ",") View(forest_mul_os_risk) names(forest_mul_os_risk) tabletext <- cbind(c("\nMultivariate analysis",NA, forest_mul_os_risk$Multivaritate.analysis), c("Hazard Ratio\n(95% CI", NA, forest_mul_os_risk$Hazard.Ratio..95.CI.), c("P-value", NA,forest_mul_os_risk$P.value)) forestplot(labeltext=tabletext, #图中的文本 mean=c(NA,1,forest_mul_os_risk$mean),#HR lower=c(NA,1,forest_mul_os_risk$lower), #95%置信区间下限 upper=c(NA,1,forest_mul_os_risk$upper),#95%置信区间上限 #title="Hazard Ratio", graph.pos=3,#图在表中的列位置 graphwidth = unit(.4,"npc"),#图在表中的宽度比例 fn.ci_norm="fpDrawDiamondCI",#box类型选择钻石 col=fpColors(box="steelblue", lines="black", zero = "black"),#box颜色 lwd.ci=2,ci.vertices.height = 0.1,ci.vertices=TRUE,#置信区间用线宽、高、型 zero=1,#zero线横坐标 lwd.zero=2,#zero线宽 #虚线(可多条)及其横坐标、颜色、线宽 #xticks = c(round(min(data$mean),1), 1,round((max(data$mean)-min(data$mean)/2),1), round(max(data$mean),1)), #横坐标刻度根据具体情况设置 lwd.xaxis=2, #X轴线宽 xlab="Hazard Ratio",#X轴标题 hrzl_lines=list("3" = gpar(lwd=2, col="black"),#第三行顶部加黑线,引号内数字标记行位置 #"4" = gpar(lwd=60,lineend="butt", columns=c(1:4), col="#99999922"),#加阴影,弱项不建议使用 "5" = gpar(lwd=2, col="black")),#最后一行底部加黑线,""中数字为nrow(data)+5 txt_gp=fpTxtGp(label=gpar(cex=1.25),#各种字体大小设置 ticks=gpar(cex=1.25), xlab=gpar(cex = 1.25), title=gpar(cex = 1.25)),clip =c(0, 3), #is.summary = c(T,rep(F,27)),#首行字体类型设置 lineheight = unit(2,"cm"),#固定行高 #align=c("l","c","c"),#每列文字的对齐方式,偶尔会用到 #cex=10, colgap = unit(1,"cm"),#列间隙 mar=unit(rep(1.25, times = 4), "cm"),#图形页边距 new_page = F#是否新页 ) ##validation of gene signature setwd("/Users/zengzhuo/Downloads/R/GSE_stomach/GSE13861") library(GEOquery) gset = getGEO('GSE13861', destdir=".",getGPL = F) gset=gset[[1]] pdata=pData(gset) View(pdata) exprSet_GSE13861=exprs(gset) View(exprSet_GSE13861) clinical_GSE13861<-read.csv("YUHS.csv",header = T,stringsAsFactors = F,sep =",",check.names = F) View(clinical_GSE13861) library(limma) exprSet_GSE13861_1<-normalizeBetweenArrays(exprSet_GSE13861) annotation<-read.csv("GPL6884-11607.csv",header = T,stringsAsFactors = F,sep =",",check.names = F) View(annotation) annotation<-annotation[,c("ID","Symbol")] class(annotation) exprSet_GSE13861_1<-as.data.frame(exprSet_GSE13861_1) exprSet_GSE13861_1$ID<-row.names(exprSet_GSE13861_1) require(dplyr) exprSet_GSE13861_1<-inner_join(exprSet_GSE13861_1,annotation) View(exprSet_GSE13861_1) exprSet_GSE13861_1<-exprSet_GSE13861_1%>% dplyr::select("ID","Symbol",everything()) exprSet_GSE13861_1<-exprSet_GSE13861_1[,-1] exprSet_GSE13861_1<-exprSet_GSE13861_1%>% #求出平均数(这边的点号代表上一步产出的数据) mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>% #去除symbol中的NA filter(Symbol != "NA") %>% #把表达量的平均值按从大到小排序 arrange(desc(rowMean)) %>% # symbol留下第一个 distinct(Symbol,.keep_all = T) %>% #反向选择去除rowMean这一列 dplyr::select(-rowMean) row.names(exprSet_GSE13861_1)<-exprSet_GSE13861_1$Symbol exprSet_GSE13861_1<-exprSet_GSE13861_1[,-1] View(exprSet_GSE13861_1) load("/Users/zengzhuo/Downloads/R2/GSE_stomach/GSE13861/exprSet_GSE13861_1.Rdata") View(exprSet_GSE13861_1) geneids<-c("CST6","SLC7A2","RAB3B","IGFBP1","VSTM2L","EVX2") exprSet_GSE13861_bio<-exprSet_GSE13861_1[geneids,] View(exprSet_GSE13861_bio) exprSet_GSE13861_bio<-na.omit(exprSet_GSE13861_bio) exprSet_GSE13861_bio1<-t(exprSet_GSE13861_bio) exprSet_GSE13861_bio2<-scale(exprSet_GSE13861_bio1,scale = T,center = T) View(exprSet_GSE13861_bio2) a<-coef[-5] a View(as.matrix(exprSet_GSE13861_bio2)) riskScore_GSE13861<- as.matrix(exprSet_GSE13861_bio2) %*% a View(riskScore_GSE13861) riskScore_GSE13861 <- as.data.frame(riskScore_GSE13861) riskScore_GSE13861$ID <- row.names(riskScore_GSE13861) riskScore_GSE13861$score <- riskScore_GSE13861$V1 riskScore_GSE13861 <- riskScore_GSE13861[,-1] View(riskScore_GSE13861) load("/Users/zengzhuo/Downloads/R2/GSE_stomach/GSE13861/clinical_GSE13861_1.Rdata") View(clinical_GSE13861_1) riskScore_GSE13861<-inner_join(riskScore_GSE13861,clinical_GSE13861_1) View(riskScore_GSE13861) rownames(riskScore_GSE13861)<-riskScore_GSE13861$ID riskScore_GSE13861$risk <- ifelse(riskScore_GSE13861$score>0.2351643,'high','low') table(riskScore_GSE13861$risk) #high low #25 40 fit <- survfit(Surv(OS_time, OS) ~ risk, data = riskScore_GSE13861) ggsurv <- ggsurvplot( fit, # survfit object with calculated statistics. data = riskScore_GSE13861, #palette= c("#E41A1C","#4DAF4A"), # data used to fit survival curves. risk.table = TRUE, # show risk table. pval = TRUE, # show p-value of log-rank test. conf.int = F, xlim = c(0,60), # present narrower X axis, but not affect # survival estimates. xlab = "Time in months", # customize X axis label. ylab = "Overall Survival Rate", break.time.by = 6, # break X axis in time intervals by 500. ggtheme = theme_classic2(), # customize plot and risk table with a theme. risk.table.y.text.col = T,# colour risk table text annotations. risk.table.height = 0.25, # the height of the risk table risk.table.y.text = F,# show bars instead of names in text annotations # in legend of risk table. ncensor.plot = F, conf.int.style = "step", # add the median survival pointer. legend.labs = c("Highrisk","Lowrisk")# change legend labels. ) ggsurv$plot <- ggsurv$plot + labs(title = "GSE13861 Cohort") + theme(axis.title.x = element_blank())+ theme(plot.title = element_text(hjust = 0.5),)+ theme(legend.text = element_text(size = 12,face = 'bold')) ggsurv$plot <- ggpar( ggsurv$plot, font.title = c(16, "bold", "darkblue"), font.y = c(14, "bold.italic", "darkred"), font.x = c(14, "bold.italic", "blue") , font.xtickslab = c(12, "plain", "darkgreen","bold"), font.ytickslab = c(10,"darkgreen","bold"), legend = "top",legend.title = "", ) ggsurv$table <- ggpar( ggsurv$table, font.title = c(13, "bold.italic", "#E41A1C"))+ theme(axis.line = element_blank(),axis.ticks = element_blank(), axis.title = element_blank(),axis.text.x = element_blank()) ggsurv$table <- ggsurv$table + theme_cleantable() ggsurv roc_1=survivalROC(Stime=riskScore_13861$times, status=riskScore_13861$OS, marker = riskScore_13861$score, predict.time =1, method="KM") roc_1$AUC roc_3=survivalROC(Stime=riskScore_13861$times, status=riskScore_13861$OS, marker = riskScore_13861$score, predict.time =3, method="KM") roc_3$AUC roc_5=survivalROC(Stime=riskScore_13861$times, status=riskScore_13861$OS, marker = riskScore_13861$score, predict.time =5, method="KM") roc_5$AUC x <- unlist(roc_1$FP) ##提取x值 y <- unlist(roc_1$TP) plotdata_1 <- data.frame(x,y) plotdata_1$group <- "1" x <- unlist(roc_3$FP) ##提取x值 y <- unlist(roc_3$TP) plotdata_3 <- data.frame(x,y) plotdata_3$group <- "3" x <- unlist(roc_5$FP) ##提取x值 y <- unlist(roc_5$TP) plotdata_5 <- data.frame(x,y) plotdata_5$group <- "5" plotdata <- rbind(plotdata_1,plotdata_3,plotdata_5) require(ggplot2) g <- ggplot(plotdata) + geom_path(aes(x = x, y = y,colour = group), size=1) + labs(x="1 - Specificity", y = "Ture positive rate") + ggpubr::theme_classic2()+ scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+ theme(axis.text=element_text(size=14,face="bold"),axis.title=element_text(size=16,face="bold"))+ geom_abline(intercept=0, slope=1,linetype="dashed")+ scale_colour_hue(name="my legend", labels=c("AUC at 1 year: 0.638", "AUC at 3 years: 0.777", "AUC at 5 years: 0.745"))+ theme(legend.title=element_blank())+theme(legend.text = element_text(size = 14,face = "bold"))+ theme(legend.position=c(.766,.25)) g ### comparison of signatures compareROC_tcga_jcp1<-select(compareROC_tcga2,"MARCKS","MAPK14","CCNF","INCENP","CHAF1A") View(compareROC_tcga_jcp1) compareROC_tcga_jcp2<-scale(compareROC_tcga_jcp1,scale = T,center = T) View(compareROC_tcga_jcp2) class(compareROC_tcga_jcp2) compareROC_tcga_jcp2<-as.data.frame(compareROC_tcga_jcp2) coef_jcp<-c(0.672,0.48,0.568,-0.522,-0.798) coef_jcp<-as.matrix(coef_jcp) rownames(coef_jcp)<-c("MARCKS","MAPK14","CCNF","INCENP","CHAF1A") View(coef_jcp) riskScore_compare_13861_jcp<- as.matrix(compareROC_tcga_jcp2[,c(1:5)]) %*% coef_jcp riskScore_compare_13861_jcp <- as.data.frame(riskScore_compare_13861_jcp) View(riskScore_compare_13861_jcp) riskScore_compare_13861_jcp$ID <-str_sub(row.names(compareROC_tcga_jcp2),1,15) riskScore_compare_13861_jcp$score <- riskScore_compare_13861_jcp$V1 riskScore_compare_13861_jcp <- riskScore_compare_13861_jcp[,-1] View(riskScore_compare_13861_jcp) riskScore_compare_13861_jcp <- inner_join(riskScore_compare_13861_jcp,final2) View(riskScore_compare_13861_jcp) row.names(riskScore_compare_13861_jcp) <- riskScore_compare_13861_jcp$ID View(riskScore_compare_13861_jcp) riskScore_compare_13861_jcp$times <- riskScore_compare_13861_jcp$OS_time/12 roc_1=survivalROC(Stime=riskScore_compare_13861_jcp$times, status=riskScore_compare_13861_jcp$OS, marker = riskScore_compare_13861_jcp$score, predict.time =1, method="KM") roc_3=survivalROC(Stime=riskScore_compare_13861_jcp$times, status=riskScore_compare_13861_jcp$OS, marker = riskScore_compare_13861_jcp$score, predict.time =3, method="KM") roc_5=survivalROC(Stime=riskScore_compare_13861_jcp$times, status=riskScore_compare_13861_jcp$OS, marker = riskScore_compare_13861_jcp$score, predict.time =5, method="KM") x <- unlist(roc_1$FP) ##提取x值 y <- unlist(roc_1$TP) plotdata_1 <- data.frame(x,y) plotdata_1$group <- "1" x <- unlist(roc_3$FP) ##提取x值 y <- unlist(roc_3$TP) plotdata_3 <- data.frame(x,y) plotdata_3$group <- "3" x <- unlist(roc_5$FP) ##提取x值 y <- unlist(roc_5$TP) plotdata_5 <- data.frame(x,y) plotdata_5$group <- "5" plotdata <- rbind(plotdata_1,plotdata_3,plotdata_5) g <- ggplot(plotdata) + geom_path(aes(x = x, y = y,colour = group), size=1) + labs(x="1 - Specificity", y = "Ture positive rate") + ggpubr::theme_classic2()+ scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+ theme(axis.text=element_text(size=14,face="bold"),axis.title=element_text(size=16,face="bold"))+ geom_abline(intercept=0, slope=1,linetype="dashed")+ scale_colour_hue(name="my legend", labels=c("AUC at 1 year: 0.694", "AUC at 3 years: 0.653", "AUC at 5 years: 0.666"))+ theme(legend.title=element_blank())+theme(legend.text = element_text(size = 14,face = "bold"))+ theme(legend.position=c(.766,.25)) g setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad") load("./mRNA_exprSet_vst.Rdata") View(mRNA_exprSet_vst) compareROC_tcga1<-mRNA_exprSet_vst compareROC_tcga2<-t(compareROC_tcga1) class(compareROC_tcga2) compareROC_tcga2<-as.data.frame(compareROC_tcga2) compareROC_tcga3<-select(compareROC_tcga2,"NOX4","FJX1","HEYL","LOX","SERPINE2", "COMP","RBMS1","LAMC1","MFAP2","ANXA5","NETO2", "PDLIM3","GADD45B") View(compareROC_tcga3) compareROC_tcga4<-scale(compareROC_tcga3,scale = T,center = T) View(compareROC_tcga4) class(compareROC_tcga4) compareROC_tcga4<-as.data.frame(compareROC_tcga4) riskScore_compare_13861<- as.matrix(compareROC_tcga4[,c(1:13)]) %*% coef_peerj riskScore_compare_13861 <- as.data.frame(riskScore_compare_13861) View(riskScore_compare_13861) require(stringr) riskScore_compare_13861$ID <-str_sub(row.names(compareROC_tcga4),1,15) riskScore_compare_13861$score <- riskScore_compare_13861$V1 riskScore_compare_13861 <- riskScore_compare_13861[,-1] View(riskScore_compare_13861) View(final2) riskScore_compare_13861 <- inner_join(riskScore_compare_13861,final2) View(riskScore_compare_13861) row.names(riskScore_compare_13861) <- riskScore_compare_13861$ID View(riskScore_compare_13861) riskScore_compare_13861$times <- riskScore_compare_13861$OS_time/12 View(riskScore_compare_13861) roc_1=survivalROC(Stime=riskScore_compare_13861$times, status=riskScore_compare_13861$OS, marker = riskScore_compare_13861$score, predict.time =1, method="KM") roc_3=survivalROC(Stime=riskScore_compare_13861$times, status=riskScore_compare_13861$OS, marker = riskScore_compare_13861$score, predict.time =3, method="KM") roc_5=survivalROC(Stime=riskScore_compare_13861$times, status=riskScore_compare_13861$OS, marker = riskScore_compare_13861$score, predict.time =5, method="KM") x <- unlist(roc_1$FP) ##提取x值 y <- unlist(roc_1$TP) plotdata_1 <- data.frame(x,y) plotdata_1$group <- "1" x <- unlist(roc_3$FP) ##提取x值 y <- unlist(roc_3$TP) plotdata_3 <- data.frame(x,y) plotdata_3$group <- "3" x <- unlist(roc_5$FP) ##提取x值 y <- unlist(roc_5$TP) plotdata_5 <- data.frame(x,y) plotdata_5$group <- "5" plotdata <- rbind(plotdata_1,plotdata_3,plotdata_5) g <- ggplot(plotdata) + geom_path(aes(x = x, y = y,colour = group), size=1) + labs(x="1 - Specificity", y = "Ture positive rate") + ggpubr::theme_classic2()+ scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+ theme(axis.text=element_text(size=14,face="bold"),axis.title=element_text(size=16,face="bold"))+ geom_abline(intercept=0, slope=1,linetype="dashed")+ scale_colour_hue(name="my legend", labels=c("AUC at 1 year: 0.627", "AUC at 3 years: 0.629", "AUC at 5 years: 0.673"))+ theme(legend.title=element_blank())+theme(legend.text = element_text(size = 14,face = "bold"))+ theme(legend.position=c(.766,.25)) g ###plot nomogram setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad") library(survival) BiocManager::install("rms") library(rms) load("./sta_risk_test.Rdata") View(sta_risk_test) names(sta_risk_test) class(sta_risk_test) dd<-datadist(sta_risk_test) options(datadist="dd") h<-cph(Surv(OS_time,OS)~score,data = sta_risk_test, x = T,y = T,surv = T) survival<-Survival(h) survival1<-function(x)survival(12,x) survival2<-function(x)survival(36,x) survival3<-function(x)survival(60,x) nom<-nomogram(h,fun = list(survival1,survival2,survival3), fun.at = c(0.05,seq(0.1,0.9,by = 0.1),0.95), funlabel = c("1 year survival","3 years survival","5 years survival")) plot(nom) h<-cph(Surv(OS_time,OS)~score,data = sta_risk_test, x = T,y = T,surv = T,time.inc = 36) cal<-calibrate(h,u = 36,cmethod = "KM",m=90) plot(cal,xlim=c(0,1),ylim=c(0,1), errbar.col="black",col="firebrick1") abline(a=0,b=1,lty=3,lwd=1,col="blue") ##Methylation pearson analysis getwd() load("./Tumor_bVals.Rdata") View(head(Tumor_bVals)) load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/Illumina450k_annotation.RData") View(ann450k2) cst6_sites<-filter(ann450k2,gene_id=="CST6") View(cst6_sites) cst6_sites<-cst6_sites[1:5,c(4,35)] View(cst6_sites) cst6_sites$Name #"cg04327181" "cg05057976" "cg07270851" "cg10560147" "cg14347177" cst6_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg04327181",] View(cst6_bval) cst6_bval2<-t(cst6_bval) View(cst6_bval2) class(cst6_bval2) cst6_bval2<-as.data.frame(cst6_bval2) cst6_bval2$ID<-row.names(cst6_bval2) View(cst6_bval2) load("./mRNA_diff.Rdata") View(head(mRNA_diff)) cst6_exr<-mRNA_diff[row.names(mRNA_diff)=="CST6",] View(cst6_exr) cst6_exr2<-t(cst6_exr) cst6_exr2<-as.data.frame(cst6_exr2) cst6_exr2$ID<-row.names(cst6_exr2) cst6_pearson<-inner_join(cst6_exr2,cst6_bval2) View(cst6_pearson) class(cst6_pearson) names(cst6_pearson) which(cst6_pearson$CST6>12) cst6_pearson<-cst6_pearson[-47,] pdf("correlation with CST6.pdf",6,5) ggplot(cst6_pearson, aes(x=cg04327181, y=CST6)) + geom_point()+ #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+ theme_bw()+ stat_smooth(method=lm,se=T)+ labs(title=paste0("R = ",round(cor.test(cst6_pearson$cg04327181,cst6_pearson$CST6)$estimate,3),", P = ", round(cor.test(cst6_pearson$cg04327181,cst6_pearson$CST6)$p.value,3)), x="Methylation of CST6 promoter",y="CST6 expression")+ theme( plot.title = element_text(hjust = 0.5,size = 15), axis.text.y = element_text(size = 20), axis.text.x = element_text(size = 20), axis.title.x=element_text(size = 20), axis.line.x = element_line(size = 0.5, colour = "black"), axis.title.y=element_text(size = 20), axis.line.y = element_line(size = 0.5, colour = "black"), ) dev.off() ##SLC7A2 View(ann450k2) SLC7A2_sites<-filter(ann450k2,gene_id=="SLC7A2") View(SLC7A2_sites) SLC7A2_sites<-SLC7A2_sites[1:2,c(4,35)] View(SLC7A2_sites) SLC7A2_sites$Name # "cg03340398" "cg09020665" SLC7A2_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg03340398",] View(SLC7A2_bval) SLC7A2_bval2<-t(SLC7A2_bval) View(SLC7A2_bval2) class(SLC7A2_bval2) SLC7A2_bval2<-as.data.frame(SLC7A2_bval2) SLC7A2_bval2$ID<-row.names(SLC7A2_bval2) View(SLC7A2_bval2) SLC7A2_exr<-mRNA_diff[row.names(mRNA_diff)=="SLC7A2",] View(SLC7A2_exr) SLC7A2_exr2<-t(SLC7A2_exr) SLC7A2_exr2<-as.data.frame(SLC7A2_exr2) SLC7A2_exr2$ID<-row.names(SLC7A2_exr2) SLC7A2_pearson<-inner_join(SLC7A2_exr2,SLC7A2_bval2) View(SLC7A2_pearson) class(SLC7A2_pearson) names(SLC7A2_pearson) which(SLC7A2_pearson$SLC7A2>15) SLC7A2_pearson<-SLC7A2_pearson[c(-164,-210),] pdf("correlation with SLC7A2.pdf",6,5) ggplot(SLC7A2_pearson, aes(x=cg03340398, y=SLC7A2)) + geom_point()+ #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+ theme_bw()+ stat_smooth(method=lm,se=T)+ labs(title=paste0("R = ",round(cor.test(SLC7A2_pearson$cg03340398,SLC7A2_pearson$SLC7A2)$estimate,3),", P = ", round(cor.test(SLC7A2_pearson$cg03340398,SLC7A2_pearson$SLC7A2)$p.value,3)), x="Methylation of SLC7A2 promoter",y="SLC7A2 expression")+ theme( plot.title = element_text(hjust = 0.5,size = 15), axis.text.y = element_text(size = 20), axis.text.x = element_text(size = 20), axis.title.x=element_text(size = 20), axis.line.x = element_line(size = 0.5, colour = "black"), axis.title.y=element_text(size = 20), axis.line.y = element_line(size = 0.5, colour = "black"), ) dev.off() ##RAB3B RAB3B_sites<-filter(ann450k2,gene_id=="RAB3B") View(RAB3B_sites) RAB3B_sites<-RAB3B_sites[1:6,c(4,35)] View(RAB3B_sites) RAB3B_sites$Name # "cg01417615" "cg02395396" "cg05929645" "cg07808546" "cg18194821" #"cg22429822" RAB3B_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg01417615",] RAB3B_bval2<-t(RAB3B_bval) RAB3B_bval2<-as.data.frame(RAB3B_bval2) RAB3B_bval2$ID<-row.names(RAB3B_bval2) RAB3B_exr<-mRNA_exprSet_all_vst[row.names(mRNA_exprSet_all_vst)=="RAB3B",] RAB3B_exr2<-t(RAB3B_exr) RAB3B_exr2<-as.data.frame(RAB3B_exr2) RAB3B_exr2$ID<-row.names(RAB3B_exr2) RAB3B_pearson<-inner_join(RAB3B_exr2,RAB3B_bval2) g<-ggplot(RAB3B_pearson, aes(x=cg01417615, y=RAB3B)) + geom_point()+ #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+ theme_bw()+ stat_smooth(method=lm,se=T)+ labs(title=paste0("R = ",round(cor.test(RAB3B_pearson$cg01417615,RAB3B_pearson$RAB3B)$estimate,3),", P = ", round(cor.test(RAB3B_pearson$cg01417615,RAB3B_pearson$RAB3B)$p.value,3)), x="Methylation of RAB3B promoter",y="RAB3B expression")+ theme( plot.title = element_text(hjust = 0.5,size = 15), axis.text.y = element_text(size = 20), axis.text.x = element_text(size = 20), axis.title.x=element_text(size = 20), axis.line.x = element_line(size = 0.5, colour = "black"), axis.title.y=element_text(size = 20), axis.line.y = element_line(size = 0.5, colour = "black"), ) g ##IGFBP1 IGFBP1_sites<-filter(ann450k2,gene_id=="IGFBP1") View(IGFBP1_sites) IGFBP1_sites<-IGFBP1_sites[1:2,c(4,35)] View(IGFBP1_sites) IGFBP1_sites$Name # "cg01310829" "cg05823605" IGFBP1_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg01310829",] IGFBP1_bval2<-t(IGFBP1_bval) IGFBP1_bval2<-as.data.frame(IGFBP1_bval2) IGFBP1_bval2$ID<-row.names(IGFBP1_bval2) IGFBP1_exr<-mRNA_exprSet_all_vst[row.names(mRNA_exprSet_all_vst)=="IGFBP1",] IGFBP1_exr2<-t(IGFBP1_exr) IGFBP1_exr2<-as.data.frame(IGFBP1_exr2) IGFBP1_exr2$ID<-row.names(IGFBP1_exr2) IGFBP1_pearson<-inner_join(IGFBP1_exr2,IGFBP1_bval2) which(IGFBP1_pearson$IGFBP1>12) IGFBP1_pearson<-IGFBP1_pearson[c(-55,-88,-218),] g<-ggplot(IGFBP1_pearson, aes(x=cg01310829, y=IGFBP1)) + geom_point()+ #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+ theme_bw()+ stat_smooth(method=lm,se=T)+ labs(title=paste0("R = ",round(cor.test(IGFBP1_pearson$cg01310829,IGFBP1_pearson$IGFBP1)$estimate,3),", P = ", round(cor.test(IGFBP1_pearson$cg01310829,IGFBP1_pearson$IGFBP1)$p.value,3)), x="Methylation of IGFBP1 promoter",y="IGFBP1 expression")+ theme( plot.title = element_text(hjust = 0.5,size = 15), axis.text.y = element_text(size = 20), axis.text.x = element_text(size = 20), axis.title.x=element_text(size = 20), axis.line.x = element_line(size = 0.5, colour = "black"), axis.title.y=element_text(size = 20), axis.line.y = element_line(size = 0.5, colour = "black"), ) g ##VSTM2L VSTM2L_sites<-filter(ann450k2,gene_id=="VSTM2L") View(VSTM2L_sites) VSTM2L_sites<-VSTM2L_sites[2:5,c(4,35)] VSTM2L_sites$Name # "cg09337248" "cg09565639" "cg12163925" "cg17071589" VSTM2L_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg09565639",] VSTM2L_bval2<-t(VSTM2L_bval) VSTM2L_bval2<-as.data.frame(VSTM2L_bval2) VSTM2L_bval2$ID<-row.names(VSTM2L_bval2) VSTM2L_exr<-mRNA_exprSet_all_vst[row.names(mRNA_exprSet_all_vst)=="VSTM2L",] VSTM2L_exr2<-t(VSTM2L_exr) VSTM2L_exr2<-as.data.frame(VSTM2L_exr2) VSTM2L_exr2$ID<-row.names(VSTM2L_exr2) VSTM2L_pearson<-inner_join(VSTM2L_exr2,VSTM2L_bval2) which(VSTM2L_pearson$VSTM2L>13.5) VSTM2L_pearson<-VSTM2L_pearson[c(-36,-283),] g<-ggplot(VSTM2L_pearson, aes(x=cg09565639, y=VSTM2L)) + geom_point()+ #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+ theme_bw()+ stat_smooth(method=lm,se=T)+ labs(title=paste0("R = ",round(cor.test(VSTM2L_pearson$cg09565639,VSTM2L_pearson$VSTM2L)$estimate,3),", P = ", round(cor.test(VSTM2L_pearson$cg09565639,VSTM2L_pearson$VSTM2L)$p.value,3)), x="Methylation of VSTM2L promoter",y="VSTM2L expression")+ theme( plot.title = element_text(hjust = 0.5,size = 15), axis.text.y = element_text(size = 20), axis.text.x = element_text(size = 20), axis.title.x=element_text(size = 20), axis.line.x = element_line(size = 0.5, colour = "black"), axis.title.y=element_text(size = 20), axis.line.y = element_line(size = 0.5, colour = "black"), ) g ##EVX2 EVX2_sites<-filter(ann450k2,gene_id=="EVX2") View(EVX2_sites) EVX2_sites<-EVX2_sites[3:4,c(4,35)] EVX2_sites$Name # "cg07536910" "cg07939836" EVX2_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg07939836",] EVX2_bval2<-t(EVX2_bval) EVX2_bval2<-as.data.frame(EVX2_bval2) EVX2_bval2$ID<-row.names(EVX2_bval2) EVX2_exr<-mRNA_exprSet_all_vst[row.names(mRNA_exprSet_all_vst)=="EVX2",] EVX2_exr2<-t(EVX2_exr) EVX2_exr2<-as.data.frame(EVX2_exr2) EVX2_exr2$ID<-row.names(EVX2_exr2) EVX2_pearson<-inner_join(EVX2_exr2,EVX2_bval2) which(EVX2_pearson$EVX2>7) EVX2_pearson<-EVX2_pearson[c(-31,-271),] g<-ggplot(EVX2_pearson, aes(x=cg07939836, y=EVX2)) + geom_point()+ #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+ theme_bw()+ stat_smooth(method=lm,se=T)+ labs(title=paste0("R = ",round(cor.test(EVX2_pearson$cg07939836,EVX2_pearson$EVX2)$estimate,3),", P = ", round(cor.test(EVX2_pearson$cg07939836,EVX2_pearson$EVX2)$p.value,3)), x="Methylation of EVX2 promoter",y="EVX2 expression")+ theme( plot.title = element_text(hjust = 0.5,size = 15), axis.text.y = element_text(size = 20), axis.text.x = element_text(size = 20), axis.title.x=element_text(size = 20), axis.line.x = element_line(size = 0.5, colour = "black"), axis.title.y=element_text(size = 20), axis.line.y = element_line(size = 0.5, colour = "black"), ) g ##gene expression in CIMP-related groups ##SLC7A2 load("./clust_result.Rdata") load("./mRNA_exprSet_all_vst.Rdata") View(clust_result) View(head(mRNA_exprSet_all_vst)) SLC7A2_CIMP<-t(mRNA_exprSet_all_vst) View(head(SLC7A2_CIMP)) dim(SLC7A2_CIMP) SLC7A2_CIMP<-as.data.frame(SLC7A2_CIMP) require(dplyr) SLC7A2_CIMP2<-dplyr::select(SLC7A2_CIMP,SLC7A2) SLC7A2_CIMP2$ID<-row.names(SLC7A2_CIMP) View(SLC7A2_CIMP2) SLC7A2_CIMP3<-inner_join(SLC7A2_CIMP2,clust_result) View(SLC7A2_CIMP3) which(SLC7A2_CIMP3$SLC7A2>15) SLC7A2_CIMP3<-SLC7A2_CIMP3[c(-240,-302),] names(SLC7A2_CIMP3) dim(SLC7A2_CIMP3) SLC7A2_CIMP3$cluster_result<-factor(SLC7A2_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H")) ###箱线图 require(ggpubr) p<-ggplot(SLC7A2_CIMP3,aes(x = cluster_result, y = SLC7A2,fill=cluster_result))+ geom_boxplot(outlier.color = NA)+ labs(x="Cluster", y = "SLC7A2 expression")+ theme_classic() my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") ) p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+ stat_compare_means(method = "kruskal.test") + #label的位置 stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(13.7,13,14.3))+ ylim(6.5,14.5) ##RAB3B View(clust_result) View(head(mRNA_exprSet_all_vst)) RAB3B_CIMP<-t(mRNA_exprSet_all_vst) dim(RAB3B_CIMP) RAB3B_CIMP<-as.data.frame(RAB3B_CIMP) RAB3B_CIMP2<-dplyr::select(RAB3B_CIMP,RAB3B) RAB3B_CIMP2$ID<-row.names(RAB3B_CIMP2) RAB3B_CIMP3<-inner_join(RAB3B_CIMP2,clust_result) #which(SLC7A2_CIMP3$SLC7A2>15) #SLC7A2_CIMP3<-SLC7A2_CIMP3[c(-240,-302),] dim(RAB3B_CIMP3) RAB3B_CIMP3$cluster_result<-factor(RAB3B_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H")) ###箱线图 require(ggpubr) p<-ggplot(RAB3B_CIMP3,aes(x = cluster_result, y = RAB3B,fill=cluster_result))+ geom_boxplot(outlier.color = NA)+ labs(x="Cluster", y = "RAB3B expression")+ theme_classic() my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") ) p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+ stat_compare_means(method = "kruskal.test") + #label的位置 stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(13.5,13,14))+ ylim(6.5,14.5) ##IGFBP1 View(clust_result) View(head(mRNA_exprSet_all_vst)) IGFBP1_CIMP<-t(mRNA_exprSet_all_vst) dim(IGFBP1_CIMP) IGFBP1_CIMP<-as.data.frame(IGFBP1_CIMP) IGFBP1_CIMP2<-dplyr::select(IGFBP1_CIMP,IGFBP1) IGFBP1_CIMP2$ID<-row.names(IGFBP1_CIMP2) IGFBP1_CIMP3<-inner_join(IGFBP1_CIMP2,clust_result) which(IGFBP1_CIMP3$IGFBP1>12) IGFBP1_CIMP3<-IGFBP1_CIMP3[c(-55,-88,-218),] dim(IGFBP1_CIMP3) IGFBP1_CIMP3$cluster_result<-factor(IGFBP1_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H")) ###箱线图 require(ggpubr) p<-ggplot(IGFBP1_CIMP3,aes(x = cluster_result, y = IGFBP1,fill=cluster_result))+ geom_boxplot(outlier.color = NA)+ labs(x="Cluster", y = "IGFBP1 expression")+ theme_classic() my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") ) p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+ stat_compare_means(method = "kruskal.test") + #label的位置 stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(10.3,10.5,10.8))+ ylim(5.5,11) #VSTM2L View(clust_result) View(head(mRNA_exprSet_all_vst)) VSTM2L_CIMP<-t(mRNA_exprSet_all_vst) dim(VSTM2L_CIMP) VSTM2L_CIMP<-as.data.frame(VSTM2L_CIMP) VSTM2L_CIMP2<-dplyr::select(VSTM2L_CIMP,VSTM2L) VSTM2L_CIMP2$ID<-row.names(VSTM2L_CIMP2) VSTM2L_CIMP3<-inner_join(VSTM2L_CIMP2,clust_result) which(VSTM2L_CIMP3$VSTM2L>13.5) VSTM2L_CIMP3<-VSTM2L_CIMP3[c(-36,-283),] dim(VSTM2L_CIMP3) VSTM2L_CIMP3$cluster_result<-factor(VSTM2L_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H")) ###箱线图 require(ggpubr) p<-ggplot(VSTM2L_CIMP3,aes(x = cluster_result, y = VSTM2L,fill=cluster_result))+ geom_boxplot(outlier.color = NA)+ labs(x="Cluster", y = "VSTM2L expression")+ theme_classic() my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") ) p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+ stat_compare_means(method = "kruskal.test") + #label的位置 stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(12.5,12,13))+ ylim(5.5,13) #EVX2 View(clust_result) View(head(mRNA_exprSet_all_vst)) EVX2_CIMP<-t(mRNA_exprSet_all_vst) dim(EVX2_CIMP) EVX2_CIMP<-as.data.frame(EVX2_CIMP) EVX2_CIMP2<-dplyr::select(EVX2_CIMP,EVX2) EVX2_CIMP2$ID<-row.names(EVX2_CIMP2) EVX2_CIMP3<-inner_join(EVX2_CIMP2,clust_result) which(EVX2_CIMP3$EVX2>7) EVX2_CIMP3<-EVX2_CIMP3[c(-20,-32,-243,-273),] dim(EVX2_CIMP3) EVX2_CIMP3$cluster_result<-factor(EVX2_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H")) ###箱线图 require(ggpubr) p<-ggplot(EVX2_CIMP3,aes(x = cluster_result, y = EVX2,fill=cluster_result))+ geom_boxplot(outlier.color = NA)+ labs(x="Cluster", y = "EVX2 expression")+ theme_classic() my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") ) p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+ stat_compare_means(method = "kruskal.test") + #label的位置 stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(6.8,7,7.2))+ ylim(5.5,7.5) #CST6 View(clust_result) View(head(mRNA_exprSet_all_vst)) CST6_CIMP<-t(mRNA_exprSet_all_vst) dim(CST6_CIMP) CST6_CIMP<-as.data.frame(CST6_CIMP) CST6_CIMP2<-dplyr::select(CST6_CIMP,CST6) CST6_CIMP2$ID<-row.names(CST6_CIMP2) CST6_CIMP3<-inner_join(CST6_CIMP2,clust_result) which(CST6_CIMP3$CST6>12) CST6_CIMP3<-CST6_CIMP3[c(-69,-127,-195,-285),] dim(CST6_CIMP3) CST6_CIMP3$cluster_result<-factor(CST6_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H")) ###箱线图 require(ggpubr) p<-ggplot(CST6_CIMP3,aes(x = cluster_result, y = CST6,fill=cluster_result))+ geom_boxplot(outlier.color = NA)+ labs(x="Cluster", y = "CST6 expression")+ theme_classic() my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") ) p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+ stat_compare_means(method = "kruskal.test") + #label的位置 stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(8.8,9.2,9.5))+ ylim(5.5,10) ##GO and KEGG analysis in riskscore related groups load("final2.Rdata") View(final2) metadata <- data.frame(row.names(final2)) View(metadata) names(metadata)<-"ID" class(metadata$ID) class(tcga_score$ID) metadata<-inner_join(metadata,tcga_score) View(metadata) class(metadata) metadata<-metadata[,-2] length(names(mRNA_exprSet)) length(metadata$ID) b<-vector() for(i in 1:length(metadata$ID)) { b[i]<-which(str_sub(names(mRNA_exprSet),1,15)==metadata$ID[i]) } View(b) View(head(mRNA_exprSet)) mRNA_exprSet<-mRNA_exprSet[,c(1,b)] View(head(mRNA_exprSet)) View(metadata) names(metadata) <- c("TCGA_id","sample") metadata$sample <- factor(metadata$sample,levels = c("low","high")) mycounts <- mRNA_exprSet keepGene=rowSums(edgeR::cpm(mycounts[-1])>0) >=2 table(keepGene) dim(mycounts) mycounts <-mycounts[keepGene,] library(DESeq2) dds <-DESeqDataSetFromMatrix(countData=mycounts, colData=metadata, design=~sample, tidy=TRUE) dds$sample dds <- DESeq(dds) dds_riskhl<-dds metadata$sample save(dds_riskhl,file = "dds_riskhl_DEseq.RData") load("./res_riskhl_result.Rdata") View(res_riskhl) res<-res_riskhl P <- 0.05 foldChange <- 2 diffSig = res[(res$padj < P & (res$log2FoldChange>foldChange | res$log2FoldChange<(-foldChange))),] View(diffSig) genelist3<-res$log2FoldChange names(genelist3) = res$symbol library(GOplot) library(clusterProfiler) gene<-names(genelist3)[abs(genelist3)>2] library(org.Hs.eg.db) gene <- bitr(gene, fromType = "SYMBOL", toType = c("ENTREZID"), OrgDb = org.Hs.eg.db) library(org.Hs.eg.db) ego<-enrichGO(gene = gene, OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 0.01, qvalueCutoff = 0.01, readable = TRUE) cnetplot(ego, categorySize="pvalue") cnetplot(ego, circular = TRUE, colorEdge = TRUE) kegg<-enrichKEGG(gene = gene, organism ="human", pvalueCutoff = 0.01, qvalueCutoff = 0.01, minGSSize = 1, #readable = TRUE , use_internal_data =FALSE) cnetplot(kegg, categorySize="pvalue",foldChange = genelist3)