#CIMP in gastric cancer
rm(list = ls())
options( stringsAsFactors = F )
setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad")

###Processing methylation data

Data<-read_tsv("./TCGA-STAD.methylation450.tsv")      ###Load methylation data
load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/Illumina450k_annotation.RData")   ###load Illumina450k_annotation
class(Data)
View(head(Data))
Data<-as.data.frame(Data)
class(Data)
View(head(Data))
dim(Data)
# 485577    398
row.names(Data)<-Data$`Composite Element REF`
View(head(row.names(Data)))
Data<-Data[,-1]
View(head(Data))
dim(Data)
#485577    397
Data1 <- na.omit(Data) ##Remove missing values
dim(Data1)
sum(duplicated(rownames(Data1)))
#0
matData1 = Data1
load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/Illumina450k_annotation.RData")
keep <- !(row.names(matData1) %in% ann450k2$Name[ann450k2$chr %in% 
                                                   c("chrX","chrY")])
table(keep)
# FALSE   TRUE 
#9078 364255 
matData1 <- matData1[keep,] ###Remove probes from sex chromosomes
library(minfi)
matData1<-as.matrix(matData1)
View(head(matData1))
grset=makeGenomicRatioSetFromMatrix(matData1,what="Beta")
class(grset)
mSetSqFlt <- dropLociWithSnps(grset) ##remove probes with SNPs at CpG site
dim(mSetSqFlt)

library(methylationArrayAnalysis)
dataDirectory <- system.file("extdata", package = "methylationArrayAnalysis")
xReactiveProbes <- read.csv(file=paste(dataDirectory,
                                       "48639-non-specific-probes-Illumina450k.csv",
                                       sep="/"), stringsAsFactors=FALSE)
keep <- !(featureNames(mSetSqFlt) %in% xReactiveProbes$TargetID)  ###exclude cross reactive probes
table(keep)
# FALSE   TRUE 
#14161 348497 
mSetSqFlt <- mSetSqFlt[keep,] 
dim(mSetSqFlt)

bVals <- getBeta(mSetSqFlt) ####Get beta value
dim(bVals)
View(bVals)
save(bVals,file = "bVals.Rdata")

BiocManager::install("lumi")
require(lumi)
mVals<-beta2m(bVals) ####Get M value
dim(mVals)
#348497    397
View(head(mVals))
save(mVals,file = "mVals.Rdata")

###Get Clinical Information Matrix
metadata <- data.frame(colnames(mVals))

for (i in 1:length(metadata[,1])) {
  num <- as.numeric(substring(metadata[i,1],14,15))
  
  if (num %in% seq(1,9)) {metadata[i,2] <- "Tumor"}
  
  if (num %in% seq(10,29)) {metadata[i,2] <- "Normal"}
}
names(metadata) <- c("TCGA_id","sample")
View(metadata)
metadata$sample <- as.factor(metadata$sample)
require(dplyr)
metadata %>% dplyr::group_by(sample) %>% summarise(n())
save(metadata,file = "metadata.Rdata")
sum(metadata$TCGA_id == colnames(mVals))

View(head(mVals))
Normal_barcode <- colnames(mVals)[metadata$sample =="Normal"]
mVals_df <- as.data.frame(mVals)
Normal_mVals <- mVals_df %>%
  dplyr::select(Normal_barcode)
dim(Normal_mVals)
save(Normal_mVals,file = "Normal_mVals.Rdata")

Tumor_barcode <- colnames(mVals)[metadata$sample =="Tumor"]
Tumor_mVals <- mVals_df %>%
  dplyr::select(Tumor_barcode)
dim(Tumor_mVals)
colnames(Tumor_mVals) <- substr(colnames(Tumor_mVals),1,15)
Tumor_mVals = Tumor_mVals[,!duplicated(colnames(Tumor_mVals))]
dim(Tumor_mVals)
Tumor_mVals <- Tumor_mVals[,substr(colnames(Tumor_mVals),14,15) == "01"]
dim(Tumor_mVals)

sum(metadata$TCGA_id == colnames(bVals))
View(metadata)
View(bVals)
Normal_barcode <- colnames(bVals)[metadata$sample =="Normal"]
bVals_df <- as.data.frame(bVals)
Normal_bVals <- bVals_df %>%dplyr::select(Normal_barcode)  ###Get the beta value of normal tissue
dim(Normal_bVals)
save(Normal_bVals,file = "Normal_bVals.Rdata")
Tumor_barcode <- colnames(bVals)[metadata$sample =="Tumor"]  ###Get the beta value of tumor tissue
Tumor_bVals <- bVals_df %>% dplyr::select(Tumor_barcode)
dim(Tumor_bVals)
colnames(Tumor_bVals) <- substr(colnames(Tumor_bVals),1,15)     ###Reduce duplication in tumor samples
Tumor_bVals = Tumor_bVals[,!duplicated(colnames(Tumor_bVals))]
dim(Tumor_bVals)
Tumor_bVals <- Tumor_bVals[,substr(colnames(Tumor_bVals),14,15) == "01"]
dim(Tumor_bVals)

###Unsupervised clustering 

Tumor_bVals$SD <- apply(Tumor_bVals,1,sd)          ###Screen for methylation sites with variability greater than 0.2
sum(Tumor_bVals$SD>0.20)
inclusion_Tumor <- (Tumor_bVals$SD>0.20)
View(inclusion_Tumor)
final_Tumor <- row.names(Tumor_bVals)[inclusion_Tumor]
length(final_Tumor)
Normal_bVals$average <- apply(Normal_bVals,1,mean)     #relatively low β-values in normal samples
inclusion_Normal <- (Normal_bVals$average < 0.05)
final_Normal <- row.names(Normal_bVals)[inclusion_Normal ]
length(final_Normal )
clust_probe <- intersect(final_Tumor,final_Normal)  ##Screening probes
length(clust_probe)
save(clust_probe,file = "clust_probe.Rdata")
ann450k2_CIMP <- ann450k2 %>% dplyr::filter(Name %in% clust_probe)
dim(ann450k2_CIMP)
save(ann450k2_CIMP,file = "ann450k2_CIMP.Rdata")

load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/clust_probe.Rdata")
length(clust_probe)
Tumor_mVals3 <- Tumor_mVals[row.names(Tumor_mVals) %in% clust_probe,]
dim(Tumor_mVals3)
Tumor_mVals4 <- as.matrix(Tumor_mVals3)
require(ConsensusClusterPlus)
set.seed(2019)
results<-ConsensusClusterPlus(Tumor_mVals4,maxK=10,pItem=0.8,            ####ConsensusCluster
                              clusterAlg = 'km',distance='euclidean',
                              plot="pdf",seed= 200000)
View(results)
clust_result<- results[[3]][["consensusClass"]]
clust_result <- as.data.frame(clust_result)
View(clust_result)
clust_result$clust_result[clust_result$clust_result ==1] <- "CIMP-L"
clust_result$clust_result[clust_result$clust_result ==2] <- "CIMP-M"
clust_result$clust_result[clust_result$clust_result ==3] <- "CIMP-H" 
clust_result$patient_ID<-row.names(clust_result)
clust_result<-clust_result[,c(2,1)]
names(clust_result)[2]<-"cluster_result"
View(clust_result)
write.csv(clust_result,file = "clust_result.csv",row.names = F)
save(clust_result,file = "clust_result.Rdata")


###read clinical information
stad_tcga_clinical_data <- read.delim("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/stad_tcga_clinical_data.tsv")

stad_tcga_clinical_data<-stad_tcga_clinical_data[,c(3,4,5,6,7,9,12,23,24,28,31,32,35,44,45,53,54,61,64,67,73)]
names(stad_tcga_clinical_data)
View(stad_tcga_clinical_data)
names(stad_tcga_clinical_data)<-c("ID","Age","Pathologic_M","Pathologic_N","Pathologic_Stage",
                                  "Pathologic_T","Type","DFS_time","DFS","Family_History",
                                  "Grade","Type.name","H.P","Positive.N","Examined.N","OS_time",
                                  "OS","Primary.site","Race","Margin","Gender")
save(stad_tcga_clinical_data,file = "stad_tcga_clinical_data.Rdata")

TCGA_clinical <- stad_tcga_clinical_data
final1 <- inner_join(TCGA_clinical,clust_result)
dim(final1)

View(final1)
final1$OS[final1$OS == "DECEASED"] <- 1
final1$OS[final1$OS == "LIVING"] <- 0
final1$OS <- as.numeric(final1$OS)
table(final1$DFS)
final1$DFS[final1$DFS == "Recurred/Progressed"] <- 1
final1$DFS[final1$DFS == "DiseaseFree"] <- 0
final1$DFS <- as.numeric(final1$DFS)

### draw the Heatmap

clust_result1 <- clust_result[order(clust_result$clust_result),]
View(clust_result1)
Tumor_bVals5 <- Tumor_bVals3 %>% dplyr::select(clust_result1$ID)
load("./final10.Rdata")
lowname<-final10$ID[final10$clust_result=="CIMP-L"]
middlename<-final10$ID[final10$clust_result=="CIMP-M"]
highname<-final10$ID[final10$clust_result=="CIMP-H"]
allname<-c(highname,lowname,middlename)
Tumor_bVals5<-select(Tumor_bVals5,c(lowname,middlename,highname))
sum(colnames(Tumor_bVals5)==allname2)
library(ComplexHeatmap)
library(matrixStats)
library(circlize)
library(RColorBrewer)
require(ggplot2)
require(ggpubr)
library(tableone)
library(ggtern)
ha = HeatmapAnnotation(Cluster = final10$clust_result,
                       Age = final10$Age,
                       Gender = final10$Gender,
                       Grade = final10$Grade,
                       Pathologic_M = final10$Pathologic_M,
                       Pathologic_N = final10$Pathologic_N,
                       Pathologic_T = final10$Pathologic_T,
                       Stage = final10$Pathologic_Stage,
                       Pathologic_Location = final10$Primary.site,
                       DEAD = final10$OS,
                       HP.infection = final10$H.P,
                       Reccurence = final10$DFS,
                       Family_History = final10$Family_History,
                       MSI.status=final10$MSI.status,
                       col = list(Cluster = structure(names = c("CIMP-L", "CIMP-M", "CIMP-H"), c("#4DAF4A","#377EB8","#E41A1C")),
                                  Age = structure(names = c("<60 years", ">60 years"),c("#8DD3C7","#FFFFB3")),
                                  Gender = structure(names = c("Female", "Male"),c("#BEBADA","#FB8072")),
                                  Grade = structure(names = c("G1/G2", "G3/G4"),c("#80B1D3","#FDB462")),
                                  Pathologic_M = structure(names = c("M0", "M1/MX"), c("#B3DE69","#FCCDE5")),
                                  Pathologic_N = structure(names = c("N0", "N1/2/3"),c("#D9D9D9","#BC80BD")),
                                  Pathologic_T = structure(names = c("T1+T2", "T3+T4"),c("#CCEBC5","#FFED6F")),
                                  Stage = structure(names = c("Stage I","Stage II", "Stage III","Stage IV"),brewer.pal(4, "Paired")),
                                  Pathologic_Location= structure(names = c("Antrum/Distal", "Cardia/Proximal","Fundus/Body"),brewer.pal(3, "Dark2")),
                                  DEAD = structure(names = c("0", "1"),c("white","#66C2A5")),
                                  HP.infection= structure(names = c("No", "Yes"),c("white","#FC8D62")),
                                  Reccurence = structure(names = c("0", "1"),c("white","#8DA0CB")),
                                  Family_History= structure(names = c("NO", "YES"),c("white","#E78AC3")),
                                  MSI.status=structure(names = c("MSI", "MSS"),c("white","#A6D854"))
                       ),
                       na_col = "grey",
                       show_legend = c(TRUE, TRUE, TRUE,TRUE,TRUE,TRUE,TRUE,TRUE,TRUE,T,T,T,T,T),
                       annotation_height = unit(rep(4,13), "mm"),
                       annotation_legend_param = list(
                         Cluster = list(title = "Cluster"),
                         Stage = list(title = "Stage"),
                         Gender  = list(title = "Gender"),
                         Grade  = list(title = "Grade"),
                         Pathologic_M  = list(title = "Pathologic_M"),
                         Pathologic_N  = list(title = "Pathologic_N"),
                         Pathologic_T  = list(title = "Pathologic_T"),
                         Pathologic_Location = list(title = "Pathologic_Location")
                       ))

col_fun = colorRamp2(c(0, 0.5, 1), c("#377EB8", "white", "#E41A1C"))

ht_list= Heatmap(as.matrix(Tumor_bVals5), col = col_fun, name = "Methylation",
                 clustering_distance_rows = "euclidean", row_dend_reorder = TRUE,
                 cluster_columns = FALSE, cluster_rows = FALSE,
                 show_row_dend = FALSE, show_column_dend = FALSE,
                 show_row_names = FALSE, show_column_names = FALSE,
                 bottom_annotation = ha, column_title = qq("STAD samples (n = @{ncol(Tumor_bVals5)})"),
                 column_title_gp = gpar(font = 2),
                 row_title_gp = gpar(col = "#FFFFFF00"))+
  Heatmap(Normal_bVals1, col = col_fun, cluster_rows = F,show_row_names = FALSE, show_column_names = FALSE, 
          show_column_dend = FALSE, column_title = "Normal",
          column_title_gp = gpar(font = 2),
          show_heatmap_legend = FALSE, width = unit(1, "cm"))+
  Heatmap(ann450k2_CIMP$Relation_to_Island,
          name = "Relation to CGI", cluster_rows = F,
          show_row_names = FALSE, show_column_names = FALSE, width = unit(5, "mm"),
          col = structure(names = c("Island", "N_Shelf", "N_Shore", "OpenSea","S_Shelf","S_Shore"), 
                          c("#377EB8","#4DAF4A","#984EA3","#FF7F00","#E41A1C","#56B4E9")))+
  Heatmap(ann450k2_CIMP$island_location,
          name = "Relation to TSS", cluster_rows = F,
          show_row_names = FALSE, show_column_names = FALSE, 
          width = unit(5, "mm"),structure(names = c("Body","TSS200","5'UTR", "TSS1500","Intergenic","1stExon","3'UTR"), 
                                          brewer.pal(7, "Set1")))

draw(ht_list, annotation_legend_side = "left", heatmap_legend_side = "left")

annotation_titles = c(Cluster = "Cluster",
                      Age = "Age",
                      Gender = "Gender",
                      Grade = "Grade",
                      Pathologic_M = "Pathologic_M",
                      Pathologic_N = "Pathologic_N",
                      Pathologic_T = "Pathologic_T",
                      Stage = "Stage",
                      Pathologic_Location = 'Pathologic_Location',
                      DEAD = "DEAD",
                      HP.infection = "HP.infection",
                      Reccurence = "Reccurence",
                      Family_History = "Family_History",
                      MSI.status="MSI.status"
)
for(an in names(annotation_titles)) {
  decorate_annotation(an, {
    grid.text(annotation_titles[an], unit(-2, "mm"), just = "right",gp = gpar(font = 2,cex=0.8))
    grid.rect(gp = gpar(fill = NA, col = "black"))
  })
}


###draw the OS curve

require(survival)
fit <- survfit(Surv(OS_time, OS) ~ clust_result, data = final1)
require(survminer)
ggsurv <- ggsurvplot(
  fit,                     # survfit object with calculated statistics.
  data = final1, 
  #palette= c("#E41A1C","#4DAF4A"),          # data used to fit survival curves.
  risk.table = TRUE,       # show risk table.
  pval = TRUE,             # show p-value of log-rank test.
  conf.int = F,
  xlim = c(0,60),    # present narrower X axis, but not affect
  # survival estimates.
  xlab = "Time in months",   # customize X axis label.
  ylab = "Overall Survival Rate",
  break.time.by = 6,     # break X axis in time intervals by 500.
  ggtheme = theme_classic2(), # customize plot and risk table with a theme.
  risk.table.y.text.col = T,# colour risk table text annotations.
  risk.table.height = 0.25, # the height of the risk table
  risk.table.y.text = F,# show bars instead of names in text annotations
  # in legend of risk table.
  ncensor.plot = F,
  conf.int.style = "step", # add the median survival pointer.
  legend.labs = 
    c("CIMP-H","CIMP-L","CIMP-M")# change legend labels.
)
ggsurv$plot <- ggsurv$plot + labs(title = "TCGA Gastric Cancer Cohort") +
  theme(axis.title.x = element_blank())+ theme(plot.title = element_text(hjust = 0.5),)+
  theme(legend.text = element_text(size = 12,face = 'bold'))

ggsurv$plot <- ggpar(
  ggsurv$plot,
  font.title    = c(16, "bold", "darkblue"),          
  font.y        = c(14, "bold.italic", "darkred"),
  font.x = c(14, "bold.italic", "blue") ,
  font.xtickslab = c(12, "plain", "darkgreen","bold"),
  font.ytickslab = c(10,"darkgreen","bold"),
  legend = "top",legend.title = "",
)

ggsurv$table <- ggpar(
  ggsurv$table,
  font.title    = c(13, "bold.italic", "#E41A1C"))+ 
  theme(axis.line = element_blank(),axis.ticks = element_blank(),
        axis.title = element_blank(),axis.text.x = element_blank())
ggsurv$table <- ggsurv$table + theme_cleantable()
ggsurv

##draw the PFS curve
fit <- survfit(Surv(DFS_time, DFS) ~ clust_result, data = final2)
ggsurv <- ggsurvplot(
  fit,                     # survfit object with calculated statistics.
  data = final1, 
  #palette= c("#E41A1C","#4DAF4A"),          # data used to fit survival curves.
  risk.table = TRUE,       # show risk table.
  pval = TRUE,             # show p-value of log-rank test.
  conf.int = F,
  xlim = c(0,60),    # present narrower X axis, but not affect
  # survival estimates.
  xlab = "Time in months",   # customize X axis label.
  ylab = "Recurrence-Free Survival Rate",
  break.time.by = 6,     # break X axis in time intervals by 500.
  ggtheme = theme_classic2(), # customize plot and risk table with a theme.
  risk.table.y.text.col = T,# colour risk table text annotations.
  risk.table.height = 0.25, # the height of the risk table
  risk.table.y.text = F,# show bars instead of names in text annotations
  # in legend of risk table.
  ncensor.plot = F,
  conf.int.style = "step", # add the median survival pointer.
  legend.labs = 
    c("CIMP-H","CIMP-L")# change legend labels.
)

ggsurv$plot <- ggsurv$plot + labs(title = "TCGA Stomach Cancer  cohort") +
  theme(axis.title.x = element_blank())+ theme(plot.title = element_text(hjust = 0.5),)+
  theme(legend.text = element_text(size = 12,face = 'bold'))


ggsurv$plot <- ggpar(
  ggsurv$plot,
  font.title    = c(16, "bold", "darkblue"),          
  font.y        = c(14, "bold.italic", "darkred"),
  font.x = c(14, "bold.italic", "blue") ,
  font.xtickslab = c(12, "plain", "darkgreen","bold"),
  font.ytickslab = c(10,"darkgreen","bold"),
  legend = "top",legend.title = "",
)

ggsurv$table <- ggpar(
  ggsurv$table,
  font.title    = c(13, "bold.italic", "#E41A1C"))+ 
  theme(axis.line = element_blank(),axis.ticks = element_blank(),
        axis.title = element_blank(),axis.text.x = element_blank())
ggsurv$table <- ggsurv$table + theme_cleantable()
ggsurv

####The abundance of tumor-infiltrating immune cells


setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad")
immune<-read.table("./Immnue-cibersort.csv",header = T,sep=",",stringsAsFactors = F)
View(immune)
names(immune)[1]<-"ID"
load("./final7.Rdata")
View(final7)
require(dplyr)
final8<-dplyr::select(final7,ID,clust_result)
View(final8)
immune_score<-inner_join(final8,immune)
View(immune_score)
immune_score<-immune_score[,c(-25,-26,-27)]
summary(immune_score)
names(immune_score)
class(immune_score$clust_result)
immune_score$clust_result<-factor(immune_score$clust_result,levels=c("CIMP-L","CIMP-M","CIMP-H"))

row.names(immune_score)<-immune_score$ID
names(immune_score)[11] <- "T.cells.regulatory.Tregs"
CIMP_H <- immune_score$ID[immune_score$clust_result =="CIMP-H"]
CIMP_M <- immune_score$ID[immune_score$clust_result =="CIMP-M"]
CIMP_L <- immune_score$ID[immune_score$clust_result =="CIMP-L"]
length(CIMP_H)
#68
length(CIMP_M)
#112
length(CIMP_L)
#191
View(immune_score)
class(immune_score)
im1<-immune_score[,-1]
view(im1)
im1$clust_result <- factor(im1$clust_result,levels = c("CIMP-L","CIMP-M","CIMP-H"))
im1$ID<-row.names(im1)
im1<-dplyr::select(im1,ID,everything())
require(reshape2)
im3<- melt(im1[,-1],id.vars = c("clust_result"))
view(im3)
class(im3$value)
im3$value<-im3$value*100

pvalue_im3 <- im3 %>% group_by(variable) %>%
  summarize(p.value = kruskal.test(value ~ clust_result)$p.value)
View(pvalue_im3)

sig <- rep("",22)
for (i in seq(22)) {
  if (pvalue_im3$p.value[i] < 0.001)
    sig[i] <- "***" else
      if (pvalue_im3$p.value[i] < 0.01)
        sig[i] <- "**" else
          if (pvalue_im3$p.value[i] < 0.05)
            sig[i] <- "*"
}
#pdf("vioplot.pdf", width=15, height=7)

ggplot(im3)+
  geom_boxplot(aes(x=variable,y=value,fill=clust_result),width=0.6,position = position_dodge(0.6),outlier.color = NA,
               outlier.size = 0.01)+
  scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"),
                    breaks=c("CIMP-L","CIMP-M","CIMP-H"),
                    labels=c("CIMP-L","CIMP-M","CIMP-H"))+
  
  # geom_smooth(data=c3,aes(x=variable2,y=value,color=group),size=1,linetype = "dashed")+
  #stat_summary(fun.y = mean, geom = "errorbar", aes(x=variable,y=value,ymax = ..y.., ymin = ..y..,color=group),width = .75, linetype = "dashed")
  xlab("")+
  ylab("Fraction of immune cells (%)")+
  
  
  theme_bw()+
  coord_cartesian(ylim = c(0,45))+
  theme(
    
    legend.position = "top",
    legend.background=element_blank(),
    legend.key = element_blank(),
    legend.margin=margin(0,0,0,0,"mm"),
    axis.text.x = element_text(angle = 60,hjust=1,size=12),
    axis.line.x = element_line(size = 0.5, colour = "black"),
    axis.title.y=element_text(size = 14),
    axis.line.y = element_line(size = 0.5, colour = "black"),
    legend.text=element_text(size=rel(1.1)),
    legend.title=element_blank()
  )+
  guides(color=T)+
  annotate("text",x=seq(1,22),y = 45,label=sig)
#dev.off()

###draw the mutation profiles

setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad")
library(maftools)
#BiocManager::install("maftools")
stad.mutect2.maf <- GDCquery_Maf("STAD", pipelines = "mutect2")
class(stad.mutect2.maf)
save(stad.mutect2.maf,file = "stad.mutect2.maf.Rdata")
load("./stad.mutect2.maf.Rdata")
write.csv(stad.mutect2.maf,file = "Mutation profile of TCGA cohort.csv",row.names = F)
View(head(stad.mutect2.maf))
getwd()
load("./final5.Rdata")
load("./stad.mutect2.maf.Rdata")
#names(final5)[4]<-"Tumor_Sample_Barcode"
#names(final5)[17]<-"OS.time"

options(stringsAsFactors = F)
require(dplyr)
names(final5)
final_maf<-dplyr::select(final5,ID,clust_result,OS_time,OS)
View(final_maf)
names(final_maf)[1]<-"Tumor_Sample_Barcode"
names(final_maf)[3]<-"OS.time"
require(stringr)
final_maf$Tumor_Sample_Barcode<-str_sub(final_maf$Tumor_Sample_Barcode,1,12)
save(final_maf,file = "final_maf.Rdata")

maf = read.maf(maf = stad.mutect2.maf ,clinicalData = final_maf,isTCGA = T)
stad.maf<- subsetMaf(maf =maf, tsb = final_maf$Tumor_Sample_Barcode, 
                     mafObj = TRUE)

oncoplot(maf = stad.maf, top = 30, fontSize =0.5 ,showTumorSampleBarcodes = F )

#getSampleSummary(stad.maf)
#getClinicalData(stad.maf)
#View(stad.maf@data)
#id=as.character(unique(stad.maf@data$Tumor_Sample_Barcode))

View(Tumor_Sample_Barcode)
require(dplyr)
final5<-final4
require(stringr)
final5$match_id<-str_sub(final5$ID,1,12)
View(final5)
final5<-inner_join(Tumor_Sample_Barcode,final5,by="match_id")
final5<-final5[,-2]
final5 <- final5 %>%
  dplyr::rename(Overall_Survival_Status = OS, time = OS_time)

write.table(final5,'stad_clinical_final5.tsv',sep = '\t',row.names = F,quote = F)
maf_clinical <- subsetMaf(maf = stad.maf, tsb = final5$Tumor_Sample_Barcode, 
                          mafObj = TRUE)
####

plotmafSummary(maf = stad.maf, rmOutlier = TRUE, addStat = 'median', dashboard = TRUE, titvRaw = FALSE)

col = RColorBrewer::brewer.pal(n =8, name = 'Paired')
names(col) = c("Missense_Mutation","Frame_Shift_Del", 'Nonsense_Mutation',"Splice_Site", 
               "In_Frame_Del","Frame_Shift_Ins","In_Frame_Ins","Multi_Hit")

fabcolors = RColorBrewer::brewer.pal(n = 3,name = 'Set1')
names(fabcolors) = c("CIMP-H", "CIMP-M", "CIMP-L")
fabcolors = list(clust_result = fabcolors)

oncoplot(maf = stad.maf,colors = col,
         genes =c("TTN","TP53","MUC16","LRP1B","SYNE1","ARID1A","CSMD3","FLG","FAT4",
                  "HMCN1","ZFHX4","PIK3CA"),
         legendFontSize = 1.5,
         clinicalFeatures = 'clust_result',
         sortByAnnotation = TRUE, 
         annotationColor = fabcolors)
require(ComplexHeatmap)
#decorate_annotation("clust_result", {
#  grid.text("clust_result", x = unit(-0.5, "mm"), just = "right")
#})

#tumor mutation burden

final_L<-filter(final_maf,clust_result=="CIMP-L")
View(final_M)
final_M<-filter(final_maf,clust_result=="CIMP-M")
final_H<-filter(final_maf,clust_result=="CIMP-H")
maf_L<-subsetMaf(maf = stad.maf, tsb = final_L$Tumor_Sample_Barcode, 
                 mafObj = TRUE)
maf_M<-subsetMaf(maf = stad.maf, tsb = final_M$Tumor_Sample_Barcode, 
                 mafObj = TRUE)
maf_H<-subsetMaf(maf = stad.maf, tsb = final_H$Tumor_Sample_Barcode, 
                 mafObj = TRUE)


TMB_L <- as.data.frame(maf_L@variants.per.sample)
TMB_M<- as.data.frame(maf_M@variants.per.sample)
TMB_H<- as.data.frame(maf_H@variants.per.sample)

View(maf_L@variants.per.sample)
View(TMB_L)
TMB_L$TMB <- TMB_L$Variants%/%35
TMB_M$TMB <- TMB_M$Variants%/%35
TMB_H$TMB <- TMB_H$Variants%/%35
View(TMB_L$TMB)
View(TMB_H$TMB)
View(TMB_L)
TMB_L$Cluster <- "CIMP-L"
TMB_M$Cluster <- "CIMP-M"
TMB_H$Cluster <- "CIMP-H"

TMB_total <- rbind(TMB_L,TMB_M,TMB_H)

TMB_total$Cluster <- factor(TMB_total$Cluster,levels = c("CIMP-L","CIMP-M","CIMP-H"))
TMB_total$TMB
my_comparisons <- list(c("CIMP-L", "CIMP-M"), c("CIMP-L", "CIMP-H"), 
                       c("CIMP-M","CIMP-H"))
require(ggpubr)
result_TMB<- as.data.frame(compare_means(TMB~Cluster, data=TMB_total,method = "wilcox.test"))
save(result_TMB,file = "result_TMB.Rdata")

write.table(result,"TMB_CLUSTER.csv",sep=",")
write.table(TMB_total,"TMB_result.csv",sep=",")


p <- ggplot(TMB_total, aes(x = Cluster, y = TMB,color=Cluster))+
  geom_boxplot(outlier.color = NA)+
  stat_compare_means(method = "kruskal.test", label.x = 2,label.y = 50) + #label的位置
  stat_compare_means(label = "p.signif", method = "wilcox.test")+
  theme_bw() + #去除背景色
  theme(panel.grid =element_blank()) + #去除网格线
  theme(panel.border = element_blank()) + #去除外层边框
  theme(axis.line = element_line(colour = "black")) + #沿坐标轴显示直线
  xlab("Cluster") +
  theme(axis.title.x=element_text(size=30,face="bold"), 
        axis.title.y=element_text(size=30,face="bold"), 
        axis.text=element_text(size=15,face="bold"), 
        legend.text = element_text( size = 15, face = 'bold'))+
  guides(color=FALSE)


p +  ylim(0,60)+geom_dotplot(binaxis = "y", #沿y轴堆积，并沿着x轴分组
                             binwidth = 0.5, #最大组距
                             dotsize = 0.6, #点的大小
                             #如果点太多，两组叠在一起，就需要运行下面这行把它们分开
                             #stackgroups = T, binpositions="all",
                             aes(colour =Cluster),
                             stackdir = "center")+
  scale_color_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))
dev.off()

#### DEGs analysis

setwd("/Users/zengzhuo/Downloads/R/TCGA_STA")
metadata <- jsonlite::fromJSON("metadata.cart.2019-08-12.json")
View(metadata)
metadata_id <- metadata %>% 
  dplyr::select(c(file_name,associated_entities)) 
naid_df <- data.frame()
for (i in 1:373){
  naid_df[i,1] <- substr(metadata_id$file_name[i],1,nchar(metadata_id$file_name[i])-3)
  naid_df[i,2] <- metadata_id$associated_entities[i][[1]]$entity_submitter_id
}
#读入所有解压的文件 373个
setwd("/Users/zengzhuo/Downloads/R/TCGA_STA/00_data_read_in_one_file")
nameList <- list.files("data_unzip/")
length(nameList)
location <- which(naid_df==nameList[1],arr.ind = TRUE) ##which函数有一个已知value返回坐标的功能
TCGA_id <- as.character(naid_df[location[1],2]) ##通过坐标，获取TCGA_id
expr_df<- read.table(paste0("data_unzip/",nameList[1]),stringsAsFactors = F, header = F) #读入第一个文件，保存为data.frame
names(expr_df) <- c("gene_id",TCGA_id) #给刚才数据库命名
for (i in 2:length(nameList)){
  location <- which(naid_df==nameList[i],arr.ind = TRUE)
  TCGA_id <- as.character(naid_df[location[1],2])
  dfnew <- read.table(paste0("data_unzip/",nameList[i]),stringsAsFactors = F,header = F)
  names(dfnew) <- c("gene_id",TCGA_id)
  expr_df <- inner_join(expr_df,dfnew,by="gene_id")
}
tail(expr_df$gene_id,10)
expr_df <- expr_df[1:(length(expr_df$gene_id)-5),]
save(expr_df,file = "expr_df.Rdata")
expr_df_nopoint <- expr_df %>% 
  tidyr::separate(gene_id,into = c("gene_id","drop"),sep="\\.") %>% 
  dplyr::select(-drop)
save(expr_df_nopoint,file = "expr_df_nopoint.Rdata")

gtf1 <- rtracklayer::import('Homo_sapiens.GRCh38.97.chr.gtf')
gtf_df <- as.data.frame(gtf1)
save(gtf_df,file = "gtf_df.Rdata")
test <- gtf_df[1:5,]
View(test)
mRNA_exprSet <- gtf_df %>% 
  dplyr::filter(type=="gene",gene_biotype=="protein_coding") %>% #筛选gene,和编码指标
  dplyr::select(c(gene_name,gene_id,gene_biotype)) %>% 
  dplyr::inner_join(expr_df_nopoint,by ="gene_id") %>% 
  tidyr::unite(gene_id,gene_name,gene_id,gene_biotype,sep = " | ")
dim(mRNA_exprSet )
#19631   374 
save(mRNA_exprSet,file = "mRNA_exprSet.Rdata")
load("/Users/zengzhuo/Downloads/R/TCGA_STA/00_data_read_in_one_file/expr_df_nopoint.Rdata")
View(head(expr_df_nopoint))
write.csv(expr_df_nopoint,file = "Expression profile of TCGA cohort.csv",row.names = F)
View(mRNA_exprSet)
names(mRNA_exprSet)
metadata <- data.frame(names(mRNA_exprSet)[-1])
View(metadata)
View(final1)
names(metadata)<-"ID"
nchar(metadata$ID[1])
metadata$ID<-str_sub(metadata$ID,1,15)
metadata<-inner_join(metadata,final1)
View(metadata)
metadata<-select(metadata,ID,clust_result)
table(metadata$clust_result)
#CIMP-H CIMP-L CIMP-M 
#  52    162     92
metadata1<-metadata
metadata<-filter(metadata,clust_result!="CIMP-M")
View(metadata)
View(mRNA_exprSet)
mRNA_exprSet1<-mRNA_exprSet
length(names(mRNA_exprSet))
length(metadata$ID)
b<-vector()
for(i in 1:length(metadata$ID))
{
  b[i]<-which(str_sub(names(mRNA_exprSet),1,15)==metadata$ID[i])
}
View(b)
mRNA_exprSet<-mRNA_exprSet[,c(1,b)]
View(mRNA_exprSet)
View(metadata1)
metadata<-metadata1
table(metadata$clust_result)
metadata<-filter(metadata,clust_result!="CIMP-M")
View(metadata)
metadata$clust_result[metadata$clust_result=="CIMP-L"]<-"L"
names(metadata) <- c("TCGA_id","sample")
metadata$sample <- as.factor(metadata$sample)
metadata$sample<-relevel(metadata$sample,ref = "L")
metadata$sample
View(metadata)
mycounts <- mRNA_exprSet
keepGene=rowSums(edgeR::cpm(mycounts[-1])>0) >=2
table(keepGene)
dim(mycounts)
mycounts <-mycounts[keepGene,]
dim(mycounts)
#19514   215
library(DESeq)
dds <-DESeqDataSetFromMatrix(countData=mycounts, 
                             colData=metadata, 
                             design=~sample,
                             tidy=TRUE)
dds$sample
dds <- DESeq(dds)
class(dds)
metadata$sample
save(dds,file = "dds_DEseq.RData")
res <- results(dds, tidy=TRUE) #获取结果
res <- as_tibble(res)
require(dplyr)
res <- res %>%
  separate(row,c("symbol","ensemble","genetype"),sep = " \\| ") %>%
  dplyr::select(- c(ensemble,genetype)) %>%
  arrange(desc(abs(log2FoldChange))) %>% #排序。为了去重
  distinct(symbol,.keep_all = TRUE) %>%
  arrange(desc(log2FoldChange))#再次按照log2FoldChange从大到小排序
View(res)
save(res,file = "result.Rda")
load("./result.Rda")
View(res)

P <-  0.05
foldChange <- 2
diffSig = res[(res$padj < P & (res$log2FoldChange>foldChange | res$log2FoldChange<(-foldChange))),]
View(diffSig)

## LASSO analysis

#rm(list = ls()) 
load("./diffSig.Rdata")
View(diffSig)
write.csv(diffSig,file = "DEGs between CIMP-H and CIMP-L gruops.csv",row.names = F)
gene.diff<-diffSig$symbol
save(gene.diff,file = "/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/gene.diff.Rdata")
length(gene.diff)
mRNA_diff<-mRNA_exprSet_vst[gene.diff,]
dim(mRNA_diff)
View(mRNA_diff)
names(mRNA_diff)
require(stringr)
names(mRNA_diff)<-str_sub(names(mRNA_diff),1,15)
save(mRNA_diff,file = "mRNA_diff.Rdata")
load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/final_c2.Rdata")
View(final_c2)
cross1<-rownames(final_c2)%in%names(mRNA_diff)
sum(cross1)
final2<-final_c2[cross1,]
View(final2)
final2<-final2[,c(1,2)]
View(final2)
names(final2)
save(final2,file = "final2.Rdata")
e<-vector()
for(i in 1:length(rownames(final2)))
{
  e[i]<-which(names(mRNA_diff)==rownames(final2)[i])
}
View(e)
mRNA_diff2<-mRNA_diff[,e]
load("./final_d1.Rdata")
View(final_d1)
table(final_d1$OS)
require(survival)
Coxoutput=data.frame()
dim(final_d1)
View(head(final_d1))
for(i in colnames(final_d1[,4:ncol(final_d1)])){
  cox <- coxph(Surv(OS_time, OS) ~ final_d1[,i], data = final_d1)
  coxSummary = summary(cox)
  Coxoutput=rbind(Coxoutput,cbind(gene=i,HR=coxSummary$coefficients[,"exp(coef)"],
                                  z=coxSummary$coefficients[,"z"],
                                  pvalue=coxSummary$coefficients[,"Pr(>|z|)"],
                                  lower=coxSummary$conf.int[,3],
                                  upper=coxSummary$conf.int[,4]))
}
class(Coxoutput)
View(Coxoutput)
dim(Coxoutput)

for(i in c(2:6)){
  Coxoutput[,i] <- as.numeric(as.vector(Coxoutput[,i]))
}
View(Coxoutput)
Coxoutput <- arrange(Coxoutput,pvalue)  %>% #按照p值排序
  filter(pvalue < 0.05) 

TCGA_HR_UP <- Coxoutput$gene[Coxoutput$HR>1]
TCGA_HR_DOWN<-Coxoutput$gene[Coxoutput$HR<1]
View(TCGA_HR_UP)
class(TCGA_HR_UP)

library(glmnet)
library(survival)
mRNA_diff4<-as.matrix(mRNA_diff4)
row.names(final_d1)==row.names(mRNA_diff4)
cvfit = cv.glmnet(mRNA_diff4, Surv(mysurv$OS_time,mysurv$OS), 
                  nfold=1000,
                  family = "cox"
) 
plot(cvfit)
fit <- glmnet(mRNA_diff4, Surv(mysurv$OS_time,mysurv$OS), 
              family = "cox") 
plot(fit, label = TRUE)
coef.min = coef(cvfit, s = "lambda.min") 
coef.min
active.min = which(coef.min != 0)
View(active.min)
geneids <- colnames(mRNA_diff4)[active.min]
geneids

### develop a CIMP-related prognostic gene signature


load("/Users/zengzhuo/Downloads/R2/TCGA_STA/00_data_read_in_one_file/mRNA_exprSet.Rdata")
View(mRNA_exprSet)
mRNA_exprSet_all<-mRNA_exprSet
View(mRNA_exprSet_all)
names(mRNA_exprSet_all)[-1]<-str_sub(names(mRNA_exprSet_all)[-1],1,15)
metadata_all <- data.frame(names(mRNA_exprSet_all)[-1])
View(metadata_all)
names(metadata_all)<-"ID"
class(metadata_all$ID)
metadata_all$ID<-as.character(metadata_all$ID)
metadata_all<-inner_join(metadata_all,final1)
View(metadata_all)
dim(metadata_all)
metadata_all<-dplyr::select(metadata_all,ID,cluster_result)
table(metadata_all$cluster_result)
b<-vector()
for(i in 1:length(metadata_all$ID))
{
  b[i]<-which(names(mRNA_exprSet_all)==metadata_all$ID[i])
}
View(b)
mRNA_exprSet_all<-mRNA_exprSet_all[,c(1,b)]
View(mRNA_exprSet_all)
View(metadata_all)
names(metadata_all) <- c("TCGA_id","sample")
table(metadata_all$sample)
metadata_all$sample[metadata_all$sample=="CIMP-L"]<-"L"
metadata_all$sample[metadata_all$sample=="CIMP-H"]<-"H"
metadata_all$sample[metadata_all$sample=="CIMP-M"]<-"M"
table(metadata_all$sample)
metadata_all$sample <- factor(metadata_all$sample,levels = c("H","M","L"))
View(metadata_all)
mycounts <- mRNA_exprSet_all
keepGene=rowSums(edgeR::cpm(mycounts[-1])>0) >=2
table(keepGene)
#FALSE  TRUE 
#108 19523 
mycounts <-mycounts[keepGene,]
dim(mycounts)
#19513   307

dds <-DESeqDataSetFromMatrix(countData=mycounts, 
                             colData=metadata_all, 
                             design=~sample,
                             tidy=TRUE)

vsd <- vst(dds, blind = FALSE)
mRNA_exprSet_all_vst <- as.data.frame(assay(vsd))
View(mRNA_exprSet_all_vst)
mRNA_exprSet_all_vst$row<-row.names(mRNA_exprSet_all_vst)
mRNA_exprSet_all_vst <- mRNA_exprSet_all_vst %>%
  separate(row,c("symbol","ensemble","genetype"),sep = " \\| ") %>%
  dplyr::select(- c(ensemble,genetype)) 
mRNA_exprSet_all_vst <- mRNA_exprSet_all_vst %>%
  dplyr::select(symbol,everything())
mRNA_exprSet_all_vst <- mRNA_exprSet_all_vst %>%
  distinct(symbol,.keep_all = TRUE) 
View(mRNA_exprSet_all_vst)
row.names(mRNA_exprSet_all_vst)<-mRNA_exprSet_all_vst$symbol
mRNA_exprSet_all_vst<-mRNA_exprSet_all_vst[,-1]
View(mRNA_exprSet_all_vst)
save(mRNA_exprSet_all_vst,file = "mRNA_exprSet_all_vst.Rdata")

mRNA_g1<-mRNA_exprSet_all_vst[geneids,]
View(mRNA_g1)
mRNA_g2<-t(mRNA_g1)
View(mRNA_g2)
mRNA_g3<-scale(mRNA_g2,scale = T,center = T)

View(mRNA_g3)
mRNA_g3 <- as.data.frame(mRNA_g3)
mRNA_g3$ID <- row.names(mRNA_g3)
mRNA_g3<- inner_join(final1,mRNA_g3)
View(mRNA_g3)
row.names(mRNA_g3)<-mRNA_g3$ID
save(mRNA_g3,file = "mRNA_g3.Rdata")

fmla <- as.formula(paste("Surv(OS_time, OS) ~", paste0(geneids, collapse = "+")))
cox <- coxph(fmla, data = mRNA_g3)
coef <- cox$coefficients
View(coef)

##calculate the cutoff value

View(mRNA_g3)
names(mRNA_g3)
View(as.matrix(mRNA_g3[,c(23:28)]))
riskScore_test <- as.matrix(mRNA_g3[,c(23:28)]) %*% coef
View(riskScore_test)
riskScore_test <- as.data.frame(riskScore_test)
riskScore_test$ID <- row.names(riskScore_test)
riskScore_test$score <- riskScore_test$V1
riskScore_test <- riskScore_test[,-1]
View(riskScore_test)
View(mRNA_g3)

sta_risk_test <- inner_join(riskScore_test,mRNA_g3)
View(sta_risk_test)
row.names(sta_risk_test) <- sta_risk_test$ID
sta_risk_test <- sta_risk_test[,-1]
sta_risk_test$times <- sta_risk_test$OS_time/12
save(sta_risk_test,file = "sta_risk_test.Rdata")

require(survminer)
sur.cut <- surv_cutpoint(sta_risk_test, time = "OS_time", event = "OS",variables ="score")
summary(sur.cut)
#       cutpoint statistic
#score 0.2351643  5.288044


##distribution of risk score

View(sta_risk_test)
fp<- sta_risk_test$score
View(fp)
names(fp)<-row.names(sta_risk_test)
View(fp)
fp_dat<-data.frame(s=1:length(fp),v=as.numeric(sort(fp)))
View(fp_dat)
fp_dat$Risk<-ifelse(fp_dat$v>=res.cut,"high","low")
sur_dat<-data.frame(s=1:length(fp),
                    t=sta_risk_test[names(sort(fp)),'OS_time'],
                    e=sta_risk_test[names(sort(fp)),'OS']) 
sur_dat$Status<-as.factor(ifelse(sur_dat$e==0,'Alive','Death'))
View(sur_dat)

exp_dat<-t_survdiff_cox_exprset[names(sort(fp)),
                                which(colnames(t_survdiff_cox_exprset) %in% genenames)]

res.cut<-0.2351643

require(ggplot2)
plot.point<-ggplot(fp_dat,aes(x=s,y=v))+
  geom_point(aes(col=Risk),size=1)+
  geom_segment(aes(x = sum(fp_dat$Risk=="low"),
                   y = min(fp_dat$v), 
                   xend = sum(fp_dat$Risk=="low"), 
                   yend = res.cut),linetype="dashed")+
  geom_segment(aes(x=0,y=res.cut,
                   xend=sum(fp_dat$Risk=="low"),
                   yend=res.cut),linetype="dashed")+
  geom_text(aes(x=sum(fp_dat$Risk=="low")/2,
                y=res.cut+0.25,
                label=paste0("Cutoff: ",round(res.cut,3))),
            col ="black",size = 10,alpha=0.8)+
  theme(axis.title.x=element_blank())+
  scale_x_continuous(limits = c(0,NA),expand = c(0,0))+
  labs(y="Risk score",fill="Risk")+
  scale_colour_manual(values = c( "firebrick3","navy"))+
  theme_bw()##去除背景

plot.point<-plot.point+theme(panel.grid.major = element_blank(),panel.grid.minor = element_blank())
plot.point

plot.sur<-ggplot(sur_dat,aes(x=s,y=t))+
  geom_point(aes(col=Status),size=1.2)+
  geom_vline(aes(xintercept=sum(fp_dat$Risk=="low")),linetype="dashed")+
  scale_colour_manual(values = c( "navy","firebrick3"))+
  theme(axis.title.x=element_blank())+
  scale_x_continuous(limits = c(0,NA),expand = c(0,0))+
  scale_fill_discrete(labels=c("Alive", "Dead"))+
  labs(y="Survival time(months)")+
  theme_bw()##去除背景

plot.sur<-plot.sur+theme(panel.grid.major = element_blank(),panel.grid.minor = element_blank())
plot.sur

tcga_expr<-mRNA_exprSet_all_vst[c("CST6","SLC7A2","RAB3B","IGFBP1","VSTM2L","EVX2"),]
View(head(tcga_expr))
tcga_expr2<-t(tcga_expr)
View(head(tcga_expr2))
tcga_expr3<-scale(tcga_expr2,scale = T,center = T)
tcga_expr3 <- as.data.frame(tcga_expr3)
View(head(tcga_expr3))
tcga_expr3$ID <- row.names(tcga_expr3)
View(head(tcga_expr3))
class(tcga_expr3)
tcga_expr6<-tcga_expr3
tcga_expr7<-tcga_expr6[names(sort(fp)),]
View(tcga_expr7)
tmp<-t(tcga_expr7)
View(head(tmp))
tmp[tmp > 1] = 1
tmp[tmp < -1] = -1
View(tmp)
library(pheatmap)
plot.h<-pheatmap(tmp,show_colnames = F,cluster_cols = F,cluster_rows = F,cellheight=30,
                 color =colorRampPalette(c("navy", "white","firebrick3"))(200))
?pheatmap
View(plot.h)

grid(plot.point, plot.sur,plot.h$gtable,
     label_x=0,
     label_y=1,
     align = 'v',ncol = 1,axis="t")


## OS curve between high- and low-risk groups

View(sta_risk_test)
sta_risk_tcga<-sta_risk_test
View(sta_risk_tcga)
sta_risk_tcga$risk<-ifelse(sta_risk_tcga$score>0.2351643,"high","low")
table(sta_risk_tcga$risk)
#high  low 
#91  215 
fit <- survfit(Surv(OS_time, OS) ~ risk, data = sta_risk_tcga)
ggsurv <- ggsurvplot(
  fit,                     # survfit object with calculated statistics.
  data = sta_risk_tcga, 
  #palette= c("#E41A1C","#4DAF4A"),          # data used to fit survival curves.
  risk.table = TRUE,       # show risk table.
  pval = TRUE,             # show p-value of log-rank test.
  conf.int = F,
  xlim = c(0,60),    # present narrower X axis, but not affect
  # survival estimates.
  xlab = "Time in months",   # customize X axis label.
  ylab = "Overall Survival Rate",
  break.time.by = 6,     # break X axis in time intervals by 500.
  ggtheme = theme_classic2(), # customize plot and risk table with a theme.
  risk.table.y.text.col = T,# colour risk table text annotations.
  risk.table.height = 0.25, # the height of the risk table
  risk.table.y.text = F,# show bars instead of names in text annotations
  # in legend of risk table.
  ncensor.plot = F,
  conf.int.style = "step", # add the median survival pointer.
  legend.labs = 
    c("Highrisk","Lowrisk")# change legend labels.
)
ggsurv$plot <- ggsurv$plot + labs(title = "TCGA Gastric Cancer Cohort") +
  theme(axis.title.x = element_blank())+ theme(plot.title = element_text(hjust = 0.5),)+
  theme(legend.text = element_text(size = 12,face = 'bold'))

ggsurv$plot <- ggpar(
  ggsurv$plot,
  font.title    = c(16, "bold", "darkblue"),          
  font.y        = c(14, "bold.italic", "darkred"),
  font.x = c(14, "bold.italic", "blue") ,
  font.xtickslab = c(12, "plain", "darkgreen","bold"),
  font.ytickslab = c(10,"darkgreen","bold"),
  legend = "top",legend.title = "",
)

ggsurv$table <- ggpar(
  ggsurv$table,
  font.title    = c(13, "bold.italic", "#E41A1C"))+ 
  theme(axis.line = element_blank(),axis.ticks = element_blank(),
        axis.title = element_blank(),axis.text.x = element_blank())
ggsurv$table <- ggsurv$table + theme_cleantable()
ggsurv

## plot the ROC

library(survivalROC)
roc_1=survivalROC(Stime=sta_risk_test$times, 
                  status=sta_risk_test$OS, 
                  marker = sta_risk_test$score, 
                  predict.time =1, method="KM")

roc_2=survivalROC(Stime=sta_risk_test$times, 
                  status=sta_risk_test$OS, 
                  marker = sta_risk_test$score, 
                  predict.time =2, method="KM")

roc_5=survivalROC(Stime=sta_risk_test$times, 
                  status=sta_risk_test$OS, 
                  marker = sta_risk_test$score, 
                  predict.time =5, method="KM")

x <- unlist(roc_1$FP)  ##提取x值
y <- unlist(roc_1$TP)
plotdata_1 <- data.frame(x,y) 
plotdata_1$group <- "1"

x <- unlist(roc_3$FP)  ##提取x值
y <- unlist(roc_3$TP)
plotdata_3 <- data.frame(x,y) 
plotdata_3$group <- "3"

x <- unlist(roc_5$FP)  ##提取x值
y <- unlist(roc_5$TP)
plotdata_5 <- data.frame(x,y) 
plotdata_5$group <- "5"


plotdata <- rbind(plotdata_1,plotdata_3,plotdata_5)


g <- ggplot(plotdata) + 
  geom_path(aes(x = x, y = y,colour = group), size=1) + 
  labs(x="1 - Specificity", y = "Ture positive rate") +
  ggpubr::theme_classic2()+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  theme(axis.text=element_text(size=14,face="bold"),axis.title=element_text(size=16,face="bold"))+
  geom_abline(intercept=0, slope=1,linetype="dashed")+
  scale_colour_hue(name="my legend", labels=c("AUC at 1 year: 0.664",
                                              "AUC at 3 years: 0.704","AUC at 5 years: 0.667")+
  theme(legend.title=element_blank())+theme(legend.text = element_text(size = 14,face = "bold"))+
  theme(legend.position=c(.766,.25))
g

## Univariate and multivariate analysis 


load("./final9.Rdata")
View(final9)
univarcox<- function(x){
  formu<-as.formula(paste0('sur~',x))
  unicox<-coxph(formu,data=final9)
  unisum<-summary(unicox)#汇总数据
  HR<-round(unisum$coefficients[,2],3)# HR风险比
  Pvalue<-unisum$coefficients[,5]#p值
  CI95<-paste0(round(unisum$conf.int[,c(3,4)],3),collapse = '-') #95%置信区间
  univarcox<-data.frame('characteristics'=x,
                        'Hazard Ration'=HR,
                        'CI95'=CI95,
                        'pvalue'=ifelse(Pvalue < 0.001, "< 0.001", round(Pvalue,3)))
  return(univarcox)#返回数据框
}
require(survival)
sur<-Surv(time=final9$OS_time, event = final9$OS)
names(final9)
variable_names<-colnames(final9)[c(17,15)]

univar<-lapply(variable_names,univarcox)
univartable<-do.call(rbind,lapply(univar,data.frame))
univartable$`HR(95%CI)`<-paste0(univartable$Hazard.Ration,'(',univartable$CI95,')')
univartable1<-dplyr::select(univartable,characteristics,`HR(95%CI)`,pvalue,-CI95,-Hazard.Ration)
univartable1
(names<-univartable1$characteristics[univartable1$pvalue<0.05])

foresst_uni_os_risk<-read.csv("./risk cimp univariate.csv",stringsAsFactors=FALSE,header = T,sep = ",")
View(foresst_uni_os_risk)
colnames(foresst_uni_os_risk)

tabletext <- cbind(c("\nUnivariate analysis",NA, foresst_uni_os_risk$Univariate.analysis),
                   c("Hazard Ratio\n(95% CI)", NA, foresst_uni_os_risk$Hazard.Ratio..95.CI.),
                   c("P-value", NA, foresst_uni_os_risk$P.value))
#BiocManager::install("forestplot")
require(forestplot)
forestplot(labeltext=tabletext, #图中的文本
           mean=c(NA,1,foresst_uni_os_risk$mean),#HR
           lower=c(NA,1,foresst_uni_os_risk$lower), #95%置信区间下限
           upper=c(NA,1,foresst_uni_os_risk$upper),#95%置信区间上限
           #title="Hazard Ratio",
           graph.pos=3,#图在表中的列位置
           graphwidth = unit(.4,"npc"),#图在表中的宽度比例
           fn.ci_norm="fpDrawDiamondCI",#box类型选择钻石
           col=fpColors(box="steelblue", lines="black", zero = "black"),#box颜色
           lwd.ci=2,ci.vertices.height = 0.1,ci.vertices=TRUE,#置信区间用线宽、高、型
           zero=1,#zero线横坐标
           lwd.zero=2,#zero线宽
           #虚线(可多条)及其横坐标、颜色、线宽
           #xticks = c(round(min(data$mean),1), 1,round((max(data$mean)-min(data$mean)/2),1), round(max(data$mean),1)), #横坐标刻度
           lwd.xaxis=2, #X轴线宽
           xlab="Hazard Ratio",#X轴标题
           hrzl_lines=list("3" = gpar(lwd=2, col="black"),#第三行顶部加黑线，引号内数字标记行位置
                           #"4" = gpar(lwd=60,lineend="butt", columns=c(1:4), col="#99999922"),#加阴影，弱项不建议使用
"5" = gpar(lwd=2, col="black")),#最后一行底部加黑线,""中数字为nrow(data)+5
txt_gp=fpTxtGp(label=gpar(cex=1.25),#各种字体大小设置
               ticks=gpar(cex=1.25),
               xlab=gpar(cex = 1.25),
               title=gpar(cex = 1.25)),clip =c(0, 3),
#is.summary = c(T,rep(F,27)),#首行字体类型设置
lineheight = unit(2,"cm"),#固定行高
#align=c("l","c","c"),#每列文字的对齐方式，偶尔会用到
#cex=10, 
colgap = unit(1,"cm"),#列间隙  
mar=unit(rep(1.25, times = 4), "cm"),#图形页边距
new_page = F#是否新页
  )

(form<-as.formula(paste0('sur~',paste0(names,collapse = '+'))))
multicox<-coxph(formula = form,data = final9)
multisum<-summary(multicox)##汇总
muHR<-round(multisum$coefficients[,2],3)#
muPvalue<-multisum$coefficients[,5]#p值
muCIdown<-round(multisum$conf.int[,3],3)#下
muCIup<-round(multisum$conf.int[,4],3)#上
muCI<-paste0(muCIdown,'-',muCIup)##95%置信区间
multicox<-data.frame('characteristics'=names,
                     'muHazard Ration'=muHR,
                     'muCI95'=muCI,
                     'mupvalue'=ifelse(muPvalue < 0.001, "< 0.001", round(muPvalue,3)))
rownames(multicox)<-NULL
multicox$`HR(95%CI)`<-paste0(multicox$muHazard.Ration,'(',multicox$muCI95,')')
multicox<-dplyr::select(multicox,characteristics,`HR(95%CI)`,mupvalue,-muCI95,-muHazard.Ration)
View(multicox)
#合并单因素多因素表格
uni_multi<-dplyr::full_join(univartable1,multicox,by="characteristics")
uni_multi$characteristics <- as.character(uni_multi$characteristics)


forest_mul_os_risk<-read.csv("./risk cimp mulvariate.csv",stringsAsFactors=FALSE,header = T,sep = ",")
View(forest_mul_os_risk)
names(forest_mul_os_risk)
tabletext <- cbind(c("\nMultivariate analysis",NA, forest_mul_os_risk$Multivaritate.analysis),
                   c("Hazard Ratio\n(95% CI", NA, forest_mul_os_risk$Hazard.Ratio..95.CI.),
                   c("P-value", NA,forest_mul_os_risk$P.value))


forestplot(labeltext=tabletext, #图中的文本
           mean=c(NA,1,forest_mul_os_risk$mean),#HR
           lower=c(NA,1,forest_mul_os_risk$lower), #95%置信区间下限
           upper=c(NA,1,forest_mul_os_risk$upper),#95%置信区间上限
           #title="Hazard Ratio",
           graph.pos=3,#图在表中的列位置
           graphwidth = unit(.4,"npc"),#图在表中的宽度比例
           fn.ci_norm="fpDrawDiamondCI",#box类型选择钻石
           col=fpColors(box="steelblue", lines="black", zero = "black"),#box颜色
           lwd.ci=2,ci.vertices.height = 0.1,ci.vertices=TRUE,#置信区间用线宽、高、型
           zero=1,#zero线横坐标
           lwd.zero=2,#zero线宽
           #虚线(可多条)及其横坐标、颜色、线宽
           #xticks = c(round(min(data$mean),1), 1,round((max(data$mean)-min(data$mean)/2),1), round(max(data$mean),1)), #横坐标刻度根据具体情况设置
           lwd.xaxis=2, #X轴线宽
           xlab="Hazard Ratio",#X轴标题
           hrzl_lines=list("3" = gpar(lwd=2, col="black"),#第三行顶部加黑线，引号内数字标记行位置
                           #"4" = gpar(lwd=60,lineend="butt", columns=c(1:4), col="#99999922"),#加阴影，弱项不建议使用
                           "5" = gpar(lwd=2, col="black")),#最后一行底部加黑线,""中数字为nrow(data)+5
           txt_gp=fpTxtGp(label=gpar(cex=1.25),#各种字体大小设置
                          ticks=gpar(cex=1.25),
                          xlab=gpar(cex = 1.25),
                          title=gpar(cex = 1.25)),clip =c(0, 3),
           #is.summary = c(T,rep(F,27)),#首行字体类型设置
           lineheight = unit(2,"cm"),#固定行高
           #align=c("l","c","c"),#每列文字的对齐方式，偶尔会用到
           #cex=10,   
           colgap = unit(1,"cm"),#列间隙    
           mar=unit(rep(1.25, times = 4), "cm"),#图形页边距
           new_page = F#是否新页
)

##validation of gene signature

setwd("/Users/zengzhuo/Downloads/R/GSE_stomach/GSE13861")
library(GEOquery)
gset = getGEO('GSE13861', destdir=".",getGPL = F)
gset=gset[[1]]
pdata=pData(gset)
View(pdata)
exprSet_GSE13861=exprs(gset)
View(exprSet_GSE13861)
clinical_GSE13861<-read.csv("YUHS.csv",header = T,stringsAsFactors = F,sep =",",check.names = F)
View(clinical_GSE13861)
library(limma) 
exprSet_GSE13861_1<-normalizeBetweenArrays(exprSet_GSE13861)

annotation<-read.csv("GPL6884-11607.csv",header = T,stringsAsFactors = F,sep =",",check.names = F)
View(annotation)
annotation<-annotation[,c("ID","Symbol")]
class(annotation)
exprSet_GSE13861_1<-as.data.frame(exprSet_GSE13861_1)
exprSet_GSE13861_1$ID<-row.names(exprSet_GSE13861_1)
require(dplyr)
exprSet_GSE13861_1<-inner_join(exprSet_GSE13861_1,annotation)
View(exprSet_GSE13861_1)
exprSet_GSE13861_1<-exprSet_GSE13861_1%>%
  dplyr::select("ID","Symbol",everything())
exprSet_GSE13861_1<-exprSet_GSE13861_1[,-1]

exprSet_GSE13861_1<-exprSet_GSE13861_1%>%
  #求出平均数(这边的点号代表上一步产出的数据)
  mutate(rowMean =rowMeans(.[grep("GSM", names(.))])) %>% 
  #去除symbol中的NA
  filter(Symbol != "NA") %>% 
  #把表达量的平均值按从大到小排序
  arrange(desc(rowMean)) %>% 
  # symbol留下第一个
  distinct(Symbol,.keep_all = T) %>% 
  #反向选择去除rowMean这一列
  dplyr::select(-rowMean) 

row.names(exprSet_GSE13861_1)<-exprSet_GSE13861_1$Symbol
exprSet_GSE13861_1<-exprSet_GSE13861_1[,-1]
View(exprSet_GSE13861_1)

load("/Users/zengzhuo/Downloads/R2/GSE_stomach/GSE13861/exprSet_GSE13861_1.Rdata")
View(exprSet_GSE13861_1)
geneids<-c("CST6","SLC7A2","RAB3B","IGFBP1","VSTM2L","EVX2")
exprSet_GSE13861_bio<-exprSet_GSE13861_1[geneids,]
View(exprSet_GSE13861_bio)
exprSet_GSE13861_bio<-na.omit(exprSet_GSE13861_bio)
exprSet_GSE13861_bio1<-t(exprSet_GSE13861_bio)
exprSet_GSE13861_bio2<-scale(exprSet_GSE13861_bio1,scale = T,center = T)
View(exprSet_GSE13861_bio2)
a<-coef[-5]
a
View(as.matrix(exprSet_GSE13861_bio2))
riskScore_GSE13861<- as.matrix(exprSet_GSE13861_bio2) %*% a
View(riskScore_GSE13861)
riskScore_GSE13861 <- as.data.frame(riskScore_GSE13861)
riskScore_GSE13861$ID <- row.names(riskScore_GSE13861)
riskScore_GSE13861$score <- riskScore_GSE13861$V1
riskScore_GSE13861 <- riskScore_GSE13861[,-1]
View(riskScore_GSE13861)
load("/Users/zengzhuo/Downloads/R2/GSE_stomach/GSE13861/clinical_GSE13861_1.Rdata")
View(clinical_GSE13861_1)
riskScore_GSE13861<-inner_join(riskScore_GSE13861,clinical_GSE13861_1)
View(riskScore_GSE13861)
rownames(riskScore_GSE13861)<-riskScore_GSE13861$ID
riskScore_GSE13861$risk <- ifelse(riskScore_GSE13861$score>0.2351643,'high','low')
table(riskScore_GSE13861$risk)
#high  low 
#25   40

fit <- survfit(Surv(OS_time, OS) ~ risk, data = riskScore_GSE13861)
ggsurv <- ggsurvplot(
  fit,                     # survfit object with calculated statistics.
  data = riskScore_GSE13861, 
  #palette= c("#E41A1C","#4DAF4A"),          # data used to fit survival curves.
  risk.table = TRUE,       # show risk table.
  pval = TRUE,             # show p-value of log-rank test.
  conf.int = F,
  xlim = c(0,60),    # present narrower X axis, but not affect
  # survival estimates.
  xlab = "Time in months",   # customize X axis label.
  ylab = "Overall Survival Rate",
  break.time.by = 6,     # break X axis in time intervals by 500.
  ggtheme = theme_classic2(), # customize plot and risk table with a theme.
  risk.table.y.text.col = T,# colour risk table text annotations.
  risk.table.height = 0.25, # the height of the risk table
  risk.table.y.text = F,# show bars instead of names in text annotations
  # in legend of risk table.
  ncensor.plot = F,
  conf.int.style = "step", # add the median survival pointer.
  legend.labs = 
    c("Highrisk","Lowrisk")# change legend labels.
)
ggsurv$plot <- ggsurv$plot + labs(title = "GSE13861 Cohort") +
  theme(axis.title.x = element_blank())+ theme(plot.title = element_text(hjust = 0.5),)+
  theme(legend.text = element_text(size = 12,face = 'bold'))

ggsurv$plot <- ggpar(
  ggsurv$plot,
  font.title    = c(16, "bold", "darkblue"),          
  font.y        = c(14, "bold.italic", "darkred"),
  font.x = c(14, "bold.italic", "blue") ,
  font.xtickslab = c(12, "plain", "darkgreen","bold"),
  font.ytickslab = c(10,"darkgreen","bold"),
  legend = "top",legend.title = "",
)

ggsurv$table <- ggpar(
  ggsurv$table,
  font.title    = c(13, "bold.italic", "#E41A1C"))+ 
  theme(axis.line = element_blank(),axis.ticks = element_blank(),
        axis.title = element_blank(),axis.text.x = element_blank())
ggsurv$table <- ggsurv$table + theme_cleantable()
ggsurv

roc_1=survivalROC(Stime=riskScore_13861$times, 
                  status=riskScore_13861$OS, 
                  marker = riskScore_13861$score, 
                  predict.time =1, method="KM")
roc_1$AUC


roc_3=survivalROC(Stime=riskScore_13861$times, 
                  status=riskScore_13861$OS, 
                  marker = riskScore_13861$score, 
                  predict.time =3, method="KM")
roc_3$AUC


roc_5=survivalROC(Stime=riskScore_13861$times, 
                  status=riskScore_13861$OS, 
                  marker = riskScore_13861$score, 
                  predict.time =5, method="KM")
roc_5$AUC


x <- unlist(roc_1$FP)  ##提取x值
y <- unlist(roc_1$TP)
plotdata_1 <- data.frame(x,y) 
plotdata_1$group <- "1"

x <- unlist(roc_3$FP)  ##提取x值
y <- unlist(roc_3$TP)
plotdata_3 <- data.frame(x,y) 
plotdata_3$group <- "3"

x <- unlist(roc_5$FP)  ##提取x值
y <- unlist(roc_5$TP)
plotdata_5 <- data.frame(x,y) 
plotdata_5$group <- "5"


plotdata <- rbind(plotdata_1,plotdata_3,plotdata_5)

require(ggplot2)
g <- ggplot(plotdata) + 
  geom_path(aes(x = x, y = y,colour = group), size=1) + 
  labs(x="1 - Specificity", y = "Ture positive rate") +
  ggpubr::theme_classic2()+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  theme(axis.text=element_text(size=14,face="bold"),axis.title=element_text(size=16,face="bold"))+
  geom_abline(intercept=0, slope=1,linetype="dashed")+
  scale_colour_hue(name="my legend", labels=c("AUC at 1 year: 0.638",
                                              "AUC at 3 years: 0.777",
                                              "AUC at 5 years: 0.745"))+
  theme(legend.title=element_blank())+theme(legend.text = element_text(size = 14,face = "bold"))+
  theme(legend.position=c(.766,.25))
g


### comparison of signatures

compareROC_tcga_jcp1<-select(compareROC_tcga2,"MARCKS","MAPK14","CCNF","INCENP","CHAF1A")
View(compareROC_tcga_jcp1)
compareROC_tcga_jcp2<-scale(compareROC_tcga_jcp1,scale = T,center = T)
View(compareROC_tcga_jcp2)
class(compareROC_tcga_jcp2)
compareROC_tcga_jcp2<-as.data.frame(compareROC_tcga_jcp2)

coef_jcp<-c(0.672,0.48,0.568,-0.522,-0.798)
coef_jcp<-as.matrix(coef_jcp)
rownames(coef_jcp)<-c("MARCKS","MAPK14","CCNF","INCENP","CHAF1A")
View(coef_jcp)

riskScore_compare_13861_jcp<- as.matrix(compareROC_tcga_jcp2[,c(1:5)]) %*% coef_jcp
riskScore_compare_13861_jcp <- as.data.frame(riskScore_compare_13861_jcp)
View(riskScore_compare_13861_jcp)

riskScore_compare_13861_jcp$ID <-str_sub(row.names(compareROC_tcga_jcp2),1,15)
riskScore_compare_13861_jcp$score <- riskScore_compare_13861_jcp$V1
riskScore_compare_13861_jcp <- riskScore_compare_13861_jcp[,-1]
View(riskScore_compare_13861_jcp)


riskScore_compare_13861_jcp <- inner_join(riskScore_compare_13861_jcp,final2)
View(riskScore_compare_13861_jcp)
row.names(riskScore_compare_13861_jcp) <- riskScore_compare_13861_jcp$ID
View(riskScore_compare_13861_jcp)
riskScore_compare_13861_jcp$times <- riskScore_compare_13861_jcp$OS_time/12


roc_1=survivalROC(Stime=riskScore_compare_13861_jcp$times, 
                  status=riskScore_compare_13861_jcp$OS, 
                  marker = riskScore_compare_13861_jcp$score, 
                  predict.time =1, method="KM")

roc_3=survivalROC(Stime=riskScore_compare_13861_jcp$times, 
                  status=riskScore_compare_13861_jcp$OS, 
                  marker = riskScore_compare_13861_jcp$score, 
                  predict.time =3, method="KM")

roc_5=survivalROC(Stime=riskScore_compare_13861_jcp$times, 
                  status=riskScore_compare_13861_jcp$OS, 
                  marker = riskScore_compare_13861_jcp$score, 
                  predict.time =5, method="KM")


x <- unlist(roc_1$FP)  ##提取x值
y <- unlist(roc_1$TP)
plotdata_1 <- data.frame(x,y) 
plotdata_1$group <- "1"

x <- unlist(roc_3$FP)  ##提取x值
y <- unlist(roc_3$TP)
plotdata_3 <- data.frame(x,y) 
plotdata_3$group <- "3"

x <- unlist(roc_5$FP)  ##提取x值
y <- unlist(roc_5$TP)
plotdata_5 <- data.frame(x,y) 
plotdata_5$group <- "5"


plotdata <- rbind(plotdata_1,plotdata_3,plotdata_5)


g <- ggplot(plotdata) + 
  geom_path(aes(x = x, y = y,colour = group), size=1) + 
  labs(x="1 - Specificity", y = "Ture positive rate") +
  ggpubr::theme_classic2()+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  theme(axis.text=element_text(size=14,face="bold"),axis.title=element_text(size=16,face="bold"))+
  geom_abline(intercept=0, slope=1,linetype="dashed")+
  scale_colour_hue(name="my legend", labels=c("AUC at 1 year: 0.694",
                                              "AUC at 3 years: 0.653",
                                              "AUC at 5 years: 0.666"))+
  theme(legend.title=element_blank())+theme(legend.text = element_text(size = 14,face = "bold"))+
  theme(legend.position=c(.766,.25))
g


setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad")
load("./mRNA_exprSet_vst.Rdata")
View(mRNA_exprSet_vst)
compareROC_tcga1<-mRNA_exprSet_vst
compareROC_tcga2<-t(compareROC_tcga1)
class(compareROC_tcga2)
compareROC_tcga2<-as.data.frame(compareROC_tcga2)
compareROC_tcga3<-select(compareROC_tcga2,"NOX4","FJX1","HEYL","LOX","SERPINE2",
                         "COMP","RBMS1","LAMC1","MFAP2","ANXA5","NETO2",
                         "PDLIM3","GADD45B")
View(compareROC_tcga3)
compareROC_tcga4<-scale(compareROC_tcga3,scale = T,center = T)
View(compareROC_tcga4)
class(compareROC_tcga4)
compareROC_tcga4<-as.data.frame(compareROC_tcga4)


riskScore_compare_13861<- as.matrix(compareROC_tcga4[,c(1:13)]) %*% coef_peerj
riskScore_compare_13861 <- as.data.frame(riskScore_compare_13861)
View(riskScore_compare_13861)
require(stringr)
riskScore_compare_13861$ID <-str_sub(row.names(compareROC_tcga4),1,15)
riskScore_compare_13861$score <- riskScore_compare_13861$V1
riskScore_compare_13861 <- riskScore_compare_13861[,-1]
View(riskScore_compare_13861)

View(final2)
riskScore_compare_13861 <- inner_join(riskScore_compare_13861,final2)
View(riskScore_compare_13861)
row.names(riskScore_compare_13861) <- riskScore_compare_13861$ID
View(riskScore_compare_13861)
riskScore_compare_13861$times <- riskScore_compare_13861$OS_time/12
View(riskScore_compare_13861)

roc_1=survivalROC(Stime=riskScore_compare_13861$times, 
                  status=riskScore_compare_13861$OS, 
                  marker = riskScore_compare_13861$score, 
                  predict.time =1, method="KM")

roc_3=survivalROC(Stime=riskScore_compare_13861$times, 
                  status=riskScore_compare_13861$OS, 
                  marker = riskScore_compare_13861$score, 
                  predict.time =3, method="KM")

roc_5=survivalROC(Stime=riskScore_compare_13861$times, 
                  status=riskScore_compare_13861$OS, 
                  marker = riskScore_compare_13861$score, 
                  predict.time =5, method="KM")


x <- unlist(roc_1$FP)  ##提取x值
y <- unlist(roc_1$TP)
plotdata_1 <- data.frame(x,y) 
plotdata_1$group <- "1"

x <- unlist(roc_3$FP)  ##提取x值
y <- unlist(roc_3$TP)
plotdata_3 <- data.frame(x,y) 
plotdata_3$group <- "3"

x <- unlist(roc_5$FP)  ##提取x值
y <- unlist(roc_5$TP)
plotdata_5 <- data.frame(x,y) 
plotdata_5$group <- "5"


plotdata <- rbind(plotdata_1,plotdata_3,plotdata_5)


g <- ggplot(plotdata) + 
  geom_path(aes(x = x, y = y,colour = group), size=1) + 
  labs(x="1 - Specificity", y = "Ture positive rate") +
  ggpubr::theme_classic2()+
  scale_x_continuous(expand = c(0, 0)) + scale_y_continuous(expand = c(0, 0))+
  theme(axis.text=element_text(size=14,face="bold"),axis.title=element_text(size=16,face="bold"))+
  geom_abline(intercept=0, slope=1,linetype="dashed")+
  scale_colour_hue(name="my legend", labels=c("AUC at 1 year: 0.627",
                                              "AUC at 3 years: 0.629",
                                              "AUC at 5 years: 0.673"))+
  theme(legend.title=element_blank())+theme(legend.text = element_text(size = 14,face = "bold"))+
  theme(legend.position=c(.766,.25))
g


###plot nomogram

setwd("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad")
library(survival)
BiocManager::install("rms")
library(rms)

load("./sta_risk_test.Rdata")
View(sta_risk_test)
names(sta_risk_test)
class(sta_risk_test)
dd<-datadist(sta_risk_test)
options(datadist="dd")

h<-cph(Surv(OS_time,OS)~score,data = sta_risk_test,
       x = T,y = T,surv = T)

survival<-Survival(h)
survival1<-function(x)survival(12,x)
survival2<-function(x)survival(36,x)
survival3<-function(x)survival(60,x)
nom<-nomogram(h,fun = list(survival1,survival2,survival3),
              fun.at = c(0.05,seq(0.1,0.9,by = 0.1),0.95),
              funlabel = c("1 year survival","3 years survival","5 years survival"))
plot(nom)

h<-cph(Surv(OS_time,OS)~score,data = sta_risk_test,
       x = T,y = T,surv = T,time.inc = 36)
cal<-calibrate(h,u = 36,cmethod = "KM",m=90)
plot(cal,xlim=c(0,1),ylim=c(0,1),
     errbar.col="black",col="firebrick1")
abline(a=0,b=1,lty=3,lwd=1,col="blue")

##Methylation pearson analysis

getwd()
load("./Tumor_bVals.Rdata")
View(head(Tumor_bVals))
load("/Users/zengzhuo/Downloads/CIMP in gastric cancer/stad/Illumina450k_annotation.RData")
View(ann450k2)
cst6_sites<-filter(ann450k2,gene_id=="CST6")
View(cst6_sites)
cst6_sites<-cst6_sites[1:5,c(4,35)]
View(cst6_sites)
cst6_sites$Name
#"cg04327181" "cg05057976" "cg07270851" "cg10560147" "cg14347177"
cst6_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg04327181",]
View(cst6_bval)
cst6_bval2<-t(cst6_bval)
View(cst6_bval2)
class(cst6_bval2)
cst6_bval2<-as.data.frame(cst6_bval2)
cst6_bval2$ID<-row.names(cst6_bval2)
View(cst6_bval2)
load("./mRNA_diff.Rdata")
View(head(mRNA_diff))
cst6_exr<-mRNA_diff[row.names(mRNA_diff)=="CST6",]
View(cst6_exr)
cst6_exr2<-t(cst6_exr)
cst6_exr2<-as.data.frame(cst6_exr2)
cst6_exr2$ID<-row.names(cst6_exr2)
cst6_pearson<-inner_join(cst6_exr2,cst6_bval2)
View(cst6_pearson)
class(cst6_pearson)
names(cst6_pearson)
which(cst6_pearson$CST6>12)
cst6_pearson<-cst6_pearson[-47,]
pdf("correlation with CST6.pdf",6,5)
ggplot(cst6_pearson, aes(x=cg04327181, y=CST6)) + 
  geom_point()+
  #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+
  theme_bw()+
  stat_smooth(method=lm,se=T)+
  labs(title=paste0("R = ",round(cor.test(cst6_pearson$cg04327181,cst6_pearson$CST6)$estimate,3),", P = ",
                    round(cor.test(cst6_pearson$cg04327181,cst6_pearson$CST6)$p.value,3)),
       x="Methylation of CST6 promoter",y="CST6 expression")+
  theme(
    plot.title = element_text(hjust = 0.5,size = 15),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_text(size = 20),
    axis.title.x=element_text(size = 20),
    axis.line.x = element_line(size = 0.5, colour = "black"),
    axis.title.y=element_text(size = 20),
    axis.line.y = element_line(size = 0.5, colour = "black"),
    
  )
dev.off()

##SLC7A2

View(ann450k2)
SLC7A2_sites<-filter(ann450k2,gene_id=="SLC7A2")
View(SLC7A2_sites)
SLC7A2_sites<-SLC7A2_sites[1:2,c(4,35)]
View(SLC7A2_sites)
SLC7A2_sites$Name
# "cg03340398" "cg09020665"
SLC7A2_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg03340398",]
View(SLC7A2_bval)
SLC7A2_bval2<-t(SLC7A2_bval)
View(SLC7A2_bval2)
class(SLC7A2_bval2)
SLC7A2_bval2<-as.data.frame(SLC7A2_bval2)
SLC7A2_bval2$ID<-row.names(SLC7A2_bval2)
View(SLC7A2_bval2)

SLC7A2_exr<-mRNA_diff[row.names(mRNA_diff)=="SLC7A2",]
View(SLC7A2_exr)
SLC7A2_exr2<-t(SLC7A2_exr)
SLC7A2_exr2<-as.data.frame(SLC7A2_exr2)
SLC7A2_exr2$ID<-row.names(SLC7A2_exr2)
SLC7A2_pearson<-inner_join(SLC7A2_exr2,SLC7A2_bval2)
View(SLC7A2_pearson)
class(SLC7A2_pearson)
names(SLC7A2_pearson)
which(SLC7A2_pearson$SLC7A2>15)
SLC7A2_pearson<-SLC7A2_pearson[c(-164,-210),]
pdf("correlation with SLC7A2.pdf",6,5)
ggplot(SLC7A2_pearson, aes(x=cg03340398, y=SLC7A2)) + 
  geom_point()+
  #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+
  theme_bw()+
  stat_smooth(method=lm,se=T)+
  labs(title=paste0("R = ",round(cor.test(SLC7A2_pearson$cg03340398,SLC7A2_pearson$SLC7A2)$estimate,3),", P = ",
                    round(cor.test(SLC7A2_pearson$cg03340398,SLC7A2_pearson$SLC7A2)$p.value,3)),
       x="Methylation of SLC7A2 promoter",y="SLC7A2 expression")+
  theme(
    plot.title = element_text(hjust = 0.5,size = 15),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_text(size = 20),
    axis.title.x=element_text(size = 20),
    axis.line.x = element_line(size = 0.5, colour = "black"),
    axis.title.y=element_text(size = 20),
    axis.line.y = element_line(size = 0.5, colour = "black"),
    
  )
dev.off()

##RAB3B
RAB3B_sites<-filter(ann450k2,gene_id=="RAB3B")
View(RAB3B_sites)
RAB3B_sites<-RAB3B_sites[1:6,c(4,35)]
View(RAB3B_sites)
RAB3B_sites$Name
# "cg01417615" "cg02395396" "cg05929645" "cg07808546" "cg18194821"
#"cg22429822"
RAB3B_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg01417615",]
RAB3B_bval2<-t(RAB3B_bval)
RAB3B_bval2<-as.data.frame(RAB3B_bval2)
RAB3B_bval2$ID<-row.names(RAB3B_bval2)
RAB3B_exr<-mRNA_exprSet_all_vst[row.names(mRNA_exprSet_all_vst)=="RAB3B",]
RAB3B_exr2<-t(RAB3B_exr)
RAB3B_exr2<-as.data.frame(RAB3B_exr2)
RAB3B_exr2$ID<-row.names(RAB3B_exr2)
RAB3B_pearson<-inner_join(RAB3B_exr2,RAB3B_bval2)


g<-ggplot(RAB3B_pearson, aes(x=cg01417615, y=RAB3B)) + 
  geom_point()+
  #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+
  theme_bw()+
  stat_smooth(method=lm,se=T)+
  labs(title=paste0("R = ",round(cor.test(RAB3B_pearson$cg01417615,RAB3B_pearson$RAB3B)$estimate,3),", P = ",
                    round(cor.test(RAB3B_pearson$cg01417615,RAB3B_pearson$RAB3B)$p.value,3)),
       x="Methylation of RAB3B promoter",y="RAB3B expression")+
  theme(
    plot.title = element_text(hjust = 0.5,size = 15),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_text(size = 20),
    axis.title.x=element_text(size = 20),
    axis.line.x = element_line(size = 0.5, colour = "black"),
    axis.title.y=element_text(size = 20),
    axis.line.y = element_line(size = 0.5, colour = "black"),
    
  )
g
##IGFBP1
IGFBP1_sites<-filter(ann450k2,gene_id=="IGFBP1")
View(IGFBP1_sites)
IGFBP1_sites<-IGFBP1_sites[1:2,c(4,35)]
View(IGFBP1_sites)
IGFBP1_sites$Name
# "cg01310829" "cg05823605"
IGFBP1_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg01310829",]
IGFBP1_bval2<-t(IGFBP1_bval)
IGFBP1_bval2<-as.data.frame(IGFBP1_bval2)
IGFBP1_bval2$ID<-row.names(IGFBP1_bval2)
IGFBP1_exr<-mRNA_exprSet_all_vst[row.names(mRNA_exprSet_all_vst)=="IGFBP1",]
IGFBP1_exr2<-t(IGFBP1_exr)
IGFBP1_exr2<-as.data.frame(IGFBP1_exr2)
IGFBP1_exr2$ID<-row.names(IGFBP1_exr2)
IGFBP1_pearson<-inner_join(IGFBP1_exr2,IGFBP1_bval2)
which(IGFBP1_pearson$IGFBP1>12)
IGFBP1_pearson<-IGFBP1_pearson[c(-55,-88,-218),]

g<-ggplot(IGFBP1_pearson, aes(x=cg01310829, y=IGFBP1)) + 
  geom_point()+
  #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+
  theme_bw()+
  stat_smooth(method=lm,se=T)+
  labs(title=paste0("R = ",round(cor.test(IGFBP1_pearson$cg01310829,IGFBP1_pearson$IGFBP1)$estimate,3),", P = ",
                    round(cor.test(IGFBP1_pearson$cg01310829,IGFBP1_pearson$IGFBP1)$p.value,3)),
       x="Methylation of IGFBP1 promoter",y="IGFBP1 expression")+
  theme(
    plot.title = element_text(hjust = 0.5,size = 15),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_text(size = 20),
    axis.title.x=element_text(size = 20),
    axis.line.x = element_line(size = 0.5, colour = "black"),
    axis.title.y=element_text(size = 20),
    axis.line.y = element_line(size = 0.5, colour = "black"),
    
  )
g


##VSTM2L
VSTM2L_sites<-filter(ann450k2,gene_id=="VSTM2L")
View(VSTM2L_sites)
VSTM2L_sites<-VSTM2L_sites[2:5,c(4,35)]
VSTM2L_sites$Name
# "cg09337248" "cg09565639" "cg12163925" "cg17071589"
VSTM2L_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg09565639",]
VSTM2L_bval2<-t(VSTM2L_bval)
VSTM2L_bval2<-as.data.frame(VSTM2L_bval2)
VSTM2L_bval2$ID<-row.names(VSTM2L_bval2)
VSTM2L_exr<-mRNA_exprSet_all_vst[row.names(mRNA_exprSet_all_vst)=="VSTM2L",]
VSTM2L_exr2<-t(VSTM2L_exr)
VSTM2L_exr2<-as.data.frame(VSTM2L_exr2)
VSTM2L_exr2$ID<-row.names(VSTM2L_exr2)
VSTM2L_pearson<-inner_join(VSTM2L_exr2,VSTM2L_bval2)
which(VSTM2L_pearson$VSTM2L>13.5)
VSTM2L_pearson<-VSTM2L_pearson[c(-36,-283),]

g<-ggplot(VSTM2L_pearson, aes(x=cg09565639, y=VSTM2L)) + 
  geom_point()+
  #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+
  theme_bw()+
  stat_smooth(method=lm,se=T)+
  labs(title=paste0("R = ",round(cor.test(VSTM2L_pearson$cg09565639,VSTM2L_pearson$VSTM2L)$estimate,3),", P = ",
                    round(cor.test(VSTM2L_pearson$cg09565639,VSTM2L_pearson$VSTM2L)$p.value,3)),
       x="Methylation of VSTM2L promoter",y="VSTM2L expression")+
  theme(
    plot.title = element_text(hjust = 0.5,size = 15),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_text(size = 20),
    axis.title.x=element_text(size = 20),
    axis.line.x = element_line(size = 0.5, colour = "black"),
    axis.title.y=element_text(size = 20),
    axis.line.y = element_line(size = 0.5, colour = "black"),
    
  )
g

##EVX2
EVX2_sites<-filter(ann450k2,gene_id=="EVX2")
View(EVX2_sites)
EVX2_sites<-EVX2_sites[3:4,c(4,35)]
EVX2_sites$Name
# "cg07536910" "cg07939836"
EVX2_bval<-Tumor_bVals[row.names(Tumor_bVals)=="cg07939836",]
EVX2_bval2<-t(EVX2_bval)
EVX2_bval2<-as.data.frame(EVX2_bval2)
EVX2_bval2$ID<-row.names(EVX2_bval2)
EVX2_exr<-mRNA_exprSet_all_vst[row.names(mRNA_exprSet_all_vst)=="EVX2",]
EVX2_exr2<-t(EVX2_exr)
EVX2_exr2<-as.data.frame(EVX2_exr2)
EVX2_exr2$ID<-row.names(EVX2_exr2)
EVX2_pearson<-inner_join(EVX2_exr2,EVX2_bval2)
which(EVX2_pearson$EVX2>7)
EVX2_pearson<-EVX2_pearson[c(-31,-271),]

g<-ggplot(EVX2_pearson, aes(x=cg07939836, y=EVX2)) + 
  geom_point()+
  #scale_color_manual(values = c("#E41A1C", "#FF7F00","#2B8CBE"))+
  theme_bw()+
  stat_smooth(method=lm,se=T)+
  labs(title=paste0("R = ",round(cor.test(EVX2_pearson$cg07939836,EVX2_pearson$EVX2)$estimate,3),", P = ",
                    round(cor.test(EVX2_pearson$cg07939836,EVX2_pearson$EVX2)$p.value,3)),
       x="Methylation of EVX2 promoter",y="EVX2 expression")+
  theme(
    plot.title = element_text(hjust = 0.5,size = 15),
    axis.text.y = element_text(size = 20),
    axis.text.x = element_text(size = 20),
    axis.title.x=element_text(size = 20),
    axis.line.x = element_line(size = 0.5, colour = "black"),
    axis.title.y=element_text(size = 20),
    axis.line.y = element_line(size = 0.5, colour = "black"),
    
  )
g


##gene expression in CIMP-related groups

##SLC7A2
load("./clust_result.Rdata")
load("./mRNA_exprSet_all_vst.Rdata")
View(clust_result)
View(head(mRNA_exprSet_all_vst))
SLC7A2_CIMP<-t(mRNA_exprSet_all_vst)
View(head(SLC7A2_CIMP))
dim(SLC7A2_CIMP)
SLC7A2_CIMP<-as.data.frame(SLC7A2_CIMP)
require(dplyr)
SLC7A2_CIMP2<-dplyr::select(SLC7A2_CIMP,SLC7A2)
SLC7A2_CIMP2$ID<-row.names(SLC7A2_CIMP)
View(SLC7A2_CIMP2)
SLC7A2_CIMP3<-inner_join(SLC7A2_CIMP2,clust_result)
View(SLC7A2_CIMP3)
which(SLC7A2_CIMP3$SLC7A2>15)
SLC7A2_CIMP3<-SLC7A2_CIMP3[c(-240,-302),]
names(SLC7A2_CIMP3)
dim(SLC7A2_CIMP3)
SLC7A2_CIMP3$cluster_result<-factor(SLC7A2_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H"))

###箱线图
require(ggpubr)
p<-ggplot(SLC7A2_CIMP3,aes(x = cluster_result, y = SLC7A2,fill=cluster_result))+
  geom_boxplot(outlier.color = NA)+
  labs(x="Cluster", y = "SLC7A2 expression")+
  theme_classic()
my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") )
p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+
  stat_compare_means(method = "kruskal.test") + #label的位置
  stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(13.7,13,14.3))+
  ylim(6.5,14.5)

##RAB3B
View(clust_result)
View(head(mRNA_exprSet_all_vst))
RAB3B_CIMP<-t(mRNA_exprSet_all_vst)
dim(RAB3B_CIMP)
RAB3B_CIMP<-as.data.frame(RAB3B_CIMP)
RAB3B_CIMP2<-dplyr::select(RAB3B_CIMP,RAB3B)
RAB3B_CIMP2$ID<-row.names(RAB3B_CIMP2)
RAB3B_CIMP3<-inner_join(RAB3B_CIMP2,clust_result)
#which(SLC7A2_CIMP3$SLC7A2>15)
#SLC7A2_CIMP3<-SLC7A2_CIMP3[c(-240,-302),]
dim(RAB3B_CIMP3)
RAB3B_CIMP3$cluster_result<-factor(RAB3B_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H"))

###箱线图
require(ggpubr)
p<-ggplot(RAB3B_CIMP3,aes(x = cluster_result, y = RAB3B,fill=cluster_result))+
  geom_boxplot(outlier.color = NA)+
  labs(x="Cluster", y = "RAB3B expression")+
  theme_classic()
my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") )
p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+
  stat_compare_means(method = "kruskal.test") + #label的位置
  stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(13.5,13,14))+
  ylim(6.5,14.5)

##IGFBP1
View(clust_result)
View(head(mRNA_exprSet_all_vst))
IGFBP1_CIMP<-t(mRNA_exprSet_all_vst)
dim(IGFBP1_CIMP)
IGFBP1_CIMP<-as.data.frame(IGFBP1_CIMP)
IGFBP1_CIMP2<-dplyr::select(IGFBP1_CIMP,IGFBP1)
IGFBP1_CIMP2$ID<-row.names(IGFBP1_CIMP2)
IGFBP1_CIMP3<-inner_join(IGFBP1_CIMP2,clust_result)
which(IGFBP1_CIMP3$IGFBP1>12)
IGFBP1_CIMP3<-IGFBP1_CIMP3[c(-55,-88,-218),]
dim(IGFBP1_CIMP3)
IGFBP1_CIMP3$cluster_result<-factor(IGFBP1_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H"))

###箱线图
require(ggpubr)
p<-ggplot(IGFBP1_CIMP3,aes(x = cluster_result, y = IGFBP1,fill=cluster_result))+
  geom_boxplot(outlier.color = NA)+
  labs(x="Cluster", y = "IGFBP1 expression")+
  theme_classic()
my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") )
p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+
  stat_compare_means(method = "kruskal.test") + #label的位置
  stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(10.3,10.5,10.8))+
  ylim(5.5,11)

#VSTM2L
View(clust_result)
View(head(mRNA_exprSet_all_vst))
VSTM2L_CIMP<-t(mRNA_exprSet_all_vst)
dim(VSTM2L_CIMP)
VSTM2L_CIMP<-as.data.frame(VSTM2L_CIMP)
VSTM2L_CIMP2<-dplyr::select(VSTM2L_CIMP,VSTM2L)
VSTM2L_CIMP2$ID<-row.names(VSTM2L_CIMP2)
VSTM2L_CIMP3<-inner_join(VSTM2L_CIMP2,clust_result)
which(VSTM2L_CIMP3$VSTM2L>13.5)
VSTM2L_CIMP3<-VSTM2L_CIMP3[c(-36,-283),]
dim(VSTM2L_CIMP3)
VSTM2L_CIMP3$cluster_result<-factor(VSTM2L_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H"))

###箱线图
require(ggpubr)
p<-ggplot(VSTM2L_CIMP3,aes(x = cluster_result, y = VSTM2L,fill=cluster_result))+
  geom_boxplot(outlier.color = NA)+
  labs(x="Cluster", y = "VSTM2L expression")+
  theme_classic()
my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") )
p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+
  stat_compare_means(method = "kruskal.test") + #label的位置
  stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(12.5,12,13))+
  ylim(5.5,13)

#EVX2
View(clust_result)
View(head(mRNA_exprSet_all_vst))
EVX2_CIMP<-t(mRNA_exprSet_all_vst)
dim(EVX2_CIMP)
EVX2_CIMP<-as.data.frame(EVX2_CIMP)
EVX2_CIMP2<-dplyr::select(EVX2_CIMP,EVX2)
EVX2_CIMP2$ID<-row.names(EVX2_CIMP2)
EVX2_CIMP3<-inner_join(EVX2_CIMP2,clust_result)
which(EVX2_CIMP3$EVX2>7)
EVX2_CIMP3<-EVX2_CIMP3[c(-20,-32,-243,-273),]
dim(EVX2_CIMP3)
EVX2_CIMP3$cluster_result<-factor(EVX2_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H"))

###箱线图
require(ggpubr)
p<-ggplot(EVX2_CIMP3,aes(x = cluster_result, y = EVX2,fill=cluster_result))+
  geom_boxplot(outlier.color = NA)+
  labs(x="Cluster", y = "EVX2 expression")+
  theme_classic()
my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") )
p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+
  stat_compare_means(method = "kruskal.test") + #label的位置
  stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(6.8,7,7.2))+
  ylim(5.5,7.5)

#CST6
View(clust_result)
View(head(mRNA_exprSet_all_vst))
CST6_CIMP<-t(mRNA_exprSet_all_vst)
dim(CST6_CIMP)
CST6_CIMP<-as.data.frame(CST6_CIMP)
CST6_CIMP2<-dplyr::select(CST6_CIMP,CST6)
CST6_CIMP2$ID<-row.names(CST6_CIMP2)
CST6_CIMP3<-inner_join(CST6_CIMP2,clust_result)
which(CST6_CIMP3$CST6>12)
CST6_CIMP3<-CST6_CIMP3[c(-69,-127,-195,-285),]
dim(CST6_CIMP3)
CST6_CIMP3$cluster_result<-factor(CST6_CIMP3$cluster_result,levels =c("CIMP-L","CIMP-M","CIMP-H"))

###箱线图
require(ggpubr)
p<-ggplot(CST6_CIMP3,aes(x = cluster_result, y = CST6,fill=cluster_result))+
  geom_boxplot(outlier.color = NA)+
  labs(x="Cluster", y = "CST6 expression")+
  theme_classic()
my_comparisons <- list( c("CIMP-L", "CIMP-M"), c("CIMP-M", "CIMP-H"), c("CIMP-L", "CIMP-H") )
p+scale_fill_manual(values = c("#4DAF4A", "#377EB8","#E41A1C"))+
  stat_compare_means(method = "kruskal.test") + #label的位置
  stat_compare_means(comparisons = my_comparisons,label = "p.signif",label.y =c(8.8,9.2,9.5))+
  ylim(5.5,10)

##GO and KEGG analysis in riskscore related groups

load("final2.Rdata")
View(final2)
metadata <- data.frame(row.names(final2))
View(metadata)
names(metadata)<-"ID"
class(metadata$ID)
class(tcga_score$ID)
metadata<-inner_join(metadata,tcga_score)
View(metadata)
class(metadata)
metadata<-metadata[,-2]

length(names(mRNA_exprSet))
length(metadata$ID)
b<-vector()
for(i in 1:length(metadata$ID))
{
  b[i]<-which(str_sub(names(mRNA_exprSet),1,15)==metadata$ID[i])
}
View(b)
View(head(mRNA_exprSet))
mRNA_exprSet<-mRNA_exprSet[,c(1,b)]
View(head(mRNA_exprSet))
View(metadata)
names(metadata) <- c("TCGA_id","sample")
metadata$sample <- factor(metadata$sample,levels = c("low","high"))

mycounts <- mRNA_exprSet
keepGene=rowSums(edgeR::cpm(mycounts[-1])>0) >=2
table(keepGene)
dim(mycounts)
mycounts <-mycounts[keepGene,]

library(DESeq2)
dds <-DESeqDataSetFromMatrix(countData=mycounts, 
                             colData=metadata, 
                             design=~sample,
                             tidy=TRUE)
dds$sample
dds <- DESeq(dds)
dds_riskhl<-dds
metadata$sample

save(dds_riskhl,file = "dds_riskhl_DEseq.RData")

load("./res_riskhl_result.Rdata")
View(res_riskhl)
res<-res_riskhl
P <-  0.05
foldChange <- 2
diffSig = res[(res$padj < P & (res$log2FoldChange>foldChange | res$log2FoldChange<(-foldChange))),]
View(diffSig)

genelist3<-res$log2FoldChange
names(genelist3) = res$symbol

library(GOplot)
library(clusterProfiler)
gene<-names(genelist3)[abs(genelist3)>2]
library(org.Hs.eg.db)
gene <- bitr(gene, fromType = "SYMBOL", 
             toType = c("ENTREZID"), 
             OrgDb = org.Hs.eg.db)
library(org.Hs.eg.db)
ego<-enrichGO(gene = gene,
              OrgDb = org.Hs.eg.db,
              ont = "BP",
              pAdjustMethod = "BH",
              pvalueCutoff = 0.01,
              qvalueCutoff = 0.01,
              readable = TRUE)
cnetplot(ego, categorySize="pvalue")
cnetplot(ego, circular = TRUE, colorEdge = TRUE)

kegg<-enrichKEGG(gene = gene,
                 organism ="human",
                 pvalueCutoff = 0.01,
                 qvalueCutoff = 0.01,
                 minGSSize = 1,
                 #readable = TRUE ,
                 use_internal_data =FALSE)
cnetplot(kegg, categorySize="pvalue",foldChange = genelist3)