rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./00_rawdata")){
  dir.create("./00_rawdata")
}
setwd("./00_rawdata")
library(dplyr) 
library(tidyr)
library(tinyarray)
library(tidyverse)
library(GEOquery)
library(Biobase)
library(AnnoProbe) #idmap
library(clusterProfiler)
library(enrichplot)

# GSE20685
setwd("/data/nas3///29_YQ953-11/00_rawdata/")
if (!dir.exists("GSE20685"))
{dir.create("GSE20685")}
setwd("GSE20685")
#
library(Biobase)
library(GEOquery) # 
# GSE20685
gset<-getGEO("GSE20685",
             destdir = '.',
             GSEMatrix = T,
             getGPL = F)
gset$GSE20685_series_matrix.txt.gz@annotation
expr<-as.data.frame(exprs(gset[[1]]))
gpl<-getGEO("GPL570",destdir = '../../../../pipeline/GPL/')
a=gset[[1]]
#gpl<-Table(gpl)
gpl <- idmap(gpl = 'GPL570')
colnames(gpl)
library(clusterProfiler)
probe2symobl<-gpl
# probe2symobl<-gpl %>%
#   dplyr::select('ID','Symbol')%>%
#   filter('Symbol'!='')%>%
#   separate('Symbol',c('symbol','drop'),sep = '///')%>%
#   dplyr::select(-drop)
probe2symobl=probe2symobl[probe2symobl$symbol!='',]
colnames(probe2symobl)<-c('ID','symbol')
dat.va<-expr
dat.va$ID<-rownames(dat.va)
dat.va$ID<-as.character(dat.va$ID)
probe2symobl$ID<-as.character(probe2symobl$ID)
dat.va<-dat.va %>%
  inner_join(probe2symobl,by='ID')%>%
  dplyr::select(-ID)%>%     ## 
  dplyr::select(symbol,everything())%>%     ## 
  mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>%    ## 
  arrange(desc(rowMean))%>%       ## 
  distinct(symbol,.keep_all = T)%>%      ## symbol
  dplyr::select(-rowMean)%>%     ## rowMean
  tibble::column_to_rownames(colnames(.)[1])   ## 

## 
pd <- pData(a)
colnames(pd)
# table(pd$source_name_ch1)
# pd <- subset(pd,source_name_ch1 == 'Breast tissue, cancer')
survival <- data.frame(sample=pd$geo_accession,OS=pd$characteristics_ch1.3,OS.time=pd$characteristics_ch1.4)


table(survival$OS)
survival$OS <- ifelse(survival$OS=='event_death: 0',0,1)
table(survival$OS.time)
survival$OS.time <- gsub('follow_up_duration (years): ','',survival$OS.time,fixed = T)
survival$OS.time <- as.numeric(survival$OS.time)*365
dat <- dat.va[,survival$sample]

write.csv(dat,file = 'dat(GSE20685).csv')
write.csv(survival,file = 'survival(GSE20685).csv')


# # GSE20685
# if (!dir.exists("GSE42568")) 
# {dir.create("GSE42568")}
# setwd("GSE42568")
# 
# 
# # GSE42568
# gset<-getGEO("GSE42568",
#              destdir = '.',
#              GSEMatrix = T,
#              getGPL = F)
# gset$GSE42568_series_matrix.txt.gz@annotation
# expr<-as.data.frame(exprs(gset[[1]]))
# gpl<-getGEO("GPL570",destdir = '../../../../pipeline/GPL/')
# a=gset[[1]]
# #gpl<-Table(gpl)
# gpl <- idmap(gpl = 'GPL570')
# colnames(gpl)
# library(clusterProfiler)
# probe2symobl<-gpl
# # probe2symobl<-gpl %>%
# #   dplyr::select('ID','Symbol')%>%
# #   filter('Symbol'!='')%>%
# #   separate('Symbol',c('symbol','drop'),sep = '///')%>%
# #   dplyr::select(-drop)
# probe2symobl=probe2symobl[probe2symobl$symbol!='',]
# colnames(probe2symobl)<-c('ID','symbol')
# dat.va<-expr
# dat.va$ID<-rownames(dat.va)
# dat.va$ID<-as.character(dat.va$ID)
# probe2symobl$ID<-as.character(probe2symobl$ID)
# dat.va<-dat.va %>%
#   inner_join(probe2symobl,by='ID')%>%
#   dplyr::select(-ID)%>%     ## 
#   dplyr::select(symbol,everything())%>%     ## 
#   mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>%    ## 
#   arrange(desc(rowMean))%>%       ## 
#   distinct(symbol,.keep_all = T)%>%      ## symbol
#   dplyr::select(-rowMean)%>%     ## rowMean
#   tibble::column_to_rownames(colnames(.)[1])   ## 
# 
# ## 
# pd <- pData(a)
# table(pd$source_name_ch1)
# pd <- subset(pd,source_name_ch1 == 'Breast tissue, cancer')
# survival <- data.frame(sample=pd$geo_accession,OS=pd$characteristics_ch1.9,OS.time=pd$characteristics_ch1.8)
# 
# 
# table(survival$OS)
# survival$OS <- ifelse(survival$OS=='overall survival event: 0',0,1)
# table(survival$OS.time)
# survival$OS.time <- gsub('overall survival time_days: ','',survival$OS.time,fixed = T)
# #survival$OS.time <- as.numeric(survival$OS.time)*30
# dat <- dat.va[,survival$sample]
# 
# write.csv(dat,file = 'dat(GSE42568).csv')
# write.csv(survival,file = 'survival(GSE42568).csv')
# 
# 


# # GSE7390
# if (!dir.exists("GSE7390")) 
# {dir.create("GSE7390")}
# setwd("GSE7390")
# 
# 
# # GSE7390
# gset<-getGEO("GSE7390",
#              destdir = '.',
#              GSEMatrix = T,
#              getGPL = F)
# gset$GSE7390_series_matrix.txt.gz@annotation
# expr<-as.data.frame(exprs(gset[[1]]))
# gpl<-getGEO("GPL96",destdir = '../../../../pipeline/GPL/')
# a=gset[[1]]
# #gpl<-Table(gpl)
# gpl <- idmap(gpl = 'GPL96')
# colnames(gpl)
# library(clusterProfiler)
# probe2symobl<-gpl
# # probe2symobl<-gpl %>%
# #   dplyr::select('ID','Symbol')%>%
# #   filter('Symbol'!='')%>%
# #   separate('Symbol',c('symbol','drop'),sep = '///')%>%
# #   dplyr::select(-drop)
# probe2symobl=probe2symobl[probe2symobl$symbol!='',]
# colnames(probe2symobl)<-c('ID','symbol')
# dat.va<-expr
# dat.va$ID<-rownames(dat.va)
# dat.va$ID<-as.character(dat.va$ID)
# probe2symobl$ID<-as.character(probe2symobl$ID)
# dat.va<-dat.va %>%
#   inner_join(probe2symobl,by='ID')%>%
#   dplyr::select(-ID)%>%     ## 
#   dplyr::select(symbol,everything())%>%     ## 
#   mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>%    ## 
#   arrange(desc(rowMean))%>%       ## 
#   distinct(symbol,.keep_all = T)%>%      ## symbol
#   dplyr::select(-rowMean)%>%     ## rowMean
#   tibble::column_to_rownames(colnames(.)[1])   ## 
# 
# ## 
# pd <- pData(a)
# table(pd$source_name_ch1)
# #pd <- subset(pd,source_name_ch1 == 'Breast tissue, cancer')
# survival <- data.frame(sample=pd$geo_accession,OS=pd$characteristics_ch1.16,OS.time=pd$characteristics_ch1.15)
# 
# 
# table(survival$OS)
# survival$OS <- ifelse(survival$OS=='e.os: 0',0,1)
# table(survival$OS.time)
# survival$OS.time <- gsub('t.os: ','',survival$OS.time,fixed = T)
# #survival$OS.time <- as.numeric(survival$OS.time)*30
# dat <- dat.va[,survival$sample]
# 
# write.csv(dat,file = 'dat(GSE7390).csv')
# write.csv(survival,file = 'survival(GSE7390).csv')


## GSE58812
#if (!dir.exists("GSE58812")) 
#{dir.create("GSE58812")}
#setwd("GSE58812")
#
#
## GSE58812
#gset<-getGEO("GSE58812",
#             destdir = '.',
#             GSEMatrix = T,
#             getGPL = F)
#gset$GSE58812_series_matrix.txt.gz@annotation
#expr<-as.data.frame(exprs(gset[[1]]))
#gpl<-getGEO("GPL570",destdir = '../../../../pipeline/GPL/')
#a=gset[[1]]
##gpl<-Table(gpl)
#gpl <- idmap(gpl = 'GPL570')
#colnames(gpl)
#library(clusterProfiler)
#probe2symobl<-gpl
## probe2symobl<-gpl %>%
##   dplyr::select('ID','Symbol')%>%
##   filter('Symbol'!='')%>%
##   separate('Symbol',c('symbol','drop'),sep = '///')%>%
##   dplyr::select(-drop)
#probe2symobl=probe2symobl[probe2symobl$symbol!='',]
#colnames(probe2symobl)<-c('ID','symbol')
#dat.va<-expr
#dat.va$ID<-rownames(dat.va)
#dat.va$ID<-as.character(dat.va$ID)
#probe2symobl$ID<-as.character(probe2symobl$ID)
#dat.va<-dat.va %>%
#  inner_join(probe2symobl,by='ID')%>%
#  dplyr::select(-ID)%>%     ## 
#  dplyr::select(symbol,everything())%>%     ## 
#  mutate(rowMean=rowMeans(.[grep('GSM',names(.))]))%>%    ## 
#  arrange(desc(rowMean))%>%       ## 
#  distinct(symbol,.keep_all = T)%>%      ## symbol
#  dplyr::select(-rowMean)%>%     ## rowMean
#  tibble::column_to_rownames(colnames(.)[1])   ## 
#
### 
#pd <- pData(a)
#colnames(pd)
#table(pd$source_name_ch1)
#pd <- subset(pd,source_name_ch1 == 'Breast tumor biospsie')
#survival <- data.frame(sample=pd$geo_accession,OS=pd$characteristics_ch1.4,OS.time=pd$characteristics_ch1.5)
#
#
#table(survival$OS)
#survival$OS <- ifelse(survival$OS=='death: 0',0,1)
#table(survival$OS.time)
#survival$OS.time <- gsub('os (days): ','',survival$OS.time,fixed = T)
##survival$OS.time <- as.numeric(survival$OS.time)*30
#dat <- dat.va[,survival$sample]
#dat <- log2(dat+1)
#write.csv(dat,file = 'dat(GSE58812).csv')
#write.csv(survival,file = 'survival(GSE58812).csv')


rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./00_rawdata")){
  dir.create("./00_rawdata")
}
setwd("./00_rawdata")

library(readr)
library(readxl)
library(tidyverse)
library(tibble)

##
annotation <- read.table(file = './gencode.v36.annotation.gtf.gene.probemap')
probe2symbol <- annotation[,(1:2)]
colnames(probe2symbol) <- c('ID', 'symbol')
probe2symbol <- probe2symbol[-1,]
probe2symbol$ID <- as.character(probe2symbol$ID)

#count
count.log <- read_tsv(file = './TCGA-BRCA.star_counts.tsv.gz') %>% 
  column_to_rownames(., var = 'Ensembl_ID')
#count.log <- tinyarray::trans_exp(count.log,mrna_only = T)#
count <- 2^count.log-1 #xenalog2+1，

dat_count <- count
dat_count$ID <- rownames(dat_count) %>% as.character()

##id
dat_count <- dat_count %>%
  inner_join(probe2symbol, by = 'ID') %>% #ID
  dplyr::select(-ID) %>% #ID，
  dplyr::select(symbol, everything()) %>% #symbol
  mutate(rowMean = rowMeans(.[grep('TCGA', names(.))])) %>% #TCGA,mutateDataFrame，。
  arrange(desc(rowMean)) %>% #
  distinct(symbol, .keep_all = T) %>% #symbol,.keep_all
  dplyr::select(-rowMean) %>% #rowMean
  tibble::column_to_rownames(colnames(.)[1]) #symbol,column_to_rownamestibble
dim(dat_count) 
##59427   589

# ## 
survival <- read_tsv('./TCGA-BRCA.survival.tsv.gz')
dat_count <- dat_count[,colnames(dat_count) %in% survival$sample]
dim(dat_count)
#  59427   576
## 
phenotype <- read_tsv('./TCGA-BRCA.clinical.tsv.gz') %>% data.frame(.)
colnames(phenotype)

# oral_cancer_sites <- c("Base of tongue, NOS", "Cheek mucosa", "Floor of mouth, NOS", 
#                        "Gum, NOS", "Hard palate", "Lip, NOS", "Mouth, NOS", 
#                        "Overlapping lesion of lip, oral cavity and pharynx", 
#                        "Palate, NOS", "Tongue, NOS", "Tonsil, NOS")
# 
# # 
# phenotype <- phenotype[phenotype$tissue_or_organ_of_origin.diagnoses %in% oral_cancer_sites, ]


table(phenotype$sample_type.samples)
table(phenotype$sample_id.samples)
group <- subset(phenotype, sample_type.samples == 'Primary Tumor' | sample_type.samples == 'Solid Tissue Normal')
group <- group[group$sample %in% colnames(dat_count),]
group <- group[order(group$sample_type.samples, decreasing = T),]
group <- data.frame(sample = group$sample, group = group$sample_type.samples)
group$group <- ifelse(group$group == 'Solid Tissue Normal', 'Normal', 'Tumor')
group <- data.frame(row.names = group$sample, group = group$group )####Typegroup
#
dat_count <- subset(dat_count, select=rownames(group))
identical(colnames(dat_count), rownames(group))
dim(group) 
#574   1 
table(group$group)
# Normal  Tumor 
# 59    515 
write.csv(group, file = '03.group_TCGA.csv', quote=F) 

### mRNA
library("rtracklayer")
gtf_data = import('./gencode.v36.annotation.gtf.gz') %>%as.data.frame()
head(gtf_data)
table(gtf_data$gene_type)
protein_coding=gtf_data%>%
  dplyr::filter(type=="gene",gene_type=="protein_coding")%>%
  dplyr::select(gene_id,gene_type,gene_name)
mRNA.count<-dat_count[rownames(dat_count)%in%protein_coding$gene_name,]
dim(mRNA.count) 
# 19938   574
identical(colnames(mRNA.count), rownames(group))
write.csv(mRNA.count,file = '01.expr_count_TCGA.csv', quote = F, row.names = T)


##tpm
fpkm.log <- read_tsv(file = './TCGA-BRCA.star_tpm.tsv.gz') %>%
  as.data.frame() %>%
  column_to_rownames(var = 'Ensembl_ID')
fpkm <- 2^fpkm.log-1 #xenalog2+1，

dat_fpkm <- fpkm
dat_fpkm$ID <- rownames(dat_fpkm)
dat_fpkm$ID <- as.character(dat_fpkm$ID)
##id
dat_fpkm <- dat_fpkm %>%
  inner_join(probe2symbol, by = 'ID') %>% #ID
  dplyr::select(-ID) %>% #ID，
  dplyr::select(symbol, everything()) %>% #symbol
  mutate(rowMean = rowMeans(.[grep('TCGA', names(.))])) %>% #,mutateDataFrame，。
  arrange(desc(rowMean)) %>% #
  distinct(symbol, .keep_all = T) %>% #symbol,.keep_all
  dplyr::select(-rowMean) %>% #rowMean
  tibble::column_to_rownames(colnames(.)[1]) #,column_to_rownamestibble
dim(dat_fpkm) 
## 59427   589

dat_fpkm <- dat_fpkm[,colnames(mRNA.count)]
dat_fpkm <- dat_fpkm[rownames(mRNA.count),]
dat_fpkm <- na.omit(dat_fpkm)
identical(colnames(dat_fpkm), rownames(group))
identical(colnames(dat_fpkm), colnames(mRNA.count))
identical(rownames(dat_fpkm), rownames(mRNA.count))
dim(dat_fpkm)
# 19938   574

dat_fpkm_log2 <- log2(dat_fpkm + 1)
write.csv(dat_fpkm_log2, file = '02.expr_tpm_TCGA.csv', row.names = T, quote = F)


#
survival <- read_tsv(file = './TCGA-BRCA.survival.tsv.gz')
##
group1 <- read.csv("./03.group_TCGA.csv")
table(group1$group)
group2 <- group1[group1$group == 'Tumor',]
survival <- survival[survival$sample %in% group2$X,]
survival$`_PATIENT`
survival <- survival %>% select(-`_PATIENT`)
dim(survival)
# 515   3

write.csv(survival, file = '04.survival_TCGA.csv', quote=F) 


#phe-
phe1 <- read.delim2("./TCGA-BRCA.clinical.tsv.gz", sep = "\t")
colnames(phe1)
table(phe1$race.demographic)
cli_dat1 <- cbind(sample = phe1$sample,
                  gender = phe1$gender.demographic,
                  age = phe1$age_at_index.demographic,
                  histological=phe1$neoplasm_histologic_grade,
                  #smoking=phe1$tobacco_smoking_history,
                  #race=phe1$race.demographic,
                  stage = phe1$ajcc_pathologic_stage.diagnoses,
                  T_stage = phe1$ajcc_pathologic_t.diagnoses,
                  N_stage = phe1$ajcc_pathologic_n.diagnoses,
                  M_stage = phe1$ajcc_pathologic_m.diagnoses
)

cli_dat <- as.data.frame(cli_dat1)
#table(cli_dat$M_stage)#
cli_dat[cli_dat == ""] <- NA
cli_dat <- na.omit(cli_dat)
#cli_dat$age <- as.numeric(cli_dat$age) / 365
write.csv(cli_dat,file = "05.phenotype_TCGA.csv",quote=F,row.names = F)


rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./01_DEG_TCGA")){
  dir.create("./01_DEG_TCGA")
}
setwd("./01_DEG_TCGA")


#DEG1-TCGA-----------------------------------------------------------------------------------------
library(tidyverse)
library(lance)
library(DESeq2)
dat <- read.csv('../00_rawdata/01.expr_count_TCGA.csv',row.names = 1,check.names = F)
dat <- round(dat,0)
group <- read.csv('../00_rawdata/03.group_TCGA.csv')
colnames(group) <- c('sample','group')
table(group$group) 
# Normal  Tumor 
# 32    375 
colnames(group)
condition = factor(group$group)
coldata <- data.frame(row.names = colnames(dat), condition)

dds <- DESeqDataSetFromMatrix(countData = dat,colData=coldata,design = ~condition)
dds = dds[rowSums(counts(dds)) > 1,]
dds <- estimateSizeFactors(dds)
# 
normalized_counts <- counts(dds,normalized=T) 
#write.table(normalized_counts,file = 'normalized.counts.csv',sep = ',',row.names = T,quote = F) #
dds <- DESeq(dds)
# 
res = results(dds, contrast = c("condition","Tumor","Normal"))
res = res[order(res$pvalue),]
head(res)
summary(res)
table(res$pvalue<0.05)
DEG <- as.data.frame(res)
DEG <- na.omit(DEG)
dim(DEG)
head(DEG)
## change
logFC_cutoff <- 1
DEG$change = as.factor(
  ifelse(DEG$pvalue<0.05&abs(DEG$log2FoldChange)>logFC_cutoff,
         ifelse(DEG$log2FoldChange>logFC_cutoff,'UP','DOWN'),'NOT'))
table(DEG$change)

sig_diff <- subset(DEG,
                   DEG$pvalue < 0.05 & abs(DEG$log2FoldChange) >= logFC_cutoff)
table(sig_diff$change)
#1    DOWN  NOT   UP 
#    1982    0 3095 
#0.5 DOWN  NOT   UP 
#    3600    0 5977 
write.csv(DEG,file = "01.DEG1_all.csv",quote = F,row.names = T)
write.csv(sig_diff,file = "02.DEG1_sig1.csv",quote = F,row.names = T)
saveRDS(res,file = 'res.rds')

#-----
library(dplyr)
library(ggfun)
library(grid)
library(ggplot2)
library(ggthemes)
library(Ipaper)
library(scales)
library(ggrepel)
library(pheatmap)
library(RColorBrewer)
library(gplots)
library(tidyverse)
library(lance)

dat_rep<-DEG[rownames(DEG)%in%
               rownames(rbind(head(sig_diff[order(sig_diff$log2FoldChange,decreasing = T),],10),
                              head(sig_diff[order(sig_diff$log2FoldChange,decreasing = F),],10))),]


volcano_plot<- ggplot(data = DEG,
                      aes(x = log2FoldChange,
                          y = -log10(pvalue),
                          color =change)) +
  scale_color_manual(values = c("#29AEB1", "darkgray","#B72230")) +
  #   scale_x_continuous(breaks = c(-10,-6,-3,-1,0,1,3,6,10)) +
  #scale_y_continuous(#trans = "log1p",breaks = c(0,50,100,150,200,500)) +
  geom_point(size = 2.4, alpha = 0.4, na.rm=T) +
  theme_bw(base_size = 12, base_family = "Times") +
  geom_vline(xintercept = c(-1,1),#--------------------------------------------#########FC#######
             lty = 4,
             col = "gray40",
             lwd = 0.6)+
  geom_hline(yintercept = -log10(0.05),
             lty = 4,
             col = "gray40",
             lwd = 0.6)+
  theme(legend.position = "right",
        panel.grid = element_blank(),
        legend.title = element_blank(),
        legend.text = element_text(face="bold",
                                   color="black",
                                   family = "Times",
                                   size=13),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(face = "bold",
                                   color = "black",
                                   size = 15),
        axis.text.y = element_text(face = "bold",
                                   color = "black",
                                   size = 15),
        axis.title.x = element_text(face = "bold",
                                    color = "black",
                                    size = 15),
        axis.title.y = element_text(face = "bold",
                                    color = "black",
                                    size = 15)) +
  geom_label_repel(
    data = dat_rep,
    aes(label = rownames(dat_rep)),
    max.overlaps = 1000,
    size = 4,
    box.padding = unit(0.5, "lines"),
    min.segment.length = 0,
    point.padding = unit(0.8, "lines"), segment.color = "black", show.legend = FALSE )+
  labs(x = "log2(Fold Change)",
       y = "-log10(P.Value)")
volcano_plot
pdf('03.volcano_2.pdf',  w=8,h=6,family='Times')
volcano_plot
dev.off()
png('03.volcano_2.png',  w=8,h=6,family='Times', units='in',res=600)
volcano_plot
dev.off()


# ------
library(ComplexHeatmap)
library(circlize)
dat_rep<-sig_diff[rownames(sig_diff)%in%
                    rownames(rbind(head(sig_diff[order(sig_diff$log2FoldChange,decreasing = T),],10),
                                   head(sig_diff[order(sig_diff$log2FoldChange,decreasing = F),],10))),]
dat_rep <- dat_rep[order(dat_rep$change,decreasing = T),]
rt <- dat
rt <- rt[rownames(dat_rep),]
group <- group[order(group$group),]
table(group$group)
group$group <- ifelse(group$group == 'Tumor','BC','Control')
rt <- rt[,group$sample]
x<-log2(rt+1)
mat <- t(scale(t(x)))#
mat[mat < (-2)] <- (-2)
mat[mat > 2] <- 2

pdf('06.heatmap.pdf',  w=6,h=6,family='Times')
densityHeatmap(mat ,title = "Distribution as heatmap", ylab = " ",height = unit(3, "cm")) %v%
  HeatmapAnnotation(Group = group$group, col = list(Group = c("BC" = "#B72230", "Control" = "#104680"))) %v%
  Heatmap(mat, 
          row_names_gp = gpar(fontsize = 9),
          show_column_names = F,
          show_row_names = T,
          ###show_colnames = FALSE,
          name = "expression", 
          ###cluster_cols = F,
          cluster_rows = F,
          height = unit(6, "cm"),
          #cluster_columns = FALSE,
          ###cluster_rows = FALSE,
          col = colorRampPalette(c("darkgreen", "white","orange"))(100))
dev.off()

png('06.heatmap.png',w=6,h=6,units='in',res=600,family='Times')
densityHeatmap(mat ,title = "Distribution as heatmap", ylab = " ",height = unit(3, "cm")) %v%
  HeatmapAnnotation(Group = group$group, col = list(Group = c("BC" = "#B72230", "Control" = "#104680"))) %v%
  Heatmap(mat, 
          row_names_gp = gpar(fontsize = 9),
          show_column_names = F,
          show_row_names = T,
          ###show_colnames = FALSE,
          name = "expression", 
          ###cluster_cols = F,
          cluster_rows = F,
          height = unit(6, "cm"),
          #cluster_columns = FALSE,
          ###cluster_rows = FALSE,
          col = colorRampPalette(c("darkgreen", "white","orange"))(100))
dev.off()

rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./02_intersect")){
  dir.create("./02_intersect")
}
setwd("./02_intersect")

gene1 <- read.csv('../01_DEG_TCGA/02.DEG1_sig.csv') #5077
gene2 <- read.csv("../00_rawdata/L-RGs.csv")  #179


library(VennDiagram)
venn_list<-list(gene1=gene1$X,
                gene2=gene2$symbol
                
)
venn.plot <-venn.diagram(venn_list,  filename = NULL ,
                         force.unique = T,     #
                         print.mode = c("percent", "raw"),
                         fill = c("#E41A1C","#377EB8"), alpha = 0.6,
                         col = FALSE,
                         cex = 1.2,                                        #
                         cat.fontface = "bold",                            #
                         cat.col = c("#E41A1C","#377EB8"),     #
                         category.names = c('DEGs','LRGs'),     #
                         #fontfamily = 'serif',
                         cat.pos = c(-20,20),
                         cat.cex = 1.3,                                    ##
                         cat.fontfamily = 'serif',
                         margin = 0.1,
                         scaled =FALSE
)

inter <- get.venn.partitions(venn_list)
#venn.plotgrid.drawpdf
pdf("01.DEGs_LRGs.pdf")
grid.draw(venn.plot)
dev.off()

venn.diagram(venn_list, filename = '01.DEGs_LRGs.png', imagetype = 'png',
             fill = c("#E41A1C","#377EB8"), alpha = 0.6,
             col = FALSE,
             cat.fontface = "bold",
             cex = 1.2,
             force.unique = T,
             cat.col = c("#E41A1C","#377EB8"),
             category.names = c('DEGs','LRGs'),#
             print.mode = c("percent", "raw"),
             fontfamily = 'serif',
             cat.pos = c(-20,20),
             cat.cex = 1.3,
             #cat.fontfamily = 'serif',
             margin = 0.1,
             scaled =FALSE)
inter <- get.venn.partitions(venn_list)
for (i in 1:nrow(inter)) inter[i,'values'] <- paste(inter[[i,'..values..']], collapse = ', ')
#paste；collapse
Candidate_genes<-inter[1,]$..values.. $`1`
write.csv(Candidate_genes,'01.DEGs_LRGs.csv',quote = F,row.names = F)

# ############------------------------
# 
# rm(list = ls())
# setwd("/data/nas3///13_YQ835-2/")
# if (! dir.exists("./02_intersect")){
#   dir.create("./02_intersect")
# }
# setwd("./02_intersect")
# 
# gene1 <- read.csv('../01_DEG_TCGA/02.DEG1_sig.csv') #2152
# gene2 <- read.csv("../00_rawdata/NETs-RGs.csv")  #136
# 
# 
# 
# library(VennDiagram)
# venn_list<-list(gene1=gene1$X,
#                 gene2=gene2$symbol
#                 
# )
# venn.plot <-venn.diagram(venn_list,  filename = NULL ,
#                          force.unique = T,     #
#                          print.mode = c("percent", "raw"),
#                          fill = c("#E41A1C","#377EB8"), alpha = 0.6,
#                          col = FALSE,
#                          cex = 1.2,                                        #
#                          cat.fontface = "bold",                            #
#                          cat.col = c("#E41A1C","#377EB8"),     #
#                          category.names = c('DEGs','NETs-RGs'),     #
#                          #fontfamily = 'serif',
#                          #cat.pos = c(20,-20),
#                          cat.cex = 1.3,                                    ##
#                          cat.fontfamily = 'serif',
#                          margin = 0.1,
#                          scaled =FALSE
# )
# 
# inter <- get.venn.partitions(venn_list)
# #venn.plotgrid.drawpdf
# pdf("02.DEGs_NETs-RGs.pdf")
# grid.draw(venn.plot)
# dev.off()
# 
# venn.diagram(venn_list, filename = '02.DEGs_NETs-RGs.png', imagetype = 'png',
#              fill = c("#E41A1C","#377EB8"), alpha = 0.6,
#              col = FALSE,
#              cat.fontface = "bold",
#              cex = 1.2,
#              force.unique = T,
#              cat.col = c("#E41A1C","#377EB8"),
#              category.names = c('DEGs','NETs-RGs'),#
#              print.mode = c("percent", "raw"),
#              fontfamily = 'serif',
#              #cat.pos = c(20,-20),
#              cat.cex = 1.3,
#              #cat.fontfamily = 'serif',
#              margin = 0.1,
#              scaled =FALSE)
# inter <- get.venn.partitions(venn_list)
# for (i in 1:nrow(inter)) inter[i,'values'] <- paste(inter[[i,'..values..']], collapse = ', ')
# #paste；collapse
# Candidate_genes<-inter[1,]$..values.. $`1`
# write.csv(Candidate_genes,'02.DEGs_NETs-RGs.csv',quote = F,row.names = F)rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (!dir.exists("03_enrichment")) {dir.create("03_enrichment")}
setwd("03_enrichment")
.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ggplot2)
library(GOplot)
library(Rgraphviz)
library(ggnewscale)
library(DOSE)
library(dplyr)
library(tidyverse)
library(data.table)
library(ggraph)
library(tidygraph)
library(stringr)
library(ComplexHeatmap)

# charset -----------------------------------------------------------------
go.circ = 1
kegg.circ = 1
kegg.buble = 1
#kegg.chord = 1
# load('../00_rawdat/00_temp/enrich.Rdata')

# dat prepare -------------------------------------------------------------
enrich_ID <- read.csv("../02_intersect/01.DEGs_LRGs.csv")
# enrich_ID <- readRDS("../00_rawdat/00_temp/rsult.sig1.rds")[[20]]
select_DERNA <- read.csv("../01_DEG_TCGA/02.DEG1_sig.csv",row.names = 1)#  
# grep('',colnames(select_DERNA))
if(str_detect(colnames(select_DERNA)[1],'logFC')){
  colnames(select_DERNA)[1] <- 'log2FoldChange'
}
gene_symbol <- clusterProfiler::bitr(
  geneID = enrich_ID$x,
  fromType = "SYMBOL",
  toType = "ENTREZID",
  OrgDb = "org.Hs.eg.db")
library(clusterProfiler)

enrich_go <- enrichGO(gene = gene_symbol[,2],
                      OrgDb = org.Hs.eg.db,
                      keyType = "ENTREZID",
                      ont = "ALL",
                      pAdjustMethod = "fdr",
                      pvalueCutoff = 1,
                      qvalueCutoff = 1,
                      readable = TRUE)

enrich_go <- mutate(enrich_go, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
GO_ALL <- enrich_go@result
if(median(GO_ALL$richFactor) < 0.1){GO_ALL$richFactor<- GO_ALL$richFactor*10}

GO_ALL <- GO_ALL[GO_ALL$pvalue < 0.05,]
BP <- GO_ALL[GO_ALL$ONTOLOGY=='BP', ]
BP <- BP[order(BP$pvalue,decreasing = F),]
CC <- GO_ALL[GO_ALL$ONTOLOGY=='CC', ]
CC <- CC[order(CC$pvalue,decreasing = F),]
MF <- GO_ALL[GO_ALL$ONTOLOGY=='MF', ]
MF <- MF[order(MF$pvalue,decreasing = F),]
paste0("",dim(GO_ALL)[[1]],"，",dim(BP)[[1]],"，",dim(CC)[[1]],"，",dim(MF)[[1]],"")
#"231，179，14，38"
write.csv(GO_ALL,file = "00.go_ALL.csv",row.names = F)
write.csv(as.data.frame(BP), '00.GO_BP.csv', row.names = FALSE)
write.csv(as.data.frame(CC), '00.GO_CC.csv', row.names = FALSE)
write.csv(as.data.frame(MF), '00.GO_MF.csv', row.names = FALSE)

display_number = c(5, 5, 5)
ego_result_BP <- as.data.frame(BP)[1:display_number[1], ]
ego_result_CC <- as.data.frame(CC)[1:display_number[2], ]
ego_result_MF <- as.data.frame(MF)[1:display_number[3], ]
paste0(ego_result_BP$Description,collapse = "（）；")
paste0(ego_result_CC$Description,collapse = "（）；")
paste0(ego_result_MF$Description,collapse = "（）；")
col1 <- c("#8DA0CB","#E78AC3","#A6D854","#FFD92F","#E5C494","#8DD3C7")
col2 <- c("#BC80BD","#CCEBC5","#FFED6F","#1B9E77","#D95F02","#7570B3")
col3 <- c("#E4C755", "#9FA3A8",'#D6E7A3','#F4A371FF','#F9C187FF','#FACF94FF')
Col1 <- sample(col1,1)
Col2 <- sample(col2,1)
Col3 <- sample(col3,1)
#GO-
if(T){
  go_enrich_df <- data.frame(
    ID=c(ego_result_BP$ID, ego_result_CC$ID, ego_result_MF$ID),
    Description=c(ego_result_BP$Description,ego_result_CC$Description,ego_result_MF$Description),
    GeneNumber=c(ego_result_BP$Count, ego_result_CC$Count, ego_result_MF$Count),
    type=factor(c(rep("BP:biological process", display_number[1]), 
                  rep("CC:cellular component", display_number[2]),
                  rep("MF:molecular function", display_number[3])), 
                levels=c("BP:biological process", "CC:cellular component","MF:molecular function" )))
  go_enrich_df$type_order=factor(rev(as.integer(rownames(go_enrich_df))),labels=rev(go_enrich_df$Description))
  COLS <- c(Col1, Col2,Col3)
  
  p <- ggplot(data=go_enrich_df, aes(x=type_order,y=GeneNumber, fill=type)) + #
    geom_bar(stat="identity", width=0.8) + #，
    scale_fill_manual(values = COLS) + ###
    coord_flip() + ##，
    xlab("GO term") + 
    ylab("Gene Number") + 
    labs(title = "GO Terms")+
    theme_bw()+theme(
      legend.background = element_rect(fill = "white", color = "black", size = 0.2),
      legend.text = element_text(face="bold",color="black",family = "Times",size=12),
      plot.title = element_text(hjust = 0.5, face = "bold",color = "black",family = "Times",size = 18),
      axis.text.x = element_text(face = "bold",color = "black",size = 14),
      axis.text.y = element_text(face = "bold",color = "black",size = 14),
      axis.title.x = element_text(face = "bold",color = "black",family = "Times",size = 18),
      axis.title.y = element_text(face = "bold",color = "black",family = "Times",size = 18),
      plot.subtitle = element_text(hjust = 0.5,family = "Times", size = 14, face = "italic", colour = "black")
    )+scale_x_discrete(labels=function(x) str_wrap(x, width=40))
  pdf("01.go_barplot.pdf", width = 12, height = 8,family = "Times")
  print(p)
  dev.off()
  png("01.go_barplot.png", width = 12, height = 8,res = 600,units = 'in',family = "Times")
  print(p)
  dev.off()
} 
# circ
if(go.circ == 1){
  go_tem <- rbind(ego_result_BP,ego_result_CC,ego_result_MF)
  go <-  data.frame(Category = go_tem[,'ONTOLOGY'],ID = go_tem[,'ID'],Term = go_tem[,'Description'], 
                    Genes = gsub("/", ", ", go_tem[,'geneID']), adj_pval = go_tem[,'pvalue'])
  genelist <- data.frame(ID = rownames(select_DERNA), logFC = select_DERNA$log2FoldChange)
  row.names(genelist)=genelist[,1]
  library(GOplot)
  circ <- circle_dat(go, genelist)
  go_tem$pvalue <- -log10(go_tem$pvalue)
  go_enrich_df <- data.frame(
    category=go_tem$ONTOLOGY, 
    gene_num.min = 0,
    gene_num.max = max(go_tem$Count),
    gene_num.rich=go_tem$Count,
    "-log10.p" = go_tem$pvalue,
    up.regulated = 0,
    down.regulated = 0,
    rich.factor = go_tem$richFactor,
    Description=go_tem$ID)
  circ$sig <- ifelse(circ$logFC > 0,"up","down")
  tem1 <- go_enrich_df$Description
  i =1
  for(i in 1:15){
    x = tem1[i]
    circ1 <- circ[grep(x,circ$ID),]
    num1 <- nrow(circ1)
    tem2 <- as.data.frame(table(circ1$sig))
    if(nrow(tem2) == 2){
      go_enrich_df[match(x,go_enrich_df$Description),7] = table(circ1$sig)[[1]]
      go_enrich_df[match(x,go_enrich_df$Description),6] = table(circ1$sig)[[2]]
    }else if(tem2[1,1] == "up"){
      go_enrich_df[match(x,go_enrich_df$Description),6] = table(circ1$sig)[[1]]
    }else{go_enrich_df[match(x,go_enrich_df$Description),7] = table(circ1$sig)[[1]]}
  }
  
  go_enrich_df$Description <- factor(go_enrich_df$Description, levels = go_enrich_df$Description)
  rownames(go_enrich_df) <- go_enrich_df$Description
  pdf(file = paste0("02.GOcirclize_plot.pdf"),width = 12,height = 6,family = "Times")
  a <- dev.cur()   
  png(file = paste0("02.GOcirclize_plot.png"),width= 12, height= 6, units="in", res=300,family = "Times")
  dev.control("enable")
  par(mar = c(2,2,2,2),cex=1,family="Times")
  circle_size = unit(1, 'snpc')
  # circlize 
  library(circlize)
  ##
  circos.par(gap.degree = 2, start.degree = 90)
  ##， ko
  plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.max')]  #， ko 
  ko_color <- c(rep(Col1, 5), rep(Col2, 5), rep(Col3, 5))  #
  circos.genomicInitialize(plot_data, plotType = NULL, major.by = 1)  #
  circos.track(
    ylim = c(0, 1), track.height = 0.05, bg.border = NA, bg.col = ko_color,  #、
    panel.fun = function(x, y) {
      ylim = get.cell.meta.data('ycenter')  #ylim、xlim  ko id 
      xlim = get.cell.meta.data('xcenter')
      sector.name = get.cell.meta.data('sector.index')  #sector.name  ko id 
      circos.axis(h = 'top', labels.cex = 0.4, major.tick.percentage = 0.4, labels.niceFacing = FALSE)  #
      circos.text(xlim, ylim, sector.name, cex = 0.8, niceFacing = FALSE)  # ko id 
    } )
  colnames(go_enrich_df)[5] <- c("-log10(pvalue)")
  ##， p 
  plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.rich', '-log10(pvalue)')]
  label_data <- go_enrich_df['gene_num.rich'] 
  p_max <- round(max(go_enrich_df$`-log10(pvalue)`)) + 1 
  colorsChoice <- colorRampPalette(c('white','#4393C3'))
  color_assign <- colorRamp2(breaks = 0:p_max, col = colorsChoice(p_max + 1))
  circos.genomicTrackPlotRegion(
    plot_data, track.height = 0.08, bg.border = NA, stack = TRUE,  #、
    panel.fun = function(region, value, ...) {
      circos.genomicRect(region, value, col = color_assign(value[[1]]), border = NA, ...)  #， p 
      ylim = get.cell.meta.data('ycenter')  #，ylim、xlim、sector.name （）
      xlim = label_data[get.cell.meta.data('sector.index'),1] / 2
      sector.name = label_data[get.cell.meta.data('sector.index'),1]
      circos.text(xlim, ylim, sector.name, cex = 0.3, niceFacing = FALSE)  #（）
    } )
  
  go_enrich_df$all.regulated <- go_enrich_df$up.regulated + go_enrich_df$down.regulated
  go_enrich_df$up.proportion <- go_enrich_df$up.regulated / go_enrich_df$all.regulated
  go_enrich_df$down.proportion <- go_enrich_df$down.regulated / go_enrich_df$all.regulated
  go_enrich_df$up <- go_enrich_df$up.proportion * go_enrich_df$gene_num.max
  plot_data_up <- go_enrich_df[c('Description', 'gene_num.min', 'up')]
  names(plot_data_up) <- c('Description', 'start', 'end')
  plot_data_up$type <- 1  
  go_enrich_df$down <- go_enrich_df$down.proportion * go_enrich_df$gene_num.max + go_enrich_df$up
  plot_data_down <- go_enrich_df[c('Description', 'up', 'down')]
  names(plot_data_down) <- c('Description', 'start', 'end')
  plot_data_down$type <- 2  
  
  plot_data <- rbind(plot_data_up, plot_data_down)
  label_data <- go_enrich_df[c('up', 'down', 'up.regulated', 'down.regulated')]
  color_assign <- colorRamp2(breaks = c(1, 2), col = c('#FDDBC7', '#9ACD32'))
  circos.genomicTrackPlotRegion(
    plot_data, track.height = 0.08, bg.border = NA, stack = TRUE,  #、
    panel.fun = function(region, value, ...) {
      circos.genomicRect(region, value, col = color_assign(value[[1]]), border = NA, ...)  #，，
      ylim = get.cell.meta.data('cell.bottom.radius') - 0.5  #，ylim、xlim、sector.name （）
      xlim = label_data[get.cell.meta.data('sector.index'),1] / 2
      sector.name = label_data[get.cell.meta.data('sector.index'),3]
      circos.text(xlim, ylim, sector.name, cex = 0.4, niceFacing = FALSE)  #（）
      xlim = (label_data[get.cell.meta.data('sector.index'),2]+label_data[get.cell.meta.data('sector.index'),1]) / 2
      sector.name = label_data[get.cell.meta.data('sector.index'),4]
      circos.text(xlim, ylim, sector.name, cex = 0.4, niceFacing = FALSE)  #，
    } )
  plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.max', 'rich.factor')]
  label_data <- go_enrich_df['category']
  color_assign <- c('BP' = Col1, 'CC' = Col2, 'MF' = Col3)
  circos.genomicTrack(
    plot_data, ylim = c(0, max(plot_data$rich.factor)), track.height = 0.3, bg.col = 'gray95', bg.border = NA,  #、
    panel.fun = function(region, value, ...) {
      sector.name = get.cell.meta.data('sector.index')
      circos.genomicRect(region, value, col = color_assign[label_data[sector.name,1]], border = NA, ytop.column = 1, ybottom = 0, ...)  #，， ko 
      circos.lines(c(0, max(region)), c(0.05, 0.05), col = 'gray', lwd = 0.3) 
    } )
  circos.clear()
  category_legend <- Legend(
    labels = c("BP:biological process", "CC:cellular component","MF:molecular function" ),
    type = 'points', pch = NA, background = c(Col1,Col2,Col3), 
    labels_gp = gpar(fontsize = 8), grid_height = unit(0.5, 'cm'), grid_width = unit(0.5, 'cm'))
  updown_legend <- Legend(
    labels = c('Up-regulated', 'Down-regulated'), 
    type = 'points', pch = NA, background = c('#FDDBC7', '#9ACD32'), 
    labels_gp = gpar(fontsize = 8), grid_height = unit(0.5, 'cm'), grid_width = unit(0.5, 'cm'))
  pvalue_legend <- Legend(
    col_fun = colorRamp2(round(seq(0, p_max, length.out = 6), 0), 
                         colorRampPalette(c('white','#4393C3'))(6)),
    legend_height = unit(3, 'cm'), labels_gp = gpar(fontsize = 8), 
    title_gp = gpar(fontsize = 9), title_position = 'topleft', title = '-Log10(pvalue)')
  
  lgd_list_vertical <- packLegend(updown_legend, pvalue_legend)
  pushViewport(viewport(x = 0.85, y = 0.5))
  grid.draw(lgd_list_vertical)
  upViewport()
  lgd_list_vertical <- packLegend(category_legend)
  pushViewport(viewport(x = 0.5, y = 0.5))
  grid.draw(lgd_list_vertical)
  upViewport()
  dev.copy(which = a) 
  dev.off()
  dev.off()
}

go_tem <- rbind(ego_result_BP,ego_result_CC,ego_result_MF)
library(tidyverse)
df <- go_tem[,c('ONTOLOGY','Description','geneID','Count','pvalue')] %>% 
  filter(ONTOLOGY %in% c("BP","CC","MF")) %>%
  mutate(`-log10(pvalue)`=-log10(pvalue)) %>%
  arrange(ONTOLOGY,desc(`-log10(pvalue)`))

df1 <- df
new_df <- data.frame()
columns <- names(df1)

for (i in 1:nrow(df1)) {
  current_row <- df1[i, ]
  new_row <- as.data.frame(matrix(NA, nrow = 1, ncol = length(columns)))
  colnames(new_row) <- columns
  new_df <- rbind(new_df, current_row, new_row)
}

new_df <- new_df %>% 
  mutate(id=as.character(row_number()),
         col=case_when(ONTOLOGY=="BP" ~ "#7294D4",
                       ONTOLOGY=="CC" ~ "#00A08A",
                       ONTOLOGY=="MF" ~ "#F98400",
                       TRUE ~ "white"))

new_df$id <- factor(new_df$id,levels = new_df$id %>% rev())
col_mapping <- new_df[,c('id','col')] %>% deframe()

replace_fifth_slash <- function(text) {
  parts <- str_split(text, "/")[[1]]
  if (length(parts) > 13) {
    parts[14] <- paste0("\n", parts[14])
  }
  return(paste(parts, collapse = "/"))
}

new_df <- new_df %>%
  mutate(geneID = sapply(geneID, replace_fifth_slash))
colnames(new_df)
p <- ggplot(new_df,aes(`-log10(pvalue)`,id))+
  geom_bar(stat = "identity",aes(fill=ONTOLOGY),
           show.legend = F)+
  geom_text(data=new_df %>% drop_na(),
            aes(label=Count),vjust=0.5,hjust=0,size=3)+
  geom_text(data=new_df %>% drop_na() %>% filter(Count <= 13),
            aes(x=0.01,label=geneID,color=ONTOLOGY),
            vjust=0.5,hjust=0,size=3,nudge_y =-1,show.legend = F)+
  geom_text(data=new_df %>% drop_na() %>% filter(Count > 13),
            aes(x=0.01,label=geneID,color=ONTOLOGY),
            vjust=0.5,hjust=0,size=3,nudge_y =-1.5
            ,show.legend = F)+
  scale_x_continuous(expand= expansion(mult = c(0,0.05)))+
  scale_y_discrete(label=new_df$Description %>% rev())+
  scale_fill_manual(values = c("#7294D4","#00A08A","#F98400"),
                    na.translate = FALSE)+
  scale_color_manual(values = c("#7294D4","#00A08A","#F98400"))+
  labs(y=NULL)+
  coord_cartesian(clip = "off")+
  theme(axis.text.y=element_text(color=rev(col_mapping)),
        axis.ticks.y=element_blank(),
        panel.background = element_blank(),
        axis.line.x=element_line(color="black"),
        plot.margin=unit(c(0.5,0.5,0.5,0.5),unit="cm"))+
  annotate(geom="rect",xmin=-0.12,xmax=-0.01,ymin=0.2,ymax=11,
           fill="#F98400",alpha=0.6)+
  annotate(geom="rect",xmin=-0.12,xmax=-0.01,ymin=11,ymax=21,
           fill="#00A08A",alpha=0.6)+
  annotate(geom="rect",xmin=-0.12,xmax=-0.01,ymin=21,ymax=31,
           fill="#7294D4",alpha=0.6)+
  annotate(geom="text",x=-0.065,y=25.5,label="BP",size=3,color="black",
           angle=90,fontface="bold")+
  annotate(geom="text",x=-0.065,y=15.5,label="CC",size=3,color="black",
           angle=90,fontface="bold")+
  annotate(geom="text",x=-0.065,y=5.5,label="MF",size=3,color="black",
           angle=90,fontface="bold")
pdf("06.go_barplot.pdf", width = 10, height = 6,family = "Times")
p
dev.off()
png("06.go_barplot.png", width = 10, height = 6,res = 600,units = 'in',family = "Times")
p
dev.off()


library(tidyverse)
library(RColorBrewer)
library(MetBrewer)
library(ggnewscale)

colnames(df1)
df1$GeneRatio <- df1$Count / 109
pp <- ggplot(df1,aes(GeneRatio,Description))+
  geom_segment(aes(x=0,xend=GeneRatio,y=Description,yend=Description,color=ONTOLOGY),
               size=6,show.legend = F,alpha=0.5)+
  geom_text(aes(x=0,y=Description,label=Description),vjust=0.5,hjust=0,size=3.5,color="black")+
  scale_color_manual(values=c("#abd0a7","#3B9AB2","#00A08A"))+
  new_scale_color()+
  geom_point(aes(size=GeneRatio,color=pvalue,fill=pvalue),shape=19)+
  scale_fill_gradientn(colors=met.brewer("Cassatt1")) +
  scale_color_gradientn(colors=met.brewer("Cassatt1")) +
  guides(size=guide_legend(title="GeneRatio"))+
  facet_grid(ONTOLOGY~.,scale="free_y")+
  scale_x_continuous(expand = c(0,0.005))+
  theme_classic()+
  coord_cartesian(clip="off")+
  theme(axis.title = element_blank(),
        axis.text.y=element_blank(),
        axis.text.x=element_text(color="black",face="bold"),
        axis.ticks.y=element_blank(),
        panel.background = element_blank(),
        strip.background = element_rect(fill = "grey85", colour = NA),
        panel.spacing.y = unit(0, "cm"),
        legend.key=element_blank(),
        legend.title = element_text(color="black",size=9), 
        legend.text = element_text(color="black",size=8),
        legend.spacing.x=unit(0.1,'cm'),
        legend.key.width=unit(0.5,'cm'),
        legend.key.height=unit(0.5,'cm'),
        legend.background=element_blank(),
        legend.box.margin = margin(1,1,1,1),
        #legend.position = c(0.92,0.78),
        plot.margin=margin(0.5,0.5,0.5,0.5,"cm")
  )
pdf("07.go_barplot.pdf", width = 7, height = 6,family = "Times")
print(pp)
dev.off()
png("07.go_barplot.png", width = 7, height = 6,res = 600,units = 'in',family = "Times")
print(pp)
dev.off()


# _KEGG
library(clusterProfiler)
library(stringr)
library(ComplexHeatmap)
options(stringsAsFactors = F)
gene_symbol <- clusterProfiler::bitr(
  geneID = enrich_ID$x,
  fromType = "SYMBOL",
  toType = "ENTREZID",
  OrgDb = "org.Hs.eg.db")
KEGG <- enrichKEGG( gene = gene_symbol$ENTREZID,#
                    organism = "hsa",  #
                    keyType = "kegg",  #
                    minGSSize = 1,
                    maxGSSize = 500,
                    pvalueCutoff = 1,
                    pAdjustMethod = "BH",
                    qvalueCutoff = 0.9
)
KEGG <- mutate(KEGG, richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))
kk <- setReadable(KEGG, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
kegg_result <- kk@result
if(median(kegg_result$richFactor) < 0.1){kegg_result$richFactor<- kegg_result$richFactor*10}
hh <- as.data.frame(kegg_result)
rownames(hh) <- 1:nrow(hh)
hh <- hh[hh$pvalue <= 0.05,]
dim(hh)#31
hh <- hh[order(hh$pvalue,decreasing = F),]
write.csv(hh, file = "00.KEGG.csv")

rownames(hh) <- 1:nrow(hh)
hh$order=factor(rev(as.integer(rownames(hh))),labels = rev(hh$Description))
kk <- hh[c(1:10),] 
paste0(kk$Description,collapse = "（）；")
if(kegg.circ == 1){
  go <-  data.frame(Category = "KEGG",ID = kk[,'ID'],Term = kk[,'Description'], 
                    Genes = gsub("/", ", ", kk[,'geneID']), adj_pval = kk[,'pvalue'])
  genelist <- data.frame(ID = rownames(select_DERNA), logFC = select_DERNA$log2FoldChange)
  row.names(genelist)=genelist[,1]
  library(GOplot)
  circ <- circle_dat(go, genelist)
  kk$pvalue <- -log10(kk$pvalue)
  go_enrich_df <- data.frame(
    category="KEGG", 
    gene_num.min = 0,
    gene_num.max = max(kegg_result$Count),
    gene_num.rich=kk$Count,
    "-log10.p" = kk$pvalue,
    up.regulated = 0,
    down.regulated = 0,
    rich.factor = kk$richFactor,
    Description=kk$ID)
  circ$sig <- ifelse(circ$logFC > 0,"up","down")
  tem1 <- go_enrich_df$Description
  for(i in 1:10){
    x = tem1[i]
    circ1 <- circ[grep(x,circ$ID),]
    num1 <- nrow(circ1)
    tem2 <- as.data.frame(table(circ1$sig))
    if(nrow(tem2) == 2){
      go_enrich_df[match(x,go_enrich_df$Description),7] = table(circ1$sig)[[1]]
      go_enrich_df[match(x,go_enrich_df$Description),6] = table(circ1$sig)[[2]]
    }else if(tem2[1,1] == "up"){
      go_enrich_df[match(x,go_enrich_df$Description),6] = table(circ1$sig)[[1]]
    }else{
      go_enrich_df[match(x,go_enrich_df$Description),7] = table(circ1$sig)[[1]]}
  }
  
  go_enrich_df$Description <- factor(go_enrich_df$Description, levels = go_enrich_df$Description)
  rownames(go_enrich_df) <- go_enrich_df$Description
  library(ggplot2)
  pdf(file = paste0("06.KEGG_cic.pdf"),width = 13,height = 6,family = "Times")
  a <- dev.cur()   
  png(file = paste0("06.KEGG_cic.png"),width= 13, height= 6, units="in", res=300,family = "Times")
  dev.control("enable")
  par(family="Times")
  circle_size = unit(1, 'snpc')
  library(circlize)
  kegg.color <- sample(col1,1)
  circos.par(gap.degree = 2, start.degree = 90)
  plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.max')]  #， ko 
  ko_color <- c(rep(kegg.color, 15))  #
  circos.genomicInitialize(plot_data, plotType = NULL, major.by = 1)  #
  circos.track(
    ylim = c(0, 1), track.height = 0.05, bg.border = NA, bg.col = ko_color,  #、
    panel.fun = function(x, y) {
      ylim = get.cell.meta.data('ycenter')  #ylim、xlim  ko id 
      xlim = get.cell.meta.data('xcenter')
      sector.name = get.cell.meta.data('sector.index')  #sector.name  ko id 
      circos.axis(h = 'top', labels.cex = 0.6, major.tick.percentage = 0.4, labels.niceFacing = FALSE)  #
      circos.text(xlim, ylim, sector.name, cex = 0.8, niceFacing = FALSE)  # ko id 
    } )
  colnames(go_enrich_df)[5] <- c("-log10(pvalue)")
  plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.rich', '-log10(pvalue)')]  #， p 
  label_data <- go_enrich_df['gene_num.rich']  #，
  p_max <- round(max(go_enrich_df$`-log10(pvalue)`)) + 1  # p ，
  colorsChoice <- colorRampPalette(c('white','#4393C3'))  # p 
  color_assign <- colorRamp2(breaks = 0:p_max, col = colorsChoice(p_max + 1))
  
  circos.genomicTrackPlotRegion(
    plot_data, track.height = 0.08, bg.border = NA, stack = TRUE,  #、
    panel.fun = function(region, value, ...) {
      circos.genomicRect(region, value, col = color_assign(value[[1]]), border = NA, ...)  #， p 
      ylim = get.cell.meta.data('ycenter')  #，ylim、xlim、sector.name （）
      xlim = label_data[get.cell.meta.data('sector.index'),1] / 2
      sector.name = label_data[get.cell.meta.data('sector.index'),1]
      circos.text(xlim, ylim, sector.name, cex = 0.6, niceFacing = FALSE)  #（）
    } )
  go_enrich_df$all.regulated <- go_enrich_df$up.regulated + go_enrich_df$down.regulated
  go_enrich_df$up.proportion <- go_enrich_df$up.regulated / go_enrich_df$all.regulated
  go_enrich_df$down.proportion <- go_enrich_df$down.regulated / go_enrich_df$all.regulated
  
  go_enrich_df$up <- go_enrich_df$up.proportion * go_enrich_df$gene_num.max
  plot_data_up <- go_enrich_df[c('Description', 'gene_num.min', 'up')]
  names(plot_data_up) <- c('Description', 'start', 'end')
  plot_data_up$type <- 1  # 1 
  
  go_enrich_df$down <- go_enrich_df$down.proportion * go_enrich_df$gene_num.max + go_enrich_df$up
  plot_data_down <- go_enrich_df[c('Description', 'up', 'down')]
  names(plot_data_down) <- c('Description', 'start', 'end')
  plot_data_down$type <- 2  # 2 
  
  plot_data <- rbind(plot_data_up, plot_data_down)
  label_data <- go_enrich_df[c('up', 'down', 'up.regulated', 'down.regulated')]
  color_assign <- colorRamp2(breaks = c(1, 2), col = c('#FDDBC7', '#9ACD32'))
  circos.genomicTrackPlotRegion(
    plot_data, track.height = 0.08, bg.border = NA, stack = TRUE,  #、
    panel.fun = function(region, value, ...) {
      circos.genomicRect(region, value, col = color_assign(value[[1]]), border = NA, ...)  #，，
      ylim = get.cell.meta.data('cell.bottom.radius') - 0.5  #，ylim、xlim、sector.name （）
      xlim = label_data[get.cell.meta.data('sector.index'),1] / 2
      sector.name = label_data[get.cell.meta.data('sector.index'),3]
      circos.text(xlim, ylim, sector.name, cex = 0.6, niceFacing = FALSE)  #（）
      xlim = (label_data[get.cell.meta.data('sector.index'),2]+label_data[get.cell.meta.data('sector.index'),1]) / 2
      sector.name = label_data[get.cell.meta.data('sector.index'),4]
      circos.text(xlim, ylim, sector.name, cex = 0.6, niceFacing = FALSE)  #，
    } )
  plot_data <- go_enrich_df[c('Description', 'gene_num.min', 'gene_num.max', 'rich.factor')]  #，
  label_data <- go_enrich_df['category']  #，，
  color_assign <- c('KEGG' = kegg.color)
  circos.genomicTrack(
    plot_data, ylim = c(0, max(plot_data$rich.factor)), track.height = 0.3, bg.col = 'gray95', bg.border = NA,  #、
    panel.fun = function(region, value, ...) {
      sector.name = get.cell.meta.data('sector.index')  #sector.name  ko id ， ko ，
      circos.genomicRect(region, value,col = color_assign[label_data[sector.name,1]], border = NA, ytop.column = 1, ybottom = 0, ...)  #，， ko 
      circos.lines(c(0, max(region)), c(0.05, 0.05), col = 'gray', lwd = 0.3)  # 0.5 
    })
  circos.clear()
  category_legend <- Legend(
    labels = c("KEGG Pathway"),
    type = 'points', pch = NA, background = c(kegg.color), 
    labels_gp = gpar(fontsize = 8), grid_height = unit(0.5, 'cm'), grid_width = unit(0.5, 'cm'))
  updown_legend <- Legend(
    labels = c('Up-regulated', 'Down-regulated'), 
    type = 'points', pch = NA, background =c('#FDDBC7', '#9ACD32'), 
    labels_gp = gpar(fontsize = 8), grid_height = unit(0.5, 'cm'), grid_width = unit(0.5, 'cm'))
  pvalue_legend <- Legend(
    col_fun = colorRamp2(round(seq(0, p_max, length.out = 6), 0), 
                         colorRampPalette(c('white','#4393C3'))(6)),
    legend_height = unit(3, 'cm'), labels_gp = gpar(fontsize = 8), 
    title_gp = gpar(fontsize = 9), title_position = 'topleft', title = '-Log10(pvalue)')
  
  lgd_list_vertical <- packLegend(updown_legend, pvalue_legend)
  pushViewport(viewport(x = 0.85, y = 0.5))
  grid.draw(lgd_list_vertical)
  upViewport()
  lgd_list_vertical <- packLegend(category_legend)
  pushViewport(viewport(x = 0.5, y = 0.5))
  grid.draw(lgd_list_vertical)
  upViewport()
  dev.copy(which = a) 
  dev.off()
  dev.off()
}


if(kegg.buble == 1){
  kk <- hh[c(1:10),] 
  p2 <-  ggplot(kk,aes(y=order,x=Count))+
    geom_point(aes(size=Count,color= pvalue))+# 
    scale_x_continuous(breaks = seq(min(kk$Count), max(kk$Count), by = 1), # x  
                       limits = c(min(kk$Count) - 0.5, max(kk$Count) + 0.5))+  #x
    scale_color_gradient(high="#EE0000B2",low = "#008B45B2")+
    labs(color=expression(pvalue,size="Count"), 
         x="Gene Number",y="Pathways",title="KEGG Enrichment")+
    theme_bw()+
    theme(
      legend.background = element_rect(fill = "white", color = "black", size = 0.2),
      legend.text = element_text(face="bold",color="black",family = "Times",size=14),
      plot.title = element_text(hjust = 0.5, face = "bold",color = "black",family = "Times",size = 18),
      axis.text.x = element_text(face = "bold",color = "black",size = 14),
      axis.text.y = element_text(face = "bold",color = "black",size = 14),
      axis.title.x = element_text(face = "bold",color = "black",family = "Times",size = 18),
      axis.title.y = element_text(face = "bold",color = "black",family = "Times",size = 18),
      plot.subtitle = element_text(hjust = 0.5,family = "Times", size = 14, face = "bold", colour = "black")
    )+scale_y_discrete(labels=function(x) str_wrap(x, width=40))
  pdf("04.KEGG_dot.pdf", width = 9, height = 8,family = "Times")
  print(p2)
  dev.off()
  png("04.KEGG_dot.png", width = 9, height = 8,res = 600,units = 'in',family = "Times")
  print(p2)
  dev.off()
}

save.image('enrich.Rdata')


# important segment -------------------------------------------------------
# load('enrich.Rdata')
# paste0("",dim(GO_ALL)[[1]],"，",dim(BP)[[1]],"，",dim(CC)[[1]],"，",dim(MF)[[1]],"")
# paste0(ego_result_BP$Description,collapse = "（）、")
# paste0(ego_result_CC$Description,collapse = "（）、")
# paste0(ego_result_MF$Description,collapse = "（）、")
# hh2 <- subset(hh,pvalue<0.05)
# dim(hh2)
# paste0(hh$Description[1:15],collapse = "（）、")
# 
# 
# > paste0("",dim(GO_ALL)[[1]],"，",dim(BP)[[1]],"，",dim(CC)[[1]],"，",dim(MF)[[1]],"")
# [1] "2090，1955，37，98"
# > paste0(ego_result_BP$Description,collapse = "（）、")
# [1] "lymph vessel development（）、lymph vessel morphogenesis（）、epithelial cell proliferation（）、leukocyte migration（）、ameboidal-type cell migration"
# > paste0(ego_result_CC$Description,collapse = "（）、")
# [1] "platelet alpha granule（）、external side of plasma membrane（）、platelet alpha granule lumen（）、collagen-containing extracellular matrix（）、integrin complex"
# > paste0(ego_result_MF$Description,collapse = "（）、")
# [1] "receptor ligand activity（）、signaling receptor activator activity（）、integrin binding（）、transmembrane receptor protein kinase activity（）、growth factor binding"
# > hh2 <- subset(hh,pvalue<0.05)
# > dim(hh2)
# [1] 137  11
# > paste0(hh$Description[1:15],collapse = "（）、")
# [1] "PI3K-Akt signaling pathway（）、Focal adhesion（）、Proteoglycans in cancer（）、Ras signaling pathway（）、MAPK signaling pathway（）、AGE-RAGE signaling pathway in diabetic complications（）、Rap1 signaling pathway（）、EGFR tyrosine kinase inhibitor resistance（）、Relaxin signaling pathway（）、Human cytomegalovirus infection（）、Renal cell carcinoma（）、TNF signaling pathway（）、Calcium signaling pathway（）、HIF-1 signaling pathway（）、Yersinia infection"
# 
#prognostic model
#06_single_factor_cox------------------------------------------------
rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./05_single_factor_cox")){
  dir.create("./05_single_factor_cox")
}
setwd("./05_single_factor_cox")
.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(lance)
library(readxl)
library(readr)
library(tidyverse)

survival<-read.csv('../00_rawdata/04.survival_TCGA.csv',row.names = 1)
#survival$sample<-rownames(survival)
dat.tcga<-read.csv("../00_rawdata/02.expr_tpm_TCGA.csv", row.names = 1,check.names = F)
#colnames(dat.tcga)<-gsub('.','-',colnames(dat.tcga),fixed = T)
hubgene<-read.csv('../02_intersect/01.DEGs_LRGs.csv')
#write.csv(survival,file = '01.survival.csv',quote = F,row.names = F)

train_dat<-dat.tcga[,survival$sample]
train_dat<-t(train_dat)
train_dat<-as.data.frame(train_dat)
train_dat<-train_dat[,colnames(train_dat)%in%hubgene$x,]
train_dat$sample<-rownames(train_dat)
train_dat<-merge(survival,train_dat,by='sample')
train_dat<-column_to_rownames(train_dat,var = 'sample')
train_data<-train_dat
write.csv(train_data,"02.single_factor_cox_data.csv")

.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(survival)
library(survminer)
colnames_sum <- colnames(train_data)
colnames_sum <- gsub("-","_",colnames_sum)
colnames_sum <- gsub(" ","_",colnames_sum)
colnames(train_data) <- colnames_sum

covariates <- colnames_sum[-which(colnames_sum %in% c("OS", "OS.time"))]
#Surv()   
univ_formulas <- sapply(covariates,
                        function(x) as.formula(paste("Surv(OS.time, OS)~", x)))  #as.formula(). 。formula
# coxphcox cox
univ_models <- lapply(univ_formulas,
                      function(x) {coxph(x, data = train_data)})

# ph
univ_cox_zph <- lapply(univ_models,
                       function(x) {
                         y <- cox.zph(x)
                         #p
                         p<-y$table[-nrow(y$table),][3]
                         chisq <- y$table[-nrow(y$table),][1]
                         df <- y$table[-nrow(y$table),][2]
                         res<-c(chisq,df,p)
                         names(res)<-c("chisq","df","p")
                         return(res)
                       })
univ_cox_zph <- t(as.data.frame(univ_cox_zph, check.names = FALSE))
univ_cox_zph <- as.data.frame(univ_cox_zph)
write.csv(univ_cox_zph,"PH_Unifactor_all.csv")
univ_cox_zph2 <- subset(univ_cox_zph,p>0.05)
write.csv(univ_cox_zph2,"PH_Unifactor.csv")
univ_cox_zph <- rownames(univ_cox_zph)[univ_cox_zph$p>0.05] #98

train_dat<-train_dat[,colnames(train_dat)%in%univ_cox_zph,]
train_dat$sample<-rownames(train_dat)
train_dat<-merge(survival,train_dat,by='sample')
train_dat<-column_to_rownames(train_dat,var = 'sample')
train_data<-train_dat
write.csv(train_data,"02.single_factor_cox_data_PH_after.csv")

#cox
library(survival)
library(survminer)
colnames_sum <- colnames(train_data)
colnames_sum <- gsub("-","_",colnames_sum)
colnames_sum <- gsub(" ","_",colnames_sum)
colnames(train_data) <- colnames_sum

covariates <- colnames_sum[-which(colnames_sum %in% c("OS", "OS.time"))]
#Surv()   
univ_formulas <- sapply(covariates,
                        function(x) as.formula(paste("Surv(OS.time, OS)~", x)))  #as.formula(). 。formula
# coxphcox cox
univ_models <- lapply(univ_formulas,
                      function(x) {coxph(x, data = train_data)})

univ_results <- lapply(univ_models,
                       function(x){ 
                         x <- summary(x)
                         #p
                         p.value<-signif(x$wald["pvalue"], digits=3)
                         #HR
                         HR <-signif(x$coef[2], digits=4);
                         #95%
                         HR.confint.lower <- signif(x$conf.int[,"lower .95"],4)
                         HR.confint.upper <- signif(x$conf.int[,"upper .95"],4)
                         HR <- paste0(HR, " (", 
                                      HR.confint.lower, "-", 
                                      HR.confint.upper, ")")
                         res<-c(p.value,HR)
                         names(res)<-c("p.value","HR (95% CI for HR)")
                         return(res)
                       })

## coefb（beta）。exp(coef)cox（HR）
## zwald，coefse(coef)。ower .95 upper .95exp(coef)95%，，，，。
res_mod <- t(as.data.frame(univ_results, check.names = FALSE))
res_mod <- as.data.frame(res_mod)
res_results_0.01 <- res_mod[which(as.numeric(res_mod$p.value) < 0.01),]
res_results_0.01 <- na.omit(res_results_0.01)
# 4
write.csv(res_results_0.01,file = "03.single_factor_cox_results_0.05.csv",quote = F,row.names = T)
dim(res_results_0.01)
## 
library(tidyr)
res_results_0.01_2 <- separate(res_results_0.01, "HR (95% CI for HR)",
                               into = c("HR", "HR.95L", "HR.95H"),
                               sep = " ")
res_results_0.01_2 <- separate(res_results_0.01_2, "HR.95L",
                               into = c("HR.95L", "HR.95H"),
                               sep = "\\-")
res_results_0.01_2$HR.95L <- gsub("\\(", "", res_results_0.01_2$HR.95L)
res_results_0.01_2$HR.95H <- gsub("\\)", "", res_results_0.01_2$HR.95H)

res_results_0.01_2[,1:ncol(res_results_0.01_2)] <- as.numeric(unlist(res_results_0.01_2[,1:ncol(res_results_0.01_2)]))
res_results_0.01_2 <- res_results_0.01_2[order(res_results_0.01_2$HR),]
hz <- paste(round(res_results_0.01_2$HR,4),
            "(",round(res_results_0.01_2$HR.95L,4),
            "-",round(res_results_0.01_2$HR.95H,4),")",sep = "")


tabletext <- cbind(c(NA,"Gene",rownames(res_results_0.01_2)),
                   c(NA,"P value",ifelse(res_results_0.01_2$p.value<0.001,
                                         "< 0.001",
                                         round(res_results_0.01_2$p.value,3))),
                   c(NA,"Hazard Ratio(95% CI)",hz))
nrow(tabletext) + 1

.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(forestplot)
#
pdf(file = "04.univariate_cox_forest.pdf", height = 9, width = 10, onefile = F,family = 'Times')
forestplot(labeltext=tabletext,
           graph.pos=4,  #Pvalue
           is.summary = c(TRUE, TRUE,rep(FALSE, 70)),
           col=fpColors(box="darkred", lines="darkblue", zero = "gray50"),
           mean=c(NA,NA,res_results_0.01_2$HR),
           lower=c(NA,NA,res_results_0.01_2$HR.95L), #95%
           upper=c(NA,NA,res_results_0.01_2$HR.95H), #95%
           boxsize=0.1,lwd.ci=3,   #，
           ci.vertices.height = 0.08,ci.vertices=TRUE, #、、
           zero=1,lwd.zero=0.5,    #zero 
           colgap=unit(5,"mm"),    #
           xticks = c(0.2,0.4,0.6,0.8,1,1.2,1.4,1.6,1.8,2,2.2,2.4,2.6), #
           lwd.xaxis=2,            #X
           lineheight = unit(1.2,"cm"), #
           graphwidth = unit(.5,"npc"), #
           cex=0.9, fn.ci_norm = fpDrawCircleCI, #
           hrzl_lines = list("3" = gpar(col = "black", lty = 1, lwd = 2)),
           # hrzl_lines=list("2" = gpar(lwd=2, col="black"),
           #                 "3" = gpar(lwd=2, col="black"), #，
           #                 "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1
           # mar=unit(rep(0.01, times = 4), "cm"),#
           #fpTxtGpcex
           txt_gp=fpTxtGp(label=gpar(cex=1),
                          ticks=gpar(cex=0.8, fontface = "bold"),
                          xlab=gpar(cex = 1, fontface = "bold"),
                          title=gpar(cex = 1.25, fontface = "bold")),
           xlab="Hazard Ratio",
           grid = T,
           clip = c(0,2.2)) # x，
dev.off()

png(filename = "04.univariate_cox_forest.png", height = 9, width = 10,units='in',res=600,family='Times')
forestplot(labeltext=tabletext,
           graph.pos=4,  #Pvalue
           is.summary = c(TRUE, TRUE,rep(FALSE, 70)),
           col=fpColors(box="darkred", lines="darkblue", zero = "gray50"),
           mean=c(NA,NA,res_results_0.01_2$HR),
           lower=c(NA,NA,res_results_0.01_2$HR.95L), #95%
           upper=c(NA,NA,res_results_0.01_2$HR.95H), #95%
           boxsize=0.1,lwd.ci=3,   #，
           ci.vertices.height = 0.08,ci.vertices=TRUE, #、、
           zero=1,lwd.zero=0.5,    #zero 
           colgap=unit(5,"mm"),    #
           xticks = c(0.2,0.4,0.6,0.8,1,1.2,1.4,1.6,1.8,2,2.2,2.4,2.6), #
           lwd.xaxis=2,            #X
           lineheight = unit(1.2,"cm"), #
           graphwidth = unit(.5,"npc"), #
           cex=0.9, fn.ci_norm = fpDrawCircleCI, #
           hrzl_lines = list("3" = gpar(col = "black", lty = 1, lwd = 2)),
           # hrzl_lines=list("2" = gpar(lwd=2, col="black"),
           #                 "3" = gpar(lwd=2, col="black"), #，
           #                 "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1
           # mar=unit(rep(0.01, times = 4), "cm"),#
           #fpTxtGpcex
           txt_gp=fpTxtGp(label=gpar(cex=1),
                          ticks=gpar(cex=0.8, fontface = "bold"),
                          xlab=gpar(cex = 1, fontface = "bold"),
                          title=gpar(cex = 1.25, fontface = "bold")),
           xlab="Hazard Ratio",
           grid = T,
           clip = c(0,2.2)) # x，
dev.off()

# gene_list <- rownames(res_results_0.01)
# #gene_list <- lasso_geneids
# cox_data <- as.formula(paste0('Surv(OS.time, OS)~',
#                               paste(gene_list,
#                                     sep = '',
#                                     collapse = '+')))
# 
# cox_more <- coxph(cox_data,
#                   data = train_data)
# cox_zph <- cox.zph(cox_more)
# cox_table <- cox_zph$table[-nrow(cox_zph$table),]
# cox_table2 <- data.frame(cox_table)
# write.csv(cox_table2,'PH.csv')
# ph <- subset(cox_table2,p>0.05)
# write.csv(ph,'05.ph_test.csv')


#---------------------------------------------------------------------------------------

#07_Lasso--------------------------------------------------------------------------------------
rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (!dir.exists("06_Lasso")){dir.create("06_Lasso")}
setwd("06_Lasso")
.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(glmnet)
library(dplyr)
library(ggplot2)

train_data <- read.csv('../05_single_factor_cox/02.single_factor_cox_data_PH_after.csv',check.names = F,row.names = 1)
res_results_0.01 <- read.csv('../05_single_factor_cox/03.single_factor_cox_results_0.05.csv',check.names = F,row.names = 1)
x <- subset(train_data, select = -c(OS, OS.time))
x <- x[,rownames(res_results_0.01)] %>% as.matrix()
y <- subset(train_data, select = c(OS, OS.time))
y <- data.matrix(Surv(y$OS.time,y$OS))
.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
#
#if(lasso == 'lasso')
library("glmnet")
library("survival")

# info <- data.frame(seed = "",num = "")
# info <- info[-1,]
# if(T){
#   for(i in 1:1000){
#     set.seed(i)
#     fit <- glmnet(x,y,family="cox")   #Cox，
#     lasso_fit <- cv.glmnet(x=x, y=y, family='cox', type.measure='deviance',maxit = 1000,nfolds = 10)  ##  ,maxit = 1000
#     coefficient <- coef(lasso_fit, s=lasso_fit$lambda.min)
#     Active.Index <- which(as.numeric(coefficient)!=0)
#     astive.coefficients <- as.numeric(coefficient)[Active.Index]
#     lasso_geneids <- rownames(coefficient)[Active.Index]
#     lasso<-data.frame(gene=lasso_geneids ,coef=astive.coefficients )
#     info <- rbind(info,data.frame(seed = i,num = nrow(lasso)))
#   }
# 
# }
# table(info$num)
# write.csv(info,'info.csv')
# info <- read.csv('info.csv',row.names = 1)
# a <- rstatix::filter(info,!duplicated(info$num))

x_all <- subset(train_data, select = -c(OS, OS.time))
x_all <- x_all[,rownames(res_results_0.01)]
y_all <- subset(train_data, select = c(OS, OS.time))
fit <- glmnet(as.matrix(x_all), Surv(y_all$OS.time,y_all$OS),family = "cox")
set.seed(68) #
setwd("/data/nas3///29_YQ953-11/06_Lasso/")
{
  cvfit = cv.glmnet(as.matrix(x_all),
                    Surv(y_all$OS.time,y_all$OS),nfold=10,
                    family = "cox") 
  # pdf(file = "01.lasso_model.pdf", height = 5)
  # plot(fit, xvar = "lambda", cex.lab = 1) + abline(v = c(log(cvfit$lambda.min),
  #                                                        log(cvfit$lambda.1se)), lty = 2) + text(x = log(cvfit$lambda.min),
  #                                                                                                y = -5, paste("Lambda.min\n", round(cvfit$lambda.min, 4)), cex = 1.6,
  #                                                                                                adj = 0.9) + text(x = log(cvfit$lambda.1se), y = 5, paste("Lambda.lse\n",
  #                                                                                                                                                          round(cvfit$lambda.1se, 4)), cex = 1.6, adj = 0.9)                                                                                                                                          
  # dev.off()
  # png(filename = "01.lasso_model.png", height = 400, width = 500)
  # plot(fit, xvar = "lambda", cex.lab = 1) + abline(v = c(log(cvfit$lambda.min),
  #                                                        log(cvfit$lambda.1se)), lty = 2) + text(x = log(cvfit$lambda.min),
  #                                                                                                y = -5, paste("Lambda.min\n", round(cvfit$lambda.min, 4)), cex = 1.6,
  #                                                                                                adj = 0.9) + text(x = log(cvfit$lambda.1se), y = 5, paste("Lambda.lse\n",
  #                                                                                                                                                          round(cvfit$lambda.1se, 4)), cex = 1.6, adj = 0.9)
  # 
  # dev.off()
  # 
  # png(filename = "02.lasso_verify.png", height = 400, width = 500)
  # plot(cvfit, las =1)
  # dev.off()
  # pdf(file = "02.lasso_verify.pdf", height = 5)
  # plot(cvfit, las =1)
  # dev.off()
  coef.min = coef(cvfit, s = "lambda.min")  # lambda.min & lambda.1se 
  cvfit$lambda.min #0.01362608
  # 0
  active.min = which(coef.min@i != 0)
  coef.min
  # 
  lasso_geneids <- coef.min@Dimnames[[1]][coef.min@i+1]
  lasso_geneids
  
}
#"IGF2BP2" "PCF11"   "RPUSD4"  "NUDT11" 
lasso_geneids <- data.frame(lasso_geneids)
colnames(lasso_geneids) <- 'symbol'
write.csv(lasso_geneids, "03.lasso_genes.csv",row.names = T)
coef.min
# 1
# CD24   0.1758641
# CEBPD -0.1753270
# CCL19 -0.0829096
# ZIC2   0.1191970   
saveRDS(cvfit,file = '04.cvfit.rds')
saveRDS(x_all,file = '05.x_all.rds')
saveRDS(coef.min,file = '06.coef.min.rds')
cvfit$lambda.min   #  0.001690118
log(cvfit$lambda.min)  #-6.382957

#
x <- coef(fit) 
tmp <- as.data.frame(as.matrix(x)) 
tmp$coef <- row.names(tmp) 
tmp <- reshape::melt(tmp, id = "coef") 
tmp$variable <- as.numeric(gsub("s", "", tmp$variable)) 
tmp$coef <- gsub('_','-',tmp$coef) 
tmp$lambda <- fit$lambda[tmp$variable+1] 
# extract the lambda values 
tmp$norm <- apply(abs(x[-1,]), 2, sum)[tmp$variable+1] 
# compute L1 norm

#
head(tmp)

p <- ggplot(tmp,aes(log(lambda),value,color = coef)) + 
  geom_vline(xintercept = log(cvfit$lambda.min),
             size=0.8,color='grey60',
             alpha=0.8,linetype=2)+
  geom_line(size=1) + 
  #ylim(-10,-2)+
  xlab("Log(Lambda)") + 
  ylab('Coefficients')+ 
  theme_bw(base_rect_size = 2)+ 
  scale_color_manual(values= c('#287eb7','#ff8c33','#ff9898','#dedb8e',
                                        '#c49d93',"#E78AC3","#A6D854","#FFD92F","#E5C494","#4682B4","#CD3700",'seagreen',"darkblue","#6a3d9a","#e5486e",'#A73030FF'))+ 
                                          scale_x_continuous(expand = c(0.01,0.01))+ 
  scale_y_continuous(expand = c(0.01,0.01))+ 
  theme(panel.grid = element_blank(), 
        axis.title = element_text(size=15,color='black'), 
        axis.text = element_text(size=12,color='black'), 
        legend.title = element_blank(), 
        legend.text = element_text(size=12,color='black'), 
        legend.position = 'right')+ 
  annotate('text',x = -5.5,y=0.25,
           label='Optimal Lambda = 0.001690118',
           color='black')+ 
  guides(col=guide_legend(ncol = 1))

xx <- data.frame(lambda=cvfit[["lambda"]],
                 cvm=cvfit[["cvm"]],
                 cvsd=cvfit[["cvsd"]], 
                 cvup=cvfit[["cvup"]],
                 cvlo=cvfit[["cvlo"]],
                 nozezo=cvfit[["nzero"]]) 
xx$ll<- log(xx$lambda) 
xx$NZERO<- paste0(xx$nozezo,' vars')
p

pdf('03.lasso_model.pdf',w=7,h=5,family='Times')
print(p)
dev.off()

png('03.lasso_model.png',w=7,h=5,units='in',res=600,family='Times')
print(p)
dev.off()

p <- ggplot(xx,aes(ll,cvm,color=NZERO))+ 
  
  geom_errorbar(aes(x=ll,ymin=cvlo,ymax=cvup),
                width=0.05,size=1)+ 
  geom_vline(xintercept = xx$ll[which.min(xx$cvm)],
             size=0.8,color='grey60',alpha=0.8,
             linetype=2)+ 
  geom_point(size=2)+
  xlab("Log(Lambda)") +
  ylab('Partial Likelihood Deviance')+ 
  theme_bw(base_rect_size = 1.5)+ 
  scale_color_manual(values= c('#287eb7','#ff8c33','#ff9898','#dedb8e',
                                        '#c49d93',"#E78AC3","#A6D854","#FFD92F","#E5C494","#4682B4","#CD3700",'seagreen',"darkblue","#6a3d9a","#e5486e",'#A73030FF'))+ 
                                          
  scale_x_continuous(expand = c(0.02,0.02))+ 
  scale_y_continuous(expand = c(0.02,0.02))+ 
  theme(panel.grid = element_blank(), 
        axis.title = element_text(size=15,
                                  color='black'), 
        axis.text = element_text(size=12,
                                 color='black'), 
        legend.title = element_blank(), 
        legend.text = element_text(size=12,
                                   color='black'), 
        legend.position = 'bottom'
        
  )+ 
  annotate('text',x= -5.8,y=13.5,
           label='Optimal Lambda = 0.001690118',
           color='black')+ 
  guides(col=guide_legend(ncol = 6))
p

pdf('03.lasso_verify.pdf',w=6,h=5,family='Times')
print(p)
dev.off()

png('03.lasso_verify.png',w=6,h=5,units='in',res=600,family='Times')
print(p)
dev.off()


#08_Train_estimate--------------------------------------------------------------------------------------
rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (!dir.exists("07_Train_estimate")){dir.create("07_Train_estimate")}
setwd("07_Train_estimate")

.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
train_data <- read.csv('../05_single_factor_cox/02.single_factor_cox_data_PH_after.csv',check.names = F,row.names = 1)
#res_results_0.01 <- read.csv('../06_single_factor_cox/03.single_factor_cox_results_0.01.csv',check.names = F,row.names = 1)
coxGene <- read.csv('../06_Lasso/03.lasso_genes.csv',row.names = 1,check.names = F)
cvfit <- readRDS('../06_Lasso/04.cvfit.rds')
x_all <- readRDS('../06_Lasso/05.x_all.rds')
coef.min <- readRDS('../06_Lasso/06.coef.min.rds')


cox_gene_rt <- train_data[ ,c('OS.time','OS', coxGene$symbol)]  # lasso
riskScore <- predict(cvfit, newx = as.matrix(x_all),s=cvfit$lambda.min)
riskScore<-as.numeric(riskScore)
class(riskScore)
coxGene <- coxGene$symbol
outCol=c("OS","OS.time",coxGene)
# GC
risk=as.vector(ifelse(riskScore>median(riskScore),'High','Low'))
risk <- as.data.frame(c(cbind(id=rownames(cbind(train_data[,outCol],riskScore,risk)),
                              cbind(train_data[,outCol],riskScore,risk))))
table(risk$risk)
riskScore <- data.frame(risk)
#write.csv(riskScore, file = "01.train_riskScore.csv", quote = F)

cox_gene_rt<-riskScore
median(cox_gene_rt$riskScore)  #  0.4308912

.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(survival)
library(survminer)
OS_dat <- risk
gene<- colnames(OS_dat)  #surv_cutpoint 
res.cut <- surv_cutpoint(OS_dat,time = 'OS.time',event = 'OS',variables = 'riskScore',minprop = 0.4)#
cutpoint <- res.cut$cutpoint[1]#0.329995
OS_dat$group <- factor(ifelse(OS_dat$riskScore>cutpoint['riskScore',],'High','Low'),levels = c('High','Low'))
table(OS_dat$group)  #  
kmfit_out <- survfit(Surv(OS.time, OS) ~ group, data = OS_dat)
surv_pvalue(kmfit_out) #
# High  Low 
# 522  562 

# kmfit <- survfit(Surv(OS.time, OS) ~ risk, data = cox_gene_rt)
# surv_pvalue(kmfit)

# obj_is_vector <- function(obj) {  
#   is.vector(obj) && length(dim(obj)) == 1  
# }
.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(ggplot2)
#trace(ggsurvplot,edit=T)
pdf('02.Train_survival_KM.pdf',w=7,h=7,onefile=F,family='Times')
train_survival_median <- ggsurvplot(kmfit_out,
                                    pval = TRUE, 
                                    conf.int = F,
                                    legend.labs=c("High risk","Low risk" ),
                                    legend.title="Risk score",
                                    title="Train KM",
                                    font.main = c(15,"bold"),
                                    risk.table = TRUE, 
                                    risk.table.col = "strata", 
                                    linetype = "strata", 
                                    surv.median.line = "hv", 
                                    ggtheme = theme_bw(), 
                                    palette = c("#A73030FF", "#0073C2FF"))

print(train_survival_median)
dev.off()

png('02.Train_survival_KM.png',w=7,h=7,family='Times',units='in',res=600)
train_survival_median <- ggsurvplot(kmfit_out,
                                    pval = TRUE, 
                                    conf.int = F,
                                    legend.labs=c("High risk","Low risk" ),
                                    legend.title="Risk score",
                                    title="Train KM",
                                    font.main = c(15,"bold"),
                                    risk.table = TRUE, 
                                    risk.table.col = "strata", 
                                    linetype = "strata", 
                                    surv.median.line = "hv", 
                                    ggtheme = theme_bw(), 
                                    palette = c("#A73030FF", "#0073C2FF"))

print(train_survival_median)
dev.off()

colnames(OS_dat)
OS_dat <- OS_dat[,-9]
colnames(OS_dat)[9] <- 'risk'
write.csv(OS_dat, file = "01.train_OS_dat.csv", quote = F)
table(OS_dat$risk)
risk2 <- risk
risk <- OS_dat

#4 ROC 
.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(survivalROC)
ROC <- risk

multi_ROC <- function(time_vector, risk_score_table){
  library(survivalROC)
  single_ROC <- function(single_time){
    for_ROC <- survivalROC(Stime=risk_score_table$OS.time,
                           status=risk_score_table$OS,
                           marker=risk_score_table$riskScore,
                           predict.time=single_time,
                           method = 'KM')
    data.frame('True_positive'=for_ROC$TP, 'False_positive'=for_ROC$FP,
               'Cut_values'=for_ROC$cut.values, 'Time'=rep(single_time, length(for_ROC$TP)),
               'AUC'=rep(for_ROC$AUC, length(for_ROC$TP)))
  }
  multi_ROC_list <- lapply(time_vector, single_ROC)
  do.call(rbind, multi_ROC_list)
}
for_multi_ROC <- multi_ROC(time_vector = c(365*seq(1,5,2)), 
                           risk_score_table = risk)
table(for_multi_ROC$AUC)

for_multi_ROC$Time <- factor(for_multi_ROC$Time)
# ROC 
library(geomROC)
library(plotROC)
library(ggthemes)
auc_y1 <- round(for_multi_ROC[which(for_multi_ROC$Time==365),5][1],3)
auc_y3 <- round(for_multi_ROC[which(for_multi_ROC$Time==1095),5][1],3)
#auc_y4 <- round(for_multi_ROC[which(for_multi_ROC$Time==1460),5][1],2)
auc_y5 <- round(for_multi_ROC[which(for_multi_ROC$Time==1825),5][1],3)
#auc_y6 <- round(for_multi_ROC[which(for_multi_ROC$Time==2190),5][1],2)
#auc_y7 <- round(for_multi_ROC[which(for_multi_ROC$Time==2555),5][1],3)


ROC <- ggplot(for_multi_ROC, aes(x=False_positive,
                                 y=True_positive,
                                 label=Cut_values,
                                 color=Time)) +
  scale_color_manual(breaks = c('365',"1095",'1825'),
                     labels = c("1 year","3 years","5 years"),
                     values = c("#4682B4", "#FF7F00", "#984EA3")) +
  plotROC::geom_roc(labels = F, stat = 'identity') +
  style_roc() +
  geom_abline(slope = 1, intercept = 0, color = 'gray', linetype=2) +
  theme_bw() +
  labs(title = "Train ROC") +
  theme(panel.background = element_rect(colour = 'black', size = 1, fill = 'white'),
        panel.grid = element_blank(),
        plot.title = element_text(size = 15, hjust = 0.5, face = "bold")) +
  annotate("text", x=0.75, y=c(0.25, 0.15, 0.05),
           label = c(paste('AUC of 1 year =', format(auc_y1,nsmall=3)),
                     paste('AUC of 3 years =', format(auc_y3,nsmall=3)),
                     paste('AUC of 5 years =', format(auc_y5,nsmall=3))
           ))
ROC
pdf("03.Train_ROC.pdf", width = 6, height = 5,family = "Times")
ROC
dev.off()
png("03.Train_ROC.png", width = 6, height = 5,res = 600,units = 'in',family = "Times")
ROC
dev.off()

riskScore <- risk$riskScore
risk_dis <- ggplot(risk, aes(x=reorder(id, riskScore),
                             y=riskScore,
                             color = factor(risk,
                                            levels = c('High', 'Low'),
                                            labels = c("High Risk", "Low Risk")))) +
  geom_point() +
  scale_color_manual(values = c("#A73030FF", "#0073C2FF")) +
  scale_x_discrete(breaks = risk[order(risk$riskScore),]$id[c(1,100,200,300,400,500,600,700,800,900,1000,1100)],
                   labels = c(1,100,200,300,400,500,600,700,800,900,1000,1100),
                   expand = c(0.02,0)) +
  geom_vline(xintercept = nrow(risk[which(risk$risk=='Low'),]) + 0.5,
             lty = 2) +
  geom_hline(#yintercept = median(risk$riskScore),
    yintercept = cutpoint['riskScore',],
    lty =2) +
  labs(x = "Patients(increasing risk score)",
       y = "Risk Score",
       title = "Train Risk Score Distribution") +
  theme_base() +
  theme(legend.title = element_blank(),
        legend.key = element_blank(),
        legend.justification = c(0,1),
        legend.position = c(0,1),
        #        legend.margin = margin(c(-5,4,4,3)),
        legend.background = element_rect(color = "black", size = .3),
        plot.title = element_text(size = 15, hjust = 0.5))
risk_dis
pdf("04.Train_Risk_Score_Distribution.pdf", width = 7, height = 5,family = "Times")
risk_dis
dev.off()
png("04.Train_Risk_Score_Distribution.png", width = 7, height = 5,res = 600,units = 'in',family = "Times")
risk_dis
dev.off()


surv_stat <- ggplot(risk, aes(x=reorder(id, riskScore),
                              y=OS.time/365,
                              color = factor(OS,
                                             levels = c(0,1),
                                             labels = c("Alive", "Dead")))) +
  geom_point() +
  scale_color_manual(values = c("#0073C2FF","#A73030FF")) +
  scale_x_discrete(breaks = risk[order(risk$riskScore),]$id[c(1,100,200,300,400,500,600,700,800,900,1000,1100)],
                   labels = c(1,100,200,300,400,500,600,700,800,900,1000,1100),
                   expand = c(0.02,0)) +
  ylim(c(0,10))+
  geom_vline(xintercept = nrow(risk[which(risk$risk=='Low'),]) + 0.5,
             lty = 2) +
  labs(x = "Patients(increasing risk score)",
       y = "Survival time(Years)",
       title = "Train survival state distribution") +
  theme_base() +
  theme(legend.title = element_blank(),
        legend.key = element_blank(),
        legend.justification = c(0,1),
        legend.position = c(0,1),
        #        legend.margin = margin(c(-5,4,4,3)),
        legend.background = element_rect(color = "black", size = .3),
        plot.title = element_text(size = 15, hjust = 0.5))
surv_stat
pdf("05.Train_survival_state_Distribution.pdf", width = 7, height = 5,family = "Times")
surv_stat
dev.off()
png("05.Train_survival_state_Distribution.png", width = 7, height = 5,res = 600,units = 'in',family = "Times")
surv_stat
dev.off()


library(ComplexHeatmap)
risk <- risk[order(risk$risk, decreasing = T),]
risk2 <- data.frame(row.names=risk$id,Risk = risk$risk)
risk2$Risk <- as.factor(risk2$Risk)

rownames(cox_gene_rt)<-cox_gene_rt[,1]
cox_gene_rt2 <- cox_gene_rt[, coxGene]
fpkm_case_rt2 <- t(cox_gene_rt2) %>% as.data.frame(.)
select.expr <- fpkm_case_rt2[coxGene,rownames(risk2)]
select.expr <- log2(select.expr+1)
matrix <- t(scale(t(select.expr)))#
matrix[matrix< (-1)] <- -1
matrix[matrix>1] <- 1
identical(colnames(matrix), rownames(risk2))

ann.color <- list(Risk = c(High="#dc143c",Low = "#6495ed"))
group_rat <- data.frame(row.names=rownames(risk2),risk = risk2$Risk)
group_rat$risk <- as.factor(group_rat$risk)
cellwidth = 300/nrow(group_rat)
cellheight = 200/length(coxGene)
pheatmap::pheatmap(matrix,cellwidth = cellwidth,cellheight = cellheight,
                   main = 'Train',
                   fontfamily = "Times",
                   fontsize = 10,
                   annotation_colors = ann.color,
                   annotation_col= risk2,
                   annotation_names_col = F,
                   annotation_names_row = F,
                   color = colorRampPalette(c("navy", "white","firebrick1"))(100),
                   cluster_rows = F,
                   cluster_cols = F,
                   show_colnames = F,
                   width=8,height=5,
                   file='06.heatmap.png')

pheatmap::pheatmap(matrix,cellwidth = cellwidth,cellheight = cellheight,
                   main = 'Train',
                   fontfamily = "Times",
                   fontsize = 10,
                   annotation_colors = ann.color,
                   annotation_col= risk2,
                   annotation_names_col = F,
                   annotation_names_row = F,
                   color = colorRampPalette(c("navy", "white","firebrick1"))(100),
                   cluster_rows = F,
                   cluster_cols = F,
                   show_colnames = F,
                   width=8,height=5,
                   file='06.heatmap.pdf')


#09_Validation_estimate----------------------------------------------------------------------------------------
rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./08_Validation_estimate")){
  dir.create("./08_Validation_estimate")
}
setwd("./08_Validation_estimate")
.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(glmnet)
library(readr)
library(tidyverse)
dat <- read.csv("../00_rawdata/GSE20685/dat(GSE20685).csv",row.names = 1,check.names = F)
# "FAM129A""NIBAN1"
#rownames(dat)[which(rownames(dat) == "FAM129A")] <- "NIBAN1"
#dat <- log2(dat+1)
survival.va <- read.csv('../00_rawdata/GSE20685/survival(GSE20685).csv',header = T)
#survival.va$sample <- rownames(survival.va)
#survival.va <- subset(survival.va,OS.time>59)
dat <- dat[,survival.va$sample]
lasso_geneids <- read.csv('../06_Lasso/03.lasso_genes.csv',row.names = 1,check.names = F)
coef.min <- readRDS('../06_Lasso/06.coef.min.rds')
dat <- t(dat)%>%data.frame()

dat$sample <- rownames(dat)
test_data <- merge(survival.va,dat,by='sample')
test_data <- column_to_rownames(test_data,var='sample')

lasso_geneids
test_data2<-test_data[,lasso_geneids$symbol]
#test_data2 <- test_data2[,-2]
risk_out<-data.frame(test_data2)
risk_out$risk_out<-NA
risk_out$riskScore_out<-NA
cnt<-1

coef.min<-coef.min[lasso_geneids$symbol,]

while (cnt < length(rownames(test_data2))+1) {
  risk_out$riskScore_out[cnt]<-sum(coef.min*test_data2[cnt,])
  cnt = cnt + 1
}

dim(test_data2)
cnt<-1
while (cnt < length(rownames(test_data2))+1) {
  risk_out$risk_out[cnt]=as.vector(ifelse(risk_out$riskScore_out[cnt]>median(risk_out$riskScore_out),0,1))
  cnt = cnt + 1
}
riskScore_out<-as.numeric(risk_out$riskScore_out)

coxGene <- lasso_geneids$symbol
outCol=c("OS","OS.time",coxGene)
risk_out=as.vector(ifelse(riskScore_out>median(riskScore_out),'High','Low'))
median(riskScore_out)   #2.762742
risk_out <- as.data.frame(c(cbind(id=rownames(cbind(test_data[,outCol],
                                                    riskScore_out,
                                                    risk_out)),
                                  cbind(test_data[,outCol],
                                        riskScore_out,
                                        risk_out))))
table(risk_out$risk_out)
#write.csv(risk_out,file = '01.risk_va.csv',row.names = T)
.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
# # K-M
library(survival)
library(survminer)
OS_dat <- risk_out
gene<- colnames(OS_dat)  #surv_cutpoint 
res.cut <- surv_cutpoint(OS_dat,time = 'OS.time',event = 'OS',variables = 'riskScore_out',minprop = 0.4)#
cutpoint <- res.cut$cutpoint[1] #0.1854183
OS_dat$group <- factor(ifelse(OS_dat$riskScore_out>cutpoint['riskScore_out',],'High','Low'),levels = c('High','Low'))
table(OS_dat$group) # 
# High  Low 
# 188  139
kmfit_out <- survfit(Surv(OS.time, OS) ~ group, data = OS_dat)
surv_pvalue(kmfit_out)
#pval=0.01

# obj_is_vector <- function(obj) {  
#   is.vector(obj) && length(dim(obj)) == 0
# }


# kmfit <- survfit(Surv(OS.time, OS) ~ risk_out, data = risk_out)
# surv_pvalue(kmfit)

pdf('02.Verify_survival_KM.pdf',w=7,h=7,onefile=F,family='Times')
train_survival_median <- ggsurvplot(kmfit_out,
                                    pval = T, 
                                    conf.int = F,
                                    legend.labs=c("High risk","Low risk" ),
                                    legend.title="Risk score",
                                    title="Verify KM",
                                    font.main = c(15,"bold"),
                                    risk.table = TRUE, 
                                    risk.table.col = "strata", 
                                    linetype = "strata", 
                                    surv.median.line = "hv", 
                                    ggtheme = theme_bw(), 
                                    palette = c("#A73030FF", "#0073C2FF"))

print(train_survival_median)
dev.off()

png('02.Verify_survival_KM.png',w=7,h=7,family='Times',units='in',res=600)
train_survival_median <- ggsurvplot(kmfit_out,
                                    pval = T,
                                    conf.int = F,
                                    legend.labs=c("High risk","Low risk" ),
                                    legend.title="Risk score",
                                    title="Verify KM",
                                    font.main = c(15,"bold"),
                                    risk.table = TRUE,
                                    risk.table.col = "strata",
                                    linetype = "strata",
                                    surv.median.line = "hv",
                                    ggtheme = theme_bw(),
                                    palette = c("#A73030FF", "#0073C2FF"))

print(train_survival_median)
dev.off()

colnames(OS_dat)
OS_dat <- OS_dat[,-9]
colnames(OS_dat)[9] <- 'risk_out'
write.csv(OS_dat, file = "01.verify_OS_dat.csv", quote = F)
risk2 <- risk_out
risk <- OS_dat

#4 ROC 
library(survivalROC)
ROC <- risk

multi_ROC <- function(time_vector, risk_score_table){
  library(survivalROC)
  single_ROC <- function(single_time){
    for_ROC <- survivalROC(Stime=risk_score_table$OS.time,
                           status=risk_score_table$OS,
                           marker=risk_score_table$riskScore,
                           predict.time=single_time,
                           method = 'KM')
    data.frame('True_positive'=for_ROC$TP, 'False_positive'=for_ROC$FP,
               'Cut_values'=for_ROC$cut.values, 'Time'=rep(single_time, length(for_ROC$TP)),
               'AUC'=rep(for_ROC$AUC, length(for_ROC$TP)))
  }
  multi_ROC_list <- lapply(time_vector, single_ROC)
  do.call(rbind, multi_ROC_list)
}
for_multi_ROC <- multi_ROC(time_vector = c(365*seq(1,5,2)), 
                           risk_score_table = risk_out)
table(for_multi_ROC$AUC)

for_multi_ROC$Time <- factor(for_multi_ROC$Time)
# ROC
library(geomROC)
library(plotROC)

auc_y1 <- round(for_multi_ROC[which(for_multi_ROC$Time==365),5][1],3)
#auc_y2 <- round(for_multi_ROC[which(for_multi_ROC$Time==730),5][1],2)
auc_y3 <- round(for_multi_ROC[which(for_multi_ROC$Time==1095),5][1],3)
#auc_y4 <- round(for_multi_ROC[which(for_multi_ROC$Time==1460),5][1],2)
auc_y5 <- round(for_multi_ROC[which(for_multi_ROC$Time==1825),5][1],3)
#auc_y6 <- round(for_multi_ROC[which(for_multi_ROC$Time==2190),5][1],2)
#auc_y7 <- round(for_multi_ROC[which(for_multi_ROC$Time==2555),5][1],2)


ROC <- ggplot(for_multi_ROC, aes(x=False_positive,
                                 y=True_positive,
                                 label=Cut_values,
                                 color=Time)) +
  scale_color_manual(breaks = c('365',"1095",'1825'),
                     labels = c("1 year","3 years","5 years"),
                     values = c("#4682B4", "#FF7F00", "#984EA3")) +
  plotROC::geom_roc(labels = F, stat = 'identity') +
  style_roc() +
  geom_abline(slope = 1, intercept = 0, color = 'gray', linetype=2) +
  theme_bw() +
  labs(title = "Verify ROC") +
  theme(panel.background = element_rect(colour = 'black', size = 1, fill = 'white'),
        panel.grid = element_blank(),
        plot.title = element_text(size = 15, hjust = 0.5, face = "bold")) +
  annotate("text", x=0.75, y=c(0.25, 0.15, 0.05),
           label = c(paste('AUC of 1 year =', format(auc_y1,nsmall=2)),
                     paste('AUC of 3 years =', format(auc_y3,nsmall=2)),
                     paste('AUC of 5 years =', format(auc_y5,nsmall=2))
           ))
ROC
pdf("03.Verify_ROC.pdf", width = 6, height = 5,family = "Times")
ROC
dev.off()
png("03.Verify_ROC.png", width = 6, height = 5,res = 600,units = 'in',family = "Times")
ROC
dev.off()


library(ggthemes)
#risk <- risk_out
riskScore_out <- risk$riskScore_out
risk_dis <- ggplot(risk, aes(x=reorder(id, riskScore_out),
                             y=riskScore_out,
                             color = factor(risk_out,
                                            levels = c('High', 'Low'),
                                            labels = c("High Risk", "Low Risk")))) +
  geom_point() +
  scale_color_manual(values = c("#A73030FF", "#0073C2FF")) +
  scale_x_discrete(breaks = risk[order(risk$riskScore_out),]$id[c(1,30,60,90,120,150,180,210,240,270,300,330)],
                   labels = c(1,30,60,90,120,150,180,210,240,270,300,330),
                   expand = c(0.02,0)) +
  geom_vline(xintercept = nrow(risk[which(risk$risk_out=='Low')+0.5,]) ,
             lty = 2) +
  geom_hline(#yintercept = median(risk_out$riskScore_out),
    yintercept = cutpoint['riskScore',],
    lty =2) +
  labs(x = "Patients(increasing risk score)",
       y = "Risk Score",
       title = "Verify Risk Score Distribution") +
  theme_base() +
  theme(legend.title = element_blank(),
        legend.key = element_blank(),
        legend.justification = c(0,1),
        legend.position = c(0,1),
        #        legend.margin = margin(c(-5,4,4,3)),
        legend.background = element_rect(color = "black", size = .3),
        plot.title = element_text(size = 15, hjust = 0.5))
risk_dis
pdf("04.Verify_Risk_Score_Distribution.pdf", width = 7, height = 5,family = "Times")
risk_dis
dev.off()
png("04.Verify_Risk_Score_Distribution.png", width = 7, height = 5,res = 600,units = 'in',family = "Times")
risk_dis
dev.off()


surv_stat <- ggplot(risk, aes(x=reorder(id, riskScore_out),
                              y=OS.time/365,
                              color = factor(OS,
                                             levels = c(0,1),
                                             labels = c("Alive", "Dead")))) +
  geom_point() +
  scale_color_manual(values = c("#0073C2FF","#A73030FF")) +
  scale_x_discrete(breaks = risk[order(risk$riskScore_out),]$id[c(1,30,60,90,120,150,180,210,240,270,300,330)],
                   labels = c(1,30,60,90,120,150,180,210,240,270,300,330),
                   expand = c(0.02,0)) +
  ylim(c(0,20))+
  geom_vline(xintercept = nrow(risk[which(risk$risk_out=='Low'),]) + 0.5,
             lty = 2) +
  labs(x = "Patients(increasing risk score)",
       y = "Survival time(Years)",
       title = "Verify survival state distribution") +
  theme_base() +
  theme(legend.title = element_blank(),
        legend.key = element_blank(),
        legend.justification = c(0,1),
        legend.position = c(0,1),
        #        legend.margin = margin(c(-5,4,4,3)),
        legend.background = element_rect(color = "black", size = .3),
        plot.title = element_text(size = 15, hjust = 0.5))

surv_stat
pdf("05.Verify_survival_state_Distribution.pdf", width = 7, height = 5,family = "Times")
surv_stat
dev.off()
png("05.Verify_survival_state_Distribution.png", width = 7, height = 5,res = 600,units = 'in',family = "Times")
surv_stat
dev.off()

library(ComplexHeatmap)
risk <- risk[order(risk$risk_out, decreasing = T),]
risk2 <- data.frame(row.names=risk$id,Risk = risk$risk_out)
risk2$Risk <- as.factor(risk2$Risk)


cox_gene_rt2 <- dat[, coxGene]
fpkm_case_rt2 <- t(cox_gene_rt2) %>% as.data.frame(.)
select.expr <- fpkm_case_rt2[coxGene,rownames(risk2)]
select.expr <- log2(select.expr+1)
matrix <- t(scale(t(select.expr)))#
matrix[matrix< (-1)] <- -1
matrix[matrix>1] <- 1
identical(colnames(matrix), rownames(risk2))

ann.color <- list(Risk = c(High="#dc143c",Low = "#6495ed"))
group_rat <- data.frame(row.names=rownames(risk2),risk = risk2$Risk)
group_rat$risk <- as.factor(group_rat$risk)
cellwidth = 300/nrow(group_rat)
cellheight = 200/length(coxGene)
pheatmap::pheatmap(matrix,cellwidth = cellwidth,cellheight = cellheight,
                   main = 'Verify',
                   fontfamily = "Times",
                   fontsize = 10,
                   annotation_colors = ann.color,
                   annotation_col= risk2,
                   annotation_names_col = F,
                   annotation_names_row = F,
                   color = colorRampPalette(c("navy", "white","firebrick1"))(100),
                   cluster_rows = F,
                   cluster_cols = F,
                   show_colnames = F,
                   width=8,height=5,
                   file='06.heatmap.png')

pheatmap::pheatmap(matrix,cellwidth = cellwidth,cellheight = cellheight,
                   main = 'Verify',
                   fontfamily = "Times",
                   fontsize = 10,
                   annotation_colors = ann.color,
                   annotation_col= risk2,
                   annotation_names_col = F,
                   annotation_names_row = F,
                   color = colorRampPalette(c("navy", "white","firebrick1"))(100),
                   cluster_rows = F,
                   cluster_cols = F,
                   show_colnames = F,
                   width=8,height=5,
                   file='06.heatmap.pdf')

rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./09_progmodel")){
  dir.create("./09_progmodel") 
}
setwd("./09_progmodel")

## 07-1 Cox----------
#.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
survival <- read.csv('../00_rawdata/04.survival_TCGA.csv',row.names = 1)
phenotype<-read.csv('../00_rawdata/05.phenotype_TCGA.csv')
colnames(phenotype)
train_phenotype<-data.frame(sample=phenotype$sample,
                            age=phenotype$age,
                            gender=phenotype$gender,
                            stage=phenotype$stage,
                            T.stage=phenotype$T_stage,
                            N.stage=phenotype$N_stage,
                            M.stage=phenotype$M_stage)
write.csv(train_phenotype,'01.train_phenotype.csv')
# Lifelong Non-smoker (less than 100 cigarettes smoked in Lifetime) = 1
# Current smoker (includes daily smokers and non-daily smokers or occasional smokers) = 2
# Current reformed smoker for > 15 years (greater than 15 years) = 3
# Current reformed smoker for ≤15 years (less than or equal to 15 years) = 4
# Current reformed smoker, duration not specified = 5
# Smoking History not documented = 7

train_phenotype<-merge(train_phenotype,survival,by='sample')
train_phenotype$OS<-as.numeric(train_phenotype$OS)
train_phenotype$OS.time<-as.numeric(train_phenotype$OS.time)
train_phenotype2<-train_phenotype

table(train_phenotype2$stage)
train_phenotype2$stage <- gsub('A','',train_phenotype2$stage)
train_phenotype2$stage <- gsub('B','',train_phenotype2$stage)
train_phenotype2$stage <- gsub('C','',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage III','3/4',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage IV','3/4',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage II','2',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage I','1',train_phenotype2$stage)
#train_phenotype2$stage <- gsub('not reported',NA,train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage X',NA,train_phenotype2$stage)


table(train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('a','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('b','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('c','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('d','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('TX',NA,train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T3','3/4',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T4','3/4',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T2','2',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T1','1',train_phenotype2$T.stage)

table(train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('a','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('b','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('c','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('d','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub(' (i-)','',train_phenotype2$N.stage,fixed = T)
train_phenotype2$N.stage<-gsub(' (i+)','',train_phenotype2$N.stage,fixed = T)
train_phenotype2$N.stage<-gsub(' (mol+)','',train_phenotype2$N.stage,fixed = T)
train_phenotype2$N.stage<-gsub('mi','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N3','3',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N2','2',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N1','1',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N0','0',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('NX',NA,train_phenotype2$N.stage)


table(train_phenotype2$M.stage)
train_phenotype2$M.stage<-gsub('MX',NA,train_phenotype2$M.stage)
train_phenotype2$M.stage<-gsub(' (i+)','',train_phenotype2$M.stage,fixed = T)
train_phenotype2$M.stage<-gsub('c','',train_phenotype2$M.stage,fixed = T)
train_phenotype2$M.stage<-gsub('M0','0',train_phenotype2$M.stage)
train_phenotype2$M.stage<-gsub('M1','1',train_phenotype2$M.stage)

train_phenotype2 <- na.omit(train_phenotype2)


table(train_phenotype2$age)
train_phenotype2$age <- as.numeric(train_phenotype2$age)
train_phenotype2$age <- ifelse(train_phenotype2$age>60,1,0)
median(train_phenotype2$age)


table(train_phenotype2$gender)
train_phenotype2$gender <- ifelse(train_phenotype2$gender=='female',1,0)

colnames(train_phenotype2)
colnames(train_phenotype2)<-c('id','Age','Gender','Stage','T.stage','N.stage',"M.stage",'OS.time','OS')
risk<-read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1)
sub_risk <- subset(risk, select = c(id, riskScore))

train_risk_clinical <- merge(train_phenotype2,
                             sub_risk,
                             by = "id")
rownames(train_risk_clinical) <- train_risk_clinical$id
train_risk_clinical = subset(train_risk_clinical, select = -c(id))
dim(train_risk_clinical)
os_risk_clinical <- train_risk_clinical

colnames_train <- colnames(train_risk_clinical)
covariates_train <- colnames_train[-which(colnames_train %in% c("OS", "OS.time"))]
covariates <- covariates_train

train_risk_clinical$T.stage<-factor(train_risk_clinical$T.stage)
train_risk_clinical$N.stage<-factor(train_risk_clinical$N.stage)
train_risk_clinical$M.stage<-factor(train_risk_clinical$M.stage)
train_risk_clinical$Stage <- factor(train_risk_clinical$Stage)


library(survival)
library(dplyr)
res.risk = coxph(Surv(time = OS.time, event = OS) ~ riskScore, data = train_risk_clinical) %>% summary
res.risk = c(res.risk$conf.int[-2], res.risk$coefficients[5])

res.age = coxph(Surv(time = OS.time, event = OS) ~ Age, data = train_risk_clinical) %>% summary
res.age = c(res.age$conf.int[-2], res.age$coefficients[5])

res.gender = coxph(Surv(time = OS.time, event = OS) ~ Gender, data = train_risk_clinical) %>% summary
res.gender = c(res.gender$conf.int[-2], res.gender$coefficients[5])

# res.smoke = coxph(Surv(time = OS.time, event = OS) ~ Smoking, data = train_risk_clinical) %>% summary
# res.smoke = cbind(res.smoke$conf.int[,-2], res.smoke$coefficients[,5])

res.stage = coxph(Surv(time = OS.time, event = OS) ~ Stage, data = train_risk_clinical) %>% summary
res.stage = cbind(res.stage$conf.int[,-2], res.stage$coefficients[,5])

res.T_stage = coxph(Surv(time = OS.time, event = OS) ~ T.stage, data = train_risk_clinical) %>% summary
res.T_stage = cbind(res.T_stage$conf.int[,-2], res.T_stage$coefficients[,5])

res.M_stage = coxph(Surv(time = OS.time, event = OS) ~ M.stage, data = train_risk_clinical) %>% summary
res.M_stage = c(res.M_stage$conf.int[-2], res.M_stage$coefficients[5])

res.N_stage = coxph(Surv(time = OS.time, event = OS) ~ N.stage, data = train_risk_clinical) %>% summary
res.N_stage = cbind(res.N_stage$conf.int[,-2], res.N_stage$coefficients[,5])

res.ref = c(1,1,1,NA)
res = rbind(res.risk,res.age,res.gender,res.ref,res.stage,res.ref,res.T_stage,res.ref,res.N_stage,res.M_stage) %>% as.data.frame()
rownames(res)
res$Indicators = c("riskScore","Age","Gender","Stage1(Reference)",'Stage2','Stage3/4',"T1(Reference)",'T2','T3/4','N0(Reference)','N1','N2','N3','M1 vs M0')
colnames(res) = c("hr","low","up","pv","Indicator")
res$p = signif(res$pv, 2) %>% paste0("p = ", .)
res$p[is.na(res$pv)] = NA
res$Indicator = factor(res$Indicator, levels = rev(res$Indicator))
rownames(res) <- res$Indicator
res2 <- data.frame(p.value=res$pv,
                   HR=res$hr,
                   HR.95L=res$low,
                   HR.95H=res$up,
                   Indicator=res$Indicator)
rownames(res2) <- res2$Indicator
write.csv(res2, file = "univariate_cox_prog_forest.csv",quote = F)
res2 <- subset(res2, select = -c(Indicator))
library(tidyr)
hz <- paste(round(res2$HR,3),
            "(",round(res2$HR.95L,3),
            "-",round(res2$HR.95H,3),")",sep = "")
hz
hz[c(4,7,10)] <- ""

tabletext <- cbind(c(NA,rownames(res2)),
                   c("P value",ifelse(res2$p.value<0.0001,
                                      "< 0.0001",
                                      round(res2$p.value,4))),
                   c("Hazard Ratio(95% CI)",hz))
nrow(tabletext) + 1
library(forestplot)
pdf(file = '01.uncoxforest.pdf',height = 6, width = 12, onefile = F)
forestplot(labeltext=tabletext, 
           graph.pos=4,  #Pvalue
           is.summary = c(TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,rep(FALSE, 7)),
           col=fpColors(box="darkred", lines="darkblue", zero = "gray50"),
           mean=c(NA,res2$HR),
           lower=c(NA,res2$HR.95L), #95%
           upper=c(NA,res2$HR.95H), #95%
           boxsize=0.2,lwd.ci=3,   #，
           ci.vertices.height = 0.08,ci.vertices=TRUE, #、、
           zero=1,lwd.zero=0.5,    #zero 
           colgap=unit(5,"mm"),    #
           xticks = c(0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9), #
           lwd.xaxis=2,            #X
           lineheight = unit(1.2,"cm"), #
           graphwidth = unit(.6,"npc"), #
           cex=0.9, fn.ci_norm = fpDrawCircleCI, #
           hrzl_lines = list("2" = gpar(col = "black", lty = 1, lwd = 2)),
           # hrzl_lines=list("2" = gpar(lwd=2, col="black"),
           #                 "3" = gpar(lwd=2, col="black"), #，
           #                 "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1
           # mar=unit(rep(0.01, times = 4), "cm"),#
           #fpTxtGpcex
           txt_gp=fpTxtGp(label=gpar(cex=1),
                          ticks=gpar(cex=0.8, fontface = "bold"),
                          xlab=gpar(cex = 1, fontface = "bold"),
                          title=gpar(cex = 1.25, fontface = "bold")),
           xlab="Hazard Ratio",
           grid = T,
           title = "Univariate",
           clip = c(0,9)) # x，
dev.off()

png(file = '01.uncoxforest.png',height = 400, width = 900)
forestplot(labeltext=tabletext, 
           graph.pos=4,  #Pvalue
           is.summary = c(TRUE,FALSE,FALSE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,rep(FALSE, 7)),
           col=fpColors(box="darkred", lines="darkblue", zero = "gray50"),
           mean=c(NA,res2$HR),
           lower=c(NA,res2$HR.95L), #95%
           upper=c(NA,res2$HR.95H), #95%
           boxsize=0.2,lwd.ci=3,   #，
           ci.vertices.height = 0.08,ci.vertices=TRUE, #、、
           zero=1,lwd.zero=0.5,    #zero 
           colgap=unit(5,"mm"),    #
           xticks = c(0,0.5,1,1.5,2,2.5,3,3.5,4,4.5,5,5.5,6,6.5,7,7.5,8,8.5,9), #
           lwd.xaxis=2,            #X
           lineheight = unit(1.2,"cm"), #
           graphwidth = unit(.6,"npc"), #
           cex=0.9, fn.ci_norm = fpDrawCircleCI, #
           hrzl_lines = list("2" = gpar(col = "black", lty = 1, lwd = 2)),
           # hrzl_lines=list("2" = gpar(lwd=2, col="black"),
           #                 "3" = gpar(lwd=2, col="black"), #，
           #                 "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1
           # mar=unit(rep(0.01, times = 4), "cm"),#
           #fpTxtGpcex
           txt_gp=fpTxtGp(label=gpar(cex=1),
                          ticks=gpar(cex=0.8, fontface = "bold"),
                          xlab=gpar(cex = 1, fontface = "bold"),
                          title=gpar(cex = 1.25, fontface = "bold")),
           xlab="Hazard Ratio",
           grid = T,
           title = "Univariate",
           clip = c(0,9)) # x，
dev.off()

colnames(train_risk_clinical)


#PH-------------
library(survival)
library(survminer)
res.all <- data.frame()
for (i in 1:length(covariates)) {
  #i <- 1
  gene_list <- covariates[i]
  cox_data <- as.formula(paste0('Surv(OS.time, OS)~',
                                paste(gene_list,
                                      sep = '',
                                      collapse = '+')))
  
  cox_more <- coxph(cox_data,
                    data = os_risk_clinical)
  cox_zph <- cox.zph(cox_more)
  cox_table <- cox_zph$table[-nrow(cox_zph$table),]
  cox_table
  res <- data.frame(gene=covariates[i],PH.p=cox_table[3])
  res.all <- rbind(res.all, res)
  p <- ggcoxzph(cox_zph)
  # pdf(paste0('01.ph.test_',gene_list,'.pdf'), width = 6,height = 4,onefile = F,family='Times')
  # print(p)
  # dev.off()
  # 
  # png(paste0('01.ph.test_',gene_list,'.png'), width = 6,height = 4,family='Times',units='in',res=600)
  # print(p)
  # dev.off()
}
colnames(res.all)[1] <- 'phenotype'
write.csv(res.all,
          file = "01.uniCox_PH.csv",
          quote = F,
          row.names = F)

ph.gene <- res.all[as.numeric(res.all$PH.p)>0.05,]
dim(ph.gene)
ph.gene$phenotype

# "Age"       "Gender"    "N.stage"   "M.stage"   "riskScore"


## 07-2 Cox----------
cox_more = coxph(Surv(time = OS.time, event = OS) ~ riskScore+Age+Stage+T.stage+N.stage+M.stage, data = train_risk_clinical)

cox_zph <- cox.zph(cox_more)
cox_table <- cox_zph$table[-nrow(cox_zph$table),]      #PH
cox_table2 <- data.frame(cox_table)
write.csv(cox_table2,'PH_test.csv')
#T stage 

res.mul = coxph(Surv(time = OS.time, event = OS) ~ riskScore+Age+N.stage+M.stage, data = train_risk_clinical)%>% summary
res.mul = cbind(res.mul$conf.int[,-2], res.mul$coefficients[,5]) %>% as.data.frame()
rownames(res.mul)

res.mul = rbind(res.mul[c(1:2),], res.ref, res.mul[c(3:6),])
res.mul$Indicators = c("riskScore",'Age','N0(Reference)','N1','N2','N3','M1 vs M0')
colnames(res.mul) = c("hr","low","up","pv","Indicator")
res.mul$p = signif(res.mul$pv, 2) %>% paste0("p = ", .)
res.mul$p[is.na(res.mul$pv)] = NA
res.mul$Indicator = factor(res.mul$Indicator, levels = rev(res.mul$Indicator))
rownames(res.mul) <- res.mul$Indicator
#res.mul <- res.mul[1,]


multi_res <- data.frame(p.value=res.mul$pv,
                        HR=res.mul$hr,
                        HR.95L=res.mul$low,
                        HR.95H=res.mul$up,
                        Indicator=res.mul$Indicator)
rownames(multi_res) <- multi_res$Indicator
multi_res
write.csv(multi_res,
          file = "multivariate_cox_prog_result.csv",
          quote = F,
          row.names = T)
multi_res <- subset(multi_res, select = -c(Indicator))

library(tidyr)
hz <- paste(round(multi_res$HR,3),
            "(",round(multi_res$HR.95L,3),
            "-",round(multi_res$HR.95H,3),")",sep = "")
hz

hz[c(3)] <- ""
tabletext <- cbind(c(NA,rownames(multi_res)),
                   c("P value",ifelse(multi_res$p.value<0.0001,
                                      "< 0.0001",
                                      round(multi_res$p.value,4))),
                   c("Hazard Ratio(95% CI)",hz))
nrow(tabletext) + 1
pdf(file = '02.mulcoxforest.pdf',height = 6, width = 12, onefile = F)
forestplot(labeltext=tabletext, 
           graph.pos=4,  #Pvalue
           is.summary = c(TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE),
           col=fpColors(box="darkred", lines="darkblue", zero = "gray50"),
           mean=c(NA,multi_res$HR),
           lower=c(NA,multi_res$HR.95L), #95%
           upper=c(NA,multi_res$HR.95H), #95%
           boxsize=0.2,lwd.ci=3,   #，
           ci.vertices.height = 0.08,ci.vertices=TRUE, #、、
           zero=1,lwd.zero=0.5,    #zero 
           colgap=unit(5,"mm"),    #
           xticks = c(0,1,2,3,4,5,6,7), #
           lwd.xaxis=2,            #X
           lineheight = unit(1.2,"cm"), #
           graphwidth = unit(.6,"npc"), #
           cex=0.9, fn.ci_norm = fpDrawCircleCI, #
           hrzl_lines = list("2" = gpar(col = "black", lty = 1, lwd = 2)),
           # hrzl_lines=list("2" = gpar(lwd=2, col="black"),
           #                 "3" = gpar(lwd=2, col="black"), #，
           #                 "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1
           # mar=unit(rep(0.01, times = 4), "cm"),#
           #fpTxtGpcex
           txt_gp=fpTxtGp(label=gpar(cex=1),
                          ticks=gpar(cex=0.8, fontface = "bold"),
                          xlab=gpar(cex = 1, fontface = "bold"),
                          title=gpar(cex = 1.25, fontface = "bold")),
           xlab="Hazard Ratio",
           grid = T,
           title = "Multivariate",
           clip = c(0,7)) # x，
dev.off()

png(file = '02.mulcoxforest.png',height = 6, width = 12,units = 'in',res = 600)
forestplot(labeltext=tabletext, 
           graph.pos=4,  #Pvalue
           is.summary = c(TRUE,FALSE,FALSE,TRUE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE,FALSE),
           col=fpColors(box="darkred", lines="darkblue", zero = "gray50"),
           mean=c(NA,multi_res$HR),
           lower=c(NA,multi_res$HR.95L), #95%
           upper=c(NA,multi_res$HR.95H), #95%
           boxsize=0.2,lwd.ci=3,   #，
           ci.vertices.height = 0.08,ci.vertices=TRUE, #、、
           zero=1,lwd.zero=0.5,    #zero 
           colgap=unit(5,"mm"),    #
           xticks = c(0,1,2,3,4,5,6,7), #
           lwd.xaxis=2,            #X
           lineheight = unit(1.2,"cm"), #
           graphwidth = unit(.6,"npc"), #
           cex=0.9, fn.ci_norm = fpDrawCircleCI, #
           hrzl_lines = list("2" = gpar(col = "black", lty = 1, lwd = 2)),
           # hrzl_lines=list("2" = gpar(lwd=2, col="black"),
           #                 "3" = gpar(lwd=2, col="black"), #，
           #                 "59" = gpar(lwd=2, col="black")),#,"16"nrow(tabletext)+1
           # mar=unit(rep(0.01, times = 4), "cm"),#
           #fpTxtGpcex
           txt_gp=fpTxtGp(label=gpar(cex=1),
                          ticks=gpar(cex=0.8, fontface = "bold"),
                          xlab=gpar(cex = 1, fontface = "bold"),
                          title=gpar(cex = 1.25, fontface = "bold")),
           xlab="Hazard Ratio",
           grid = T,
           title = "Multivariate",
           clip = c(0,7)) # x，
dev.off()


##07-3 COX，---------
#train_risk_clinical2 <- train_risk_clinical
multi_cov<-c('riskScore','Age','N.stage','M.stage')
cox_data_prog <- as.formula(paste0('Surv(OS.time, OS)~',
                                   paste(multi_cov,
                                         sep = '',
                                         collapse = '+')))
cox_more_prog <- coxph(cox_data_prog,
                       data = as.data.frame(train_risk_clinical))

# Nomogram
library(rms)
ddist <- datadist(train_risk_clinical)
options(datadist='ddist')

# COX，
res.cox <- psm(cox_data_prog,
               data = train_risk_clinical, dist = 'lognormal')
surv <- Survival(res.cox) # 
function(x) surv(365, x) # 1
# function(x) surv(730, x) # 2
function(x) surv(1095, x) # 3
function(x) surv(1825, x) # 5
#function(x) surv(2555, x) # 5
nom.cox <- nomogram(res.cox,
                    fun = list(function(x) surv(365, x),
                               function(x)surv(1095, x),
                               function(x)surv(1825, x)),
                    funlabel=c("1-year Survival Probability", "3-year Survival Probability", "5-year Survival Probability"),
                    maxscale = 10,
                    fun.at = c(0.01,seq(0.1,0.9,by=0.2),0.95,0.99),
                    lp=F)
plot(nom.cox, cex.axis  = 1.5, cex.var = 1.7)
png(filename = "03.nomogram_line_points.png", height = 10, width = 17,units = 'in',res = 300)
plot(nom.cox, cex.axis  = 1.5, cex.var = 1.7)
dev.off()
pdf(file = "03.nomogram_line_points.pdf", height = 10, width = 17)
plot(nom.cox, cex.axis  = 1.5, cex.var = 1.7)
dev.off()

library(rms)
library(regplot)
ddist <- datadist(train_risk_clinical)
options(datadist = "ddist")
y3 <- 1
y5 <- 3
y7 <- 5
LR1 <- survival::coxph(Surv(OS.time,OS) ~ riskScore+Age+N.stage+M.stage,data = train_risk_clinical)
p <- regplot::regplot(#res.cox, 
  LR1,
  observation = train_risk_clinical[200,], 
  interval = "confidence",
  points=TRUE,##
  title = "Nomogram",
  clickable = F,##
  subticks = T,
  failtime = c(365*y3,365*y5,365*y7),
  prfail = F,##T1、23,F1、23
  plots=c("density","boxes"),
  dencol="#95A2BF",
  boxcol="#ED7474",
  droplines=TRUE
)

##
# 03.nomogram_line_points_after.png
# 03.nomogram_line_points_after.pdf


##07-4 ---------
coxm_3 <- cph(cox_data_prog,
              data=train_risk_clinical,
              surv=T,
              x=T,y=T,
              time.inc = 1*365)
cal_3 <-calibrate(coxm_3,u=1*365,cmethod='KM',m=100,B=100)


coxm_5 <- cph(cox_data_prog,
              data=train_risk_clinical,
              surv=T,
              x=T,y=T,
              time.inc = 3*365)
cal_5 <-calibrate(coxm_5,u=3*365,cmethod='KM',m=100,B=100)


coxm_7 <- cph(cox_data_prog,
              data=train_risk_clinical,
              surv=T,
              x=T,y=T,
              time.inc = 5*365)
cal_7 <-calibrate(coxm_7,u=5*365,cmethod='KM',m=100,B=100)

pdf(file = '04.calibrate.pdf',w=8,h=8)
par(mar=c(7,4,4,3),cex=1.5)
plot(cal_3,
     subtitles = F,
     lwd=2,lty=1, ##
     errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ##
     xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',#
     ylab='Actual 1-5 year Survival Probability',#
     col="#00468b",#
     xlim = c(0,1),ylim = c(0,1)) ##xy
plot(cal_5,
     add = T,
     subtitles = F,
     lwd=2,lty=1,  ##
     errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ##
     xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',#
     ylab='Actual 1-5 year Survival Probability',#
     col="#ed0000",#
     xlim = c(0,1),ylim = c(0,1)) ##xy
plot(cal_7,
     add = T,
     subtitles = F,
     lwd=2,lty=1, ##
     errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ##
     xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',#
     ylab='Actual 1-5 year Survival Probability',#
     col="#42b540",#
     xlim = c(0,1),ylim = c(0,1)) ##xy
#
legend("bottomright", legend=c("1-year", "3-years", "5-years"), 
       col=c("#00468b", "#ed0000", "#42b540"), 
       lwd=2)
#
abline(0,1,lty=5,lwd=2,col="grey")
dev.off()

png(file = '04.calibrate.png',w=8,h=8,units = 'in',res = 300)
par(mar=c(7,4,4,3),cex=1.5)
plot(cal_3,
     subtitles = F,
     lwd=2,lty=1, ##
     errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ##
     xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',#
     ylab='Actual 1-5 year Survival Probability',#
     col="#00468b",#
     xlim = c(0,1),ylim = c(0,1)) ##xy
plot(cal_5,
     add = T,
     subtitles = F,
     lwd=2,lty=1,  ##
     errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ##
     xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',#
     ylab='Actual 1-5 year Survival Probability',#
     col="#ed0000",#
     xlim = c(0,1),ylim = c(0,1)) ##xy
plot(cal_7,
     add = T,
     subtitles = F,
     lwd=2,lty=1, ##
     errbar.col=c(rgb(0,118,192,maxColorValue = 255)), ##
     xlab='Nomogram-Predicted Probability of 1-5 year Survival Probability',#
     ylab='Actual 1-5 year Survival Probability',#
     col="#42b540",#
     xlim = c(0,1),ylim = c(0,1)) ##xy
#
legend("bottomright", legend=c("1-year", "3-years", "5-years"), 
       col=c("#00468b", "#ed0000", "#42b540"), 
       lwd=2)
#
abline(0,1,lty=5,lwd=2,col="grey")
dev.off()


# # ## DCA-----
# library(rmda)
# dca.dat <- train_risk_clinical
# complex<-decision_curve(OS ~ riskScore+Age+N.stage+M.stage,data =dca.dat,family = binomial(link ='logit'),
#                         thresholds = seq(0,1, by = 0.01),
#                         confidence.intervals= 0.95,
#                         study.design = 'case-control',
#                         population.prevalence= 0.3
# )
# riskScore.dca <- decision_curve(OS ~riskScore,data =dca.dat,family = binomial(link ='logit'),
#                                 thresholds = seq(0,1, by = 0.01),
#                                 confidence.intervals= 0.95,
#                                 study.design = 'case-control',
#                                 population.prevalence= 0.3
# )
# Age.dca <- decision_curve(OS ~Age,data =dca.dat,family = binomial(link ='logit'),
#                           thresholds = seq(0,1, by = 0.01),
#                           confidence.intervals= 0.95,
#                           study.design = 'case-control',
#                           population.prevalence= 0.3
# )
# N.stage.dca <- decision_curve(OS ~N.stage,data =dca.dat,family = binomial(link ='logit'),
#                               thresholds = seq(0,1, by = 0.01),
#                               confidence.intervals= 0.95,
#                               study.design = 'case-control',
#                               population.prevalence= 0.3
# )
# M.stage.dca <- decision_curve(OS ~M.stage,data =dca.dat,family = binomial(link ='logit'),
#                               thresholds = seq(0,1, by = 0.01),
#                               confidence.intervals= 0.95,
#                               study.design = 'case-control',
#                               population.prevalence= 0.3
# )
# dca.list <- list(riskScore.dca,Age.dca,N.stage.dca,M.stage.dca,complex)
# 
# pdf("05.DCA.pdf", width = 5, height = 5)
# plot_decision_curve(dca.list,
#                     curve.names=c('riskScore','Age','N.stage','M.stage','Nomogram'),
#                     cost.benefit.axis =FALSE,col= c("#FDC086","#7FC97F","#BEAED4","#74CCBE","#ED7474"),
#                     confidence.intervals=FALSE,
#                     standardize = FALSE)
# dev.off()
# 
# png("05.DCA.png", width = 5, height = 5,units = 'in',res = 300)
# plot_decision_curve(dca.list,
#                     curve.names=c('riskScore','Age','N.stage','M.stage','Nomogram'),
#                     cost.benefit.axis =FALSE,col= c("#FDC086","#7FC97F","#BEAED4","#74CCBE","#ED7474"),
#                     confidence.intervals=FALSE,
#                     standardize = FALSE)
# dev.off()
# 


##roc
res.mul = coxph(Surv(time = OS.time, event = OS) ~ riskScore+Age+N.stage+M.stage, data = train_risk_clinical)
train_risk_clinical$riskscore_prog = predict(res.mul, newdata = train_risk_clinical, type = "lp")
train_risk_clinical$risk_prog = ifelse(train_risk_clinical$riskscore_prog > median(train_risk_clinical$riskscore_prog, na.rm = T), "high", "low")
train_risk_clinical$risk_prog = factor(train_risk_clinical$risk_prog, levels = c("high", "low"), labels = c("High risk", "Low risk"))
train_risk_clinical2 <- train_risk_clinical
library(survival)
library(survminer)

multi_ROC_out <- function(time_vector, risk_score_table){
  library(survivalROC)
  single_ROC <- function(single_time){
    for_ROC <- survivalROC(Stime=risk_score_table$OS.time,
                           status=risk_score_table$OS,
                           marker=risk_score_table$riskscore_prog,
                           predict.time=single_time,method = 'KM')
    data.frame('True_positive'=for_ROC$TP, 'False_positive'=for_ROC$FP,
               'Cut_values'=for_ROC$cut.values, 'Time'=rep(single_time, length(for_ROC$TP)),
               'AUC'=rep(for_ROC$AUC, length(for_ROC$TP)))
  }
  multi_ROC_list <- lapply(time_vector, single_ROC)
  do.call(rbind, multi_ROC_list)
}

for_multi_ROC <- multi_ROC_out(time_vector = c(1*365,3*365,5*365),
                               risk_score_table = train_risk_clinical2)
for_multi_ROC$Time <- factor(for_multi_ROC$Time)

# ROC
library(scales)
library(geomROC)
library(plotROC)
auc_y3 <- round(for_multi_ROC[which(for_multi_ROC$Time==1*365),5][1],2)
auc_y5 <- round(for_multi_ROC[which(for_multi_ROC$Time==3*365),5][1],2)
auc_y7 <- round(for_multi_ROC[which(for_multi_ROC$Time==5*365),5][1],2)


ROC <- ggplot(for_multi_ROC, aes(x=False_positive,
                                 y=True_positive,
                                 label=Cut_values,
                                 color=Time)) +
  scale_color_manual(breaks = c("365",'1095','1825'),
                     labels = c("1 year","3 year","5 year"),
                     values = c("#4682B4", "#FF7F00", "#984EA3")) +
  plotROC::geom_roc(labels = F, stat = 'identity') +
  style_roc() +
  geom_abline(slope = 1, intercept = 0, color = 'gray', linetype=2) +
  theme_bw() +
  labs(title = "Independent prognosis ROC") +
  theme(panel.background = element_rect(colour = 'black', size = 1, fill = 'white'),
        panel.grid = element_blank(),
        plot.title = element_text(size = 15, hjust = 0.5, face = "bold")) +
  # annotate('text', x=.75, y=.25, label=paste('AUC of 1 years =', round(auc_y1,2))) +
  # annotate('text', x=.75, y=.15, label=paste('AUC of 3 years =', round(auc_y3,2))) +
  # annotate('text', x=.75, y=.05, label=paste('AUC of 5 years =', round(auc_y5,2))) +
  annotate("text", x=0.75, y=c(0.25, 0.15, 0.05),
           label = c(paste('AUC of 1 year =', format(auc_y3,nsmall=2)),
                     paste('AUC of 3 years =', format(auc_y5,nsmall=2)),
                     paste('AUC of 5 years =', format(auc_y7,nsmall=2))))
ROC
ggsave('06.Prog_ROC.png', ROC,width = 6, height = 5)
ggsave('06.Prog_ROC.pdf', ROC,width = 6, height = 5)


# # DCA---------------------------------------------------------------------
# Nomogram<- coxph(cox_data_prog,data = train_risk_clinical)
# library(ggDCA)
# 
# fig3<-ggDCA::dca(Nomogram,
#                  new.data = NULL,
#                  times=365*1
# )
# fig5<-ggDCA::dca(Nomogram,
#                  new.data = NULL,
#                  times=365*3
# )
# fig7<-ggDCA::dca(Nomogram,
#                  new.data = NULL,
#                  times=5*365
# )
# 
# library(ggprism)
# p3<-ggplot(fig3,linetype =F,lwd = 1.2)+
#   theme_classic()+
#   theme_prism(base_size =17)+
#   theme(legend.position="top")+
#   theme(axis.title.x = element_text(size = 22, face = "bold", family = "Times"),
#         axis.title.y = element_text(size = 22, face = "bold", family = "Times"),
#         axis.text.x = element_text(size = 17, color='black',face = "bold", family = "Times"),
#         axis.text.y = element_text(size = 17, face = "bold", family = "Times"),
#         legend.text = element_text(size = 17, family = "Times",face = "bold"),
#         legend.title = element_blank(),
#         plot.title = element_text(size = 22, face = "bold", family = "Times",hjust=0.5),
#         #plot.margin = ggplot2::margin(t=.3,b=0,l=2,r=.5, unit = "cm"),
#         text = element_text(family = "Times"),
#         panel.grid.major=element_blank(),panel.grid.minor=element_blank())+
#   scale_x_continuous(
#     #limits = c(0, 1),              #x
#     guide = "prism_minor") +
#   scale_colour_prism(
#     palette = "candy_bright",
#     name = "Cylinders",
#     label = c("3 Year", "ALL", "None"))
# 
# p3
# 
# p5<-ggplot(fig5,linetype =F,lwd = 1.2)+
#   theme_classic()+
#   theme_prism(base_size =17)+
#   theme(legend.position="top")+
#   theme(axis.title.x = element_text(size = 22, face = "bold", family = "Times"),
#         axis.title.y = element_text(size = 22, face = "bold", family = "Times"),
#         axis.text.x = element_text(size = 17, color='black',face = "bold", family = "Times"),
#         axis.text.y = element_text(size = 17, face = "bold", family = "Times"),
#         legend.text = element_text(size = 17, family = "Times",face = "bold"),
#         legend.title = element_blank(),
#         plot.title = element_text(size = 22, face = "bold", family = "Times",hjust=0.5),
#         #plot.margin = ggplot2::margin(t=.3,b=0,l=2,r=.5, unit = "cm"),
#         text = element_text(family = "Times"),
#         panel.grid.major=element_blank(),panel.grid.minor=element_blank())+
#   scale_x_continuous(
#     #limits = c(0, 1),              #x
#     guide = "prism_minor") +
#   scale_colour_prism(
#     palette = "candy_bright",
#     name = "Cylinders",
#     label = c("5 Year", "ALL", "None"))
# 
# p5
# 
# p7 <- ggplot(fig7,linetype =F,lwd = 1.2)+
#   theme_classic()+
#   theme_prism(base_size =17)+
#   theme(legend.position="top")+
#   theme(axis.title.x = element_text(size = 22, face = "bold", family = "Times"),
#         axis.title.y = element_text(size = 22, face = "bold", family = "Times"),
#         axis.text.x = element_text(size = 17, color='black',face = "bold", family = "Times"),
#         axis.text.y = element_text(size = 17, face = "bold", family = "Times"),
#         legend.text = element_text(size = 17, family = "Times",face = "bold"),
#         legend.title = element_blank(),
#         plot.title = element_text(size = 22, face = "bold", family = "Times",hjust=0.5),
#         #plot.margin = ggplot2::margin(t=.3,b=0,l=2,r=.5, unit = "cm"),
#         text = element_text(family = "Times"),
#         panel.grid.major=element_blank(),panel.grid.minor=element_blank())+
#   scale_x_continuous(
#     #limits = c(0, 1),              #x
#     guide = "prism_minor") +
#   scale_colour_prism(
#     palette = "candy_bright",
#     name = "Cylinders",
#     label = c("7 Year", "ALL", "None"))
# 
# p7
# 
# 
# library(cowplot)
# pdf('07.DCA_year.pdf',w=15,h=5,bg='white')
# plot_grid(p3,p5,p7,ncol=3)
# dev.off()
# 
# png('07.DCA_year.png',w=15,h=5,bg='white',units='in',res=600)
# plot_grid(p3,p5,p7,ncol=3)
# dev.off()

rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./10_clinical_gene")){
  dir.create("./10_clinical_gene")
}
setwd("./10_clinical_gene")

##
library(lance)
library(tidyverse)
hubgene <- read.csv('../06_Lasso/03.lasso_genes.csv')
hubgene <- hubgene$symbol
dat<-read.csv('../00_rawdata/02.expr_tpm_TCGA.csv',row.names = 1)%>%lc.tableToNum()
colnames(dat)<-gsub('.','-',colnames(dat),fixed = T)
## tumor
group <- read.csv('../00_rawdata/03.group_TCGA.csv')
tumor.sample <- group$X[which(group$group=='Tumor')]
dat <- dat[,tumor.sample]
dat <- t(dat[hubgene,])%>%as.data.frame()%>%rownames_to_column(var = 'sample')


survival<-read.csv("../00_rawdata/04.survival_TCGA.csv",row.names = 1)
phenotype<-read.csv('../00_rawdata/05.phenotype_TCGA.csv')
#dat<-read.csv('../00_rawdata/02.expr_tpm(log)_TCGA.csv')

colnames(phenotype)
train_phenotype<-data.frame(sample=phenotype$sample,
                            age=phenotype$age,
                            gender=phenotype$gender,
                            stage=phenotype$stage,
                            T.stage=phenotype$T_stage,
                            N.stage=phenotype$N_stage,
                            M.stage=phenotype$M_stage)
write.csv(train_phenotype,'01.train_phenotype.csv')
# Lifelong Non-smoker (less than 100 cigarettes smoked in Lifetime) = 1
# Current smoker (includes daily smokers and non-daily smokers or occasional smokers) = 2
# Current reformed smoker for > 15 years (greater than 15 years) = 3
# Current reformed smoker for ≤15 years (less than or equal to 15 years) = 4
# Current reformed smoker, duration not specified = 5
# Smoking History not documented = 7

train_phenotype<-merge(train_phenotype,survival,by='sample')
train_phenotype$OS<-as.numeric(train_phenotype$OS)
train_phenotype$OS.time<-as.numeric(train_phenotype$OS.time)
train_phenotype2<-train_phenotype

table(train_phenotype2$stage)
train_phenotype2$stage <- gsub('A','',train_phenotype2$stage)
train_phenotype2$stage <- gsub('B','',train_phenotype2$stage)
train_phenotype2$stage <- gsub('C','',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage III','3/4',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage IV','3/4',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage II','2',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage I','1',train_phenotype2$stage)
#train_phenotype2$stage <- gsub('not reported',NA,train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage X',NA,train_phenotype2$stage)


table(train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('a','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('b','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('c','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('d','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('TX',NA,train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T3','3/4',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T4','3/4',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T2','2',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T1','1',train_phenotype2$T.stage)

table(train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('a','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('b','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('c','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('d','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub(' (i-)','',train_phenotype2$N.stage,fixed = T)
train_phenotype2$N.stage<-gsub(' (i+)','',train_phenotype2$N.stage,fixed = T)
train_phenotype2$N.stage<-gsub(' (mol+)','',train_phenotype2$N.stage,fixed = T)
train_phenotype2$N.stage<-gsub('mi','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N3','3',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N2','2',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N1','1',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N0','0',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('NX',NA,train_phenotype2$N.stage)


table(train_phenotype2$M.stage)
train_phenotype2$M.stage<-gsub('MX',NA,train_phenotype2$M.stage)
train_phenotype2$M.stage<-gsub(' (i+)','',train_phenotype2$M.stage,fixed = T)
train_phenotype2$M.stage<-gsub('c','',train_phenotype2$M.stage,fixed = T)
train_phenotype2$M.stage<-gsub('M0','0',train_phenotype2$M.stage)
train_phenotype2$M.stage<-gsub('M1','1',train_phenotype2$M.stage)


table(train_phenotype2$age)
train_phenotype2$age <- as.numeric(train_phenotype2$age)
train_phenotype2$age <- ifelse(train_phenotype2$age>60,">60",'<=60')

colnames(train_phenotype2)
train_phenotype2<-na.omit(train_phenotype2)
train_phenotype3 <- merge(train_phenotype2,dat,by = 'sample')
write.csv(train_phenotype3,'train_phenotype.csv')
library(tidyverse)
library(lance)
library(ggpubr)
library(Ipaper)
library(ggthemes)


##Age-----
table(train_phenotype3$age)
my_comparisons <- list(c("<=60",">60"))

age_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame()
age.group <- train_phenotype3[,c('sample','age')]
age.group <- na.omit(age.group)
age_data <- age_data[,colnames(age_data)%in%age.group$sample]
age.sample <- age.group$sample[which(age.group$age=='>60')]
age_data$Symbol <- rownames(age_data)

age_data2<-gather(age_data,
                  key = sample,
                  value = expr,
                  -c("Symbol"))
age_data2$Group <- ifelse(age_data2$sample%in%age.sample,'>60','<=60')
library(rstatix)

stat.test<-age_data2%>%
  group_by(Symbol)%>%
  wilcox_test(expr ~ Group)%>%
  adjust_pvalue(method = 'fdr')
write.csv(stat.test,file = 'age.wilcox.csv',row.names = F,quote = F)

stat.test$p<-ifelse(stat.test$p<0.001,"***",
                    ifelse(stat.test$p<0.05,"**",
                           ifelse(stat.test$p<0.05,"*",'ns')))
age_plot <- ggplot(age_data2,aes(x = Symbol, y = expr, fill = Group)) +
  geom_violin(trim=F,color="black",aes(fill=Group)) +
  #geom_violin(trim=F,color="black") + #, “color=”(#white，)
  #"trim"TRUE(),。FALSE,。
  stat_boxplot(geom="errorbar", 
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.7,
               position=position_dodge(0.9),
               outlier.shape = NA)+ #，width=0.1
  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
  labs(title="Age", x="", y = "",size=20) +
  stat_compare_means(data = age_data2,
                     mapping = aes(group = Group),
                     label ="p.signif",
                     method = 'wilcox.test',
                     paired = F) +
  theme_bw()+
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
        axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), 
        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), 
        axis.title.x=element_text(size=16,face="bold"),
        axis.title.y=element_text(size=16,face="bold"),
        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
        legend.title = element_text(face = "bold", size = 12),
        legend.position = "top",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) 
#+
#  guides(fill='none')#+
# facet_wrap(~Symbol,scales = "free",nrow = 3) 
age_plot

##Gender-----
table(train_phenotype3$gender)
gender_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame()
gender.group <- train_phenotype3[,c('sample','gender')]
gender.group <- na.omit(gender.group)
gender_data <- gender_data[,colnames(gender_data)%in%gender.group$sample]
gender.sample <- gender.group$sample[which(gender.group$gender=='male')]
gender_data$Symbol <- rownames(gender_data)

gender_data2<-gather(gender_data,
                     key = sample,
                     value = expr,
                     -c("Symbol"))

gender_data2$Group <- ifelse(gender_data2$sample%in%gender.sample,'male','female')
library(rstatix)

stat.test<-gender_data2%>%
  group_by(Symbol)%>%
  wilcox_test(expr ~ Group)%>%
  adjust_pvalue(method = 'fdr')
write.csv(stat.test,file = 'gender.wilcox.csv',row.names = F,quote = F)

stat.test$p<-ifelse(stat.test$p<0.001,"***",
                    ifelse(stat.test$p<0.05,"**",
                           ifelse(stat.test$p<0.05,"*",'ns')))
gender_plot <- ggplot(gender_data2,aes(x = Symbol, y = expr, fill = Group)) +
  #geom_violin(trim=F,color="black") + #, “color=”(#white，)
  #"trim"TRUE(),。FALSE,。
  geom_violin(trim=F,color="black",aes(fill=Group)) +
  stat_boxplot(geom="errorbar", 
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.7,
               position=position_dodge(0.9),
               outlier.shape = NA)+ #，width=0.1
  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
  labs(title="Gender", x="", y = "",size=20) +
  stat_compare_means(data = gender_data2,
                     mapping = aes(group = Group),
                     label ="p.signif",
                     method = 'wilcox.test',
                     paired = F) +
  theme_bw()+
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
        axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), 
        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), 
        axis.title.x=element_text(size=16,face="bold"),
        axis.title.y=element_text(size=16,face="bold"),
        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
        legend.title = element_text(face = "bold", size = 12),
        legend.position = "top",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) 
gender_plot


##T.stage-----
table(train_phenotype3$T.stage)
T.stage_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame()
T.stage.group <- train_phenotype3[,c('sample','T.stage')]
T.stage.group <- na.omit(T.stage.group)
table(T.stage.group$T.stage)
T.stage_data <- T.stage_data[,colnames(T.stage_data)%in%T.stage.group$sample]
T.stage.sample <- T.stage.group$sample[which(T.stage.group$T.stage=='1'|T.stage.group$T.stage=='2')]

T.stage_data$Symbol <- rownames(T.stage_data)

T.stage_data2<-gather(T.stage_data,
                      key = sample,
                      value = expr,
                      -c("Symbol"))

T.stage_data2$Group <- ifelse(T.stage_data2$sample%in%T.stage.sample,'1/2','3/4')
library(rstatix)

stat.test<-T.stage_data2%>%
  group_by(Symbol)%>%
  wilcox_test(expr ~ Group)%>%
  adjust_pvalue(method = 'fdr')
write.csv(stat.test,file = 'T.stage.wilcox.csv',row.names = F,quote = F)

stat.test$p<-ifelse(stat.test$p<0.001,"***",
                    ifelse(stat.test$p<0.05,"**",
                           ifelse(stat.test$p<0.05,"*",'ns')))
T.stage_plot <- ggplot(T.stage_data2,aes(x = Symbol, y = expr, fill = Group)) +
  #geom_violin(trim=F,color="black") + #, “color=”(#white，)
  #"trim"TRUE(),。FALSE,。
  geom_violin(trim=F,color="black",aes(fill=Group)) +
  stat_boxplot(geom="errorbar", 
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.7,
               position=position_dodge(0.9),
               outlier.shape = NA)+ #，width=0.1
  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
  labs(title="T.stage", x="", y = "",size=20) +
  stat_compare_means(data = T.stage_data2,
                     mapping = aes(group = Group),
                     label ="p.signif",
                     method = 'wilcox.test',
                     paired = F) +
  theme_bw()+
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
        axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), 
        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), 
        axis.title.x=element_text(size=16,face="bold"),
        axis.title.y=element_text(size=16,face="bold"),
        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
        legend.title = element_text(face = "bold", size = 12),
        legend.position = "top",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) 
# facet_wrap(~Symbol,scales = "free",nrow = 3) 
T.stage_plot

##N.stage-----
table(train_phenotype3$N.stage)
N.stage_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame()
N.stage.group <- train_phenotype3[,c('sample','N.stage')]
N.stage.group <- na.omit(N.stage.group)
table(N.stage.group$N.stage)
N.stage_data <- N.stage_data[,colnames(N.stage_data)%in%N.stage.group$sample]
N.stage.sample <- N.stage.group$sample[which(N.stage.group$N.stage=='0'|N.stage.group$N.stage=='1')]
N.stage_data$Symbol <- rownames(N.stage_data)

N.stage_data2<-gather(N.stage_data,
                      key = sample,
                      value = expr,
                      -c("Symbol"))

N.stage_data2$Group <- ifelse(N.stage_data2$sample%in%N.stage.sample,'0/1','2/3')
library(rstatix)

stat.test<-N.stage_data2%>%
  group_by(Symbol)%>%
  wilcox_test(expr ~ Group)%>%
  adjust_pvalue(method = 'fdr')
write.csv(stat.test,file = 'N.stage.wilcox.csv',row.names = F,quote = F)

stat.test$p<-ifelse(stat.test$p<0.001,"***",
                    ifelse(stat.test$p<0.05,"**",
                           ifelse(stat.test$p<0.05,"*",'ns')))
N.stage_plot <- ggplot(N.stage_data2,aes(x = Symbol, y = expr, fill = Group)) +
  #geom_violin(trim=F,color="black") + #, “color=”(#white，)
  #"trim"TRUE(),。FALSE,。
  geom_violin(trim=F,color="black",aes(fill=Group)) +
  stat_boxplot(geom="errorbar", 
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.7,
               position=position_dodge(0.9),
               outlier.shape = NA)+ #，width=0.1
  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
  labs(title="N.stage", x="", y = "",size=20) +
  stat_compare_means(data = N.stage_data2,
                     mapping = aes(group = Group),
                     label ="p.signif",
                     method = 'wilcox.test',
                     paired = F) +
  theme_bw()+
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
        axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), 
        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), 
        axis.title.x=element_text(size=16,face="bold"),
        axis.title.y=element_text(size=16,face="bold"),
        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
        legend.title = element_text(face = "bold", size = 12),
        legend.position = "top",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) 
N.stage_plot


##M.stage-----
table(train_phenotype3$M.stage)
M.stage_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame()
M.stage.group <- train_phenotype3[,c('sample','M.stage')]
M.stage.group <- na.omit(M.stage.group)
table(M.stage.group$M.stage)
M.stage_data <- M.stage_data[,colnames(M.stage_data)%in%M.stage.group$sample]
M.stage.sample <- M.stage.group$sample[which(M.stage.group$M.stage=='0')]
M.stage_data$Symbol <- rownames(M.stage_data)

M.stage_data2<-gather(M.stage_data,
                      key = sample,
                      value = expr,
                      -c("Symbol"))

M.stage_data2$Group <- ifelse(M.stage_data2$sample%in%M.stage.sample,'0','1')
library(rstatix)

stat.test<-M.stage_data2%>%
  group_by(Symbol)%>%
  wilcox_test(expr ~ Group)%>%
  adjust_pvalue(method = 'fdr')
write.csv(stat.test,file = 'M.stage.wilcox.csv',row.names = F,quote = F)

stat.test$p<-ifelse(stat.test$p<0.001,"***",
                    ifelse(stat.test$p<0.05,"**",
                           ifelse(stat.test$p<0.05,"*",'ns')))
M.stage_plot <- ggplot(M.stage_data2,aes(x = Symbol, y = expr, fill = Group)) +
  #geom_violin(trim=F,color="black") + #, “color=”(#white，)
  #"trim"TRUE(),。FALSE,。
  geom_violin(trim=F,color="black",aes(fill=Group)) +
  stat_boxplot(geom="errorbar", 
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.7,
               position=position_dodge(0.9),
               outlier.shape = NA)+ #，width=0.1
  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
  labs(title="M.stage", x="", y = "",size=20) +
  stat_compare_means(data = M.stage_data2,
                     mapping = aes(group = Group),
                     label ="p.signif",
                     method = 'wilcox.test',
                     paired = F) +
  theme_bw()+
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
        axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), 
        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), 
        axis.title.x=element_text(size=16,face="bold"),
        axis.title.y=element_text(size=16,face="bold"),
        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
        legend.title = element_text(face = "bold", size = 12),
        legend.position = "top",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) 
M.stage_plot


##stage-----
table(train_phenotype3$stage)
stage_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame()
stage.group <- train_phenotype3[,c('sample','stage')]
stage.group <- na.omit(stage.group)
table(stage.group$stage)
stage_data <- stage_data[,colnames(stage_data)%in%stage.group$sample]
stage.sample <- stage.group$sample[which(stage.group$stage=='1'|stage.group=='2')]
stage_data$Symbol <- rownames(stage_data)

stage_data2<-gather(stage_data,
                    key = sample,
                    value = expr,
                    -c("Symbol"))

stage_data2$Group <- ifelse(stage_data2$sample%in%stage.sample,'1/2','3/4')
library(rstatix)

stat.test<-stage_data2%>%
  group_by(Symbol)%>%
  wilcox_test(expr ~ Group)%>%
  adjust_pvalue(method = 'fdr')
write.csv(stat.test,file = 'stage.wilcox.csv',row.names = F,quote = F)

stat.test$p<-ifelse(stat.test$p<0.001,"***",
                    ifelse(stat.test$p<0.05,"**",
                           ifelse(stat.test$p<0.05,"*",'ns')))
stage_plot <- ggplot(stage_data2,aes(x = Symbol, y = expr, fill = Group)) +
  #geom_violin(trim=F,color="black") + #, “color=”(#white，)
  #"trim"TRUE(),。FALSE,。
  geom_violin(trim=F,color="black",aes(fill=Group)) +
  stat_boxplot(geom="errorbar", 
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.7,
               position=position_dodge(0.9),
               outlier.shape = NA)+ #，width=0.1
  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
  labs(title="Stage", x="", y = "",size=20) +
  stat_compare_means(data = stage_data2,
                     mapping = aes(group = Group),
                     label ="p.signif",
                     method = 'wilcox.test',
                     paired = F) +
  theme_bw()+
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
        axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), 
        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), 
        axis.title.x=element_text(size=16,face="bold"),
        axis.title.y=element_text(size=16,face="bold"),
        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
        legend.title = element_text(face = "bold", size = 12),
        legend.position = "top",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) 
stage_plot


##histological--------
#table(train_phenotype3$histological)
#histological_data <- t(data.frame(row.names = train_phenotype3$sample,train_phenotype3[,c(10:13)]))%>%as.data.frame()
#histological.group <- train_phenotype3[,c('sample','histological')]
#histological.group <- na.omit(histological.group)
#table(histological.group$histological)
#histological_data <- histological_data[,colnames(histological_data)%in%histological.group$sample]
#histological.sample <- histological.group$sample[which(histological.group$histological=='G1'|histological.group$histological=='G2')]
#histological_data$Symbol <- rownames(histological_data)
#
#histological_data2<-gather(histological_data,
#                           key = sample,
#                           value = expr,
#                           -c("Symbol"))
#
#histological_data2$Group <- ifelse(histological_data2$sample%in%histological.sample,'G1/2','G3')
#library(rstatix)
#
#stat.test<-histological_data2%>%
#  group_by(Symbol)%>%
#  wilcox_test(expr ~ Group)%>%
#  adjust_pvalue(method = 'fdr')
#write.csv(stat.test,file = 'histological.wilcox.csv',row.names = F,quote = F)
#
#stat.test$p<-ifelse(stat.test$p<0.001,"***",
#                    ifelse(stat.test$p<0.05,"**",
#                           ifelse(stat.test$p<0.05,"*",'ns')))
#histological_plot <- ggplot(histological_data2,aes(x = Symbol, y = expr, fill = Group)) +
#  #geom_violin(trim=F,color="black") + #, “color=”(#white，)
#  #"trim"TRUE(),。FALSE,。
#  geom_violin(trim=F,color="black",aes(fill=Group)) +
#  stat_boxplot(geom="errorbar", 
#               width=0.1,
#               position = position_dodge(0.9)) +
#  geom_boxplot(width=0.7,
#               position=position_dodge(0.9),
#               outlier.shape = NA)+ #，width=0.1
#  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
#  labs(title="Histological", x="", y = "",size=20) +
#  stat_compare_means(data = histological_data2,
#                     mapping = aes(group = Group),
#                     label ="p.signif",
#                     method = 'wilcox.test',
#                     paired = F) +
#  theme_bw()+
#  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
#        axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=12), 
#        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), 
#        axis.title.x=element_text(size=16,face="bold"),
#        axis.title.y=element_text(size=16,face="bold"),
#        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
#        legend.title = element_text(face = "bold", size = 12),
#        legend.position = "top",
#        panel.grid.major = element_blank(),
#        panel.grid.minor = element_blank()) 
#histological_plot


library(patchwork)
all_clinical_index <- age_plot +gender_plot + T.stage_plot + N.stage_plot+ M.stage_plot+ stage_plot  + 
  plot_layout(ncol = 3) &
  theme_bw() &
  theme(legend.title = element_blank(),
        legend.position = "top",
        legend.text = element_text(face = "bold"),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        # plot.title = element_blank(),
        panel.grid = element_blank(),
        axis.title = element_text(face = "bold"),
        axis.text.y = element_text(face = 'bold',size = 12),
        axis.text.x = element_text(angle = 45,hjust = 1,face = "bold",size = 12)) 
all_clinical_index
cairo_pdf("01.clinical.pdf", width = 12, height = 10,family = "Arial")
all_clinical_index
dev.off()


#all_clinical_index <- age_plot +gender_plot + stage_plot + histological_plot + 
#  plot_layout(ncol = 4) &
#  theme_bw() &
#  theme(legend.title = element_blank(),
#        legend.position = "top",
#        legend.text = element_text(face = "bold"),
#        plot.title = element_text(hjust = 0.5, face = "bold"),
#        # plot.title = element_blank(),
#        panel.grid = element_blank(),
#        axis.title = element_text(face = "bold"),
#        axis.text.y = element_text(face = 'bold',size = 12),
#        axis.text.x = element_text(angle = 45,hjust = 1,face = "bold",size = 12)) 
#all_clinical_index
#cairo_pdf("01.clinical2.pdf", width = 12, height = 5,family = "Arial")
#all_clinical_index
#dev.off()
#rm(list = ls())
#08 -------------
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./10_clinical")){
  dir.create("./10_clinical")
}
setwd("./10_clinical")
library(readr)
library(readxl)
survival <- read.csv('../00_rawdata/04.survival_TCGA.csv',row.names = 1)
phenotype<-read.csv('../00_rawdata/05.phenotype_TCGA.csv')
colnames(phenotype)
train_phenotype<-data.frame(sample=phenotype$sample,
                            age=phenotype$age,
                            gender=phenotype$gender,
                            stage=phenotype$stage,
                            T.stage=phenotype$T_stage,
                            N.stage=phenotype$N_stage,
                            M.stage=phenotype$M_stage)
write.csv(train_phenotype,'01.train_phenotype.csv')
# Lifelong Non-smoker (less than 100 cigarettes smoked in Lifetime) = 1
# Current smoker (includes daily smokers and non-daily smokers or occasional smokers) = 2
# Current reformed smoker for > 15 years (greater than 15 years) = 3
# Current reformed smoker for ≤15 years (less than or equal to 15 years) = 4
# Current reformed smoker, duration not specified = 5
# Smoking History not documented = 7

train_phenotype<-merge(train_phenotype,survival,by='sample')
train_phenotype$OS<-as.numeric(train_phenotype$OS)
train_phenotype$OS.time<-as.numeric(train_phenotype$OS.time)
train_phenotype2<-train_phenotype

table(train_phenotype2$stage)
train_phenotype2$stage <- gsub('A','',train_phenotype2$stage)
train_phenotype2$stage <- gsub('B','',train_phenotype2$stage)
train_phenotype2$stage <- gsub('C','',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage III','3/4',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage IV','3/4',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage II','2',train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage I','1',train_phenotype2$stage)
#train_phenotype2$stage <- gsub('not reported',NA,train_phenotype2$stage)
train_phenotype2$stage <- gsub('Stage X',NA,train_phenotype2$stage)


table(train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('a','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('b','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('c','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('d','',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('TX',NA,train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T3','3/4',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T4','3/4',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T2','2',train_phenotype2$T.stage)
train_phenotype2$T.stage<-gsub('T1','1',train_phenotype2$T.stage)

table(train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('a','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('b','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('c','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('d','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub(' (i-)','',train_phenotype2$N.stage,fixed = T)
train_phenotype2$N.stage<-gsub(' (i+)','',train_phenotype2$N.stage,fixed = T)
train_phenotype2$N.stage<-gsub(' (mol+)','',train_phenotype2$N.stage,fixed = T)
train_phenotype2$N.stage<-gsub('mi','',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N3','3',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N2','2',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N1','1',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('N0','0',train_phenotype2$N.stage)
train_phenotype2$N.stage<-gsub('NX',NA,train_phenotype2$N.stage)


table(train_phenotype2$M.stage)
train_phenotype2$M.stage<-gsub('MX',NA,train_phenotype2$M.stage)
train_phenotype2$M.stage<-gsub(' (i+)','',train_phenotype2$M.stage,fixed = T)
train_phenotype2$M.stage<-gsub('c','',train_phenotype2$M.stage,fixed = T)
train_phenotype2$M.stage<-gsub('M0','0',train_phenotype2$M.stage)
train_phenotype2$M.stage<-gsub('M1','1',train_phenotype2$M.stage)


table(train_phenotype2$age)
train_phenotype2$age <- as.numeric(train_phenotype2$age)
train_phenotype2$age <- ifelse(train_phenotype2$age>60,">60",'<=60')
#median(train_phenotype2$age)


#table(train_phenotype2$gender)
#train_phenotype2$gender <- ifelse(train_phenotype2$gender=='female',1,0)

colnames(train_phenotype2)
colnames(train_phenotype2)<-c('id','Age','Gender','Stage','T.stage','N.stage',"M.stage",'OS.time','OS')
train_phenotype2<-na.omit(train_phenotype2)

risk<-read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1)
sub_risk <- subset(risk, select = c(id, riskScore))

train_risk_clinical <- merge(train_phenotype2,
                             sub_risk,
                             by = "id")
rownames(train_risk_clinical) <- train_risk_clinical$id
train_risk_clinical = subset(train_risk_clinical, select = -c(id))
dim(train_risk_clinical)
write.csv(train_risk_clinical,'02.train_clinical.csv')

train_phenotype3 <- train_risk_clinical
library(ggpubr)
library(Ipaper)
library(ggthemes)


##Age-----
table(train_phenotype3$Age)
my_comparisons <- list(c("<=60",">60"))
age_data <- data.frame(riskScore = train_phenotype3$riskScore,
                       stage = factor(train_phenotype3$Age,
                                      levels = c("<=60",">60")))
age_data <- na.omit(age_data)
age<-ggplot(age_data,aes(x = stage, y = riskScore, fill = stage)) +
  geom_violin(trim=F,color="black",aes(fill=stage)) +
  stat_boxplot(geom="errorbar",
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(alpha=0.7) +
  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
  scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+
  scale_x_discrete(name = "") +
  ggtitle("Age") +
  theme_bw() +
  geom_signif(comparisons = my_comparisons,
              test = wilcox.test,
              map_signif_level = T,
              y_position = c(2,3,4))+
  theme(plot.title = element_text(size = 16, face =  "bold"),
        text = element_text(size = 20),
        axis.title = element_text(face="bold"),
        axis.text.x=element_text(size = 20))+
  guides(fill='none') 
age

#gender
table(train_phenotype3$Gender)
my_comparisons <- list(c("female","male"))
gender_data <- data.frame(riskScore = train_phenotype3$riskScore,
                          stage = factor(train_phenotype3$Gender,
                                         levels = c("female","male")))
gender_data <- na.omit(gender_data)
gender<-ggplot(gender_data,aes(x = stage, y = riskScore, fill = stage)) +
  geom_violin(trim=F,color="black",aes(fill=stage)) +
  stat_boxplot(geom="errorbar",
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(alpha=0.7) +
  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
  scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+
  scale_x_discrete(name = "") +
  ggtitle("Gender") +
  theme_bw() +
  geom_signif(comparisons = my_comparisons,
              test = wilcox.test,
              map_signif_level = T,
              y_position = c(2,3,4))+
  theme(plot.title = element_text(size = 16, face =  "bold"),
        text = element_text(size = 20),
        axis.title = element_text(face="bold"),
        axis.text.x=element_text(size = 20))+
  guides(fill='none') 
gender


## T.stage-----
table(train_phenotype3$T.stage)
my_comparisons <- list(c("1","2"),c("2","3/4"),c('1','3/4'))
T.stage_data <- data.frame(riskScore = train_phenotype3$riskScore,
                           stage = factor(train_phenotype3$T.stage,
                                          levels = c("1","2", "3/4")))
T.stage_data <- na.omit(T.stage_data)
T.stage<-ggplot(T.stage_data,aes(x = stage, y = riskScore, fill = stage)) +
  geom_violin(trim=F,color="black",aes(fill=stage)) +
  stat_boxplot(geom="errorbar",
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(alpha=0.7) +
  scale_fill_manual(values= c("#E56F5E","#427AB2","#F6C957","#F19685"), name = "Group")+
  scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+
  scale_x_discrete(name = "") +
  ggtitle("T Stage") +
  theme_bw() +
  geom_signif(comparisons = my_comparisons,
              test = wilcox.test,
              map_signif_level = T,
              y_position = c(2,2.8,3.6,3.4,2,4.8))+
  theme(plot.title = element_text(size = 16, face =  "bold"),
        text = element_text(size = 20),
        axis.title = element_text(face="bold"),
        axis.text.x=element_text(size = 20))+
  guides(fill='none') 
T.stage

## N stage-----
table(train_phenotype3$N.stage)
my_comparisons <- list(c("0","1"),c("1","2"),c("2","3"),c('0','2'),c('0','3'),c('1','3'))
N.stage_data <- data.frame(riskScore = train_phenotype3$riskScore,
                           stage = factor(train_phenotype3$N.stage,
                                          levels = c("0","1", "2",'3')))
N.stage_data <- na.omit(N.stage_data)
N.stage<-ggplot(N.stage_data,aes(x = stage, y = riskScore, fill = stage)) +
  geom_violin(trim=F,color="black",aes(fill=stage)) +
  stat_boxplot(geom="errorbar",
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(alpha=0.7) +
  scale_fill_manual(values= c("#E56F5E","#427AB2","#F6C957","#F19685"), name = "Group")+
  scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+
  scale_x_discrete(name = "") +
  ggtitle("N Stage") +
  theme_bw() +
  geom_signif(comparisons = my_comparisons,
              test = wilcox.test,
              map_signif_level = T,
              y_position = c(2,3,4,5,6,7))+
  theme(plot.title = element_text(size = 16, face =  "bold"),
        text = element_text(size = 20),
        axis.title = element_text(face="bold"),
        axis.text.x=element_text(size = 20))+
  guides(fill='none') 
N.stage

# M stage-----
table(train_phenotype3$M.stage)
my_comparisons <- list(c("0","1"))
M.stage_data <- data.frame(riskScore = train_phenotype3$riskScore,
                           stage = factor(train_phenotype3$M.stage,
                                          levels = c("0","1")))
M.stage_data <- na.omit(M.stage_data)
M.stage<-ggplot(M.stage_data,aes(x = stage, y = riskScore, fill = stage)) +
  geom_violin(trim=F,color="black",aes(fill=stage)) +
  stat_boxplot(geom="errorbar",
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(alpha=0.7) +
  scale_fill_manual(values= c("#E56F5E","#427AB2"), name = "Group")+
  scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+
  scale_x_discrete(name = "") +
  ggtitle("M Stage") +
  theme_bw() +
  geom_signif(comparisons = my_comparisons,
              test = wilcox.test,
              map_signif_level = T,
              y_position = c(2))+
  theme(plot.title = element_text(size = 16, face =  "bold"),
        text = element_text(size = 20),
        axis.title = element_text(face="bold"),
        axis.text.x=element_text(size = 20))+
  guides(fill='none') 
M.stage


## Histological----
#table(train_phenotype3$Histological)
#my_comparisons <- list(c("G1","G2"),c('G2','G3'),c('G1','G3'))
#Histological_data <- data.frame(riskScore = train_phenotype3$riskScore,
#                                stage = factor(train_phenotype3$Histological,
#                                               levels = c("G1","G2",'G3')))
#Histological_data <- na.omit(Histological_data)
#Histological<-ggplot(Histological_data,aes(x = stage, y = riskScore, fill = stage)) +
#  geom_violin(trim=F,color="black",aes(fill=stage)) +
#  stat_boxplot(geom="errorbar",
#               width=0.1,
#               position = position_dodge(0.9)) +
#  geom_boxplot(alpha=0.7) +
#  scale_fill_manual(values= c("#E56F5E","#427AB2","#F6C957"), name = "Group")+
#  scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+
#  scale_x_discrete(name = "") +
#  ggtitle("Histological") +
#  theme_bw() +
#  geom_signif(comparisons = my_comparisons,
#              test = wilcox.test,
#              map_signif_level = T,
#              y_position = c(2,3,4))+
#  theme(plot.title = element_text(size = 16, face =  "bold"),
#        text = element_text(size = 20),
#        axis.title = element_text(face="bold"),
#        axis.text.x=element_text(size = 20))+
#  guides(fill='none') 
#Histological


# Stage----
table(train_phenotype3$Stage)
my_comparisons <- list(c("1","2"),c('2','3/4'),c("1","3/4"))
Stage_data <- data.frame(riskScore = train_phenotype3$riskScore,
                         stage = factor(train_phenotype3$Stage,
                                        levels = c("1","2",'3/4')))
Stage_data <- na.omit(Stage_data)
Stage<-ggplot(Stage_data,aes(x = stage, y = riskScore, fill = stage)) +
  geom_violin(trim=F,color="black",aes(fill=stage)) +
  stat_boxplot(geom="errorbar",
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(alpha=0.7) +
  scale_fill_manual(values= c("#E56F5E","#427AB2","#F6C957","#F19685"), name = "Group")+
  scale_y_continuous(name = "riskScore",expand = c(0.1,0.1))+
  scale_x_discrete(name = "") +
  ggtitle("Stage") +
  theme_bw() +
  geom_signif(comparisons = my_comparisons,
              test = wilcox.test,
              map_signif_level = T,
              y_position = c(2,2.5,3,3.5,4,4.5))+
  theme(plot.title = element_text(size = 16, face =  "bold"),
        text = element_text(size = 20),
        axis.title = element_text(face="bold"),
        axis.text.x=element_text(size = 20))+
  guides(fill='none') 
Stage


library(patchwork)
all_clinical_index <- age + gender + T.stage + N.stage + M.stage  + Stage+
  plot_layout(ncol = 3) &
  theme_bw() &
  theme(legend.title = element_blank(),
        legend.position = "top",
        legend.text = element_text(face = "bold"),
        plot.title = element_text(hjust = 0.5, face = "bold"),
        panel.grid = element_blank(),
        axis.title = element_text(face = "bold"),
        axis.text = element_text(face = "bold",size = 10)) 
all_clinical_index

cairo_pdf("01.clinical.pdf", width = 8, height = 6,family = "Arial")
all_clinical_index
dev.off()


#library(patchwork)
#all_clinical_index2 <- age + gender + Histological + Stage+
#  plot_layout(ncol = 4) &
#  theme_bw() &
#  theme(legend.title = element_blank(),
#        legend.position = "top",
#        legend.text = element_text(face = "bold"),
#        plot.title = element_text(hjust = 0.5, face = "bold"),
#        panel.grid = element_blank(),
#        axis.title = element_text(face = "bold"),
#        axis.text = element_text(face = "bold",size = 10)) 
#all_clinical_index2
#
#cairo_pdf("01.clinical2.pdf", width = 10, height = 4,family = "Arial")
#all_clinical_index2
#dev.off()

rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (!dir.exists("11_GSEA")){dir.create("11_GSEA")}
setwd("11_GSEA")

.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(data.table)
library(org.Hs.eg.db)
library(clusterProfiler)
library(biomaRt)
library(enrichplot)
library(limma)
library(tidyverse)
library(lance)
## 
kegggmt<-read.gmt("/data/nas3//pipeline/MSigDB/h.all.v2023.2.Hs.symbols.gmt") #gmt
dat<-read.csv("../00_rawdata/01.expr_count_TCGA.csv", row.names = 1,check.names = F)
colnames(dat)<-gsub('.','-',colnames(dat),fixed = T)
dat <- round(dat,digits = 0)
#hubgene <- read.csv('../07_Lasso/03.lasso_genes.csv')
hub.exp<-dat#[hubgene$symbol,]
hub.exp<-hub.exp[order(rownames(hub.exp),decreasing = F),]  ##
risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1)
rownames(risk) <- risk$id 
train<-t(hub.exp)
colData=data.frame(sample=risk$id,group=ifelse(risk$risk=='High','High risk','Low risk'))  ##TNFRSF1A
colData$group <- factor(colData$group,levels = c("High risk","Low risk")) 
dat <- dat[,colData$sample]
gsea_exp<-dat
library(DESeq2)
dds<-DESeqDataSetFromMatrix(countData = gsea_exp,colData=colData,design = ~group)
dds = dds[rowSums(counts(dds)) > 1,]
dds<-DESeq(dds)
res =results(dds, contrast = c("group","High risk","Low risk"))
res =res[order(res$padj),]
head(res)
summary(res)
table(res$padj<0.05)
allGeneSets<-as.data.frame(res)
allGeneSets<-na.omit(allGeneSets)
logFCcutoff <- 0
allGeneSets$change = as.factor(
  ifelse(allGeneSets$padj < 0.05 & abs(allGeneSets$log2FoldChange) > logFCcutoff,
         ifelse(allGeneSets$log2FoldChange > logFCcutoff,'UP','DOWN'),'NOT')
)
table(allGeneSets$change)
DEGeneSets <- allGeneSets
allGeneSets <- allGeneSets[order(allGeneSets$log2FoldChange,decreasing = T),]
genelist <- allGeneSets$log2FoldChange
names(genelist) <- rownames(allGeneSets)
geneList <- sort(genelist, decreasing = T)
DEGeneSets <- DEGeneSets[order(DEGeneSets$padj),]

KEGG<-GSEA(geneList,TERM2GENE = kegggmt,pvalueCutoff = 1) #GSEA
sortKEGG<-data.frame(KEGG)
sortKEGG <- subset(sortKEGG,pvalue < 0.05 & abs(NES) > 1)
sortKEGG<-sortKEGG[order(sortKEGG$pvalue, decreasing = F),]#enrichment score
write.csv(sortKEGG,file = '01.hall.result.csv',row.names = F)

my_colors <- c("#E41A1C","#377EB8","#4DAF4A","#984EA3","#FF7F00")

if(T){
  gsea.plot = function(res.kegg, top.hall,name){
    gsdata <- do.call(rbind, lapply(top.hall, enrichplot:::gsInfo, object = res.kegg))
    gsdata$Description = factor(gsdata$Description, levels = top.hall)
    p1 = ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) + 
      # theme_bw() +
      theme_classic(14) +
      theme(panel.grid.major = element_line(colour = "grey92"), 
            panel.grid.minor = element_line(colour = "grey92"), 
            panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) + 
      scale_x_continuous(expand = c(0, 0)) +
      scale_color_manual(values = my_colors) +
      #scale_color_brewer(palette = "Paired") +
      ggtitle(paste0("Gene Set Enrichment Analysis: ",name)) +
      # geom_hline(yintercept = 0, color = "black", size = 0.8) +
      geom_line(aes_(y = ~runningScore, color = ~Description), size = 1) +
      theme(legend.position = "top", 
            legend.justification = c(0,1),
            legend.title = element_blank(), 
            legend.background = element_rect(fill = "transparent")) +
      guides(
        color = guide_legend(
          ncol = 1, 
          byrow = TRUE,
          reverse = T)
      )+
      ylab("Running Enrichment Score") + 
      theme(axis.text.x = element_blank(), 
            axis.ticks.x = element_blank(),
            axis.line.x = element_blank(),
            text = element_text(face = "bold", family = "Times"))+
      theme(
        axis.title.y =element_text(size=15,family = "Times", face = "bold"),
        axis.text.y=element_text(size=15,family = "Times", face = "bold"),
        panel.grid.major=element_blank(),panel.grid.minor=element_blank()
      )
    i = 0
    for (term in unique(gsdata$Description)) {
      idx <- which(gsdata$ymin != 0 & gsdata$Description == term)
      gsdata[idx, "ymin"] <- i
      gsdata[idx, "ymax"] <- i + 1
      i <- i + 1 }
    p2 = ggplot(gsdata, aes_(x = ~x)) + 
      geom_linerange(aes_(ymin = ~ymin, ymax = ~ymax, color = ~Description)) + 
      xlab(NULL) + ylab(NULL) + 
      # theme_bw() +
      theme_classic(14) +
      theme(legend.position = "none", 
            axis.ticks = element_blank(),
            axis.text = element_blank(),
            axis.line.x = element_blank(),
            text = element_text(face = "bold", family = "Times")) + 
      scale_x_continuous(expand = c(0,0)) + 
      scale_y_continuous(expand = c(0,0)) + 
      scale_color_manual(values = my_colors) +
      #scale_color_brewer(palette = "Paired")+
      theme(
        panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(),
        # legend.title=element_text(size=15) , 
        legend.text=element_text(size=14))
    p3 <- gseaplot2(res.kegg,top.hall,subplots = 3)+
      
      theme(axis.title.x =element_text(size=15,family = "Times", face = "bold"),
            axis.text.x =element_text(angle=0,size=15,hjust = 0,family = "Times", face = "bold"),
            axis.title.y =element_text(size=15,family = "Times", face = "bold"),
            axis.text.y=element_text(size=15,family = "Times", face = "bold"))+
      theme(
        panel.grid.major=element_blank(),panel.grid.minor=element_blank(),
        legend.title=element_text(size=15) , legend.text=element_text(size=14))
    # p = aplot::insert_bottom(p1, p2, height = 0.15)
    p <- p1/p2/p3+plot_layout(ncol = 1, height = c(4,1,2))
    return(p)
  }
}

# sortKEGG2 <- sortKEGG[order(sortKEGG$NES, decreasing = F),]            #enrichment score
# paths <- rbind(head(sortKEGG2,3),tail(sortKEGG2,3))
sortKEGG2 <- sortKEGG[order(sortKEGG$pvalue, decreasing = F),]#p
paths <- sortKEGG2[1:5,]
select_path <- paths$Description

library(patchwork)

pal <- gsea.plot(KEGG, select_path,"Results")
pdf("01.GSEA_HALL.pdf", width = 10, height = 9,family = 'Times')
print(pal)
dev.off()

png("01.GSEA_HALL.png", width = 10, height = 9,units = 'in',res = 300,family = 'Times')
print(pal)
dev.off()


# 
# #############KEGG##################
# 
rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (!dir.exists("11_GSEA")){dir.create("11_GSEA")}
setwd("11_GSEA")

.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(data.table)
library(org.Hs.eg.db)
library(clusterProfiler)
library(biomaRt)
library(enrichplot)
library(limma)
library(tidyverse)
library(lance)
##
kegggmt<-read.gmt("/data/nas3//pipeline/MSigDB/c2.cp.kegg.v2023.1.Hs.symbols.gmt") #gmt
dat<-read.csv("../00_rawdata/01.expr_count_TCGA.csv", row.names = 1,check.names = F)
colnames(dat)<-gsub('.','-',colnames(dat),fixed = T)
dat <- round(dat,digits = 0)
#hubgene <- read.csv('../07_Lasso/03.lasso_genes.csv')
hub.exp<-dat#[hubgene$symbol,]
hub.exp<-hub.exp[order(rownames(hub.exp),decreasing = F),]  ##
risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1)
rownames(risk) <- risk$id
train<-t(hub.exp)
colData=data.frame(sample=risk$id,group=ifelse(risk$risk=='High','High risk','Low risk'))  ##TNFRSF1A
colData$group <- factor(colData$group,levels = c("High risk","Low risk"))
dat <- dat[,colData$sample]
gsea_exp<-dat
library(DESeq2)
dds<-DESeqDataSetFromMatrix(countData = gsea_exp,colData=colData,design = ~group)
dds = dds[rowSums(counts(dds)) > 1,]
dds<-DESeq(dds)
res =results(dds, contrast = c("group","High risk","Low risk"))
res =res[order(res$padj),]
head(res)
summary(res)
table(res$padj<0.05)
allGeneSets<-as.data.frame(res)
allGeneSets<-na.omit(allGeneSets)
logFCcutoff <- 0
allGeneSets$change = as.factor(
  ifelse(allGeneSets$padj < 0.05 & abs(allGeneSets$log2FoldChange) > logFCcutoff,
         ifelse(allGeneSets$log2FoldChange > logFCcutoff,'UP','DOWN'),'NOT')
)
table(allGeneSets$change)
DEGeneSets <- allGeneSets
allGeneSets <- allGeneSets[order(allGeneSets$log2FoldChange,decreasing = T),]
genelist <- allGeneSets$log2FoldChange
names(genelist) <- rownames(allGeneSets)
geneList <- sort(genelist, decreasing = T)
DEGeneSets <- DEGeneSets[order(DEGeneSets$padj),]

KEGG<-GSEA(geneList,TERM2GENE = kegggmt,pvalueCutoff = 1) #GSEA
sortKEGG<-data.frame(KEGG)
sortKEGG <- subset(sortKEGG,pvalue < 0.05 & abs(NES) > 1)
sortKEGG<-sortKEGG[order(sortKEGG$pvalue, decreasing = F),]#enrichment score
write.csv(sortKEGG,file = '02.kegg.result.csv',row.names = F)

my_colors <- c("#E41A1C","#377EB8","#4DAF4A","#984EA3","#FF7F00","#FFFF33")

if(T){
  gsea.plot = function(res.kegg, top.hall,name){
    gsdata <- do.call(rbind, lapply(top.hall, enrichplot:::gsInfo, object = res.kegg))
    gsdata$Description = factor(gsdata$Description, levels = top.hall)
    p1 = ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) +
      # theme_bw() +
      theme_classic(14) +
      theme(panel.grid.major = element_line(colour = "grey92"),
            panel.grid.minor = element_line(colour = "grey92"),
            panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) +
      scale_x_continuous(expand = c(0, 0)) +
      scale_color_manual(values = my_colors) +
      #scale_color_brewer(palette = "Paired") +
      ggtitle(paste0("Gene Set Enrichment Analysis: ",name)) +
      # geom_hline(yintercept = 0, color = "black", size = 0.8) +
      geom_line(aes_(y = ~runningScore, color = ~Description), size = 1) +
      theme(legend.position = "top",
            legend.justification = c(0,1),
            legend.title = element_blank(),
            legend.background = element_rect(fill = "transparent")) +
      guides(
        color = guide_legend(
          ncol = 1,
          byrow = TRUE,
          reverse = T)
      )+
      ylab("Running Enrichment Score") +
      theme(axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            axis.line.x = element_blank(),
            text = element_text(face = "bold", family = "Times"))+
      theme(
        axis.title.y =element_text(size=15,family = "Times", face = "bold"),
        axis.text.y=element_text(size=15,family = "Times", face = "bold"),
        panel.grid.major=element_blank(),panel.grid.minor=element_blank()
      )
    i = 0
    for (term in unique(gsdata$Description)) {
      idx <- which(gsdata$ymin != 0 & gsdata$Description == term)
      gsdata[idx, "ymin"] <- i
      gsdata[idx, "ymax"] <- i + 1
      i <- i + 1 }
    p2 = ggplot(gsdata, aes_(x = ~x)) +
      geom_linerange(aes_(ymin = ~ymin, ymax = ~ymax, color = ~Description)) +
      xlab(NULL) + ylab(NULL) +
      # theme_bw() +
      theme_classic(14) +
      theme(legend.position = "none",
            axis.ticks = element_blank(),
            axis.text = element_blank(),
            axis.line.x = element_blank(),
            text = element_text(face = "bold", family = "Times")) +
      scale_x_continuous(expand = c(0,0)) +
      scale_y_continuous(expand = c(0,0)) +
      scale_color_manual(values = my_colors) +
      #scale_color_brewer(palette = "Paired")+
      theme(
        panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(),
        # legend.title=element_text(size=15) ,
        legend.text=element_text(size=14))
    p3 <- gseaplot2(res.kegg,top.hall,subplots = 3)+

      theme(axis.title.x =element_text(size=15,family = "Times", face = "bold"),
            axis.text.x =element_text(angle=0,size=15,hjust = 0,family = "Times", face = "bold"),
            axis.title.y =element_text(size=15,family = "Times", face = "bold"),
            axis.text.y=element_text(size=15,family = "Times", face = "bold"))+
      theme(
        panel.grid.major=element_blank(),panel.grid.minor=element_blank(),
        legend.title=element_text(size=15) , legend.text=element_text(size=14))
    # p = aplot::insert_bottom(p1, p2, height = 0.15)
    p <- p1/p2/p3+plot_layout(ncol = 1, height = c(4,1,2))
    return(p)
  }
}
#paths <- rownames(kkTab[c(1:5),])

#sortKEGG2 <- sortKEGG[order(sortKEGG$NES, decreasing = F),]#enrichment score
#paths <- rbind(head(sortKEGG2,3),tail(sortKEGG2,3))3
#sortKEGG2 <- sortKEGG[order(sortKEGG$pvalue, decreasing = F),]
sortKEGG2 <- sortKEGG[order(sortKEGG$pvalue, decreasing = F),]#p
paths <- sortKEGG2[1:5,]
select_path <- paths$Description

library(patchwork)

pal <- gsea.plot(KEGG, select_path,"Results")
pdf("02.GSEA_KEGG.pdf", width = 10, height = 9,family = 'Times')
print(pal)
dev.off()

png("02.GSEA_KEGG.png", width = 10, height = 9,units = 'in',res = 300,family = 'Times')
print(pal)
dev.off()
# 
# 
# 
# 
# 
# 
# 
# 
# 
# #########################
# 
rm(list = ls())
setwd("/data/nas3///29_YQ953-11/11_GSEA/")
if (!dir.exists("gene")){dir.create("gene")}
setwd("gene")

#GSEA---------------------------------------------------------------------
library(data.table)
library(org.Hs.eg.db)
library(clusterProfiler)
library(biomaRt)
library(enrichplot)
library(limma)
library(tidyverse)
library(lance)
library(tibble)
library(ggplot2)
library(patchwork)


data <- read.csv("../../00_rawdata/02.expr_tpm_TCGA.csv",row.names = 1)
colnames(data)<-gsub('.','-',colnames(data),fixed = T)
group <- read.csv("../../07_Train_estimate/01.train_OS_dat.csv",row.names = 1)
gene <- read.csv('../../06_Lasso/03.lasso_genes.csv')
geneList_survival <- gene$symbol
rownames(group) <- group$id
data_train<-data<-data[,rownames(group)]

library(corrplot)
library(org.Hs.eg.db)
library(clusterProfiler)
library(enrichplot)
library(GseaVis)
library(RColorBrewer)
library(psych)
library(reshape2)
library(plyr)
library(RColorBrewer)
library(ggplot2)


my_colors <- c("#FC8D62","#8DA0CB","#E78AC3","#A6D854","#FFD92F")

if(T){
  gsea.plot = function(res.kegg, top.hall,name){
    gsdata <- do.call(rbind, lapply(top.hall, enrichplot:::gsInfo, object = res.kegg))
    gsdata$Description = factor(gsdata$Description, levels = top.hall)
    p1 = ggplot(gsdata, aes_(x = ~x)) + xlab(NULL) +
      # theme_bw() +
      theme_classic(14) +
      theme(panel.grid.major = element_line(colour = "grey92"),
            panel.grid.minor = element_line(colour = "grey92"),
            panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank()) +
      scale_x_continuous(expand = c(0, 0)) +
      scale_color_manual(values = my_colors) +
      #scale_color_brewer(palette = "Paired") +
      ggtitle(paste0("Gene Set Enrichment Analysis: ",name)) +
      # geom_hline(yintercept = 0, color = "black", size = 0.8) +
      geom_line(aes_(y = ~runningScore, color = ~Description), size = 1) +
      theme(legend.position = "top",
            legend.justification = c(0,1),
            legend.title = element_blank(),
            legend.background = element_rect(fill = "transparent")) +
      guides(
        color = guide_legend(
          ncol = 1,
          byrow = TRUE,
          reverse = T)
      )+
      ylab("Running Enrichment Score") +
      theme(axis.text.x = element_blank(),
            axis.ticks.x = element_blank(),
            axis.line.x = element_blank(),
            text = element_text(face = "bold", family = "Times"))+
      theme(
        axis.title.y =element_text(size=13,family = "Times", face = "bold"),
        axis.text.y=element_text(size=13,family = "Times", face = "bold"),
        panel.grid.major=element_blank(),panel.grid.minor=element_blank()
      )
    i = 0
    for (term in unique(gsdata$Description)) {
      idx <- which(gsdata$ymin != 0 & gsdata$Description == term)
      gsdata[idx, "ymin"] <- i
      gsdata[idx, "ymax"] <- i + 1
      i <- i + 1 }
    p2 = ggplot(gsdata, aes_(x = ~x)) +
      geom_linerange(aes_(ymin = ~ymin, ymax = ~ymax, color = ~Description)) +
      xlab(NULL) + ylab(NULL) +
      # theme_bw() +
      theme_classic(14) +
      theme(legend.position = "none",
            axis.ticks = element_blank(),
            axis.text = element_blank(),
            axis.line.x = element_blank(),
            text = element_text(face = "bold", family = "Times")) +
      scale_x_continuous(expand = c(0,0)) +
      scale_y_continuous(expand = c(0,0)) +
      scale_color_manual(values = my_colors) +
      #scale_color_brewer(palette = "Paired")+
      theme(
        panel.grid.major=element_blank(),
        panel.grid.minor=element_blank(),
        # legend.title=element_text(size=15) ,
        legend.text=element_text(size=15))
    p3 <- gseaplot2(res.kegg,top.hall,subplots = 3)+

      theme(axis.title.x =element_text(size=0,family = "Times", face = "bold"),
            axis.text.x =element_text(angle=0,size=0,hjust = 0,family = "Times", face = "bold"),
            axis.title.y =element_text(size=13,family = "Times", face = "bold"),
            axis.text.y=element_text(size=13,family = "Times", face = "bold"))+
      theme(
        panel.grid.major=element_blank(),panel.grid.minor=element_blank(),
        legend.title=element_text(size=15) , legend.text=element_text(size=15))
    # p = aplot::insert_bottom(p1, p2, height = 0.15)
    p <- p1/p2/p3+plot_layout(ncol = 1, height = c(4,1,2))
    return(p)
  }
}


rt<-data
for (ii in geneList_survival){
  tryCatch({
    set.seed(1)
    #ii="DECR1"
    tar.exp <- rt[ii,]
    y <- as.numeric(tar.exp)
    data1 <- data.frame()
    for (i in rownames(rt)) {
      dd  <- corr.test(as.numeric(rt[i,]), y , method="spearman",adjust = "fdr")
      data1 = rbind(data1,data.frame(gene=i,cor=dd$r,p.value=dd$p))
    }
    data1 <- data1[order(data1$cor,decreasing = T),]
    gene1 <- data1$cor
    names(gene1) <- mapIds(org.Hs.eg.db,keys = data1$gene,column = 'ENTREZID',
                           keytype = 'SYMBOL',multiVals='filter')
    names(gene1)<-data1$gene
    gene1 <- na.omit(gene1)
    #
    gmt=read.gmt('/data/nas3//pipeline/MSigDB/c5.go.v7.4.symbols.gmt')
    kk=GSEA(gene1, TERM2GENE=gmt, pvalueCutoff = 1)
    kkTab=as.data.frame(kk)
    kkTab=kkTab[kkTab$pvalue<0.05,]
    write.csv(kkTab,paste0(ii,'.kegg_gsea.csv'),quote = F,row.names = F)

    paths <- rownames(kkTab[c(1:5),])
    pal <- gsea.plot(kk,paths,ii)
    pdf(paste0(ii,'.GSEA.KEGG.pdf'),width = 9,height = 7,family = "Times")
    print(pal)
    dev.off()
    png(paste0(ii,'.GSEA.KEGG.png'),width = 800,height = 500,family = "Times")
    print(pal)
    dev.off()
  },error=function(e){})
}


# 
# 
# # a <- read.csv('./CCDC80.kegg_gsea.csv',header = TRUE)
# # b <- read.csv('./CSPG4.kegg_gsea.csv')
# # c <- read.csv('./MAP1A.kegg_gsea.csv')
# # d <- read.csv('./NIBAN1.kegg_gsea.csv')
# # e <- read.csv('./PCOLCE2.kegg_gsea.csv')
# # f <- read.csv('./PDGFRA.kegg_gsea.csv')
# 


#-ssGSEA --------
rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./12_immunity")){
  dir.create("./12_immunity")
}
setwd("./12_immunity")
library(lance)
library(magrittr)
expr<-read.csv("../00_rawdata/02.expr_tpm_TCGA.csv", row.names = 1,check.names = F)  %>% lance::lc.tableToNum()
colnames(expr)<-gsub('.','-',colnames(expr),fixed = T)
risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1,check.names = F)
#risk$id <- rownames(risk)
group <- risk[,c("id",'risk')]
expr <- expr[,group$id]
#
high.sample<-group$id[which(group$risk=='High')]
low.sample<-group$id[which(group$risk=='Low')]
expr <- expr[,group$id]

library(GSVA)
gene_set <- read.table("/data/nas3//pipeline/ssGSEA/mmc3.txt",
                       header = T,
                       sep ="\t")

dat.final <- as.matrix(expr)
gene_list <- split(as.matrix(gene_set)[,1],
                   gene_set[,2])

ssgsea_score = gsva(dat.final, gene_list, 
                    method = "ssgsea", 
                    ssgsea.norm = TRUE, 
                    verbose = TRUE)
ssgsea_score1 <- as.data.frame(ssgsea_score)
write.csv(ssgsea_score1,
          file = "01.ssgsea_result_cell.csv",
          quote = F)

colnames(group) <- c('sample', 'group')
group<-group[order(group$group),]

ssgsea_score<-read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1)
ssgsea_score<-ssgsea_score[,group$sample]
annotation_col<-as.data.frame(group$group)
colnames(annotation_col)='Group'
rownames(annotation_col)=colnames(ssgsea_score)
ssgsea_score <- as.matrix(ssgsea_score)

library(pheatmap)
color.key<-c("#3777ac","#FF7F00")
ann_colors<-list(Group = c('Low'="#3777ac",'High'="#FF7F00"))
p <- pheatmap(
  ssgsea_score,
  color = colorRampPalette(color.key)(50),
  border_color = 'darkgrey',
  annotation_col = annotation_col,
  annotation_colors = ann_colors,
  labels_row = NULL,
  clustering_method = 'ward.D2',
  show_rownames = T,
  show_colnames = F,
  fontsize_col = 5,
  cluster_cols = F,
  cluster_rows = T)

png(file = "01.heatmap.png", height = 5, width = 8, units = 'in',res = 600,family='Times')
p
dev.off()
pdf(file = "01.heatmap.pdf", height = 5,width = 8,family='Times')
p
dev.off()

table(group$group)
group_case<-group[group$group=='High',]
group_case<-as.character(group_case$sample)
group_control<-group[group$group=='Low',]
group_control<-as.character(group_control$sample)

library(dplyr)
tiics_result <- read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) 
tiics_result <- tiics_result[, group$sample] %>% as.matrix()
pvalue = padj <- matrix(0, nrow(tiics_result), 1)

for (i in 1:nrow(tiics_result)){
  pvalue[i, 1] = p.value = wilcox.test(tiics_result[i, group_case],
                                       tiics_result[i, group_control])$p.value
}
padj <- p.adjust(as.vector(pvalue), "fdr", n = length(pvalue))
rTable <- data.frame(#log2FoldChange, 
  pvalue, 
  padj,
  row.names = rownames(tiics_result))
rTable$immune_cell <- rownames(rTable)
rTable$sig <- ifelse(rTable$pvalue < 0.05,
                     ifelse(rTable$pvalue < 0.01, 
                            ifelse(rTable$pvalue < 0.001,
                                   ifelse(rTable$pvalue < 0.0001,
                                          paste(rTable$immune_cell, "****",  sep = ""),
                                          paste(rTable$immune_cell, "***", sep = "")),
                                   paste(rTable$immune_cell, "**", sep = "")),
                            paste(rTable$immune_cell, "*",  sep = "")), 
                     rTable$immune_cell)

diff_Table<-rTable[which(rTable$pvalue<0.05),]
dim(diff_Table)
# 16
write.csv(rTable,
          file = "02.cell_all_wilcox_test.csv",
          quote = F,
          row.names = F)
write.csv(diff_Table,
          file = "02.cell_diff_wilcox_test.csv",
          quote = F,
          row.names = F)

paste0(diff_Table$immune_cell,collapse = "()、")
# "Activated CD4 T cell()、CD56bright natural killer cell()、Central memory CD8 T cell()、Effector memory CD4 T cell()、Gamma delta T cell()、Immature dendritic cell()、Mast cell()、MDSC()、Memory B cell()、Monocyte()、Natural killer T cell()、Plasmacytoid dendritic cell()、Regulatory T cell()、T follicular helper cell()、Type 17 T helper cell()、Type 2 T helper cell"

library(tidyr)
library(ggplot2)
library(ggpubr)
library(Ipaper)

cell.data <- data.frame(Immune_Cell=rownames(tiics_result), 
                        tiics_result, 
                        pvalue=rTable$pvalue)
plot.cell <- cell.data[which(cell.data$pvalue<0.05),]
diff_tiics <- rownames(plot.cell)
violin_dat <- gather(plot.cell, key=Group, value=score, -c("Immune_Cell","pvalue"))

violin_dat$Group <- ifelse(gsub("\\.","-",violin_dat$Group) %in% group_control,
                           "Low", "High") 
violin_dat$Group <- factor(violin_dat$Group, levels = c("Low", "High"))
head(violin_dat)

boxplot_diff_TIICs <- ggplot(violin_dat, aes(x=Immune_Cell, 
                                             y=score,
                                             fill=Group)) +
  #geom_violin(trim=T,color='black') + #, “color=”(#white，)#"trim"TRUE(),。FALSE,。
  stat_boxplot(geom="errorbar",
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.7,
               position=position_dodge(0.9),
               outlier.shape = NA,
               color='black')+ #，width=0.1
  scale_fill_manual(values= c("#3777ac","#FF7F00"))+ #
  labs(title="Cell Infiltration", x="", y = "Score",size=20) +
  stat_compare_means(data = violin_dat,
                     method = "wilcox.test", #，
                     mapping = aes(group = Group),
                     label ="p.signif",
                     hide.ns = F) +
  theme_bw()+#
  ylim(-0.25,0.8) +
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=16), # 
        axis.text.x=element_text(angle=90,hjust=1,colour="black",face="bold",size=12), #x45，1(hjust = 1)，14
        axis.text.y=element_text(hjust=0.5,colour="black",size=12), #y，，plain
        axis.title.x=element_text(size=16,face="bold"),#x
        axis.title.y=element_text(size=14,face="bold"), #y
        legend.text=element_text(face="bold", hjust = 0.5,colour="black", size=11), #
        legend.title=element_text(face="bold", colour="black", size=11),#
        text=element_text(family = 'Times'),
        #legend.justification=c(-0.1,1.2), #。##(1,1)1()，1。
        #legend.position=c(0, 1.04), #legend.position=c(0,1)，(1,1)。
        panel.grid.major = element_blank(), #
        panel.grid.minor = element_blank()) #
boxplot_diff_TIICs
pdf("02.ssgsea_Box.pdf", width = 10, height = 6,family = 'Times')
print(boxplot_diff_TIICs)
dev.off()

png("02.ssgsea_Box.png", width = 10, height = 6,units = 'in',res = 300,family = 'Times')
boxplot_diff_TIICs
dev.off()

res3 <- read.csv('01.ssgsea_result_cell.csv',row.names = 1)
dif_cell <- read.csv('02.cell_diff_wilcox_test.csv')
res3 <- res3[rownames(res3) %in% dif_cell$immune_cell,]
res3 <- t(res3) %>% as.data.frame()

library(ggcorrplot)
cor_data <- cor(res3,method="spearman")
corp <- cor_pmat(res3)

dat.r <- data.frame(cor_data)
dat.r$cell <- rownames(dat.r)
dat.r <- gather(dat.r,'Gene','r',-c(cell))

dat.p <- data.frame(corp)
dat.p$cell <- rownames(dat.p)
dat.p <- gather(dat.p,'Gene','p',-c(cell))
corrp <- (cbind(dat.r, dat.p))[,c("cell","Gene",'r',"p")]
colnames(corrp) <- c("cell1","cell2",'Correlation',"Pvalue")
corrp$cell2 <- gsub("\\.", " ",corrp$cell2)
write.csv(corrp,file="03.cell_cor.csv",quote=F)


library(ggcorrplot)
library(corrplot)
#---------------------
# 
# pdf("03.cell_corHeatmap.pdf",height=14,width=14,family='Times')
# col1 <- colorRampPalette(c("#3777ac", "white","#fbbab6"))
# corr_plot<-corrplot(cor_data,
#                     tl.cex=1.5,
#                     method = "square", #square,circle,method = 'shade'
#                     is.corr = T,
#                     type = "lower", #lower,upper,full
#                     p.mat = corp,
#                     insig = "blank",
#                     outline = "white",
#                     addCoef.col ="black",
#                     col = col1(200),
#                     tl.col = 'black',
#                     tl.offset = 0.4,
#                     number.font = 2,
#                     cl.cex = 0.8,
#                     number.cex = 1.2
# )
# dev.off()
# 
# png("03.cell_corHeatmap.png",height=14,width=14,family='Times',units='in',res=600)
# col1 <- colorRampPalette(c("#3777ac", "white","#fbbab6"))
# corr_plot<-corrplot(cor_data,
#                     tl.cex=1.5,
#                     method = "square", #square,circle,method = 'shade'
#                     is.corr = T,
#                     type = "lower", #lower,upper,full
#                     p.mat = corp,
#                     insig = "blank",
#                     outline = "white",
#                     addCoef.col ="black",
#                     col = col1(200),
#                     tl.col = 'black',
#                     tl.offset = 0.4,
#                     number.font = 2,
#                     cl.cex = 0.8,
#                     number.cex = 1.2
# )
# dev.off()

#------------------------------------

library (dplyr)
data_expr<-read.csv("../00_rawdata/02.expr_tpm_TCGA.csv", row.names = 1,check.names = F)
gene <- read.csv('../06_Lasso/03.lasso_genes.csv')

expr <- t(data_expr) %>% as.data.frame()
expr <- expr[,gene$symbol]
expr <- expr[rownames(expr) %in% group$sample,]

tiics_result <- read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) %>% lc.tableToNum
tiics_result <- tiics_result[, group$sample] %>% as.matrix()
tiics_result <- t(tiics_result) %>% as.data.frame()
tiics_result <- tiics_result[rownames(expr),]

diff <- read.csv('02.cell_diff_wilcox_test.csv')
tiics_result <- tiics_result[,diff$immune_cell]


identical(rownames(expr), rownames(tiics_result))
cor_r <- cor(expr,tiics_result,method = "spearman")
library(psych)
d <- corr.test(expr,tiics_result,use="complete",method = 'spearman')
cor_p <- d$p
cor_r2 <- cor_r %>% as.data.frame %>% tibble::rownames_to_column(var = "gene") %>%
  tidyr::gather(., cell,Correlation,-gene)# cor
cor_p2 <- cor_p %>% as.data.frame %>% tibble::rownames_to_column(var = "gene") %>%
  tidyr::gather(., cell, Pvalue, -gene)# p

cor_dat <- cbind(cor_r2, cor_p2)[,c("gene","cell","Correlation","Pvalue")]
write.csv(cor_dat,"04.correlation_cor.csv",row.names = F)

cor_dat$gene <- factor(cor_dat$gene)
cor_dat$Pvalue2 <- ifelse(cor_dat$Pvalue<0.05,"P<0.05","ns")
cor_dat$Correlation2 <- ifelse(abs(cor_dat$Correlation)<=0.3,"No correlation",
                               ifelse(cor_dat$Correlation>0.3,"Medium or strong+","Medium or strong-"))
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(ggsci)
library(rstatix)
library(ggthemes)
bub_plot <- ggplot(data = cor_dat, mapping = aes(x =gene , y = cell)) +
  geom_point(aes(size = Pvalue2, color = Correlation)) +
  scale_color_gradient2(high = "#FF7F00", mid = "white", low = "#3777ac",
                        limits = c(-1, 1), breaks = c(-1, -0.5, 0, 0.5, 1)) +
  theme_few() +
  theme(text = element_text(family = "Times"),
        axis.text.y = element_text(size = 15, face = "bold",color = "black"),
        axis.text.x = element_text(size = 15, face = "bold", angle = 45, hjust = 1, vjust = 1, color = "black"),
        axis.title.x = element_text(size = 15, face = "bold"),
        legend.title = element_text(face = "bold", size = 20),
        legend.text = element_text(face = "bold", size = 12),
        strip.text = element_text(size = 15, family = "Times")) +
  labs(x = "", y = "", size = "Pvalue", color = "Correlation") +
  #  P  0.05 
  geom_text(aes(label = ifelse(Pvalue < 0.05, round(Correlation, 2), NA)),
            vjust = 0, size = 3, angle = 0)


pdf("04.correlation111.pdf", width = 10, height = 10,family = 'Times')
bub_plot
dev.off()

png("04.correlation111.png", width = 10, height = 10,units = 'in',res = 300,family = 'Times')
bub_plot
dev.off()


# 
# 
# #############riskscore########
# setwd("/data/nas3///20_KMZK-40407-8/12_immunity/")
# if (! dir.exists("./05.riskscore")){
#   dir.create("./05.riskscore")
# }
# setwd("./05.riskscore")
# # 
# # group_corr <- read.csv("../../10_Train_estimate/01.train_OS_dat.csv",sep=",",row.names = 1,header = T)%>% dplyr::select(id,risk)%>% as.data.frame()
# # group_corr <- group_corr[order(group_corr$risk),]  #
# riskscore <- read.csv("../../07_Train_estimate/01.train_OS_dat.csv",sep=",",row.names = 1,header = T)%>% dplyr::select(id,riskScore)
# rownames(riskscore) <- riskscore$id
# 
# # ，
# riskscore$id <- NULL
# rownames(riskscore)<-gsub('-','.',rownames(riskscore),fixed = T)
# #riskscore1 <- riskscore[(group_corr$id),]  ###
# 
# # riskscore <- riskscore[,-1]%>% as.data.frame()#riskScore
# 
# #riskscore <- read.csv('./riskscore.csv',row.names = 1)
# re3 <- res3
# 
# train<-t(riskscore)
# 
# cnt<-1
# while(cnt < 2){    #56
#   library(psych)
#   x <-as.numeric(train[cnt,])
#   y <-data.frame(re3,check.names = F)
#   library(psych)
#   d <- corr.test(y,x,use="complete",method = 'spearman')
#   r <- data.frame(d$r)
#   p <- data.frame(d$p)
#   write.csv(p,file=paste(rownames(train)[cnt], "p.csv",sep='_'))
#   write.csv(r,file=paste(rownames(train)[cnt], "r.csv",sep='_'))
#   correlation<-data.frame(rownames(p),r$d.r,p$d.p)
#   colnames(correlation) <- c("cell","Correlation","p.value")
#   # correlation<-correlation[correlation$p.value < 0.05 & abs(correlation$Correlation) > 0.3,]    #
#   correlation$sig[correlation$p.value>0.05] <- "ns"   
#   correlation$sig[correlation$p.value<0.05&correlation$p.value>0.01] <- "*"   
#   correlation$sig[correlation$p.value<0.01&correlation$p.value>0.001] <- "**"  
#   correlation$sig[correlation$p.value<0.001&correlation$p.value>0.0001] <- "***"   
#   correlation$sig[correlation$p.value<0.0001] <- "****"
#   correlation$'correlation'<-abs(correlation$Correlation)
#   #correlation_sig<-correlation[c(3,9,10,11,23,29,7,12,18,31,1,14,15,25,28),]
#   library("viridis") 
#   correlation <- na.omit(correlation)
#   #trace(ggdotchart,edit=T)
#   p0<-ggdotchart(correlation, x = "cell", y = "Correlation",
#                  dot.size ='correlation',
#                  color ='p.value',  
#                  sorting = "descending",                        
#                  add = "segments",                             # 
#                  rotate = TRUE,   
#                  ggtheme = theme_pubr(),                        # 
#                  ylab="Correlation Coefficient (r)",
#                  xlab='',
#                  title=rownames(train)[cnt]
#   )+scale_colour_gradient( high = "#631879FF",low = "#00A087FF")
#   p10<-p0+theme(legend.position = "right",
#                 panel.background = element_blank())+geom_hline(aes(yintercept=0),linetype="dashed",lwd = 0.2)+
#     theme(axis.title.x =element_text(size=20,family = "Times", face = "bold"),
#           axis.text.x =element_text(size=16,family = "Times", face = "bold"),
#           axis.title.y =element_text(size=20,family = "Times", face = "bold"),
#           axis.text.y=element_text(size=16,family = "Times", face = "bold"),
#           plot.title=element_text(size=22,family = "Times", face = "bold",hjust=0.5),
#           legend.text = element_text(size = 16, family = "Times"),
#           legend.title = element_text(size = 20, family = "Times",face = "bold"))+
#     theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank())
#   p10
#   ggsave(paste(rownames(train)[cnt],'pdf',sep='.'),w=8,h=9)
#   ggsave(paste(rownames(train)[cnt],'png',sep='.'),w=8,h=9)
#   cnt<-cnt+1
# }


#-----------------------------------

# rm(list = ls())
# setwd("/data/nas3///20_KMZK-40407-8/")
# if (! dir.exists("./09_TMB_immunecycle")){
#   dir.create("./09_TMB_immunecycle")
# }
# setwd("./09_TMB_immunecycle")
# 
# library(TCGAmutations)
# library(maftools)
# library(grid)
# 
# maf <- TCGAmutations::tcga_load(study = "GBM")
# risk <- read.csv("../04_Riskmodel/01_risk.csv")
# risk1 <- risk[,c('id','risk')]
# High.sample <- risk1$id[which(risk1$risk==0)]
# Low.sample <- risk1$id[which(risk1$risk==1)]
# 
# ##-----------------------------------------------
# maf_sample <- data.frame(barcode = maf@clinical.data$Tumor_Sample_Barcode)
# maf_sample$sample <- stringr::str_sub(maf_sample$barcode, 1, 16)
# maf_sample <- merge(maf_sample, risk, by.x = "sample",by.y = 'id')
# sample <- subset(maf_sample)$barcode
# maf <- subsetMaf(maf,tsb = sample)
# 
# ##HIGH LOW
# high <- maf_sample[(maf_sample$sample) %in% High.sample,] #high
# high <- subset(high)$barcode #highbarcode
# maf.high <- subsetMaf(maf,tsb = high) 
# 
# pdf(file = "01.oncoplot.high.pdf", height = 10, width = 15,paper = "a4r") 
# par(mar = c(5, 5, 5, 5) + 0.1)  # 
# oncoplot(maf = maf.high, top = 20)
# dev.off()
# 
# png(file = "01.oncoplot.high.png", family = "Times", height = 10, width = 15, 
#     units = "in", res = 600)
# par(mar = c(5, 5, 5, 5) + 0.1)  # 
# oncoplot(maf = maf.high, top = 20)
# dev.off()
# 
# low <- maf_sample[(maf_sample$sample)%in%Low.sample,]
# low <- subset(low)$barcode
# maf.low <- subsetMaf(maf,tsb = low)
# pdf(file = "02.oncoplot.low.pdf", height = 10, width = 15,paper = "a4r")
# par(mar = c(5, 5, 5, 5) + 0.1)  # 
# oncoplot(maf = maf.low, top = 20)
# dev.off()
# 
# png(file = "02.oncoplot.low.png", family = "Times", height = 10, width = 15, 
#     units = "in", res = 600)
# par(mar = c(5, 5, 5, 5) + 0.1)  # 
# oncoplot(maf = maf.low, top = 20)
# dev.off()
# 
# ##wilcox-TMB,TMB-----------
# library(TCGAmutations)
# library(ggplot2)
# library(ggpubr)
# library(dplyr)
# library(ggsci)
# 
# tmp<-as.data.frame(tcga_available())
# dt<-maf
# dt<-dt@data
# dt1<-as.data.frame(table(dt$Tumor_Sample_Barcode))
# names(dt1)<-c('Barcode','Freq')
# dt1$tmb<-dt1$Freq/38
# tmb<-dt1
# tmb$Barcode <- substr(tmb$Barcode,1,16)
# tmb <- tmb[tmb$Barcode%in%risk1$id,]
# colnames(tmb) <- c('sample','Freq','tmb')
# 
# dat <- merge(risk, tmb, by.x = 'id', by.y = 'sample')
# dat$risk <- factor(dat$risk)
# write.csv(dat,"01.TMB-score.csv")
# 
# p1<-ggplot(dat,aes(x=riskScore,y=tmb))+
#   geom_point(aes(x=riskScore,y=tmb,color =risk), size=2,alpha=0.9)+
#   geom_smooth(method = 'lm', formula = y ~ x, se = T,color='black')+ 
#   stat_cor(data=dat, method = "spearman")+
#   scale_fill_manual(values =c("black")) +
#   theme_bw()+
#   scale_color_manual(values = c("#6495ed","#dc143c"), name = "risk") +
#   theme(axis.title = element_text(size = 20, face = "bold", family = "Times",color='black'),
#         axis.text.x = element_text(size = 16, face = "bold", family = "Times",color='black'),
#         axis.text.y = element_text(size = 16, face = "bold", family = "Times",color='black'),
#         legend.text = element_text(size = 14, face = "bold", family = "Times",color='black'),
#         legend.title= element_text(size =16, face = "bold", family = "Times",color='black'),
#         legend.position = "bottom")+
#   labs(x="Risk score",y='TMB')+
#   theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank())
# p1
# my_comparisons = list(c('Low_Risk','High_Risk')) #'Low_Risk','High_Risk'
# p2<-ggplot(dat, aes(x=risk, y=tmb,fill=risk)) + 
#   geom_violin()+
#   stat_boxplot(geom="errorbar",
#                width=0.1,
#                position = position_dodge(0.9)) +
#   geom_boxplot(width=0.4,
#                position=position_dodge(0.9),
#                outlier.shape = NA,
#                fill='white')+ #，width=0.1
#   geom_point(aes(fill = risk),
#              size = 0.5,
#              position = position_dodge(0.9))+
#   scale_fill_manual(values = c("#FF6347","#6495ED"), name = "Group") + 
#   stat_compare_means(qcomparisons = my_comparisons,
#                      label = "p.signif",
#                      method = "wilcox.test",
#                      color='black',family="Times",face = "bold")+ 
#   theme_bw()+ #
#   ylim(-50,160) +
#   theme(panel.background = element_rect(colour = 'black', size = 1, fill = 'white'),
#         panel.grid = element_blank()) + 
#   theme(axis.text.x=element_text(family="Times",size=12,face="bold"), #x15，1(hjust = 1)，Times20
#         axis.text.y=element_text(family="Times",size=12,face="bold"), #y，，plain
#         axis.title.y=element_text(family="Times",size = 15,face="bold"), #y
#         axis.title.x=element_text(family="Times",size = 10,face="bold"),
#         plot.title = element_text(hjust = 0.5,family="Times",size = 15,face="bold"),
#         legend.text = element_text(face = "bold"),
#         legend.title = element_text(face = "bold"))+
#   labs(title="", x="", y="")+
#   guides(fill='none')
# p2
# library(patchwork)
# p<-p1+p2+plot_layout(widths = c(3, 2))
# p
# pdf('03.tmb.pdf',width=12,height=6,family='Times')
# print(p)
# dev.off()
# 
# png('03.tmb.png',width=12,height=6,family='Times',units='in',res=600)
# print(p)
# dev.off()
# 
# ##------------------------------------
# library(psych)
# library(ggExtra)
# 
# dat_riskScore <- dplyr::select(dat, id, riskScore)
# dat_riskScore <- tibble::column_to_rownames(dat_riskScore, var = "id")
# dat_tmb <- dplyr::select(dat, id, tmb)
# dat_tmb <- tibble::column_to_rownames(dat_tmb, var = "id")
# obj <- corr.test(dat_tmb, dat_riskScore, use = "complete", method = "spearman")
# cor.r <- obj$r
# cor.p <- obj$p
# 
# dat4plot <- cbind(dat_riskScore, dat_tmb)
# p1 <- ggplot(dat4plot, aes(riskScore, tmb))+
#   xlab("riskScore")+ylab("Tumor Burden Mutation")+
#   geom_point()+geom_smooth(method = 'lm',formula = y~x)+
#   theme_bw()+
#   stat_cor(method = 'spearman',aes(x=riskScore,y=tmb))+
#   ylim(0, 4)
# p1
# p2 <- ggMarginal(p1,type = 'density',xparams = list(fill='seagreen'),
#                  yparams = list(fill='#A73030FF'))
# p2
# ggsave("04.cor_plot.pdf", p2, w=5, h=5)
# ggsave("04.cor_plot.png", p2, w=5, h=5, dpi=600)

rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./13_checkpoint")){
  dir.create("./13_checkpoint")
}
setwd("./13_checkpoint")

gene <- read.csv('../00_rawdata/immune.csv')
dat<-read.csv('../00_rawdata/02.expr_tpm_TCGA.csv',row.names = 1)
colnames(dat)<-gsub('.','-',colnames(dat),fixed = T)

group <- read.csv('../07_Train_estimate/01.train_OS_dat.csv')%>%dplyr::select(c('id','risk'))
colnames(group) <- c('sample','group')
table(group$group)

hub_exp<-dat[gene$symbol,]
hub_exp <- na.omit(hub_exp) #
rownames(hub_exp)

hub_exp2<-hub_exp
hub_exp2$Symbol<-rownames(hub_exp2)
hub_exp2<-gather(hub_exp2,
                 key = sample,
                 value = expr,
                 -c("Symbol"))
hub_exp2 <- na.omit(hub_exp2)

High.sample <- group$sample[which(group$group=='High')]
## 
hub_exp2$Group<-ifelse(hub_exp2$sample%in%High.sample,'High','Low')
##
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(ggsci)
library(rstatix)
stat.test<-hub_exp2%>%
  group_by(Symbol)%>%
  wilcox_test(expr ~ Group)%>%
  adjust_pvalue(method = 'fdr')
de.test <- subset(stat.test,p<0.05)
de.test$Symbol#29
write.csv(de.test,"00.de.test.csv")
# [1] "CD276"    "CD28"     "CD40"     "CD40LG"   "CD44"     "CD48"     "CTLA4"    "CXCL9"    "GZMA"     "HLA-A"    "HLA-DPA1" "HLA-DPB1"
# [13] "HLA-DQA1" "HLA-DRA"  "HMGB1"    "IL1B"     "ITGB2"    "LGALS9"   "MICB"     "PDCD1"    "PRF1"     "SELP"     "SLAMF7"   "TGFB1"   
# [25] "TIGIT"    "TNFSF15"  "TNFSF9"   "VEGFA"    "VEGFB"

de.test$p<-ifelse(de.test$p<0.001,"***",
                  ifelse(de.test$p<0.01,"**",
                         ifelse(de.test$p<0.05,"*",'ns')))


datcheck <- dat[de.test$Symbol,group$sample]
ciber.res <- t(datcheck)
violin_dat <-data.frame(cbind(ciber.res,group$group))
colnames(violin_dat)<-c(colnames(ciber.res),'group')

violin_dat$id <- rownames(violin_dat)
violin_dat <- reshape2::melt(violin_dat,id = c("id","group"))
violin_dat$value <-as.numeric(violin_dat$value)
colnames(violin_dat)<-c('id','group','variable','value')
violin_dat$group<- factor(violin_dat$group)
colnames(violin_dat)

p <- ggplot(violin_dat, aes(x=variable,y=value,fill=group)) +
  geom_boxplot() +
  #stat_compare_means(aes(group = condition),method = 'wilcox.test',label = "p.signif",cex=4)+
  annotate(geom = "text", x = de.test$Symbol, y = 14, size = 5, #family = "Times",
           label =as.character(de.test$p,face = "bold"))  +
  theme_bw()+
  # scale_fill_npg()+
  scale_fill_manual(values=c("#CD3700","#4682B4"))+
  theme(axis.title.x =element_text(size=20, face = "bold"),
        axis.text.x =element_text(size=14, face = "bold",angle=45,hjust = 1),
        axis.title.y =element_text(size=27, face = "bold"),
        axis.text.y=element_text(size=16, face = "bold"))+
  theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank(), legend.title=element_text(size=15) , legend.text=element_text(size=14))+
  xlab("") +
  ylab("checkpoint expression") + theme(axis.title.y = element_text(size = 15, vjust = 0.5, hjust = 0.5))
p
pdf("01.checkpoint_boxplot.pdf", width = 12, height = 6,family = 'Times')
p
dev.off()
png("01.checkpoint_boxplot.png", width = 12, height = 6,units = 'in',res = 300,family = 'Times')
p
dev.off()

# gene <- read.csv("../09_Lasso/03.lasso_genes.csv",row.names = 1)
# datch<-dat[gene$symbol,] %>% t() %>%  as.data.frame()
# 
# library(psych)
# hub_exp <- dat[de.test$Symbol,] %>% t()%>% as.data.frame()
# 
# cor_r <- WGCNA::cor(hub_exp,datch,method = "spearman") 
# cor_p <- WGCNA::corPvalueStudent(cor_r,length(rownames(hub_exp)))
# cor_r2 <- cor_r %>% as.data.frame %>% tibble::rownames_to_column(var = "cell") %>%
#   tidyr::gather(., gene,Correlation,-cell)# cor
# cor_p2 <- cor_p %>% as.data.frame %>% tibble::rownames_to_column(var = "cell") %>% 
#   tidyr::gather(., gene, Pvalue, -cell)# p
# 
# cor_dat <- cbind(cor_r2, cor_p2)[,c("cell","gene","Correlation","Pvalue")]
# 
# cor_dat$cell <- factor(cor_dat$cell)
# cor_dat$Pvalue2 <- ifelse(cor_dat$Pvalue<0.05,"P<0.05","ns")
# cor_dat$Correlation2 <- ifelse(abs(cor_dat$Correlation)<=0.3,"No correlation",
#                                ifelse(cor_dat$Correlation>0.3,"Medium or strong+","Medium or strong-"))
# write.csv(cor_dat,"02.correlation_cor.csv")
# cor_dat2 <- subset(cor_dat,cor_dat$Pvalue < 0.05 & abs(cor_dat$Correlation) > 0.3)
# cor_dat2 <- cor_dat2[order(cor_dat2$cell),] 
# cor_dat2$gene
# write.csv(cor_dat2,"03.correlation_degcor.csv")
# 
# #
# library(ggplot2)
# library(ggthemes)
# bub_plot <- ggplot(data = cor_dat, mapping = aes(x=cell,y=gene)) +
#   geom_point(aes(size=Pvalue2,color=Correlation))+
#   scale_color_gradient2(high = "#EF767A",mid = "white",low = "#456990",limits=c(-1, 1),breaks = c(-1,-0.6,-0.2,0.2,0.6,1))+
#   theme_few()+
#   theme(text = element_text(family = "Times"),
#         axis.text.y = element_text(size = 15,face = "bold",color = "black"),
#         axis.text.x = element_text(size = 15, face = "bold",angle = 45,hjust = 1,vjust = 1,color = "black"),
#         axis.title.x = element_text(size = 15, face = "bold"),
#         legend.title = element_text(face = "bold",size = 20),
#         legend.text = element_text(face = "bold",size = 12),
#         strip.text = element_text(size = 15, family = "Times"))+
#   labs(x="",y="",size="Pvalue",color="Correlation")+
#   geom_text(aes(label = round(Correlation, 2)),vjust = -1, size = 3)
# bub_plot
# pdf("04.correlation.pdf", width = 18, height = 6,family = 'Times')
# bub_plot
# dev.off()
# png("04.correlation.png", width = 18, height = 6,units = 'in',res = 300,family = 'Times')
# bub_plot
# dev.off()

# immunecycle------------------------------------------------
rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./14_immunecycle")){
  dir.create("./14_immunecycle")
}
setwd("./14_immunecycle")

train_data <- read.csv("../00_rawdata/02.expr_tpm_TCGA.csv",header=TRUE, row.names=1, check.names=FALSE)
riskScore<-read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names=1)

group <- riskScore[,c("id","risk")]
colnames(group) <- c("sample","group")
dat <- train_data[,group$sample]
identical(colnames(dat),group$sample)

library(GSVA)
library(tidyverse)
gene_set <- read.table("../00_rawdata/cancer_immunity_cycle.txt",header = T,sep ="\t")
gene_set$Steps <- c(rep('Step 1:Relase of cancer cell antigenes',40),
                    rep('Step 2:Cancer antigen presentation',36),
                    rep('Step 3:Priming and activation',62),
                    rep('Step 4:Trafficking of immune cells to tumors',76),
                    rep('Step 5:Infiltration of immune cells into tumors',9),
                    rep('Step 6:Recongnition of cancer cells by T cells',25),
                    rep('Step 7:Killing of cancer cells',25))
gene_set$group <- gene_set$Steps %>% substr(.,8,60)
gene_set$step <- gene_set$Steps %>% substr(.,1,6)

table(gene_set$group[grep("Step 4",gene_set$Steps)] )
gene_set$group[grep("Step 4",gene_set$Steps)] <- gene_set$ImmuneCellType[grep("Step 4",gene_set$Steps)]
gene_set$group[grep("Step 4",gene_set$Steps)] <- paste0(gene_set$group[grep("Step 4",gene_set$Steps)]," recruiting")
gene_set$group <- paste0(gene_set$step,' ',gene_set$group)
dat.final <- as.matrix(dat)
gene_list <- split(as.matrix(gene_set)[,1],gene_set[,5])
ssgsea_score <- gsva(dat.final, gene_list,method = "ssgsea",ssgsea.norm = TRUE,verbose = TRUE)
write.csv(ssgsea_score,"05.immune_cycle.csv",row.names = T,quote = F)

#7---------------------
library(magrittr)
library(rstatix)
library(tidyverse)
tiic_fpkm <- ssgsea_score %>% t %>% as.data.frame %>% rownames_to_column(var="sample")
dat <- merge(group, tiic_fpkm, by = "sample")#cibersort
dat2 <- tidyr::gather(dat, step, score, -c("sample", "group"))#，，Cells, score

stat_res <- dat2 %>% 
  group_by(step) %>% 
  wilcox_test(score ~ group) %>% 
  # adjust_pvalue(method = "BH") %>%
  add_significance("p")
stat_res
Pathways_sig <- stat_res[stat_res$p<0.05,]
paste0(Pathways_sig$step,collapse = "，")

stat_res$p2 <- signif(stat_res$p, 3)#p，
stat_res$p2 <- ifelse(stat_res$p <= 0.05, paste0("~italic(p)==",
                                                 signif(stat_res$p,2)),
                      stat_res$p.signif)
write.csv(stat_res,"06.res_wicox_step.csv",row.names = F,quote = F)
library(ggpubr)
set.seed(1)
dat2$group <- ifelse(dat2$group=="High","High risk","Low risk")
dat2$group <- factor(dat2$group,levels = c("High risk","Low risk"))

# col <- readRDS("mypalette.rds")[1:7]
library(ggplot2)
library(ggsci)
p <- ggplot(dat2, aes(x=step, y=score, fill=group)) +
  geom_boxplot(width=0.8, alpha=0.8, position=position_dodge(0.9)) +
  scale_fill_manual(values = c("#EE2C2C", "#114889"), name = "Group") +
  # “ns”
  annotate(geom = "text", x = stat_res$step, y = 0.8, size = 6, family = "Times",
           label = ifelse(stat_res$p.signif != "ns", as.character(stat_res$p.signif), "")) +
  theme_bw() +
  theme(legend.position = "top") +
  theme(axis.title.x = element_text(size=20, family = "Times", face = "bold"),
        axis.text.x = element_text(size=13, hjust=1, family = "Times", face = "bold"),
        axis.title.y = element_text(size=20, family = "Times", face = "bold"),
        axis.text.y = element_text(size=16, family = "Times", face = "bold")) +
  theme(legend.title = element_text(size=15,family = "Times"), legend.text = element_text(size=14,family = "Times")) +
  theme(axis.line = element_line(color = "black"),  # 
        axis.ticks = element_line(color = "black")) +  # 
  labs(x="", y="Cancer Immunity Cycle") +
  coord_flip()   # 
print(p)
ggsave(width=10,height=10,'05.immuCycle.pdf')
ggsave(width=10,height=10,'05.immuCycle.png')

#---------------
dat2$step<- substr(dat2$step ,1,6)
dat2$High<-ifelse(dat2$group =="High risk",'High',NA)
dat2$Low<-ifelse(dat2$group =="Low risk",'Low',NA)
##
step1<-filter(dat2, group=="Low risk" & step=="Step 1")
step1$rowmedians<-median(step1$score)
low1<-median(step1$score)
step2<-filter(dat2, group=="Low risk" & step=="Step 2")
step2$rowmedians<-median(step2$score)
low2<-median(step2$score)
step3<-filter(dat2, group=="Low risk" & step=="Step 3")
step3$rowmedians<-median(step3$score)
low3<-median(step3$score)
step4<-filter(dat2, group=="Low risk" & step=="Step 4")
step4$rowmedians<-median(step4$score)
low4<-median(step4$score)
step5<-filter(dat2, group=="Low risk" & step=="Step 5")
step5$rowmedians<-median(step5$score)
low5<-median(step5$score)
step6<-filter(dat2, group=="Low risk" & step=="Step 6")
step6$rowmedians<-median(step6$score)
low6<-median(step6$score)
step7<-filter(dat2, group=="Low risk" & step=="Step 7")
step7$rowmedians<-median(step7$score)
low7<-median(step7$score)
##
step1<-filter(dat2, group=="High risk" & step=="Step 1")
step1$rowmedians<-median(step1$score)
High1<-median(step1$score)
step2<-filter(dat2, group=="High risk" & step=="Step 2")
step2$rowmedians<-median(step2$score)
High2<-median(step2$score)
step3<-filter(dat2, group=="High risk" & step=="Step 3")
step3$rowmedians<-median(step3$score)
High3<-median(step3$score)
step4<-filter(dat2, group=="High risk" & step=="Step 4")
step4$rowmedians<-median(step4$score)
High4<-median(step4$score)
step5<-filter(dat2, group=="High risk" & step=="Step 5")
step5$rowmedians<-median(step5$score)
High5<-median(step5$score)
step6<-filter(dat2, group=="High risk" & step=="Step 6")
step6$rowmedians<-median(step6$score)
High6<-median(step6$score)
step7<-filter(dat2, group=="High risk" & step=="Step 7")
step7$rowmedians<-median(step7$score)
High7<-median(step7$score)

##
# library(ggradar)
library(tidyverse)
#df<-c(High1,High2,High3,High4,High5,High6,High7)
#dn<-c(low1,low2,low3,low4,low5,low6,low7)
exam_scores <-data.frame(step1=c(High1,low1),
                         step2=c(High2,low2),
                         step3=c(High3,low3),
                         step4=c(High4,low4),
                         step5=c(High5,low5),
                         step6=c(High6,low6),
                         step7=c(High7,low7),
                         row.names = c("High","Low"))
# 
max_min <- data.frame(
  step1 = c(0.6, -0.6), step2 = c(0.6, -0.6), step3 = c(0.6, -0.6),
  step4 = c(0.6, -0.6),step5 = c(0.6, -0.6), step6 = c(0.6, -0.6),
  step7 = c(0.6, -0.6)
)
rownames(max_min) <- c("Max", "Min")
df <- rbind(max_min, exam_scores)

png('06.ssgsea.png',w=8,h=6,family='Times',units='in',res=600)
pdf('06.ssgsea.pdf',w=8,h=6,family='Times')
#install.packages("fmsb")
library(fmsb)
radarchart(
  df, axistype = 1,
  # Customize the polygon
  pcol = c("#DB3587",  "#35B3B2"), pfcol = scales::alpha(c("#DB3587","#35B3B2"),0.5), plwd = 2, plty = 1,
  # Customize the grid
  cglcol = "grey", cglty = 1, cglwd = 0.8,
  # Customize the axis
  axislabcol = "grey",
  # Variable labels
  vlcex = 0.7, vlabels = colnames(df),
  caxislabels = c(-0.3,-0.2,-0.1,0,0.1,0.2,0.3))
# Add an horizontal legend
legend(
  x = "bottom", legend = rownames(df[-c(1,2),]), horiz = TRUE,
  bty = "n", pch = 20 , col = c("#DB3587", "#35B3B2"),
  text.col = "black", cex = 1, pt.cex = 1.5
)
dev.off()
dev.off()


# ##############7###########
# ## 
# library(curl)
# curl_download("http://biocc.hrbmu.edu.cn/TIP/download/signature%20annotation.txt",
#               destfile = paste0(dest_dir, "/geneSet_TIP.txt"))
# geneInfo_TIP = read.table("./8、/ESCA/geneSet_TIP.txt", header = TRUE, sep = "\t")
# geneInfo_TIP$step_cell = paste0("step ", geneInfo_TIP$Steps, " ", geneInfo_TIP$ImmuneCellType)
# geneInfo_TIP$Description = paste0(geneInfo_TIP$step_cell, " ", geneInfo_TIP$Direction)
# 
# pathInfo_TIP = unique(geneInfo_TIP[,-1])
# geneSet_TIP = list()
# for(i in 1:length(pathInfo_TIP$Description)){
#   geneSet_TIP[[i]] = geneInfo_TIP$GeneSymbol[which(geneInfo_TIP$Description==pathInfo_TIP$Description[i])]
# }
# names(geneSet_TIP) = pathInfo_TIP$Description
# 
# ## ssGSEA
# library(GSVA)
# data_analysis = read.csv("./data/data_log2(tpm+1)_ESCA.csv", row.names = 1, check.names = FALSE)
# data_analysis = data_analysis[,survRisk_train$sample_id]
# score_TIP1 = gsva(as.matrix(data_analysis), geneSet_TIP, method = 'ssgsea', kcdf = 'Gaussian', abs.ranking = TRUE)
# 
# ## permutation 100
# ## 
# score_TIP = list()
# for(j in 1:length(colnames(score_TIP1))){
#   score_TIP[[j]] = as.data.frame(score_TIP1[,j])
#   colnames(score_TIP[[j]])[1] = "permutation1"
# }
# names(score_TIP) = colnames(score_TIP1)
# 
# for(i in 2:101){
#   data_random = data_analysis
#   rownames(data_random) = sample(rownames(data_random), size = nrow(data_random), replace = FALSE)
#   score_tmp = gsva(as.matrix(data_random), geneSet_TIP, method = 'ssgsea', kcdf = 'Gaussian', abs.ranking = TRUE)
#   for(j in 1:length(colnames(score_tmp))){
#     score_TIP[[j]] = cbind(score_TIP[[j]], as.data.frame(score_tmp[,j]))
#     colnames(score_TIP[[j]])[i] = paste0("permutation", i)
#   }
# }
# 
# ## z score
# zscore_TIP = matrix(0, nrow = length(pathInfo_TIP$Description), ncol = ncol(data_analysis))
# rownames(zscore_TIP) = pathInfo_TIP$Description
# colnames(zscore_TIP) = colnames(data_analysis)
# for(j in 1:ncol(zscore_TIP)){
#   for(i in 1:length(pathInfo_TIP$Description)){
#     zscore_TIP[i, j] = (score_TIP[[j]][i, 1]-mean(as.numeric(score_TIP[[j]][i,])))/sd(as.numeric(score_TIP[[j]][i,]))
#   }
# }
# 
# activeScore_TIP = matrix(0, nrow = length(unique(pathInfo_TIP$step_cell)), ncol = ncol(data_analysis))
# rownames(activeScore_TIP) = unique(pathInfo_TIP$step_cell)
# colnames(activeScore_TIP) = colnames(data_analysis)
# 
# ## positivenegativeNES(positive)-NES(negative)
# for(i in 1:nrow(activeScore_TIP)){
#   path_name = pathInfo_TIP$Description[which(pathInfo_TIP$step_cell==rownames(activeScore_TIP)[i])]
#   if(length(path_name)==2){
#     activeScore_TIP[i,] = as.numeric(zscore_TIP[paste0(rownames(activeScore_TIP)[i], " positive"),]) - as.numeric(zscore_TIP[paste0(rownames(activeScore_TIP)[i], " negative"),])
#   }else{
#     activeScore_TIP[i,] = as.numeric(zscore_TIP[path_name,])
#   }
# }
# write.csv(activeScore_TIP, paste0(dest_dir, "/activeScore_TIP.csv"))
# 
# 


# library(IOBR)
# library(ggcorrplot)
# library(corrplot)
# pdf("03.cell_corHeatmap.pdf",height=14,width=14,family='Times')           
# col1 <- colorRampPalette(c("#3777ac", "white","#FF7F00"))
# corr_plot<-corrplot(cor_data,
#                     tl.cex=1.5,
#                     method = "square", #square,circle,method = 'shade'
#                     is.corr = T,
#                     type = "lower", #lower,upper,full
#                     p.mat = corp,
#                     insig = "blank",
#                     outline = "white",
#                     addCoef.col ="black",
#                     col = col1(200),
#                     tl.col = 'black',
#                     tl.offset = 0.4,
#                     number.font = 2,
#                     cl.cex = 0.8,
#                     number.cex = 1.2
# )
# dev.off()
# 
# png("03.cell_corHeatmap.png",height=14,width=14,family='Times',units='in',res=600)           
# col1 <- colorRampPalette(c("#3777ac", "white","#FF7F00"))
# corr_plot<-corrplot(cor_data,
#                     tl.cex=1.5,
#                     method = "square", #square,circle,method = 'shade'
#                     is.corr = T,
#                     type = "lower", #lower,upper,full
#                     p.mat = corp,
#                     insig = "blank",
#                     outline = "white",
#                     addCoef.col ="black",
#                     col = col1(200),
#                     tl.col = 'black',
#                     tl.offset = 0.4,
#                     number.font = 2,
#                     cl.cex = 0.8,
#                     number.cex = 1.2
# )
# dev.off()


# library (dplyr)
# data_expr<-read.csv("../00_rawdata/02.expr_fpkm_TCGA.csv", row.names = 1,check.names = F)
# gene <- read.csv('../09_Lasso/03.lasso_genes.csv')
# 
# expr <- t(data_expr) %>% as.data.frame()
# expr <- expr[,gene$symbol]
# expr <- expr[rownames(expr) %in% group$sample,]
# 
# tiics_result <- read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) %>% lc.tableToNum
# tiics_result <- tiics_result[, group$sample] %>% as.matrix()
# tiics_result <- t(tiics_result) %>% as.data.frame()
# tiics_result <- tiics_result[rownames(expr),]
# 
# diff <- read.csv('02.cell_diff_wilcox_test.csv')
# tiics_result <- tiics_result[,diff$immune_cell]
# 
# identical(rownames(expr), rownames(tiics_result))
# cor_r <- cor(expr,tiics_result,method = "spearman")
# library(psych)
# d <- corr.test(expr,tiics_result,use="complete",method = 'spearman')
# cor_p <- d$p
# cor_r2 <- cor_r %>% as.data.frame %>% tibble::rownames_to_column(var = "gene") %>%
#   tidyr::gather(., cell,Correlation,-gene)# cor
# cor_p2 <- cor_p %>% as.data.frame %>% tibble::rownames_to_column(var = "gene") %>%
#   tidyr::gather(., cell, Pvalue, -gene)# p
# 
# cor_dat <- cbind(cor_r2, cor_p2)[,c("gene","cell","Correlation","Pvalue")]
# write.csv(cor_dat,"04.correlation_cor.csv",row.names = F)
# 
# cor_dat$gene <- factor(cor_dat$gene)
# cor_dat$Pvalue2 <- ifelse(cor_dat$Pvalue<0.05,"P<0.05","ns")
# cor_dat$Correlation2 <- ifelse(abs(cor_dat$Correlation)<=0.3,"No correlation",
#                                ifelse(cor_dat$Correlation>0.3,"Medium or strong+","Medium or strong-"))
# library(ggplot2)
# library(ggthemes)
# bub_plot <- ggplot(data = cor_dat, mapping = aes(x=gene,y=cell)) +
#   geom_point(aes(size=Pvalue2,color=Correlation))+
#   scale_color_gradient2(high = "#EF767A",mid = "white",low = "#456990",limits=c(-1,1),breaks = c(-1,-0.5,0,0.5,1))+
#   theme_few()+
#   theme(text = element_text(family = "Times"),
#         axis.text.y = element_text(size = 15,color = "black"),
#         axis.text.x = element_text(size = 15,face = "bold",angle = 90,hjust = 1,vjust = 1,color = "black"),
#         axis.title.x = element_text(size = 25, face = "bold"),
#         legend.title = element_text(face = "bold",size = 20),
#         legend.text = element_text(face = "bold",size = 12),
#         strip.text = element_text(size = 20))+
#   labs(x="",y="",size="Pvalue",color="Correlation")+
#   geom_text(aes(label = round(Correlation, 2)),vjust = -1, size = 3.5)
# bub_plot
# pdf("04.correlation.pdf", width = 9, height = 10,family = 'Times')
# bub_plot
# dev.off()
# 
# png("04.correlation.png", width = 9, height = 10,units = 'in',res = 300,family = 'Times')
# bub_plot
# dev.off()

rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./15_Immune_pathway")){
  dir.create("./15_Immune_pathway")
}
setwd("./15_Immune_pathway")

path <- read.csv('../00_rawdata/pathway.csv')
path2 <- path %>%  
  separate_rows(Genes, sep = ", ")  
path2 <- path2 %>%  
  separate_rows(Genes, sep = ",")  

library(lance)
library(magrittr)
expr<-read.csv("../00_rawdata/02.expr_tpm_TCGA.csv", row.names = 1,check.names = F)  %>% lance::lc.tableToNum()
colnames(expr)<-gsub('.','-',colnames(expr),fixed = T)
risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv',row.names = 1,check.names = F)
#risk$id <- rownames(risk)
group <- risk[,c("id",'risk')]
expr <- expr[,group$id]
#
high.sample<-group$id[which(group$risk=='High')]
low.sample<-group$id[which(group$risk=='Low')]
expr <- expr[,group$id]

library(GSVA)
gene_set <- path2
length(table(gene_set$Pathway))

dat.final <- as.matrix(expr)
gene_list <- split(as.matrix(gene_set)[,2],
                   gene_set[,1])

ssgsea_score = gsva(dat.final, gene_list, 
                    method = "ssgsea", 
                    ssgsea.norm = TRUE, 
                    verbose = TRUE)
ssgsea_score1 <- as.data.frame(ssgsea_score)
write.csv(ssgsea_score1,
          file = "01.ssgsea_result_cell.csv",
          quote = F)
library(writexl)
library(openxlsx)
wb <- createWorkbook()
addWorksheet(wb,sheetName = 'Data')
writeData(wb,sheet = 'Data',x=ssgsea_score,row.names = T)
saveWorkbook(wb,file = '01.ssgsea_result_cell.xlsx',overwrite = T)

#a <- rownames(expr)%>% as.data.frame()
#diff_vec1 <- setdiff(gene_set$Pathway, rownames(ssgsea_score1))  

colnames(group) <- c('sample', 'group')
group<-group[order(group$group),]

ssgsea_score<-read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1)
ssgsea_score<-ssgsea_score[,group$sample]
annotation_col<-as.data.frame(group$group)
colnames(annotation_col)='Group'
rownames(annotation_col)=colnames(ssgsea_score)
ssgsea_score <- as.matrix(ssgsea_score)

# library(pheatmap)
# color.key<-c("#3777ac","#FF7F00")
# ann_colors<-list(Group = c('Low'="#3777ac",'High'="#FF7F00"))
# p <- pheatmap(
#   ssgsea_score,
#   color = colorRampPalette(color.key)(50),
#   border_color = 'darkgrey',
#   annotation_col = annotation_col,
#   annotation_colors = ann_colors,
#   labels_row = NULL,
#   clustering_method = 'ward.D2',
#   show_rownames = T,
#   show_colnames = F,
#   fontsize_col = 5,
#   cluster_cols = F,
#   cluster_rows = T)
# p
# png(file = "01.heatmap.png", height = 5, width = 8, units = 'in',res = 600,family='Times')
# p
# dev.off()
# pdf(file = "01.heatmap.pdf", height = 5,width = 8,family='Times')
# p
# dev.off()

table(group$group)
group_case<-group[group$group=='High',]
group_case<-as.character(group_case$sample)
group_control<-group[group$group=='Low',]
group_control<-as.character(group_control$sample)

library(dplyr)
tiics_result <- read.csv('./01.ssgsea_result_cell.csv',check.names = F, row.names = 1) 
tiics_result <- tiics_result[, group$sample] %>% as.matrix()
pvalue = padj <- matrix(0, nrow(tiics_result), 1)

for (i in 1:nrow(tiics_result)){
  pvalue[i, 1] = p.value = wilcox.test(tiics_result[i, group_case],
                                       tiics_result[i, group_control])$p.value
}
padj <- p.adjust(as.vector(pvalue), "fdr", n = length(pvalue))
rTable <- data.frame(#log2FoldChange, 
  pvalue, 
  padj,
  row.names = rownames(tiics_result))
rTable$panthway <- rownames(rTable)
rTable$sig <- ifelse(rTable$pvalue < 0.05,
                     ifelse(rTable$pvalue < 0.01, 
                            ifelse(rTable$pvalue < 0.001,
                                   ifelse(rTable$pvalue < 0.0001,
                                          paste(rTable$panthway, "****",  sep = ""),
                                          paste(rTable$panthway, "***", sep = "")),
                                   paste(rTable$panthway, "**", sep = "")),
                            paste(rTable$panthway, "*",  sep = "")), 
                     rTable$panthway)

diff_Table<-rTable[which(rTable$pvalue<0.05),]
dim(diff_Table)
# 18
write.csv(rTable,
          file = "02.all_wilcox_test.csv",
          quote = F,
          row.names = F)
rTable <- read.csv('02.all_wilcox_test.csv')
library(writexl)
library(openxlsx)
wb <- createWorkbook()
addWorksheet(wb,sheetName = 'Data')
writeData(wb,sheet = 'Data',x=rTable,row.names = T)
saveWorkbook(wb,file = '02.all_wilcox_test.xlsx',overwrite = T)


write.csv(diff_Table,
          file = "02.diff_wilcox_test.csv",
          quote = F,
          row.names = F)
diff_Table <- read.csv('02.diff_wilcox_test.csv')
library(writexl)
library(openxlsx)
wb <- createWorkbook()
addWorksheet(wb,sheetName = 'Data')
writeData(wb,sheet = 'Data',x=diff_Table,row.names = T)
saveWorkbook(wb,file = '02.diff_wilcox_test.xlsx',overwrite = T)

paste0(diff_Table$panthway,collapse = "()、")
#"Base_excision_repair()、Cell_cycle()、DNA_replication()、Fanconi_anemia_pathway()、FGFR3-coexpressed_genes()、Homologous_recombination()、Hypoxia()、MicroRNAs_in_cancer()、Mismatch_repair()、Nucleotide_excision_repair()、Oocyte_meiosis()、p53_signaling_pathway()、PPARG_network()、Progesterone-mediated_oocyte_maturation()、Pyrimidine_metabolism()、Spliceosome()、Viral_carcinogenesis()、WNT-β-catenin_network"

library(tidyr)
library(ggplot2)
library(ggpubr)
library(Ipaper)

cell.data <- data.frame(Immune_Cell=rownames(tiics_result), 
                        tiics_result, 
                        pvalue=rTable$pvalue)
#plot.cell <- cell.data[which(cell.data$pvalue<0.05),]
diff_tiics <- rownames(cell.data)
violin_dat <- gather(cell.data, key=Group, value=score, -c("Immune_Cell","pvalue"))

violin_dat$Group <- ifelse(gsub("\\.","-",violin_dat$Group) %in% group_control,
                           "Low", "High") 
violin_dat$Group <- factor(violin_dat$Group, levels = c("Low", "High"))
head(violin_dat)

boxplot_diff_TIICs <- ggplot(violin_dat, aes(x=Immune_Cell, 
                                             y=score,
                                             fill=Group)) +
  #geom_violin(trim=T,color='black') + #, “color=”(#white，)#"trim"TRUE(),。FALSE,。
  stat_boxplot(geom="errorbar",
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.7,
               position=position_dodge(0.9),
               outlier.shape = NA,
               color='black')+ #，width=0.1
  scale_fill_manual(values= c("#3777ac","#FF7F00"))+ #
  labs(title="Cell Infiltration", x="", y = "Score",size=20) +
  stat_compare_means(data = violin_dat,
                     method = "wilcox.test", #，
                     mapping = aes(group = Group),
                     label ="p.signif",
                     hide.ns = F) +
  theme_bw()+#
  ylim(-0.3,1) +
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=16), # 
        axis.text.x=element_text(angle=90,hjust=1,colour="black",face="bold",size=12), #x45，1(hjust = 1)，14
        axis.text.y=element_text(hjust=0.5,colour="black",size=12), #y，，plain
        axis.title.x=element_text(size=16,face="bold"),#x
        axis.title.y=element_text(size=14,face="bold"), #y
        legend.text=element_text(face="bold", hjust = 0.5,colour="black", size=11), #
        legend.title=element_text(face="bold", colour="black", size=11),#
        text=element_text(family = 'Times'),
        #legend.justification=c(-0.1,1.2), #。##(1,1)1()，1。
        #legend.position=c(0, 1.04), #legend.position=c(0,1)，(1,1)。
        panel.grid.major = element_blank(), #
        panel.grid.minor = element_blank()) #
boxplot_diff_TIICs
pdf("02.ssgsea_Box.pdf", width = 12, height = 7,family = 'Times')
print(boxplot_diff_TIICs)
dev.off()

png("02.ssgsea_Box.png", width = 12, height = 6,units = 'in',res = 300,family = 'Times')
boxplot_diff_TIICs
dev.off()
rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (! dir.exists("./16_IC50")){
  dir.create("./16_IC50")
} 
setwd("./16_IC50")

#.libPaths(c("/data/nas2/software/miniconda3/envs/R_4.2.2/lib/R/library", .libPaths()))
library(pRRophetic)
library(ggplot2)
set.seed(12345)
drug<-read.table(file = '/data/nas3//pipeline/IC50_drug/drugs.txt',sep='\t',header=F)
fpkm.log <- read.csv('../00_rawdata/02.expr_tpm_TCGA.csv', row.names = 1,check.names = F)
risk<-read.csv('../07_Train_estimate/01.train_OS_dat.csv')
#colnames(risk)[1] <- 'id'
fpkm.log <- fpkm.log[,risk$id]
identical(colnames(fpkm.log), risk$id)
dat <- fpkm.log

###IC50
a <- data.frame(row.names=colnames(fpkm.log),colnames(fpkm.log)) #

cnt<-1
while (cnt < 139) {
  print(cnt)
  predictedPtype <- pRRopheticPredict(as.matrix(dat), drug[cnt,],selection=1)
  Tipifarnib<-data.frame(predictedPtype)
  colnames(Tipifarnib)<-drug[cnt,]
  a<-cbind(a,Tipifarnib)
  cnt = cnt + 1
}

save.image('IC50.RData')
write.csv(a,'01.IC50.csv')
load('IC50.RData')

#-----------------------------------------------------------------
risk<-read.csv('../07_Train_estimate/01.train_OS_dat.csv')
risk1 <- data.frame(riskScore = risk$riskScore)
rownames(risk1) <- risk$id
risk <- risk1
median(risk$riskScore)  #-0.1099384
#IC50.diff <- read.csv('./01.IC50_diff_wilcox.csv')
IC50 <- read.csv('./01.IC50.csv',row.names = 1)
IC50 <- IC50[,-1]
#IC50 <- t(IC50) %>% as.data.frame()
IC50 <- IC50[rownames(risk),]
library(tidyr)
library(psych)
y <-IC50
x <-risk
identical(rownames(x),rownames(y))

library(psych)
d <- corr.test(x,y,use="complete",method = 'spearman')
r <- t(data.frame(d$r))
p <- t(data.frame(d$p))

identical(rownames(r),rownames(p))
dat.r <- data.frame(r)
dat.r$drug <- rownames(dat.r)
dat.r <- gather(dat.r,'riskScore','r',-c(drug))

dat.p <- data.frame(p)
dat.p$drug <- rownames(dat.p)
dat.p <- gather(dat.p,'riskScore','p',-c(drug))
corrp <- (cbind(dat.r, dat.p))[,c("drug","riskScore",'r',"p")]
corrp$change <- as.factor(ifelse(corrp$p < 0.05 & abs(corrp$r) > 0.3,
                                 ifelse(corrp$r > 0.3, 'Positive correlation', 'Negative correlation'), 'NOT'))
write.csv(corrp,file="01.risk_IC50_cor.csv",quote=F)

corrp.0.05.0.3 <- corrp %>% as.data.frame()
corrp.0.05.0.3 <- corrp.0.05.0.3[corrp.0.05.0.3$p<0.05,]
corrp.0.05.0.3 <- corrp.0.05.0.3[abs(corrp.0.05.0.3$r)>0.3,]
table(corrp.0.05.0.3$change)
write.csv(corrp.0.05.0.3,file="01.cor_0.05_0.3.csv",quote=F)  #50

top <- rbind(head(corrp.0.05.0.3[order(corrp.0.05.0.3$r,decreasing = T),],5),
             head(corrp.0.05.0.3[order(corrp.0.05.0.3$r,decreasing = F),],5)
)
#
#devtools::install_github("kongdd/Ipaper")
library(ggplot2)
library(ggthemes)
library(RColorBrewer)
library(Ipaper)
library(scales)
library(ggrepel)

volcano_plot <- ggplot(data = corrp,
                       aes(x = r, #aesggplot()xy，
                           y = -log10(p),
                           color = change)) +
  scale_color_manual(values = c("#377EB8", "darkgray","#F0027F")) +
  scale_x_continuous(breaks = c(-1,-0.3,0,0.3,1)) +
  scale_y_continuous(#trans = "log1p", #y
    trans = "identity",
    breaks = c(0,1,5,10,20,50,100,200)) +
  geom_point(size = 1.2, alpha = 0.4, na.rm=T) + #
  #theme_bw(base_size = 12, base_family = "Times") + #，。
  theme_classic()+
  geom_vline(xintercept = c(-0.3,0.3), #
             lty = 4,
             col = "darkgray",
             lwd = 0.6)+
  geom_hline(yintercept = -log10(0.05),
             lty = 4,
             col = "darkgray",
             lwd = 0.6)+
  theme(legend.position = "right", #
        panel.grid = element_blank(),
        # legend.title = element_text(face = "bold",color = "black",size = 15),
        legend.title=element_blank(),
        legend.text = element_text(face = "bold",color = "black",size = 13),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(face = "bold",color = "black",size = 15),
        axis.text.y = element_text(face = "bold",color = "black",size = 15),
        axis.title.x = element_text(face = "bold",color = "black",size = 15),
        axis.title.y = element_text(face = "bold",color = "black",size = 15),
        text = element_text(family = "Times")) +
  #geom_label_repel( #
  geom_text_repel(
    data = top,
    aes(label = top$drug),
    max.overlaps = 30,
    size = 4,
    box.padding = unit(0.5, "lines"),
    min.segment.length = 0,
    point.padding = unit(0.8, "lines"),
    force= 2, #
    segment.color = "black",
    show.legend = FALSE,
    family = "Times"
  )+
  annotate("text", x = 0.3 , y = 0,label = "total = 138 drugs",colour="black",family='Times') +
  labs(x = "Correlation coefficient(risk-IC50)",
       y = "-log10 (p value)",
       legend = '')

volcano_plot
pdf('01.risk_IC50_cor.pdf',  w=8,h=6,family='Times')
volcano_plot
dev.off()
png('01.risk_IC50_cor.png',  w=8,h=6,family='Times', units='in',res=600)
volcano_plot
dev.off()


#------------------
corrp.0.05.0.3 <- read.csv('./01.cor_0.05_0.3.csv')
IC50 <- read.csv('./01.IC50.csv',row.names = 1)
IC50 <- IC50[,-1]
#IC50 <- t(IC50) %>% as.data.frame()
IC50 <- IC50[,corrp.0.05.0.3$drug]
IC50$sample <- rownames(IC50)
risk <- read.csv('../07_Train_estimate/01.train_OS_dat.csv')
risk <- risk[,c('id','risk')]
colnames(risk) <- c('sample','risk')
data <- merge(risk, IC50, by = "sample")

data2 <- gather(data,
                key = drug,
                value = IC50,
                -c('sample', 'risk'))
data2 <- data.frame(data2)

##
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(ggsci)
library(rstatix)
stat.test<-data2%>%
  group_by(drug)%>%
  wilcox_test(IC50 ~ risk)%>%
  adjust_pvalue(method = 'fdr')

stat.test$p1<-ifelse(stat.test$p<0.0001,"****",
                     ifelse(stat.test$p<0.001,"***",
                            ifelse(stat.test$p<0.01,"**",
                                   ifelse(stat.test$p<0.05,"*",'ns'))))
write.csv(stat.test,'02.IC50_diff_wilcox.csv')

top <- head(stat.test[order(stat.test$p,decreasing = F),],10)
top.ic50 <- data2[data2$drug %in% top$drug,]

length(unique(top.ic50$drug)) #15
unique(top.ic50$drug)
#------------
library(ggplot2)
library(ggpubr)
top.ic50$risk <- factor(top.ic50$risk,levels = c('Low','High'))
top <- top[order(top$drug,decreasing = F),]
top$drug <- factor(top$drug,levels = top$drug)

exp_plot <- ggplot(top.ic50,aes(x = risk, y = IC50, color = risk)) +
  geom_violin(trim=F,color='black',aes(fill=risk)) + #, “color=”(#white，)
  #"trim"TRUE(),。FALSE,。
  stat_boxplot(geom="errorbar",
               width=0.1,
               color="black",
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.2,
               color="black",
               position=position_dodge(0.9),
               outlier.shape = NA)+ #，width=0.1
  scale_fill_manual(values= c("#0373c9","#eac001"), name = "risk")+
  labs(title="", x="", y = "",size=20) +
  scale_colour_manual(values = c("#0373c9","#eac001"))+
  # scale_color_brewer(palette = 'Set2')+
  stat_compare_means(aes(group = risk,label = "p.format"),
                     method = "wilcox.test",label = 'p.signif',label.x = 1.4)+
  # stat_pvalue_manual(top,
  #                    y.position = c(rep(1,15)),
  #                    size = 3.2,
  #                    family = "Times",
  #                    label = "p1",  #w
  #                    #parse = T,
  #                    face = "bold")+
  theme_bw()+
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
        axis.text.x=element_text(angle=0,hjust=0.5,colour="black",face="bold",size=8),
        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12),
        axis.title.x=element_text(size=16,face="bold"),
        axis.title.y=element_text(size=16,face="bold"),
        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
        legend.title = element_text(face = "bold", size = 12),
        legend.position = "top",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        text=element_text(family='Times'))+
  facet_wrap(~drug,scales = "free",nrow = 2)
exp_plot
ggsave(filename = '02.IC50_diff.pdf',exp_plot,w=8,h=6)
ggsave(filename = '02.IC50_diff.png',exp_plot,w=8,h=6,dpi = 600)

rm(list = ls())
setwd("/data/nas3///29_YQ953-11/")
if (!dir.exists("17_Expression_val")){dir.create("17_Expression_val")}
setwd("17_Expression_val")

library(ggpubr)
gene <- read.csv('../06_Lasso/03.lasso_genes.csv')


## 1--------
dat<-read.csv('../00_rawdata/02.expr_tpm_TCGA.csv',row.names = 1)
colnames(dat)<-gsub('.','-',colnames(dat),fixed = T)
group <- read.csv('../00_rawdata/03.group_TCGA.csv')
table(group$group)
#dat <- dat[group$X]
library(tidyr)
hub_exp<-dat[gene$symbol,]
hub_exp2<-hub_exp
hub_exp2$Symbol<-rownames(hub_exp2)
hub_exp2<-gather(hub_exp2,
                 key = sample,
                 value = expr,
                 -c("Symbol"))

control.sample <- group$X[which(group$group=='Normal')]
## 
hub_exp2$Group<-ifelse(hub_exp2$sample%in%control.sample,'Normal','Tumor')
##
library(ggplot2)
library(tidyverse)
library(ggpubr)
library(ggsci)
library(rstatix)
stat.test<-hub_exp2%>%
  group_by(Symbol)%>%
  wilcox_test(expr ~ Group)%>%
  adjust_pvalue(method = 'fdr')
diff <- read.csv('../01_DEG_TCGA/02.DEG1_sig.csv',row.names = 1)
df<-diff[stat.test$Symbol,]
stat.test$p<-df$pvalue%>%as.numeric()%>%round(digits = 3)
stat.test$p<-ifelse(stat.test$p<0.001,"***",
                    ifelse(stat.test$p<0.01,"**",
                           ifelse(stat.test$p<0.05,"*",'ns')))
library(ggplot2)
library(ggpubr)
hub_exp2$Group <- factor(hub_exp2$Group,levels = c('Normal','Tumor'))
exp_plot <- ggplot(hub_exp2,aes(x = Group, y = expr, color = Group)) +
  geom_violin(trim=F,color="black",aes(fill=Group)) + #, “color=”(#white，)
  #"trim"TRUE(),。FALSE,。
  stat_boxplot(geom="errorbar", 
               width=0.1,
               position = position_dodge(0.9)) +
  geom_boxplot(width=0.4,
               position=position_dodge(0.9),
               outlier.shape = NA)+ #，width=0.1
  scale_fill_manual(values= c("#FF7F00","#e5486e"), name = "Group")+
  labs(title="Expression in Train", x="", y = "",size=20) +
  scale_colour_manual(values = c("#FF7F00","#e5486e"))+
  stat_pvalue_manual(stat.test,
                     #y.position = c(rep(1,5)),
                     y.position = c(15,18,15,7.5),
                     size = 3.2,
                     family = "Times",
                     label = "p",
                     #parse = T,
                     face = "bold")+
  theme_bw()+
  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
        axis.text.x=element_text(angle=45,hjust=1,colour="black",face="bold",size=12), 
        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), 
        axis.title.x=element_text(size=16,face="bold"),
        axis.title.y=element_text(size=16,face="bold"),
        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
        legend.title = element_text(face = "bold", size = 12),
        legend.position = "top",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())+
  facet_wrap(~Symbol,scales = "free",nrow = 1) 
exp_plot
ggsave(filename = '01.expression.pdf',exp_plot,w=8,h=4,dpi = 300)
ggsave(filename = '01.expression.png',exp_plot,w=8,h=4,units = 'in',dpi = 300)


###GEO
#dat<-read.csv('../00_rawdata/dat(GSE30219).csv',row.names = 1)
#group <- read.csv('../00_rawdata/group(GSE30219).csv')
#table(group$group)
#
#hub_exp<-dat[gene$gene,]
#hub_exp2<-hub_exp
#hub_exp2$Symbol<-rownames(hub_exp2)
#hub_exp2<-gather(hub_exp2,
#                 key = sample,
#                 value = expr,
#                 -c("Symbol"))
#
#control.sample <- group$sample[which(group$group=='Normal')]
### 
#hub_exp2$Group<-ifelse(hub_exp2$sample%in%control.sample,'control','LUAD')
###
#library(ggplot2)
#library(tidyverse)
#library(ggpubr)
#library(ggsci)
#library(rstatix)
#stat.test<-hub_exp2%>%
#  group_by(Symbol)%>%
#  wilcox_test(expr ~ Group)%>%
#  adjust_pvalue(method = 'fdr')
## diff <- read.csv('../01_DEG_TCGA/02.DEG_sig.csv',row.names = 1)
## df<-diff[stat.test$Symbol,]
##stat.test$p<-df$pvalue%>%as.numeric()%>%round(digits = 3)
#stat.test$p<-ifelse(stat.test$p<0.001,"***",
#                    ifelse(stat.test$p<0.01,"**",
#                           ifelse(stat.test$p<0.05,"*",'ns')))
#library(ggplot2)
#library(ggpubr)
#hub_exp2$Group <- factor(hub_exp2$Group,levels = c('control','LUAD'))
#exp_plot <- ggplot(hub_exp2,aes(x = Group, y = expr, color = Group)) +
#  geom_violin(trim=F,color="black",aes(fill=Group)) + #, “color=”(#white，)
#  #"trim"TRUE(),。FALSE,。
#  stat_boxplot(geom="errorbar", 
#               width=0.1,
#               position = position_dodge(0.9)) +
#  geom_boxplot(width=0.4,
#               position=position_dodge(0.9),
#               outlier.shape = NA)+ #，width=0.1
#  scale_fill_manual(values= c("#FF7F00","#e5486e"), name = "Group")+
#  labs(title="Expression in Verify", x="", y = "",size=20) +
#  scale_colour_manual(values = c("#FF7F00","#e5486e"))+
#  stat_pvalue_manual(stat.test,
#                     #y.position = c(rep(1,5)),
#                     y.position = c(11,11,12,10),
#                     size = 3.2,
#                     family = "Times",
#                     label = "p",
#                     #parse = T,
#                     face = "bold")+
#  theme_bw()+
#  theme(plot.title = element_text(hjust =0.5,colour="black",face="bold",size=15),
#        axis.text.x=element_text(angle=45,hjust=1,colour="black",face="bold",size=12), 
#        axis.text.y=element_text(hjust=0.5,colour="black",face="bold",size=12), 
#        axis.title.x=element_text(size=16,face="bold"),
#        axis.title.y=element_text(size=16,face="bold"),
#        legend.text=element_text(face = "bold", hjust = 0.5,colour="black", size=12),
#        legend.title = element_text(face = "bold", size = 12),
#        legend.position = "top",
#        panel.grid.major = element_blank(),
#        panel.grid.minor = element_blank())+
#  facet_wrap(~Symbol,scales = "free",nrow = 1) 
#exp_plot
#ggsave(filename = '02.expression-verify.pdf',exp_plot,w=8,h=4,dpi = 300)
#ggsave(filename = '02.expression-verify.png',exp_plot,w=8,h=4,units = 'in',dpi = 300)