# Download expression data and clinical data of head and neck tumors
library(stringr)
cancer_type <- "TCGA-HNSC"

# Make sure the download folders exist. The clinical folder is "clinicalA"
# because the steps that list and parse the clinical XML files all read from
# "clinicalA/"; creating/reading a differently named "clinical" folder here
# would leave the script looking in two different places.
# NOTE(review): assumes the clinical download really lives in "clinicalA" --
# confirm against the folder on disk.
if (!dir.exists("clinicalA")) dir.create("clinicalA")
if (!dir.exists("expdataA")) dir.create("expdataA")
dir()

# Number of entries present in each download folder.
length(dir("./clinicalA/"))
length(dir("./expdataA/"))

# Parse one clinical XML file by hand to see how it is laid out before
# processing all of them.
library(XML)
result <- xmlParse("./clinicalA/142aea0e-7a7b-4ac4-9dbb-0f62e2379599/nationwidechildrens.org_clinical.TCGA-W5-AA2O.xml")
rootnode <- xmlRoot(result)
rootsize <- xmlSize(rootnode)
print(rootnode[1])
#print(rootnode[2])
# The second child of the root is converted to a data frame; transposing it
# gives one row per field, which is easier to skim.
xmldataframe <- xmlToDataFrame(rootnode[2])
head(t(xmldataframe))

# Relative paths of every clinical XML file below "clinicalA/".
# `pattern` is a regular expression, not a shell glob, so the extension is
# matched with an escaped dot anchored at the end of the file name.
xmls <- dir("clinicalA/", pattern = "\\.xml$", recursive = TRUE)

# Parse one clinical XML file and return its contents transposed.
#
# x: path of the XML file relative to "clinicalA/".
# Returns the transpose of the data frame built from the second child of the
# XML root node.
td <- function(x) {
  xml_root <- xmlRoot(xmlParse(file.path("clinicalA/", x)))
  t(xmlToDataFrame(xml_root[2]))
}

# Parse every clinical XML file, bind the per-file results side by side and
# transpose so that each file becomes one row.
cl <- lapply(xmls, td)
cl_df <- do.call(cbind, cl)
cl_df <- t(cl_df)
cl_df[1:3, 1:3]

# Keep a data-frame copy of the merged clinical table.
clinical <- data.frame(cl_df)
clinical[1:4, 1:4]


### Sort out the expression data

options(stringsAsFactors = FALSE)

# Read two count files and check that their first column (V1) is identical,
# i.e. that the rows come in the same order in both files.
x <- read.table("expdataA/04303348-e957-474f-9667-0f36ee8cbbf6/c8cb8ef1-b679-4084-a11a-4ccc68d4a062.htseq.counts.gz")
x2 <- read.table("expdataA/06f89bc6-2e97-4bfa-989c-4934e7f28f05/9901dccf-afd1-4ebf-9efd-5aed4de03636.htseq.counts.gz")
identical(x$V1, x2$V1)

# Relative paths of every HTSeq count file below "expdataA/".
# `pattern` is a regular expression, not a shell glob, so the dots of the
# extension are escaped and the match is anchored at the end of the name.
count_files <- dir("expdataA/", pattern = "\\.htseq\\.counts\\.gz$",
                   recursive = TRUE)

# Read one tab-separated count file.
#
# x: path of the file relative to "expdataA/".
# Returns a data frame whose row names are taken from the first column of the
# file; the remaining column(s) hold the values.
ex <- function(x) {
  count_path <- file.path("expdataA/", x)
  read.table(count_path, sep = "\t", row.names = 1)
}
# Quick look at a single parsed count file.
head(ex("04303348-e957-474f-9667-0f36ee8cbbf6/c8cb8ef1-b679-4084-a11a-4ccc68d4a062.htseq.counts.gz"))

# Read every count file and bind them column-wise: one column per file.
exp <- do.call(cbind, lapply(count_files, ex))
dim(exp)
exp[1:4, 1:4]

# Use the cart metadata to rename the columns of `exp` (one per count file)
# and then drop rows with too few non-trivial counts.
meta <- jsonlite::fromJSON("metadata.cart.2022-02-12.json")
colnames(meta)
ids <- meta$associated_entities
class(ids)
ids[[1]]
class(ids[[1]][, 1])

# One ID per metadata record, taken from the first column of its
# associated-entities table. vapply() fails loudly if a record does not yield
# exactly one value instead of silently returning a list.
# NOTE(review): assumes column 1 of associated_entities holds the sample
# identifier wanted as column name -- confirm against the JSON.
ID <- vapply(ids, function(x) as.character(x[, 1]), character(1))
file2id <- data.frame(file_name = meta$file_name,
                      ID = ID)

head(file2id$file_name)
head(count_files)
# count_files are "<folder>/<file name>"; keep the file-name part only.
count_files2 <- stringr::str_split(count_files, "/", simplify = TRUE)[, 2]
count_files2[1] %in% file2id$file_name

# Every count file must have a metadata record; otherwise match() returns NA
# and the corresponding expression column would silently be named NA.
missing_meta <- setdiff(count_files2, file2id$file_name)
if (length(missing_meta) > 0) {
  stop("No metadata entry for count file(s): ",
       paste(missing_meta, collapse = ", "), call. = FALSE)
}

# Reorder the mapping to the column order of `exp`, then rename the columns.
file2id <- file2id[match(count_files2, file2id$file_name), ]
colnames(exp) <- file2id$ID
exp[1:4, 1:4]

# Keep rows whose count is greater than 1 in more than 12 columns.
dim(exp)
exp <- exp[rowSums(exp > 1) > 12, ]
dim(exp)
exp[1:4, 1:4]
options(stringsAsFactors = FALSE)

# Build the gene annotation tables once and cache them in "anno.Rdata";
# later runs only load the cache.
if (!file.exists("anno.Rdata")) {
  # Separate the expression data of lncRNA and mRNA
  library(rtracklayer)
  gtf <- rtracklayer::import("gencode.v22.annotation.gtf")
  # The bare inspection calls in this branch sit inside braces, so their
  # values are not auto-printed when the script runs.
  class(gtf)
  gtf <- as.data.frame(gtf)
  dim(gtf)
  colnames(gtf)
  table(gtf$type)

  # Gene-level records only.
  gtf_gene <- gtf[gtf$type == "gene", ]
  table(gtf_gene$gene_type)

  # gene_type values treated as lncRNA.
  lnc_bype <- c("3prime_overlapping_ncRNA", "antisense", "bidirectional_promoter_lncRNA", "lincRNA", "macro_lncRNA", "non_coding", "processed_transcript", "sense_intronic" , "sense_overlapping")
  is_lnc <- gtf_gene$gene_type %in% lnc_bype
  is_mrna <- gtf_gene$gene_type == "protein_coding"
  table(is_lnc)
  table(is_mrna)

  # Same three annotation columns for both tables.
  anno_cols <- c("gene_name", "gene_id", "gene_type")
  lnc_anno <- gtf_gene[is_lnc, anno_cols]
  mRNA_anno <- gtf_gene[is_mrna, anno_cols]
  save(lnc_anno, mRNA_anno, file = "anno.Rdata")
}
load("anno.Rdata")

# Load previously saved objects (the contents of this file are not visible
# in this script).
load("TCGA-HNSCgdc.Rdata")

# Gene IDs shared by the expression matrix and each annotation table.
mrnas <- intersect(rownames(exp), mRNA_anno$gene_id)
length(mrnas)
lncrnas <- intersect(rownames(exp), lnc_anno$gene_id)
length(lncrnas)

# 4-2. Subset the expression matrix and the annotation tables to the shared
# mRNA / lncRNA IDs and put both in the same order.

mRNA_exp <- exp[mrnas, ]
lnc_exp <- exp[lncrnas, ]

mRNA_anno_s <- mRNA_anno[match(mrnas, mRNA_anno$gene_id), ]
lnc_anno_s <- lnc_anno[match(lncrnas, lnc_anno$gene_id), ]
identical(rownames(mRNA_exp), mRNA_anno_s$gene_id)
identical(rownames(lnc_exp), lnc_anno_s$gene_id)

# 4-3. Row names of data frames and matrices must be unique, so when several
# Ensembl IDs map to the same gene symbol only the first one is kept.

k1 <- !duplicated(mRNA_anno_s$gene_name)
table(k1)
k2 <- !duplicated(lnc_anno_s$gene_name)
table(k2)

# Apply the de-duplication to expression and annotation together, then switch
# the row names from gene IDs to gene symbols.
mRNA_anno_s <- mRNA_anno_s[k1, ]
mRNA_exp <- mRNA_exp[k1, ]
rownames(mRNA_exp) <- mRNA_anno_s$gene_name

lnc_anno_s <- lnc_anno_s[k2, ]
lnc_exp <- lnc_exp[k2, ]
rownames(lnc_exp) <- lnc_anno_s$gene_name

mRNA_exp[1:2, 1:2]
lnc_exp[1:2, 1:2]

dim(mRNA_exp)
dim(lnc_exp)

# Save both matrices as the input for the differential analysis.
save(lnc_exp, mRNA_exp, file = paste0(cancer_type, "deg_before.Rdata"))

#Difference analysis
library(edgeR)

# NOTE(review): `exprset` and `group_list` are not created anywhere in the
# visible part of this script -- presumably they come from a loaded .Rdata
# file; confirm. `group_list` must be a factor with exactly two levels for
# the c(-1, 1) contrast below to be valid.
dge <- DGEList(counts = exprset, group = group_list)
dge$samples$lib.size <- colSums(dge$counts)
dge <- calcNormFactors(dge)

# One column per group, no intercept.
design <- model.matrix(~0 + group_list)
rownames(design) <- colnames(dge)
colnames(design) <- levels(group_list)

dge <- estimateGLMCommonDisp(dge, design)
dge <- estimateGLMTrendedDisp(dge, design)
dge <- estimateGLMTagwiseDisp(dge, design)

fit <- glmFit(dge, design)
# Contrast = second design column minus first design column.
fit2 <- glmLRT(fit, contrast = c(-1, 1))

DEG <- topTags(fit2, n = nrow(exprset))
DEG <- as.data.frame(DEG)

# Label each gene using the shared fold-change cutoff (the thresholds used to
# be hard-coded as 1 even though logFC_cutoff was defined right above them).
logFC_cutoff <- 1
k1 <- (DEG$PValue < 0.05) & (DEG$logFC < -logFC_cutoff)
k2 <- (DEG$PValue < 0.05) & (DEG$logFC > logFC_cutoff)
DEG$change <- ifelse(k1, "DOWN", ifelse(k2, "UP", "NOT"))

head(DEG)
table(DEG$change)
edgeR_DEG <- DEG

library(limma)

# One column per group, no intercept.
# NOTE(review): this section uses `exp` as the count matrix while the edgeR
# section uses `exprset`; `group_list` is not defined in the visible part of
# the script. Confirm both refer to the same samples in the same order.
design <- model.matrix(~0 + group_list)
colnames(design) <- levels(group_list)
rownames(design) <- colnames(exp)

dge <- DGEList(counts = exp)
dge <- calcNormFactors(dge)

# Arguments are spelled out in full (normalize.method, number) instead of
# relying on partial argument matching.
v <- voom(dge, design, normalize.method = "quantile")
fit <- lmFit(v, design)

# Contrast string "<last level>-<first level>" built from the group levels.
constrasts <- paste(rev(levels(group_list)), collapse = "-")
cont.matrix <- makeContrasts(contrasts = constrasts, levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)

DEG <- topTable(fit2, coef = constrasts, number = Inf)
DEG <- na.omit(DEG)

logFC_cutoff <- 1

# Label each gene by P value and direction of the log fold change.
k1 <- (DEG$P.Value < 0.05) & (DEG$logFC < -logFC_cutoff)
k2 <- (DEG$P.Value < 0.05) & (DEG$logFC > logFC_cutoff)
DEG$change <- ifelse(k1, "DOWN", ifelse(k2, "UP", "NOT"))
table(DEG$change)
head(DEG)

limma_voom_DEG <- DEG
#Survival analysis
library(survival)
library(survminer)

# Per-gene log-rank test, cached on disk: the p-values are computed only when
# the cache file does not exist yet.
# NOTE(review): `meta` must provide `time` and `event` columns here and
# `exprSet` is not defined in the visible part of the script -- presumably
# both come from a loaded .Rdata file; confirm.
logrankfile <- paste0(cancer_type, "log_rank_p.Rdata")
if (!file.exists(logrankfile)) {
  mySurv <- with(meta, Surv(time, event))
  log_rank_p <- apply(exprSet, 1, function(gene) {
    # gene=exprSet[1,]
    # Split samples at the gene's median (this only changes the function's
    # local copy of `meta`).
    meta$group <- ifelse(gene > median(gene), "high", "low")
    # survdiff() cannot compare a single group; this happens when no sample
    # lies above the median (e.g. a constant row). Such genes get NA and are
    # dropped by sort() below.
    if (length(unique(meta$group)) < 2) {
      return(NA_real_)
    }
    data.survdiff <- survdiff(mySurv ~ group, data = meta)
    # Upper-tail probability computed directly: 1 - pchisq() loses precision
    # for very small p-values.
    pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1,
           lower.tail = FALSE)
  })
  log_rank_p <- sort(log_rank_p)
  save(log_rank_p, file = logrankfile)
}
load(logrankfile)
table(log_rank_p < 0.01)
table(log_rank_p < 0.05)

#GO
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ggplot2)
library(circlize)
library(RColorBrewer)
library(dplyr)
library(ggpubr)
library(ComplexHeatmap)

# Significance thresholds applied to the enrichment table.
pvalueFilter <- 0.05
qvalueFilter <- 0.05

# Colour the plots by q-value unless the q-value filter is relaxed above 0.05.
colorSel <- "qvalue"
if (qvalueFilter > 0.05) {
  colorSel <- "pvalue"
}

# NOTE(review): hard-coded, machine-specific working directory; the input
# list is read from it and all outputs below are written into it.
setwd("C:\\Users\\lexb\\Desktop\\communication\\08.GO")
rt <- read.table("interGenes.List.txt", header = FALSE, sep = "\t",
                 check.names = FALSE)

# Gene symbols (first column) -> Entrez IDs. mget() returns a list and a
# symbol can map to several IDs; as.character() on that list would turn such
# an entry into the text 'c("id1", "id2")', which is not a valid ID. The first
# ID of each symbol is used instead; symbols without a mapping become NA and
# are dropped.
genes <- unique(as.vector(rt[, 1]))
entrezIDs <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound = NA)
entrezIDs <- vapply(entrezIDs, function(ids) as.character(ids[1]),
                    character(1), USE.NAMES = FALSE)
gene <- entrezIDs[!is.na(entrezIDs)]

# Enrichment is run without cutoffs; filtering is applied to the table below.
kk <- enrichGO(gene = gene, OrgDb = org.Hs.eg.db, pvalueCutoff = 1,
               qvalueCutoff = 1, ont = "all", readable = TRUE)
GO <- as.data.frame(kk)
GO <- GO[(GO$pvalue < pvalueFilter & GO$qvalue < qvalueFilter), ]

write.table(GO, file = "GO.txt", sep = "\t", quote = FALSE, row.names = FALSE)

# Bar plot, one facet per ontology (`scales` spelled out in full rather than
# relying on partial argument matching).
pdf(file = "barplot.pdf", width = 12, height = 7)
bar <- barplot(kk, drop = TRUE, showCategory = 10, label_format = 100,
               split = "ONTOLOGY", color = colorSel) +
  facet_grid(ONTOLOGY ~ ., scales = "free")
print(bar)
dev.off()

# Bubble plot, one facet per ontology.
pdf(file = "bubble.pdf", width = 12, height = 7)
bub <- dotplot(kk, showCategory = 10, orderBy = "GeneRatio",
               label_format = 100, split = "ONTOLOGY", color = colorSel) +
  facet_grid(ONTOLOGY ~ ., scales = "free")
print(bub)
dev.off()

# Prepare the tables drawn by the circular GO plot.

# Colours for BP, CC and MF, in that order.
ontology.col <- c("#00AFBB", "#E7B800", "#90EE90")

# Up to six terms with the smallest p-values from each ontology.
data <- GO[order(GO$pvalue), ]
datasig <- data[data$pvalue < 0.05, , drop = FALSE]
BP <- head(datasig[datasig$ONTOLOGY == "BP", , drop = FALSE], 6)
CC <- head(datasig[datasig$ONTOLOGY == "CC", , drop = FALSE], 6)
MF <- head(datasig[datasig$ONTOLOGY == "MF", , drop = FALSE], 6)
data <- rbind(BP, CC, MF)

# Fixed factor levels keep each ontology on its own colour even when one of
# the ontologies has no term at all; with as.factor() the codes would shift
# and the sector colours would no longer match the BP/CC/MF colour order.
main.col <- ontology.col[as.integer(factor(data$ONTOLOGY,
                                           levels = c("BP", "CC", "MF")))]

# Numerators of "a/b" ratio strings: BgRatio and GeneRatio.
BgGene <- as.numeric(vapply(strsplit(data$BgRatio, "/"), `[`, character(1), 1))
Gene <- as.numeric(vapply(strsplit(data$GeneRatio, "/"), `[`, character(1), 1))
ratio <- Gene / BgGene

# -log10(p) mapped onto an eight-step red palette.
logpvalue <- -log10(data$pvalue)
logpvalue.col <- brewer.pal(n = 8, name = "Reds")
f <- colorRamp2(breaks = c(0, 2, 4, 6, 8, 10, 15, 20), colors = logpvalue.col)
BgGene.col <- f(logpvalue)

# df: one sector per GO term, all of the same width (largest BgGene).
df <- data.frame(GO = data$ID, start = 1, end = max(BgGene))
rownames(df) <- df$GO
# bed2: bar length BgGene, coloured by -log10(p).
bed2 <- data.frame(GO = data$ID, start = 1, end = BgGene, BgGene = BgGene,
                   BgGene.col = BgGene.col)
# bed3: bar length Gene.
bed3 <- data.frame(GO = data$ID, start = 1, end = Gene, BgGene = Gene)
# bed4: full-width bar whose height encodes the ratio, rescaled so that the
# largest ratio becomes 9.5.
bed4 <- data.frame(GO = data$ID, start = 1, end = max(BgGene), ratio = ratio,
                   col = main.col)
bed4$ratio <- bed4$ratio / max(bed4$ratio) * 9.5

# Draw the circular GO plot into "GO.circlize.pdf": one sector per row of
# `df`, four concentric tracks (sector label, bed2, bed3, bed4) and three
# legends. The statements below depend on the graphics-device state, so their
# order matters.
pdf("GO.circlize.pdf",width=10,height=10)
par(omi=c(0.1,0.1,0.1,1.5))
circos.par(track.margin=c(0.01,0.01))
circos.genomicInitialize(df,plotType="none")
# Track 1: the sector name written along the arc, background filled with
# main.col.
circos.trackPlotRegion(ylim = c(0, 1), panel.fun = function(x, y) {
  sector.index = get.cell.meta.data("sector.index")
  xlim = get.cell.meta.data("xlim")
  ylim = get.cell.meta.data("ylim")
  circos.text(mean(xlim), mean(ylim), sector.index, cex = 0.8, facing = "bending.inside", niceFacing = TRUE)
}, track.height = 0.08, bg.border = NA,bg.col = main.col)

# Axis on top of track 1 for every sector, major ticks every 100 units.
for(si in get.all.sector.index()) {
  circos.axis(h = "top", labels.cex = 0.6, sector.index = si,track.index = 1,
              major.at=seq(0,max(BgGene),by=100),labels.facing = "clockwise")
}
# NOTE(review): `f` is reassigned here but not referenced again in this
# section.
f = colorRamp2(breaks = c(-1, 0, 1), colors = c("green", "black", "red"))
# Track 2 (bed2): rectangles coloured by the second value column and labelled
# with the first value column. `i` is assigned but not used.
circos.genomicTrack(bed2, ylim = c(0, 1),track.height = 0.1,bg.border="white",
                    panel.fun = function(region, value, ...) {
                      i = getI(...)
                      circos.genomicRect(region, value, ytop = 0, ybottom = 1, col = value[,2], 
                                         border = NA, ...)
                      circos.genomicText(region, value, y = 0.4, labels = value[,1], adj=0,cex=0.8,...)
                    })
# Track 3 (bed3): rectangles in a fixed colour, labelled with the first value
# column. `i` is assigned but not used.
circos.genomicTrack(bed3, ylim = c(0, 1),track.height = 0.1,bg.border="white",
                    panel.fun = function(region, value, ...) {
                      i = getI(...)
                      circos.genomicRect(region, value, ytop = 0, ybottom = 1, col = '#BA55D3', 
                                         border = NA, ...)
                      circos.genomicText(region, value, y = 0.4, labels = value[,1], cex=0.9,adj=0,...)
                    })
# Track 4 (bed4): nine white guide lines at tenths of the cell height, then a
# bar whose height is the first value column and whose colour is the second.
circos.genomicTrack(bed4, ylim = c(0, 10),track.height = 0.35,bg.border="white",bg.col="grey90",
                    panel.fun = function(region, value, ...) {
                      cell.xlim = get.cell.meta.data("cell.xlim")
                      cell.ylim = get.cell.meta.data("cell.ylim")
                      for(j in 1:9) {
                        y = cell.ylim[1] + (cell.ylim[2]-cell.ylim[1])/10*j
                        circos.lines(cell.xlim, c(y, y), col = "#FFFFFF", lwd = 0.3)
                      }
                      circos.genomicRect(region, value, ytop = 0, ybottom = value[,1], col = value[,2], 
                                         border = NA, ...)
                      #circos.genomicText(region, value, y = 0.3, labels = value[,1], ...)
                    })
circos.clear()
# Legend naming the three inner tracks.
middle.legend = Legend(
  labels = c('Number of Genes','Number of Select','Rich Factor(0-1)'),
  type="points",pch=c(15,15,17),legend_gp = gpar(col=c('pink','#BA55D3',ontology.col[1])),
  title="",nrow=3,size= unit(3, "mm")
)
circle_size = unit(1, "snpc")
draw(middle.legend,x=circle_size*0.42)
# Ontology colour legend; labels follow the order of ontology.col.
main.legend = Legend(
  labels = c("Biological Process","Cellular Component", "Molecular Function"),  type="points",pch=15,
  legend_gp = gpar(col=ontology.col), title_position = "topcenter",
  title = "ONTOLOGY", nrow = 3,size = unit(3, "mm"),grid_height = unit(5, "mm"),
  grid_width = unit(5, "mm")
)
# Legend for the -log10(pvalue) colour scale (logpvalue.col).
logp.legend = Legend(
  labels=c('(0,2]','(2,4]','(4,6]','(6,8]','(8,10]','(10,15]','(15,20]','>=20'),
  type="points",pch=16,legend_gp=gpar(col=logpvalue.col),title="-log10(pvalue)",
  title_position = "topcenter",grid_height = unit(5, "mm"),grid_width = unit(5, "mm"),
  size = unit(3, "mm")
)
# Stack the two legends and draw them to the right of the circle.
lgd = packLegend(main.legend,logp.legend)
circle_size = unit(1, "snpc")
print(circle_size)
draw(lgd, x = circle_size*0.85, y=circle_size*0.55,just = "left")
dev.off()

#KEGG
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(ggplot2)
library(circlize)
library(RColorBrewer)
library(dplyr)
library(ComplexHeatmap)

# Significance thresholds applied to the enrichment table.
pvalueFilter <- 0.05
qvalueFilter <- 0.05

# Colour the plots by q-value unless the q-value filter is relaxed above 0.05.
colorSel <- "qvalue"
if (qvalueFilter > 0.05) {
  colorSel <- "pvalue"
}

# NOTE(review): hard-coded, machine-specific working directory; the input
# list is read from it and all outputs below are written into it.
setwd("C:\\Users\\lexb\\Desktop\\communication\\09.KEGG")
rt <- read.table("interGenes.List.txt", header = FALSE, sep = "\t",
                 check.names = FALSE)

# Gene symbols (first column) -> Entrez IDs. mget() returns a list and a
# symbol can map to several IDs; as.character() on that list would turn such
# an entry into the text 'c("id1", "id2")', which is not a valid ID. The first
# ID of each symbol is used instead; symbols without a mapping become NA and
# are dropped from the query.
genes <- unique(as.vector(rt[, 1]))
entrezIDs <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound = NA)
entrezIDs <- vapply(entrezIDs, function(ids) as.character(ids[1]),
                    character(1), USE.NAMES = FALSE)
# Symbol <-> Entrez lookup table used below to translate the result back.
rt <- data.frame(genes, entrezID = entrezIDs)
gene <- entrezIDs[!is.na(entrezIDs)]

# Enrichment is run without cutoffs; filtering is applied to the table below.
kk <- enrichKEGG(gene = gene, organism = "hsa", pvalueCutoff = 1,
                 qvalueCutoff = 1)
KEGG <- as.data.frame(kk)
# Replace the "/"-separated Entrez IDs of each pathway by gene symbols.
KEGG$geneID <- vapply(KEGG$geneID, function(x) {
  hit_ids <- strsplit(x, "/")[[1]]
  paste(rt$genes[match(hit_ids, as.character(rt$entrezID))], collapse = "/")
}, character(1), USE.NAMES = FALSE)
KEGG <- KEGG[(KEGG$pvalue < pvalueFilter & KEGG$qvalue < qvalueFilter), ]

write.table(KEGG, file = "KEGG.txt", sep = "\t", quote = FALSE,
            row.names = FALSE)

# Show at most 30 pathways, fewer if fewer pass the filters.
showNum <- min(30, nrow(KEGG))

# The plots are printed explicitly, as in the GO section, so the PDFs are
# also filled when the script is run through source().
pdf(file = "barplotKEGG.pdf", width = 12, height = 7)
print(barplot(kk, drop = TRUE, showCategory = showNum, label_format = 100,
              color = colorSel))
dev.off()

pdf(file = "bubbleKEGG.pdf", width = 12, height = 7)
print(dotplot(kk, showCategory = showNum, orderBy = "GeneRatio",
              label_format = 100, color = colorSel))
dev.off()