#Download expression data and clinical data of head and neck tumors library(stringr) cancer_type="TCGA-HNSC" if(!dir.exists("clinical"))dir.create("clinical") if(!dir.exists("expdataA"))dir.create("expdataA") dir() length(dir("./clinicalA/")) length(dir("./expdataA/")) library(XML) result <- xmlParse("./clinical/142aea0e-7a7b-4ac4-9dbb-0f62e2379599/nationwidechildrens.org_clinical.TCGA-W5-AA2O.xml") rootnode <- xmlRoot(result) rootsize <- xmlSize(rootnode) print(rootnode[1]) #print(rootnode[2]) xmldataframe <- xmlToDataFrame(rootnode[2]) head(t(xmlToDataFrame(rootnode[2]))) xmls = dir("clinicalA/",pattern = "*.xml$",recursive = T) td = function(x){ result <- xmlParse(file.path("clinicalA/",x)) rootnode <- xmlRoot(result) xmldataframe <- xmlToDataFrame(rootnode[2]) return(t(xmldataframe)) } cl = lapply(xmls,td) cl_df <- t(do.call(cbind,cl)) cl_df[1:3,1:3] clinical = data.frame(cl_df) clinical[1:4,1:4] ### Sort out the expression data options(stringsAsFactors = F) x = read.table("expdataA/04303348-e957-474f-9667-0f36ee8cbbf6/c8cb8ef1-b679-4084-a11a-4ccc68d4a062.htseq.counts.gz") x2 = read.table("expdataA/06f89bc6-2e97-4bfa-989c-4934e7f28f05/9901dccf-afd1-4ebf-9efd-5aed4de03636.htseq.counts.gz") identical(x$V1,x2$V1) count_files = dir("expdataA/",pattern = "*.htseq.counts.gz$",recursive = T) ex = function(x){ result <- read.table(file.path("expdataA/",x),row.names = 1,sep = "\t") return(result) } head(ex("04303348-e957-474f-9667-0f36ee8cbbf6/c8cb8ef1-b679-4084-a11a-4ccc68d4a062.htseq.counts.gz")) exp = lapply(count_files,ex) exp <- do.call(cbind,exp) dim(exp) exp[1:4,1:4] meta <- jsonlite::fromJSON("metadata.cart.2022-02-12.json") colnames(meta) ids <- meta$associated_entities;class(ids) ids[[1]] class(ids[[1]][,1]) ID = sapply(ids,function(x){x[,1]}) file2id = data.frame(file_name = meta$file_name, ID = ID) head(file2id$file_name) head(count_files) count_files2 = stringr::str_split(count_files,"/",simplify = T)[,2] count_files2[1] %in% file2id$file_name file2id = file2id[match(count_files2,file2id$file_name),] colnames(exp) = file2id$ID exp[1:4,1:4] dim(exp) exp = exp[apply(exp, 1, function(x) sum(x > 1) > 12), ] dim(exp) exp[1:4,1:4] options(stringsAsFactors = F) if(!file.exists("anno.Rdata")){ #Separate the expression data of lncRNA and mRNA library(rtracklayer) gtf = rtracklayer::import("gencode.v22.annotation.gtf") class(gtf) gtf = as.data.frame(gtf);dim(gtf) colnames(gtf) table(gtf$type) gtf_gene = gtf[gtf$type=="gene",] table(gtf_gene$gene_type) lnc_bype = c("3prime_overlapping_ncRNA", "antisense", "bidirectional_promoter_lncRNA", "lincRNA", "macro_lncRNA", "non_coding", "processed_transcript", "sense_intronic" , "sense_overlapping") table(gtf_gene$gene_type %in% lnc_bype) table(gtf_gene$gene_type == "protein_coding") lnc_anno = gtf_gene[gtf_gene$gene_type %in% lnc_bype,c("gene_name","gene_id","gene_type")] mRNA_anno = gtf_gene[gtf_gene$gene_type == "protein_coding",c("gene_name","gene_id","gene_type")] save(lnc_anno,mRNA_anno,file = "anno.Rdata") } load("anno.Rdata") load("TCGA-HNSCgdc.Rdata") mrnas = intersect(rownames(exp),mRNA_anno$gene_id);length(mrnas) lncrnas = intersect(rownames(exp),lnc_anno$gene_id);length(lncrnas) #4-2.Take a subset of the annotation file and expression matrix according to the intersection mRNA and intersection lncRNA and match them mRNA_exp = exp[mrnas,] lnc_exp = exp[lncrnas,] mRNA_anno_s = mRNA_anno[match(mrnas,mRNA_anno$gene_id),] lnc_anno_s = lnc_anno[match(lncrnas,lnc_anno$gene_id),] identical(rownames(mRNA_exp),mRNA_anno_s$gene_id) identical(rownames(lnc_exp),lnc_anno_s$gene_id) #4-3.Duplicate row names of data boxes and matrices are not allowed, so multiple ensambelid corresponding to one symbol need to be removed.。 k1 = !duplicated(mRNA_anno_s$gene_name);table(k1) k2 = !duplicated(lnc_anno_s$gene_name);table(k2) mRNA_exp = mRNA_exp[k1,] mRNA_anno_s = mRNA_anno_s[k1,] rownames(mRNA_exp) = mRNA_anno_s$gene_name lnc_exp = lnc_exp[k2,] lnc_anno_s = lnc_anno_s[k2,] rownames(lnc_exp) = lnc_anno_s$gene_name mRNA_exp[1:2,1:2] lnc_exp[1:2,1:2] dim(mRNA_exp);dim(lnc_exp) save(lnc_exp,mRNA_exp,file = paste0(cancer_type,"deg_before.Rdata")) #Difference analysis library(edgeR) dge <- DGEList(counts=exprset,group=group_list) dge$samples$lib.size <- colSums(dge$counts) dge <- calcNormFactors(dge) design <- model.matrix(~0+group_list) rownames(design)<-colnames(dge) colnames(design)<-levels(group_list) dge <- estimateGLMCommonDisp(dge,design) dge <- estimateGLMTrendedDisp(dge, design) dge <- estimateGLMTagwiseDisp(dge, design) fit <- glmFit(dge, design) fit2 <- glmLRT(fit, contrast=c(-1,1)) DEG=topTags(fit2, n=nrow(exprset)) DEG=as.data.frame(DEG) logFC_cutoff <- 1 k1 = (DEG$PValue < 0.05)&(DEG$logFC< -1) k2 = (DEG$PValue < 0.05)&(DEG$logFC > 1) DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT")) head(DEG) table(DEG$change) edgeR_DEG <- DEG library(limma) design <- model.matrix(~0+group_list) colnames(design)=levels(group_list) rownames(design)=colnames(exp) dge <- DGEList(counts=exp) dge <- calcNormFactors(dge) v <- voom(dge,design, normalize="quantile") fit <- lmFit(v, design) constrasts = paste(rev(levels(group_list)),collapse = "-") cont.matrix <- makeContrasts(contrasts=constrasts,levels = design) fit2=contrasts.fit(fit,cont.matrix) fit2=eBayes(fit2) DEG = topTable(fit2, coef=constrasts, n=Inf) DEG = na.omit(DEG) logFC_cutoff <- 1 k1 = (DEG$P.Value < 0.05)&(DEG$logFC < -logFC_cutoff) k2 = (DEG$P.Value < 0.05)&(DEG$logFC > logFC_cutoff) DEG$change = ifelse(k1,"DOWN",ifelse(k2,"UP","NOT")) table(DEG$change) head(DEG) limma_voom_DEG <- DEG #Survival analysis library(survival) library(survminer) logrankfile = paste0(cancer_type,"log_rank_p.Rdata") if(!file.exists(logrankfile)){ mySurv=with(meta,Surv(time, event)) log_rank_p <- apply(exprSet , 1 , function(gene){ # gene=exprSet[1,] meta$group=ifelse(gene>median(gene),'high','low') data.survdiff=survdiff(mySurv~group,data=meta) p.val = 1 - pchisq(data.survdiff$chisq, length(data.survdiff$n) - 1) return(p.val) }) log_rank_p=sort(log_rank_p) save(log_rank_p,file = logrankfile) } load(logrankfile) table(log_rank_p<0.01) table(log_rank_p<0.05) #GO library(clusterProfiler) library(org.Hs.eg.db) library(enrichplot) library(ggplot2) library(circlize) library(RColorBrewer) library(dplyr) library(ggpubr) library(ComplexHeatmap) pvalueFilter=0.05 qvalueFilter=0.05 colorSel="qvalue" if(qvalueFilter>0.05){ colorSel="pvalue" } setwd("C:\\Users\\lexb\\Desktop\\communication\\08.GO") rt=read.table("interGenes.List.txt", header=F, sep="\t", check.names=F) genes=unique(as.vector(rt[,1])) entrezIDs=mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA) entrezIDs=as.character(entrezIDs) gene=entrezIDs[entrezIDs!="NA"] #gene=gsub("c\\(\"(\\d+)\".*", "\\1", gene) kk=enrichGO(gene=gene, OrgDb=org.Hs.eg.db, pvalueCutoff=1, qvalueCutoff=1, ont="all", readable=T) GO=as.data.frame(kk) GO=GO[(GO$pvalue=20'), type="points",pch=16,legend_gp=gpar(col=logpvalue.col),title="-log10(pvalue)", title_position = "topcenter",grid_height = unit(5, "mm"),grid_width = unit(5, "mm"), size = unit(3, "mm") ) lgd = packLegend(main.legend,logp.legend) circle_size = unit(1, "snpc") print(circle_size) draw(lgd, x = circle_size*0.85, y=circle_size*0.55,just = "left") dev.off() #KEGG library(clusterProfiler) library(org.Hs.eg.db) library(enrichplot) library(ggplot2) library(circlize) library(RColorBrewer) library(dplyr) library(ComplexHeatmap) pvalueFilter=0.05 qvalueFilter=0.05 colorSel="qvalue" if(qvalueFilter>0.05){ colorSel="pvalue" } setwd("C:\\Users\\lexb\\Desktop\\communication\\09.KEGG") rt=read.table("interGenes.List.txt", header=F, sep="\t", check.names=F) genes=unique(as.vector(rt[,1])) entrezIDs=mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA) entrezIDs=as.character(entrezIDs) rt=data.frame(genes, entrezID=entrezIDs) gene=entrezIDs[entrezIDs!="NA"] kk <- enrichKEGG(gene=gene, organism="hsa", pvalueCutoff=1, qvalueCutoff=1) KEGG=as.data.frame(kk) KEGG$geneID=as.character(sapply(KEGG$geneID,function(x)paste(rt$genes[match(strsplit(x,"/")[[1]],as.character(rt$entrezID))],collapse="/"))) KEGG=KEGG[(KEGG$pvalue