### Code for PCA gene <- t(exprSet) library(FactoMineR) gene.pca <- PCA(gene, ncp = 2, scale.unit = TRUE, graph = FALSE) plot(gene.pca) pca_sample <- data.frame(gene.pca$ind$coord[ ,1:2]) head(pca_sample) pca_eig1 <- round(gene.pca$eig[1,2], 2) pca_eig2 <- round(gene.pca$eig[2,2],2 ) group <- cbind(colnames(exprSet),group_list) rownames(group)<- colnames(exprSet) colnames(group) <- c("samples","group") head(group) group <- group[rownames(pca_sample), ] head(group) pca_sample <- cbind(pca_sample, group) head(pca_sample) library(ggplot2) p <- ggplot(data = pca_sample, aes(x = Dim.1, y = Dim.2)) + geom_point(aes(color = group), size = 1) + theme(panel.grid = element_blank(), panel.background = element_rect(color = 'black', fill = 'transparent'), legend.key = element_rect(fill = 'transparent')) + labs(x = paste('PCA1:', pca_eig1, '%'), y = paste('PCA2:', pca_eig2, '%'), color = '') p pp <- p + stat_ellipse(aes(color = group), level = 0.95, show.legend = FALSE)+theme_bw() pdf("Fig.PCA.pdf",width = 4,height = 3) pp dev.off() ### Code for differential expression analysis library(limma) design <- model.matrix(~0+factor(group_list)) design colnames(design)=levels(factor(group_list)) rownames(design)=colnames(exprSet) design contrast.matrix<-makeContrasts("UC-Con","CAD-UC","CAD-Con",levels = design) contrast.matrix deg = function(exprSet,design,contrast.matrix){ ##step1 fit <- lmFit(exprSet,design) ##step2 fit2 <- contrasts.fit(fit, contrast.matrix) fit2 <- eBayes(fit2) ##step3 tempOutput = topTable(fit2, coef=3, n=Inf) nrDEG = na.omit(tempOutput) head(nrDEG) return(nrDEG) } nrDEG = deg(exprSet,design,contrast.matrix) DEG=nrDEG logFC_cutoff <- with(DEG,mean(abs( logFC)) + 2*sd(abs( logFC)) ) logFC_cutoff logFC_cutoff <- 0.3 DEG$change = as.factor(ifelse(DEG$adj.P.Val < 0.05 & abs(DEG$logFC) > logFC_cutoff, ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT') ) this_tile <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3), '\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) , '\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',]) ) this_tile write.csv(DEG,file = 'GSE47908-DEG.csv') ### Code for volcano plot library(ggplot2) g = ggplot(data=DEG, aes(x=logFC, y=-log10(adj.P.Val), color=change)) + geom_point(alpha=0.8, size=0.3) + theme_bw()+ theme(panel.grid=element_blank())+ xlab("log2 fold change") + ylab("-log10 adj.P.Val") + theme(plot.title = element_text(size=15,hjust = 0.5))+ scale_colour_manual(values = c('blue','black','red')) + geom_vline(xintercept=c(-logFC_cutoff,logFC_cutoff),lty=2,col="black",lwd=0.4) + geom_hline(yintercept = -log10(0.05),lty=2,col="black",lwd=0.6)+ #ggtitle( this_tile ) + geom_vline(xintercept = c(-0.804,0.804),lty=2,col="grey",lwd=0.6) pdf("Fig.volcano.pdf",width = 5,height = 4) g dev.off() ### Code for heatmap y1 <- rownames(DEG[DEG$change=="UP",]) y2 <- rownames(DEG[DEG$change=="DOWN",]) x <- c(y1[c(1:50)],y2[c(1:50)]) gene <- exprSet[x,] library(pheatmap) pdf("Fig.heatmap.pdf",width = 4.8,height =4.8 ) pheatmap(gene,scale = "row",clustering_method = "complete", clustering_distance_rows = "correlation", color = colorRampPalette(c( "blue", "white", "red"))(50), fontsize = 4, fontsize_row =1.5, fontsize_col = 4, cluster_cols =T, cluster_rows = T,border_color =NA,cellwidth = 6,cellheight = 2.3) dev.off() ### Code for WGCNA DEG <- DEG[DEG$adj.P.Val<0.05,] DEG$FC <- abs(DEG$logFC) gene <- DEG[DEG$FC>0.3,] gene <- exprSet[rownames(gene),] library(reshape2) library(stringr) library(WGCNA) options(stringsAsFactors = FALSE) enableWGCNAThreads() trait <- read.csv('trait.csv',header = T,row.names = 1,check.names = FALSE) gene <- subset(gene, rowSums(gene)/ncol(gene) >= 1) gene <- as.data.frame(t(gene)) if(T){ powers <- c(c(1:10), seq(from = 12, to=30, by=2)) sft <- pickSoftThreshold(gene, powerVector = powers, verbose = 5) pdf("Fig.soft_threshod.pdf",width = 11,height = 5) par(mfrow = c(1, 2)) plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], type = 'n', xlab = 'Soft Threshold (power)', ylab = 'Scale Free Topology Model Fit,signed R^2', main = paste('Scale independence')) text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2], labels = powers, col = 'red'); abline(h = 0.80, col = 'red') plot(sft$fitIndices[,1], sft$fitIndices[,5], xlab = 'Soft Threshold (power)', ylab = 'Mean Connectivity', type = 'n', main = paste('Mean connectivity')) text(sft$fitIndices[,1], sft$fitIndices[,5], labels = powers, col = 'red') dev.off() power <- 12 save(gene,trait,sft,power,file = "WGCNAstep1-Soft Threshold.Rdata") } if(T){ rm(list=ls()) load("WGCNAstep1-Soft Threshold.Rdata") cor <- WGCNA::cor net = blockwiseModules(gene, power = power, TOMType = "unsigned", minModuleSize = 30, reassignThreshold = 0, mergeCutHeight = 0.28, numericLabels = TRUE, pamRespectsDendro = FALSE, saveTOMs = F, corType = "pearson", loadTOMs=TRUE, saveTOMFileBase = paste0("gene", ".tom"), verbose = 3 ) table(net$colors) # open a graphics window sizeGrWindow(12, 9) # Convert labels to colors for plotting moduleLabels = net$colors moduleColors = labels2colors(moduleLabels) table(moduleColors) # Plot the dendrogram and the module colors underneath pdf("Fig.Cluster Dendrogram.pdf",width = 7,height = 6) plotDendroAndColors(net$dendrograms[[1]], moduleColors[net$blockGenes[[1]]], "Module colors", dendroLabels = FALSE, hang = 0.03, addGuide = TRUE, guideHang = 0.05) dev.off() MEs = net$MEs MEs_col = MEs colnames(MEs_col) = paste0("ME", labels2colors( as.numeric(str_replace_all(colnames(MEs),"ME","")))) MEs_col = orderMEs(MEs_col) head(MEs_col) save(net,MEs_col,moduleColors,file = "WGCNAstep2-co-expression nekwork.Rdata") ) } if(T){ rm(list=ls()) load("WGCNAstep1-Soft Threshold.Rdata") load("WGCNAstep2-co-expression nekwork.Rdata") nGenes = ncol(gene) nSamples = nrow(gene) moduleTraitCor_noFP <- cor(MEs_col, trait, use = "p"); moduleTraitPvalue_noFP = corPvalueStudent(moduleTraitCor_noFP, nSamples); textMatrix_noFP <- paste(signif(moduleTraitCor_noFP, 2), "\n(", signif(moduleTraitPvalue_noFP, 1), ")", sep = ""); pdf("Fig.correlation.pdf",width = 5,height = 6) par(mar = c(4, 8, 4, 3),mfrow=c(1,1)); labeledHeatmap(Matrix = moduleTraitCor_noFP, xLabels = colnames(trait), yLabels = names(MEs_col), ySymbols = names(MEs_col), colorLabels = FALSE, colors = blueWhiteRed(50), textMatrix = textMatrix_noFP, setStdMargins = FALSE, cex.text = 0.6, zlim = c(-1,1),#showCols = c(1,4) main = paste("Module-trait relationships")) dev.off() } ## Code for GO and KEGG enrichment analysis rm(list=ls()) load("WGCNAstep1-Soft Threshold.Rdata") load("WGCNAstep2-co-expression nekwork.Rdata") gene_module <- data.frame(gene_name = colnames(gene), module = moduleColors, stringsAsFactors = FALSE) head(gene_module) load('eg2probe.Rdata') colnames(eg2probe) <- c("ID","gene_name","ENTREZID") x <- eg2probe[match(gene_module$gene_name,eg2probe$gene_name),] gene_module$ENTREZID <- x$ENTREZID rownames(gene_module) <- gene_module$gene_name DEG <- read.csv("GSE47908-DEG.csv",header = T,row.names = 1) mm <- DEG gene_M <- mm[match(gene_module$gene_name,rownames(mm)),] gene_M$ENTREZID <- gene_module$ENTREZID gene_M$module <- gene_module$module gene_M <- gene_M[,c(1,8,9)] gene_M <- na.omit(gene_M) gene_MS <- gene_M[gene_M$module=="green",] genelist <- gene_MS[,2] gene_up <- gene_MS[gene_MS$logFC>0,2] gene_down <- gene_MS[gene_MS$logFC<0,2] gene_diff <- c(gene_up,gene_down) library(clusterProfiler) library(org.Hs.eg.db) library(ggplot2) library(stringr) ## run go analysis if(T){ go_BP <- enrichGO( gene = gene_diff, keyType = "ENTREZID", OrgDb = org.Hs.eg.db, ont = "BP", pAdjustMethod = "BH", pvalueCutoff = 1, qvalueCutoff = 1, readable = TRUE) save(go_BP,file = "MEgreen_go_BP.Rdata") load("MEgreen_go_BP.Rdata") go_BPresult <- go_BP@result go_BPresult <- go_BPresult[go_BPresult$pvalue<0.05,] go_BPresult$RichFactor <- round(as.numeric(str_split(go_BPresult$GeneRatio,"/",simplify = T)[,1])/as.numeric(str_split(go_BPresult$BgRatio,"/",simplify = T)[,1]),digits = 2) # WGCNA_go_BPresult_sort <- WGCNA_go_BPresult[order(WGCNA_go_BPresult$pvalue,-WGCNA_go_BPresult$Count),] write.csv(go_BPresult,file="MEred_goBPterms.csv") ego <- go_BPresult[order(go_BPresult$pvalue,decreasing = F),][c(1:20),] ego <- as.data.frame(ego) ego$Description <- factor(ego$Description,levels = ego$Description) pp <- ggplot(ego,mapping = aes(x=RichFactor,y=Description)) pp p <- pp + geom_point(aes(size=Count,color=-log10(pvalue)))+ coord_cartesian(xlim = c(0.05,0.5))+ scale_x_continuous(breaks = seq(0.0,0.8,0.1))+ scale_colour_gradient(low="blue",high="red")+ scale_y_discrete((name ="GO:BP terms"),limits=rev(levels(ego$Description)))+ labs(x="Rich Factor",y="", size="Gene Numbers",color="-log10(pvalue)")+ theme_bw() #axis.text.y = element_blank() p } ## run KEGG analysis if(T){ kk.diff <- enrichKEGG(gene = gene_diff, organism = 'hsa', pvalueCutoff = 0.9) #0.05 class(kk.diff) kk.diff <- setReadable(kk.diff, 'org.Hs.eg.db', 'ENTREZID') save(kk.diff,file = "MEgreen_kegg.Rdata") load("MEgreen_kegg.Rdata") kegg_diff_dt <- kk.diff@result kegg <- kegg_diff_dt[kegg_diff_dt$pvalue<0.05,] kegg$RichFactor <- round(as.numeric(str_split(kegg$GeneRatio,"/",simplify = T)[,1])/as.numeric(str_split(kegg$BgRatio,"/",simplify = T)[,1]),digits = 2) kk <- kegg[order(kegg$pvalue,decreasing = F),][1:20,] kk$Description <- factor(kk$Description,levels = kk$Description) pp <- ggplot(kk,mapping = aes(x=RichFactor,y=Description)) pp q <- pp + geom_point(aes(size=Count,color=-log10(pvalue)))+ coord_cartesian(xlim = c(0.05,0.5))+ scale_x_continuous(breaks = seq(0.0,0.8,0.1))+ scale_colour_gradient(low="blue",high="red")+ scale_y_discrete((name ="KEGG pathways"),limits=rev(levels(kk$Description)))+ labs(x="Rich Factor",y="", size="Gene Numbers",color="-log10(pvalue)")+ theme_bw() #axis.text.y = element_blank() q } pdf("Fig.MEgreen-BPandKEGG.pdf",width = 12,height = 4) library(cowplot) plot_grid(p,q,align = "h",nrow=1,rel_widths = c(20,19),labels = c("A","B")) #labels = c("A","B") dev.off() ### Code for hub genes rm(list=ls()) load("WGCNAstep1-Soft Threshold.Rdata") load("WGCNAstep2-co-expression nekwork.Rdata") gene_module <- data.frame(gene_name = colnames(gene), module = moduleColors, stringsAsFactors = FALSE) head(gene_module) gene_module_select <- subset(gene_module, module == "green")$gene_name gene_select <- t(gene[,gene_module_select]) tom_select <- tom_sim[gene_module_select,gene_module_select] write.csv(gene_select, file = 'MEgreen_genes.csv') gene_green <- gene[ ,gene_module_select] geneModuleMembership <- signedKME(gene_green, MEs_col['MEgreen'], outputColumnName = 'MM') MMPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nrow(MEs_col))) geneModuleMembership$MMgreen <- abs(geneModuleMembership$MMgreen) geneTraitSignificance <- as.data.frame(cor(gene_green, trait['Progress'], use = 'p')) GSPvalue <- as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nrow(trait))) geneTraitSignificance$Progress <- abs(geneTraitSignificance$Progress) pdf("Fig.scatter_diagram-green.pdf") par(mar=c(5,5,5,3)) verboseScatterplot(geneModuleMembership$MMblue, geneTraitSignificance$Progress, xlab = "Module Membership in green module", ylab = "Gene significance for Progress", main = paste("Module membership vs. gene significance\n"), cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col="green") abline(v=0.9,col = "black",lwd = 2,lty = 2) abline(h=0.7,col = "black",lwd = 2,lty = 2) dev.off() geneModuleMembership[geneModuleMembership<0.9 | MMPvalue>0.05] <- 0 geneTraitSignificance[geneTraitSignificance<0.7 | GSPvalue>0.05] <- 0 select <- cbind(geneModuleMembership, geneTraitSignificance) select <- subset(select, geneModuleMembership>=0.9 & geneTraitSignificance>=0.7) hub <- rownames(select) write.csv(hub,file = 'MEgreen_hubgenes.csv') } ### Code for single gene GSEA library(clusterProfiler) library(org.Hs.eg.db) library(ggplot2) batch_cor <- function(gene){ y <- as.numeric(exprSet[gene,]) rownames <- rownames(exprSet) do.call(rbind,future_lapply(rownames, function(x){ dd <- cor.test(as.numeric(exprSet[x,]),y,type="spearman") data.frame(gene=gene,mRNAs=x,cor=dd$estimate,p.value=dd$p.value ) })) } library(future.apply) plan(multiprocess) system.time(dd <- batch_cor("TFE3")) save(dd,file = "dd.Rdata") load("dd.Rdata") gene <- dd$mRNAs gene = bitr(gene, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db") gene <- dplyr::distinct(gene,SYMBOL,.keep_all=TRUE) gene_df <- data.frame(logFC=dd$cor, SYMBOL = dd$mRNAs) gene_df <- merge(gene_df,gene,by="SYMBOL") gene_list <- gene_df$logFC names(gene_list) = gene_df$ENTREZID gene_list = sort(gene_list, decreasing = TRUE) GSEAgo <- gseGO( geneList = gene_list, OrgDb = org.Hs.eg.db, ont = "BP", #nPerm = 1000, minGSSize = 100, maxGSSize = 500, pvalueCutoff = 0.1, eps = 0, verbose = FALSE) GSEAgo<- setReadable(GSEAgo,'org.Hs.eg.db', 'ENTREZID') go_TFE3 <- GSEAgo save(go_TFE3,file = "GSEAgo_TFE3.Rdata") load("GSEAgo_TFE3.Rdata") go_TFE3 <- go_TFE3@result go_TFE3 <- go_TFE3[go_TFE3$pvalue<0.05,] GSEAkegg <- gseKEGG( geneList = gene_list, keyType = 'kegg', organism = 'hsa', minGSSize = 10, maxGSSize = 500, eps = 0, pvalueCutoff = 0.12) GSEAkegg<- setReadable(GSEAkegg, 'org.Hs.eg.db', 'ENTREZID') kegg_TFE3 <- GSEAkegg save(kegg_TFE3,file = "GSEAkegg_TFE3.Rdata") load("human_GSEAkegg.Rdata") kegg_TFE3 <- GSEAkegg@result kegg_TFE3 <- kegg_TFE3[kegg_TFE3$pvalue<0.05,] ### Code for TCGA database mining library(TCGAbiolinks) TCGAbiolinks::getGDCprojects()$project_id cancer_type="TCGA-COAD" clinical <- GDCquery_clinic(project = cancer_type,type = "clinical") save(clinical,file = "COAD_clinical.Rdata") write.csv(clinical,file = "COAD_clinical.csv") library(dplyr) library(DT) library(SummarizedExperiment) data_type <- "Gene Expression Quantification" data_category <- "Transcriptome Profiling" workflow_type <- "HTSeq - Counts" query_TranscriptomeCounts <- GDCquery(project = cancer_type, data.category = data_category, data.type = data_type, workflow.type = workflow_type) GDCdownload(query_TranscriptomeCounts, method = "api") expdat <- GDCprepare(query = query_TranscriptomeCounts) count_matrix=assay(expdat) View(count_matrix) write.csv(count_matrix,file = "TCGAbiolinks_COAD_counts.csv") exprSet <- read.csv("TCGAbiolinks_COAD_counts.csv",header = T,row.names = 1) library(rtracklayer) library(SummarizedExperiment) gtf1 <- rtracklayer::import('gencode.v37.annotation.gtf') gtf_df <- as.data.frame(gtf1) geneid_df <- dplyr::select(gtf_df,c(gene_name,gene_id,gene_type)) library(dplyr) library(tidyr) geneid_df_nopoint <- geneid_df %>% tidyr::separate(gene_id,into = c("gene_id","drop"),sep="\\.") %>% dplyr::select(-drop) protein_coding <- geneid_df_nopoint[geneid_df_nopoint$gene_type =="protein_coding",] PC <- protein_coding %>% dplyr :: distinct(gene_id,.keep_all=TRUE) exp <- as.data.frame(exprSet) exp$gene_id <- rownames(exp) pcexp <- merge(PC,exp,by="gene_id") Pcexp <- aggregate(x = pcexp[,4:ncol(pcexp)], by = list(pcexp$gene_name), FUN = mean) rownames(Pcexp) <- Pcexp$Group.1 exprSet <- Pcexp[,-1] a <- exprSet library(stringr) group_list=ifelse(as.numeric(str_sub(colnames(a),14,15))<10,"tumor","normal") group_list=factor(group_list,levels = c("normal","tumor")) table(group_list) library(limma) library(edgeR) design <- model.matrix(~0+group_list) colnames(design) <- levels(group_list) rownames(design) <- colnames(a) dge <- DGEList(counts=a) dge <- calcNormFactors(dge) logCPM <- cpm(dge,log = TRUE,prior.count = 3) v <- voom(dge,design,normalize="quantile") fit <- lmFit(v,design) constrasts=paste(rev(levels(group_list)),collapse = "-") cont.matrix <- makeContrasts(constrasts = constrasts,levels = design) fit3 <- contrasts.fit(fit,cont.matrix) fit3 <- eBayes(fit3) DEG <- topTable(fit3,coef = constrasts,n=Inf ) DEG <- na.omit(DEG) logFC_cutoff <- with(DEG,mean(abs(logFC))+2*sd(abs(logFC))) DEG$change <- as.factor(ifelse(DEG$adj.P.Val < 0.05 & abs(DEG$logFC) > logFC_cutoff, ifelse(DEG$logFC > logFC_cutoff ,'UP','DOWN'),'NOT')) table(DEG$change) # down 641, up 342, not 18564 write.csv(DEG,file = "CRC-DEG.csv") DEG <- read.csv("CRC-DEG.csv",header = T,row.names = 1) library(ggplot2) g = ggplot(data=DEG, aes(x=logFC, y=-log10(adj.P.Val), color=change)) + geom_point(alpha=0.8, size=0.3) + theme_bw()+ theme(panel.grid=element_blank())+ xlab("log2 fold change") + ylab("-log10 adj.P.Val") + theme(plot.title = element_text(size=15,hjust = 0.5))+ scale_colour_manual(values = c('blue','black','red')) + geom_vline(xintercept=c(-logFC_cutoff,logFC_cutoff),lty=2,col="black",lwd=0.4) + geom_hline(yintercept = -log10(0.05),lty=2,col="black",lwd=0.6)+ #ggtitle( this_tile ) + geom_vline(xintercept = c(-0.3,0.3),lty=2,col="grey",lwd=0.6) pdf("F-volcano-CRC.pdf",width = 5,height = 4) g dev.off() ### Code for GSEA library(clusterProfiler) library(org.Hs.eg.db) geneList <- DEG$logFC names(geneList) = rownames(DEG) geneList=sort(geneList,decreasing = T) gene <- names(geneList) list=AnnotationDbi::select(org.Hs.eg.db,keys=gene,columns = c("ENTREZID","SYMBOL"), keytype="SYMBOL") idlist <- list[match(gene,list$SYMBOL),] names(geneList) <- idlist$ENTREZID GSEAgo <- gseGO( geneList = geneList, OrgDb = org.Hs.eg.db, ont = "BP", #nPerm = 1000, #置换检验的置换次数 minGSSize = 100, maxGSSize = 500, pvalueCutoff = 0.9, eps = 0, verbose = FALSE) GSEAgo_R <- GSEAgo@result GSEAgo_R <- GSEAgo_R[GSEAgo_R$pvalue<0.05,] write.csv(GSEAgo_R,file = "Table S3.GSEAgo_CRC.csv")