rm( list = ls() ) gc() { GSE = 'GSE200306' disease = "lupus nephritis" platform = 'GPL21847' setwd( "/Users/jiesbidy/Documents/total/20230730" ) } #install.packages("devtools") library(devtools) #install.packages("Rcpp") library(Rcpp) #install_github("hadley/readxl") library(readxl) # library(showtext) # showtext_auto(enable=T) # library(GEOquery) # getOption('timeout') options(timeout=300) gset2 = getGEO( GSE, AnnotGPL = T ) save(gset2, file = paste(GSE,"_eSet2.Rdata",sep = "")) xxx=gset2[["GSE200306_series_matrix.txt.gz"]]@assayData[["exprs"]] library(GEOquery) #gset3 = getGEO(filename = './GSE2372_series_matrix.txt.gz', AnnotGPL = T) gset4 = getGEO(filename = './GSE200306_series_matrix.txt', AnnotGPL = T) save(gset4, file = paste(GSE,"_eSet3.Rdata",sep = "") ) #paste(GSE,"_eSet3.Rdata",sep = "") ### 1.1 #sampleinfo = pData(gset[[1]] ) # #sampleinfo = pData(gset2[[1]] )# #sampleinfo = pData(gset3 ) sampleinfo = pData(gset4 ) #sampleinfo = gset2[[1]]@phenoData@data sampleinfo = gset4@phenoData@data head(sampleinfo) library(tidyverse) term_x = sampleinfo1$source_name_ch1 table(term_x) #sort(c( "WT", "apoE" ) ) #level_x = c( "WT", "apoE" ) #contrast = c( "apoE-WT" ) library(openxlsx) write.csv(sampleinfo, file = "sampleinfo.csv") #write.xlsx(sampleinfo, file = "xxx.csv") sampleinfo = sampleinfo %>% # mutate( Group = "HC" ) %>% # mutate( Group = ifelse( grepl('LN',term_x), "LN", Group ) ) %>% mutate( Group = factor( Group, levels = level_x, ordered = T ) ) %>% # mutate( GSE = GSE ) %>% mutate( Disease = disease ) %>% mutate( Platform = platform ) %>% arrange( Group ) # table(sampleinfo$Group ) save(sampleinfo, level_x, contrast, file = paste('1.1__',GSE,'.Rdata',sep = "") ) ### 1.2 # #exprset1 = exprs(gset[[1]] ) #exprset1 = exprs(gset2[[1]] ) # GEO querry #exprset1 = exprs(gset3 ) # exprset1 = exprs(gset4 ) # ### summary( as.numeric( unlist(exprset1) ) ) table( is.na(exprset1) ) # table( exprset1 < 0 ) exprset1 <- na.omit(exprset1) dim(exprset1) #non_NA <- !is.na(exprset1[,1]) #exprset1<-exprset1[non_NA,] # #summary(non_NA) # #dim(exprset1) #non_NA <- !is.na(exprset1[,1]) #exprset1<-exprset1[non_NA,] # #summary(non_NA) hist(unlist( exprset1 )) # hist(unlist( exprset1 ), breaks = 100 ) #hist(unlist( exprset2 )) #hist(unlist( log2(exprset2) )) # exprset1 = log2(exprset1+1) # # hist(unlist( log2(exprset1) ), breaks = 100 ) identical( colnames( exprset1), sampleinfo1$geo_accession ) # exprset1 = as.matrix( exprset1[ , sampleinfo1$geo_accession ] ) # identical( rownames(sampleinfo1), colnames(exprset1)) colnames(exprset1) = sampleinfo1$geo_accession save( GSE,exprset1, file = paste('1.2__',GSE,'.Rdata',sep = "")) rm( gset, eset, term_x, eset_rma, gset, gset2, gset3, gset4 ) write.csv(exprset1, file = paste('1.2__',GSE,'.csv',sep = "")) #log2(exprset2+1) # #write.csv(exprset1, file = ".csv") # ### 1.3 boxplot( exprset1 ) exprset2=exprset1 # library(limma) exprset2 = normalizeBetweenArrays( exprset2, method = "quantile" ) # normalizeBetweenArrays #exprset2 = exprset1[,-c(80:181)] # #write.csv(exprset2, file = "exprset2.csv") boxplot( exprset2 ) install.packages("magrittr") # library(magrittr) # needs to be run every time you start R and want to use %>% library(ggsci) # #n_select = 5000 # sample( 1:nrow(exprset2)) expr_l = exprset2[ sample(1:nrow(exprset2)) , ] %>% # as.data.frame( ) %>% gather( ) %>% # setNames( c( 'Sample','Expression') ) %>% mutate( Group = sampleinfo1$source_name_ch1[ match( Sample, sampleinfo1$geo_accession ) ] ) %>% # arrange( Group ) %>% # mutate( Sample = factor(Sample, levels = unique(Sample), ordered = T ) ) # unique(expr_l$Sample) # #figure p1 = ggplot(expr_l,aes(x= Sample, y= Expression, fill= Group, color = Group))+ geom_boxplot(outlier.shape = 0.05,outlier.size = 0.05) + theme_light() + ggtitle("Expression levels of samples") + theme( plot.title = element_text( hjust = 0.5 ), axis.text.x = element_blank(), plot.background = element_rect(fill = "transparent",colour = NA) ) + scale_color_nejm(alpha = 1)+ scale_fill_nejm(alpha = 0.8) p1 ggsave( p1, filename = paste('1.3__',GSE,'.pdf',sep = ""), width = 6,height = 3) ggsave( filename = paste('1.3__',GSE,'.pdf',sep = ""), width = 6,height = 3) rm(n_select, expr_l) ### 1.4 library(paletteer) hclust1 <- hclust(dist(t(exprset2)), method = "complete") # hclust plot(hclust1, hang = -1) library(ggfortify) library(ggtree) cut_tree = length( level_5 ) # cut = cutree(hclust1, cut_tree) cut = data.frame(label = names(cut), member = factor(cut) ) cut$Group = sampleinfo1$Group p2 = ggtree(hclust1, ladderize = FALSE, branch.length = "none") %<+% cut + geom_tippoint(size=2, shape = 16, show.legend = T, alpha = 0.8, aes(color = Group)) + # geom_tiplab(aes(label=Group), size=3, hjust=.5) + theme_light() + ggtitle("hClust of samples") + ylab("Sample") + theme( plot.title = element_text( hjust = 0.5 ), plot.background = element_rect(fill = "transparent",colour = NA) ) + coord_flip()+scale_x_reverse() + scale_color_nejm(alpha = 0.8) + guides(color = guide_legend(override.aes = list(size = 4))) #p2 ggsave(p2, filename = paste("1.4__",GSE,".pdf",sep=""),width = 6, height = 3) rm(hclust1, cut_tree, cut ) ### 1.5 PCA---- pca1 = prcomp( as.data.frame( t( exprset2 ) ) ) # prcomp xxx = pca1[["x"]] p3 = autoplot( pca1, data = sampleinfo1, colour = "source_name_ch1", size=2.5, frame = TRUE, frame.type = 'norm')+ theme_light() + ggtitle("PCA of samples") + theme( plot.title = element_text( hjust = 0.5 ), plot.background = element_rect(fill = "transparent",colour = NA) ) + scale_color_nejm(alpha = 0.8) + scale_fill_nejm(alpha = 0.8) #p3 ggsave(p3, filename =paste("1.5_PCA_",GSE,".pdf",sep=""),width = 5.2, height = 4) rm( pca1 ) ### 1.6 sd = exprset2 %>% as.data.frame() %>% mutate( SD = apply(., 1, sd ) ) %>% # arrange( desc(SD) ) %>% slice_max( SD, n= 400 ) %>% dplyr::select( 1:ncol(exprset2) ) library(pheatmap) # library(ggplotify) # legend_col = data.frame( row.names = sampleinfo1$geo_accession, Group = sampleinfo1$source_name_ch1 ) # bk = 2 # brk <- c(seq(-bk,-0.01,by=0.01),seq(0,bk,by=0.01)) heat1 = pheatmap(sd, scale = "row", # annotation_col = legend_col, # color = c(colorRampPalette(colors = c("dodgerblue4","white"))(length(brk)/2), colorRampPalette(colors = c("white","brown"))(length(brk)/2)), legend_breaks=seq(-bk,bk,1), breaks=brk, border_color = NA, show_rownames = F, show_colnames = F ) p4 = as.ggplot(heat1)+ ggtitle('Heatmap of Total Gene')+ xlab('Sample') + ylab('Gene')+ theme( plot.title = element_text(hjust = 0.4), plot.background = element_rect(fill = "transparent",colour = NA) ) p4 ggsave(p4, filename = paste("1.6__",GSE,".pdf",sep= ""),width = 5.2, height = 4) rm(sd, legend_col, bk, brk, heat1 ) library(patchwork) # pp = (p1/p4) + plot_annotation( tag_levels = "A" )#pp = (p1 / p2 / p3 ) | ( p4) + plot_annotation( tag_levels = "A" ) pp rm( sd, pp, p1,p2,p3,p4) #p3|p4 # #p3/p4 # ggsave(pp, filename = paste("1.6__",GSE,".pdf",sep= ""),width = 10, height = 8) ## 3、 library( limma ) ### 3.1 # contrast <- table(sampleinfo1$source_name_ch1) #sampleinfo$Group contrast # design <- model.matrix(~0+factor(sampleinfo1$source_name_ch1) ) %>% as.data.frame() %>% setNames( unique(sampleinfo1$source_name_ch1) ) rownames(design) = sampleinfo1$geo_accession design # c("A-B","C-B","A-C") # contrast.matrix <-makeContrasts( contrasts = "HC-LN", levels = design ) # contrast.matrix # ### 3.2 degs_temp <- lmFit( exprset2, design ) %>% # Fit linear model contrasts.fit( contrast.matrix ) %>% # compute estimated coefficients and standard errors for a given set of contrasts eBayes %>% # compute moderated t-statistics, moderated F-statistic, and log-odds of differential expression topTable( coef = 1, n = Inf ) %>% # coef = 1 na.omit %>% # setNames( c("log2FC", "Mean.Expr", "t", "P.value", "adj.P", "B" ) ) %>% mutate( probe_id = rownames(.) ) logFC_cut = with(degs_temp, mean(abs(log2FC))+2*sd(abs(log2FC))) # degs_temp = degs_temp %>% mutate( DEG = factor( ifelse( abs(degs_temp$log2FC) > logFC_cut & degs_temp$P.value < 0.05, ifelse(degs_temp$log2FC > 0, 'up','down'), 'ns' ), levels = c("up", "ns", "down" ), ordered = T ) ) table( degs_temp$DEG ) save(degs_temp, file = paste('3.2__',GSE,'.Rdata',sep = "")) rm( design, contrast.matrix ) del <- which(degs_temp$DEG=="ns") # degs_temp1 <- degs_temp[-del,] save(degs_temp1, file = paste('3.2_significant_',GSE,'.Rdata',sep = "")) write.csv(degs_temp1, file = "significant.csv") # ### 3.3 p5 = ggplot(data = degs_temp, aes(x = log2FC, y = -log10(P.value), color = DEG)) + # ylim(0,25)+ # # xlim(-0.005,0.005)+ # geom_point(size = 1, shape = 16, alpha = 0.9) + ggtitle('Overall distribution of DEGs') + theme_light() + theme( plot.title = element_text(hjust = 0.5), plot.background = element_rect(fill = "transparent",colour = NA) ) + scale_color_manual(values = c('brown','grey','dodgerblue3' ))+ guides(color = guide_legend(override.aes = list(size = 4))) + geom_vline( xintercept = c(-logFC_cut, logFC_cut), linetype = "dashed" ) + geom_hline( yintercept = -log10(0.05), linetype = "dashed" ) p5 ggsave( p5, filename = paste("3.3_DEG_",GSE,".pdf",sep= ""),width = 4, height = 4) rm( logFC_cut ) ### 3.4 # chs_x = degs_temp1 %>% slice_max( abs(log2FC), n = 522 ) slice_min( P.value, n = 522 ) # heat_matrix = exprset2[ chs_x$probe_id, ] legend_col = data.frame( row.names = sampleinfo1$geo_accession, Group = sampleinfo1$source_name_ch1 ) # bk = 2 # brk <- c(seq(-bk,-0.01,by=0.01),seq(0,bk,by=0.01)) heat1 = pheatmap(heat_matrix, scale = "row", annotation_col = legend_col, color = c(colorRampPalette(colors = c("dodgerblue4","white"))(length(brk)/2), colorRampPalette(colors = c("white","brown"))(length(brk)/2)), legend_breaks=seq(-bk,bk,1), breaks=brk, treeheight_row = 40, treeheight_col = 40, border_color = NA, cluster_cols = F, show_rownames = T, show_colnames = F ) p6 = as.ggplot(heat1)+ ggtitle('Heatmap of significant DEGs')+ xlab('Sample') + ylab('Gene') + theme( plot.title = element_text(hjust = 0.4), plot.background = element_rect(fill = "transparent",colour = NA) ) p6 ggsave( p6, filename = paste("sig-DEG热图_",GSE,".pdf",sep= ""),width = 6, height = 3.0) rm(chs_x, heat_matrix, legend_col, bk, brk, heat1 ) pp = (p5 | p6) + plot_annotation( tag_levels = "A" ) pp rm(pp, p1,p2,p3,p4,p5,p6) ## 4、 ### 4.1 library(AnnoProbe) # gpl_anno = getGEO(filename = './GPL21847_family.soft.gz', AnnotGPL = T) # gpl_anno = idmap( "GPL2184" ) %>% # as.data.frame() %>% distinct( probe_id, .keep_all = T ) # rownames(gpl_anno) = gpl_anno$probe_id head( gpl_anno ) # exprset2 = exprset2[rownames(exprset1) %in% gpl_anno$probe_id,] # dim(exprset2) gpl_anno = gpl_anno[gpl_anno$probe_id %in% rownames(exprset2),] # # # xxx = checkGeneSymbols( gpl_anno$symbol, species="mouse") # table(xxx$Approved == "FALSE" & xxx$Suggested.Symbol != "NA") # identical(gpl_anno$symbol, xxx$x) # pos = which(xxx$Approved == "FALSE" & xxx$Suggested.Symbol != "NA") # gpl_anno$symbol[pos] = xxx$Suggested.Symbol[pos] ### 4.2 identical( gpl_anno$probe_id, rownames(exprset2)) gpl_anno = gpl_anno[rownames(exprset2),] # identical( gpl_anno$probe_id, rownames(exprset2)) exprset2 = cbind(gpl_anno,exprset2) save(exprset2,file = paste('4.2__',GSE,'.Rdata',sep = "")) # write.csv(exprset2,file = paste('4.2__',GSE,'.csv',sep = "")) ### degs_anno = degs_temp %>% filter( rownames(.) %in% gpl_anno$probe_id ) gpl_anno2 = gpl_anno[ degs_anno$probe_id, ] # identical( gpl_anno2$probe_id,rownames(degs_anno)) degs_anno = cbind( gpl_anno2,degs_anno) %>% dplyr::select(-1) save( degs_anno,file = paste('4.2_DEG_',GSE,'.Rdata',sep = "")) # write.csv(degs_anno,file = paste('4.2_DEGs__',GSE,'.csv',sep = "")) ### 4.4 summary( degs_anno$Mean.Expr) plot(table(sort(table(degs_anno$symbol)))) table( is.na(degs_anno$P.value) ) table( degs_anno$P.value == 0 ) # degs_final = degs_anno %>% group_by( symbol ) %>% mutate( mean = mean(Mean.Expr) ) %>% group_by( ) %>% # filter( Mean.Expr >= mean | Mean.Expr >= mean(Mean.Expr) ) %>% arrange( P.value ) %>% distinct( symbol, .keep_all = T ) # save(degs_final,file = paste('4.4_DEG_',GSE,'.Rdata',sep = "")) #write.csv(degs_final,file = paste('4.4_DEGs__',GSE,'.csv',sep = "")) # exprset3 = exprset1[ degs_final$probe_id, ] identical( degs_final$probe_id, rownames(exprset3)) # rownames(exprset3) = degs_final$symbol save(exprset3,file = paste('4.4__',GSE,'.Rdata',sep = "") ) rm(pos, exprset1, exprset2, degs_temp, degs_anno, gpl_anno, gpl_anno2, xxx ) ## 4.5 load( paste('4.4_DEG_',GSE,'.Rdata',sep = "") ) summary(degs_final$Mean.Expr) summary(abs(degs_final$log2FC)) # qu1 = quantile(degs_final$Mean.Expr, probs = 0.25 ) # fc1 = mean(abs(degs_final$log2FC)) # Log2FC: qu1;fc1 ### library(org.Hs.eg.db) # library(clusterProfiler) # # degs_top = degs_temp1 #filter( Mean.Expr > qu1 & abs(log2FC) > fc1 & P.value < 0.05 ) %>% # #arrange( P.value ) %>% # 按 p #arrange( desc( abs(log2FC) ) ) %>% #pull(probe_id ) %>% # #degs_top=degs_top[,c(1,7)] genes <- as.vector(degs_top$probe_id ) entrezIDs <- mget(genes, org.Hs.egSYMBOL2EG, ifnotfound=NA) #?ҳ???????Ӧ??id entrezIDs <- as.character(entrezIDs) out=cbind(degs_top,entrezID=entrezIDs) write.table(out,file="entrezID.txt",sep="\t",quote=F,row.names=F) save(out,file = paste('4.5_DEG+entrezID_',GSE,'.Rdata',sep = "")) # /////////////////////////---- # // 二、(enrichment)---- # /////////////////////////---- ## e1.3 library(clusterProfiler) library(enrichplot) library("org.Hs.eg.db") library("ggplot2") #load( paste('4.5_Top300DEG_',GSE,'.Rdata',sep = "") ) ### e1.3 GO go_data <- enrichGO( gene = out$ENTREZID, # enrichGO OrgDb="org.Hs.eg.db", # ont ="ALL", # pvalueCutoff = 0.05, # qvalueCutoff = 1, readable= TRUE # ) %>% clusterProfiler::simplify() %>% # pairwise_termsim() # go_data = go_data@result write.csv(go_data, file = "go_data.csv") #GO #install.packages("colorspace") #install.packages("stringi") #install.packages("ggplot2") #if (!requireNamespace("BiocManager", quietly = TRUE)) # install.packages("BiocManager") #BiocManager::install("clusterProfiler") #BiocManager::install("enrichplot") #???ð? library("clusterProfiler") library("org.Hs.eg.db") library("enrichplot") library("ggplot2") #out = out %>% pvalueFilter=0.05 #pֵ???????? qvalueFilter=0.05 #????????pֵ???????? #setwd("/Users/jiesbidy/Desktop/GO") #???ù???Ŀ¼ rt=read.table("entrezID.txt",sep="\t",header=T,check.names=F) #??ȡid.txt?ļ? rt=rt[is.na(rt[,"entrezID"])==F,] #ȥ??????idΪNA?Ļ??? gene=rt$entrezID geneFC=rt$log2FC #geneFC=2^geneFC names(geneFC)=gene #??????ɫ???? colorSel="qvalue" if(qvalueFilter>0.05){ colorSel="pvalue" } #GO???????? kk=enrichGO(gene = gene,OrgDb = org.Hs.eg.db, pvalueCutoff =1, qvalueCutoff = 1, ont="all", readable =T) GO=as.data.frame(kk) GO=GO[(GO$pvalue% setReadable( ., OrgDb = 'org.Hs.eg.db', keyType = 'ENTREZID' ) %>% # pairwise_termsim() # emap save( go_data, kegg_data, file = paste('e1.4__',GSE,'.Rdata',sep = "")) kegg_data=kegg_data@result write.csv(kegg_data, file = "kegg_data.csv") library(DOSE) enrichDO() enrichWP() #KEGG library("clusterProfiler") library("org.Hs.eg.db") library("enrichplot") library("ggplot2") pvalueFilter=0.05 qvalueFilter=0.05 #????????pֵ???????? #setwd("/Users/jiesbidy/Desktop/KEGG") rt=read.table("entrezID.txt",sep="\t",header=T,check.names=F) #??ȡid.txt?ļ? rt=rt[is.na(rt[,"entrezID"])==F,] #ȥ??????idΪNA?Ļ??? colnames(rt)[1]="Gene" gene=rt$entrezID geneFC=rt$log2FC #geneFC=2^geneFC names(geneFC)=gene #??????ɫ???? colorSel="qvalue" if(qvalueFilter>0.05){ colorSel="pvalue" } #kegg???????? kk <- enrichKEGG(gene = gene, organism = "hsa", pvalueCutoff =1, qvalueCutoff =1) KEGG=as.data.frame(kk) KEGG$geneID=as.character(sapply(KEGG$geneID,function(x)paste(rt$Gene[match(strsplit(x,"/")[[1]],as.character(rt$entrezID))],collapse="/"))) KEGG=KEGG[(KEGG$pvalue% # mutate( log2FC = degs_temp1$log2FC[ match(SYMBOL, degs_temp1$symbol ) ] ) %>% # arrange( desc( log2FC ) ) # log2FC table( is.na(btr_gsea_order$ENTREZID) ) # table( is.na(btr_gsea_order$log2FC) ) # head(btr_gsea_order) ; tail(btr_gsea_order) # gsea_list = setNames( btr_gsea_order$log2FC, btr_gsea_order$ENTREZID ) # head(gsea_list);tail(gsea_list) ### e2.2 GSEA gsea_go <- gseGO(gsea_list, ont = "ALL", OrgDb = org.Hs.eg.db, # eps = 0 ) %>% clusterProfiler::simplify( ) ### e2.3 GSEA gsea_kegg <- gseKEGG(gsea_list, keyType = "kegg", organism = 'hsa', # pvalueCutoff = 1, eps = 0, use_internal_data = F ) # gsea_wp <- gseWP(gsea_list, organism = 'Homo sapiens', eps = 0) # gsea_do <- gseDO(gsea_list, by = "fgsea", eps = 0) save( gsea_go, gsea_kegg, file = paste('e2.3_GSEA_',GSE,'.Rdata',sep = "")) ### e2.4 GSEA GO ridgeplot(gsea_go,showCategory = 30)+ theme_light()+ ggtitle("Top GSEA-GO terms")+ theme( plot.title = element_text(hjust = 0.4), plot.background = element_rect(fill = "transparent",colour = NA) ) + scale_fill_gradient( low = "brown", high = "blue" ) ridgeplot(gsea_kegg,showCategory = 30)+ theme_light()+ ggtitle("Top GSEA-KEGG terms")+ theme( plot.title = element_text(hjust = 0.4), plot.background = element_rect(fill = "transparent",colour = NA) ) + scale_fill_gradient( low = "brown", high = "blue" ) enrichplot::gseaplot2(gsea_go,1:5)+ theme_light()+ ggtitle("Top 5 GSEA-GO terms")+ theme( plot.title = element_text(hjust = 0.5), plot.background = element_rect(fill = "transparent",colour = NA) ) rm(btr_gsea_order, gsea_list, gsea_go, gsea_kegg ) # rm(list = ls())