rm(list = ls()) # Load libraries library(limma) library(sva) library(ggplot2) library(factoextra) library(FactoMineR) files = c("TCGA.txt", "GSE17538.txt", "GSE39582.txt") # Get the intersection of genes geneList = list() for (i in 1:length(files)) { inputFile = files[i] rt = read.table(inputFile, header = T, sep = "\t", check.names = F) header = unlist(strsplit(inputFile, "\\.|\\-")) geneList[[header[1]]] = as.vector(rt[, 1]) } intersectGenes = Reduce(intersect, geneList) # Merge the data allTab = data.frame() batchType = c() for (i in 1:length(files)) { inputFile = files[i] header = unlist(strsplit(inputFile, "\\.|\\-")) # Read the input file and preprocess it rt = read.table(inputFile, header = T, sep = "\t", check.names = F) rt = as.matrix(rt) rownames(rt) = rt[, 1] exp = rt[, 2:ncol(rt)] dimnames = list(rownames(exp), colnames(exp)) data = matrix(as.numeric(as.matrix(exp)), nrow = nrow(exp), dimnames = dimnames) rt = avereps(data) colnames(rt) = paste0(header[1], "_", colnames(rt)) # Take log2 transformation for large numeric values qx = as.numeric(quantile(rt, c(0, 0.25, 0.5, 0.75, 0.99, 1.0), na.rm = T)) LogC = ((qx[5] > 100) || ((qx[6] - qx[1]) > 50 && qx[2] > 0)) if (LogC) { rt[rt < 0] = 0 rt = log2(rt + 1) } if (header[1] != "TCGA") { rt = normalizeBetweenArrays(rt) } # Merge the data if (i == 1) { allTab = rt[intersectGenes, ] } else { allTab = cbind(allTab, rt[intersectGenes, ]) } batchType = c(batchType, rep(i, ncol(rt))) } # Correct the data and output the corrected results outTab = ComBat(allTab, batchType, par.prior = TRUE) library(FactoMineR) library(factoextra) ddb.pca <- PCA(t(allTab), graph = FALSE) pheno <- data.frame(ID = colnames(allTab)) pheno$cancer <- pheno$ID pheno[1:380, 2] <- "TCGA" pheno[381:618, 2] <- "GSE17538" pheno[619:1184, 2] <- "GSE39582" pdf(file = "Before Batch Effect Removal.pdf", height = 8, width = 8) fviz_pca_ind(ddb.pca, geom.ind = "point", # Show points only pointsize = 2, # Point size pointshape = 21, # Point shape fill.ind = pheno$cancer, # Group color palette = "lacent", # c("#00AFBB", "#E7B800", "#FC4E07") addEllipses = TRUE, # Add confidence ellipses legend.title = "Groups", # Legend title title = "") + theme_bw() + # Beautify using ggplot2 theme(text = element_text(size = 14, face = "plain", color = "black"), axis.title = element_text(size = 16, face = "plain", color = "black"), axis.text = element_text(size = 14, face = "plain", color = "black"), legend.title = element_text(size = 16, face = "plain", color = "black"), legend.text = element_text(size = 14, face = "plain", color = "black"), legend.background = element_blank(), legend.position = c(0.9, 0.1) ) dev.off() library(FactoMineR) library(factoextra) ddb.pca <- PCA(t(outTab), graph = FALSE) pdf(file = "After Batch Effect Removal.pdf", height = 8, width = 8) fviz_pca_ind(ddb.pca, geom.ind = "point", # Show points only pointsize = 2, # Point size pointshape = 21, # Point shape fill.ind = pheno$cancer, # Group color palette = "lacent", # c("#00AFBB", "#E7B800", "#FC4E07") addEllipses = TRUE, # Add confidence ellipses legend.title = "Groups", # Legend title title = "") + theme_bw() + # Beautify using ggplot2 theme(text = element_text(size = 14, face = "plain", color = "black"), axis.title = element_text(size = 16, face = "plain", color = "black"), axis.text = element_text(size = 14, face = "plain", color = "black"), legend.title = element_text(size = 16, face = "plain", color = "black"), legend.text = element_text(size = 14, face = "plain", color = "black"), legend.background = element_blank(), legend.position = c(0.9, 0.1) ) dev.off() save(outTab, file = "merge.RDATA") outTab = rbind(geneNames = colnames(outTab), outTab) write.table(outTab, file = "merge.txt", sep = "\t", quote = F, col.names = F)