# Clear the current workspace
rm(list = ls())

# Load required packages
library(limma)
library(ggplot2)

# Define input files
expFile <- "geneexpr.txt"         # Expression input file
clusterFile <- "Cluster.txt"     # Clustering file

# Read the input files and preprocess
rt <- read.table(expFile, header = TRUE, sep = "\t", check.names = FALSE)
rt <- as.matrix(rt)
exp <- rt
dimnames <- list(rownames(exp), colnames(exp))
data <- matrix(as.numeric(as.matrix(exp)), nrow = nrow(exp), dimnames = dimnames)
data <- avereps(data)
data <- data[rowMeans(data) > 0, ]
data <- t(data)

# Perform PCA analysis
data.pca <- prcomp(data, scale. = TRUE)
pcaPredict <- predict(data.pca)
write.table(pcaPredict, file = "newTab.xls", quote = FALSE, sep = "\t")

# Read the clustering file
cluster <- read.table(clusterFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
cluster <- as.vector(cluster[, 1])

# Set colors
bioCol <- c("#0066FF", "red", "#FF0000", "#6E568C", "#7CC767", "#223D6C", "#D20A13", "#FFD121", "#088247", "#11AA4D")
CluCol <- bioCol[1:length(levels(factor(cluster)))]

# Visualization
PCA <- data.frame(PC1 = pcaPredict[, 1], PC2 = pcaPredict[, 2], cluster = cluster)
PCA.mean <- aggregate(PCA[, 1:2], list(cluster = PCA$cluster), mean)
pdf(file = "PCA.pdf", height = 5, width = 6.5)
ggplot(data = PCA, aes(PC1, PC2)) + geom_point(aes(color = cluster)) +
  scale_colour_manual(name = "cluster", values = CluCol) +
  theme_bw() +
  theme(plot.margin = unit(rep(1.5, 4), 'lines')) +
  annotate("text", x = PCA.mean$PC1, y = PCA.mean$PC2, label = PCA.mean$cluster, cex = 7) +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())
dev.off()