# Clear the current workspace
rm(list = ls())

# Load the merged data from a previous session
load("merge.RDATA")
Merge <- outTab
genename <- read.table("ICD.txt", header = TRUE, sep = "\t")

# Get the intersection of gene names with merged data
AAA <- intersect(genename[, 1], rownames(Merge))
geneexpr <- Merge[AAA, ]

# Save the gene expression data to a file
write.table(geneexpr, "geneexpr.txt", quote = FALSE, row.names = TRUE, sep = "\t")
save(geneexpr, file = "geneexpr.RDATA")

# Load required library for ConsensusClusterPlus
library(ConsensusClusterPlus)

# Read the gene expression data
data <- geneexpr
data <- as.matrix(data)

# Perform clustering using ConsensusClusterPlus
maxK <- 9
results <- ConsensusClusterPlus(data,
                                maxK = maxK,
                                reps = 50,
                                pItem = 0.8,
                                pFeature = 1,
                                clusterAlg = "pam",
                                distance = "euclidean",
                                seed = 123456,
                                plot = "png")

# Output clustering results
clusterNum <- 2  # Number of clusters to consider based on the criteria
cluster <- results[[clusterNum]][["consensusClass"]]
cluster <- as.data.frame(cluster)
colnames(cluster) <- c("cluster")
letter <- c("A", "B", "C", "D", "E", "F", "G")
uniqClu <- levels(factor(cluster$cluster))
cluster$cluster <- letter[match(cluster$cluster, uniqClu)]
clusterOut <- rbind(ID = colnames(cluster), cluster)
write.table(clusterOut, file = "Cluster.txt", sep = "\t", quote = FALSE, col.names = FALSE)

# Load required libraries for survival analysis
library(survival)
library(survminer)
library(tidyverse)

clusterFile <- "Cluster.txt"     # Cluster file
cliFile <- "clinicaldata.txt"    # Clinical data file

# Read the cluster file and preprocess it
cluster <- read.table(clusterFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
rownames(cluster) <- str_replace_all(rownames(cluster), "TCGA_", "")
rownames(cluster) <- str_replace_all(rownames(cluster), "GSE17538_", "")
rownames(cluster) <- str_replace_all(rownames(cluster), "GSE39582_", "")

# Read the clinical data file
cli <- read.table(cliFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
cli <- cli[, 1:2]
cli$futime <- cli$futime / 365

# Merge the cluster and clinical data
sameSample <- intersect(row.names(cluster), row.names(cli))
rt <- cbind(cli[sameSample, , drop = FALSE], cluster[sameSample, , drop = FALSE])

# Perform survival analysis
length <- length(levels(factor(rt$cluster)))
diff <- survdiff(Surv(futime, fustat) ~ cluster, data = rt)
pValue <- 1 - pchisq(diff$chisq, df = length - 1)
if (pValue < 0.001) {
  pValue <- "p<0.001"
} else {
  pValue <- paste0("p=", sprintf("%.03f", pValue))
}
fit <- survfit(Surv(futime, fustat) ~ cluster, data = rt)

# Generate survival plots
bioCol <- c("#0066FF", "#FF9900", "#FF0000", "#6E568C", "#7CC767", "#223D6C", "#D20A13", "#FFD121", "#088247", "#11AA4D")
bioCol <- bioCol[1:length]
surPlot <- ggsurvplot(fit,
                      data = rt,
                      conf.int = FALSE,
                      pval = pValue,
                      pval.size = 6,
                      legend.title = "cluster",
                      legend.labs = levels(factor(rt[,"cluster"])),
                      legend = c(0.8, 0.8),
                      font.legend = 10,
                      xlab = "Time (years)",
                      break.time.by = 1,
                      palette = bioCol,
                      surv.median.line = "hv",
                      risk.table = TRUE,
                      cumevents = FALSE,
                      risk.table.height = 0.25)

# Print the survival plot and save it as a PDF file
print(surPlot)
pdf(file = "survival.pdf", onefile = FALSE, width = 7, height = 5.5)
print(surPlot)
dev.off()