rm(list = ls())

# Load libraries
library(plyr)
library(ggplot2)
library(ggpubr)

scoreFile = "score.group.txt"    # Scoring data file
cliFile = "clinicaldata.txt"      # Clinical data file
trait = "Tomor_type"             # Clinical trait

# Read input files
score = read.table(scoreFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)
cli = read.table(cliFile, header = TRUE, sep = "\t", check.names = FALSE, row.names = 1)

# Identify common samples between scoring and clinical data
sameSample = intersect(row.names(score), row.names(cli))
rt = cbind(score[sameSample,,drop=FALSE], cli[sameSample,,drop=FALSE])

# Replace "fustat" values with "Dead" and "Alive"
rt$fustat[rt$fustat == 1] = "Dead"
rt$fustat[rt$fustat == 0] = "Alive"

# Define colors for clinical traits
bioCol = c("#0066FF","#FF0000","#FF9900","#6E568C","#7CC767","#223D6C","#D20A13","#FFD121","#088247","#11AA4D")
bioCol = bioCol[1:length(unique(rt[,trait]))]

# Count the number of patients in low and high scoring groups for each trait
rt1 = rt[,c(trait, "group")]
rt1 <- na.omit(rt1)
colnames(rt1) = c("trait", "group")
df = as.data.frame(table(rt1))

# Calculate the percentage of patients in each group
df = ddply(df, .(group), transform, percent = Freq/sum(Freq) * 100)

# Calculate position for percentage labels
df = ddply(df, .(group), transform, pos = (cumsum(Freq) - 0.5 * Freq))
df$label = paste0(sprintf("%.0f", df$percent), "%")
df$group = factor(df$group, levels = c("Low", "High"))

# Create a bar plot to visualize the percentage of patients in low and high score groups for each clinical trait
p = ggplot(df, aes(x = factor(group), y = percent, fill = trait)) +
  geom_bar(position = position_stack(), stat = "identity", width = 0.7) +
  scale_fill_manual(values = bioCol) +
  xlab("score") + ylab("Percent weight") +  guides(fill = guide_legend(title = trait)) +
  geom_text(aes(label = label), position = position_stack(vjust = 0.5), size = 3) +
  theme_bw()

# Save the bar plot to a PDF file
pdf(file = "Tomor_type_barplot.pdf", width = 4, height = 5)
print(p)
dev.off()

# Set up comparisons between clinical traits
rt2 = rt[,c(trait, "score")]
rt2 <- na.omit(rt2)
colnames(rt2) = c("trait", "score")
type = levels(factor(rt2[,"trait"]))
rt2$trait = factor(rt2$trait, levels = type)
comp = combn(type, 2)
my_comparisons = list()
for(i in 1:ncol(comp)){
  my_comparisons[[i]] = comp[,i]
}

# Create a box plot to visualize the distribution of scores for different traits and include trait group comparisons
boxplot = ggboxplot(rt2, x = "trait", y = "score", fill = "trait",
                    xlab = trait,
                    ylab = "score",
                    legend.title = trait,
                    palette = bioCol
) + 
  stat_compare_means(comparisons = my_comparisons)

# Save the box plot to a PDF file
pdf(file = "Tomor_type_boxplot.pdf", width = 5, height = 4.5)
print(boxplot)
dev.off()

# Print column names of the 'rt' data frame
colnames(rt)