##########################################################################################
#                          R code to run GOSemSim for enrichment analysis                #
##########################################################################################
# Install (only when missing) and load the required packages.
# - GOSemSim:      GO semantic similarity between genes
# - AnnotationDbi: keys()/select() accessors for annotation databases
# - org.Hs.eg.db:  human gene <-> GO annotation used by both steps below
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
required_pkgs <- c("GOSemSim", "AnnotationDbi", "org.Hs.eg.db")
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1), quietly = TRUE
)
if (!all(is_installed)) {
  # Installing only what is missing avoids a reinstall on every run.
  BiocManager::install(required_pkgs[!is_installed])
}
library(GOSemSim)
library(AnnotationDbi)
library(org.Hs.eg.db)

# Analysis settings
ontology <- "BP"  # You can change to "CC" or "MF" for different ontologies
pvalue_cutoff <- 0.05  # threshold on the raw hypergeometric p-value
qvalue_cutoff <- 0.1   # threshold on the BH-adjusted p-value

# Load gene list (one column of gene symbols, no header)
gene_list <- read.table(
  "cytokine-receptor_84_list.txt",
  header = FALSE, sep = "\t", stringsAsFactors = FALSE
)
# as.character() guards against factor columns; duplicates and empty entries
# would otherwise distort the enrichment counts.
gene_list <- unique(trimws(as.character(gene_list$V1)))
gene_list <- gene_list[nzchar(gene_list)]

# Report symbols the annotation package does not know instead of silently
# dropping them later.
known_symbols <- AnnotationDbi::keys(org.Hs.eg.db, keytype = "SYMBOL")
unknown_symbols <- setdiff(gene_list, known_symbols)
if (length(unknown_symbols) > 0) {
  warning(
    "Symbols not found in org.Hs.eg.db: ",
    paste(unknown_symbols, collapse = ", "),
    call. = FALSE
  )
}

# Gene -> GO mapping for every annotated human gene. "GOALL" includes
# annotations inherited from descendant terms, which is what an
# over-representation test needs. This is the background (universe).
go_annotation <- AnnotationDbi::select(
  org.Hs.eg.db,
  keys = known_symbols,
  columns = "GOALL",
  keytype = "SYMBOL"
)
# %in% never yields NA, so rows without annotation are dropped cleanly.
in_ontology <- go_annotation$ONTOLOGYALL %in% ontology
go_annotation <- unique(go_annotation[in_ontology, c("SYMBOL", "GOALL")])

# GO annotation restricted to the genes of interest
godb <- go_annotation[go_annotation$SYMBOL %in% gene_list, , drop = FALSE]

universe <- unique(go_annotation$SYMBOL)
study_genes <- intersect(gene_list, universe)
if (length(study_genes) == 0) {
  stop(
    "None of the input genes has a ", ontology, " annotation.",
    call. = FALSE
  )
}

# Calculate pairwise semantic similarity between the genes.
# godata() builds the semantic data object; its first argument is passed
# positionally because its name differs between GOSemSim releases.
# computeIC = FALSE is sufficient for the graph-based "Wang" measure.
sem_data <- GOSemSim::godata(
  "org.Hs.eg.db",
  keytype = "SYMBOL", ont = ontology, computeIC = FALSE
)
sim_matrix <- GOSemSim::mgeneSim(
  genes = study_genes,
  semData = sem_data,
  measure = "Wang",
  combine = "BMA",
  verbose = FALSE
)

# Perform enrichment analysis (over-representation, hypergeometric test).
# GOSemSim itself does not provide enrichment testing, so the test is done
# with base R on the annotation table built above.
universe_size <- length(universe)
study_size <- length(study_genes)
term_size <- table(go_annotation$GOALL)  # genes per term in the universe
study_hits <- table(godb$GOALL)          # genes per term in the gene list

hit_terms <- names(study_hits)
hit_count <- as.integer(study_hits)
term_count <- as.integer(term_size[hit_terms])
# P(X >= hit_count): hence hit_count - 1 with lower.tail = FALSE.
p_values <- stats::phyper(
  hit_count - 1L, term_count, universe_size - term_count, study_size,
  lower.tail = FALSE
)

enrich_result <- data.frame(
  ID = hit_terms,
  Count = hit_count,
  TermSize = term_count,
  GeneRatio = hit_count / study_size,
  BgRatio = term_count / universe_size,
  pvalue = p_values,
  p_adjust = stats::p.adjust(p_values, method = "BH"),
  stringsAsFactors = FALSE
)
is_significant <- enrich_result$pvalue <= pvalue_cutoff &
  enrich_result$p_adjust <= qvalue_cutoff
enrich_result <- enrich_result[is_significant, , drop = FALSE]
enrich_result <- enrich_result[order(enrich_result$pvalue), , drop = FALSE]
rownames(enrich_result) <- NULL

# View the enriched GO terms (most significant first)
head(enrich_result)

##########################################################################################
#         R code to use TCGAbiolinks for the TCGA pan-cancer SNV mutational analysis     #
##########################################################################################

# Install (only when missing) and load the required packages.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
if (!requireNamespace("TCGAbiolinks", quietly = TRUE)) {
  # Installing only when missing avoids a reinstall on every run.
  BiocManager::install("TCGAbiolinks")
}
library(TCGAbiolinks)

# Directory where the TCGA data is downloaded and stored. It is passed to
# GDCdownload()/GDCprepare() explicitly instead of calling setwd(), so the
# relative path of the gene list below keeps resolving against the directory
# the script was started from.
tcga_dir <- file.path("TCGAbiolinks", "GDCdata")
dir.create(tcga_dir, recursive = TRUE, showWarnings = FALSE)

# "Pan-cancer" is not a single GDC project; query every TCGA-* project.
tcga_projects <- grep(
  "^TCGA-", TCGAbiolinks::getGDCprojects()$project_id,
  value = TRUE
)
if (length(tcga_projects) == 0) {
  stop("No TCGA projects returned by the GDC.", call. = FALSE)
}

# Workflow name of the open-access masked MAF files in current GDC data
# releases. NOTE(review): older releases used
# "MuTect2 Variant Aggregation and Masking" - adjust if you query those.
maf_workflow <- "Aliquot Ensemble Somatic Variant Merging and Masking"

# Load the TCGA data
query <- GDCquery(
  project = tcga_projects,
  data.category = "Simple Nucleotide Variation",
  data.type = "Masked Somatic Mutation",
  workflow.type = maf_workflow,
  access = "open"
)

# Download the data
GDCdownload(query, directory = tcga_dir)

# Prepare the data for analysis (MAF-format table, one row per mutation)
mutation_data <- GDCprepare(query, directory = tcga_dir)

maf_columns <- c("Hugo_Symbol", "Tumor_Sample_Barcode")
missing_columns <- setdiff(maf_columns, names(mutation_data))
if (length(missing_columns) > 0) {
  stop(
    "Mutation data lacks expected MAF column(s): ",
    paste(missing_columns, collapse = ", "),
    call. = FALSE
  )
}

# Read the cytokine-receptor gene list (one column of gene symbols)
gene_list <- read.table(
  "cytokine-receptor_84_list.txt",
  header = FALSE, sep = "\t", stringsAsFactors = FALSE
)
gene_list <- unique(trimws(as.character(gene_list$V1)))
gene_list <- gene_list[nzchar(gene_list)]

# Subset the mutation data for the specified genes.
# MAF files identify genes by HGNC symbol in the Hugo_Symbol column.
in_gene_list <- mutation_data$Hugo_Symbol %in% gene_list
subset_data <- mutation_data[in_gene_list, , drop = FALSE]

# Perform mutational analysis on the subset_data
# You can calculate mutation frequencies, plot mutational spectra, etc.

# For example, to calculate mutation frequencies: the fraction of samples
# carrying at least one mutation in each gene. The denominator is the number
# of samples present in the MAF data.
total_samples <- length(unique(mutation_data$Tumor_Sample_Barcode))
if (total_samples == 0) {
  stop("No samples found in the mutation data.", call. = FALSE)
}
# Count each gene/sample pair once, however many mutations it has.
gene_sample <- unique(as.data.frame(subset_data)[, maf_columns, drop = FALSE])
# factor(levels = gene_list) keeps genes with zero mutated samples.
mutated_samples <- table(factor(gene_sample$Hugo_Symbol, levels = gene_list))
mutation_freq <- data.frame(
  gene_id = names(mutated_samples),
  n_samples = as.integer(mutated_samples),
  freq = as.integer(mutated_samples) / total_samples,
  stringsAsFactors = FALSE
)
mutation_freq <- mutation_freq[
  order(mutation_freq$freq, decreasing = TRUE), ,
  drop = FALSE
]
rownames(mutation_freq) <- NULL

# To visualize the mutation frequencies
barplot(
  mutation_freq$freq,
  names.arg = mutation_freq$gene_id,
  las = 2, cex.names = 0.7,
  ylab = "Fraction of samples mutated"
)