# Clear the environment to avoid conflicts with previous variables
rm(list = ls()) 

# Set global options to prevent automatic conversion of strings to factors
options(stringsAsFactors = FALSE)

# Load the GEOquery library to access GEO datasets
library(GEOquery)

# Define the GEO dataset ID to be retrieved
gse <- "GSE55098"  

# Download the GEO dataset to the current directory without platform information
eSet <- getGEO(gse, destdir = '.', getGPL = FALSE)

# Display the class of the retrieved object and the number of datasets (if multiple)
class(eSet)  
length(eSet)  

# (1) Extract the expression matrix from the first dataset
exp <- exprs(eSet[[1]])  
# Preview the first 4x4 block of the expression matrix
exp[1:4, 1:4]  

# Check the range of expression values to identify potential normalization issues
range(exp)  

# Log2 transformation to normalize expression values and reduce variance
exp <- log2(exp + 1)  

# (2) Extract clinical information (metadata) associated with the samples
pd <- pData(eSet[[1]])  
# Ensure the row names of the clinical data align with the column names of the expression matrix
rownames(pd)  
colnames(exp)  

# (3) Check if column names (samples) in the expression matrix match row names in clinical data
p <- identical(rownames(pd), colnames(exp)); p  
# If not identical, reorder the expression matrix columns to match the clinical data
if (!p) exp <- exp[, match(rownames(pd), colnames(exp))]  

# (4) Extract the annotation platform used for this dataset
gpl <- eSet[[1]]@annotation  
print(gpl)  # Print the platform annotation for reference

# Load the stringr library for string matching operations
library(stringr)

# Assign sample groups based on the title column in the clinical data
group_list <- ifelse(str_detect(pd$title, "control"), "control", "T1D")
print(group_list)  # Display the assigned groups for validation

# Set the reference level of the group factor for downstream statistical analysis
group_list <- factor(group_list, levels = c("control", "T1D"))
print(group_list)  # Verify the factor levels

# Ensure the Bioconductor package 'hgu133plus2.db' is installed and loaded
if (!require(hgu133plus2.db)) BiocManager::install("hgu133plus2.db")
library(hgu133plus2.db)

# List available objects in the 'hgu133plus2.db' package to explore annotation options
ls("package:hgu133plus2.db")

# Extract probe-to-gene mappings using the SYMBOL mapping
ids <- toTable(hgu133plus2SYMBOL)  

# Check for duplicated gene symbols in the mapping table
any(duplicated(ids$symbol))  # TRUE/FALSE if duplicates exist

# View the dimensions of the expression matrix and the probe ID mapping
dim(exp)  # Number of probes (rows) and samples (columns) in the dataset
nrow(ids)  # Number of unique gene-symbol mappings

# Plot a boxplot to visualize the first row of clinical data grouped by sample groups
boxplot(pd[1, ] ~ group_list)

# Transpose the expression matrix to align samples as rows for PCA
dat <- as.data.frame(t(exp))  
dat[1:4, 1:4]  # Preview the first 4x4 block of the transposed matrix

# Load the necessary libraries for PCA and visualization
library(FactoMineR)
library(factoextra)

# Perform Principal Component Analysis (PCA) on the transposed data
dat.pca <- PCA(dat, graph = FALSE)

# Create a PCA plot with sample points colored by their group
pca_plot <- fviz_pca_ind(dat.pca,
                         geom.ind = "point",  # Represent samples as points
                         col.ind = group_list,  # Color points by group
                         #palette = c("#00AFBB", "#E7B800"),  # Optional color palette
                         addEllipses = TRUE,  # Add ellipses for each group
                         legend.title = "Groups")

# Customize the PCA plot with a serif font and increased text size
pca_plot + theme(text = element_text(family = "serif", size = 12))

# Save the PCA plot as a PNG file
ggsave(plot = pca_plot, filename = paste0(gse, "PCA55098.png"))

# Save the PCA plot object for future use
save(pca_plot, file = "pca_plot55098.Rdata")

# Generate a heatmap of the top 1000 most variable genes
cg <- names(tail(sort(apply(exp, 1, sd)), 1000))  # Identify top 1000 variable genes
n <- exp[cg, ]  # Subset the expression matrix

# Create an annotation data frame with sample groups for the heatmap
annotation_col <- data.frame(group = group_list)
rownames(annotation_col) <- colnames(n)  # Ensure row names align with sample names

# Load the pheatmap library and generate the heatmap
library(pheatmap)
pheatmap(n,
         show_colnames = FALSE,  # Hide column names for readability
         show_rownames = FALSE,  # Hide row names for readability
         annotation_col = annotation_col,  # Add group annotation to columns
         scale = "row")  # Scale expression values by row (gene)

# Close the graphics device
dev.off()

# Calculate the correlation matrix for the full expression matrix
co <- cor(exp)

# Generate a heatmap of the correlation matrix
pheatmap(co)

# Perform differential expression analysis using the limma package
library(limma)

# Create a design matrix for linear modeling based on group labels
design <- model.matrix(~group_list)

# Fit a linear model to the expression data
fit <- lmFit(exp, design)

# Apply empirical Bayes smoothing to the model fit
fit <- eBayes(fit)

# Extract all differentially expressed genes (DEGs)
deg <- topTable(fit, coef = 2, number = Inf)

# Load required libraries
library(dplyr)  # Data manipulation
library(ggplot2)  # Visualization
library(clusterProfiler)  # Enrichment analysis
library(org.Hs.eg.db)  # Human gene annotation database

# Add probe IDs as a column in the DEG (differential expression genes) data frame
deg <- mutate(deg, probe_id = rownames(deg))  
head(deg)  # Preview the first few rows of the modified data frame

# Join the DEG data with gene annotation data by probe ID
deg <- inner_join(deg, ids, by = "probe_id")  
# Remove duplicated gene symbols to ensure unique entries
deg <- deg[!duplicated(deg$symbol), ]  

# Define thresholds for log fold change (logFC) and p-value
logFC_t <- 0.5  # Threshold for logFC
P.Value_t <- 0.05  # Threshold for p-value

# Identify genes with significant changes in expression
k1 <- (deg$P.Value < P.Value_t) & (deg$logFC < -logFC_t)  # Down-regulated
k2 <- (deg$P.Value < P.Value_t) & (deg$logFC > logFC_t)  # Up-regulated

# Assign categories (up, down, or stable) to each gene based on thresholds
change <- ifelse(k1, "down", ifelse(k2, "up", "stable"))  
table(change)  # Display the number of genes in each category

# Add the 'change' column to the DEG data frame
deg <- mutate(deg, change)

# Convert gene symbols to ENTREZ IDs for enrichment analysis
s2e <- bitr(deg$symbol,  
            fromType = "SYMBOL",  
            toType = "ENTREZID",  
            OrgDb = org.Hs.eg.db)  

############ Batch Effect Correction with ComBat ############

# Install and load the sva package for batch correction if not installed
if (!requireNamespace("sva", quietly = TRUE)) {
  install.packages("sva")
}
library(sva)

# Ensure exp matrix and batch information are correctly aligned
# Example: Assuming 'batch_info' contains batch labels for each sample
batch_info <- c(rep(1, 20), rep(2, 30))  # Adjust according to your dataset

# Perform batch effect correction using ComBat
exp_corrected <- ComBat(dat = as.matrix(exp), 
                        batch = batch_info, 
                        mod = NULL, 
                        par.prior = TRUE, 
                        prior.plots = FALSE)

# Confirm the dimensions match after correction
dim(exp_corrected)

# 1. Volcano plot: Visualize differential expression results
dat <- deg  # Assign DEG data to a variable for plotting

# Create a basic volcano plot with logFC on the x-axis and -log10(p-value) on the y-axis
p <- ggplot(data = dat, 
            aes(x = logFC, y = -log10(P.Value))) +
  geom_point(alpha = 0.4, size = 3.5, aes(color = change)) +  # Plot points with colors based on change
  ylab("-log10(P-value)") +  # Label the y-axis
  scale_color_manual(values = c("blue", "grey", "red")) +  # Custom color palette
  geom_vline(xintercept = c(-logFC_t, logFC_t), lty = 4, col = "black", lwd = 0.8) +  # Vertical lines at logFC thresholds
  geom_hline(yintercept = -log10(P.Value_t), lty = 4, col = "black", lwd = 0.8) +  # Horizontal line at p-value threshold
  theme_bw()  # Use a clean theme for better readability
p  # Display the plot

# Highlight specific genes on the volcano plot
if (TRUE) {  
  for_label <- dat %>% filter(symbol %in% c("TRPM3", "SFRP1"))  # Select specific genes for labeling
}
View(for_label)  # Preview the labeled genes

# Optional: Select top 20 genes for further labeling
if (TRUE) {
  for_label <- dat %>% head(20)
}

# Identify top up-regulated and down-regulated genes for labeling
if (TRUE) {
  x1 <- dat %>% filter(change == "up") %>% head(3)  # Top 3 up-regulated genes
  x2 <- dat %>% filter(change == "down") %>% head(3)  # Top 3 down-regulated genes
  for_label <- rbind(x1, x2)  # Combine both sets of genes
}

# Add labels to the volcano plot for the selected genes
volcano_plot <- p +
  geom_point(size = 3, shape = 1, data = for_label) +  # Highlight labeled genes
  ggrepel::geom_label_repel(
    aes(label = symbol), family = "serif",  # Add text labels with a serif font
    data = for_label, color = "black"  # Use black color for labels
  )
volcano_plot  # Display the final volcano plot


######## Modify the font and size ########
# Adjust the font family to 'serif' and set the font size to 12 for consistent styling
volcano_plot + theme(text = element_text(family = "serif", size = 12))

# Check the class of the 'for_label' object to ensure it is properly formatted for further operations
class(for_label)

# Save the labeled differential genes (for_label) to a CSV file for record keeping
write.csv(for_label, file = "DEGs.csv")

# Save the volcano plot as PNG files with specific filenames
ggsave(plot = p, filename = paste0("volcano55098.png"))
ggsave(plot = volcano_plot, filename = paste0("volcano.png"))

######## Differential Gene Heatmap ########

# Load previously saved data from step 2 to reuse results without recomputing
load(file = 'step2output55098.Rdata')

# Select genes for the heatmap based on their change status (up/down-regulated)
if (FALSE) {  
  # Option 1: Use all non-stable genes
  cg = deg$probe_id[deg$change != "stable"]
  length(cg)  # Check the number of selected genes
} else {  
  # Option 2: Select the top 20 up-regulated and top 20 down-regulated genes
  x = deg$logFC[deg$change != "stable"]
  names(x) = deg$probe_id[deg$change != "stable"]
  cg = c(names(head(sort(x), 20)), names(tail(sort(x), 20)))  
  length(cg)  # Verify the number of selected genes
}

# Subset the expression matrix to include only selected genes
n = exp[cg, ]
dim(n)  # Check the dimensions of the subset matrix

# Read additional data from a CSV file and preview the content
library(readr)
a <- read.csv("a.csv", row.names = 1)
View(a)  # Display the data for review

######## Heatmap ########

# Load the pheatmap library for heatmap generation
library(pheatmap)

# Create an annotation data frame to label samples by their group (e.g., control, T1D)
annotation_col = data.frame(group = group_list)
rownames(annotation_col) = colnames(a)  # Ensure alignment with sample names

# Generate the heatmap and convert it to a ggplot object for customization
library(ggplotify)
heatmap_plot <- as.ggplot(
  pheatmap(a,
           show_colnames = TRUE, colnames_side = 'top',  # Show column names at the top
           show_rownames = TRUE,                         # Show row names
           scale = "row",                                # Scale data across rows (genes)
           cluster_cols = FALSE,                         # Disable column clustering
           fontfamily = "serif",                         # Set font family for the heatmap
           annotation_col = annotation_col)              # Add sample group annotations
)

# Customize the heatmap with font adjustments for uniformity
heatmap_plot + theme(text = element_text(family = "serif", size = 12))

# Display the heatmap plot
heatmap_plot

# Save the heatmap plot as a PNG file for reproducibility
ggsave(heatmap_plot, filename = paste0("heatmap55098.png"))

# Load a previously saved PCA plot object to reuse in combined plots
load("pca_plot.Rdata")

# Use the patchwork library to arrange multiple plots (PCA, volcano, and heatmap) into one figure
library(patchwork)
(pca_plot + volcano_plot + heatmap_plot) + plot_annotation(tag_levels = "A")

# Save all DEGs to a CSV file for record keeping
write.csv(deg, file = "allDEGs.csv")


####### Plots of EP-DEGs #######

# Clear the environment to avoid conflicts with previous objects
rm(list = ls()) 

# Load saved results from step 5
load(file = "step5output55098.Rdata")

# Load the EP-DEGs dataset and preview it for verification
library(readr)
EP_DEGs <- read_csv("EP-DEGs.csv")
View(EP_DEGs)  # Ensure the dataset is correctly loaded

# Load necessary libraries for plotting
library(dplyr)
library(ggplot2)

# Assign the EP-DEGs data to a variable for plotting
dat <- EP_DEGs

# Create a volcano plot of EP-DEGs
g <- ggplot(data = dat, 
            aes(x = logFC, y = -log10(P.Value))) +
  geom_point(alpha = 0.4, size = 3.5, aes(color = change)) +  # Plot points with colors indicating change
  ylab("-log10(P-value)") +  # Label the y-axis
  scale_color_manual(values = c("blue", "red")) +  # Define custom colors for changes
  geom_vline(xintercept = c(-logFC_t, logFC_t), lty = 4, col = "black", lwd = 0.8) +  # Vertical threshold lines
  geom_hline(yintercept = -log10(P.Value_t), lty = 4, col = "black", lwd = 0.8) +  # Horizontal threshold line
  theme_bw()  # Apply a clean theme for better visualization
g  # Display the plot

# Select specific genes for labeling on the volcano plot
if (TRUE) {  
  for_label <- dat %>% filter(symbol %in% c("TRPM3", "SFRP1"))
}
View(for_label)  # Verify the labeled genes

# Add labels for selected genes to the volcano plot
volcano_plot <- g +
  geom_point(size = 3, shape = 1, data = for_label) +  # Highlight labeled points
  ggrepel::geom_label_repel(
    aes(label = symbol), family = "serif",  # Add labels with serif font
    data = for_label, color = "black"
  )
volcano_plot  # Display the labeled volcano plot

# Adjust font and size for consistency
volcano_plot + theme(text = element_text(family = "serif", size = 12))

######## Heatmap of EP-DEGs ########

# Select genes for the heatmap based on differential expression
if (FALSE) {  
  # Option 1: Use all non-stable genes
  cg <- EP_DEGs$probe_id[EP_DEGs$change != "stable"]
} else {  
  # Option 2: Select top 20 up- and down-regulated genes
  x <- EP_DEGs$logFC[EP_DEGs$change != "stable"]
  names(x) <- EP_DEGs$probe_id[EP_DEGs$change != "stable"]
  cg <- c(names(head(sort(x), 20)), names(tail(sort(x), 20)))
}
n <- exp[cg, ]  # Subset the expression matrix with selected genes
dim(n)  # Verify the dimensions

# Load additional data and review it
a <- read.csv("a.csv", row.names = 1)
View(a)  # Confirm correct loading

# Generate a heatmap with annotations
library(pheatmap)
annotation_col <- data.frame(group = group_list)  # Create group annotations
rownames(annotation_col) <- colnames(a)  # Align annotation row names

# Convert heatmap to ggplot for customization
library(ggplotify)
heatmap_plot <- as.ggplot(
  pheatmap(a, show_colnames = TRUE, colnames_side = 'top', 
           show_rownames = TRUE, scale = "row", cluster_cols = FALSE, 
           fontfamily = "serif", annotation_col = annotation_col)
)
heatmap_plot + theme(text = element_text(family = "serif", size = 12))  # Adjust font
heatmap_plot  # Display the heatmap

# Save the heatmap as PNG
ggsave(heatmap_plot, filename = paste0("heatmap55098.png"))

# Load previously saved PCA plot
load("pca_plot.Rdata")

######## KEGG and GO Enrichment Analysis ########

library(clusterProfiler)
library(org.Hs.eg.db)  # Load human gene database

# Extract gene lists for GO enrichment
gene_up <- DEGs[DEGs$change == 'up', 'ENTREZID']
gene_down <- DEGs[DEGs$change == 'down', 'ENTREZID']
gene_diff <- c(gene_up, gene_down)  # Combine up and down-regulated genes
gene_all <- DEGs[, 'ENTREZID']  # All genes

# Perform GO enrichment analysis
# Adjust p-values using the Benjamini-Hochberg method (FDR correction)

ego_CC <- enrichGO(gene = gene_diff, OrgDb = org.Hs.eg.db, ont = "CC", 
                   pAdjustMethod = "BH", minGSSize = 1, pvalueCutoff = 0.05, 
                   qvalueCutoff = 0.05, readable = TRUE)

# Save GO enrichment results
save(ego_CC, file = "ego_GSE55098.Rdata")

# Bar plot of GO enrichment results
barplot(ego_CC, showCategory = 20) + 
  theme(text = element_text(family = "serif", size = 12))
ggsave(filename = "ego_CC_barplot.png")

# Dot plot of GO enrichment results
ego_CC_dotplot <- dotplot(ego_CC) + 
  theme(text = element_text(family = "serif", size = 12))
ggsave(ego_CC_dotplot, filename = "ego_CC_dotplot.png")

# Generate and save a gene-concept network plot
geneList <- DEGs$logFC
names(geneList) <- DEGs$ENTREZID
ego_CC_cnetplot <- cnetplot(ego_CC, foldChange = geneList, circular = TRUE, colorEdge = TRUE) +
  theme(text = element_text(family = "serif", size = 12))
ggsave(ego_CC_cnetplot, filename = "ego_CC_cnetplot.png")

# Heatmap-like functional classification
ego_CC_heatplot <- heatplot(ego_CC, foldChange = geneList)
ggsave(ego_CC_heatplot, filename = "ego_CC_heatplot.png")

# Save heatplot as PDF
pdf("ego_CC_heatplot.pdf", width = 14, height = 5)
heatplot(ego_CC, foldChange = geneList)
dev.off()

############ BP: Biological Process Analysis ############

# Barplot Visualization of GO Biological Process Enrichment Results
barplot(ego_BP, showCategory = 20)  # Plot top 20 enriched categories
ego_BP_barplot <- barplot(ego_BP, showCategory = 20)  # Store plot in a variable
ego_BP_barplot + theme(text = element_text(family = "serif", size = 12))  # Adjust font size
ggsave(ego_BP_barplot, filename = paste0("ego_BP_barplot.png"))  # Save as PNG

# Dotplot Visualization of GO Biological Process Results
dotplot(ego_BP)  # Generate a dot plot of enriched terms
ego_BP_dotplot <- dotplot(ego_BP)  # Store the dot plot
ego_BP_dotplot + theme(text = element_text(family = "serif", size = 12))  # Adjust styling
ggsave(ego_BP_dotplot, filename = paste0("ego_BP_dotplot.png"))  # Save the plot

# Prepare Gene List for Network Visualization
geneList <- DEGs$logFC  # Extract log fold changes for genes
names(geneList) <- DEGs$ENTREZID  # Name each value with the corresponding ENTREZ ID
geneList <- sort(geneList, decreasing = TRUE)  # Sort in descending order

# Gene-Concept Network Plot
cnetplot(ego_BP, categorySize = "pvalue", foldChange = geneList, colorEdge = TRUE)  # Standard network plot
ego_BP_cnetplot <- cnetplot(ego_BP, foldChange = geneList, circular = TRUE, colorEdge = TRUE)  # Circular layout
ego_BP_cnetplot + theme(text = element_text(family = "serif", size = 12))  # Customize font
ggsave(ego_BP_cnetplot, filename = paste0("ego_BP_cnetplot.png"))  # Save the network plot

# Enrichment Map Plot for Biological Process
emapplot(ego_BP)  # Generate enrichment map

# GOplot Visualization
ego_BP_goplot <- goplot(ego_BP)  # Generate a GO plot
ggsave(ego_BP_goplot, filename = paste0("ego_BP_goplot.png"))  # Save GO plot as PNG

# Heatmap-like Functional Classification
ego_BP_heatplot <- heatplot(ego_BP, foldChange = geneList)  # Generate heatmap plot
ggsave(ego_BP_heatplot, filename = paste0("ego_BP_heatplot.png"))  # Save heatmap as PNG

# Save Heatplot as PDF for Publication-Ready Format
pdf("ego_BP_heatplot.pdf", width = 14, height = 5)
heatplot(ego_BP, foldChange = geneList)
dev.off()

############ MF: Molecular Function Analysis ############

# Barplot of GO Molecular Function Terms
barplot(ego_MF, showCategory = 20)  # Plot top 20 enriched MF categories
ego_MF_barplot <- barplot(ego_MF, showCategory = 20)  # Store plot
ego_MF_barplot + theme(text = element_text(family = "serif", size = 12))
ggsave(ego_MF_barplot, filename = paste0("ego_MF_barplot.png"))

# Dotplot Visualization for Molecular Function
ego_MF_dotplot <- dotplot(ego_MF)
ego_MF_dotplot + theme(text = element_text(family = "serif", size = 12))
ggsave(ego_MF_dotplot, filename = paste0("ego_MF_dotplot.png"))

# Gene-Concept Network Plot for Molecular Function
ego_MF_cnetplot <- cnetplot(ego_MF, foldChange = geneList, circular = TRUE, colorEdge = TRUE)
ego_MF_cnetplot + theme(text = element_text(family = "serif", size = 12))
ggsave(ego_MF_cnetplot, filename = paste0("ego_MF_cnetplot.png"))

# Enrichment Map Plot for Molecular Function
emapplot(ego_MF)

# GOplot for Molecular Function
ego_MF_goplot <- goplot(ego_MF)
ggsave(ego_MF_goplot, filename = paste0("ego_MF_goplot.png"))

# Heatmap-like Functional Classification for Molecular Function
ego_MF_heatplot <- heatplot(ego_MF, foldChange = geneList)
ggsave(ego_MF_heatplot, filename = paste0("ego_MF_heatplot.png"))

############ KEGG Pathway Analysis ############

# Prepare Gene Lists for KEGG Enrichment Analysis
gene_up <- DEGs[DEGs$change == 'up', 'ENTREZID']
gene_down <- DEGs[DEGs$change == 'down', 'ENTREZID']
gene_diff <- c(gene_up, gene_down)  # Combine up- and down-regulated genes
gene_all <- deg[, 'ENTREZID']  # All genes for the universe set

# Perform KEGG Enrichment Analysis
kk.up <- enrichKEGG(gene = gene_up, organism = 'hsa', universe = gene_all, pvalueCutoff = 0.9, qvalueCutoff = 0.9)
kk.down <- enrichKEGG(gene = gene_down, organism = 'hsa', universe = gene_all, pvalueCutoff = 0.9, qvalueCutoff = 0.9)
kk.diff <- enrichKEGG(gene = gene_diff, organism = 'hsa', pvalueCutoff = 0.9)

# Save KEGG Results for Future Use
save(kk.diff, kk.down, kk.up, file = "GSE55098kegg.Rdata")

# Generate Combined KEGG Plot for Up and Down-regulated Genes
g_kegg <- kegg_plot(up_kegg, down_kegg)
ggsave(g_kegg, filename = 'kegg_up_down.png')

############ STRING Network Export ############

# Load DEG Data and Prepare for STRING Export
DEGs <- read.csv(file = "EP-DEGs.csv")
gene_up <- DEGs[DEGs$change == 'up', 'symbol']
gene_down <- DEGs[DEGs$change == 'down', 'symbol']
gene_diff <- c(gene_up, gene_down)

# Write Gene Lists to Files for STRING Analysis
write.table(gene_up, file = "upgene.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(gene_down, file = "downgene.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)
write.table(gene_diff, file = "diffgene.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)

# Process STRING Interaction Data for Cytoscape Export
tsv <- read.table("string_interactions_short.tsv", comment.char = "!", header = TRUE)
tsv2 <- tsv[, c(1, 2, ncol(tsv))]  # Select relevant columns
write.table(tsv2, file = "cyto.txt", sep = "\t", quote = FALSE, row.names = FALSE)

# Export DEG Information for Cytoscape
deg_info <- DEGs[DEGs$change != "stable", c("symbol", "logFC", "P.Value")]
write.table(deg_info, file = "deg.txt", sep = "\t", quote = FALSE, row.names = FALSE)