# Clear the environment to avoid conflicts with previous variables rm(list = ls()) # Set global options to prevent automatic conversion of strings to factors options(stringsAsFactors = FALSE) # Load the GEOquery library to access GEO datasets library(GEOquery) # Define the GEO dataset ID to be retrieved gse <- "GSE55098" # Download the GEO dataset to the current directory without platform information eSet <- getGEO(gse, destdir = '.', getGPL = FALSE) # Display the class of the retrieved object and the number of datasets (if multiple) class(eSet) length(eSet) # (1) Extract the expression matrix from the first dataset exp <- exprs(eSet[[1]]) # Preview the first 4x4 block of the expression matrix exp[1:4, 1:4] # Check the range of expression values to identify potential normalization issues range(exp) # Log2 transformation to normalize expression values and reduce variance exp <- log2(exp + 1) # (2) Extract clinical information (metadata) associated with the samples pd <- pData(eSet[[1]]) # Ensure the row names of the clinical data align with the column names of the expression matrix rownames(pd) colnames(exp) # (3) Check if column names (samples) in the expression matrix match row names in clinical data p <- identical(rownames(pd), colnames(exp)); p # If not identical, reorder the expression matrix columns to match the clinical data if (!p) exp <- exp[, match(rownames(pd), colnames(exp))] # (4) Extract the annotation platform used for this dataset gpl <- eSet[[1]]@annotation print(gpl) # Print the platform annotation for reference # Load the stringr library for string matching operations library(stringr) # Assign sample groups based on the title column in the clinical data group_list <- ifelse(str_detect(pd$title, "control"), "control", "T1D") print(group_list) # Display the assigned groups for validation # Set the reference level of the group factor for downstream statistical analysis group_list <- factor(group_list, levels = c("control", "T1D")) print(group_list) # Verify the factor levels # Ensure the Bioconductor package 'hgu133plus2.db' is installed and loaded if (!require(hgu133plus2.db)) BiocManager::install("hgu133plus2.db") library(hgu133plus2.db) # List available objects in the 'hgu133plus2.db' package to explore annotation options ls("package:hgu133plus2.db") # Extract probe-to-gene mappings using the SYMBOL mapping ids <- toTable(hgu133plus2SYMBOL) # Check for duplicated gene symbols in the mapping table any(duplicated(ids$symbol)) # TRUE/FALSE if duplicates exist # View the dimensions of the expression matrix and the probe ID mapping dim(exp) # Number of probes (rows) and samples (columns) in the dataset nrow(ids) # Number of unique gene-symbol mappings # Plot a boxplot to visualize the first row of clinical data grouped by sample groups boxplot(pd[1, ] ~ group_list) # Transpose the expression matrix to align samples as rows for PCA dat <- as.data.frame(t(exp)) dat[1:4, 1:4] # Preview the first 4x4 block of the transposed matrix # Load the necessary libraries for PCA and visualization library(FactoMineR) library(factoextra) # Perform Principal Component Analysis (PCA) on the transposed data dat.pca <- PCA(dat, graph = FALSE) # Create a PCA plot with sample points colored by their group pca_plot <- fviz_pca_ind(dat.pca, geom.ind = "point", # Represent samples as points col.ind = group_list, # Color points by group #palette = c("#00AFBB", "#E7B800"), # Optional color palette addEllipses = TRUE, # Add ellipses for each group legend.title = "Groups") # Customize the PCA plot with a serif font and increased text size pca_plot + theme(text = element_text(family = "serif", size = 12)) # Save the PCA plot as a PNG file ggsave(plot = pca_plot, filename = paste0(gse, "PCA55098.png")) # Save the PCA plot object for future use save(pca_plot, file = "pca_plot55098.Rdata") # Generate a heatmap of the top 1000 most variable genes cg <- names(tail(sort(apply(exp, 1, sd)), 1000)) # Identify top 1000 variable genes n <- exp[cg, ] # Subset the expression matrix # Create an annotation data frame with sample groups for the heatmap annotation_col <- data.frame(group = group_list) rownames(annotation_col) <- colnames(n) # Ensure row names align with sample names # Load the pheatmap library and generate the heatmap library(pheatmap) pheatmap(n, show_colnames = FALSE, # Hide column names for readability show_rownames = FALSE, # Hide row names for readability annotation_col = annotation_col, # Add group annotation to columns scale = "row") # Scale expression values by row (gene) # Close the graphics device dev.off() # Calculate the correlation matrix for the full expression matrix co <- cor(exp) # Generate a heatmap of the correlation matrix pheatmap(co) # Perform differential expression analysis using the limma package library(limma) # Create a design matrix for linear modeling based on group labels design <- model.matrix(~group_list) # Fit a linear model to the expression data fit <- lmFit(exp, design) # Apply empirical Bayes smoothing to the model fit fit <- eBayes(fit) # Extract all differentially expressed genes (DEGs) deg <- topTable(fit, coef = 2, number = Inf) # Load required libraries library(dplyr) # Data manipulation library(ggplot2) # Visualization library(clusterProfiler) # Enrichment analysis library(org.Hs.eg.db) # Human gene annotation database # Add probe IDs as a column in the DEG (differential expression genes) data frame deg <- mutate(deg, probe_id = rownames(deg)) head(deg) # Preview the first few rows of the modified data frame # Join the DEG data with gene annotation data by probe ID deg <- inner_join(deg, ids, by = "probe_id") # Remove duplicated gene symbols to ensure unique entries deg <- deg[!duplicated(deg$symbol), ] # Define thresholds for log fold change (logFC) and p-value logFC_t <- 0.5 # Threshold for logFC P.Value_t <- 0.05 # Threshold for p-value # Identify genes with significant changes in expression k1 <- (deg$P.Value < P.Value_t) & (deg$logFC < -logFC_t) # Down-regulated k2 <- (deg$P.Value < P.Value_t) & (deg$logFC > logFC_t) # Up-regulated # Assign categories (up, down, or stable) to each gene based on thresholds change <- ifelse(k1, "down", ifelse(k2, "up", "stable")) table(change) # Display the number of genes in each category # Add the 'change' column to the DEG data frame deg <- mutate(deg, change) # Convert gene symbols to ENTREZ IDs for enrichment analysis s2e <- bitr(deg$symbol, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = org.Hs.eg.db) ############ Batch Effect Correction with ComBat ############ # Install and load the sva package for batch correction if not installed if (!requireNamespace("sva", quietly = TRUE)) { install.packages("sva") } library(sva) # Ensure exp matrix and batch information are correctly aligned # Example: Assuming 'batch_info' contains batch labels for each sample batch_info <- c(rep(1, 20), rep(2, 30)) # Adjust according to your dataset # Perform batch effect correction using ComBat exp_corrected <- ComBat(dat = as.matrix(exp), batch = batch_info, mod = NULL, par.prior = TRUE, prior.plots = FALSE) # Confirm the dimensions match after correction dim(exp_corrected) # 1. Volcano plot: Visualize differential expression results dat <- deg # Assign DEG data to a variable for plotting # Create a basic volcano plot with logFC on the x-axis and -log10(p-value) on the y-axis p <- ggplot(data = dat, aes(x = logFC, y = -log10(P.Value))) + geom_point(alpha = 0.4, size = 3.5, aes(color = change)) + # Plot points with colors based on change ylab("-log10(P-value)") + # Label the y-axis scale_color_manual(values = c("blue", "grey", "red")) + # Custom color palette geom_vline(xintercept = c(-logFC_t, logFC_t), lty = 4, col = "black", lwd = 0.8) + # Vertical lines at logFC thresholds geom_hline(yintercept = -log10(P.Value_t), lty = 4, col = "black", lwd = 0.8) + # Horizontal line at p-value threshold theme_bw() # Use a clean theme for better readability p # Display the plot # Highlight specific genes on the volcano plot if (TRUE) { for_label <- dat %>% filter(symbol %in% c("TRPM3", "SFRP1")) # Select specific genes for labeling } View(for_label) # Preview the labeled genes # Optional: Select top 20 genes for further labeling if (TRUE) { for_label <- dat %>% head(20) } # Identify top up-regulated and down-regulated genes for labeling if (TRUE) { x1 <- dat %>% filter(change == "up") %>% head(3) # Top 3 up-regulated genes x2 <- dat %>% filter(change == "down") %>% head(3) # Top 3 down-regulated genes for_label <- rbind(x1, x2) # Combine both sets of genes } # Add labels to the volcano plot for the selected genes volcano_plot <- p + geom_point(size = 3, shape = 1, data = for_label) + # Highlight labeled genes ggrepel::geom_label_repel( aes(label = symbol), family = "serif", # Add text labels with a serif font data = for_label, color = "black" # Use black color for labels ) volcano_plot # Display the final volcano plot ######## Modify the font and size ######## # Adjust the font family to 'serif' and set the font size to 12 for consistent styling volcano_plot + theme(text = element_text(family = "serif", size = 12)) # Check the class of the 'for_label' object to ensure it is properly formatted for further operations class(for_label) # Save the labeled differential genes (for_label) to a CSV file for record keeping write.csv(for_label, file = "DEGs.csv") # Save the volcano plot as PNG files with specific filenames ggsave(plot = p, filename = paste0("volcano55098.png")) ggsave(plot = volcano_plot, filename = paste0("volcano.png")) ######## Differential Gene Heatmap ######## # Load previously saved data from step 2 to reuse results without recomputing load(file = 'step2output55098.Rdata') # Select genes for the heatmap based on their change status (up/down-regulated) if (FALSE) { # Option 1: Use all non-stable genes cg = deg$probe_id[deg$change != "stable"] length(cg) # Check the number of selected genes } else { # Option 2: Select the top 20 up-regulated and top 20 down-regulated genes x = deg$logFC[deg$change != "stable"] names(x) = deg$probe_id[deg$change != "stable"] cg = c(names(head(sort(x), 20)), names(tail(sort(x), 20))) length(cg) # Verify the number of selected genes } # Subset the expression matrix to include only selected genes n = exp[cg, ] dim(n) # Check the dimensions of the subset matrix # Read additional data from a CSV file and preview the content library(readr) a <- read.csv("a.csv", row.names = 1) View(a) # Display the data for review ######## Heatmap ######## # Load the pheatmap library for heatmap generation library(pheatmap) # Create an annotation data frame to label samples by their group (e.g., control, T1D) annotation_col = data.frame(group = group_list) rownames(annotation_col) = colnames(a) # Ensure alignment with sample names # Generate the heatmap and convert it to a ggplot object for customization library(ggplotify) heatmap_plot <- as.ggplot( pheatmap(a, show_colnames = TRUE, colnames_side = 'top', # Show column names at the top show_rownames = TRUE, # Show row names scale = "row", # Scale data across rows (genes) cluster_cols = FALSE, # Disable column clustering fontfamily = "serif", # Set font family for the heatmap annotation_col = annotation_col) # Add sample group annotations ) # Customize the heatmap with font adjustments for uniformity heatmap_plot + theme(text = element_text(family = "serif", size = 12)) # Display the heatmap plot heatmap_plot # Save the heatmap plot as a PNG file for reproducibility ggsave(heatmap_plot, filename = paste0("heatmap55098.png")) # Load a previously saved PCA plot object to reuse in combined plots load("pca_plot.Rdata") # Use the patchwork library to arrange multiple plots (PCA, volcano, and heatmap) into one figure library(patchwork) (pca_plot + volcano_plot + heatmap_plot) + plot_annotation(tag_levels = "A") # Save all DEGs to a CSV file for record keeping write.csv(deg, file = "allDEGs.csv") ####### Plots of EP-DEGs ####### # Clear the environment to avoid conflicts with previous objects rm(list = ls()) # Load saved results from step 5 load(file = "step5output55098.Rdata") # Load the EP-DEGs dataset and preview it for verification library(readr) EP_DEGs <- read_csv("EP-DEGs.csv") View(EP_DEGs) # Ensure the dataset is correctly loaded # Load necessary libraries for plotting library(dplyr) library(ggplot2) # Assign the EP-DEGs data to a variable for plotting dat <- EP_DEGs # Create a volcano plot of EP-DEGs g <- ggplot(data = dat, aes(x = logFC, y = -log10(P.Value))) + geom_point(alpha = 0.4, size = 3.5, aes(color = change)) + # Plot points with colors indicating change ylab("-log10(P-value)") + # Label the y-axis scale_color_manual(values = c("blue", "red")) + # Define custom colors for changes geom_vline(xintercept = c(-logFC_t, logFC_t), lty = 4, col = "black", lwd = 0.8) + # Vertical threshold lines geom_hline(yintercept = -log10(P.Value_t), lty = 4, col = "black", lwd = 0.8) + # Horizontal threshold line theme_bw() # Apply a clean theme for better visualization g # Display the plot # Select specific genes for labeling on the volcano plot if (TRUE) { for_label <- dat %>% filter(symbol %in% c("TRPM3", "SFRP1")) } View(for_label) # Verify the labeled genes # Add labels for selected genes to the volcano plot volcano_plot <- g + geom_point(size = 3, shape = 1, data = for_label) + # Highlight labeled points ggrepel::geom_label_repel( aes(label = symbol), family = "serif", # Add labels with serif font data = for_label, color = "black" ) volcano_plot # Display the labeled volcano plot # Adjust font and size for consistency volcano_plot + theme(text = element_text(family = "serif", size = 12)) ######## Heatmap of EP-DEGs ######## # Select genes for the heatmap based on differential expression if (FALSE) { # Option 1: Use all non-stable genes cg <- EP_DEGs$probe_id[EP_DEGs$change != "stable"] } else { # Option 2: Select top 20 up- and down-regulated genes x <- EP_DEGs$logFC[EP_DEGs$change != "stable"] names(x) <- EP_DEGs$probe_id[EP_DEGs$change != "stable"] cg <- c(names(head(sort(x), 20)), names(tail(sort(x), 20))) } n <- exp[cg, ] # Subset the expression matrix with selected genes dim(n) # Verify the dimensions # Load additional data and review it a <- read.csv("a.csv", row.names = 1) View(a) # Confirm correct loading # Generate a heatmap with annotations library(pheatmap) annotation_col <- data.frame(group = group_list) # Create group annotations rownames(annotation_col) <- colnames(a) # Align annotation row names # Convert heatmap to ggplot for customization library(ggplotify) heatmap_plot <- as.ggplot( pheatmap(a, show_colnames = TRUE, colnames_side = 'top', show_rownames = TRUE, scale = "row", cluster_cols = FALSE, fontfamily = "serif", annotation_col = annotation_col) ) heatmap_plot + theme(text = element_text(family = "serif", size = 12)) # Adjust font heatmap_plot # Display the heatmap # Save the heatmap as PNG ggsave(heatmap_plot, filename = paste0("heatmap55098.png")) # Load previously saved PCA plot load("pca_plot.Rdata") ######## KEGG and GO Enrichment Analysis ######## library(clusterProfiler) library(org.Hs.eg.db) # Load human gene database # Extract gene lists for GO enrichment gene_up <- DEGs[DEGs$change == 'up', 'ENTREZID'] gene_down <- DEGs[DEGs$change == 'down', 'ENTREZID'] gene_diff <- c(gene_up, gene_down) # Combine up and down-regulated genes gene_all <- DEGs[, 'ENTREZID'] # All genes # Perform GO enrichment analysis # Adjust p-values using the Benjamini-Hochberg method (FDR correction) ego_CC <- enrichGO(gene = gene_diff, OrgDb = org.Hs.eg.db, ont = "CC", pAdjustMethod = "BH", minGSSize = 1, pvalueCutoff = 0.05, qvalueCutoff = 0.05, readable = TRUE) # Save GO enrichment results save(ego_CC, file = "ego_GSE55098.Rdata") # Bar plot of GO enrichment results barplot(ego_CC, showCategory = 20) + theme(text = element_text(family = "serif", size = 12)) ggsave(filename = "ego_CC_barplot.png") # Dot plot of GO enrichment results ego_CC_dotplot <- dotplot(ego_CC) + theme(text = element_text(family = "serif", size = 12)) ggsave(ego_CC_dotplot, filename = "ego_CC_dotplot.png") # Generate and save a gene-concept network plot geneList <- DEGs$logFC names(geneList) <- DEGs$ENTREZID ego_CC_cnetplot <- cnetplot(ego_CC, foldChange = geneList, circular = TRUE, colorEdge = TRUE) + theme(text = element_text(family = "serif", size = 12)) ggsave(ego_CC_cnetplot, filename = "ego_CC_cnetplot.png") # Heatmap-like functional classification ego_CC_heatplot <- heatplot(ego_CC, foldChange = geneList) ggsave(ego_CC_heatplot, filename = "ego_CC_heatplot.png") # Save heatplot as PDF pdf("ego_CC_heatplot.pdf", width = 14, height = 5) heatplot(ego_CC, foldChange = geneList) dev.off() ############ BP: Biological Process Analysis ############ # Barplot Visualization of GO Biological Process Enrichment Results barplot(ego_BP, showCategory = 20) # Plot top 20 enriched categories ego_BP_barplot <- barplot(ego_BP, showCategory = 20) # Store plot in a variable ego_BP_barplot + theme(text = element_text(family = "serif", size = 12)) # Adjust font size ggsave(ego_BP_barplot, filename = paste0("ego_BP_barplot.png")) # Save as PNG # Dotplot Visualization of GO Biological Process Results dotplot(ego_BP) # Generate a dot plot of enriched terms ego_BP_dotplot <- dotplot(ego_BP) # Store the dot plot ego_BP_dotplot + theme(text = element_text(family = "serif", size = 12)) # Adjust styling ggsave(ego_BP_dotplot, filename = paste0("ego_BP_dotplot.png")) # Save the plot # Prepare Gene List for Network Visualization geneList <- DEGs$logFC # Extract log fold changes for genes names(geneList) <- DEGs$ENTREZID # Name each value with the corresponding ENTREZ ID geneList <- sort(geneList, decreasing = TRUE) # Sort in descending order # Gene-Concept Network Plot cnetplot(ego_BP, categorySize = "pvalue", foldChange = geneList, colorEdge = TRUE) # Standard network plot ego_BP_cnetplot <- cnetplot(ego_BP, foldChange = geneList, circular = TRUE, colorEdge = TRUE) # Circular layout ego_BP_cnetplot + theme(text = element_text(family = "serif", size = 12)) # Customize font ggsave(ego_BP_cnetplot, filename = paste0("ego_BP_cnetplot.png")) # Save the network plot # Enrichment Map Plot for Biological Process emapplot(ego_BP) # Generate enrichment map # GOplot Visualization ego_BP_goplot <- goplot(ego_BP) # Generate a GO plot ggsave(ego_BP_goplot, filename = paste0("ego_BP_goplot.png")) # Save GO plot as PNG # Heatmap-like Functional Classification ego_BP_heatplot <- heatplot(ego_BP, foldChange = geneList) # Generate heatmap plot ggsave(ego_BP_heatplot, filename = paste0("ego_BP_heatplot.png")) # Save heatmap as PNG # Save Heatplot as PDF for Publication-Ready Format pdf("ego_BP_heatplot.pdf", width = 14, height = 5) heatplot(ego_BP, foldChange = geneList) dev.off() ############ MF: Molecular Function Analysis ############ # Barplot of GO Molecular Function Terms barplot(ego_MF, showCategory = 20) # Plot top 20 enriched MF categories ego_MF_barplot <- barplot(ego_MF, showCategory = 20) # Store plot ego_MF_barplot + theme(text = element_text(family = "serif", size = 12)) ggsave(ego_MF_barplot, filename = paste0("ego_MF_barplot.png")) # Dotplot Visualization for Molecular Function ego_MF_dotplot <- dotplot(ego_MF) ego_MF_dotplot + theme(text = element_text(family = "serif", size = 12)) ggsave(ego_MF_dotplot, filename = paste0("ego_MF_dotplot.png")) # Gene-Concept Network Plot for Molecular Function ego_MF_cnetplot <- cnetplot(ego_MF, foldChange = geneList, circular = TRUE, colorEdge = TRUE) ego_MF_cnetplot + theme(text = element_text(family = "serif", size = 12)) ggsave(ego_MF_cnetplot, filename = paste0("ego_MF_cnetplot.png")) # Enrichment Map Plot for Molecular Function emapplot(ego_MF) # GOplot for Molecular Function ego_MF_goplot <- goplot(ego_MF) ggsave(ego_MF_goplot, filename = paste0("ego_MF_goplot.png")) # Heatmap-like Functional Classification for Molecular Function ego_MF_heatplot <- heatplot(ego_MF, foldChange = geneList) ggsave(ego_MF_heatplot, filename = paste0("ego_MF_heatplot.png")) ############ KEGG Pathway Analysis ############ # Prepare Gene Lists for KEGG Enrichment Analysis gene_up <- DEGs[DEGs$change == 'up', 'ENTREZID'] gene_down <- DEGs[DEGs$change == 'down', 'ENTREZID'] gene_diff <- c(gene_up, gene_down) # Combine up- and down-regulated genes gene_all <- deg[, 'ENTREZID'] # All genes for the universe set # Perform KEGG Enrichment Analysis kk.up <- enrichKEGG(gene = gene_up, organism = 'hsa', universe = gene_all, pvalueCutoff = 0.9, qvalueCutoff = 0.9) kk.down <- enrichKEGG(gene = gene_down, organism = 'hsa', universe = gene_all, pvalueCutoff = 0.9, qvalueCutoff = 0.9) kk.diff <- enrichKEGG(gene = gene_diff, organism = 'hsa', pvalueCutoff = 0.9) # Save KEGG Results for Future Use save(kk.diff, kk.down, kk.up, file = "GSE55098kegg.Rdata") # Generate Combined KEGG Plot for Up and Down-regulated Genes g_kegg <- kegg_plot(up_kegg, down_kegg) ggsave(g_kegg, filename = 'kegg_up_down.png') ############ STRING Network Export ############ # Load DEG Data and Prepare for STRING Export DEGs <- read.csv(file = "EP-DEGs.csv") gene_up <- DEGs[DEGs$change == 'up', 'symbol'] gene_down <- DEGs[DEGs$change == 'down', 'symbol'] gene_diff <- c(gene_up, gene_down) # Write Gene Lists to Files for STRING Analysis write.table(gene_up, file = "upgene.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) write.table(gene_down, file = "downgene.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) write.table(gene_diff, file = "diffgene.txt", row.names = FALSE, col.names = FALSE, quote = FALSE) # Process STRING Interaction Data for Cytoscape Export tsv <- read.table("string_interactions_short.tsv", comment.char = "!", header = TRUE) tsv2 <- tsv[, c(1, 2, ncol(tsv))] # Select relevant columns write.table(tsv2, file = "cyto.txt", sep = "\t", quote = FALSE, row.names = FALSE) # Export DEG Information for Cytoscape deg_info <- DEGs[DEGs$change != "stable", c("symbol", "logFC", "P.Value")] write.table(deg_info, file = "deg.txt", sep = "\t", quote = FALSE, row.names = FALSE)