# ============================================================================== # Metagenomic Statistical Analysis and Visualization Script # Version: 2.0 # Author: Bioinformatician # Date: 2024-01-20 # ============================================================================== # Install necessary R packages if (!require("tidyverse")) install.packages("tidyverse") if (!require("vegan")) install.packages("vegan") if (!require("ggplot2")) install.packages("ggplot2") if (!require("ggpubr")) install.packages("ggpubr") if (!require("reshape2")) install.packages("reshape2") if (!require("RColorBrewer")) install.packages("RColorBrewer") if (!require("pheatmap")) install.packages("pheatmap") if (!require("corrplot")) install.packages("corrplot") if (!require("DESeq2")) { if (!require("BiocManager")) install.packages("BiocManager") BiocManager::install("DESeq2") } # Load packages library(tidyverse) library(vegan) library(ggplot2) library(ggpubr) library(reshape2) library(RColorBrewer) library(pheatmap) library(corrplot) library(DESeq2) # Set theme and colors theme_set(theme_bw(base_size = 14)) my_colors <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7") # Set random seed to ensure reproducible results set.seed(123) # ============================================================================== # Data Loading and Preprocessing # ============================================================================== #' Load and preprocess metagenomic data #' #' @param species_file Path to species abundance file #' @param function_file Path to functional abundance file #' @param metadata_file Path to metadata file #' @return List containing processed data frames load_metagenomic_data <- function(species_file = "species_abundance.csv", function_file = "function_abundance.csv", metadata_file = "metadata.csv") { # Read data files species_abundance <- read.csv(species_file, row.names = 1, header = TRUE, check.names = FALSE) function_abundance <- read.csv(function_file, row.names = 1, header = TRUE, check.names = FALSE) metadata <- read.csv(metadata_file, row.names = 1, header = TRUE) # Ensure data consistency common_samples <- intersect(colnames(species_abundance), rownames(metadata)) species_abundance <- species_abundance[, common_samples] function_abundance <- function_abundance[, common_samples] metadata <- metadata[common_samples, , drop = FALSE] # Filter low-abundance features filter_low_abundance <- function(abundance_table, threshold = 0.001) { mean_abundance <- apply(abundance_table, 1, mean) filtered_table <- abundance_table[mean_abundance > threshold, ] return(filtered_table) } species_filtered <- filter_low_abundance(species_abundance) function_filtered <- filter_low_abundance(function_abundance) # Normalize to relative abundance normalize_relative <- function(abundance_table) { relative_abundance <- apply(abundance_table, 2, function(x) x/sum(x)) return(as.data.frame(t(relative_abundance))) } species_relative <- normalize_relative(species_filtered) function_relative <- normalize_relative(function_filtered) return(list( species_abundance = species_abundance, function_abundance = function_abundance, species_relative = species_relative, function_relative = function_relative, metadata = metadata )) } # Load data data_list <- load_metagenomic_data() species_relative <- data_list$species_relative function_relative <- data_list$function_relative species_filtered <- data_list$species_abundance metadata <- data_list$metadata # ============================================================================== # Species Composition Analysis Visualization # ============================================================================== #' Create stacked bar plot of top taxa composition plot_taxa_barplot <- function(abundance_data, metadata, top_n = 10, group_var = "Group") { # Calculate mean abundance and select top taxa mean_abundance <- apply(abundance_data, 2, mean) top_taxa <- names(sort(mean_abundance, decreasing = TRUE))[1:top_n] # Extract top taxa data top_data <- abundance_data[, top_taxa, drop = FALSE] top_data$Others <- 1 - rowSums(top_data) top_data$Sample <- rownames(top_data) # Merge with metadata top_data <- merge(top_data, metadata, by.x = "Sample", by.y = "row.names") # Convert to long format top_data_long <- melt(top_data, id.vars = c("Sample", group_var), variable.name = "Taxa", value.name = "Abundance") # Create plot p <- ggplot(top_data_long, aes_string(x = "Sample", y = "Abundance", fill = "Taxa")) + geom_bar(stat = "identity", position = "stack") + facet_grid(as.formula(paste(".~", group_var)), scales = "free_x", space = "free") + labs(x = "Sample", y = "Relative Abundance", title = paste("Top", top_n, "Taxa Composition")) + scale_fill_manual(values = colorRampPalette(brewer.pal(12, "Paired"))(top_n + 1)) + theme( axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "right", legend.text = element_text(size = 8) ) return(p) } #' Create box plot for specific taxon across groups plot_taxa_boxplot <- function(abundance_data, metadata, taxa, group_var = "Group") { plot_data <- data.frame( Abundance = abundance_data[, taxa], Sample = rownames(abundance_data) ) plot_data <- merge(plot_data, metadata, by.x = "Sample", by.y = "row.names") p <- ggplot(plot_data, aes_string(x = group_var, y = "Abundance", fill = group_var)) + geom_boxplot(alpha = 0.7, outlier.shape = NA) + geom_jitter(position = position_jitter(width = 0.2), size = 2, alpha = 0.7) + labs(x = group_var, y = "Relative Abundance", title = paste("Abundance of", taxa)) + stat_compare_means(method = "kruskal.test", label = "p.format") + scale_fill_manual(values = my_colors) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) return(p) } # Generate species composition plots taxa_barplot <- plot_taxa_barplot(species_relative, metadata, top_n = 10, group_var = "Group") taxa_boxplot <- plot_taxa_boxplot(species_relative, metadata, "Bacteroides", "Group") print(taxa_barplot) print(taxa_boxplot) # ============================================================================== # Functional Composition Analysis Visualization # ============================================================================== #' Create heatmap of functional pathway abundance plot_function_heatmap <- function(abundance_data, metadata, top_n = 20, group_var = "Group") { mean_abundance <- apply(abundance_data, 2, mean) top_functions <- names(sort(mean_abundance, decreasing = TRUE))[1:top_n] top_data <- abundance_data[, top_functions] # Create annotation data frame annotation_df <- data.frame(Group = metadata[, group_var]) rownames(annotation_df) <- rownames(metadata) # Create heatmap pheatmap(t(as.matrix(top_data)), scale = "row", clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "complete", annotation_col = annotation_df, show_colnames = FALSE, main = paste("Top", top_n, "Functional Pathways Heatmap"), color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100), fontsize_row = 8) } #' Compare specific functional pathways across groups plot_function_comparison <- function(abundance_data, metadata, function_id, group_var = "Group") { plot_data <- data.frame( Abundance = abundance_data[, function_id], Sample = rownames(abundance_data) ) plot_data <- merge(plot_data, metadata, by.x = "Sample", by.y = "row.names") p <- ggplot(plot_data, aes_string(x = group_var, y = "Abundance", fill = group_var)) + geom_boxplot(alpha = 0.7, outlier.shape = NA) + geom_jitter(position = position_jitter(width = 0.2), size = 2, alpha = 0.7) + labs(x = group_var, y = "Relative Abundance", title = paste("Abundance of", function_id)) + stat_compare_means(method = "kruskal.test", label = "p.format") + scale_fill_manual(values = my_colors) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) return(p) } # Generate functional analysis plots function_heatmap <- plot_function_heatmap(function_relative, metadata, top_n = 20, group_var = "Group") function_comparison <- plot_function_comparison(function_relative, metadata, "K00001", "Group") # ============================================================================== # Alpha Diversity Analysis # ============================================================================== #' Calculate alpha diversity indices calculate_alpha_diversity <- function(abundance_data) { data_matrix <- as.matrix(abundance_data) alpha_df <- data.frame( Sample = rownames(abundance_data), Shannon = diversity(data_matrix, index = "shannon"), Simpson = diversity(data_matrix, index = "simpson"), Richness = apply(data_matrix > 0, 1, sum), Chao1 = estimateR(data_matrix)[2, ], # Chao1 richness estimator ACE = estimateR(data_matrix)[4, ] # ACE richness estimator ) # Calculate Evenness (Pielou's evenness) alpha_df$Evenness <- alpha_df$Shannon / log(alpha_df$Richness) return(alpha_df) } #' Visualize alpha diversity across groups plot_alpha_diversity <- function(alpha_df, group_var = "Group") { alpha_long <- melt(alpha_df, id.vars = c("Sample", group_var), variable.name = "Index", value.name = "Value") p <- ggplot(alpha_long, aes_string(x = group_var, y = "Value", fill = group_var)) + geom_boxplot(alpha = 0.7, outlier.shape = NA) + geom_jitter(position = position_jitter(width = 0.2), size = 1.5, alpha = 0.7) + facet_wrap(~ Index, scales = "free_y", ncol = 3) + labs(x = group_var, y = "Diversity Index Value", title = "Alpha Diversity Indices") + stat_compare_means(method = "kruskal.test", label = "p.format", size = 3) + scale_fill_manual(values = my_colors) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) return(p) } # Calculate and visualize alpha diversity alpha_diversity <- calculate_alpha_diversity(species_relative) alpha_diversity <- merge(alpha_diversity, metadata, by.x = "Sample", by.y = "row.names") alpha_plot <- plot_alpha_diversity(alpha_diversity, "Group") print(alpha_plot) # ============================================================================== # Beta Diversity Analysis # ============================================================================== #' Calculate beta diversity distance matrix calculate_beta_diversity <- function(abundance_data, method = "bray") { dist_matrix <- vegdist(abundance_data, method = method) return(dist_matrix) } #' Perform PCoA analysis perform_pcoa <- function(dist_matrix) { pcoa <- cmdscale(dist_matrix, k = 3, eig = TRUE) variance_exp <- pcoa$eig / sum(pcoa$eig) * 100 pcoa_df <- data.frame( Sample = rownames(pcoa$points), PCo1 = pcoa$points[, 1], PCo2 = pcoa$points[, 2], PCo3 = pcoa$points[, 3] ) return(list(pcoa_df = pcoa_df, variance_exp = variance_exp)) } #' Plot PCoA results plot_pcoa <- function(pcoa_df, variance_exp, group_var = "Group", show_ellipses = TRUE) { p <- ggplot(pcoa_df, aes_string(x = "PCo1", y = "PCo2", color = group_var)) + geom_point(size = 3, alpha = 0.7) + labs( x = paste0("PCo1 (", round(variance_exp[1], 2), "%)"), y = paste0("PCo2 (", round(variance_exp[2], 2), "%)"), title = "Principal Coordinates Analysis (PCoA)", color = group_var ) + scale_color_manual(values = my_colors) + theme(legend.position = "right") if (show_ellipses) { p <- p + stat_ellipse(level = 0.95, linetype = 2, alpha = 0.5) } return(p) } #' Perform PERMANOVA analysis perform_permanova <- function(dist_matrix, metadata, formula_str, permutations = 999) { permanova <- adonis2( as.formula(formula_str), data = metadata, permutations = permutations, method = "bray" ) return(permanova) } # Calculate beta diversity and perform PCoA bray_dist <- calculate_beta_diversity(species_relative) pcoa_result <- perform_pcoa(bray_dist) pcoa_df <- merge(pcoa_result$pcoa_df, metadata, by.x = "Sample", by.y = "row.names") pcoa_plot <- plot_pcoa(pcoa_df, pcoa_result$variance_exp, "Group") # Perform PERMANOVA permanova_result <- perform_permanova(bray_dist, metadata, "bray_dist ~ Group") print(pcoa_plot) print(permanova_result) # ============================================================================== # Differential Abundance Analysis # ============================================================================== #' Perform differential abundance analysis using DESeq2 perform_deseq2 <- function(abundance_data, metadata, group_var = "Group", reference = NULL) { # Convert to integer counts (pseudo-counts) count_data <- round(abundance_data * 10000) count_data[count_data < 0] <- 0 # Ensure non-negative # Create DESeq2 object dds <- DESeqDataSetFromMatrix( countData = count_data, colData = metadata, design = as.formula(paste("~", group_var)) ) # Set reference level if provided if (!is.null(reference)) { dds[[group_var]] <- relevel(dds[[group_var]], ref = reference) } # Run DESeq2 analysis dds <- DESeq(dds) res <- results(dds) res_df <- as.data.frame(res) res_df$Feature <- rownames(res_df) return(list(results = res_df, dds = dds)) } #' Create volcano plot for differential abundance results plot_volcano <- function(deseq_results, p_cutoff = 0.05, fc_cutoff = 2, top_n = 10) { volcano_data <- deseq_results %>% mutate( Significance = case_when( padj < p_cutoff & log2FoldChange > log2(fc_cutoff) ~ "Up-regulated", padj < p_cutoff & log2FoldChange < -log2(fc_cutoff) ~ "Down-regulated", TRUE ~ "Not significant" ), Label = ifelse(-log10(padj) > 10 & abs(log2FoldChange) > 2, Feature, "") ) p <- ggplot(volcano_data, aes(x = log2FoldChange, y = -log10(padj), color = Significance)) + geom_point(alpha = 0.6, size = 1.5) + geom_hline(yintercept = -log10(p_cutoff), linetype = "dashed", color = "black", alpha = 0.5) + geom_vline(xintercept = c(-log2(fc_cutoff), log2(fc_cutoff)), linetype = "dashed", color = "black", alpha = 0.5) + scale_color_manual(values = c("Down-regulated" = "blue", "Not significant" = "gray", "Up-regulated" = "red")) + labs(x = "log2(Fold Change)", y = "-log10(Adjusted p-value)", title = "Volcano Plot - Differential Abundance") + theme(legend.position = "right") + geom_text(aes(label = Label), vjust = 1.5, hjust = 0.5, size = 3, check_overlap = TRUE) return(p) } #' Create heatmap of differentially abundant features plot_differential_heatmap <- function(abundance_data, deseq_results, metadata, p_cutoff = 0.05, fc_cutoff = 2, top_n = 25, group_var = "Group") { # Filter significant features sig_features <- deseq_results %>% filter(padj < p_cutoff & abs(log2FoldChange) > log2(fc_cutoff)) %>% arrange(padj) %>% head(top_n) %>% pull(Feature) if (length(sig_features) == 0) { warning("No significant features found for the given thresholds.") return(NULL) } # Extract significant data sig_data <- abundance_data[, sig_features, drop = FALSE] # Create annotation annotation_df <- data.frame(Group = metadata[, group_var]) rownames(annotation_df) <- rownames(metadata) # Create heatmap pheatmap(t(as.matrix(sig_data)), scale = "row", clustering_distance_rows = "euclidean", clustering_distance_cols = "euclidean", clustering_method = "complete", annotation_col = annotation_df, show_colnames = FALSE, main = paste("Differentially Abundant Features (FDR <", p_cutoff, "& |FC| >", fc_cutoff, ")"), color = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100), fontsize_row = 8) } # Perform differential abundance analysis deseq_results <- perform_deseq2(species_filtered, metadata, "Group", reference = "Control") volcano_plot <- plot_volcano(deseq_results$results) differential_heatmap <- plot_differential_heatmap(species_relative, deseq_results$results, metadata) print(volcano_plot) # ============================================================================== # Functional Enrichment Analysis # ============================================================================== #' Perform functional enrichment analysis perform_enrichment_analysis <- function(deseq_results, annotation_file = NULL, p_cutoff = 0.05, fc_cutoff = 2) { # Filter significant features sig_features <- deseq_results %>% filter(padj < p_cutoff & abs(log2FoldChange) > log2(fc_cutoff)) %>% pull(Feature) if (is.null(annotation_file)) { # Return basic statistics if no annotation file provided return(data.frame( Significant_Features = length(sig_features), Upregulated = sum(deseq_results$padj < p_cutoff & deseq_results$log2FoldChange > log2(fc_cutoff), na.rm = TRUE), Downregulated = sum(deseq_results$padj < p_cutoff & deseq_results$log2FoldChange < -log2(fc_cutoff), na.rm = TRUE) )) } # Load pathway annotations (example structure) pathway_annotation <- read.csv(annotation_file, header = TRUE) # Perform enrichment analysis sig_pathways <- pathway_annotation %>% filter(Feature %in% sig_features) %>% group_by(Pathway) %>% summarize(Count = n(), .groups = 'drop') %>% arrange(desc(Count)) return(sig_pathways) } #' Plot enrichment results plot_enrichment <- function(enrichment_results, top_n = 15, title = "Pathway Enrichment Analysis") { if (nrow(enrichment_results) == 0) { message("No enrichment results to plot.") return(NULL) } top_pathways <- enrichment_results %>% head(top_n) p <- ggplot(top_pathways, aes(x = reorder(Pathway, Count), y = Count)) + geom_bar(stat = "identity", fill = "steelblue", alpha = 0.7) + coord_flip() + labs(x = "Pathway", y = "Number of Features", title = title) + theme_minimal() + theme(axis.text.y = element_text(size = 10)) return(p) } # Example enrichment analysis (requires annotation file) # enrichment_results <- perform_enrichment_analysis(deseq_results$results, "pathway_annotation.csv") # enrichment_plot <- plot_enrichment(enrichment_results) # ============================================================================== # Correlation Analysis # ============================================================================== #' Calculate correlations between features and environmental factors calculate_correlations <- function(abundance_data, env_data, method = "spearman") { cor_matrix <- cor(abundance_data, env_data, method = method) return(cor_matrix) } #' Plot correlation heatmap plot_correlation_heatmap <- function(cor_matrix, p_matrix = NULL, title = "Correlation Heatmap") { corrplot(cor_matrix, method = "color", type = "full", order = "hclust", tl.cex = 0.7, tl.col = "black", col = colorRampPalette(rev(brewer.pal(n = 7, name = "RdYlBu")))(100), main = title) if (!is.null(p_matrix)) { corrplot(cor_matrix, p.mat = p_matrix, insig = "label_sig", sig.level = c(0.001, 0.01, 0.05), pch.cex = 0.8, pch.col = "white", add = TRUE) } } #' Perform Mantel test perform_mantel_test <- function(species_data, env_data, method = "bray", permutations = 999) { species_dist <- vegdist(species_data, method = method) env_dist <- vegdist(env_data, method = "euclidean") mantel_test <- mantel(species_dist, env_dist, method = "spearman", permutations = permutations) return(mantel_test) } # Example correlation analysis (requires environmental data) # env_data <- read.csv("environmental_factors.csv", row.names = 1, header = TRUE) # cor_results <- calculate_correlations(species_relative, env_data) # plot_correlation_heatmap(cor_results) # ============================================================================== # Save Results and Generate Report # ============================================================================== #' Save all analysis results save_analysis_results <- function(output_dir = "analysis_results") { # Create output directory if (!dir.exists(output_dir)) { dir.create(output_dir, recursive = TRUE) } # Save plots ggsave(file.path(output_dir, "taxa_composition.png"), taxa_barplot, width = 12, height = 8, dpi = 300, bg = "white") ggsave(file.path(output_dir, "alpha_diversity.png"), alpha_plot, width = 10, height = 6, dpi = 300, bg = "white") ggsave(file.path(output_dir, "beta_diversity_pcoa.png"), pcoa_plot, width = 8, height = 6, dpi = 300, bg = "white") ggsave(file.path(output_dir, "volcano_plot.png"), volcano_plot, width = 8, height = 6, dpi = 300, bg = "white") # Save data results write.csv(alpha_diversity, file.path(output_dir, "alpha_diversity_results.csv"), row.names = FALSE) write.csv(deseq_results$results, file.path(output_dir, "differential_abundance_results.csv"), row.names = FALSE) # Save statistical results sink(file.path(output_dir, "permanova_results.txt")) cat("PERMANOVA Results\n") cat("================\n\n") print(permanova_result) sink() # Save session information sink(file.path(output_dir, "session_info.txt")) cat("Session Information\n") cat("==================\n\n") print(sessionInfo()) sink() # Generate summary report generate_summary_report(output_dir) message("Analysis results saved to: ", output_dir) } #' Generate summary report generate_summary_report <- function(output_dir) { report_file <- file.path(output_dir, "analysis_summary_report.txt") sink(report_file) cat("METAGENOMIC ANALYSIS SUMMARY REPORT\n") cat("===================================\n\n") cat("Analysis Date:", date(), "\n") cat("Number of Samples:", nrow(metadata), "\n") cat("Number of Species Features:", ncol(species_relative), "\n") cat("Number of Functional Features:", ncol(function_relative), "\n\n") cat("ALPHA DIVERSITY SUMMARY\n") cat("-----------------------\n") cat("Average Shannon Index:", round(mean(alpha_diversity$Shannon, na.rm = TRUE), 3), "\n") cat("Average Simpson Index:", round(mean(alpha_diversity$Simpson, na.rm = TRUE), 3), "\n") cat("Average Richness:", round(mean(alpha_diversity$Richness, na.rm = TRUE), 1), "\n\n") cat("BETA DIVERSITY RESULTS\n") cat("----------------------\n") cat("PERMANOVA R-squared:", round(permanova_result$R2[1], 4), "\n") cat("PERMANOVA p-value:", permanova_result$`Pr(>F)`[1], "\n\n") cat("DIFFERENTIAL ABUNDANCE RESULTS\n") cat("------------------------------\n") sig_up <- sum(deseq_results$results$padj < 0.05 & deseq_results$results$log2FoldChange > 0, na.rm = TRUE) sig_down <- sum(deseq_results$results$padj < 0.05 & deseq_results$results$log2FoldChange < 0, na.rm = TRUE) cat("Significantly upregulated features:", sig_up, "\n") cat("Significantly downregulated features:", sig_down, "\n") cat("Total significant features:", sig_up + sig_down, "\n") sink() } # Save all results save_analysis_results() # ============================================================================== # Main Analysis Workflow # ============================================================================== #' Main analysis workflow function run_complete_analysis <- function() { message("Starting complete metagenomic analysis workflow...") # Load data message("1. Loading and preprocessing data...") data_list <- load_metagenomic_data() # Species composition analysis message("2. Performing species composition analysis...") taxa_barplot <- plot_taxa_barplot(data_list$species_relative, data_list$metadata) # Alpha diversity analysis message("3. Calculating alpha diversity...") alpha_div <- calculate_alpha_diversity(data_list$species_relative) alpha_div <- merge(alpha_div, data_list$metadata, by.x = "Sample", by.y = "row.names") alpha_plot <- plot_alpha_diversity(alpha_div) # Beta diversity analysis message("4. Performing beta diversity analysis...") bray_dist <- calculate_beta_diversity(data_list$species_relative) pcoa_result <- perform_pcoa(bray_dist) pcoa_df <- merge(pcoa_result$pcoa_df, data_list$metadata, by.x = "Sample", by.y = "row.names") pcoa_plot <- plot_pcoa(pcoa_df, pcoa_result$variance_exp) permanova_result <- perform_permanova(bray_dist, data_list$metadata, "bray_dist ~ Group") # Differential abundance analysis message("5. Performing differential abundance analysis...") deseq_results <- perform_deseq2(data_list$species_abundance, data_list$metadata) volcano_plot <- plot_volcano(deseq_results$results) # Save results message("6. Saving results...") save_analysis_results() message("Analysis workflow completed successfully!") return(list( taxa_barplot = taxa_barplot, alpha_plot = alpha_plot, pcoa_plot = pcoa_plot, volcano_plot = volcano_plot, permanova_result = permanova_result, deseq_results = deseq_results )) } # Uncomment the following line to run the complete analysis workflow # results <- run_complete_analysis() message("Metagenomic analysis script loaded successfully!") message("Use run_complete_analysis() to execute the full workflow.")