# Simulate metagenomic species abundance data n_species<- 1000 n_samples<- 30 # Generate species abundance matrix (using negative binomial distribution to simulate count data) abundance_matrix<- matrix( rnbinom(n_species * n_samples, mu = 500, size = 5), nrow = n_species, ncol = n_samples ) rownames(abundance_matrix) <- paste0("species_", 1:n_species) colnames(abundance_matrix) <- paste0("sample_", 1:n_samples) # Create grouping information (e.g. disease vs healthy) group <- factor(rep(c("healthy", "disease"), each = n_samples/2)) # Simulate some species with genuinely differentially abundant da_species<- sample(1:n_species, 100) # 100 species with differential abundance for (i in da_species) { abundance_matrix[i, group == "disease"] <- abundance_matrix[i, group == "disease"] * runif(1, 2, 10) } # Perform Wilcoxon signed-rank test (suitable for microbiome data) p_values<- apply(abundance_matrix, 1, function(x) { wilcox.test(x ~ group)$p.value }) # Calculate median fold change median_fc<- apply(abundance_matrix, 1, function(x) { median_disease<- median(x[group == "disease"]) median_healthy<- median(x[group == "healthy"]) median_disease / median_healthy }) # Apply FDR correction qvalues<- p.adjust(p_values, method = "BH") # Create results data frame da_results<- data.frame( species = rownames(abundance_matrix), p_value = p_values, fold_change = median_fc, log2_fold_change = log2(median_fc), qvalue = qvalues, significant = qvalues<0.05 # FDR < 5% ) # View number of significantly differentially abundant species cat("Number of significantly differentially abundant species (FDR < 0.05):", sum(da_results$significant), "\n") # Plot Manhattan plot displaying FDR-corrected results manhattan_plot<- ggplot(da_results, aes(x = 1:nrow(da_results), y = -log10(p_value), color = significant, size = abs(log2_fold_change))) + geom_point(alpha = 0.7) + scale_color_manual(values = c("gray", "red")) + geom_hline(yintercept = -log10(max(da_results$p_value[da_results$significant])), linetype = "dashed", color = "blue") + labs( x = "Species Index", y = "-Log10 P-value", title = "Manhattan Plot for Species Differential Abundance", subtitle = paste0("Significant species (FDR < 0.05): ", sum(da_results$significant)) ) + theme_minimal() + theme(legend.position = "none") print(manhattan_plot) # Filter and save significant results significant_species<- da_results %>% filter(significant) %>% arrange(qvalue) write.csv(significant_species, "significant_species_fdr_0.05.csv", row.names = FALSE) r # Simulate functional enrichment analysis results (e.g. KEGG pathway enrichment analysis) n_pathways<- 500 # Generate simulated pathway enrichment results pathway_results<- data.frame( pathway_id = paste0("pathway_", 1:n_pathways), pathway_name = paste("Pathway", 1:n_pathways), p_value = runif(n_pathways, 0, 0.1), # Simulated p-value enrichment_score = runif(n_pathways, -2, 2) # Simulated enrichment fraction ) # Randomly select some pathways as genuinely enriched pathways true_enriched<- sample(1:n_pathways, 50) pathway_results$p_value[true_enriched] <- runif(50, 0, 0.01) # Make the p-values for these pathways smaller # Apply FDR correction pathway_results$qvalue<- p.adjust(pathway_results$p_value, method = "BH") pathway_results$significant<- pathway_results$qvalue< 0.05 # View number of significantly enriched pathways cat("Number of significantly enriched pathways (FDR < 0.05):", sum(pathway_results$significant), "\n") # Plot enrichment analysis results enrichment_plot<- pathway_results %>% arrange(p_value) %>% mutate(log10p = -log10(p_value)) %>% ggplot(aes(x = enrichment_score, y = log10p, color = significant, label = pathway_name)) + geom_point(alpha = 0.7, size = 3) + scale_color_manual(values = c("gray", "red")) + geom_text_repel( data = subset(pathway_results, significant & abs(enrichment_score) > 1), size = 3, max.overlaps = 15 ) + labs( x = "Enrichment Score", y = "-Log10 P-value", title = "Pathway Enrichment Analysis with FDR Correction", subtitle = paste0("Significant pathways (FDR < 0.05): ", sum(pathway_results$significant)) ) + theme_minimal() + theme(legend.position = "none") print(enrichment_plot) # Save enrichment analysis results write.csv(pathway_results, "pathway_enrichment_results_with_fdr.csv", row.names = FALSE)