# Simulate metagenomic species abundance data
n_species<- 1000
n_samples<- 30

# Generate species abundance matrix (using negative binomial distribution to simulate count data)
abundance_matrix<- matrix(
rnbinom(n_species * n_samples, mu = 500, size = 5),
nrow = n_species,
ncol = n_samples
)
rownames(abundance_matrix) <- paste0("species_", 1:n_species)
colnames(abundance_matrix) <- paste0("sample_", 1:n_samples)

# Create grouping information (e.g. disease vs healthy)
group <- factor(rep(c("healthy", "disease"), each = n_samples/2))

# Simulate some species with genuinely differentially abundant
da_species<- sample(1:n_species, 100)  # 100 species with differential abundance
for (i in da_species) {
abundance_matrix[i, group == "disease"] <- 
abundance_matrix[i, group == "disease"] * runif(1, 2, 10)
}

# Perform Wilcoxon signed-rank test (suitable for microbiome data)
p_values<- apply(abundance_matrix, 1, function(x) {
wilcox.test(x ~ group)$p.value
})

# Calculate median fold change
median_fc<- apply(abundance_matrix, 1, function(x) {
median_disease<- median(x[group == "disease"])
median_healthy<- median(x[group == "healthy"])
median_disease / median_healthy
})

# Apply FDR correction
qvalues<- p.adjust(p_values, method = "BH")

# Create results data frame
da_results<- data.frame(
  species = rownames(abundance_matrix),
p_value = p_values,
fold_change = median_fc,
  log2_fold_change = log2(median_fc),
qvalue = qvalues,
  significant = qvalues<0.05  # FDR < 5%
)

# View number of significantly differentially abundant species
cat("Number of significantly differentially abundant species (FDR < 0.05):", sum(da_results$significant), "\n")

# Plot Manhattan plot displaying FDR-corrected results
manhattan_plot<- ggplot(da_results, aes(x = 1:nrow(da_results), y = -log10(p_value), 
                                         color = significant, size = abs(log2_fold_change))) +
geom_point(alpha = 0.7) +
scale_color_manual(values = c("gray", "red")) +
geom_hline(yintercept = -log10(max(da_results$p_value[da_results$significant])), 
linetype = "dashed", color = "blue") +
labs(
    x = "Species Index",
    y = "-Log10 P-value",
    title = "Manhattan Plot for Species Differential Abundance",
    subtitle = paste0("Significant species (FDR < 0.05): ", sum(da_results$significant))
  ) +
theme_minimal() +
theme(legend.position = "none")

print(manhattan_plot)

# Filter and save significant results
significant_species<- da_results %>%
  filter(significant) %>%
  arrange(qvalue)

write.csv(significant_species, "significant_species_fdr_0.05.csv", row.names = FALSE)


r
# Simulate functional enrichment analysis results (e.g. KEGG pathway enrichment analysis)
n_pathways<- 500

# Generate simulated pathway enrichment results
pathway_results<- data.frame(
pathway_id = paste0("pathway_", 1:n_pathways),
pathway_name = paste("Pathway", 1:n_pathways),
p_value = runif(n_pathways, 0, 0.1),  # Simulated p-value
enrichment_score = runif(n_pathways, -2, 2)  # Simulated enrichment fraction
)

# Randomly select some pathways as genuinely enriched pathways
true_enriched<- sample(1:n_pathways, 50)
pathway_results$p_value[true_enriched] <- runif(50, 0, 0.01)  # Make the p-values for these pathways smaller

# Apply FDR correction
pathway_results$qvalue<- p.adjust(pathway_results$p_value, method = "BH")
pathway_results$significant<- pathway_results$qvalue< 0.05

# View number of significantly enriched pathways
cat("Number of significantly enriched pathways (FDR < 0.05):", sum(pathway_results$significant), "\n")

# Plot enrichment analysis results
enrichment_plot<- pathway_results %>%
  arrange(p_value) %>%
mutate(log10p = -log10(p_value)) %>%
ggplot(aes(x = enrichment_score, y = log10p, color = significant, label = pathway_name)) +
geom_point(alpha = 0.7, size = 3) +
scale_color_manual(values = c("gray", "red")) +
geom_text_repel(
    data = subset(pathway_results, significant & abs(enrichment_score) > 1),
    size = 3,
max.overlaps = 15
  ) +
labs(
    x = "Enrichment Score",
    y = "-Log10 P-value",
    title = "Pathway Enrichment Analysis with FDR Correction",
    subtitle = paste0("Significant pathways (FDR < 0.05): ", sum(pathway_results$significant))
  ) +
theme_minimal() +
theme(legend.position = "none")

print(enrichment_plot)

# Save enrichment analysis results
write.csv(pathway_results, "pathway_enrichment_results_with_fdr.csv", row.names = FALSE)