## Explore selected SNPs
suppressPackageStartupMessages(require(dplyr))

# Initiate 
fn <- c("paretto_optimal.csv",
        "paretto_optimal_filtered.csv",
        "LR_top10.csv",
        "XGB_top10.csv",
        "TabNet_top10.csv",
        "GWAS_top10.csv"
)

# Load the lists of SNPs
snps <- sapply(fn, function(x) read.csv(paste0("data/", x)))

# Explore 
sapply(snps, nrow)
sapply(snps, colnames)
sapply(snps[3:6], function(x) signif(c(min(x$p.value), max(x$p.value)), 3))
sapply(snps[3:5], function(x) signif(c(min(x[, 2]), max(x[, 2])), 3))

# Correct SNPs ids
d1 <- sapply(snps, function(x) {
  x$rs <- stringr::str_match(x[, 1], "(rs[0-9]+)")[, 2]
  x
})

## Load the results of annotation with snpEff

# Initiate
files <- sprintf("ann/chr%s.eff.txt", 1:22)

# Load 
d2 <- plyr::ldply(files, function(x) data.table::fread(x))

# See the distribution
t1 <- table(d2$V7) %>% 
  data.frame() %>% 
  arrange(desc(Freq)) %>% 
  mutate(cs = cumsum(Freq/sum(Freq)*100))

# Save
WriteXLS::WriteXLS(t1, "out/so-distribution.xlsx")

# Annotate by merging
d3 <- sapply(d1, function(df) {
  merge(df, d2, by.x = "rs", by.y = "V3", all.x = T)
})

# Save as xlsx file
WriteXLS::WriteXLS(d3, "out/selected-snps-annotation.xlsx")

# Load genes
fn <- names(snps)
genes <- sapply(fn, function(x) {
  readxl::read_xlsx("out/genes-01.xlsx", sheet = x, col_names = F)
}, USE.NAMES = F) 

names(genes) <- fn 

# Visulize intersections
dt1 <- UpSetR::fromList(genes)
UpSetR::upset(dt1, nsets = 6, keep.order = T, text.scale = 2)

# Visulize without pareto-optimal filtered
nm1 <- names(genes)
nm2 <- nm1[!nm1 %in% "paretto_optimal_filtered.csv"]
dt2 <- UpSetR::fromList(genes[nm2])
colnames(dt2) <- colnames(dt2) %>% 
  sub(pattern = "_top10.csv", replacement = "") %>%
  sub(pattern = ".csv", replacement = "") %>%
  sub(pattern = "paretto_optimal", replacement = "Paretto Optimal")  
png("pic/upset-plot.png", width = 1266, height = 900)
UpSetR::upset(dt2, nsets = 6, keep.order = T, text.scale = 3)
invisible(dev.off())

# Load IS genes
is <- data.table::fread("data/C0948008_disease_gda_summary.tsv")

# Intersect genes with IS genes
out1 <- sapply(genes, function(x) {
  x[x %in% is$Gene]
})

# Load idg genes
idg <- data.table::fread("out/idg-genes-snps.csv")

# Intersect significant genes with IDG ones
out2 <- sapply(genes, function(x) {
  x[x %in% idg$idg]
})

g1 <- genes %>% unlist() %>% unique()
g1 %in% idg$idg

# Intersect IS genes with IDG ones
out <- is[is$Gene %in% idg$idg, ]