## Explore selected SNPs suppressPackageStartupMessages(require(dplyr)) # Initiate fn <- c("paretto_optimal.csv", "paretto_optimal_filtered.csv", "LR_top10.csv", "XGB_top10.csv", "TabNet_top10.csv", "GWAS_top10.csv" ) # Load the lists of SNPs snps <- sapply(fn, function(x) read.csv(paste0("data/", x))) # Explore sapply(snps, nrow) sapply(snps, colnames) sapply(snps[3:6], function(x) signif(c(min(x$p.value), max(x$p.value)), 3)) sapply(snps[3:5], function(x) signif(c(min(x[, 2]), max(x[, 2])), 3)) # Correct SNPs ids d1 <- sapply(snps, function(x) { x$rs <- stringr::str_match(x[, 1], "(rs[0-9]+)")[, 2] x }) ## Load the results of annotation with snpEff # Initiate files <- sprintf("ann/chr%s.eff.txt", 1:22) # Load d2 <- plyr::ldply(files, function(x) data.table::fread(x)) # See the distribution t1 <- table(d2$V7) %>% data.frame() %>% arrange(desc(Freq)) %>% mutate(cs = cumsum(Freq/sum(Freq)*100)) # Save WriteXLS::WriteXLS(t1, "out/so-distribution.xlsx") # Annotate by merging d3 <- sapply(d1, function(df) { merge(df, d2, by.x = "rs", by.y = "V3", all.x = T) }) # Save as xlsx file WriteXLS::WriteXLS(d3, "out/selected-snps-annotation.xlsx") # Load genes fn <- names(snps) genes <- sapply(fn, function(x) { readxl::read_xlsx("out/genes-01.xlsx", sheet = x, col_names = F) }, USE.NAMES = F) names(genes) <- fn # Visulize intersections dt1 <- UpSetR::fromList(genes) UpSetR::upset(dt1, nsets = 6, keep.order = T, text.scale = 2) # Visulize without pareto-optimal filtered nm1 <- names(genes) nm2 <- nm1[!nm1 %in% "paretto_optimal_filtered.csv"] dt2 <- UpSetR::fromList(genes[nm2]) colnames(dt2) <- colnames(dt2) %>% sub(pattern = "_top10.csv", replacement = "") %>% sub(pattern = ".csv", replacement = "") %>% sub(pattern = "paretto_optimal", replacement = "Paretto Optimal") png("pic/upset-plot.png", width = 1266, height = 900) UpSetR::upset(dt2, nsets = 6, keep.order = T, text.scale = 3) invisible(dev.off()) # Load IS genes is <- data.table::fread("data/C0948008_disease_gda_summary.tsv") # Intersect genes with IS genes out1 <- sapply(genes, function(x) { x[x %in% is$Gene] }) # Load idg genes idg <- data.table::fread("out/idg-genes-snps.csv") # Intersect significant genes with IDG ones out2 <- sapply(genes, function(x) { x[x %in% idg$idg] }) g1 <- genes %>% unlist() %>% unique() g1 %in% idg$idg # Intersect IS genes with IDG ones out <- is[is$Gene %in% idg$idg, ]