Emilie Ramsahai 1, Vrijesh Tripathi 1, Melford John 2

1 Department of Mathematics and Statistics, The University of the West Indies, St Augustine, Trinidad and Tobago 2 Department of Preclinical Sciences, The University of the West Indies, St Augustine, Trinidad and Tobago

This document contains code used in our analysis. It is purely written in R using a series of R and Bioconductor packages. This report has been generated using the knitr R package (http://yihui.name/knitr/). For a complete list of packages and their versions please have a look at the end of this document.

#To install the BioConductor libraries use the command
#source("https://bioconductor.org/biocLite.R")
#biocLite("reactome.db")
#biocLite("org.Hs.eg.db")
#The data files are to be placed in your working directory

  rm(list=ls())
  library("reactome.db")
## Loading required package: AnnotationDbi
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: IRanges
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## 
## Attaching package: 'IRanges'
## The following object is masked from 'package:grDevices':
## 
##     windows
  library("org.Hs.eg.db")
## 
  library("igraph")
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:IRanges':
## 
##     union
## The following object is masked from 'package:S4Vectors':
## 
##     union
## The following objects are masked from 'package:BiocGenerics':
## 
##     normalize, path, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
  library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:IRanges':
## 
##     space
## The following object is masked from 'package:S4Vectors':
## 
##     space
## The following object is masked from 'package:stats':
## 
##     lowess
  library("sets")
## 
## Attaching package: 'sets'
## The following object is masked from 'package:igraph':
## 
##     %>%
## Analysis can be done with any network
## 3 networks are provided 1.Pathway Commons 2. BioGrid 3. STRING
## The final network is used for processing.
## Ensure the required network is the final network to be loaded
## This arrangement of the code uses the STRING network 

## PATHWAY COMMONS   
  load("pcom.rdata")
  g<-org.Hs.PCommons_DN
  g
## IGRAPH 3297702 DN-- 15044 249606 -- 
## + attr: name (v/c), geneid (v/n), symbol (v/c), description (v/c),
## | catalysis_precedes (e/n), controls_expression_of (e/n),
## | controls_phosphorylation_of (e/n), controls_state_change_of
## | (e/n), controls_transport_of (e/n)
## + edges from 3297702 (vertex names):
##  [1] PNCK->EIF4G3   PNCK->HSP90AB1 PNCK->HSP90AA1 DPM2->ALG10   
##  [5] DPM2->PIGL     DPM2->ALG10B   AACS->NME1     AACS->NME3    
##  [9] AACS->NME4     AACS->SLC16A7  AACS->CPT1C    AACS->CPT2    
## [13] AACS->PGM2     AACS->OXCT1    AACS->NME2     AACS->PMM2    
## [17] AACS->PPA1     AACS->NME7     AACS->CPT1A    AACS->AOX1    
## + ... omitted several edges
#BIOGRID-ORGANISM-Homo_sapiens-3.4.163.tab2

  b<-read.csv(file = "BIOGRID-ORGANISM-Homo_sapiens-3.4.163.tab2.txt", sep = "\t")[ ,1:9]
  colnames(b)<-c("V1","V2","V3", "V4", "V5", "V6", "V7", "V8", "V9")
  b1<-b[,8:9]
  bg <- graph.data.frame(b1, directed=FALSE)
  g<-bg

## STRING
  sppi<-read.table('1_ppi_string.txt',header = FALSE, sep='\t')
  sg <- graph.data.frame(sppi, directed=FALSE)
  g<-sg
  
## ALL Reactome pathways

  pathways <- toTable(reactomePATHNAME2ID)
  pathwaysSelectedSpecies <- pathways[grep("Homo sapiens: ", iconv(pathways$path_name)), ]
  head(pathwaysSelectedSpecies[,2] )
## [1] "Homo sapiens: 2-LTR circle formation"                                         
## [2] "Homo sapiens: 5-Phosphoribose 1-diphosphate biosynthesis"                     
## [3] "Homo sapiens: A tetrasaccharide linker sequence is required for GAG synthesis"
## [4] "Homo sapiens: ABC transporter disorders"                                      
## [5] "Homo sapiens: ABC transporters in lipid homeostasis"                          
## [6] "Homo sapiens: ABC-family proteins mediated transport"
#Input reference List of driver genes, and Pathway of interest (Signal transduction)
vogel<-read.table('vogel.txt',header = FALSE, sep='\t')
c5000<-read.table('Cancer5000.csv',header = FALSE, sep='\t')
census<-read.table('census.csv',header = FALSE, sep=',')
ven1<-venn( list(Cancer5000=c5000$V1,Vogelstein=vogel$V1,CGC=census$V1))

isect <- attr(ven1, "intersection")
refDriver<-isect$`Cancer5000:Vogelstein:CGC`

dfrefDriver<-data.frame(refDriver)

PidSymbol<-dfrefDriver$refDriver

indx<-V(g)$name %in% PidSymbol
subv<- V(g)[indx]
net2 <- induced.subgraph(graph=g,vids=subv)
dg <- decompose.graph(net2) 
DriverNetwork<-dg[[1]]  
DriverNetwork
## IGRAPH d9e81d6 UN-- 68 643 -- 
## + attr: name (v/c), V3 (e/n)
## + edges from d9e81d6 (vertex names):
##  [1] AKT1  --ASXL1  AKT1  --BRAF   AKT1  --BRCA1  ATM   --BRCA1 
##  [5] CASP8 --CDH1   AKT1  --CDKN2A ATM   --CDKN2A BRCA1 --CDKN2A
##  [9] AKT1  --CREBBP APC   --CREBBP ARID1A--CREBBP ATM   --CREBBP
## [13] BRCA1 --CREBBP CDH1  --CREBBP AKT1  --CTNNB1 APC   --CTNNB1
## [17] BRCA1 --CTNNB1 CASP8 --CTNNB1 CDH1  --CTNNB1 CREBBP--CTNNB1
## [21] AKT1  --EGFR   BRAF  --EGFR   CDH1  --EGFR   CDKN2A--EGFR  
## [25] CTNNB1--EGFR   AKT1  --EP300  ARID1A--EP300  ATM   --EP300 
## [29] BRCA1 --EP300  CDH1  --EP300  CDKN2A--EP300  CREBBP--EP300 
## + ... omitted several edges
plot(DriverNetwork,vertex.label=NA)

VdfDriverNetwork<-data.frame(V1 = V(DriverNetwork)$name)

## Analysis can be done with any pathway
## 3 pathways were tested 1.Gene Expression 2. Immune System 3. Signal Transduction
## The final pathway is used for processing.
## Ensure the required pathway is the final pathway to be loaded
## This arrangement of the code uses the Signal Transduction network 

pathwayOfInterest ="Homo sapiens: Gene expression (Transcription)"
pathwayOfInterest ="Homo sapiens: Immune System"
pathwayOfInterest = "Homo sapiens: Signal Transduction"



xx <- as.list(reactomePATHNAME2ID) 
pathwayOfInterestID = as.character(xx[pathwayOfInterest])
xx2 <- as.list(reactomePATHID2EXTID) 
PidGene<-(xx2[pathwayOfInterestID])
PidGeneid<-PidGene[[1]]
PathwayGeneList<-as.character(PidGeneid)
PathwayGeneListAlias<-data.frame(select(org.Hs.eg.db, PathwayGeneList, c("ALIAS"), "ENTREZID"))
## 'select()' returned many:many mapping between keys and columns
PidSymbol<-PathwayGeneListAlias$ALIAS

indx<-V(g)$name %in% PidSymbol
subv<- V(g)[indx]
net2 <- induced.subgraph(graph=g,vids=subv)

#decompose to determine final ST connected pathway

dg <- decompose.graph(net2) 
PathwayNetwork<-dg[[1]] 
PathwayNetwork
## IGRAPH daac892 UN-- 2655 207992 -- 
## + attr: name (v/c), V3 (e/n)
## + edges from daac892 (vertex names):
##  [1] AKT1    --AR       AKT1    --ARHGEF12 AKT1    --ARHGEF7 
##  [4] ARHGEF12--ARHGEF7  ARPC2   --ARPC4    APC     --ASH2L   
##  [7] AR      --ASH2L    ARPC2   --ATP6V1B1 ATP6V1B1--ATP6V1D 
## [10] AKT1    --AURKB    ASH2L   --AURKB    APC     --AXIN1   
## [13] AKT1    --BAD      AKT1    --BCL2     BAD     --BCL2    
## [16] AR      --BIRC2    AURKB   --BIRC2    BCL2    --BIRC2   
## [19] AR      --BIRC3    AURKB   --BIRC3    BCL2    --BIRC3   
## [22] BIRC2   --BIRC3    APC     --BIRC5    AR      --BIRC5   
## + ... omitted several edges
plot(PathwayNetwork,vertex.label=NA)

VdfPathwayNetwork<-data.frame(V1 = V(PathwayNetwork)$name)
Vdf<-data.frame(V1 = V(g)$name)

x <- org.Hs.egSYMBOL2EG 
mapped_genes <- mappedkeys(x) 
VdfPathwayNetwork2<-as.character(VdfPathwayNetwork$V1)
indx<-(mapped_genes %in% VdfPathwayNetwork2)
VdfPathwayNetwork1<-mapped_genes[indx]
VdfPathwayNetworkEZ<-data.frame(select(org.Hs.eg.db, VdfPathwayNetwork1, c("ENTREZID","GENENAME"), "ALIAS"))
## 'select()' returned 1:many mapping between keys and columns
dim(VdfPathwayNetworkEZ)
## [1] 2877    3
head(VdfPathwayNetworkEZ)
##   ALIAS ENTREZID                                           GENENAME
## 1   A2M        2                              alpha-2-macroglobulin
## 2 ABCA4       24          ATP binding cassette subfamily A member 4
## 3  ABL1       25 ABL proto-oncogene 1, non-receptor tyrosine kinase
## 4   ABR       29          ABR, RhoGEF and GTPase activating protein
## 5  ACTB       60                                         actin beta
## 6  ACTB   728378                POTE ankyrin domain family member F
VdfDriverNetworkc<-as.character(VdfDriverNetwork$V1)
VdfDriverNetworkEZ<-data.frame(select(org.Hs.eg.db, VdfDriverNetworkc, c("ENTREZID","GENENAME"), "ALIAS"))
## 'select()' returned 1:many mapping between keys and columns
dim(VdfDriverNetworkEZ)
## [1] 74  3
head(VdfDriverNetworkEZ)
##    ALIAS ENTREZID
## 1   AKT1      207
## 2    APC      324
## 3    APC     5624
## 4 ARID1A     8289
## 5  ASXL1   171023
## 6    ATM      472
##                                                     GENENAME
## 1                              AKT serine/threonine kinase 1
## 2                       APC, WNT signaling pathway regulator
## 3 protein C, inactivator of coagulation factors Va and VIIIa
## 4                              AT-rich interaction domain 1A
## 5     additional sex combs like 1, transcriptional regulator
## 6                                ATM serine/threonine kinase
size<-nrow(VdfPathwayNetworkEZ)     
size1<-nrow(VdfDriverNetworkEZ)
Jindex<-matrix(0,size,size1)        

colnames(Jindex)<-VdfDriverNetworkEZ$ENTREZID
rownames(Jindex)<-VdfPathwayNetworkEZ$ENTREZID

xx1 <- as.list(reactomeEXTID2PATHID)
for (j in 1:size){

nvgene<-VdfPathwayNetworkEZ$ENTREZID[j]

for(i in 1:size1) {
valid<-FALSE
valid1<-FALSE
valid2<-FALSE


# Calculate jcard index for 
vgene<-VdfDriverNetworkEZ$ENTREZID[i]

EZ2Pid<-(xx1[vgene])
if (!(is.null(EZ2Pid[[1]]))){
valid1<-TRUE
dfvEZ2Pid<-stack(EZ2Pid)
}


EZ2Pid<-(xx1[nvgene])
if (!(is.null(EZ2Pid[[1]]))){
    valid2<-TRUE
    dfnvEZ2Pid<-stack(EZ2Pid)
}

# Similarity between pathways 


if (valid1 && valid2){
c<-dfnvEZ2Pid$values
a<-as.character(c)
A<-as.set(a)

c<-dfvEZ2Pid$values
B<-as.set(c)
Jindex[j,i]<-set_similarity(A, B)


} # if valid1
} # for i
} #for j

dim(Jindex)
## [1] 2877   74
Jindex[1:10,1:16]
##               207        324       5624       8289     171023        472
## 2      0.02970297 0.02222222 0.11538462 0.00000000 0.00000000 0.00000000
## 24     0.05208333 0.07500000 0.00000000 0.00000000 0.00000000 0.00000000
## 25     0.09649123 0.01515152 0.02040816 0.10000000 0.00000000 0.14666667
## 29     0.03125000 0.02500000 0.00000000 0.00000000 0.00000000 0.00000000
## 60     0.09027778 0.06451613 0.03797468 0.04109589 0.05882353 0.01724138
## 728378 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000
## 71     0.08870968 0.04054054 0.00000000 0.00000000 0.00000000 0.00000000
## 88     0.02803738 0.01960784 0.02941176 0.00000000 0.00000000 0.00000000
## 91     0.02150538 0.02777778 0.00000000 0.00000000 0.00000000 0.00000000
## 92     0.02127660 0.02702703 0.00000000 0.00000000 0.00000000 0.00000000
##               673        672        841        999      51343      1029
## 2      0.01851852 0.00000000 0.01923077 0.11764706 0.00000000 0.0000000
## 24     0.06122449 0.00000000 0.02040816 0.06060606 0.00000000 0.0000000
## 25     0.01333333 0.17460317 0.04225352 0.07272727 0.05263158 0.1355932
## 29     0.02040816 0.00000000 0.04347826 0.06451613 0.00000000 0.0000000
## 60     0.14893617 0.04950495 0.02912621 0.12345679 0.01098901 0.0312500
## 728378 0.00000000 0.00000000 0.00000000 0.00000000 0.00000000 0.0000000
## 71     0.19444444 0.00000000 0.03703704 0.16949153 0.01449275 0.0000000
## 88     0.19607843 0.00000000 0.01724138 0.04761905 0.00000000 0.0000000
## 91     0.02222222 0.00000000 0.02325581 0.03571429 0.00000000 0.0000000
## 92     0.02173913 0.00000000 0.02272727 0.03448276 0.00000000 0.0000000
##              1387       1499       1956       2033
## 2      0.02325581 0.02941176 0.01369863 0.01904762
## 24     0.03658537 0.04687500 0.07575758 0.02970297
## 25     0.10101010 0.13750000 0.06741573 0.11304348
## 29     0.01219512 0.03174603 0.04545455 0.00990099
## 60     0.09302326 0.17142857 0.12389381 0.14285714
## 728378 0.00000000 0.00000000 0.00000000 0.00000000
## 71     0.05309735 0.18823529 0.14130435 0.04545455
## 88     0.02173913 0.02702703 0.05263158 0.01801802
## 91     0.02597403 0.03389831 0.03174603 0.02083333
## 92     0.02564103 0.03333333 0.03125000 0.02061856
Jindex[Jindex > 0] <- 1
image( Jindex, xaxt= "n", yaxt= "n" )

reach2=function(x){         
r=vector(length=vcount(x))
for (i in 1:vcount(x)){
n=neighborhood(x,2,nodes=i)
ni=unlist(n)
l=length(ni)
r[i]=(l)/vcount(x)}
r}

r2<-reach2(g)
Vdf$r2<-r2
range<-quantile(r2)

size<-length(VdfPathwayNetwork2)
size1<-length(VdfDriverNetworkc)
RLevel<-matrix(0,size,size1)        

colnames(RLevel)<-VdfDriverNetworkc
rownames(RLevel)<-VdfPathwayNetwork2

for (j in 1:size){
nvgene<-VdfPathwayNetwork2[j]
  
for(i in 1:size1) {

vgene<-VdfDriverNetworkc[i]
  
geneid1<-as.character(vgene)
geneid2<-as.character(nvgene)
r21<-Vdf[Vdf$V1==geneid1,]$r2
r22<-Vdf[Vdf$V1==geneid2,]$r2
r21I<-findInterval(r21,range)
r22I<-findInterval(r22,range)

if (r21I==r22I)
    RLevel[j,i]<-1



} # for i
} #for j

RLevel[1:10,1:16]
##          AKT1 APC ARID1A ASXL1 ATM BRAF BRCA1 CASP8 CDH1 CDKN2A CREBBP
## AKT1        1   1      1     0   1    1     1     1    1      1      1
## AMOT        0   0      0     1   0    0     0     0    0      0      0
## APC         1   1      1     0   1    1     1     1    1      1      1
## APOA1       0   0      0     0   0    0     0     0    0      0      0
## AR          1   1      1     0   1    1     1     1    1      1      1
## ARHGEF12    1   1      1     0   1    1     1     1    1      1      1
## ARHGEF7     1   1      1     0   1    1     1     1    1      1      1
## ARPC2       1   1      1     0   1    1     1     1    1      1      1
## ARPC4       0   0      0     1   0    0     0     0    0      0      0
## ASH2L       1   1      1     0   1    1     1     1    1      1      1
##          CTNNB1 EGFR EP300 ERBB2 FBXW7
## AKT1          1    1     1     1     1
## AMOT          0    0     0     0     0
## APC           1    1     1     1     1
## APOA1         0    0     0     0     0
## AR            1    1     1     1     1
## ARHGEF12      1    1     1     1     1
## ARHGEF7       1    1     1     1     1
## ARPC2         1    1     1     1     1
## ARPC4         0    0     0     0     0
## ASH2L         1    1     1     1     1
#######################

df1Jindex<-as.data.frame(as.table(Jindex))
df2Jindex<-merge(df1Jindex,VdfDriverNetworkEZ,by.x="Var2",by.y="ENTREZID")
df3Jindex<-merge(df2Jindex,VdfPathwayNetworkEZ,by.x="Var1",by.y="ENTREZID")
df4Jindex<-df3Jindex[,c("Freq","ALIAS.x","ALIAS.y")]
PMatrix3<-df4Jindex
ssize<-50

#1000 repetitions

for (j in 1:1000) {

dim(PMatrix3)   
head(PMatrix3)

df2<-PMatrix3
colnames(df2) <- c("PFreq","Var2","Var1")
head(df2)
drivers<-data.frame(unique(df2$Var2))

smpl <- drivers[sample(nrow(drivers), ssize),]
indx<-df2$Var2 %in% smpl
dfPMatrix3<-df2[indx,]
df2<-dfPMatrix3
head(df2)
tail(df2)


dim(PMatrix3)   
dim(dfPMatrix3)
dim(df2)
dim(RLevel) 

df1Jindex<-as.data.frame(as.table(RLevel))
df3<-df1Jindex
dim(df3) 


indx<-df3$Var2 %in% smpl
dfJindex<-df3[indx,]
df3<-dfJindex
head(df3)
tail(df3)

dim(Jindex)
dim(dfJindex) 
dim(df3)

################################################

colnames(df2) <- c("PFreq","Var2","Var1")
d4<-merge(df2, df3, by = c("Var1","Var2"), all.x = TRUE)
d4[is.na(d4)] <- 0
colnames(d4)[1] <- "cgene"
colnames(d4)[2] <- "dgene"
colnames(d4)[3] <- "Pscore"
colnames(d4)[4] <- "Nscore"
head(d4)

dftbl<-data.frame(table(d4$Nscore,d4$Pscore,d4$cgene))
head(dftbl)
dim(dftbl)
colnames(dftbl)[1]<-"gene"
colnames(dftbl)[2]<-"path"
colnames(dftbl)[3]<-"cgene"
head(dftbl)

df2<-dftbl
head(df2)
candidate<- data.frame(table(df2$cgene))
ngene<-nrow(candidate)
tab <- t(matrix(nrow=2,ncol=2))

for (i in 1:ngene) {
    t<-(i-1)*4
    tab[1,1]<-df2[t+1,4]
    tab[2,1]<-df2[t+2,4]
    tab[1,2]<-df2[t+3,4]
    tab[2,2]<-df2[t+4,4]
    f<-fisher.test(tab)
    candidate[i,"Freq"]<-f$p.value
}


if (j==1){
  gene1<-candidate
  candidate$Var1     <- NULL
  colnames(candidate)<-paste('Random', j, sep='')
  gene<-candidate
}  
if (j>1){
 
candidate$Var1     <- NULL
colnames(candidate)<-paste('Random', j, sep='')
gene<-cbind(gene,candidate)
gene1<-cbind(gene1,candidate) 
}


}

candidate<- data.frame(table(df2$cgene))
rownames(gene)<-candidate$Var1

##p-values adjusted to control the False Discovery Rate (FDR) 
##using the Benjamini-Hochberg procedure for multiple-testing correction. 

ngene<-nrow(gene)
for (i in 1:ngene) {
adjPval<-gene[i,1:1000]
gene[i,1:1000]<-round(p.adjust(adjPval, "BH"), 3)
}

x<-as.matrix(gene)
heatmap.2(x, col=redgreen(75),  dendrogram= "row", trace="none",density.info="none",labRow=" ", labCol=" ")

 sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 17134)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] sets_1.0-18          gplots_3.0.1         igraph_1.2.2        
##  [4] org.Hs.eg.db_3.6.0   reactome.db_1.64.0   AnnotationDbi_1.42.1
##  [7] IRanges_2.14.10      S4Vectors_0.18.3     Biobase_2.40.0      
## [10] BiocGenerics_0.26.0 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.18       knitr_1.20         magrittr_1.5      
##  [4] bit_1.1-14         stringr_1.3.1      blob_1.1.1        
##  [7] caTools_1.17.1.1   tools_3.5.1        KernSmooth_2.23-15
## [10] DBI_1.0.0          gtools_3.8.1       htmltools_0.3.6   
## [13] yaml_2.2.0         bit64_0.9-7        rprojroot_1.3-2   
## [16] digest_0.6.15      bitops_1.0-6       memoise_1.1.0     
## [19] evaluate_0.12      RSQLite_2.1.1      rmarkdown_1.10    
## [22] gdata_2.18.0       stringi_1.2.4      compiler_3.5.1    
## [25] backports_1.1.2    pkgconfig_2.0.1
#devtools::session_info()