#Ensure that final matrices match the N_molecules X 286 (ncols == 286)
#Changes to the smiles files (.smi) need to be done directly in Notepad++ or similar
#All data should be save in an RData workspace

#inspect attached libraries
sessionInfo() 


#missing installations...
install.packages("rJava",,"http://rforge.net")

if (!requireNamespace("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("Rcpi") #no updates!!!
install.packages("devtools")
library(devtools)
devtools::install_github("kevinushey/Kmisc")

#load libraries
library(caret)
library(Rcpi)
library(ChemmineR)
library(dplyr)
library(ChemmineOB)
library("kernlab")
library(rJava)
library(Kmisc)
library(h2o)

#initiate h2o and check status
#insufficient memory might affect accuracy
localH2O = h2o.init(ip = "localhost",
                    port = 54321,
                    max_mem_size="16G",
                    nthreads = 4)
h2o.clusterStatus()


#****set your working directory & files (keep this consistent)
file_loc<-"DRIVE:/DIRECTORY"
setwd(file_loc)

#load molecules referenced in supplemental files of Meyerhof et al (ref. #11)
#**!!Replace DRIVE with address to the smiles file
x.mol <- readMolFromSmi('DRIVE:/DIRECTORY/smst2.smi', type = "mol") #[Rcpi] once cid was set to false, read ok
class(x.mol)
x.smi <- readMolFromSmi('DRIVE:/DIRECTORY/smst2.smi', type = "text")

#Extract Molecular Fingerprints 
x1 <- extractDrugEstateComplete(x.mol) #worked with x.mol from a write with cid=FALSE; mostly 0 -> low Var
x2 <- extractDrugMACCSComplete(x.mol)
x3 <- extractDrugOBFP4(x.smi, type = "smile")

##Extract inter-relational Geometric Molecular Graph representations & convert to dataframe
##fp is a 82X113 molGraph
fp  = extractDrugGraph(x.mol)
class(fp)
fpn<-pad(fp,113)
ufp<-unlist(fpn,recursive = TRUE, use.names = TRUE)
str(ufp)
fpdf <- data.frame(matrix(unlist(fpn), nrow=length(fpn), byrow=T),stringsAsFactors=FALSE) 
x4<-fpdf

#remove low and zero variance columns
x1 <- x1[, -nearZeroVar(x1)]
x2 <- x2[, -nearZeroVar(x2)]
x3 <- x3[, -nearZeroVar(x3)]
x4 <- x4[, -nearZeroVar(x4)]

#Examine Column Contributions
tot.cols<-ncol(x1)+ncol(x2)+ncol(x3)+ncol(x4)
pctTop<-ncol(x1)/tot.cols
pctStr<-(ncol(x2)+ncol(x3))/tot.cols
pctGeo<-ncol(x4)/tot.cols

#load RQI (Relative Quinine Indices) referenced in supplemental files of Meyerhof et al (ref. #11)
yd<-read.csv("smiles_rqi.csv")
summary(yd)
y<-yd$RQI
yxmol<-as.data.frame(cbind(y,x4,x1,x2,x3))
dim(yxmol)
colNames <- make.names(names=names(yxmol), unique=TRUE, allow_ = TRUE)
names(yxmol) <- colNames

##----Build a GBM-----##

h2o.clusterStatus() #Ensure still runnning/healthy

hdxmol<-as.h2o(yxmol)

hdx.gbm5pc <- h2o.gbm(x = c(2:286),
                      y = 1,
                      training_frame = hdxmol,
                      distribution = "AUTO",
                      ntrees = 199,
                      max_depth = 7,
                      min_rows = 2,
                      learn_rate = 0.2,
                      nfolds = 8,
                      fold_assignment = "Modulo",
                      keep_cross_validation_predictions = TRUE,
                      seed = 1)

plot(hdx.gbm5pc, timestep = "duration", metric = "mae",xlab="DURATION IN SECONDS")

####-Prediction with the GBM-################
####Edit the .smi file with the last entry as the new molecule
###For salts, use the cationic active.
##Below loading process is erratic at best:
#------------------------------------------------------------------------------------
##sdf<-smiles2sdf("CC[N+](CC)(CC1=CC=CC=C1)CC(=O)NC2=C(C=CC=C2C)C")
#smiset <- sdf2smiles(sdf) #this is a sdfset
#write.SMI(smiset[1], file="tstMOl.smi")
#convertFormatFile("smi","mol","tstMOl.smi","tstCmpd.mol")
#only a smiles required else, it fails
#------------------------------------------------------------------------------------
tx.mol <- readMolFromSmi('DRIVE:/DIRECTORY/ts_smst2.smi', type = "mol") 
tx.smi <- readMolFromSmi('DRIVE:/DIRECTORY/ts_smst2.smi', type = "text") 
#obtain extended prediction input space (previous molecules + the new molecule to be predicted)
z1 <- extractDrugEstateComplete(tx.mol) 
z2 <- extractDrugMACCSComplete(tx.mol)
z3 <- extractDrugOBFP4(tx.smi, type = "smile")
fp2  = extractDrugGraph(tx.mol)
class(fp2)
fpn2<-pad(fp2,113) 
fpdf2 <- data.frame(matrix(unlist(fpn2), nrow=length(fpn2), byrow=T),stringsAsFactors=FALSE) #fp2 is now a 1X113 molGraph
z4<-fpdf2

#remove low and zero variance cols
z1 <- z1[, -nearZeroVar(z1)]
z2 <- z2[, -nearZeroVar(z2)]
z3 <- z3[, -nearZeroVar(z3)]
z4 <- z4[, -nearZeroVar(z4)]

ts_yxmol<-as.data.frame(cbind(z4,z1,z2,z3))
tsColNames<- make.names(names=names(ts_yxmol), unique=TRUE, allow_ = TRUE)
names(ts_yxmol) <- tsColNames

#---predict new molecules---#
txfr<-as.h2o(ts_yxmol)
pr3<-as.data.frame(h2o.predict(hdx.gbm5pc,txfr))$predict #must recreate the model and the matrix
print(pr3[83])