#Ensure that final matrices match the N_molecules X 286 (ncols == 286) #Changes to the smiles files (.smi) need to be done directly in Notepad++ or similar #All data should be save in an RData workspace #inspect attached libraries sessionInfo() #missing installations... install.packages("rJava",,"http://rforge.net") if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("Rcpi") #no updates!!! install.packages("devtools") library(devtools) devtools::install_github("kevinushey/Kmisc") #load libraries library(caret) library(Rcpi) library(ChemmineR) library(dplyr) library(ChemmineOB) library("kernlab") library(rJava) library(Kmisc) library(h2o) #initiate h2o and check status #insufficient memory might affect accuracy localH2O = h2o.init(ip = "localhost", port = 54321, max_mem_size="16G", nthreads = 4) h2o.clusterStatus() #****set your working directory & files (keep this consistent) file_loc<-"DRIVE:/DIRECTORY" setwd(file_loc) #load molecules referenced in supplemental files of Meyerhof et al (ref. #11) #**!!Replace DRIVE with address to the smiles file x.mol <- readMolFromSmi('DRIVE:/DIRECTORY/smst2.smi', type = "mol") #[Rcpi] once cid was set to false, read ok class(x.mol) x.smi <- readMolFromSmi('DRIVE:/DIRECTORY/smst2.smi', type = "text") #Extract Molecular Fingerprints x1 <- extractDrugEstateComplete(x.mol) #worked with x.mol from a write with cid=FALSE; mostly 0 -> low Var x2 <- extractDrugMACCSComplete(x.mol) x3 <- extractDrugOBFP4(x.smi, type = "smile") ##Extract inter-relational Geometric Molecular Graph representations & convert to dataframe ##fp is a 82X113 molGraph fp = extractDrugGraph(x.mol) class(fp) fpn<-pad(fp,113) ufp<-unlist(fpn,recursive = TRUE, use.names = TRUE) str(ufp) fpdf <- data.frame(matrix(unlist(fpn), nrow=length(fpn), byrow=T),stringsAsFactors=FALSE) x4<-fpdf #remove low and zero variance columns x1 <- x1[, -nearZeroVar(x1)] x2 <- x2[, -nearZeroVar(x2)] x3 <- x3[, -nearZeroVar(x3)] x4 <- x4[, -nearZeroVar(x4)] #Examine Column Contributions tot.cols<-ncol(x1)+ncol(x2)+ncol(x3)+ncol(x4) pctTop<-ncol(x1)/tot.cols pctStr<-(ncol(x2)+ncol(x3))/tot.cols pctGeo<-ncol(x4)/tot.cols #load RQI (Relative Quinine Indices) referenced in supplemental files of Meyerhof et al (ref. #11) yd<-read.csv("smiles_rqi.csv") summary(yd) y<-yd$RQI yxmol<-as.data.frame(cbind(y,x4,x1,x2,x3)) dim(yxmol) colNames <- make.names(names=names(yxmol), unique=TRUE, allow_ = TRUE) names(yxmol) <- colNames ##----Build a GBM-----## h2o.clusterStatus() #Ensure still runnning/healthy hdxmol<-as.h2o(yxmol) hdx.gbm5pc <- h2o.gbm(x = c(2:286), y = 1, training_frame = hdxmol, distribution = "AUTO", ntrees = 199, max_depth = 7, min_rows = 2, learn_rate = 0.2, nfolds = 8, fold_assignment = "Modulo", keep_cross_validation_predictions = TRUE, seed = 1) plot(hdx.gbm5pc, timestep = "duration", metric = "mae",xlab="DURATION IN SECONDS") ####-Prediction with the GBM-################ ####Edit the .smi file with the last entry as the new molecule ###For salts, use the cationic active. ##Below loading process is erratic at best: #------------------------------------------------------------------------------------ ##sdf<-smiles2sdf("CC[N+](CC)(CC1=CC=CC=C1)CC(=O)NC2=C(C=CC=C2C)C") #smiset <- sdf2smiles(sdf) #this is a sdfset #write.SMI(smiset[1], file="tstMOl.smi") #convertFormatFile("smi","mol","tstMOl.smi","tstCmpd.mol") #only a smiles required else, it fails #------------------------------------------------------------------------------------ tx.mol <- readMolFromSmi('DRIVE:/DIRECTORY/ts_smst2.smi', type = "mol") tx.smi <- readMolFromSmi('DRIVE:/DIRECTORY/ts_smst2.smi', type = "text") #obtain extended prediction input space (previous molecules + the new molecule to be predicted) z1 <- extractDrugEstateComplete(tx.mol) z2 <- extractDrugMACCSComplete(tx.mol) z3 <- extractDrugOBFP4(tx.smi, type = "smile") fp2 = extractDrugGraph(tx.mol) class(fp2) fpn2<-pad(fp2,113) fpdf2 <- data.frame(matrix(unlist(fpn2), nrow=length(fpn2), byrow=T),stringsAsFactors=FALSE) #fp2 is now a 1X113 molGraph z4<-fpdf2 #remove low and zero variance cols z1 <- z1[, -nearZeroVar(z1)] z2 <- z2[, -nearZeroVar(z2)] z3 <- z3[, -nearZeroVar(z3)] z4 <- z4[, -nearZeroVar(z4)] ts_yxmol<-as.data.frame(cbind(z4,z1,z2,z3)) tsColNames<- make.names(names=names(ts_yxmol), unique=TRUE, allow_ = TRUE) names(ts_yxmol) <- tsColNames #---predict new molecules---# txfr<-as.h2o(ts_yxmol) pr3<-as.data.frame(h2o.predict(hdx.gbm5pc,txfr))$predict #must recreate the model and the matrix print(pr3[83])