The notebook provides details of the analysis including scripts, code, and file descriptions.
foldx_scan.py -- Python script used to run FoldX on a computer cluster
Folding stability command example: 'foldx_scan.py -i wt.pdb -n 10 -t 300 -p'
Binding stability command example: 'foldx_scan.py -i wt.pdb -n 20 -g ABC -m ABC -t 300 -p'
pdb2pqr.sh -- Bash script used to process files using pdb2pqr
Command example: 'pdb2pqr.sh -i in -s'
The following list of commands uses structure 'in.pdb'. The same list was used for all molecular dynamics simulations.
gmx-5.0 pdb2gmx -vsite hydrogens -ff charmm22star -water tip3p -ignh -f in.pdb -o out.gro -p out.top -i out.posre
gmx-5.0 editconf -f out.gro -o out.gro -bt dodecahedron -d 1.0
gmx-5.0 solvate -cs -cp out.gro -o out.gro -p out.top
gmx-5.0 grompp -f em -po em.mdout -c out.gro -p out.top -o em
gmx-5.0 genion -s em -o out.gro -p out.top -conc 0.15 -neutral -pname NA -nname CL
gmx-5.0 grompp -f em -po em.mdout -c out.gro -p out.top -o em
mdrun-5.0_gpu -ntomp 0 -pin on -v -cpo pr -s pr -o pr -x pr -c pr.last -e pr -g pr
gmx-5.0 grompp -f pr -po pr.mdout -c em.last -p out.top -o pr
mdrun-5.0_gpu -ntomp 0 -pin on -v -cpo pr -s pr -o pr -x pr -c pr.last -e pr -g pr
gmx-5.0 grompp -f md -po md.mdout -c pr.last -p out.top -o md
mdrun-5.0_gpu -ntomp 0 -pin on -v -cpo md -s md -o md -x md -c md.last -e md -g md
gmx-5.0 grompp -f prod -po prod.mdout -c out -p out -o prod
mdrun-5.0_gpu -ntomp 0 -pin on -cpo prod -s prod -o prod -x prod -c prod.last -e prod -g prod
The folder 'test_systems' holds the experiemtnal and predicted $\Delta \Delta G$ values for each of the 10 folding and the 10 binding systems. These 20 files all have same format, each wiht the following four columns: (1) mutation (mut#
); (2) experimentally determined ddG value (exp
); (3) predcited ddG values using FoldX applied only to the experimental structure without MD simulations (foldx_exp
); (4) predicted $\Delta \Delta G$ values based on our method of anlayzing each of 100 MD snapshots with FoldX and then averaging. The following R code was used to read in and compare the abiltiy of two methods of predicting stability effects.
# Locate and input test system results
setwd("/Users/Craig/Dropbox/Craig_Work/Project--Ebola/Jupytr/Workflow_Files/test_systems")
files <- list.files()
exp.colnames <- c("mutation", "exp.ddG", "foldX.ddG", "MD.fX.ddG")
file.names.list <- sapply(files, function(x) strsplit(x, split="_"))
fold.files <- which(unlist(lapply(file.names.list, function(x) x[2] == "fold")))
bind.files <- which(unlist(lapply(file.names.list, function(x) x[2] == "bind")))
fold.systems1 <- sapply(names(fold.files), function(x) unlist(strsplit(x, split="_"))[3])
fold.systems2 <- sapply(fold.systems1, function(x) unlist(strsplit(x, split="\\."))[1])
bind.systems1 <- sapply(names(bind.files), function(x) unlist(strsplit(x, split="_"))[3])
bind.systems2 <- sapply(bind.systems1, function(x) unlist(strsplit(x, split="\\."))[1])
systems2 <- fold.systems2
system.names <- lapply(file.names.list, function(x) unlist(strsplit(x[3], split="\\."))[1])
n.systems <- length(fold.files)
First cell below defines whether to output pdf and table reslts. Second cell below does analysis and creates outputs as specified.
plot.pdf <- TRUE
pdf.file <- "Folding ddG experimental systems (v2).pdf"
print.table <- TRUE
table.file <- "Folding Summary Table v2.txt"
files.to.use <- fold.files
# Set to TRUE if you want to produce the pdf
plot.pdf <- TRUE
if (plot.pdf == TRUE){
pdf(file="Folding ddG experimental systems (v2).pdf", width=15, height=7)
layout(mat=matrix(nrow=2, ncol=5, data=seq(1,10), byrow=TRUE), heights=rep(3,2), widths=rep(3,5)) -> l
#layout.show(l)
#dev.off()
}
# Analysis
lm.smry <- as.data.frame(matrix(nrow=length(systems2)+1, ncol=13))
colnames(lm.smry) <- c("Protein", "n.sites", "n.mutations", "FX.bias", "FX.rMSE", "FX.Intercept", "FX.Slope", "FX.R2", "MDS.bias", "MDS.rMSE", "MDS.Intercept", "MDS.Slope", "MDS.R2")
lm.smry$Protein <- c(systems2, "Means")
round.vals <- c(NA, NA, 0,0, 2,2,2,2,2,2,2,2,2,2)
for (file.i in 1:length(files.to.use)){
data <- read.table(file=files[files.to.use[file.i]])
colnames(data) <- exp.colnames
max.E <- max(data$exp.ddG)
max.F <- max(data$foldX.ddG)
max.M <- max(data$MD.fX.ddG)
min.E <- min(data$exp.ddG)
min.F <- min(data$foldX.ddG)
min.b <- min(min.E, min.F)
max.b <- max(max.E, max.F)
lm <- lm(data$exp.ddG ~ data$foldX.ddG)
lm2 <- lm(data$exp.ddG ~ data$MD.fX.ddG )
mean.error.FX <- mean(abs(data$exp.ddG - data$foldX.ddG))
mean.error.MDS <- mean(abs(data$exp.ddG - data$MD.fX.ddG))
bias.FX <- mean(data$exp.ddG - data$foldX.ddG)
bias.MDS <- mean(data$exp.ddG - data$MD.fX.ddG)
muts <- as.character(data$mutation)
sites <- sapply(muts, function(x) substring(x, 3, nchar(x)-1))
lm.smry$n.sites[file.i] <- length(unique(sites))
lm.smry$n.mutations[file.i]<- length(muts)
lm.smry$FX.Intercept[file.i] <- round(coefficients(lm)[1],2)
lm.smry$FX.Slope[file.i] <- round(coefficients(lm)[2],2)
lm.smry$FX.R2[file.i] <- round(summary(lm)$r.squared,2)
lm.smry$FX.rMSE[file.i] <- round(mean.error.FX,2)
lm.smry$MDS.Intercept[file.i] <- round(coefficients(lm2)[1],2)
lm.smry$MDS.Slope[file.i] <- round(coefficients(lm2)[2],2)
lm.smry$MDS.R2[file.i] <- round(summary(lm2)$r.squared,2)
lm.smry$MDS.rMSE[file.i] <- round(mean.error.MDS,2)
lm.smry$FX.bias[file.i] <- round(bias.FX,2)
lm.smry$MDS.bias[file.i] <- round(bias.MDS,2)
mean.exp <- mean(data$exp.ddG)
TSS <- sum((data$exp.ddG - mean.exp)^2)
fX.SS <- sum((data$exp.ddG - data$foldX.ddG)^2)
fX.Resids <- (data$exp.ddG - data$foldX.ddG)
MS.SS <- sum((data$exp.ddG - data$MD.fX.ddG)^2)
MS.resids <- (data$exp.ddG - data$MD.fX.ddG)
R2.fX <- (TSS - fX.SS)/TSS
R2.MS <- (TSS - MS.SS)/TSS
if (plot.pdf == TRUE){
par(mar=c(5,5,4,1))
grey.col <- "grey75"
pt.cex <- 0.8
leg.loc <- "bottomright"
scale <- 1.2
plot(data$MD.fX.ddG, data$exp.ddG, pch=24, bg="black", cex=pt.cex, xlim=c(min.b*scale, max.b*scale), ylim=c(min.b*scale, max.b*scale), xlab=expression(paste(Delta, Delta, G, " Predicted", " (kcal/mol)")), ylab=expression(paste(Delta, Delta, G, " Experimentally Observed", " (kcal/mol)")), main=systems2[file.i])
abline(0,1, lty="solid", col=grey.col, lwd=3)
points(data$foldX.ddG, data$exp.ddG, pch=4, bg="white", cex=pt.cex)
abline(coefficients(lm), lty="dashed")
abline(coefficients(lm2))
points(data$foldX.ddG, data$exp.ddG, pch=4, bg="black")
points(data$MD.fX.ddG, data$exp.ddG, pch=24, bg="black")
if (file.i == 1){
legend(leg.loc, legend=c("Perfect Fit", as.expression(bquote(paste("Ex ", R^2==.(format(round(summary(lm)$r.squared,2), nsmall=2))))), as.expression(bquote(paste("MD ", R^2==.(format(round(summary(lm2)$r.squared,2), nsmall=2)))))), bty="n", lty=c("solid", "dashed", "solid"), col=c(grey.col,"black", "black"), pch=c(NA, 4, 24), pt.bg=c(NA, NA, "black"), lwd=c(3,1,1))
} else{
legend(leg.loc, legend=c(as.expression(bquote(paste("Ex ", R^2==.(format(round(summary(lm)$r.squared,2), nsmall=2))))), as.expression(bquote(paste("MD ", R^2==.(format(round(summary(lm2)$r.squared,2),nsmall=2)))))), bty="n")
}
}
}
for (col.i in 2:length(lm.smry[1,])){
colmean <- round(mean(lm.smry[,col.i], na.rm=TRUE), round.vals[col.i])
lm.smry[n.systems+1,col.i] <- colmean
}
lm.smry
if (plot.pdf == TRUE){
dev.off()
}
if (print.table == TRUE){
write.table(x=lm.smry, file=table.file, col.names=TRUE, row.names=FALSE, sep="\t")
}
# Unpound these lines to output the results as LaTex Table
#library(xtable)
#x <- xtable(lm.smry, digits=c(1,round.vals), floating=FALSE, tabular.environment="tabular", hline.after=NULL, include.rownames=FALSE, include.colnames=TRUE, caption="Folding Stability")
#print(x, include.rownames=FALSE)
First cell below defines whether to output pdf and table reslts. Second cell below does analysis and creates outputs as specified.
plot.pdf <- TRUE
pdf.file <- "Binding ddG experimental systems (v2).pdf"
print.table <- TRUE
table.file <- "Binding Summary Table v2.txt"
# Note that this code is the duplicated from the folding code above.
files.to.use <- bind.files
if (plot.pdf == TRUE){
pdf(file=pdf.file, width=15, height=7)
layout(mat=matrix(nrow=2, ncol=5, data=seq(1,10), byrow=TRUE), heights=rep(3,2), widths=rep(3,5)) -> l
#layout.show(l)
#dev.off()
}
# Analysis
lm.smry <- as.data.frame(matrix(nrow=length(systems2)+1, ncol=13))
colnames(lm.smry) <- c("Protein", "n.sites", "n.mutations", "FX.bias", "FX.rMSE", "FX.Intercept", "FX.Slope", "FX.R2", "MDS.bias", "MDS.rMSE", "MDS.Intercept", "MDS.Slope", "MDS.R2")
lm.smry$Protein <- c(systems2, "Means")
round.vals <- c(NA, NA, 0,0, 2,2,2,2,2,2,2,2,2,2)
for (file.i in 1:length(files.to.use)){
data <- read.table(file=files[files.to.use[file.i]])
colnames(data) <- exp.colnames
max.E <- max(data$exp.ddG)
max.F <- max(data$foldX.ddG)
max.M <- max(data$MD.fX.ddG)
min.E <- min(data$exp.ddG)
min.F <- min(data$foldX.ddG)
min.b <- min(min.E, min.F)
max.b <- max(max.E, max.F)
lm <- lm(data$exp.ddG ~ data$foldX.ddG)
lm2 <- lm(data$exp.ddG ~ data$MD.fX.ddG )
mean.error.FX <- mean(abs(data$exp.ddG - data$foldX.ddG))
mean.error.MDS <- mean(abs(data$exp.ddG - data$MD.fX.ddG))
bias.FX <- mean(data$exp.ddG - data$foldX.ddG)
bias.MDS <- mean(data$exp.ddG - data$MD.fX.ddG)
muts <- as.character(data$mutation)
sites <- sapply(muts, function(x) substring(x, 3, nchar(x)-1))
lm.smry$n.sites[file.i] <- length(unique(sites))
lm.smry$n.mutations[file.i]<- length(muts)
lm.smry$FX.Intercept[file.i] <- round(coefficients(lm)[1],2)
lm.smry$FX.Slope[file.i] <- round(coefficients(lm)[2],2)
lm.smry$FX.R2[file.i] <- round(summary(lm)$r.squared,2)
lm.smry$FX.rMSE[file.i] <- round(mean.error.FX,2)
lm.smry$MDS.Intercept[file.i] <- round(coefficients(lm2)[1],2)
lm.smry$MDS.Slope[file.i] <- round(coefficients(lm2)[2],2)
lm.smry$MDS.R2[file.i] <- round(summary(lm2)$r.squared,2)
lm.smry$MDS.rMSE[file.i] <- round(mean.error.MDS,2)
lm.smry$FX.bias[file.i] <- round(bias.FX,2)
lm.smry$MDS.bias[file.i] <- round(bias.MDS,2)
mean.exp <- mean(data$exp.ddG)
TSS <- sum((data$exp.ddG - mean.exp)^2)
fX.SS <- sum((data$exp.ddG - data$foldX.ddG)^2)
fX.Resids <- (data$exp.ddG - data$foldX.ddG)
MS.SS <- sum((data$exp.ddG - data$MD.fX.ddG)^2)
MS.resids <- (data$exp.ddG - data$MD.fX.ddG)
R2.fX <- (TSS - fX.SS)/TSS
R2.MS <- (TSS - MS.SS)/TSS
if (plot.pdf == TRUE){
par(mar=c(5,5,4,1))
grey.col <- "grey75"
pt.cex <- 0.8
leg.loc <- "bottomright"
scale <- 1.2
plot(data$MD.fX.ddG, data$exp.ddG, pch=24, bg="black", cex=pt.cex, xlim=c(min.b*scale, max.b*scale), ylim=c(min.b*scale, max.b*scale), xlab=expression(paste(Delta, Delta, G, " Predicted", " (kcal/mol)")), ylab=expression(paste(Delta, Delta, G, " Experimentally Observed", " (kcal/mol)")), main=systems2[file.i])
abline(0,1, lty="solid", col=grey.col, lwd=3)
points(data$foldX.ddG, data$exp.ddG, pch=4, bg="white", cex=pt.cex)
abline(coefficients(lm), lty="dashed")
abline(coefficients(lm2))
points(data$foldX.ddG, data$exp.ddG, pch=4, bg="black")
points(data$MD.fX.ddG, data$exp.ddG, pch=24, bg="black")
if (file.i == 1){
legend(leg.loc, legend=c("Perfect Fit", as.expression(bquote(paste("Ex ", R^2==.(format(round(summary(lm)$r.squared,2), nsmall=2))))), as.expression(bquote(paste("MD ", R^2==.(format(round(summary(lm2)$r.squared,2), nsmall=2)))))), bty="n", lty=c("solid", "dashed", "solid"), col=c(grey.col,"black", "black"), pch=c(NA, 4, 24), pt.bg=c(NA, NA, "black"), lwd=c(3,1,1))
} else{
legend(leg.loc, legend=c(as.expression(bquote(paste("Ex ", R^2==.(format(round(summary(lm)$r.squared,2), nsmall=2))))), as.expression(bquote(paste("MD ", R^2==.(format(round(summary(lm2)$r.squared,2),nsmall=2)))))), bty="n")
}
}
}
for (col.i in 2:length(lm.smry[1,])){
colmean <- round(mean(lm.smry[,col.i], na.rm=TRUE), round.vals[col.i])
lm.smry[n.systems+1,col.i] <- colmean
}
lm.smry
if (plot.pdf == TRUE){
dev.off()
}
if (print.table == TRUE){
write.table(x=lm.smry, file=table.file, col.names=TRUE, row.names=FALSE, sep="\t")
}
# Unpound these lines to output the results as LaTex Table
#library(xtable)
#x <- xtable(lm.smry, digits=c(1,round.vals), floating=FALSE, tabular.environment="tabular", hline.after=NULL, include.rownames=FALSE, include.colnames=TRUE, caption="Folding Stability")
#print(x, include.rownames=FALSE)
Here we first read in the our ME+FoldX analysis of the Ebola system. Column 1 is the mutation, column 2 is the GP-antibody binding affinity, column 3 is the GP1-GP2 dimer binding stability, column 4 is the GP12 trimer binding stability, and column 5 is the monomer folding stability.
setwd("/Users/Craig/Dropbox/Craig_Work/Project--Ebola/Jupytr/Workflow_Files")
ebov.data <- read.table(file="mayinga_foldx.txt", header=TRUE, stringsAsFactors=FALSE)
head(ebov.data)
tail(ebov.data)
fold.cut <- 3.0 #2.5
bind.cut <- 3.0 #2.5
anti.bind.cut <- 2.0
fold.via <- which(ebov.data$monomer_fold < fold.cut)
fold.invia <- which(ebov.data$monomer_fold >= fold.cut)
anti.bind.via <- which(ebov.data$antibody_bind < anti.bind.cut)
anti.bind.invia <- which(ebov.data$antibody_bind >= anti.bind.cut)
dimer.bind.via <- which(ebov.data$dimer_bind < bind.cut)
dimer.bind.invia <- which(ebov.data$dimer_bind >= bind.cut)
trimer.bind.via <- which(ebov.data$trimer_bind < bind.cut)
trimer.bind.invia <- which(ebov.data$trimer_bind >= bind.cut)
watch.2 <- watches <- intersect(fold.via, anti.bind.invia)
ebov.data[watch.2,]
# Some other useful tid bits.
di.tri.via <- intersect(dimer.bind.via, trimer.bind.via)
di.tri.fold.via <- intersect(di.tri.via, fold.via)
via.anti.disrupt <- intersect(di.tri.fold.via, anti.bind.invia)
Note code right now outputs plot to this notebook. To instead create a pdf, unpound the first and last lines of the 2nd cell below.
mon.fold <- ebov.data$monomer_fold #[mon.cex.ord]
di.bind <- ebov.data$dimer_bind #[mon.cex.ord]
tri.bind <- ebov.data$trimer_bind #[mon.cex.ord]
anti.bind <- ebov.data$antibody_bind #[mon.cex.ord]
mon.cex <- rep(1, length(mon.fold))
mon.fold.big <- which(mon.fold > 5)
mon.cex[mon.fold.big] <- 1+mon.fold[mon.fold.big]/5
mon.cex.ord <- order(mon.cex, decreasing=TRUE)
obs <- which(ebov.data$mutation[watch.2]=="ND550K")
#pdf(file="watch_list_figure_v5.pdf", width=6, height=6)
par(mar=c(5,5,4,1))
let.cex <- 1.2
label.cex <- 1.3
plot(anti.bind, max.ddG, pch=20+max.id, bg="white", xlim=c(-2, 15), ylim=c(-3, 15), xlab=expression(paste(Delta, Delta, "G"["Antibody-GP binding"], " (kcal/mol)")), ylab=expression(paste(Delta, Delta, "G"["max"], " (kcal/mol)")), cex.lab =label.cex, xaxt="n", yaxt="n")
axis(side=1, at=c(seq(0,5),10,15), labels=c(0,rep(NA, 4),5,10,15))
axis(side=2, at=c(seq(0,5),10,15), labels=c(0,rep(NA, 4),5,10,15))
#abline(0,0)
#abline(v=0)
abline(3.0, 0, lty="dashed")
abline(v=2.0, lty="dashed")
abline(-3.0, 0, lty="dashed")
#abline(v=-2.5, lty="dashed")
points(anti.bind[watches], max.ddG[watches], pch=20+max.id[watches], bg="red", cex=1.2)
text(-1.5,-1.5, labels="A", font=2, cex=let.cex)
text(-1.5,10, labels="B", font=2, cex=let.cex)
text(3.9, 10, labels="C", font=2, cex=let.cex)
text(3.9,-1.5, labels="D", font=2, col="black", cex=let.cex)
text(9,0, labels="Watch List", font=2, col="red", cex=let.cex)
points(anti.bind[watches[obs]], max.ddG[watches[obs]], pch=20+max.id[watches], bg="red", cex=1.75)
text(anti.bind[watches[obs]], max.ddG[watches[obs]], labels="N550K", adj=c(0,3))
points(c(anti.bind[watches[obs]],anti.bind[watches[obs]]+0.3), c(max.ddG[watches[obs]], max.ddG[watches[obs]]-1), type="l")
legend(9, 11, legend=c("Dimer binding", "Trimer binding", "Monomer folding"), pch=c(21, 22, 23), pt.bg="white", cex=1.0, bg="white")
#text(anti.bind[watches], max.ddG[watches], labels=letters[1:24], pos=4, cex=0.75)
#dev.off()