# Project file for the paper # Chemical composition and the potential # for proteomic transformation in cancer, # hypoxia, and hyperosmotic stress # Author: Jeffrey M. Dick # First version: 20160410 # Last updated: 20170513 # Developed with R 3.2.4 and the following packages: # CHNOSZ 1.1.0 (https://cran.r-project.org/package=CHNOSZ) # canprot 0.0.5 (http://github.com/jedick/canprot) # optional: extrafont 0.17 (https://cran.r-project.org/package=extrafont) # Usage: # 1. Place this file in the R working directory. # 2. Create a subdirectory named 'potentials'. # 3. To use the Linux Biolinum font, change the following to TRUE. useLB <- FALSE # 4. Source this file in R and run the following commands. # basis_comp(pdf=TRUE) --> basis_comp.pdf (Figure S1) # cpcp(1, pdf=TRUE, arrows=TRUE) --> arrows1.pdf (Figure S2) # cpcp(2, pdf=TRUE, arrows=TRUE) --> arrows2.pdf (Figure S2) # cpcp(1, pdf=TRUE) --> cpcp1.pdf (Fig. 1) # cpcp(2, pdf=TRUE) --> cpcp2.pdf (Fig. 2) # (the above also produce summary_*.csv files needed for the following calculations) # mksummary() --> summary.csv (Table S1) # comppot() --> potentials/*.[csv|pdf] (Figure S3) # comppot(pdf=TRUE) --> comppot.pdf (Fig. 3) # lp() --> lipid:protein ratio (Fig. 4) # load required packages and data library(CHNOSZ) data(thermo) library(canprot) data(canprot) if(useLB) { # http://blog.revolutionanalytics.com/2012/09/how-to-use-your-favorite-fonts-in-r-charts.html library(extrafont) family <- "Linux Biolinum" } else { family <- "Helvetica" } # to plot compositions of down- and up-expressed proteins connected by arrows 20170311 arrowplot <- function(comptab, vars=c("ZC", "nH2O"), col="black", plot.text=TRUE) { # convert to data frame if needed if(!is.data.frame(comptab)) comptab <- do.call(rbind, comptab) # which columns we're using stats <- c("mean", "p.value") iX <- sapply(paste(vars[1], stats, sep="."), grep, colnames(comptab)) iY <- sapply(paste(vars[2], stats, sep="."), grep, colnames(comptab)) # get means for down- and up-expressed proteins X_d <- comptab[, iX[[1]][1]] X_u <- comptab[, iX[[1]][2]] X_p <- comptab[, iX[[2]]] Y_d <- comptab[, iY[[1]][1]] Y_u <- comptab[, iY[[1]][2]] Y_p <- comptab[, iY[[2]]] # set up plot x <- vars[1] y <- vars[2] if(vars[1]=="ZC") xlab <- expression(italic(Z)[C]) if(vars[2]=="nH2O") ylab <- expression(bar(italic(n))[H[2] * O]) # initialize plot with range of all points plot(type="n", c(X_d, X_u), c(Y_d, Y_u), xlab=xlab, ylab=ylab) # plot arrows arrows(X_d, Y_d, X_u, Y_u, length=0.1) # point symbols: open circle, filled circle, filled square (0, 1 or 2 vars with p-value < 0.05) p_signif <- rowSums(data.frame(X_p < 0.05, Y_p < 0.05)) pch <- ifelse(p_signif==2, 15, ifelse(p_signif==1, 19, 21)) # plot points with specified color col <- rep(col, length.out=nrow(comptab)) if(!plot.text) points(X_d, Y_d, pch=pch, col=col, bg="white") else { # plot bigger points with letters inside points(X_d, Y_d, pch=pch, col=col, bg="white", cex=2) # use white letters on colored background col[pch %in% c(15, 19)] <- "white" text(X_d, Y_d, c(letters, LETTERS)[seq_along(X_d)], col=col, cex=0.9) } } # compare QEC and CHNOS basis species on nO2-ZC and nH2O-ZC diagrams 20170503 basis_comp <- function(pdf=FALSE) { # amino acid compositions of all proteins in the UniProt human proteome aa <- human_base # protein formula and ZC protein.formula <- protein.formula(aa) ZC <- ZC(protein.formula) if(pdf) pdf("basis_comp.pdf", family=family, width=7, height=7) par(mfrow=c(2, 2)) par(mar=c(4, 4, 2.5, 1)) par(cex=1.1) par(mgp=c(2.5, 1, 0)) nH2Olab <- expression(bar(italic(n))[H[2] * O]) nO2lab <- expression(bar(italic(n))[O[2]]) ZClab <- expression(italic(Z)[C]) # composition projected into different sets of basis species for(basis in c("QEC", "CHNOS")) { basis(basis) protein.basis <- protein.basis(aa) protein.length <- protein.length(aa) residue.basis <- protein.basis/protein.length smoothScatter(ZC, residue.basis[, "O2"], xlab=ZClab, ylab=nO2lab) smoothScatter(ZC, residue.basis[, "H2O"], xlab=ZClab, ylab=nH2Olab) QEClab <- syslab(c("glutamine", "glutamic acid", "cysteine", "H2O", "O2"), dash="-") CHNOSlab <- syslab(c("CO2", "NH3", "H2S", "H2O", "O2"), dash="-") if(basis=="QEC") mtext(QEClab, outer = TRUE, cex = 1.2, line=-1.5) if(basis=="CHNOS") mtext(CHNOSlab, outer = TRUE, cex = 1.2, line=-17.5) } if(pdf) dev.off() } # compare plots for cancer proteomes 20160913 cpcp <- function(which=1, pdf=FALSE, basis="QEC+", arrows=FALSE) { # set up plot if(which==1) { if(arrows) file <- "arrows1.pdf" else file <- "cpcp1.pdf" groups <- c("CRC", "pancreatic") } else if(which==2) { if(arrows) file <- "arrows2.pdf" else file <- "cpcp2.pdf" groups <- c("hypoxia", "osmotic") } if(pdf) pdf(file, family=family, width=9, height=4) layout(matrix(c(1, 2), nrow=1, byrow=TRUE)) par(mar=c(3.5, 4, 2, 1.5)) par(mgp=c(2.5, 1, 0)) # loop over groups of experiments for(i in seq_along(groups)) { pdat_fun <- paste0("pdat_", groups[i]) datasets <- get(pdat_fun)() comptab <- lapply_canprot(datasets, function(dataset) { pdat <- get_pdat(dataset, pdat_fun, basis) ZC_nH2O(pdat, plot.it=FALSE) }, varlist=c(pdat_fun)) col <- rep("black", length(datasets)) # highlight chronic pancreatitis and low-grade tumor datasets col[grepl("=low", datasets)] <- "red" # highlight reoxygenation datasets col[grepl("ReOx", datasets)] <- "blue" # highlight spheroid datasets col[grepl("=SPH", datasets)] <- "red" # highlight adenoma / normal datasets col[grepl("=AD", datasets)] <- "red" # highlight adipose-derived stem cell datasets col[grepl("=ASC", datasets)] <- "orange" # make the plot if(arrows) arrowplot(comptab, col=col) else diffplot(comptab, col=col, plot.rect=TRUE) if(!arrows) { # add figure label label.figure(LETTERS[i], xfrac=0.04, family=family, cex=1.5, adj=0) # write summary table comptab <- do.call(rbind, comptab) comptab <- cbind(set=c(letters, LETTERS)[1:nrow(comptab)], comptab) comptab[, 6:15] <- signif(comptab[, 6:15], 4) write.csv(comptab, paste0("summary_", groups[i], ".csv"), row.names=FALSE, quote=3) } } if(pdf) { dev.off() # embed fonts - maybe not here to save space #embed_fonts(file, options="-dPDFSETTINGS=/prepress") } } # function to plot values with colors and labels plotit <- function(values, rankdat, add=FALSE) { col <- get_colors(values) with(rankdat, { if(!add) { image(xs, ys, values, col=col, useRaster=TRUE, xlab=xlab, ylab=ylab, axes=FALSE) if(diff(range(xs)) > 9) { axis(1, at=seq(-88, -40, by=12)) axis(2, at=seq(-24, 24, by=12), las=1) } else { axis(1, at=seq(-70, -62, by=2)) axis(2, at=seq(-6, 2, by=2), las=1) } box() } # show equal-rank line contour(xs, ys, values, levels=0, add=TRUE, drawlabels=FALSE, lwd=1.5, col="white") }) } # merging potential diagrams for pancreatic cancer 20161012 calcpot <- function(group="pancreatic", datasets=c("LHE+04", "PCS+11_PDAC"), basis="QEC+", res=50, plot.it=FALSE, each100=FALSE, xlim=c(-70.2, -61.8), ylim=c(-6.2, 2.2), m=0) { # get the data data(canprot) pdat_fun <- paste0("pdat_", group) rankdat <- lapply_canprot(datasets, function(dataset) { pdat <- get_pdat(dataset, pdat_fun, basis) rankplot(pdat, res=res, plot.it=FALSE, xlim=xlim, ylim=ylim) }, varlist=pdat_fun, min.length=5) rankdiff <- lapply(rankdat, "[[", "rankdiff") dat <- read.csv(paste0("summary_", group, ".csv"), as.is=TRUE) # function to scale all values by same factor such that maximum is 100 scale100 <- function(valuelist, each100=FALSE) { if(each100) lapply(valuelist, function(x) x * 100 / max(abs(range(x)))) else { totalrange <- range(valuelist) lapply(valuelist, function(x) x * 100 / max(abs(totalrange))) } } # scale and display the rank differences rankdiff <- scale100(rankdiff, each100) # calculate the percentages of total values rabs <- lapply(rankdiff, abs) rsum <- sapply(rabs, sum) rperc <- round(100 * rsum / sum(rsum), 1) if(plot.it) for(i in seq_along(rankdat)) { plotit(rankdiff[[i]], rankdat[[i]]) mtext(datasets[i], side=4, cex=0.85, las=0, adj=0, line=0.1) # show the dataset description and letter idat <- match(datasets[i], dat$dataset) label.figure(dat$set[idat], xfrac=0.25, cex=1.5) label.figure(paste0(" (", rperc[i], "%)"), adj=0, xfrac=0.25, cex=1.5) # put a circle around the letter opar <- par(xpd=NA) points(grconvertX(0.25, "nfc"), grconvertY(0.95, "nfc"), cex=3.5) par(opar) } if(m) { # merge the rank differences for all combinations of 'm' number of datasets combs <- combn(length(rankdat), m) comb <- x <- y <- numeric() print(paste("calcpot: calculating equipotential lines for", ncol(combs), "combinations")) for(i in 1:ncol(combs)) { merged <- Reduce("+", rankdiff[combs[, i]]) cl <- contourLines(rankdat[[1]]$xs, rankdat[[1]]$ys, merged, levels=0) if(length(cl) > 0) { # in case the line has multiple segments clx <- do.call(c, lapply(cl, "[[", 2)) cly <- do.call(c, lapply(cl, "[[", 3)) x <- c(x, clx) y <- c(y, cly) comb <- c(comb, rep(i, length(clx))) } } out <- data.frame(comb=comb, x=x, y=y) return(out) } else { # merge the rank differences for all datasets merged <- Reduce("+", rankdiff) / length(rankdat) merged <- scale100(list(merged)) return(invisible(list(merged=merged, rankdat=rankdat))) } } get_main <- function(what, n=NA) { group <- strsplit(what, "_")[[1]][1] metric <- strsplit(what, "_")[[1]][2] direction <- strsplit(what, "_")[[1]][3] if(direction=="up") than <- "> 0.01" else than <- "< -0.01" if(metric=="ZC") main <- substitute(Delta*italic(Z)[C]~than, list(than=than)) if(metric=="H2O") main <- substitute(Delta*italic(bar(n))[H[2]*O]~than, list(than=than)) if(!is.na(n)) { # make long titles to be placed on two lines if(group %in% c("hypoxia", "osmotic")) { if(group=="hypoxia") group <- "hypoxia or 3D culture" if(group=="osmotic") group <- "hyperosmotic stress" main <- c(group, substitute(main~~"("*x*")", list(group=group, main=main, x=n))) } else main <- substitute(group~~main~~"("*x*")", list(group=group, main=main, x=n)) } return(main) } getpot <- function(what="hypoxia_ZC_up", each100=FALSE, res=50, pdf=FALSE) { group <- strsplit(what, "_")[[1]][1] metric <- strsplit(what, "_")[[1]][2] direction <- strsplit(what, "_")[[1]][3] dat <- read.csv(paste0("summary_", group, ".csv"), as.is=TRUE) cnames <- colnames(dat) # the difference for this metric diff <- dat[, grepl(metric, cnames) & grepl("diff", cnames)] # the p-value and CLES for the other metric p.value <- dat[, !grepl(metric, cnames) & grepl("p.value", cnames)] CLES <- dat[, !grepl(metric, cnames) & grepl("CLES", cnames)] # the rows that have the difference in the specified direction # and the magnitude of the difference is at least 0.01 # and for which the other metric doesn't have a large or significant change if(direction=="up") idat <- diff > 0.01 & !(p.value < 0.05 | abs(signif(CLES, 2) - 50) >= 10) if(direction=="down") idat <- diff < -0.01 & !(p.value < 0.05 | abs(signif(CLES, 2) - 50) >= 10) if(pdf) { if(!"potentials" %in% dir()) stop("A directory named 'potentials' does not exist. Please create it.") # calculate potentials over large area and save figures and coordinates of equipotential lines # number of plots - 1 extra for title np <- sum(idat) + 1 if(np <= 28) mfrow <- c(4, 7) if(np <= 24) mfrow <- c(4, 6) if(np <= 20) mfrow <- c(4, 5) if(np <= 18) mfrow <- c(3, 6) if(np <= 16) mfrow <- c(4, 4) if(np <= 15) mfrow <- c(3, 5) if(np <= 12) mfrow <- c(3, 4) if(np <= 9) mfrow <- c(3, 3) if(np <= 8) mfrow <- c(2, 4) if(np <= 6) mfrow <- c(2, 3) if(np <= 4) mfrow <- c(2, 2) pdf(file <- paste0("potentials", "/", what, ".pdf"), family=family, width=2.5*mfrow[2], height=2.5*mfrow[1]) par(mfrow=mfrow) par(mar=c(3.5, 4, 2, 1.5)) par(mgp=c(2.5, 1, 0)) # first plot the title plot.new() opar <- par(xpd=TRUE) text(0.5, 0.8, group, cex=3) text(0.5, 0.6, get_main(what), cex=3) par(opar) # m=1 here to calculate lines for each dataset individually # expand x- and y-axes (48 log units) mr <- calcpot(group=group, datasets=dat$dataset[idat], res=res, plot.it=TRUE, each100=each100, xlim=c(-88, -40), ylim=c(-24, 24), m=1) write.csv(mr, paste0("potentials", "/", what, ".csv"), row.names=FALSE, quote=FALSE) dev.off() #embed_fonts(file, options="-dPDFSETTINGS=/prepress") } else { mr <- calcpot(group=group, datasets=dat$dataset[idat], res=res, each100=each100) return(invisible(mr)) } } comppot <- function(pdf=FALSE, each100=FALSE, res=50) { # what we are plotting - group_metric_direction cancer <- c("CRC_ZC_up","pancreatic_ZC_up", "CRC_H2O_up", "pancreatic_H2O_up") environment <- c("hypoxia_ZC_down", "osmotic_H2O_down") getpotfun <- function(what) getpot(what, each100=each100, res=res, pdf=!pdf) merged <- list() # ZC merged <- c(merged, lapply(cancer[1:2], getpotfun)) merged <- c(merged, lapply(environment[1], getpotfun)) # H2O merged <- c(merged, lapply(cancer[3:4], getpotfun)) merged <- c(merged, lapply(environment[2], getpotfun)) if(pdf) { pdf(file <- paste0("comppot", ".pdf"), family=family, width=6.75, height=4.64) par(mfrow=c(2, 3)) par(mar=c(3.4, 4, 2.6, 1.5)) par(mgp=c(2.5, 1, 0)) # names of all things we are plotting what <- c(cancer[1:2], environment[1], cancer[3:4], environment[2]) for(i in 1:length(merged)) { # the merged potential diagram plotit(merged[[i]][[1]][[1]], merged[[i]][[2]][[1]]) # median and interquartile range of the individual equipotential lines plot_median(what[i], add=TRUE, transpose=grepl("ZC",what[i])) # plot the merged equipotential line again (for emphasis) plotit(merged[[i]][[1]][[1]], merged[[i]][[2]][[1]], add=TRUE) # add figure label label.figure(LETTERS[i], xfrac=0.1, yfrac=0.91, family=family, cex=1.5) } dev.off() #embed_fonts(file, options="-dPDFSETTINGS=/prepress") } } # plot median and 1st and 3rd quartiles of equipotential lines 20161021 plot_median <- function(what, each100=FALSE, transpose=FALSE, add=FALSE, type=7) { # equipotential lines for individual datasets indiv <- read.csv(paste0("potentials", "/", what, ".csv"), as.is=TRUE) # show scatterplot of points on all lines if(!add) plot(indiv$x, indiv$y, pch=".") if(transpose) { ys <- seq(-24, 24, by=0.2) xs <- sapply(unique(indiv$comb), function(comb) { idat <- indiv$comb == comb # use loess to get smooth lines indlo <- loess(x~y, data=indiv[idat, ], span=0.2) spx <- predict(indlo, newdata=ys) # show loess lines if(!add) lines(spx, ys, col="gray") return(spx) }) # calculate and plot median xmedian <- apply(xs, 1, median, na.rm=TRUE) lines(xmedian, ys, lwd=2) # calculate and plot 1st and 3rd quartiles xq1 <- apply(xs, 1, quantile, probs=0.25, na.rm=TRUE, type=type) xq3 <- apply(xs, 1, quantile, probs=0.75, na.rm=TRUE, type=type) lines(xq1, ys, lty=2) lines(xq3, ys, lty=2) } else { xs <- seq(-88, -40, by=0.2) ys <- sapply(unique(indiv$comb), function(comb) { idat <- indiv$comb == comb # use loess to get smooth lines indlo <- loess(y~x, data=indiv[idat, ], span=0.2) spy <- predict(indlo, newdata=xs) # show loess lines if(!add) lines(xs, spy, col="gray") return(spy) }) # calculate and plot median ymedian <- apply(ys, 1, median, na.rm=TRUE) lines(xs, ymedian, lwd=1.5) # calculate and plot 1st and 3rd quartiles yq1 <- apply(ys, 1, quantile, probs=0.25, na.rm=TRUE, type=type) yq3 <- apply(ys, 1, quantile, probs=0.75, na.rm=TRUE, type=type) lines(xs, yq1, lty=2) lines(xs, yq3, lty=2) } } # estimate lipid:protein ratio in cancer 20161121 # a computer-aided "back of the envelope" calculation lp <- function() { # assumption: constant lipid formula (Wikipedia: triglyceride) lipid <- "C55H102O6" ZC_lipid <- ZC(lipid) # -1.564 # assumption: average protein formula (UniProt: human proteome) protein <- colMeans(protein.formula(human_base)) # C2683H4236N749O810S24 ZC_protein <- ZC(protein) # -0.119 # assumption: change of ZC in proteins in hypoxia and cancer (this study) DZC_hypoxia <- -0.03 DZC_cancer <- 0.03 ## assumption: normal lipid:protein dry weight ratio (MP15: E. coli macromolecular composition) ##LP_normal <- 9/55 ## assumption: lipid:protein dry weight ratio in hypoxia (GBB77: triglyceride increase by 240%) #LP_hypoxia <- 2 * LP_normal # assumption: normal and hypoxia lipid:protein dry weight ratio (GBB77: Table 2) LP_normal <- (115.4 + 15.0 + 12.3 + 9.6) / 1000 # 0.152 LP_hypoxia <- (114.7 + 16.4 + 41.9 + 18.6) / 1000 # 0.192 # assumption: lipid:protein carbon atom ratio (representative lipid and protein formulas) LP_mass2mol <- (makeup(lipid)["C"] / mass(lipid)) / (makeup(protein)["C"] / mass(protein)) LPC_hypoxia <- LP_hypoxia * LP_mass2mol LPC_normal <- LP_normal * LP_mass2mol # assumption: ZC in cell is sum of protein and lipid contributions ZC_cell <- function(LPC, DZC) ZC_lipid * LPC/(LPC+1) + (ZC_protein + DZC) * 1/(LPC+1) # calculation: ZC in hypoxic cells (lipid + protein) ZC_hypoxia <- ZC_cell(LPC_hypoxia, DZC_hypoxia) ZC_normal <- ZC_cell(LPC_normal, 0) # unknown: lipid:protein carbon atom ratio in cancer ZC_cancer <- function(LPC_cancer) ZC_cell(LPC_cancer, DZC_cancer) # assumption: ZC in cell is same in cancer and hypoxia DZC <- function(LPC_cancer) ZC_cancer(LPC_cancer) - ZC_hypoxia # calculation: lipid:protein carbon atom ratio in cancer LPC_cancer <- uniroot(DZC, c(0, 1))$root # calculation: lipid:protein dry weight ratio in cancer LP_cancer <- LPC_cancer / LP_mass2mol # calculation: lipid:protein dry weight ratio percent difference between hypoxia and cancer LP_diff <- 100*(LP_cancer - LP_hypoxia) / LP_hypoxia # display the results vars <- c("ZC_lipid", "protein", "ZC_protein", "LP_hypoxia", "LPC_hypoxia", "ZC_hypoxia", "LPC_cancer", "LP_cancer", "LP_diff") sapply(vars, function(x) { if(x=="protein") cat(x, as.chemical.formula(round(get(x))), "\n") else cat(x, round(get(x), 2), "\n") }) cat("\n") } # make summary tables 20161220 mksummary <- function(groups=c("CRC", "pancreatic", "hypoxia", "osmotic")) { for(i in seq_along(groups)) { dat <- read.csv(paste0("summary_", groups[i], ".csv"), as.is=TRUE) dat <- cbind(group=groups[i], dat) if(i==1) out <- dat else out <- rbind(out, dat) } write.csv(out, "summary.csv", row.names=FALSE, quote=4) }