# Project file for the paper
#   Chemical composition and the potential
#   for proteomic transformation in cancer,
#   hypoxia, and hyperosmotic stress
# Author: Jeffrey M. Dick
# First version: 20160410
# Last updated: 20170513

# Developed with R 3.2.4 and the following packages:
# CHNOSZ 1.1.0 (https://cran.r-project.org/package=CHNOSZ)
# canprot 0.0.5 (http://github.com/jedick/canprot)
# optional: extrafont 0.17 (https://cran.r-project.org/package=extrafont)

# Usage:
# 1. Place this file in the R working directory.
# 2. Create a subdirectory named 'potentials'.
# 3. To use the Linux Biolinum font, change the following to TRUE.
useLB <- FALSE
# 4. Source this file in R and run the following commands.
# basis_comp(pdf=TRUE) --> basis_comp.pdf        (Figure S1)
# cpcp(1, pdf=TRUE, arrows=TRUE) --> arrows1.pdf (Figure S2)
# cpcp(2, pdf=TRUE, arrows=TRUE) --> arrows2.pdf (Figure S2)
# cpcp(1, pdf=TRUE)    --> cpcp1.pdf             (Fig. 1)
# cpcp(2, pdf=TRUE)    --> cpcp2.pdf             (Fig. 2)
#   (the above also produce summary_*.csv files needed for the following calculations)
# mksummary()        --> summary.csv             (Table S1)
# comppot()          --> potentials/*.[csv|pdf]  (Figure S3)
# comppot(pdf=TRUE)  --> comppot.pdf             (Fig. 3)
# lp()               --> lipid:protein ratio     (Fig. 4)

# load required packages and data
library(CHNOSZ)
data(thermo)
library(canprot)
data(canprot)
if(useLB) {
  # http://blog.revolutionanalytics.com/2012/09/how-to-use-your-favorite-fonts-in-r-charts.html
  library(extrafont)
  family <- "Linux Biolinum"
} else {
  family <- "Helvetica"
}

# to plot compositions of down- and up-expressed proteins connected by arrows 20170311
arrowplot <- function(comptab, vars=c("ZC", "nH2O"), col="black", plot.text=TRUE) {
  # convert to data frame if needed
  if(!is.data.frame(comptab)) comptab <- do.call(rbind, comptab)
  # which columns we're using
  stats <- c("mean", "p.value")
  iX <- sapply(paste(vars[1], stats, sep="."), grep, colnames(comptab))
  iY <- sapply(paste(vars[2], stats, sep="."), grep, colnames(comptab))
  # get means for down- and up-expressed proteins
  X_d <- comptab[, iX[[1]][1]]
  X_u <- comptab[, iX[[1]][2]]
  X_p <- comptab[, iX[[2]]]
  Y_d <- comptab[, iY[[1]][1]]
  Y_u <- comptab[, iY[[1]][2]]
  Y_p <- comptab[, iY[[2]]]
  # set up plot
  x <- vars[1]
  y <- vars[2]
  if(vars[1]=="ZC") xlab <- expression(italic(Z)[C])
  if(vars[2]=="nH2O") ylab <- expression(bar(italic(n))[H[2] * O])
  # initialize plot with range of all points
  plot(type="n", c(X_d, X_u), c(Y_d, Y_u), xlab=xlab, ylab=ylab)
  # plot arrows
  arrows(X_d, Y_d, X_u, Y_u, length=0.1)
  # point symbols: open circle, filled circle, filled square (0, 1 or 2 vars with p-value < 0.05)
  p_signif <- rowSums(data.frame(X_p < 0.05, Y_p < 0.05))
  pch <- ifelse(p_signif==2, 15, ifelse(p_signif==1, 19, 21))
  # plot points with specified color
  col <- rep(col, length.out=nrow(comptab))
  if(!plot.text) points(X_d, Y_d, pch=pch, col=col, bg="white") else {
    # plot bigger points with letters inside
    points(X_d, Y_d, pch=pch, col=col, bg="white", cex=2)
    # use white letters on colored background
    col[pch %in% c(15, 19)] <- "white"
    text(X_d, Y_d, c(letters, LETTERS)[seq_along(X_d)], col=col, cex=0.9)
  }
}

# compare QEC and CHNOS basis species on nO2-ZC and nH2O-ZC diagrams 20170503
basis_comp <- function(pdf=FALSE) {
  # amino acid compositions of all proteins in the UniProt human proteome
  aa <- human_base
  # protein formula and ZC
  protein.formula <- protein.formula(aa)
  ZC <- ZC(protein.formula)
  if(pdf) pdf("basis_comp.pdf", family=family, width=7, height=7)
  par(mfrow=c(2, 2))
  par(mar=c(4, 4, 2.5, 1))
  par(cex=1.1)
  par(mgp=c(2.5, 1, 0))
  nH2Olab <- expression(bar(italic(n))[H[2] * O])
  nO2lab <- expression(bar(italic(n))[O[2]])
  ZClab <- expression(italic(Z)[C])
  # composition projected into different sets of basis species
  for(basis in c("QEC", "CHNOS")) {
    basis(basis)
    protein.basis <- protein.basis(aa)
    protein.length <- protein.length(aa)
    residue.basis <- protein.basis/protein.length
    smoothScatter(ZC, residue.basis[, "O2"], xlab=ZClab, ylab=nO2lab)
    smoothScatter(ZC, residue.basis[, "H2O"], xlab=ZClab, ylab=nH2Olab)
    QEClab <- syslab(c("glutamine", "glutamic acid", "cysteine", "H2O", "O2"), dash="-")
    CHNOSlab <- syslab(c("CO2", "NH3", "H2S", "H2O", "O2"), dash="-")
    if(basis=="QEC") mtext(QEClab, outer = TRUE, cex = 1.2, line=-1.5)
    if(basis=="CHNOS") mtext(CHNOSlab, outer = TRUE, cex = 1.2, line=-17.5)
  }
  if(pdf) dev.off()
}

# compare plots for cancer proteomes 20160913
cpcp <- function(which=1, pdf=FALSE, basis="QEC+", arrows=FALSE) {
  # set up plot
  if(which==1) {
    if(arrows) file <- "arrows1.pdf" else file <- "cpcp1.pdf"
    groups <- c("CRC", "pancreatic")
  } else if(which==2) {
    if(arrows) file <- "arrows2.pdf" else file <- "cpcp2.pdf"
    groups <- c("hypoxia", "osmotic")
  }
  if(pdf) pdf(file, family=family, width=9, height=4)
  layout(matrix(c(1, 2), nrow=1, byrow=TRUE))
  par(mar=c(3.5, 4, 2, 1.5))
  par(mgp=c(2.5, 1, 0))
  # loop over groups of experiments
  for(i in seq_along(groups)) {
    pdat_fun <- paste0("pdat_", groups[i])
    datasets <- get(pdat_fun)()
    comptab <- lapply_canprot(datasets, function(dataset) {
      pdat <- get_pdat(dataset, pdat_fun, basis)
      ZC_nH2O(pdat, plot.it=FALSE)
    }, varlist=c(pdat_fun))
    col <- rep("black", length(datasets))
    # highlight chronic pancreatitis and low-grade tumor datasets
    col[grepl("=low", datasets)] <- "red"
    # highlight reoxygenation datasets
    col[grepl("ReOx", datasets)] <- "blue"
    # highlight spheroid datasets
    col[grepl("=SPH", datasets)] <- "red"
    # highlight adenoma / normal datasets
    col[grepl("=AD", datasets)] <- "red"
    # highlight adipose-derived stem cell datasets
    col[grepl("=ASC", datasets)] <- "orange"
    # make the plot
    if(arrows) arrowplot(comptab, col=col)
    else diffplot(comptab, col=col, plot.rect=TRUE)
    if(!arrows) {
      # add figure label
      label.figure(LETTERS[i], xfrac=0.04, family=family, cex=1.5, adj=0)
      # write summary table
      comptab <- do.call(rbind, comptab)
      comptab <- cbind(set=c(letters, LETTERS)[1:nrow(comptab)], comptab)
      comptab[, 6:15] <- signif(comptab[, 6:15], 4)
      write.csv(comptab, paste0("summary_", groups[i], ".csv"), row.names=FALSE, quote=3)
    }
  }
  if(pdf) {
    dev.off()
    # embed fonts - maybe not here to save space
    #embed_fonts(file, options="-dPDFSETTINGS=/prepress")
  }
}

# function to plot values with colors and labels
plotit <- function(values, rankdat, add=FALSE) {
  col <- get_colors(values)
  with(rankdat, {
    if(!add) {
      image(xs, ys, values, col=col, useRaster=TRUE, xlab=xlab, ylab=ylab, axes=FALSE)
      if(diff(range(xs)) > 9) {
        axis(1, at=seq(-88, -40, by=12))
        axis(2, at=seq(-24, 24, by=12), las=1)
      } else {
        axis(1, at=seq(-70, -62, by=2))
        axis(2, at=seq(-6, 2, by=2), las=1)
      }
      box()
    }
    # show equal-rank line
    contour(xs, ys, values, levels=0, add=TRUE, drawlabels=FALSE, lwd=1.5, col="white")
  })
}

# merging potential diagrams for pancreatic cancer 20161012
calcpot <- function(group="pancreatic", datasets=c("LHE+04", "PCS+11_PDAC"),
                            basis="QEC+", res=50, plot.it=FALSE,
                            each100=FALSE, xlim=c(-70.2, -61.8), ylim=c(-6.2, 2.2), m=0) {
  # get the data
  data(canprot)
  pdat_fun <- paste0("pdat_", group)
  rankdat <- lapply_canprot(datasets, function(dataset) {
    pdat <- get_pdat(dataset, pdat_fun, basis)
    rankplot(pdat, res=res, plot.it=FALSE, xlim=xlim, ylim=ylim)
  }, varlist=pdat_fun, min.length=5)
  rankdiff <- lapply(rankdat, "[[", "rankdiff")
  dat <- read.csv(paste0("summary_", group, ".csv"), as.is=TRUE)
  # function to scale all values by same factor such that maximum is 100
  scale100 <- function(valuelist, each100=FALSE) {
    if(each100) lapply(valuelist, function(x) x * 100 / max(abs(range(x))))
    else {
      totalrange <- range(valuelist)
      lapply(valuelist, function(x) x * 100 / max(abs(totalrange)))
    }
  }
  # scale and display the rank differences
  rankdiff <- scale100(rankdiff, each100)
  # calculate the percentages of total values
  rabs <- lapply(rankdiff, abs)
  rsum <- sapply(rabs, sum)
  rperc <- round(100 * rsum / sum(rsum), 1)
  if(plot.it) for(i in seq_along(rankdat)) {
    plotit(rankdiff[[i]], rankdat[[i]])
    mtext(datasets[i], side=4, cex=0.85, las=0, adj=0, line=0.1)
    # show the dataset description and letter
    idat <- match(datasets[i], dat$dataset)
    label.figure(dat$set[idat], xfrac=0.25, cex=1.5)
    label.figure(paste0("     (", rperc[i], "%)"), adj=0, xfrac=0.25, cex=1.5)
    # put a circle around the letter
    opar <- par(xpd=NA)
    points(grconvertX(0.25, "nfc"), grconvertY(0.95, "nfc"), cex=3.5)
    par(opar)
  }
  if(m) {
    # merge the rank differences for all combinations of 'm' number of datasets
    combs <- combn(length(rankdat), m)
    comb <- x <- y <- numeric()
    print(paste("calcpot: calculating equipotential lines for", ncol(combs), "combinations"))
    for(i in 1:ncol(combs)) {
      merged <- Reduce("+", rankdiff[combs[, i]])
      cl <- contourLines(rankdat[[1]]$xs, rankdat[[1]]$ys, merged, levels=0)
      if(length(cl) > 0) {
        # in case the line has multiple segments
        clx <- do.call(c, lapply(cl, "[[", 2))
        cly <- do.call(c, lapply(cl, "[[", 3))
        x <- c(x, clx)
        y <- c(y, cly)
        comb <- c(comb, rep(i, length(clx)))
      }
    }
    out <- data.frame(comb=comb, x=x, y=y)
    return(out)
  } else {
    # merge the rank differences for all datasets
    merged <- Reduce("+", rankdiff) / length(rankdat)
    merged <- scale100(list(merged))
    return(invisible(list(merged=merged, rankdat=rankdat)))
  }
}

get_main <- function(what, n=NA) {
  group <- strsplit(what, "_")[[1]][1]
  metric <- strsplit(what, "_")[[1]][2]
  direction <- strsplit(what, "_")[[1]][3]
  if(direction=="up") than <- "> 0.01" else than <- "< -0.01"
  if(metric=="ZC") main <- substitute(Delta*italic(Z)[C]~than, list(than=than))
  if(metric=="H2O") main <- substitute(Delta*italic(bar(n))[H[2]*O]~than, list(than=than))
  if(!is.na(n)) {
    # make long titles to be placed on two lines
    if(group %in% c("hypoxia", "osmotic")) {
      if(group=="hypoxia") group <- "hypoxia or 3D culture"
      if(group=="osmotic") group <- "hyperosmotic stress"
      main <- c(group, substitute(main~~"("*x*")", list(group=group, main=main, x=n)))
    } else main <- substitute(group~~main~~"("*x*")", list(group=group, main=main, x=n))
  }
  return(main)
}

getpot <- function(what="hypoxia_ZC_up", each100=FALSE, res=50, pdf=FALSE) {
  group <- strsplit(what, "_")[[1]][1]
  metric <- strsplit(what, "_")[[1]][2]
  direction <- strsplit(what, "_")[[1]][3]
  dat <- read.csv(paste0("summary_", group, ".csv"), as.is=TRUE)
  cnames <- colnames(dat)
  # the difference for this metric
  diff <- dat[, grepl(metric, cnames) & grepl("diff", cnames)]
  # the p-value and CLES for the other metric
  p.value <- dat[, !grepl(metric, cnames) & grepl("p.value", cnames)]
  CLES <- dat[, !grepl(metric, cnames) & grepl("CLES", cnames)]
  # the rows that have the difference in the specified direction
  # and the magnitude of the difference is at least 0.01
  # and for which the other metric doesn't have a large or significant change
  if(direction=="up") idat <- diff > 0.01 & !(p.value < 0.05 | abs(signif(CLES, 2) - 50) >= 10)
  if(direction=="down") idat <- diff < -0.01 & !(p.value < 0.05 | abs(signif(CLES, 2) - 50) >= 10)
  if(pdf) {
    if(!"potentials" %in% dir()) stop("A directory named 'potentials' does not exist. Please create it.")
    # calculate potentials over large area and save figures and coordinates of equipotential lines
    # number of plots - 1 extra for title
    np <- sum(idat) + 1
    if(np <= 28) mfrow <- c(4, 7)
    if(np <= 24) mfrow <- c(4, 6)
    if(np <= 20) mfrow <- c(4, 5)
    if(np <= 18) mfrow <- c(3, 6)
    if(np <= 16) mfrow <- c(4, 4)
    if(np <= 15) mfrow <- c(3, 5)
    if(np <= 12) mfrow <- c(3, 4)
    if(np <= 9) mfrow <- c(3, 3)
    if(np <= 8) mfrow <- c(2, 4)
    if(np <= 6) mfrow <- c(2, 3)
    if(np <= 4) mfrow <- c(2, 2)
    pdf(file <- paste0("potentials", "/", what, ".pdf"), family=family, width=2.5*mfrow[2], height=2.5*mfrow[1])
    par(mfrow=mfrow)
    par(mar=c(3.5, 4, 2, 1.5))
    par(mgp=c(2.5, 1, 0))
    # first plot the title
    plot.new()
    opar <- par(xpd=TRUE)
    text(0.5, 0.8, group, cex=3)
    text(0.5, 0.6, get_main(what), cex=3)
    par(opar)
    # m=1 here to calculate lines for each dataset individually
    # expand x- and y-axes (48 log units)
    mr <- calcpot(group=group, datasets=dat$dataset[idat], res=res, plot.it=TRUE, each100=each100, xlim=c(-88, -40), ylim=c(-24, 24), m=1)
    write.csv(mr, paste0("potentials", "/", what, ".csv"), row.names=FALSE, quote=FALSE)
    dev.off()
    #embed_fonts(file, options="-dPDFSETTINGS=/prepress")
  } else {
    mr <- calcpot(group=group, datasets=dat$dataset[idat], res=res, each100=each100)
    return(invisible(mr))
  }
}

comppot <- function(pdf=FALSE, each100=FALSE, res=50) {
  # what we are plotting - group_metric_direction
  cancer <- c("CRC_ZC_up","pancreatic_ZC_up", "CRC_H2O_up", "pancreatic_H2O_up")
  environment <- c("hypoxia_ZC_down", "osmotic_H2O_down")
  getpotfun <- function(what) getpot(what, each100=each100, res=res, pdf=!pdf)
  merged <- list()
  # ZC
  merged <- c(merged, lapply(cancer[1:2], getpotfun))
  merged <- c(merged, lapply(environment[1], getpotfun))
  # H2O
  merged <- c(merged, lapply(cancer[3:4], getpotfun))
  merged <- c(merged, lapply(environment[2], getpotfun))
  if(pdf) {
    pdf(file <- paste0("comppot", ".pdf"), family=family, width=6.75, height=4.64)
    par(mfrow=c(2, 3))
    par(mar=c(3.4, 4, 2.6, 1.5))
    par(mgp=c(2.5, 1, 0))
    # names of all things we are plotting
    what <- c(cancer[1:2], environment[1], cancer[3:4], environment[2])
    for(i in 1:length(merged)) {
      # the merged potential diagram
      plotit(merged[[i]][[1]][[1]], merged[[i]][[2]][[1]])
      # median and interquartile range of the individual equipotential lines
      plot_median(what[i], add=TRUE, transpose=grepl("ZC",what[i]))
      # plot the merged equipotential line again (for emphasis)
      plotit(merged[[i]][[1]][[1]], merged[[i]][[2]][[1]], add=TRUE)
      # add figure label
      label.figure(LETTERS[i], xfrac=0.1, yfrac=0.91, family=family, cex=1.5)
    }
    dev.off()
    #embed_fonts(file, options="-dPDFSETTINGS=/prepress")
  }
}

# plot median and 1st and 3rd quartiles of equipotential lines 20161021
plot_median <- function(what, each100=FALSE, transpose=FALSE, add=FALSE, type=7) {
  # equipotential lines for individual datasets
  indiv <- read.csv(paste0("potentials", "/", what, ".csv"), as.is=TRUE)
  # show scatterplot of points on all lines
  if(!add) plot(indiv$x, indiv$y, pch=".")
  if(transpose) {
    ys <- seq(-24, 24, by=0.2)
    xs <- sapply(unique(indiv$comb), function(comb) {
      idat <- indiv$comb == comb
      # use loess to get smooth lines
      indlo <- loess(x~y, data=indiv[idat, ], span=0.2)
      spx <- predict(indlo, newdata=ys)
      # show loess lines
      if(!add) lines(spx, ys, col="gray")
      return(spx)
    })
    # calculate and plot median
    xmedian <- apply(xs, 1, median, na.rm=TRUE)
    lines(xmedian, ys, lwd=2)
    # calculate and plot 1st and 3rd quartiles
    xq1 <- apply(xs, 1, quantile, probs=0.25, na.rm=TRUE, type=type)
    xq3 <- apply(xs, 1, quantile, probs=0.75, na.rm=TRUE, type=type)
    lines(xq1, ys, lty=2)
    lines(xq3, ys, lty=2)
  } else {
    xs <- seq(-88, -40, by=0.2)
    ys <- sapply(unique(indiv$comb), function(comb) {
      idat <- indiv$comb == comb
      # use loess to get smooth lines
      indlo <- loess(y~x, data=indiv[idat, ], span=0.2)
      spy <- predict(indlo, newdata=xs)
      # show loess lines
      if(!add) lines(xs, spy, col="gray")
      return(spy)
    })
    # calculate and plot median
    ymedian <- apply(ys, 1, median, na.rm=TRUE)
    lines(xs, ymedian, lwd=1.5)
    # calculate and plot 1st and 3rd quartiles
    yq1 <- apply(ys, 1, quantile, probs=0.25, na.rm=TRUE, type=type)
    yq3 <- apply(ys, 1, quantile, probs=0.75, na.rm=TRUE, type=type)
    lines(xs, yq1, lty=2)
    lines(xs, yq3, lty=2)
  }
}

# estimate lipid:protein ratio in cancer 20161121
# a computer-aided "back of the envelope" calculation
lp <- function() {
  # assumption: constant lipid formula (Wikipedia: triglyceride)
  lipid <- "C55H102O6"
  ZC_lipid <- ZC(lipid)      # -1.564
  # assumption: average protein formula (UniProt: human proteome)
  protein <- colMeans(protein.formula(human_base))
                             # C2683H4236N749O810S24 
  ZC_protein <- ZC(protein)  # -0.119
  # assumption: change of ZC in proteins in hypoxia and cancer (this study)
  DZC_hypoxia <- -0.03
  DZC_cancer <- 0.03
  ## assumption: normal lipid:protein dry weight ratio (MP15: E. coli macromolecular composition)
  ##LP_normal <- 9/55
  ## assumption: lipid:protein dry weight ratio in hypoxia (GBB77: triglyceride increase by 240%)
  #LP_hypoxia <- 2 * LP_normal
  # assumption: normal and hypoxia lipid:protein dry weight ratio (GBB77: Table 2)
  LP_normal <- (115.4 + 15.0 + 12.3 +  9.6) / 1000    # 0.152
  LP_hypoxia <- (114.7 + 16.4 + 41.9 + 18.6) / 1000   # 0.192
  # assumption: lipid:protein carbon atom ratio (representative lipid and protein formulas)
  LP_mass2mol <- (makeup(lipid)["C"] / mass(lipid)) / (makeup(protein)["C"] / mass(protein))
  LPC_hypoxia <- LP_hypoxia * LP_mass2mol
  LPC_normal <- LP_normal * LP_mass2mol
  # assumption: ZC in cell is sum of protein and lipid contributions
  ZC_cell <- function(LPC, DZC) ZC_lipid * LPC/(LPC+1) + (ZC_protein + DZC) * 1/(LPC+1)
  # calculation: ZC in hypoxic cells (lipid + protein)
  ZC_hypoxia <- ZC_cell(LPC_hypoxia, DZC_hypoxia)
  ZC_normal <- ZC_cell(LPC_normal, 0)
  # unknown: lipid:protein carbon atom ratio in cancer
  ZC_cancer <- function(LPC_cancer) ZC_cell(LPC_cancer, DZC_cancer)
  # assumption: ZC in cell is same in cancer and hypoxia
  DZC <- function(LPC_cancer) ZC_cancer(LPC_cancer) - ZC_hypoxia
  # calculation: lipid:protein carbon atom ratio in cancer
  LPC_cancer <- uniroot(DZC, c(0, 1))$root
  # calculation: lipid:protein dry weight ratio in cancer
  LP_cancer <- LPC_cancer / LP_mass2mol
  # calculation: lipid:protein dry weight ratio percent difference between hypoxia and cancer
  LP_diff <- 100*(LP_cancer - LP_hypoxia) / LP_hypoxia
  # display the results
  vars <- c("ZC_lipid", "protein", "ZC_protein", "LP_hypoxia", "LPC_hypoxia", "ZC_hypoxia", "LPC_cancer", "LP_cancer", "LP_diff")
  sapply(vars, function(x) {
    if(x=="protein") cat(x, as.chemical.formula(round(get(x))), "\n")
    else cat(x, round(get(x), 2), "\n")
  })
  cat("\n")
}

# make summary tables 20161220
mksummary <- function(groups=c("CRC", "pancreatic", "hypoxia", "osmotic")) {
  for(i in seq_along(groups)) {
    dat <- read.csv(paste0("summary_", groups[i], ".csv"), as.is=TRUE)
    dat <- cbind(group=groups[i], dat)
    if(i==1) out <- dat else out <- rbind(out, dat)
  }
  write.csv(out, "summary.csv", row.names=FALSE, quote=4)
}