Data Import

suppressPackageStartupMessages({
  require(ape)
  require(DT)
  require(ggplot2)
  require(plyr)
  require(reshape2)
  require(rjson)
  require(slam)
  require(vegan)
})
## Warning: package 'ape' was built under R version 3.3.2
## Warning: package 'ggplot2' was built under R version 3.3.2
## Warning: package 'slam' was built under R version 3.3.2
## Warning: package 'vegan' was built under R version 3.3.2
metadata   <- read.delim("MetaData_Ecig_Final.txt")
json       <- rjson::fromJSON(file="Data_Ecig_Final.biom")
WeightedDM <- as.matrix(read.csv("Weighted_UniFrac_Ecig.csv", row.names = 1))

TaxaIDs   <- sapply(json$rows,    function (x) x$id)
SampleIDs <- sapply(json$columns, function (x) x$id)

# Parse the BIOM file (sparse abundance matrix encoded in JSON)
otus <- slam::simple_triplet_matrix(
          i        = sapply(json$data, function (x) x[[1]]) + 1,
          j        = sapply(json$data, function (x) x[[2]]) + 1,
          v        = sapply(json$data, function (x) x[[3]]),
          nrow     = length(TaxaIDs),
          ncol     = length(SampleIDs),
          dimnames = list(TaxaIDs, SampleIDs))

rownames(metadata)          <- metadata[['Sample']]
metadata                    <- metadata[colnames(otus),]
metadata[['Observed OTUs']] <- colapply_simple_triplet_matrix(otus, function (x) { sum(x>0) })
metadata[['Shannon']]       <- colapply_simple_triplet_matrix(otus, function (x) {
                                 x <- x[x>0]; -1 * sum((x / sum(x)) * log(x / sum(x))) })

# Roll up OTU abundances by genus and convert to relative abundance (%)
otu2genus <- setNames(
  nm     = sapply(json$rows, function (x) { x$id }),
  object = sapply(json$rows, function (x) {
    paste(collapse = "; ", sub("^_+", "", x$metadata$taxonomy))
  })
)
otu2genus <- factor(unname(otu2genus[rownames(otus)]))
genera    <- slam::rollup(otus, 1L, otu2genus, sum)
genera$v  <- genera$v / col_sums(genera)[genera$j]


remove("otus", "otu2genus", "json", "TaxaIDs", "SampleIDs")

Figure 1

long <- melt(
  data          = metadata[,c("SampleType", "Group", "Observed OTUs", "Shannon")], 
  id.vars       = c("SampleType", "Group"), 
  variable.name = "Metric", 
  value.name    = "Diversity" )

for (i in levels(long[['SampleType']])) {
  df <- subset(long, SampleType==i)
  
  gg <- ggplot(df, aes(x=Group, y=Diversity, color=Group, fill=Group)) +
    geom_boxplot(position=position_dodge(.79), outlier.shape=NA, alpha=.3, size=1) +
    geom_point(position=position_dodge(.79), alpha=.5, size=1.8) +
    facet_wrap('Metric', scales = "free_y") + ylim(0, NA) + 
    scale_fill_manual( values = c("Con"="#e69f00", "EC"="#56b4e9", "TS"="#009e73")) + 
    scale_color_manual(values = c("Con"="#e69f00", "EC"="#56b4e9", "TS"="#009e73")) + 
    labs(x=NULL, y=i) + 
    theme(text = element_text(size=20), legend.position = 'none')
  print(gg)
}

stats <- ddply(long, c('SampleType', 'Metric'), function (x) {
  
  pVals <- sapply(
    X   = list(c("Con", "EC"), c("Con", "TS"), c("EC", "TS")), 
    FUN = function (y) {
      suppressWarnings({
        wilcox.test(
          x = x[x[['Group']] == y[1], 'Diversity'], 
          y = x[x[['Group']] == y[2], 'Diversity']
        )$p.value
      })
  })
  
  pVals <- c(pVals, kruskal.test(x[['Diversity']] ~ x[['Group']])$p.value)
  
  data.frame(
    'Comparison' = c("Con vs. EC", "Con vs. TS", "EC vs. TS", "All vs. All"), 
    'Statistic'  = c(rep("Mann-Whitney", 3), "Kruskal-Wallis"),
    'P-Value'    = signif(pVals, 3) )
})
datatable(stats, rownames = FALSE, class = "display compact")
remove("df", "long", "i", "gg", "stats")

Figure 2

for (i in levels(metadata[['SampleType']])) {
  
  ids <- rownames(subset(metadata, SampleType == i))
  dm  <- as.dist(WeightedDM[ids, ids])
  res <- ape::pcoa(dm)
  ord <- data.frame(res$vectors[,1:2], Group=metadata[ids,'Group'])
  eig <- signif(res$values$Relative_eig[1:2] * 100, 3)
  
  df <- ddply(ord, 'Group', function (x) {
    x$cx <- mean(x[['Axis.1']])
    x$cy <- mean(x[['Axis.2']])
    return (x)
  })
  
  gg <- ggplot(df) +
    geom_point(aes(x=Axis.1, y=Axis.2, color=Group, fill=Group), size=2) +
    geom_segment(aes(x=Axis.1, y=Axis.2, color=Group, xend=cx, yend=cy), size=.75, alpha=0.4) +
    scale_color_manual(values = c("Con"="#e69f00", "EC"="#56b4e9", "TS"="#009e73")) +
    xlab(paste0("PC1 (", eig[1], "% variation explained)")) +
    ylab(paste0("PC2 (", eig[2], "% variation explained)")) +
    ggtitle(paste0(i, ", Weighted UniFrac PCoA")) + 
    theme(
      panel.border     = element_rect(color="black", fill=F, size=1),
      panel.grid.major = element_blank(), 
      panel.grid.minor = element_blank(),
      panel.background = element_rect(fill="white"),
      text             = element_text(size=14),
      plot.title       = element_text(size=16) )
  print(gg)
}

stats <- ddply(metadata, 'SampleType', function (x) {
  
  pVals <- sapply(
    X = list(c("Con", "EC"), c("Con", "TS"), c("EC", "TS"), c("Con", "EC", "TS")), 
    FUN = function (y) {
      
      ids <- as.character(subset(x, Group %in% y)[['Sample']])
      mtx <- WeightedDM[ids, ids]
      
      perms <- t(replicate(999, sample(1:nrow(mtx), nrow(mtx))))
      obj   <- vegan::adonis(mtx ~ metadata[rownames(mtx),'Group'], permutations=perms)
      
      return (obj$aov.tab[['Pr(>F)']][[1]])
  })
  
  data.frame(
    'Compairson' = c("Con vs. EC", "Con vs. TS", "EC vs. TS", "All vs. All"), 
    'Statistic'  = "Permanova",
    'P-Value'    = pVals )
})

datatable(stats, rownames = FALSE, class = "display compact")
remove("df", "gg", "ord", "res", "stats", "dm", "eig", "i", "ids")

Figure 3

# Limit to only the Feces samples
md <- subset(metadata, SampleType == "Feces")
gn <- genera[,rownames(md)]

# Exclude taxa with less than 1% avg abundance
gn <- as.matrix(gn)[row_means(gn) >= .01,]
rownames(gn) <- sub("^.*; ", "", rownames(gn))

long <- melt(gn, varnames=c('Genus', 'Sample'), value.name="Abundance")
long[['Group']] <- md[as.character(long[['Sample']]), 'Group']

# How significant are the differences in abundance?
stats <- ddply(long, 'Genus', function (x) {
  data.frame(P.Value = signif(kruskal.test(x[['Abundance']] ~ x[['Group']])$p.value, 3))
})
stats[['FDR.Adj']] <- signif(p.adjust(stats[['P.Value']], method = 'fdr'), 3)

# Arrange Genera on plot from most to least significant
long[['Genus']]    <- factor(
  x      = as.character(long[['Genus']]), 
  levels = stats[['Genus']][order(stats[['P.Value']])] )

ggplot(long, aes(x=Genus, y=Abundance, color=Group, fill=Group)) +
  geom_boxplot(position=position_dodge(.79), outlier.shape=NA, alpha=.3) +
  geom_point(position=position_dodge(.79), alpha=.5) +
  scale_fill_manual( values = c("Con"="#e69f00", "EC"="#56b4e9", "TS"="#009e73")) +
  scale_color_manual(values = c("Con"="#e69f00", "EC"="#56b4e9", "TS"="#009e73")) +
  scale_y_sqrt(labels=scales::percent, limits=c(0,1), breaks=c(.04, .16, .36, .64, 1)) + 
  theme(
    text        = element_text(size=18),
    plot.title  = element_text(size=18),
    axis.text.x = element_text(angle = -30, vjust = 1, hjust = 0, size = 12) ) +
  labs(title="Genera in Feces Samples with at least 1% Mean Abundance", x=NULL)

datatable(stats, rownames = FALSE, class = "display compact")
remove("gn", "long", "md", "stats")

Supp Figure 1

S1 - A

dm  <- as.dist(WeightedDM)
res <- ape::pcoa(dm)
ord <- data.frame(res$vectors[,1:2], SampleType=metadata[rownames(res$vectors), 'SampleType'])
eig <- signif(res$values$Relative_eig[1:2] * 100, 3)

df <- ddply(ord, 'SampleType', function (x) {
  x$cx <- mean(x[['Axis.1']])
  x$cy <- mean(x[['Axis.2']])
  return (x)
})

ggplot(df) +
  geom_point(aes(x=Axis.1, y=Axis.2, color=SampleType)) +
  geom_segment(aes(x=Axis.1, y=Axis.2, color=SampleType, xend=cx, yend=cy), alpha=0.5) +
  xlab(paste0("PC1 (", eig[1], "% variation explained)")) +
  ylab(paste0("PC2 (", eig[2], "% variation explained)")) +
  ggtitle("Weighted UniFrac PCoA") + 
  theme(
    panel.border     = element_rect(color="black", fill=F, size=1),
    panel.grid.major = element_blank(), 
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill="white"),
    text             = element_text(size=14),
    plot.title       = element_text(size=16) )

remove("df", "ord", "res", "dm", "eig")

S1 - B

long <- melt(
  data          = metadata[,c('SampleType', 'Observed OTUs', 'Shannon')], 
  id.vars       = "SampleType", 
  variable.name = "Metric", 
  value.name    = "Diversity" )

levels(long[['SampleType']]) <- sub("Buccal", "Buccal\n", levels(long[['SampleType']]))

ggplot(long, aes(x=SampleType, y=Diversity, color=SampleType, fill=SampleType)) +
  geom_boxplot(size=1, outlier.shape=NA, alpha=.3) +
  geom_point(alpha=.5) +
  facet_wrap('Metric', scales = "free_y") + 
  ylim(0, NA) + 
  xlab(NULL) + 
  theme(text = element_text(size=20), legend.position = 'none' )

remove("long")

S1 - C

# Exclude taxa with less than 1% avg abundance
gn <- as.matrix(genera)[row_means(genera) >= .01,]
rownames(gn) <- sub("^.*; ", "", rownames(gn))

long <- melt(as.matrix(gn), varnames=c('Genus', 'Sample'), value.name="Abundance")
long[['SampleType']] <- metadata[as.character(long[['Sample']]), 'SampleType']

# How significant are the differences in abundance?
stats <- ddply(long, 'Genus', function (x) {
  data.frame(
    P.Value = signif(kruskal.test(x[['Abundance']] ~ x[['SampleType']])$p.value, 3),
    Feces   = signif(mean(subset(x, SampleType == "Feces")[['Abundance']]), 3),
    Saliva  = signif(mean(subset(x, SampleType == "Saliva")[['Abundance']]), 3),
    Swab    = signif(mean(subset(x, SampleType == "Buccal Swab")[['Abundance']]), 3))
})
stats[['FDR.Adj']] <- p.adjust(stats[['P.Value']], method = 'fdr')
stats <- stats[order(stats[['P.Value']]),]

# Limit to top 20, order from most to least significant
top20 <- as.character(head(stats[['Genus']], 20))
long  <- long[long[['Genus']] %in% top20,]
long[['Genus']] <- sub("^.+; ([A-Z])", "\\1", as.character(long[['Genus']]))
long[['Genus']] <- factor(long[['Genus']], sub("^.+; ([A-Z])", "\\1", top20))

ggplot(long, aes(x=Genus, y=Abundance, color=SampleType, fill=SampleType)) +
  geom_boxplot(position=position_dodge(.79), outlier.shape=NA, alpha=.3) +
  geom_point(position=position_dodge(.79), alpha=.5) +
  scale_y_sqrt(labels=scales::percent, limits=c(0,1.1), breaks=c(.04, .16, .36, .64, 1)) + 
  theme(
    text        = element_text(size=15),
    plot.title  = element_text(size=15),
    axis.text.x = element_text(angle = -30, vjust = 1, hjust = 0, size = 10),
    plot.margin = unit(x=c(.01, .08, .01, .01), units="npc"),
    legend.position = 'bottom' ) +
  ggtitle("Top 20 Most Significant Differences in Genera with at least 1% Mean Abundance") +
  xlab(NULL)

datatable(stats, rownames = FALSE, class = "display compact")
remove("gn", "long", "top20", "stats")

Supp Figure 2

S2 - A

# Limit to only the Buccal Swab samples
md <- subset(metadata, SampleType == "Buccal Swab")
gn <- genera[,rownames(md)]

# Exclude taxa with less than 1% avg abundance
gn <- as.matrix(gn)[row_means(gn) >= .01,]
rownames(gn) <- sub("^.*; ", "", rownames(gn))

long <- melt(gn, varnames=c('Genus', 'Sample'), value.name="Abundance")
long[['Group']] <- md[as.character(long[['Sample']]), 'Group']

# How significant are the differences in abundance?
stats <- ddply(long, 'Genus', function (x) {
  data.frame(
    P.Value = signif(kruskal.test(x[['Abundance']] ~ x[['Group']])$p.value, 3),
    Con     = signif(mean(subset(x, Group == "Con")[['Abundance']]), 3),
    EC      = signif(mean(subset(x, Group == "EC")[['Abundance']]), 3),
    TS      = signif(mean(subset(x, Group == "TS")[['Abundance']]), 3)
  )
})
stats[['FDR.Adj']] <- signif(p.adjust(stats[['P.Value']], method = 'fdr'), 3)

# Arrange Genera on plot from most to least significant
long[['Genus']]    <- factor(
  x      = as.character(long[['Genus']]), 
  levels = stats[['Genus']][order(stats[['P.Value']])] )

ggplot(long, aes(x=Genus, y=Abundance, color=Group, fill=Group)) +
  geom_boxplot(position=position_dodge(.79), outlier.shape=NA, alpha=.3) +
  geom_point(position=position_dodge(.79), alpha=.5) +
  scale_fill_manual( values = c("Con"="#e69f00", "EC"="#56b4e9", "TS"="#009e73")) +
  scale_color_manual(values = c("Con"="#e69f00", "EC"="#56b4e9", "TS"="#009e73")) +
  scale_y_sqrt(labels=scales::percent, limits=c(0,1), breaks=c(.04, .16, .36, .64, 1)) + 
  theme(
    text        = element_text(size=18),
    plot.title  = element_text(size=18),
    axis.text.x = element_text(angle = -30, vjust = 1, hjust = 0, size = 12) ) +
  xlab(NULL) + ggtitle("Genera in Buccal Swab Samples with at least 1% Mean Abundance")

datatable(stats, rownames = FALSE, class = "display compact")
remove("gn", "long", "md", "stats")

S2 - B

# Limit to only the Saliva samples
md <- subset(metadata, SampleType == "Saliva")
gn <- genera[,rownames(md)]

# Exclude taxa with less than 1% avg abundance
gn <- as.matrix(gn)[row_means(gn) >= .01,]
rownames(gn) <- sub("^.*; ", "", rownames(gn))

long <- melt(gn, varnames=c('Genus', 'Sample'), value.name="Abundance")
long[['Group']] <- md[as.character(long[['Sample']]), 'Group']

# How significant are the differences in abundance?
stats <- ddply(long, 'Genus', function (x) {
  data.frame(
    P.Value = signif(kruskal.test(x[['Abundance']] ~ x[['Group']])$p.value, 3),
    Con     = signif(mean(subset(x, Group == "Con")[['Abundance']]), 3),
    EC      = signif(mean(subset(x, Group == "EC")[['Abundance']]), 3),
    TS      = signif(mean(subset(x, Group == "TS")[['Abundance']]), 3)
  )
})
stats[['FDR.Adj']] <- signif(p.adjust(stats[['P.Value']], method = 'fdr'), 3)

# Arrange Genera on plot from most to least significant
long[['Genus']]    <- factor(
  x      = as.character(long[['Genus']]), 
  levels = stats[['Genus']][order(stats[['P.Value']])] )

ggplot(long, aes(x=Genus, y=Abundance, color=Group, fill=Group)) +
  geom_boxplot(position=position_dodge(.79), outlier.shape=NA, alpha=.3) +
  geom_point(position=position_dodge(.79), alpha=.5) +
  scale_fill_manual( values = c("Con"="#e69f00", "EC"="#56b4e9", "TS"="#009e73")) +
  scale_color_manual(values = c("Con"="#e69f00", "EC"="#56b4e9", "TS"="#009e73")) +
  scale_y_sqrt(labels=scales::percent, limits=c(0,1), breaks=c(.04, .16, .36, .64, 1)) + 
  theme(
    text        = element_text(size=18),
    plot.title  = element_text(size=18),
    axis.text.x = element_text(angle = -30, vjust = 1, hjust = 0, size = 12) ) +
  labs(x=NULL, title="Genera in Saliva Samples with at least 1% Mean Abundance")

datatable(stats, rownames = FALSE, class = "display compact")
remove("gn", "long", "md", "stats")

S3

stats <- ddply(metadata, 'SampleType', function (md) {
  
  rownames(md) <- as.character(md[['Sample']])
  
  # Find the top 10 most abundant genera with minimum avg abundance >= 1%
  gn    <- genera[,rownames(md)]
  gn    <- gn[row_means(gn) >= .01,]
  top10 <- head(names(rev(sort(row_means(genera)))), 10)
  gn    <- as.matrix(genera[top10,rownames(md)])
  
  # Compute correlation between each genus and CO ppm
  long <- melt(gn, varnames=c('Genus', 'Sample'), value.name="Abundance")
  long[['CO_ppm']] <- md[as.character(long[['Sample']]), 'CO_ppm']
  
  result <- ddply(long, 'Genus', function (x) {
    obj <- lm(Abundance ~ CO_ppm, data=x)
    data.frame(
      'R.Squared' = signif(summary(obj)$r.squared, 3),
      'P.Value'   = signif(coef(summary(obj))['CO_ppm', 'Pr(>|t|)'], 3)
      )
  })
  result[['FDR.Adj']] <- signif(p.adjust(result[['P.Value']], method = 'fdr'), 3)
  
  return (result)
})
signifTaxa <- as.character(stats[['Genus']][stats[['FDR.Adj']] <= 0.05])
stats[['Genus']] <- sub("^.+; ([A-Z])", "\\1", as.character(stats[['Genus']]))

# Plot correlations for significantly correlated genera
long <- melt(
  data       = as.matrix(genera[signifTaxa,]),
  varnames   = c("Genus", "Sample"),
  value.name = "Abundance"
)
long[['SampleType']] <- metadata[as.character(long[['Sample']]), 'SampleType']
long[['CO_ppm']]     <- metadata[as.character(long[['Sample']]), 'CO_ppm']
long[['Genus']]      <- sub("^.+; ([A-Z])", "\\1", as.character(long[['Genus']]))

ggplot(long, aes(x=CO_ppm, y=Abundance, color=SampleType, fill=SampleType)) +
  geom_point() + 
  geom_smooth(method = "lm") +
  facet_grid(Genus ~ SampleType) +
  scale_y_continuous(labels=scales::percent) + 
  theme(
    text            = element_text(size=18),
    plot.title      = element_text(size=18),
    legend.position = "none" ) +
  ggtitle("Genera Correlated with CO ppm in Fecal Samples")

datatable(stats, rownames = FALSE, class = "display compact")
remove("long", "signifTaxa", "stats")