This notebook will use the tidyverse
set of packages for data loading, manipulation and plotting, and knitr
for easy printing of markdown tables for the main paper.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.1
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.29 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.1
## [5] purrr_0.3.4 readr_1.3.1 tidyr_1.1.1 tibble_3.0.3
## [9] ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 cellranger_1.1.0 pillar_1.4.6 compiler_3.6.3
## [5] dbplyr_1.4.4 tools_3.6.3 digest_0.6.25 lubridate_1.7.9
## [9] jsonlite_1.7.0 evaluate_0.14 lifecycle_0.2.0 gtable_0.3.0
## [13] pkgconfig_2.0.3 rlang_0.4.7 reprex_0.3.0 cli_2.0.2
## [17] rstudioapi_0.11 DBI_1.1.0 yaml_2.2.1 haven_2.3.1
## [21] xfun_0.16 withr_2.2.0 xml2_1.3.2 httr_1.4.2
## [25] fs_1.5.0 hms_0.5.3 generics_0.0.2 vctrs_0.3.2
## [29] grid_3.6.3 tidyselect_1.1.0 glue_1.4.1 R6_2.4.1
## [33] fansi_0.4.1 readxl_1.3.1 rmarkdown_2.3 modelr_0.1.8
## [37] blob_1.2.1 magrittr_1.5 backports_1.1.8 scales_1.1.1
## [41] ellipsis_0.3.1 htmltools_0.5.0 rvest_0.3.6 assertthat_0.2.1
## [45] colorspace_1.4-1 stringi_1.4.6 munsell_0.5.0 broom_0.7.0
## [49] crayon_1.3.4
Due to heterogeneous reporting across each pipeline, we need to specify which are comparable and standardise the column names.
## paleomix only reports QF reads so will only use those
eager_cols <- c(
replicate = "Replicate",
sample_name = "Sample Name",
processed_reads = "# reads after C&M prior mapping",
mapped_qf_reads = "# mapped reads prior RMDup QF",
ontarget_qf = "Endogenous DNA QF (%)",
dedupped_mapped_reads = "Mapped Reads after RMDup",
mean_depth_coverage = "Mean Coverage",
mean_read_length = "average fragment length"
)
nfcoreeager_cols <- c(
replicate = "Replicate",
sample_name = "Sample",
processed_reads = "Samtools Flagstat (pre-samtools filter)_mqc-generalstats-samtools_flagstat_pre_samtools_filter-flagstat_total",
mapped_qf_reads = "Samtools Flagstat (post-samtools filter)_mqc-generalstats-samtools_flagstat_post_samtools_filter-mapped_passed",
ontarget_qf = "endorSpy_mqc-generalstats-endorspy-endogenous_dna_post",
dedupped_mapped_reads = "QualiMap_mqc-generalstats-qualimap-mapped_reads",
mean_depth_coverage = "QualiMap_mqc-generalstats-qualimap-mean_coverage",
mean_read_length = "DamageProfiler_mqc-generalstats-damageprofiler-mean_readlength"
)
paleomix_cols <- c(
replicate = "Replicate",
sample_name = "Target",
processed_reads = "seq_retained_reads",
mapped_qf_reads = "hits_raw(GCF_902167405.1_gadMor3.0_genomic)",
ontarget_qf = "hits_raw_frac(GCF_902167405.1_gadMor3.0_genomic)",
dedupped_mapped_reads = "hits_unique(GCF_902167405.1_gadMor3.0_genomic)",
mean_depth_coverage = "hits_coverage(GCF_902167405.1_gadMor3.0_genomic)",
mean_read_length = "hits_length(GCF_902167405.1_gadMor3.0_genomic)"
)
## Function to standardise colum names using above lists to allow easy joining
rename_col <- function(x, list, inverted = T) {
if (inverted) {
replacement_list <- names(list)
names(replacement_list) <- list
replacement_list[x]
} else {
names(list[x])
}
}
Next we can actually load the files
For EAGER and nf-core/eager
raw_eager <- Sys.glob("results/Report_EAGER*") %>%
enframe(name = NULL, value = "File") %>%
mutate(Replicate = map(File, ~tools::file_path_sans_ext(.x) %>% str_split("_") %>% unlist %>% tail(n = 1)) %>% unlist,
Contents = map(File, ~read_csv(.x))) %>%
unnest() %>%
filter(Replicate != 1)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(Contents)`
raw_nfcoreeager <- Sys.glob("results/nf*") %>%
enframe(name = NULL, value = "File") %>%
mutate(Replicate = map(File, ~tools::file_path_sans_ext(.x) %>% str_split("_") %>% unlist %>% tail(n = 1)) %>% unlist,
Contents = map(File, ~read_tsv(.x))) %>%
unnest() %>%
filter(Replicate != 1,
!grepl("_", Sample)) %>%
mutate(Sample = str_sub(Sample, 1, 6))
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(Contents)`
And for paleomix
raw_paleomix <- Sys.glob("results/paleomix_C*") %>%
enframe(name = NULL, value = "File") %>%
mutate(Replicate = map(File, ~tools::file_path_sans_ext(.x) %>% str_split("_") %>% unlist %>% tail(n = 1)) %>% unlist,
Contents = map(File, ~read_tsv(.x, comment = "#"))) %>%
unnest() %>%
filter(Measure != "lib_type") %>%
mutate(Value = as.numeric(Value)) %>%
pivot_wider(names_from = Measure, values_from = Value) %>%
filter(Replicate != 1) %>%
mutate(`hits_raw_frac(GCF_902167405.1_gadMor3.0_genomic)` = `hits_raw_frac(GCF_902167405.1_gadMor3.0_genomic)` * 100)
## Warning: `cols` is now required when using unnest().
## Please use `cols = c(Contents)`
The following function will allow us to make the input data of all pipelines into a standard (long) format. It will then summarise across the replicates.
standardise_summarise <- function(x, cols) {
x %>%
select(cols) %>%
group_by(sample_name) %>%
select(-replicate) %>%
pivot_longer(cols = !contains("sample_name"), names_to = "category", values_to = "value") %>%
group_by(sample_name, category) %>%
summarise(
Mean = round(mean(value), digits = 1),
SD = round(sd(value), digits = 1)
) %>%
unite(col = "mean_sd_values", Mean, SD, sep = " ± ")
}
And now we can combine
combined <- bind_rows(
standardise_summarise(raw_eager, eager_cols) %>% mutate(pipeline = "eager"),
standardise_summarise(raw_nfcoreeager, nfcoreeager_cols) %>% mutate(pipeline = "nf-core-eager"),
standardise_summarise(raw_paleomix, paleomix_cols) %>% mutate(pipeline = "paleomix")) %>%
select(pipeline, everything()) %>%
pivot_wider(names_from = pipeline, values_from = mean_sd_values) %>%
mutate(category = factor(category, levels = c("processed_reads", "mapped_qf_reads", "ontarget_qf", "dedupped_mapped_reads", "mean_depth_coverage", "mean_read_length"))) %>%
arrange(category)
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(cols)` instead of `cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## `summarise()` regrouping output by 'sample_name' (override with `.groups` argument)
## `summarise()` regrouping output by 'sample_name' (override with `.groups` argument)
## `summarise()` regrouping output by 'sample_name' (override with `.groups` argument)
Finally we can plot as markdown for transfer into the main paper.
combined %>%
kable()
sample_name | category | eager | nf-core-eager | paleomix |
---|---|---|---|---|
COD076 | processed_reads | 71388991 ± 0 | 71388991 ± 0 | 72100142 ± 0 |
COD092 | processed_reads | 69615709 ± 0 | 69615709 ± 0 | 70249181 ± 0 |
COD076 | mapped_qf_reads | 16786467.7 ± 106.5 | 16786491.1 ± 89.9 | 16686607.2 ± 91.3 |
COD092 | mapped_qf_reads | 16283216.3 ± 71.3 | 16283194.7 ± 37.4 | 16207986.2 ± 44.4 |
COD076 | ontarget_qf | 23.5 ± 0 | 23.5 ± 0 | 23.1 ± 0 |
COD092 | ontarget_qf | 23.4 ± 0 | 23.4 ± 0 | 23.1 ± 0 |
COD076 | dedupped_mapped_reads | 12107264.4 ± 87.8 | 12107293.7 ± 69.7 | 12193415.8 ± 86.7 |
COD092 | dedupped_mapped_reads | 13669323.7 ± 87.6 | 13669328 ± 32.4 | 13795703.3 ± 47.9 |
COD076 | mean_depth_coverage | 0.9 ± 0 | 0.9 ± 0 | 0.9 ± 0 |
COD092 | mean_depth_coverage | 1 ± 0 | 1 ± 0 | 1 ± 0 |
COD076 | mean_read_length | 49.4 ± 0 | 49.4 ± 0 | 49.4 ± 0 |
COD092 | mean_read_length | 48.8 ± 0 | 48.8 ± 0 | 48.7 ± 0 |