This notebook will use the tidyverse
set of packages for data loading, manipulation and plotting, and knitr
for easy printing of markdown tables for the main paper.
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.1
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
sessionInfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.29 forcats_0.5.0 stringr_1.4.0 dplyr_1.0.1
## [5] purrr_0.3.4 readr_1.3.1 tidyr_1.1.1 tibble_3.0.3
## [9] ggplot2_3.3.2 tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 cellranger_1.1.0 pillar_1.4.6 compiler_3.6.3
## [5] dbplyr_1.4.4 tools_3.6.3 digest_0.6.25 lubridate_1.7.9
## [9] jsonlite_1.7.0 evaluate_0.14 lifecycle_0.2.0 gtable_0.3.0
## [13] pkgconfig_2.0.3 rlang_0.4.7 reprex_0.3.0 cli_2.0.2
## [17] rstudioapi_0.11 DBI_1.1.0 yaml_2.2.1 haven_2.3.1
## [21] xfun_0.16 withr_2.2.0 xml2_1.3.2 httr_1.4.2
## [25] fs_1.5.0 hms_0.5.3 generics_0.0.2 vctrs_0.3.2
## [29] grid_3.6.3 tidyselect_1.1.0 glue_1.4.1 R6_2.4.1
## [33] fansi_0.4.1 readxl_1.3.1 rmarkdown_2.3 modelr_0.1.8
## [37] blob_1.2.1 magrittr_1.5 backports_1.1.8 scales_1.1.1
## [41] ellipsis_0.3.1 htmltools_0.5.0 rvest_0.3.6 assertthat_0.2.1
## [45] colorspace_1.4-1 stringi_1.4.6 munsell_0.5.0 broom_0.7.0
## [49] crayon_1.3.4
We will load the pre-aggregated, downloaded runtimes, as recorded by the GNU time
Unix utility.
# Read the aggregated runtimes as a two-column, tab-separated table.
# Column names are supplied here, so the first line of the file is read as data.
results <- read_tsv(
  file = "benchmarking_aggregated_runtimes.txt",
  col_names = c("Run", "Runtime")
)
## Parsed with column specification:
## cols(
## Run = col_character(),
## Runtime = col_character()
## )
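Next we can clean up the file names to find the corresponding pipeline name and replicate, convert each runtime to a number of minutes (the part of the string before the `m`), and drop replicate 1 of each pipeline.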
# Tidy the raw runtime records: derive the pipeline name and replicate from
# each log file name and convert the runtime string to a numeric value.
results_clean <- results %>%
  # `Run` holds three colon-separated fields; the middle one (`Line`) is unused.
  separate(col = Run, into = c("File", "Line", "Category"), sep = ":") %>%
  select(-Line) %>%
  mutate(
    File = File %>%
      str_remove("time_") %>%
      # Match the extension literally: as a regular expression the "." in
      # ".log" would match any character.
      str_remove(fixed(".log")) %>%
      str_remove("runtimes/") %>%
      # Replace the underscore inside these pipeline names so they are not
      # broken up by the split on "_" below.
      str_replace("nf-core_eager", "nf-core/eager") %>%
      str_replace("paleomix_optimised", "paleomix-optimised"),
    # Keep only the text before the first "m" and parse it as a number.
    # NOTE(review): anything after the "m" (presumably the seconds) is
    # discarded, so runtimes are truncated rather than rounded -- confirm this
    # is intended.
    Runtime_Minutes = as.numeric(str_extract(Runtime, "^[^m]*"))
  ) %>%
  separate(File, into = c("Pipeline", "Replicate"), sep = "_") %>%
  select(-Runtime) %>%
  # `Replicate` is a character column after `separate()`, so compare it with
  # a string.
  # NOTE(review): replicate 1 is excluded from everything downstream --
  # presumably a warm-up run; confirm.
  filter(Replicate != "1")
To get the final results we will summarise the mean and standard deviation of the three runtime metrics and print the table as markdown.
# Mean and standard deviation of the runtime (both rounded to one decimal
# place) for every combination of pipeline and timing category.
results_final_tidy <- results_clean %>%
  group_by(Pipeline, Category) %>%
  summarise(
    Mean = round(mean(Runtime_Minutes), digits = 1),
    SD = round(sd(Runtime_Minutes), digits = 1)
  ) %>%
  # `summarise()` only peels off the last grouping level; drop the remaining
  # `Pipeline` grouping so later steps work on a plain, ungrouped tibble.
  ungroup() %>%
  # Sort by category, then from smallest to largest mean within each category.
  arrange(Category, Mean)
## `summarise()` regrouping output by 'Pipeline' (override with `.groups` argument)
# Build the printable table: one row per pipeline, one column per timing
# category, each cell formatted as "mean ± sd".
results_final_print <- results_final_tidy %>%
  # Merge the two summary columns into a single text column.
  unite(col = "Mean Runtime", Mean, SD, sep = " ± ") %>%
  # Spread the categories out into columns.
  pivot_wider(
    names_from = Category,
    values_from = `Mean Runtime`
  ) %>%
  # Render as a markdown table.
  kable()

results_final_print
Pipeline | real | sys | user |
---|---|---|---|
nf-core-eager-optimised | 105.6 ± 4.6 | 13.6 ± 0.7 | 1593 ± 79.7 |
paleomix-optimised | 130.6 ± 8.7 | 12 ± 0.7 | 1820.2 ± 36.9 |
nf-core-eager | 209.2 ± 4.4 | 11 ± 0.9 | 1407.7 ± 30.2 |
EAGER | 224.2 ± 4.9 | 22.9 ± 0.3 | 1736.3 ± 70.2 |
paleomix | 314.6 ± 2.9 | 10.7 ± 1 | 1506.7 ± 14 |
We can also plot the results.
## Get the order of pipelines from fastest to slowest based on real time.
## `results_final_tidy` is sorted by Category and then Mean, and "real" sorts
## first among the categories, so the first appearance of each pipeline gives
## its rank by real time. The order is reversed before being used as factor
## levels.
pipeline_levels <- rev(unique(results_final_tidy$Pipeline))

results_to_plot <- results_clean %>%
  mutate(Pipeline = factor(Pipeline, levels = pipeline_levels))

## One panel per timing category, each with its own x-axis range; violins show
## the distribution per pipeline and points show the individual replicates.
ggplot(results_to_plot, aes(x = Runtime_Minutes, y = Pipeline)) +
  geom_violin(aes(colour = Pipeline)) +
  geom_point(shape = 20, alpha = 0.7) +
  labs(x = "Runtime (minutes)") +
  facet_wrap(vars(Category), scales = "free_x") +
  scale_colour_brewer(palette = "Set1", guide = guide_legend(nrow = 2)) +
  theme_minimal() +
  theme(legend.position = "bottom")