palmID
is a contained analysis suite for viral RNA-dependent RNA polymerases (RdRP) based on the “Palmprint” RNA virus barcodes described by Babaian and Edgar, 2021.
# INITIALIZE PALMID WORKSPACE -------------------
# Opens the Serratus connection, derives every per-run file name from the
# single `input.path` prefix, and sets the report-wide parameters.
# library("palmid")
# Establish Serratus server connection
con <- SerratusConnect()
# Input file
# Generated within `palmid` container
# params are defined in YAML header to expose to CLI
input.path <- params$input.path
# Refuse anything that is not exactly one non-empty string. A bare is.null()
# check lets NA or "" through, which would build names such as "NA.fev" or
# ".fev" and only fail later at the first file read.
if (is.null(input.path) || !isTRUE(nzchar(input.path, keepNA = TRUE))) {
  stop("Error: No input provided.")
}
# All run files share the `input.path` prefix and differ only by suffix
# NOTE(review): the original comment on input.fa duplicated the .trim.fa one;
# going by the suffix it is presumably the submitted input FASTA -- confirm
input.fa <- paste0(input.path, ".input.fa") # input FASTA sequence
input.pp <- paste0(input.path, ".trim.fa") # palmscan-palmprint sequence
input.fev <- paste0(input.path, ".fev") # palmscan .fev report
input.rep <- paste0(input.path, ".txt") # palmscan text motif-report
input.pro <- paste0(input.path, ".pro") # diamond palmDB-alignment file
input.msa <- paste0(input.path, ".msa.fa") # muscle msa
# Output HTML Report
output.html <- paste0(input.path, ".html")
save.plots <- FALSE # save individual plots as png
# Parameters
id_threshold <- 0 # Minimum AA% to retain a hit
max_palmdb_hits <- 500 # Maximum number of alignment hits in PalmDB hits to return
# Timestamp taken when the workspace is initialized
run.time <- Sys.time()
# IMPORT DATASETS -------------------------------
# Loads the palmscan and diamond results for this run, annotates the hits
# through the Serratus connection, caps the hit table, and retrieves the
# SRA metadata for the remaining hits.
# Import a palmprint-analysis
pp.in <- read.fev(input.fev, FIRST = TRUE)
# Import a diamond-aligned pro file
pro.df <- read.pro(input.pro)
# Populate with Nickname/Taxonomy-data
# (TRUE is spelled out: `T` is an ordinary variable and can be reassigned)
pro.df$nickname <- get.nickname(pro.df$sseqid, con, ordinal = TRUE)
pro.df <- get.proTax(pro.df, con)
# Set backstop when too many similar hits come up
palmdb.hits <- length(pro.df$qseqid)
if (palmdb.hits > max_palmdb_hits) {
  # Keep the first `max_palmdb_hits` rows. seq_len() gives an empty index
  # when the cap is 0, whereas 1:0 would expand to c(1, 0) and keep row 1.
  # NOTE(review): "Top" assumes pro.df is already ordered best-hit-first;
  # confirm against read.pro / get.proTax
  pro.df <- pro.df[seq_len(max_palmdb_hits), ]
  print.hitn <- paste0("Reporting Top ", max_palmdb_hits, "/", palmdb.hits, " matches")
} else {
  print.hitn <- paste0("Reporting all ", palmdb.hits, " matches")
}
# SQL-Import of palmprint/sra meta-data
# parent/child sOTU lookup, sra, biosample, date, organism, geo
palm.sra <- get.palmSra(pro.df, con)
# SQL-import of STAT kmer taxonomic analysis of the retrieved
# matching SRA libraries
stat.sra <- get.sraSTAT(palm.sra$run_id, con)
# Populate stat.sra with percent identity from palm.sra
# match() uses the first palm.sra row for each run_id; runs not present in
# palm.sra receive NA
stat.sra$pident <- palm.sra$pident[match(stat.sra$run_id, palm.sra$run_id)]
# GENERATE REPORT-PLOTS -------------------------
# Builds every plot object up front; the objects are rendered later in the
# report, each in its own section.
# Palmprint Report
# NOTE(review): `palmdb` is not defined in the code shown here; presumably a
# dataset provided by the palmid package -- confirm
pp.report <- PlotReport(pp.in, palmdb)
# Diamond-palmDB Alignment Report
# (TRUE is spelled out: `T` is an ordinary variable and can be reassigned)
pro.report <- PlotProReport(pro.df, html = TRUE)
# PalmDB Viral Taxonomy Report
tax.report <- PlotTaxReport(pro.df)
# Geospatial distribution Report
geo.report <- PlotGeo2(palm.sra)
# Timeline Report, built from the same SRA metadata table
date.report <- PlotTimeline(palm.sra)
# Host/Library organism Report
orgn.report <- PlotOrgn(palm.sra, freq = FALSE)
# STAT Report, built from stat.sra (which carries the pident column)
stat.report <- PlotSTAT(stat.sra)
# Report the palmID version this document was rendered with
# (`palmid.version` is read from the document parameters)
cat("palmID Version: ", params$palmid.version, sep = "")
palmID Version: 0.0.4
## Palmprint Detection
RNA-dependent RNA Polymerase detected within input sequence.
The 99-aa palmprint barcode has a score of 51.4 (high-confidence).
## palmDB Search
There are 66 palmprints matching the input sequence in palmDB.
Reporting all matches with an average aa-identity of 37.9% (28.7 - 99%).
Top match is 'u6346 (skinnyVaisya)' at 99% aa-identity
with an alignment CIGAR 99M, and a taxonomy of
species: unclassified
family : unclassified
phylum : unclassified
The closest named species is u232099 - Porcine bastrovirus at 83.8% identity.
## SRA Search
Matching palmprints identified in 306 SRA sequencing libraries.
Top-palmprint 'u6346 (skinnyVaisya)' was found in 17 libraries,
with top-annotation: viral metagenome
Identification of the core catalytic motifs A, B, C within the input sequence and reporting of the “palmprint” RNA-virus barcode.
Input FASTA sequence:
>NP_062883.2 non-structural polyprotein [Rubella virus]
MEKLLDEVLAPGGPYNLTVGSWVRDHVRSIVEGAWEVRDVVTAAQKRAIVAVIPRPVFTQMQVSDHPALHAISRYTRRHWIEWGPKEALHVLIDPSPGLLREVARVERRWVALCLHRTARKLATALAETASEAWHADYVCALRGAPSGPFYVHPEDVPHGGRAVADRCLLYYTPMQMCELMRTIDATLLVAVDLWPVALAAHVGDDWDDLGIAWHLDHDGGCPADCRGAGAGPTPGYTRPCTTRIYQVLPDTAHPGRLYRCGPRLWTRDCAVAELSWEVAQHCGHQARVRAVRCTLPIRHVRSLQPSARVRLPDLVHLAEVGRWRWFSLPRPVFQRMLSYCKTLSPDAYYSERVFKFKNALCHSITLAGNVLQEGWKGTCAEEDALCAYVAFRAWQSNARLAGIMKGAKRCAADSLSVAGWLDTIWDAIKRFLGSVPLAERMEEWEQDAAVAAFDRGPLEDGGRHLDTVQPPKSPPRPEIAATWIVHAASEDRHCACAPRCDVPRERPSAPAGQPDDEALIPPWLFAERRALRCREWDFEALRARADTAAAPAPPAPRPARYPTVLYRHPAHHGPWLTLDEPGEADAALVLCDPLGQPLRGPERHFAAGAHMCAQARGLQAFVRVVPPPERPWADGGARAWAKFFRGCAWAQRLLGEPAVMHLPYTDGDVPQLIALALRTLAQQGAALALSVRDLPGGAAFDANAVTAAVRAGPRQSAAASPPPGDPPPPRRARRSQRHSDARGTPPPAPARDPPPPAPSPPAPPRAGDPVPPIPAGPADRARDAELEVACEPSGPPTSTRADPDSDIVESYARAAGPVHLRVRDIMDPPPGCKVVVNAANEGLLAGSGVCGAIFANATAALAANCRRLAPCPTGEAVATPGHGCGYTHIIHAVAPRRPRDPAALEEGEALLERAYRSIVALAAARRWACVACPLLGAGVYGWSAAESLRAALAATRTEPVERVSLHICHPDRATLTHASVLVGAGLAARRVSPPPTEPLASCPAGDPGRPAQRSASPPATPLGDATAPEPRGCQGCELCRYTRVTNDRAYVNLWLERDRGATSWAMRIPEVVVYGPEHLATHFPLNHYSVLKPAEVRPPRGMCGSDMWRCRGWHGMPQVRCTPSNAHAALCRTGVPPRASTRGGELDPNTCWLRAAANVAQAARACGAYTSAGCPKCAYGRALSEARTHEDFAALSQRWSASHADASPDGTGDPLDPLMETVGCACSRVWVGSEHEAPPDHLLVSLHRAPNGPWGVVLEVRARPEGGNPTGHFVCAVGGGPRRVSDRPHLWLAVPLSRGGGTCAATDEGLAQAYYDDLEVRRLGDDAMARAALASVQRPRKGPYNIRVWNMAAGAGKTTRILAAFTREDLYVCPTNALLHEIQAKLRARDIDIKNAATYERRLTKPLAAYRRIYIDEAFTLGGEYCAFVASQTTAEVICVGDRDQCGPHYANNCRTPVPDRWPTERSRHTWRFPDCWAARLRAGLDYDIEGERTGTFACNLWDGRQVDLHLAFSRETVRRLHEAGIRAYTVREAQGMSVGTACIHVGRDGTDVALALTRDLAIVSLTRASDALYLHELEDGSLRAAGLSAFLDAGALAELKEVPAGIDRVVAVEQAPPPLPPADGIPEAQDVPPFCPRTLEELVFGRAGHPHYADLNRVTEGEREVRYMRISRHLLNKNHTEMPGTERVLSAVCAVRRYRAGEDGSTLRTAVARQHPRPFRQIPPPRVTAGVAQEWRMTYLRERIDLTDVYTQMGVAARELTDRYARRYPEIFAGMCTAQSLSVPAFLKATLKCVDAALGPRDTEDCHAAQGKAGLEIRAWAKEWVQVMSPHFRAIQKIIMRALRPQFLVAAGHTEPEVDAWWQAHYTTNAIEVDFTEFDMNQTLATRDVELEISAALLGLPCAEDYRALRAGSYCTLRELGSTETGCERTSGEPATLLHNTTVAMCMAMRMVPKGVRWAGIFQGDDMVIFLPEGARSAALKWTPAEVGLFGFHIPVKH
VSTPTPSFCGHVGTAAGLFHDVMHQAIKVLCRRFDPDVLEEQQVALLDRLRGVYAALPDTVAANAAYYDYSAERVLAIVRELTAYARGRGLDHPATIGALEEIQTPYARANLHDAD
palmprint sequence:
>NP_062883.2
IEVDFTEFDMNQTLATRDVELEISAALLGLPCAEDYRALRAGSYCTLRELGSTETGCERTSGEPATLLHNTTVAMCMAMR
MVPKGVRWAGIFQGDDMVI
catalytic-motifs:
>NP_062883.2
A:258-269(13.9) B:318-331(14.0) C:349-356(13.5)
IEVDFTEFDMNQ <48> SGEPATLLHNTTVA <17> FQGDDMVI [99]
+|.||| || | |||| | | || + | ||||||
lenDyskFDksq SGdanTslGNTltn vsGDDsvv
Score 51.4, high-confidence-RdRP: high-PSSM-score.reward-DDGGDD.good-segment-length.
Quality control metrics on the motif scores and size distribution of the “palmprint” inform the confidence that the input sequence contains a viral RdRP. Input scores are shown compared to the score distribution from 15,010 canonical viral RdRPs from GenBank.
# Palmscan QC-plot
# Render the palmprint report object created earlier by PlotReport()
plot(pp.report)
A MUSCLE-generated multiple sequence alignment of the top-10 matching hits in palmdb.
# Top-10 palmDB hits: multiple sequence alignment
# Read the alignment FASTA and echo it verbatim, joining its lines with
# newlines (no trailing newline is added)
cat(paste(readLines(con = input.msa), collapse = "\n"))
>u80864_37.0
GYMDGTQFDSCQNAFTQEIERNILSR-LGMPTEAIEAYYSIRNNYLLSSSTFSAL-IDAAKTSGEPGTLLLNTILMMCLTAWLLRTKTVHSVIIGQGDDCFI
>u131679_75.8
VEVDFTEFDRNQTLATRDLEVEVTSHLLGFDC--LDDYKALRAGSYCQLRDLAQTETGCERTSGEPATLLHNTIVGMAM-AMRMVPPGLPWAGIFQGDDMVL
>u196907_72.7
VEVDFTEFDRNQTLATRDLEVEVTSSILGFDN--ADDYKALRAGSYCQLRDIASTETGCERTSGEPATLLHNTVVGMAM-AMRMVMSGAAWSGIFQGDDMVL
>u17187_76.8
VEVDFTEFDMHQSLATRDVELEISASLLGLPS--AEDYRALRSGSYCLIRDVAHTETGCERTSGEPATLLHNTLVATVM-AMRMCSRCRGWAGVFQGDDMVL
>NP_062883.2
IEVDFTEFDMNQTLATRDVELEISAALLGLPC--AEDYRALRAGSYCTLRELGSTETGCERTSGEPATLLHNTTVAMCM-AMRMVPKGVRWAGIFQGDDMVI
>u6346_99.0
IEVDFTEFDMNQTLATRDVELEISAALLGLPC--AEDYRALRAGSYCTLRDLGSTETGCERTSGEPATLLHNTTVAMCM-AMRMVPKGVRWAGIFQGDDMVI
>u24856_86.9
VEVDFTEFDMNQTLATRDVEAEISASLLGLPS--IEDYKALRAGSYCLLRDIASTETGCERTSGEPATLLHNTLVGMCM-AMRMVPKGVRWAGIFQGDDMVM
>u232099_83.8
VEVDFSEFDMNQTLATRDVEIEVSASLLGLPS--ADDYRALRAGSYCLLRDVAATETGCERTSGEPATLLHNTLVGMCM-AMRMVPKGVRWHGIFQGDDMVL
>u206482_37.4
LANDFSEFDSTQGVVTTKLEQKLWRH-CGLPAHLVSLYGVLRASWRIKVPGVASVQNGDMRHSGEPFTLLGNTAVNMAVSGVIMQCQGFSFAG-FKGDDSIV
>u239943_38.8
VCNDFTEFDSTQTDTTVAFEANLLRW-LGVPDCVAQLYVALRAAARVECSMLGVNAPNA-RMSGEANTLFGNTVVTMAV-NALLFPGKFGWAA-FKGDDSIV
>u168864_37.8
CCNDFTEFDSTQNSATATFECRLLRK-LGVPDGVVELYREMRAQFRVLGRDYTYT-TQWNRASGEPNTLFGNTLVTMAVNGVMSLGKDYHCA-MFKGDDSLV
The input “palmprint” is aligned against palmdb using diamond to retrieve related viruses. Up to the top 500 related viruses are reported.
# Protein-alignment of Input vs. palmDB
# Approximate (~) percentage cut-offs and the colour used for each rank:
# Taxonomic Demarcations (~)
# Species >90% - Red
# Genus 70-90% - Orange
# Family 45-70% - Green
# Phylum <45% - Purple
# Pass the alignment report built by PlotProReport() to plotly for display
plotly::ggplotly(pro.report)
Species-, Genus- and Phylum-level taxonomy of the matching hits in PalmDB allows for taxonomic inference of the input virus.
# Viral Taxonomy of palmDB Hits
# Render the taxonomy report object created earlier by PlotTaxReport(pro.df)
plot(tax.report)
RNA viruses matching input are cross-referenced against 5.7M SRA sequencing libraries to identify sra-virus matches and their associated meta-data.
Organism meta-data from the SRA sequencing libraries matching the input-virus. Word size and color are scaled by proximity of the input-virus to its match in the library. Wordclouds show a) researcher-provided annotations of the libraries or b) STAT automated taxonomic (orders) analysis of the reads contained within the library (Katz et al., 2021).
# Plot SRA Wordcloud - id
# (scaled by AA% proximity to input palmprint)
# orgn.report was built by PlotOrgn(palm.sra, freq = FALSE)
plot(orgn.report)
# Plot SRA-Organism Wordcloud - freq
# (scaled by frequency in the SRA)
# stat.report was built by PlotSTAT(stat.sra)
# NOTE(review): the report text describes this second wordcloud as the STAT
# taxonomic analysis scaled by proximity, not by frequency -- confirm which
# scaling PlotSTAT applies and update the "freq" wording if it is stale
plot(stat.report)