File S1. Pseudocode implementation of lowest common ancestor assignment (LCA) algorithm. ### Notes: # this file is perl-based pseudocode that describes the LCA implementation used in this study. Variables are generally not scoped explicitly, the perl path is omitted, and compilation warnings are not specified. Input/output paths are omitted. The form of the input files is noted but these accessory files must be generated using a text-processing language such as perl, sed, or awk. # the current example uses parameter values for LCA analysis of cytochrome oxidase I amplicons. These parameters differed among barcodes. # the pseudocode follows perl syntax and uses hash marks for commenting, as several widely used text editors autoformat text elements for increased legibility ### Operation: # 1. The script reports the OTU, the max bit score and average %id of HSPs contributing to an accepted assignment, the taxonomic assignment and it's rank # 2. The script requires a matrix of each HSP of an OTU and their corresponding taxonomic assignments. This can be generated from the original BLAST output and NCBI taxonomy flat files (see NCBI ftp site). The input form is tab-delimited and collects the information for each HSP on a single line: OTU, HSP accession, bit score, %id, and the complete NCBI taxonomy associated with accession, iterated as taxid, rank, taxon. # 3. The goal of the script is to identify the maximum bit score for each HSP, the list of HSPs for that OTU that are within the LCA criterion (the complement of the $thresh_id variable), and assess at progressively more inclusive ranks whether accepted HSPs share the same taxonomic assignment. When that condition holds, the taxonomic assignment is made at that rank, providing certain modifiers are not invoked, see below # 4. The script largely eschews subroutines/modules for this example. ######################## ######################## ### Set parameters for assignment $threshold=300;# sets the minimum bit score for an HSP to be considered $thresh_id=0.97;# sets the percent of max score for an HSP to be considered print "An end of file mark (EOF) will be appended\n"; @ranks=("species","genus","family","order","class"); print "the ranks to be used are: @ranks\n"; ### Get input file get_file_data(); #subroutine gets input file and returns it as @filedata $len=@filedata;#flow control variable $filedata[$len]="EOF";#appends end of file mark. no action is needed when eof line is found, merely allows completion of final otu open(OUTFILE,">assigned.taxa") || die "can't open outfile\n";#creates output files in the working directory open(OUTFILE2,">assignment.logfile") || die "can't open logfile\n"; ### Loop through HSP matrix iteratively for each taxon level foreach $rank (@ranks) {print "working on $rank\n";print OUTFILE "\n"; $c=0;$assigned=0;$assignflag=0; #counters are reassigned to zero with each pass ### Parse each line of matrix while ($c<$len+1) {chomp $filedata[$c]; if($c % 1000 == 1) {print "working on line $c\n";} #updates status of program to standard out @arr=split("\t",$filedata[$c]); $otu=$arr[0];$score=$arr[2];$id=$arr[3]; unless(exists($assigned{$otu})) {#Skips OTUs already assigned at previous rank ### Search line for taxon associated with hsp at the current rank. Assumes taxonomy is given in the form: taxid, rank, taxon scientific name. The NCBI taxid is generally preferred as an identifier because it is unique, whereas nonunique scientific names are not uncommon and can lead to error. However, at this stage, the correct and unique taxonomy is assumed to be present in the input file and therefore the name itself rather than id is used. The $taxon variable is filled with a dummy value at the start, because NCBI accessions may not have a defined value for a given rank. If dummy variable still present at assignment, assignment is skipped. $d=0;$linelen=@arr;$taxon="not available-$c";while($d<$linelen) {if($arr[$d] eq $rank) {$taxon=$arr[$d+1];} $d++;} ### Determine whether HSP is a continuation of the same OTU or a new one and capture taxon, bit, and %id if ($c==0) {$current=$otu;$max{$taxon}=$score;$id{$taxon}=$id;$count{$taxon}=1;print OUTFILE2 "####\nworking on rank: $rank\n\notu is $otu\n";} else { if($otu eq $current) {if($score>$max{$taxon}) {$max{$taxon}=$score;} $id{$taxon}+=$id;$count{$taxon}++;} #acquire hsp metrics for this otu else {#attempt an assignment at the current rank for just completed otu while(($k,$v)=each %max) {print OUTFILE2 "k is $k and v is $v\n"; #print max values for each taxon to log for later inspection if($v>$best) {$best=$v;} } print OUTFILE2 "best bit score for $current is $best\n"; #first time through is to identify best while(($k,$v)=each %max) {if($v>($best*$thresh_id)) {$candidates++;$avgid=$id{$k}/$count{$k};#second time through is to identify lca candidates print OUTFILE2 "keeping taxon $k with max bit score $v, avgid $avgid, and count ",$count{$k},"\n"; $kepttax=$k;$keptscore=$v; #k,v are scoped only for loop $result="$current\t$kepttax\t$keptscore\t$rank\t$avgid\n"; }} if($candidates==1 and $best>$threshold) {#Determine if basic LCA criteria met. Additional filters can be added here unless($kepttax=~m/not available/) {$qc=0; #qc is a quality check flag that blocks accepting of result for species and genus unless avgid and bit score meet thresholds. if($result=~m/species/) {$qc++; if($avgid>94.9 and $keptscore>549) {#note blast output reports id on a 1-100 scale, not 0-1. $assignflag++; }} elsif($result=~m/genus/) {$qc++; if($avgid>89.9 and $keptscore>449) { $assignflag++; }} if($qc==0) {$assignflag++;} } #close unless } #close if candidates if ($assignflag>0) {print OUTFILE $result;$assigned{$current}++;$assigned++;print OUTFILE2 "assigned $current to $kepttax\n";} elsif($assignflag==0) {print OUTFILE2 "no assignment made at this rank\n";} print OUTFILE2 "\n\n####\nnew otu $otu at line $c\n"; %max=();$best=0;$candidates=0;$assignflag=0; #Reset variables for next loop $current=$otu;$max{$taxon}=$score;$id{$taxon}=$id;$count{$taxon}=1; #change current to new otus and obtain hsp metrics }#close else otu ne current } #close else $c != 0 } #close unless exists assigned otu $c++;} #close matrix counter loop print "completed $assigned assignments at $rank rank\n";} # close rank looop ### Close files and exit close OUTFILE;close OUTFILE2;exit; ### SUBS ### sub get_file_data { print "Enter the name of the file to open: \n"; my $infile = ;chomp $infile;$infile =~s/^file:\/\///; open(INFILE, "$infile") || die "Cannot open file \"$infile\" to write to!\n"; @filedata =;close INFILE; return @filedata; }