Data S3: Scripts developed specifically for the integration site identification pipeline. Additional scripts not developed in this study are referenced in the text and only novel scripts developed for this study are included. #! /usr/bin/perl -w use strict; use Bio::SeqIO; use Parallel::ForkManager; #Author: Ulrike Löber (loeber@izw-berlin.de) #Input: Sequences to be investigated (fasta format; path to fasta files; ARGV[0]) & Smith Waterman alignments of sequences (ARGV[0]) to bait reference in tabular format (EMBOSS[water]; path to *.water files; specific naming [e.g. 3end_r_$sample]; ARGV[1]) #Parameter: Minimum alignment length (ARGV[3]); minimum percentage identity (ARGV[4]); number of threads (ARGV[5]) #Output: Bait specific filtered and trimmed sequences (fasta format; path to output directory; ARGV[2]) my $seqdir=$ARGV[0]; my $alndir=$ARGV[1]; my $outdir=$ARGV[2]; system "find $seqdir. -empty -type f -delete"; opendir (INDIR, "$seqdir.") or die $!; my @sequences=grep /\.fasta$/, readdir (INDIR); close INDIR; my $alnlength=$ARGV[3]; my $identitythreshold=$ARGV[4]; my $cpus= $ARGV[5]; print "Now searching for all the sequences in @sequences which share at least $alnlength bp on $identitythreshold % identity with the bait and print out in $outdir. \n"; my $sample; my $id; my $seq; my %seqhash; my $input; my $inseq; my $trimmedseq; my @values1; my @values2; my @values3; my @values4; my $alnstart; my $alnend; my $cutpos; my $manager = new Parallel::ForkManager( $cpus ); ################## start file handling ####################### foreach my $sequencefile(@sequences){ $sample=$sequencefile; $sample=~s/\.fasta//g; my $outfile3= "trimmed_3end_$sequencefile"; my $outfile5= "trimmed_5end_$sequencefile"; open (OUTFILE3, ">$outdir$outfile3") or die $!; open (OUTFILE5, ">$outdir$outfile5") or die $!; opendir (ALIGNDIR, "$alndir") or die $!; my @aligndir1=grep /3end_r_$sample\.water.tab$/, readdir (ALIGNDIR); close ALIGNDIR; open (INFILE1,"$alndir$aligndir1[0]") or die $!; my @align1=; close INFILE1; opendir (ALIGNDIR, "$alndir") or die $!; my @aligndir2=grep /3end_f_$sample\.water.tab$/, readdir (ALIGNDIR); close ALIGNDIR; open (INFILE2,"$alndir$aligndir2[0]") or die $!; my @align2=; close INFILE2; opendir (ALIGNDIR, "$alndir") or die $!; my @aligndir3=grep /5end_r_$sample\.water.tab$/, readdir (ALIGNDIR); close ALIGNDIR; open (INFILE3,"$alndir$aligndir3[0]") or die $!; my @align3=; close INFILE3; opendir (ALIGNDIR, "$alndir") or die $!; my @aligndir4=grep /5end_f_$sample\.water.tab$/, readdir (ALIGNDIR); close ALIGNDIR; open (INFILE4,"$alndir$aligndir4[0]") or die $!; my @align4=; close INFILE4; $input= Bio::SeqIO-> new( - file => "$seqdir$sequencefile", -format => "fasta"); ################## end file handling ####################### my $count= @align1; print "$count\n"; ################## start parsing data from pairwise alignment tables ########### while (my $inseq=$input-> next_seq()){ $id=$inseq->id(); #print "$id\n"; $seq=$inseq->seq(); #print "$id\n$seq\n"; $seqhash{$id}=$seq; #print "$seqhash{$id}\n"; } for (my $i=1; $i<$count;$i++){ $manager->start and next; @values1=split(/\t/,$align1[$i]); #3end r trimm from start of algn to last base of seq (right side) @values2=split(/\t/,$align2[$i]); #3end f trimm from end of algn to first base of seq (left side) @values3=split(/\t/,$align3[$i]); #5end r trimm from end of algn to first base of seq (left side) @values4=split(/\t/,$align4[$i]); #5end f trimm from start of algn to last base of seq (right side) ################## end parsing data from pairwise alignment tables ########### ################## start comparison section ################################## if ($values1[2]>$alnlength&&$values1[3]>=$identitythreshold){ if ($values2[2]>$values1[2]&&$values2[3]>=$identitythreshold){ if ($values3[2]>$values2[2]&&$values3[3]>=$identitythreshold){ if ($values4[2]>$values3[2]&&$values4[3]>=$identitythreshold){ $seq = $seqhash{$values4[1]}; chomp $values4[6]; $alnstart=$values4[6]; $cutpos=$alnstart-$values4[4]-2; #cutpos is the startpoint of read minus shift of bait (alignment start of bait) -2bp $trimmedseq=substr($seq,0,$cutpos) ; print OUTFILE5 ">$values4[1]\n$trimmedseq\n"; } else{ $seq = $seqhash{$values3[1]}; chomp $values3[7]; $alnend=$values3[7]; $trimmedseq=$seq ; $cutpos=$alnend+$values3[5]+2; #cutpos is the endpoint of read plus shift of bait (alignment end of bait) -2bp $trimmedseq=~s/^.{$cutpos}//g; print OUTFILE5 ">$values3[1]\n$trimmedseq\n"; } } elsif($values4[2]>$values2[2]&&$values4[3]>=$identitythreshold){ $seq = $seqhash{$values4[1]}; chomp $values4[6]; $alnstart=$values4[6]; $cutpos=$alnstart-$values4[4]-2; #cutpos is the startpoint of read minus shift of bait (alignment start of bait) -2bp $trimmedseq=substr($seq,0,$cutpos) ; print OUTFILE5 ">$values4[1]\n$trimmedseq\n"; } else{ $seq = $seqhash{$values2[1]}; chomp $values2[7]; $alnend=$values2[7]; $trimmedseq=$seq ; $cutpos=$alnend+(32-$values2[5])+35; #cutpos is entpoint of read plus shift of bait (difference of endpoint of bait to 32)-35 bp $trimmedseq=~s/^.{$cutpos}//g; print OUTFILE3 ">$values2[1]\n$trimmedseq\n"; } } elsif($values3[2]>$values1[2]&&$values3[3]>=$identitythreshold){ if($values4[2]>$values3[3]&&$values4[3]>=$identitythreshold){ $seq = $seqhash{$values4[1]}; chomp $values4[6]; $alnstart=$values4[6]; $cutpos=$alnstart-$values4[4]-2; #cutpos is the startpoint of read minus shift of bait (alignment start of bait) -2bp $trimmedseq=substr($seq,0,$cutpos) ; print OUTFILE5 ">$values4[1]\n$trimmedseq\n"; } else{ $seq = $seqhash{$values3[1]}; $alnend=$values3[7]; $trimmedseq=$seq ; $cutpos=$alnend+$values3[5]+2; #cutpos is the endpoint of read plus shift of bait (alignment end of bait) -2bp $trimmedseq=~s/^.{$cutpos}//g; print OUTFILE5 ">$values3[1]\n$trimmedseq\n"; } } elsif($values4[2]>$values1[2]&&$values4[3]>=$identitythreshold){ $seq = $seqhash{$values4[1]}; chomp $values4[6]; $alnstart=$values4[6]; $cutpos=$alnstart-$values4[4]-2; #cutpos is the startpoint of read minus shift of bait (alignment start of bait) -2bp $trimmedseq=substr($seq,0,$cutpos) ; print OUTFILE5 ">$values4[1]\n$trimmedseq\n"; } else{ $seq = $seqhash{$values1[1]}; $alnstart=$values1[6]; $cutpos=$alnstart-(32-$values1[4])-35; $trimmedseq=substr($seq,0,$cutpos) ; # the first n nucleotides till start of alignment #$trimmedseq=~m/^.{$alnstart}/; print OUTFILE3 ">$values1[1]\n$trimmedseq\n"; } } elsif($values2[2]>$alnlength&&$values2[3]>=$identitythreshold){ if($values3[2]>$values2[2]&&$values3[3]>=$identitythreshold){ if($values4[2]>$values3[2]&&$values4[3]>=$identitythreshold){ $seq = $seqhash{$values4[1]}; chomp $values4[6]; $alnstart=$values4[6]; $cutpos=$alnstart-$values4[4]-2; #cutpos is the startpoint of read minus shift of bait (alignment start of bait) -2bp $trimmedseq=substr($seq,0,$cutpos) ; print OUTFILE5 ">$values4[1]\n$trimmedseq\n"; } else{ $seq = $seqhash{$values3[1]}; $alnend=$values3[7]; $trimmedseq=$seq ; $cutpos=$alnend+$values3[5]+2; #cutpos is the endpoint of read plus shift of bait (alignment end of bait) -2bp $trimmedseq=~s/^.{$cutpos}//g; print OUTFILE5 ">$values3[1]\n$trimmedseq\n"; } } elsif($values4[2]>$values2[2]&&$values4[3]>=$identitythreshold){ $seq = $seqhash{$values4[1]}; chomp $values4[6]; $alnstart=$values4[6]; $cutpos=$alnstart-$values4[4]-2; #cutpos is the startpoint of read minus shift of bait (alignment start of bait) -2bp $trimmedseq=substr($seq,0,$cutpos) ; print OUTFILE5 ">$values4[1]\n$trimmedseq\n"; } else{ $seq = $seqhash{$values2[1]}; $alnend=$values2[7]; $trimmedseq=$seq ; $cutpos=$alnend+(32-$values2[5])+35; #cutpos is entpoint of read plus shift of bait (difference of endpoint of bait to 32)-35 bp $trimmedseq=~s/^.{$cutpos}//g; print OUTFILE3 ">$values2[1]\n$trimmedseq\n"; } } elsif($values3[2]>$alnlength&&$values3[3]>=$identitythreshold){ if($values4[2]>$values3[2]&&$values4[3]>=$identitythreshold){ $seq = $seqhash{$values4[1]}; chomp $values4[6]; $alnstart=$values4[6]; $cutpos=$alnstart-$values4[4]-2; #cutpos is the startpoint of read minus shift of bait (alignment start of bait) -2bp $trimmedseq=substr($seq,0,$cutpos) ; print OUTFILE5 ">$values4[1]\n$trimmedseq\n"; } else{ $seq = $seqhash{$values3[1]}; $alnend=$values3[7]; $trimmedseq=$seq ; $cutpos=$alnend+$values3[5]+2; #cutpos is the endpoint of read plus shift of bait (alignment end of bait) -2bp $trimmedseq=~s/^.{$cutpos}//g; print OUTFILE5 ">$values3[1]\n$trimmedseq\n"; } } elsif($values4[2]>$alnlength&&$values4[3]>=$identitythreshold){ $seq = $seqhash{$values4[1]}; chomp $values4[6]; $alnstart=$values4[6]; $cutpos=$alnstart-$values4[4]-2; #cutpos is the startpoint of read minus shift of bait (alignment start of bait) -2bp $trimmedseq=substr($seq,0,$cutpos) ; print OUTFILE5 ">$values4[1]\n$trimmedseq\n"; } else{ #print "$values1[1] rejected!\n"; } $manager->finish; } # } ################## end comparison section ################################## close OUTFILE3; close OUTFILE5; }