####### File_S1
####### Compiled example code
# Note 1: files, directories, and executables have been made generic to simplify the code.
# Note 2: threads and memory allocations have been removed as these are both data- and computing resource-dependent
# Note 3: most processes were looped to use the exact same settings for both bioreactors or both long read assemblies, but for simplicity the code is shown as though there was a single bioreactor and/or long read assembly

##### Read QC
### Long reads, ie Oxford Nanopore Technologies platform
reformat.sh in=ONT.fq out=temprds.fq minlen=3000 # reformat.sh is a BBtools utility
porechop -i temprds.fq -o ONT.qc.fq --min_split_read_size 3000

### Short reads, ie Illumina MiSeq platform
bbduk.sh in1=MS_R1.fq in2=MS_R2.fq out1=MS_R1.qc.fq out2=MS_R2.qc.fq qtrim=rl trimq=18 minlength=200 ktrim=r ref=adapters,artifacts,phix,lambda,mtst

##### Long read subsampling
trycycler subsample -r ONT.qc.fq -o subsampled/ # creates 12 by default in the "subsampled/" directory

##### Short read and short-read first hybrid assembly
### Unicycler, short reads only
python3 unicycler-runner.py -o Unicycler_short --min_fasta_length 1000 -1 MS_R1.qc.fq -2 MS_R2.qc.fq

### Unicycler, long reads only
python3 unicycler-runner.py -o Unicycler_long --min_fasta_length 1000 -l ONT.qc.fq

### Unicycler, hybrid
python3 unicycler-runner.py -o Unicycler_hyb --min_fasta_length 1000 -1 MS_R1.qc.fq -2 MS_R2.qc.fq -l ONT.qc.fq

### SPAdes, short reads only
python3 spades.py --meta -k 27,47,63,77,89,99,107,115,121,127 -o Spades_short -1 MS_R1.qc.fq -2 MS_R2.qc.fq # k-mers match those picked by Unicycler for short read only assembly

### SPAdes, long reads only
python3 spades.py --meta -o Spades_long --nanopore ONT.qc.fq

### SPAdes, hybrid
python3 spades.py --meta -k 27,47,63,77,89,99,107,115,121,127 -o Spades_hyb -1 MS_R1.qc.fq -2 MS_R2.qc.fq --nanopore ONT.qc.fq # k-mers match those picked by Unicycler for short read only assembly

### OPERA-MS, hybrid
perl OPERA-MS.pl --out-dir Opera_hyb --contig-len-thr 1000 --no-ref-clustering --short-read-assembler spades --long-read-mapper minimap2 --short-read1 MS_R1.qc.fq --short-read2 MS_R2.qc.fq --long-read ONT.qc.fq

##### Long read assembly
### Canu, default
canu -p Canu_def_asm -d ONT_canudef_asm/ genomeSize=1m -nanopore-raw ONT.qc.fq

### Canu, metagenome-optimized
canu -p Canu_meta_asm -d ONT_canumeta_asm/ corMinCoverage=0 corOutCoverage=all corMhapSensitivity=high correctedErrorRate=0.105 genomeSize=5m corMaxEvidenceCoverageLocal=10 corMaxEvidenceCoverageGlobal=10 -nanopore-raw ONT.qc.fq

### Canu, metagenome-optimized with subsampled reads
for reads in subsampled/*qc.fq ; do
canu -p Canu_sub_"${reads/.qc.fq/_asm}" -d Canu_sub/Canu_"${reads/.qc.fq/_asm}" corMinCoverage=0 corOutCoverage=all stopOnLowCoverage=0.5 corMhapSensitivity=high correctedErrorRate=0.105 genomeSize=5m corMaxEvidenceCoverageLocal=10 corMaxEvidenceCoverageGlobal=10 -nanopore-raw $reads
done

### Flye, default
flye -o Flye_def_asm/ --nano-raw ONT.qc.fq

### Flye, metagenome-optimized
flye -o Flye_meta_asm --nano-raw ONT.qc.fq --meta

### Flye, metagenome-optimized with subsampled reads
for reads in subsampled/*qc.fq ; do
flye -o Flye_sub/Flye_sub_"${reads/.qc.fq/_asm}" --nano-raw $reads --meta
done

##### Compile initial assemblies into one location
mkdir asms_init
for asm in */*_asm* ; do
ASM1=${asm/*\//}
ASM=${ASM1/.fasta/.fa} # this assumes that the output assembly names are relatively similar from all programs with the exception of ".fasta" vs ".fa" , but the truth is that the outputs are not easily generalized and more complicated code (e.g. per assembler) is necessary to make the names harmonious
reformat.sh in="$asm" out=asms_init/"$ASM" minlen=4000 # reformat.sh is a BBtools utility
done

##### Subset circular and long (>1 Mbp) contigs, then estimate quality and infer phylogeny
mkdir asms_init_subsets/{circ,long,checkm,gtdb}
mkdir -p asms_init_subsets/circ/indivseqs/
mkdir -p asms_init_subsets/long/indivseqs/
for asm in asms_init/*fa
ASM1=${asm/*\//}
ASM=${ASM1/_asm.fa/_long.fa}
# Note, there is no uniform way that assembly programs output circularity information, so here it has been assumed that for each assembly the user has identified the circular contigs and put them all in one list named "circularcontigs.list"
filterbyname.sh in="$asm" out=asms_init_subsets/circ/"$ASM"_circ.fa names=circularcontigs.list keep=t # filterbyname.sh is a BBtools utility
filterbyname.sh in="$asm" out=asms_init_subsets/long/"$ASM"_long.fa names=circularcontigs.list minlen=1000000 keep=f # filterbyname.sh is a BBtools utility
grep '>' asms_init_subsets/circ/"$ASM"_circ.fa | sed 's|>||' asms_init_subsets/circ/"$ASM"_circ.list
	for seq in $(< asms_init_subsets/circ/"$ASM"_circ.list ) ; do
	filterbyname.sh in=asms_init_subsets/circ/"$ASM"_circ.fa out=asms_init_subsets/circ/indivseqs/"$ASM"_"$seq".fa name="$seq" keep=t # filterbyname.sh is a BBtools utility
	done
grep '>' asms_init_subsets/long/"$ASM"_long.fa | sed 's|>||' asms_init_subsets/long/"$ASM"_long.list
	for seq in $(< asms_init_subsets/long/"$ASM"_long.list ) ; do
	filterbyname.sh in=asms_init_subsets/long/"$ASM"_long.fa out=asms_init_subsets/long/indivseqs/"$ASM"_"$seq".fa name="$seq" keep=t # filterbyname.sh is a BBtools utility
	done
done
for subset in circ long ; do 
checkm lineage_wf -f asms_init_subsets/checkm/"$subset"/"$subset"_checkm.txt --tab_table -x fa asms_init_subsets/"$subset"/indivseqs/ asms_init_subsets/checkm/"$subset"
gtdbtk classify_wf --genome_dir asms_init_subsets/"$subset"/indivseqs/" --out_dir asms_init_subsets/gtdb/"$subset"_gtdb -x fa

##### Long read correction of long read assembly
mkdir lr_corrected
for lrasm in asms_init/*_meta_asm.fa ; do 
asmblr1=${lrasm/asms_init\//}
asmblr=${asmblr1/_meta*/}
minimap2 -c -x map-ont $lrasm ONT.qc.fq > ONT_map_"$asmblr".paf
racon  -u ONT.qc.fq ONT_map_"$asmblr".paf $lrasm > lr_corrected/"$asmblr"_rac1_asm.fa
	for n in $( seq 1 9 ) ; do
	newn=$[ n + 1 ]
	minimap2 -c -x map-ont lr_corrected/"$asmblr"_rac"$n"_asm.fa ONT.qc.fq > ONT_map_"$asmblr"_rac"$n".paf
	racon -u ONT.qc.fq ONT_map_"$asmblr"_rac"$n".paf lr_corrected/"$asmblr"_rac"$n"_asm.fa >lr_corrected/"$asmblr"_rac"$newn"_asm.fa
	done
done

##### Short read polishing of long read assembly
mkdir sr_polished
for lrasm in asms_init/*meta_asm.fa lrcorrected/*rac2* lrcorrected/*rac5* lrcorrected/*rac10* ; do 
asm1=${lrasm/*\//}
asm2=${asm1/_meta/}
asm=${asm2/_asm.fa/}
bbmap.sh ref="$lrasm" in1=MS_R1.qc.fq in2=MS_R2.qc.fq outm=MS_map_"$asm".bam minid=0.9 idfilter=0.975 ambiguous=all pairedonly=t bs=bs.sh nodisk; sh bs.sh
pilon --genome $lrasm --bam MS_map_"$asm"_sorted.bam --output "$asm"_pil1_asm.fa --outdir sr_polished --changes --fix all --duplicates 
	for n in $( seq 1 9 ) ; do
	newn=$[ n + 1 ]
	bbmap.sh ref=sr_polished/"$asm"_pil"$n"_asm.fa in1=MS_R1.qc.fq in2=MS_R2.qc.fq outm=MS_map_"$asm"_pil"$n".bam minid=0.9 idfilter=0.975 ambiguous=all pairedonly=t bs=bs.sh nodisk ; sh bs.sh
	pilon --genome sr_polished/"$asm"_pil"$n"_asm.fa --bam MS_map_"$asm"_pil"$n"_sorted.bam --output "$asm"_pil"$newn"_asm.fa --outdir sr_polished --changes --fix all --duplicates
	done
done

##### Compile all assemblies into one location
mkdir ALL_ASMS
cp asms_init/*fa ALL_ASMS/
cp lr_corrected/*fa ALL_ASMS/
cp sr_polished/*fa ALL_ASMS/

##### Assembly quality estimation
### General statistics using Quast
metaquast.py -o mQuast --max-ref-number 0 ALL_ASMS/*fa

### Open reading frame identification using Prodigal
mkdir prodigal/{GBK,FAA,FNA}
for asm in ALL_ASMS/*fa
ASM1=${asm/ALL_ASMS\//}
ASM=${ASM1/_asm.fa}
prodigal -i $asm -o prodigal/GBK/"$ASM".gbk -a prodigal/FAA/"$ASM".faa -d prodigal/FNA/"$ASM".fna -p meta
sed 's| |.*|' prodigal/FAA/"$ASM".faa > prodigal/FAA/"$ASM"_cln.faa 
sed 's| |.*|' prodigal/FNA/"$ASM".fna > prodigal/FNA/"$ASM"_cln.fna 
done

### Gene fragmentation estimation using IDEEL
mkdir ideel/{genomes,proteins} # IDEEL is by default set up to run in its install path, so either adjust IDEEL's executables or copy data to the install location to run. 
cp ALL_ASMS/*fa ideel/genomes/
cp prodigal/FAA/*faa ideel/proteins/
cd ideel
snakemake --use-conda all

### Assembly-level marker gene fragmention and redundancy using BUSCO
mkdir busco/
for faa in prodigal/FAA/*faa
FAA1=${faa/*\//}
FAA=${FAA1/.faa}
busco -i $faa -o $FAA --out_path busco/ -m prot -l bacteria_odb10 --download_path busco/
done

### Short read recruitment using BBMap
mkdir bbmapstats/
for asm in ALL_ASMS/*fa
ASM1=${asm/ALL_ASMS\//}
ASM=${ASM1/_asm.fa}
bbmap.sh ref="$asm" in1=MS_R1.qc.fq in2=MS_R2.qc.fq perfectmode=t ambiguous=random mappedonly=t pairedonly=t statsfile=bbmapstats/"$ASM"_stats.txt scafstats=bbmapstats/"$ASM"_scaf.txt rpkm=bbmapstats/"$ASM"_rpkm.txt covstats=bbmapstats/"$ASM"_cov.txt nodisk
done

### Assembly-level marker gene redundancy using CheckM
checkm taxonomy_wf -f checkm/checkm.txt --tab_table -x faa domain Bacteria prodigal/FAA/ checkm/

### Metabat2, automated binning
cat MS_R1.qc.fq MS_R2.qc.fq > MSreads # precursor for mmlong utility
mkdir mbat/{mapping,allbins,checkm,busco}
for asm in ALL_ASMS/*fa
ASM1=${asm/ALL_ASMS\//}
ASM=${ASM1/_asm.fa}
bash /mmlong/mmlong_tools/mmlong_readcoverage2 -a $asm -d MSreads -m ilm -o mbat/mapping 
sed "s|coverage|MS_cov|" mbat/mapping/MSreads_cov.csv | sed "s|stdev|MS_var|" > mbat/mapping/MS_map_"$ASM"_cov.csv
bash /mmlong/mmlong_tools/mmlong_readcoverage2 -a $asm -d ONT.qc.fq -m np -o mbat/mapping
sed "s|coverage|ONT_cov|" mbat/mapping/ONT.qc.fq_cov.csv | sed "s|stdev|ONT_var|" > mbat/mapping/ONT_map_"$ASM"_cov.csv
grep '>' $asm | sed 's|>||' > contigs
R $ASM
library("tidyverse")
args <- commandArgs() 
l1=list.files(pattern="contigs")
l2=sort(list.files(pattern=paste0("mbat/mapping/.*map_",args[N],"_cov.csv"))) # adjust the args[N] to call the variable from the command line suit R setup
out=c(l1,l2) %>% 
	lapply(read_csv) %>% 
	reduce(.,full_join,by="scaffold") # works in a test... not sure why or how... or why or how I cannot do it another way
write_tsv(out,paste0("mbat/mapping/",args[N],"_depth.tsv") # adjust the args[N] to call the variable from the command line suit R setup
q()
metabat2 -i $asm -a mbat/mapping/"$ASM"_depth.tsv -o mbat/"$ASM"/"$ASM" --saveCls --unbinned
cp mbat/"$ASM"/"$ASM".[0-9][0-9]*.fa mbat/allbins/
done
checkm taxonomy_wf -f mbat/checkm/checkm.txt --tab_table -x fa domain Bacteria mbat/allbins/ mbat/checkm
for bin in mbat/allbins/*fa ; do
BIN1=${bin/*\//}
BIN=${BIN1/.fa/}
busco -i $bin -o "$BIN" --out_path mbat/busco/ -m genome -l bacteria_odb10 --download_path mbat/busco/
done
seqkit stats mbat/allbins/*fa | sed 's|  *|\t|' > mbat/seqkitstats.tsv

### Identification and depth quantification of rpoB genes using HMMscan, BBtools utilities, and mmlong utilities
cat MS_R1.qc.fq MS_R2.qc.fq > MSreads # precursor for mmlong utility
mkdir rpob/{tbls,contigs,genes,proteins,mapping}
cp /HMMdbs/PF04563.hm* rpob/ # HMM downloaded and prepared 2020-06-09
for faa in prodigal/FAA/*_cln.faa 
FAA1=${faa/*\//}
FAA=${FAA1/_cln.faa}
ASM=${FAA1/_cln.faa/_asm.fa}
FNA1=${faa/FAA/FNA}
FNA=${FNA1/faa/fna}
hmmscan --seed 42 -Z 47079205 -E 1000 --tblout rpob/tbls/"$FAA"_rpob_seq.txt -o rpob/tbls/"$FAA"_rpob.txt rpob/PF04563.hmm $faa 
grep 'PF04563.15 ' rpob/tbls/"$FAA"_rpob_seq.txt | sed 's|.*PF04563.15 ||' | sed 's|  *|	|g' | awk -F '\t' '{ if ( $3 <= 1e-3  && $4 >= 20 ) print $1 }' > genes.list
sed 's|_[0-9][0-9]* ||' genes.list > contigs.list 
filterbyname.sh in="$faa" out=rpob/proteins/"$FAA" names=genes.list include=t # filterbyname.sh is a BBtools utility
filterbyname.sh in="$fna" out=rpob/genes/"$FNA" names=genes.list include=t # filterbyname.sh is a BBtools utility
filterbyname.sh in=ALL_ASMS/"$ASM" out=rpob/contigs/"$ASM".fa names=contigs.list include=t # filterbyname.sh is a BBtools utility
done
for rpocont in rpob/contigs/*fa
rpoCONT1=${rpocont/*\//}
rpoCONT=${rpoCONT1/.fa/}
bash /mmlong/mmlong_tools/mmlong_readcoverage2 -a $rpocont -d MSreads -m ilm -o rpob/mapping 
sed "s|coverage|MS_cov|" rpob/mapping/MSreads_cov.csv | sed "s|stdev|MS_var|" > rpob/mapping/MS_map_"$rpoCONT"_cov.csv
bash /mmlong/mmlong_tools/mmlong_readcoverage2 -a $rpocont -d ONT.qc.fq -m np -o rpob/mapping
sed "s|coverage|ONT_cov|" rpob/mapping/ONT.qc.fq_cov.csv | sed "s|stdev|ONT_var|" > rpob/mapping/ONT_map_"$rpoCONT"_cov.csv
grep '>' $rpocont | sed 's|>||' > contigs
R $ASM
library("tidyverse")
args <- commandArgs() 
l1=list.files(pattern="contigs")
l2=sort(list.files(pattern=paste0("rpob/mapping/.*map_",args[N],"_cov.csv"))) # adjust the args[N] to call the variable from the command line suit R setup
out=c(l1,l2) %>% 
	lapply(read_csv) %>% 
	reduce(.,full_join,by="scaffold") # works in a test... not sure why or how... or why or how I cannot do it another way
write_tsv(out,paste0("rpob/mapping/",args[N],"_depth.tsv") # adjust the args[N] to call the variable from the command line suit R setup
q()