################### COI AMPLICON DATA ANALYSIS PIPELINE - EXAMPLE SCRIPT #################### ## METHOD ## The eexperimental design is detailed in the main manuscript. Briefly, ## one mock community was amplified seven times, each time with a different indexed primer set (similar index on forward and reverse primer) ## the resulting seven PCR products were pooled and a single indexed Illumina TruSeq adaptor added using ligation to each pool ## the resulting library was sequenced twice on two consecutive MiSeq runs ## Illumina files are available in the Dryad repository (see manuscript) 16_S10_L001_R1_001_run1.fastq.gz ### Illumina MISEQ run 1, R1 direction: 16_S10_L001_R1_001_run2.fastq.gz ### Illumina MISEQ run 2, R1 direction 16_S10_L001_R2_001_run1.fastq.gz ### Illumina MISEQ run 1, R2 direction 16_S10_L001_R2_001_run2.fastq.gz ### Illumina MISEQ run 2, R2 direction ################### Denoising with BFC (Li, 2015) #################### # an error correcting tool designed specifically for Illumina short reads # Download BFC and place your files unzipped in the folder "bfc-master" # Denoise each file of the paired end read ./bfc -s 3g -t16 16_S10_L001_R1_001_run1.fastq | gzip -1 > 16_S10_L001_R1_001_run1.corrected.fq.gz ./bfc -s 3g -t16 16_S10_L001_R2_001_run1.fastq | gzip -1 > 16_S10_L001_R2_001_run1.corrected.fq.gz ./bfc -s 3g -t16 16_S10_L001_R1_001_run2.fastq | gzip -1 > 16_S10_L001_R1_001_run2.corrected.fq.gz ./bfc -s 3g -t16 16_S10_L001_R2_001_run2.fastq | gzip -1 > 16_S10_L001_R2_001_run2.corrected.fq.gz # -s specifies the size of the genome # amplicons are short but larger -s tends to be more accurate (Li personnal comm.) ## BFC outputs: # 16_S10_L001_R1_001_run1.corrected.fq # 16_S10_L001_R1_001_run2.corrected.fq # 16_S10_L001_R2_001_run1.corrected.fq # 16_S10_L001_R2_001_run2.corrected.fq #################### Assemble paired-end reads with Usearch (Edgar, 2010) #################### # Download Usearch8.1 and place it in your working directory # Make file executable # Run -fastq_mergepairs ./usearch8.1 -fastq_mergepairs 16_S10_L001_R1_001_run1.corrected.fq -reverse 16_S10_L001_R2_001_run1.corrected.fq -fastqout Mock_Run1_denoised_contiged.fastq -fastq_merge_maxee 1.0 ./usearch8.1 -fastq_mergepairs 16_S10_L001_R1_001_run2.corrected.fq -reverse 16_S10_L001_R2_001_run2.corrected.fq -fastqout Mock_Run2_denoised_contiged.fastq -fastq_merge_maxee 1.0 # here we set expected error threshold (-fastq_merge_maxee) to 1 (see http://drive5.com/usearch/manual/exp_errs.html) # see additional options here (http://drive5.com/usearch/manual/cmd_fastq_mergepairs.html) ## Usearch outputs: # Mock_Run1_denoised_contiged.fastq # Mock_Run2_denoised_contiged.fastq #################### Create .fasta file with Mothur (Schloss et al. 2009) #################### # Download the last version of Mothur # place your .fastq files in your home directory fastq.info(fastq=Mock_Run1_denoised_contiged.fastq) fastq.info(fastq=Mock_Run2_denoised_contiged.fastq) ## Mothur outputs: # Mock_Run1_denoised_contiged.fasta # Mock_Run1_denoised_contiged.fastq # Mock_Run1_denoised_contiged.qual # Mock_Run2_denoised_contiged.fasta # Mock_Run2_denoised_contiged.fastq # Mock_Run2_denoised_contiged.qual #################### Sample de-multiplexing and additional QC with Mothur (Schloss et al. 2009) #################### # prepare two separate .oligos files that specify primer and barcode sequences for read demultiplexing of each MiSeq run # MSN_ARMS_MiSeq_Run1_Adapt16.oligos primer GGWACWGGWTGAACWGTWTAYCCYCC TANACYTCNGGRTGNCCRAARAAYCA barcode AGACGC AGACGC Mock_P2 barcode AGTGTA AGTGTA Mock_P3 barcode ACTAGC ACTAGC Mock_P5 barcode ACAGTC ACAGTC Mock_P6 barcode ATCGAC ATCGAC Mock_P7 barcode ATGTCG ATGTCG Mock_P8 barcode ATAGCA ATAGCA Mock_P9 # MSN_ARMS_MiSeq_Run2_Adapt16.oligos primer GGWACWGGWTGAACWGTWTAYCCYCC TANACYTCNGGRTGNCCRAARAAYCA barcode AGACGC AGACGC Mock_P2_run2 barcode AGTGTA AGTGTA Mock_P3_run2 barcode ACTAGC ACTAGC Mock_P5_run2 barcode ACAGTC ACAGTC Mock_P6_run2 barcode ATCGAC ATCGAC Mock_P7_run2 barcode ATGTCG ATGTCG Mock_P8_run2 barcode ATAGCA ATAGCA Mock_P9_run2 # then run the following commands in Mothur trim.seqs(fasta=Mock_Run1_denoised_contiged.fasta,oligos=MSN_ARMS_MiSeq_Run1_Adapt16.oligos,maxambig=0, maxhomop=8, processors=4, bdiffs=0, pdiffs=0, minlength=300, maxlength=350, checkorient=t) trim.seqs(fasta=Mock_Run2_denoised_contiged.fasta,oligos=MSN_ARMS_MiSeq_Run2_Adapt16.oligos,maxambig=0, maxhomop=8, processors=4, bdiffs=0, pdiffs=0, minlength=300, maxlength=350, checkorient=t) # here we are removing all sequences with any ambiguities such as "N" (maxambig=0), any homopolymer region longer than 8bp (maxhomop=8) # any mismatch in the barcode sequence (bdiffs=0) and any mismatch in the primer sequence (pdiffs=0) ## Mothur outputs: # Mock_Run1_denoised_contiged.trim.fasta # Mock_Run1_denoised_contiged.groups # Mock_Run2_denoised_contiged.trim.fasta # Mock_Run2_denoised_contiged.groups # merge files merge.files(input=Mock_Run1_denoised_contiged.trim.fasta-Mock_Run2_denoised_contiged.trim.fasta, output=Mock_CombinedRuns_denoised_contiged.trim.fasta) merge.files(input=Mock_Run1_denoised_contiged.groups-Mock_Run2_denoised_contiged.groups, output=Mock_CombinedRuns_denoised_contiged.groups) ## Mothur outputs: # Mock_CombinedRuns_denoised_contiged.trim.fasta # Mock_CombinedRuns_denoised_contiged.groups #################### Dereplication with Mothur (Schloss et al. 2009) #################### # Identification of unique sequences so that only one copy of each sequence is reported. It is used to reduce the size of the dataset and speed up downstream analysis # Run the following command in Mothur unique.seqs(fasta=Mock_CombinedRuns_denoised_contiged.trim.fasta) ## Mothur outputs: # Mock_CombinedRuns_denoised_contiged.trim.unique.fasta # Mock_CombinedRuns_denoised_contiged.trim.names #################### Align amino acid sequences with MACSE (Ranwez et al. 2011) #################### # this step aligns the reads to a high quality reference dataset of DNA barcodes "BIOCODE2014_MACSE_313.fasta" using amino acid translations # and identifies reads with interruptions of open reading frame # This step uses a lot of RAM # so the file needs to be split in smaller files of 10,000 sequences first # In terminal, use the following command split -l 20000 Mock_CombinedRuns_denoised_contiged.trim.unique.fasta ## Outputs: # xaa # xab # xac # xad # xae # xaf # ... # the program enrichAlignment aligns reads one by one to the reference dataset "BIOCODE2014_MACSE_313.fasta" # Download MACSE "macse_v1.0.0i_7.jar" # Use the following command in the terminal for each file to align java -Xmx256m -jar macse_v1.0.0i_7.jar -prog enrichAlignment -seq BIOCODE2014_MACSE_313.fasta -align BIOCODE2014_MACSE_313.fasta -seq_lr xaa -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 -maxDEL_inSeq 0 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT xaa_NT -out_AA xaa_AA -seqToAdd_logFile xaa_log.csv java -Xmx256m -jar macse_v1.0.0i_7.jar -prog enrichAlignment -seq BIOCODE2014_MACSE_313.fasta -align BIOCODE2014_MACSE_313.fasta -seq_lr xab -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 -maxDEL_inSeq 0 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT xab_NT -out_AA xab_AA -seqToAdd_logFile xab_log.csv java -Xmx256m -jar macse_v1.0.0i_7.jar -prog enrichAlignment -seq BIOCODE2014_MACSE_313.fasta -align BIOCODE2014_MACSE_313.fasta -seq_lr xac -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 -maxDEL_inSeq 0 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT xac_NT -out_AA xac_AA -seqToAdd_logFile xac_log.csv java -Xmx256m -jar macse_v1.0.0i_7.jar -prog enrichAlignment -seq BIOCODE2014_MACSE_313.fasta -align BIOCODE2014_MACSE_313.fasta -seq_lr xad -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 -maxDEL_inSeq 0 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT cad_NT -out_AA xad_AA -seqToAdd_logFile xac_log.csv # Use the same command to align the rest of the files # here you are removing all sequences with any frameshift (-maxFS_inSeq 0) with any insertion of amino acid (-maxINS_inSeq 0) # with any deletion of amino acid (-maxDEL_inSeq 0) and with any stop codon (-maxSTOP_inSeq 0) # Export alignments java -jar macse_v1.0.0i_7.jar -prog exportAlignment -align xaa_NT -charForRemainingFS - -gc_def 5 -out_AA xaa_AA_macse.fasta -out_NT xaa_NT_macse.fasta -statFile xaa.csv java -jar macse_v1.0.0i_7.jar -prog exportAlignment -align xab_NT -charForRemainingFS - -gc_def 5 -out_AA xab_AA_macse.fasta -out_NT xab_NT_macse.fasta -statFile xab.csv java -jar macse_v1.0.0i_7.jar -prog exportAlignment -align xac_NT -charForRemainingFS - -gc_def 5 -out_AA xac_AA_macse.fasta -out_NT xac_NT_macse.fasta -statFile xac.csv java -jar macse_v1.0.0i_7.jar -prog exportAlignment -align xad_NT -charForRemainingFS - -gc_def 5 -out_AA xad_AA_macse.fasta -out_NT xad_NT_macse.fasta -statFile xad.csv # Use the same command to export the rest of the files # Create a .accnos file with label of reference sequences grep ">" BIOCODE2014_MACSE_313.fasta > BIOCODE2014_MACSE_313_headers.accnos # Remove “>” using search/replace in Textwranger # Remove sequences of the reference dataset using Mothur remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xaa_NT_macse.fasta) remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xab_NT_macse.fasta) remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xac_NT_macse.fasta) remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xad_NT_macse.fasta) remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xae_NT_macse.fasta) # Use the same command to remove sequences of the reference dataset from the rest of the files # Upload each file into Geneious (Biomatters) # Select all, then click “Align/Assemble” > “Consensus align” > “Create alignment of all sequences” # Save as “Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.fasta” #################### Update .names and .group files #################### # Because MACSE removed sequences with interruptions of open reading frame from the .fasta file, the .names and .group files do not match anymore # Some lines need to be removed # to do so, we first identify the sequence labels that are no longer in the aligned .fasta file # In terminal # First, extract the sequence labels grep ">" Mock_CombinedRuns_denoised_contiged.trim.unique.fasta > FastaHeaders.txt grep ">" Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.fasta > FastaHeaders.MACSE.txt # Second, sort labels sort -n -d FastaHeaders.txt > FastaHeaders.sorted.txt sort -n -d FastaHeaders.MACSE.txt > FastaHeaders.MACSE.sorted.txt # Third, find missing labels in FastaHeaders.MACSE.sorted.txt comm -23 FastaHeaders.sorted.txt FastaHeaders.MACSE.sorted.txt > outfile.accnos # Remove “>” from outfile.accnos using search/replace in Textwranger # In Mothur, remove sequences from .names and .group files remove.seqs(accnos=outfile.accnos,name=Mock_CombinedRuns_denoised_contiged.trim.names,group=Mock_CombinedRuns_denoised_contiged.groups) ## Mothur outputs: # Mock_CombinedRuns_denoised_contiged.trim.pick.names # Mock_CombinedRuns_denoised_contiged.pick.groups #################### Precluster reads with Mothur (Schloss et al. 2009) #################### # this is used to reduce the variability in the dataset and speed up the clustering step # First, summarize .names and .group files in a table using Mothur count.seqs(name=Mock_CombinedRuns_denoised_contiged.trim.pick.names, group=Mock_CombinedRuns_denoised_contiged.pick.groups) ## Mothur output: # Mock_CombinedRuns_denoised_contiged.trim.pick.count_table # the following command then merges reads differing by three or fewer than three bases pre.cluster(fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.pick.count_table,diffs=3) ## Mothur outputs: # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.fasta # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.count_table #################### De novo chimera removal with Uchime (Edgar et al. 2011) #################### # Uchime is implemented in Mothur chimera.uchime(fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.count_table) ## Mothur output: # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.denovo.uchime.chimeras # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.denovo.uchime.accnos # chimera.uchime generates a .accnos file containing the label of all potential chimeric sequences # remove chimeras in mothur remove.seqs(accnos=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.denovo.uchime.accnos,fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.count_table) ## Mothur output: # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.fasta # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.count_table #################### Remove singletons with Mothur (Schloss et al. 2009) #################### # this command splits the dataset in rare and common OTUs # cutoff=1 specifies that rare preclusters contain a single sequence split.abund(fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.count_table, cutoff=1,accnos=true) ## Mothur output: # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.rare.count_table # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.abund.count_table # rare.accnos # abund.accnos # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.rare.fasta # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.abund.fasta # use rare.accnos file to remove singletons remove.seqs(accnos=rare.accnos,fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.count_table) ## Mothur outputs: # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.fasta # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.count_table #################### Cluster with CROP (Hao et al. 2011) #################### # Abundance information forms the base of the Gaussian mixture model used by CROP to delineate OTUs # However CROP does not take .names file as input (to specify abundance) # Therefore we need to deunique the .fasta file (opposite of dereplicate) # in Mothur, run the following command deunique.seqs(fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.count_table) ## Mothur output: # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant.fasta # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant.groups # Run the following command in CROP CROP_1_33 CROP -i Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant.fasta -o Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP -l 3 -u 4 -b 75 -z 450 # -l 3 -u 4 specify the soft threshold # -b the number of blocks = no. unique sequences / 50 ## CROP outputs: # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP.cluster.fasta # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP.cluster.list #################### Create final OTU table in Mothur (Schloss et al. 2009) #################### count.seqs(name=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP.cluster.list,group=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant.groups) ## Mothur output: # Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP.cluster.count_table # At this stage the dataset is ready for taxonomic assignments ## References # Edgar, R.C. (2010). Search and clustering orders of magnitude faster than BLAST. Bioinformatics, 26, 2460–1 # Edgar, R.C., Haas, B.J., Clemente, J.C., Quince, C. & Knight, R. (2011). UCHIME improves sensitivity and speed of chimera detection. Bioinformatics, 27, 2194–2200 # Hao, X., Jiang, R. & Chen, T. (2011). Clustering 16S rRNA for OTU prediction: a method of unsupervised Bayesian clustering. Bioinformatics, 27, 611–8 # Li, H. (2015). BFC: correcting Illumina sequencing errors. Bioinformatics, 31, 2885–7 # Ranwez, V., Harispe, S., Delsuc, F. & Douzery, E.J.P. (2011). MACSE: Multiple Alignment of Coding SEquences accounting for frameshifts and stop codons. PloS One, 6, e22594 # Schloss, P.D., Westcott, S.L., Ryabin, T., Hall, J.R., Hartmann, M., Hollister, E.B., Lesniewski, R.A., Oakley, B.B., Parks, D.H., Robinson, C.J., Sahl, J.W., Stres, B., Thallinger, G.G., Van Horn, D.J. & Weber, C.F. (2009). Introducing mothur: open-source, platform-independent, community-supported software for describing and comparing microbial communities. Applied and Environmental Microbiology, 75, 7537–7541