################### COI AMPLICON DATA ANALYSIS PIPELINE - EXAMPLE SCRIPT ####################

## METHOD
## The eexperimental design is detailed in the main manuscript. Briefly,
## one mock community was amplified seven times, each time with a different indexed primer set (similar index on forward and reverse primer)
## the resulting seven PCR products were pooled and a single indexed Illumina TruSeq adaptor added using ligation to each pool
## the resulting library was sequenced twice on two consecutive MiSeq runs

## Illumina files are available in the Dryad repository (see manuscript)

    16_S10_L001_R1_001_run1.fastq.gz  ### Illumina MISEQ run 1, R1 direction: 
    16_S10_L001_R1_001_run2.fastq.gz  ### Illumina MISEQ run 2, R1 direction
    16_S10_L001_R2_001_run1.fastq.gz  ### Illumina MISEQ run 1, R2 direction
    16_S10_L001_R2_001_run2.fastq.gz  ### Illumina MISEQ run 2, R2 direction


################### Denoising with BFC (Li, 2015) ####################

# an error correcting tool designed specifically for Illumina short reads
# Download BFC and place your files unzipped in the folder "bfc-master"
# Denoise each file of the paired end read

    ./bfc -s 3g -t16 16_S10_L001_R1_001_run1.fastq | gzip -1 > 16_S10_L001_R1_001_run1.corrected.fq.gz
    ./bfc -s 3g -t16 16_S10_L001_R2_001_run1.fastq | gzip -1 > 16_S10_L001_R2_001_run1.corrected.fq.gz
    ./bfc -s 3g -t16 16_S10_L001_R1_001_run2.fastq | gzip -1 > 16_S10_L001_R1_001_run2.corrected.fq.gz
    ./bfc -s 3g -t16 16_S10_L001_R2_001_run2.fastq | gzip -1 > 16_S10_L001_R2_001_run2.corrected.fq.gz

# -s specifies the size of the genome
# amplicons are short but larger -s tends to be more accurate (Li personnal comm.)

## BFC outputs:
# 16_S10_L001_R1_001_run1.corrected.fq
# 16_S10_L001_R1_001_run2.corrected.fq
# 16_S10_L001_R2_001_run1.corrected.fq
# 16_S10_L001_R2_001_run2.corrected.fq


#################### Assemble paired-end reads with Usearch (Edgar, 2010) ####################

# Download Usearch8.1 and place it in your working directory
# Make file executable
# Run -fastq_mergepairs

    ./usearch8.1  -fastq_mergepairs 16_S10_L001_R1_001_run1.corrected.fq -reverse 16_S10_L001_R2_001_run1.corrected.fq -fastqout Mock_Run1_denoised_contiged.fastq -fastq_merge_maxee 1.0
    ./usearch8.1  -fastq_mergepairs 16_S10_L001_R1_001_run2.corrected.fq -reverse 16_S10_L001_R2_001_run2.corrected.fq -fastqout Mock_Run2_denoised_contiged.fastq -fastq_merge_maxee 1.0

# here we set expected error threshold (-fastq_merge_maxee) to 1 (see http://drive5.com/usearch/manual/exp_errs.html)
# see additional options here (http://drive5.com/usearch/manual/cmd_fastq_mergepairs.html)

## Usearch outputs:
# Mock_Run1_denoised_contiged.fastq
# Mock_Run2_denoised_contiged.fastq


#################### Create .fasta file with Mothur (Schloss et al. 2009) ####################

# Download the last version of Mothur
# place your .fastq files in your home directory

    fastq.info(fastq=Mock_Run1_denoised_contiged.fastq)
    fastq.info(fastq=Mock_Run2_denoised_contiged.fastq)

## Mothur outputs:
# Mock_Run1_denoised_contiged.fasta
# Mock_Run1_denoised_contiged.fastq
# Mock_Run1_denoised_contiged.qual
# Mock_Run2_denoised_contiged.fasta
# Mock_Run2_denoised_contiged.fastq
# Mock_Run2_denoised_contiged.qual


#################### Sample de-multiplexing and additional QC with Mothur (Schloss et al. 2009) ####################

# prepare two separate .oligos files that specify primer and barcode sequences for read demultiplexing of each MiSeq run 

# MSN_ARMS_MiSeq_Run1_Adapt16.oligos

    primer	GGWACWGGWTGAACWGTWTAYCCYCC	TANACYTCNGGRTGNCCRAARAAYCA	
    barcode	AGACGC	AGACGC	Mock_P2
    barcode	AGTGTA	AGTGTA	Mock_P3
    barcode	ACTAGC	ACTAGC	Mock_P5
    barcode	ACAGTC	ACAGTC	Mock_P6
    barcode	ATCGAC	ATCGAC	Mock_P7
    barcode	ATGTCG	ATGTCG	Mock_P8
    barcode	ATAGCA	ATAGCA	Mock_P9

# MSN_ARMS_MiSeq_Run2_Adapt16.oligos

    primer	GGWACWGGWTGAACWGTWTAYCCYCC	TANACYTCNGGRTGNCCRAARAAYCA	
    barcode	AGACGC	AGACGC	Mock_P2_run2
    barcode	AGTGTA	AGTGTA	Mock_P3_run2
    barcode	ACTAGC	ACTAGC	Mock_P5_run2
    barcode	ACAGTC	ACAGTC	Mock_P6_run2
    barcode	ATCGAC	ATCGAC	Mock_P7_run2
    barcode	ATGTCG	ATGTCG	Mock_P8_run2
    barcode	ATAGCA	ATAGCA	Mock_P9_run2

# then run the following commands in Mothur

    trim.seqs(fasta=Mock_Run1_denoised_contiged.fasta,oligos=MSN_ARMS_MiSeq_Run1_Adapt16.oligos,maxambig=0, maxhomop=8, processors=4, bdiffs=0, pdiffs=0, minlength=300, maxlength=350, checkorient=t)
    trim.seqs(fasta=Mock_Run2_denoised_contiged.fasta,oligos=MSN_ARMS_MiSeq_Run2_Adapt16.oligos,maxambig=0, maxhomop=8, processors=4, bdiffs=0, pdiffs=0, minlength=300, maxlength=350, checkorient=t)

# here we are removing all sequences with any ambiguities such as "N" (maxambig=0), any homopolymer region longer than 8bp (maxhomop=8)
# any mismatch in the barcode sequence (bdiffs=0) and any mismatch in the primer sequence (pdiffs=0)

## Mothur outputs:
# Mock_Run1_denoised_contiged.trim.fasta
# Mock_Run1_denoised_contiged.groups
# Mock_Run2_denoised_contiged.trim.fasta
# Mock_Run2_denoised_contiged.groups

# merge files

    merge.files(input=Mock_Run1_denoised_contiged.trim.fasta-Mock_Run2_denoised_contiged.trim.fasta, output=Mock_CombinedRuns_denoised_contiged.trim.fasta)
    merge.files(input=Mock_Run1_denoised_contiged.groups-Mock_Run2_denoised_contiged.groups, output=Mock_CombinedRuns_denoised_contiged.groups)

## Mothur outputs:
# Mock_CombinedRuns_denoised_contiged.trim.fasta
# Mock_CombinedRuns_denoised_contiged.groups


####################  Dereplication with Mothur (Schloss et al. 2009)  ####################

# Identification of unique sequences so that only one copy of each sequence is reported. It is used to reduce the size of the dataset and speed up downstream analysis
# Run the following command in Mothur

    unique.seqs(fasta=Mock_CombinedRuns_denoised_contiged.trim.fasta)

## Mothur outputs:
# Mock_CombinedRuns_denoised_contiged.trim.unique.fasta
# Mock_CombinedRuns_denoised_contiged.trim.names


#################### Align amino acid sequences with MACSE (Ranwez et al. 2011) ####################

# this step aligns the reads to a high quality reference dataset of DNA barcodes "BIOCODE2014_MACSE_313.fasta" using amino acid translations
# and identifies reads with interruptions of open reading frame

# This step uses a lot of RAM
# so the file needs to be split in smaller files of 10,000 sequences first
# In terminal, use the following command

    split -l 20000 Mock_CombinedRuns_denoised_contiged.trim.unique.fasta

## Outputs:
# xaa
# xab
# xac
# xad
# xae
# xaf
# ...

# the program enrichAlignment aligns reads one by one to the reference dataset "BIOCODE2014_MACSE_313.fasta"
# Download MACSE "macse_v1.0.0i_7.jar"
# Use the following command in the terminal for each file to align

    java -Xmx256m -jar macse_v1.0.0i_7.jar  -prog enrichAlignment  -seq BIOCODE2014_MACSE_313.fasta -align BIOCODE2014_MACSE_313.fasta -seq_lr xaa -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 -maxDEL_inSeq 0 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT xaa_NT -out_AA xaa_AA -seqToAdd_logFile xaa_log.csv
    java -Xmx256m -jar macse_v1.0.0i_7.jar  -prog enrichAlignment  -seq BIOCODE2014_MACSE_313.fasta -align BIOCODE2014_MACSE_313.fasta -seq_lr xab -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 -maxDEL_inSeq 0 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT xab_NT -out_AA xab_AA -seqToAdd_logFile xab_log.csv
    java -Xmx256m -jar macse_v1.0.0i_7.jar  -prog enrichAlignment  -seq BIOCODE2014_MACSE_313.fasta -align BIOCODE2014_MACSE_313.fasta -seq_lr xac -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 -maxDEL_inSeq 0 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT xac_NT -out_AA xac_AA -seqToAdd_logFile xac_log.csv
    java -Xmx256m -jar macse_v1.0.0i_7.jar  -prog enrichAlignment  -seq BIOCODE2014_MACSE_313.fasta -align BIOCODE2014_MACSE_313.fasta -seq_lr xad -maxFS_inSeq 0 -maxSTOP_inSeq 0 -maxINS_inSeq 0 -maxDEL_inSeq 0 -gc_def 5 -fs_lr -10 -stop_lr -10 -out_NT cad_NT -out_AA xad_AA -seqToAdd_logFile xac_log.csv

# Use the same command to align the rest of the files
# here you are removing all sequences with any frameshift (-maxFS_inSeq 0) with any insertion of amino acid (-maxINS_inSeq 0)
# with any deletion of amino acid (-maxDEL_inSeq 0) and with any stop codon (-maxSTOP_inSeq 0)

# Export alignments

    java -jar macse_v1.0.0i_7.jar  -prog exportAlignment -align xaa_NT -charForRemainingFS - -gc_def 5 -out_AA xaa_AA_macse.fasta -out_NT xaa_NT_macse.fasta -statFile xaa.csv
    java -jar macse_v1.0.0i_7.jar  -prog exportAlignment -align xab_NT -charForRemainingFS - -gc_def 5 -out_AA xab_AA_macse.fasta -out_NT xab_NT_macse.fasta -statFile xab.csv
    java -jar macse_v1.0.0i_7.jar  -prog exportAlignment -align xac_NT -charForRemainingFS - -gc_def 5 -out_AA xac_AA_macse.fasta -out_NT xac_NT_macse.fasta -statFile xac.csv
    java -jar macse_v1.0.0i_7.jar  -prog exportAlignment -align xad_NT -charForRemainingFS - -gc_def 5 -out_AA xad_AA_macse.fasta -out_NT xad_NT_macse.fasta -statFile xad.csv

# Use the same command to export the rest of the files

# Create a .accnos file with label of reference sequences

    grep ">" BIOCODE2014_MACSE_313.fasta > BIOCODE2014_MACSE_313_headers.accnos

# Remove “>” using search/replace in Textwranger
# Remove sequences of the reference dataset using Mothur

    remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xaa_NT_macse.fasta)
    remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xab_NT_macse.fasta)
    remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xac_NT_macse.fasta)
    remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xad_NT_macse.fasta)
    remove.seqs(accnos=BIOCODE2014_MACSE_313_headers.accnos, fasta=xae_NT_macse.fasta)

# Use the same command to remove sequences of the reference dataset from the rest of the files

# Upload each file into Geneious (Biomatters)
# Select all, then click “Align/Assemble” > “Consensus align” > “Create alignment of all sequences”

# Save as “Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.fasta”


####################  Update .names  and .group files ####################

# Because MACSE removed sequences with interruptions of open reading frame from the .fasta file, the .names and .group files do not match anymore
# Some lines need to be removed
# to do so, we first identify the sequence labels that are no longer in the aligned .fasta file
# In terminal
# First, extract the sequence labels

    grep ">" Mock_CombinedRuns_denoised_contiged.trim.unique.fasta > FastaHeaders.txt
    grep ">" Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.fasta > FastaHeaders.MACSE.txt

# Second, sort labels

    sort -n -d FastaHeaders.txt > FastaHeaders.sorted.txt
    sort -n -d FastaHeaders.MACSE.txt > FastaHeaders.MACSE.sorted.txt

# Third, find missing labels in FastaHeaders.MACSE.sorted.txt

    comm -23 FastaHeaders.sorted.txt FastaHeaders.MACSE.sorted.txt  > outfile.accnos

# Remove “>” from outfile.accnos using search/replace in Textwranger
# In Mothur, remove sequences from .names and .group files

    remove.seqs(accnos=outfile.accnos,name=Mock_CombinedRuns_denoised_contiged.trim.names,group=Mock_CombinedRuns_denoised_contiged.groups)

## Mothur outputs:
# Mock_CombinedRuns_denoised_contiged.trim.pick.names
# Mock_CombinedRuns_denoised_contiged.pick.groups


####################  Precluster reads with Mothur (Schloss et al. 2009) ####################

# this is used to reduce the variability in the dataset and speed up the clustering step
# First, summarize .names and .group files in a table using Mothur

    count.seqs(name=Mock_CombinedRuns_denoised_contiged.trim.pick.names, group=Mock_CombinedRuns_denoised_contiged.pick.groups)

## Mothur output:
# Mock_CombinedRuns_denoised_contiged.trim.pick.count_table

# the following command then merges reads differing by three or fewer than three bases

    pre.cluster(fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.pick.count_table,diffs=3)

## Mothur outputs:
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.fasta
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.count_table

####################  De novo chimera removal with Uchime (Edgar et al. 2011) ####################

# Uchime is implemented in Mothur

    chimera.uchime(fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.count_table)

## Mothur output:
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.denovo.uchime.chimeras
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.denovo.uchime.accnos

# chimera.uchime generates a .accnos file containing the label of all potential chimeric sequences
# remove chimeras in mothur

    remove.seqs(accnos=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.denovo.uchime.accnos,fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.count_table)

## Mothur output:
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.fasta
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.count_table

####################  Remove singletons with Mothur (Schloss et al. 2009) ####################

# this command splits the dataset in rare and common OTUs
# cutoff=1 specifies that rare preclusters contain a single sequence

    split.abund(fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.count_table, cutoff=1,accnos=true)

## Mothur output:
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.rare.count_table
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.abund.count_table
# rare.accnos
# abund.accnos
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.rare.fasta
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.abund.fasta

# use rare.accnos file to remove singletons

    remove.seqs(accnos=rare.accnos,fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.count_table)

## Mothur outputs:
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.fasta
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.count_table

####################  Cluster with CROP (Hao et al. 2011) ####################

# Abundance information forms the base of the Gaussian mixture model used by CROP to delineate OTUs
# However CROP does not take .names file as input (to specify abundance)
# Therefore we need to deunique the .fasta file (opposite of dereplicate)
# in Mothur, run the following command

    deunique.seqs(fasta=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.fasta,count=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.count_table)

## Mothur output:
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant.fasta
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant.groups

# Run the following command in CROP

    CROP_1_33 CROP -i Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant.fasta -o Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP -l 3 -u 4 -b 75 -z 450

# -l 3 -u 4 specify the soft threshold
# -b the number of blocks = no. unique sequences / 50

## CROP outputs:
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP.cluster.fasta
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP.cluster.list

####################  Create final OTU table in Mothur (Schloss et al. 2009) ####################

    count.seqs(name=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP.cluster.list,group=Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant.groups)

## Mothur output:
# Mock_CombinedRuns_denoised_contiged.trim.unique.MACSE.precluster.pick.pick.redundant_CROP.cluster.count_table

# At this stage the dataset is ready for taxonomic assignments


## References
# Edgar, R.C. (2010). Search and clustering orders of magnitude faster than BLAST. Bioinformatics, 26, 2460–1
# Edgar, R.C., Haas, B.J., Clemente, J.C., Quince, C. & Knight, R. (2011). UCHIME improves sensitivity and speed of chimera detection. Bioinformatics, 27, 2194–2200
# Hao, X., Jiang, R. & Chen, T. (2011). Clustering 16S rRNA for OTU prediction: a method of unsupervised Bayesian clustering. Bioinformatics, 27, 611–8
# Li, H. (2015). BFC: correcting Illumina sequencing errors. Bioinformatics, 31, 2885–7
# Ranwez, V., Harispe, S., Delsuc, F. & Douzery, E.J.P. (2011). MACSE: Multiple Alignment of Coding SEquences accounting for frameshifts and stop codons. PloS One, 6, e22594
# Schloss, P.D., Westcott, S.L., Ryabin, T., Hall, J.R., Hartmann, M., Hollister, E.B., Lesniewski, R.A., Oakley, B.B., Parks, D.H., Robinson, C.J., Sahl, J.W., Stres, B., Thallinger, G.G., Van Horn, D.J. & Weber, C.F. (2009). Introducing mothur: open-source, platform-independent, community-supported software for describing and comparing microbial communities. Applied and Environmental Microbiology, 75, 7537–7541