# Gavriliuc et al. (2020) Long-term storage of feces at -80C versus -20C is
# negligible for 16S rRNA amplicon profiling of the equine bacterial
# microbiome. PeerJ.
#
# This script removes primers from paired-end sequences using Cutadapt
# called from R. It works on sequences renamed from Illumina MiSeq names to
# sample names, but should work either way. Requires Cutadapt:
# https://cutadapt.readthedocs.io/en/stable/installation.html
#
# After removing primers, the remainder of the script runs a custom
# implementation of the DADA2 pipeline on the directory of primer-trimmed
# sequences. Requires the following R packages: dada2, ShortRead, Biostrings

# Initialization
rm(list=ls())
pkgs <- c("dada2", "ShortRead", "Biostrings")
lapply(pkgs, library, character.only=T, quiet=T, warn.conflicts=F)
library(reticulate) ; library(tidyverse)
conda_list()[[1]][2] %>% use_condaenv(required = TRUE)

# Paths of sequences (here I use the sequences renamed by sample) and where
# cutadapt is installed
#path <- "/home/stefan.gavriliuc/projects/ice/only_twenty/seqs"
path <- "/home/stefan/storage_temps/seqs"
cutadapt <- "/home/stefan/miniconda3/envs/equine_nemabiome_pipeline/bin/cutadapt"
list.files(path)

# Create character vector of locations + names of forward seqs
fnFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE))
# and the same for reverse seqs
fnRs <- sort(list.files(path, pattern = "_R2_001.fastq", full.names = TRUE))

# Primers
FWD <- "CCTACGGGNGGCWGCAG"
REV <- "GACTACHVGGGTATCTAATCC"

# Function that returns all orientations of a primer (forward, complement,
# reverse, and reverse complement)
allOrients <- function(primer) {
    require(Biostrings)
    dna <- DNAString(primer)
    orients <- c(Forward = dna, Complement = complement(dna),
                 Reverse = reverse(dna), RevComp = reverseComplement(dna))
    return(sapply(orients, toString))
}

# Get all orientations of both primers
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)

# Specify location + name of reads filtered for ambiguous bases (Ns)
fnFs.filtN <- file.path(path, "filtN", basename(fnFs))
fnRs.filtN <- file.path(path, "filtN", basename(fnRs))

# Filter out all sequences containing Ns
filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN=0,
              compress = FALSE, multithread = FALSE)

# Function to count the number of primer hits in a fastq file
primerHits <- function(primer, fn) {
    nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
    return(sum(nhits > 0))
}

# Count primer hits in the first sample, in every orientation
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]),
      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]),
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]),
      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))

system(paste(cutadapt, "--version")) # ensure that cutadapt is working

# This will be the directory containing reads with primers removed
path.cut <- file.path(path, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut) # Make the directory

# Specify directory + names of reads with primers removed
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))

FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)

# Specify cutadapt parameters:
# -g is the primer expected at the front (5' end) of the read,
# -a is the adapter (the reverse complement of the opposite primer) trimmed
# from the 3' end if the read runs through it;
# capitalized flags (-G/-A) apply to the R2 read
R1.flags <- paste("-g", FWD, "-a", REV.RC)
R2.flags <- paste("-G", REV, "-A", FWD.RC)
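# --- Optional sanity check (sketch; not part of the published pipeline) ---
# Verifies, before running cutadapt below, that every forward file has a
# matching reverse file and that the N-filtered inputs exist on disk. The
# "_R1_"/"_R2_" naming convention is assumed from the Illumina-style file
# names used above.
stopifnot(length(fnFs) == length(fnRs))
stopifnot(all(sub("_R1_", "_R2_", basename(fnFs)) == basename(fnRs)))
stopifnot(all(file.exists(fnFs.filtN)), all(file.exists(fnRs.filtN)))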
# Run cutadapt from R
# -m 1 discards reads that become empty after trimming;
# -n is the number of rounds of primer search and removal, set to 2 so that
# both the 5' primer and the read-through reverse-complement primer can be
# removed from the same read;
# -o is the output file name and -p is the paired-end (R2) output
for (i in seq_along(fnFs)) {
    system(paste(cutadapt, R1.flags, R2.flags, "-m", 1, "-n", 2,
                 "-o", fnFs.cut[i], "-p", fnRs.cut[i],
                 fnFs.filtN[i], fnRs.filtN[i]))
}

# Now count primer hits in the trimmed sequences; all counts should be 0
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))

#rm(list=ls())
pkgs <- c("dada2", "ShortRead", "Biostrings")
lapply(pkgs, library, character.only=T, quiet=T, warn.conflicts=F)
set.seed(1)

# Customization: set the path of the primer-trimmed sequences ("path"),
# the DADA2-formatted taxonomy reference FASTA file ("db_path"),
# and the taxonomy confidence threshold ("taxonomy_confidence_threshold")
#path <- "/home/stefan.gavriliuc/projects/ice/only_twenty/seqs/cutadapt"
path <- "/home/stefan/storage_temps/seqs/cutadapt"
db_path <- "silva_nr_v138_train_set.fa"
taxonomy_confidence_threshold <- 80
list.files(path)

# Extract directory + filenames of primer-trimmed sequences
cutFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE))
cutRs <- sort(list.files(path, pattern = "_R2_001.fastq", full.names = TRUE))

# Extract sample names (remove path & end of sequence name, "_L001_R1_001.fastq")
get.sample.name <- function(fname) strsplit(basename(fname), "_L001")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))

# Plot forward and reverse quality profiles (get an idea of how the reads look)
jpeg('cutFs_quality_profile.jpg') ; plotQualityProfile(cutFs[1:2]) ; dev.off()
jpeg('cutRs_quality_profile.jpg') ; plotQualityProfile(cutRs[1:2]) ; dev.off()

# Vector of file paths + names where the filtered sequences will be stored
filtFs <- file.path(path, "filtered", basename(cutFs))
filtRs <- file.path(path, "filtered", basename(cutRs))

# Conduct quality filtering
out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, truncLen=c(250,200),
                     maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                     compress=TRUE, multithread=FALSE)
saveRDS(out, "out.rds")

# Construct error models (mostly defaults)
# MAX_CONSIST: the maximum number of rounds of the self-consistency loop
# used to reach error convergence, increased from 10 to 15
errF <- learnErrors(filtFs, multithread = FALSE, MAX_CONSIST=15)
errR <- learnErrors(filtRs, multithread = FALSE, MAX_CONSIST=15)
saveRDS(errF, "errF.rds") ; saveRDS(errR, "errR.rds")

# Plot error models
# Estimated error rates (black line) should fit the observed error
# rates (points), and error rates should decrease with higher quality scores
jpeg("error-f.jpg") ; plotErrors(errF, nominalQ = TRUE) ; dev.off()
jpeg("error-r.jpg") ; plotErrors(errR, nominalQ = TRUE) ; dev.off()

# Denoise
dadaFsPooled <- dada(filtFs, err = errF, multithread = TRUE, pool = T)
dadaRsPooled <- dada(filtRs, err = errR, multithread = TRUE, pool = T)
saveRDS(dadaFsPooled, "dadaFsPooled.rds") ; saveRDS(dadaRsPooled, "dadaRsPooled.rds")

# Merge paired-end reads
mergers <- mergePairs(dadaFsPooled, filtFs, dadaRsPooled, filtRs, verbose=T)
saveRDS(mergers, "mergers.rds")

# Create dada2-formatted sequence table
seqtab <- makeSequenceTable(mergers) ; dim(seqtab)

# Remove chimeras using the consensus method
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus",
                                    multithread=FALSE, verbose=TRUE)
saveRDS(seqtab.nochim, "seqtab.nochim.rds")
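# --- Optional length screen (sketch; not part of the original pipeline) ---
# For the 341F/805R V3-V4 primer pair used above, merged ASVs are expected to
# fall roughly in a 400-430 bp window after primer removal; that window is an
# assumption here and should be adjusted to your amplicon. This only inspects
# the table and shows how off-target lengths could be dropped.
in_range <- nchar(colnames(seqtab.nochim)) %in% 400:430
table(in_range)
# To actually remove off-target ASVs, one could run:
# seqtab.nochim <- seqtab.nochim[, in_range, drop = FALSE]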
# Read in serialized objects (useful when resuming from a previous run)
out <- readRDS("out.rds")
dadaFsPooled <- readRDS("dadaFsPooled.rds")
dadaRsPooled <- readRDS("dadaRsPooled.rds")
mergers <- readRDS("mergers.rds")
seqtab.nochim <- readRDS("seqtab.nochim.rds")
# assignTax.rds is only created by assignTaxonomy() further below, so load it
# here only when resuming after taxonomy assignment has already been run
#assignTax <- readRDS("assignTax.rds")

# ASV length distributions
table(nchar(getSequences(seqtab.nochim)))

# Create a table showing how many sequences were retained after
# each step of the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFsPooled, getN), sapply(dadaRsPooled, getN),
               sapply(mergers, getN), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR",
                     "merged", "nonchim")
rownames(track) <- sample.names
track

# Create a fasta file, then merge the count and taxonomy tables
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")

# Loop to create headers for the ASV fasta file (>ASV_1, >ASV_2, ...)
for (i in 1:dim(seqtab.nochim)[2]) {
    asv_headers[i] <- paste(">ASV", i, sep="_")
}

# Make and write out a fasta of the final ASV seqs
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "ASVs.fa")

# Taxonomy using assignTaxonomy (RDP naive Bayesian classifier) with the
# pre-specified database and confidence threshold from above
assignTax <- assignTaxonomy(seqtab.nochim, db_path,
                            minBoot = taxonomy_confidence_threshold,
                            multithread=TRUE, tryRC=TRUE, outputBootstraps=TRUE)
saveRDS(assignTax, "assignTax.rds")

# Count table: number of times each ASV is observed per sample
asv_tab <- t(seqtab.nochim)
colnames(asv_tab) <- sample.names
row.names(asv_tab) <- sub(">", "", asv_headers)

# Taxonomy tables: ASV and corresponding taxonomy, plus bootstrap confidences
asv_tax <- assignTax$tax
asv_tax_boot <- assignTax$boot
colnames(asv_tax_boot) <- c("kingdom.conf", "phylum.conf", "class.conf",
                            "order.conf", "family.conf", "genus.conf")

# Formatting
rownames(asv_tax) <- sub(">", "", asv_headers)
rownames(asv_tax_boot) <- sub(">", "", asv_headers)

# Merge taxonomy assignments and bootstrap values
tax_merged <- cbind(asv_tax, asv_tax_boot)

# Rearrange for viewing convenience
tax_merged_rearranged <- data.frame(tax_merged[, c("Kingdom", "kingdom.conf",
                                                   "Phylum", "phylum.conf",
                                                   "Class", "class.conf",
                                                   "Order", "order.conf",
                                                   "Family", "family.conf",
                                                   "Genus", "genus.conf")])

# Merge ASV counts and the rearranged taxonomy table
final_table <- data.frame(cbind(asv_tab, tax_merged_rearranged))
write.csv(final_table, "dada2ASVs.csv", quote=F)

# Append the proportion of sequences per sample assigned at the phylum level
# with greater than 50% and 80% confidence
final_table$phylum.conf <- as.numeric(as.character(final_table$phylum.conf))
fifty <- final_table[as.numeric(as.character(final_table$phylum.conf)) > 50, ]
eighty <- final_table[as.numeric(as.character(final_table$phylum.conf)) > 80, ]
percent_over_fifty <- colSums(fifty[1:length(sample.names)]) * 100 /
    colSums(final_table[1:length(sample.names)])
percent_over_eighty <- colSums(eighty[1:length(sample.names)]) * 100 /
    colSums(final_table[1:length(sample.names)])

track <- as.data.frame(track)
track$sample <- rownames(track)
track$percent_over_fifty <- percent_over_fifty[match(track$sample,
                                                     names(percent_over_fifty))]
track$percent_over_eighty <- percent_over_eighty[match(track$sample,
                                                       names(percent_over_eighty))]
track$sample <- NULL
write.csv(track, "trackSequences.csv", quote=F)
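# --- Optional read-retention summary (sketch; not part of the original script) ---
# Expresses each step recorded in `track` as a percentage of the input reads
# per sample, making it easy to spot samples that lose most of their reads at
# filtering, merging, or chimera removal.
steps <- c("filtered", "denoisedF", "denoisedR", "merged", "nonchim")
retention <- round(100 * as.matrix(track[, steps]) / track$input, 1)
retention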