# Gavriliuc et al. (2020) Long-term storage of feces at -80C versus -20C is
# negligible for 16S rRNA amplicon profiling of the equine bacterial
# microbiome. PeerJ.
#
# This script removes primers from paired-end sequences using Cutadapt
# called from R. It works on sequences renamed from Illumina MiSeq names to
# sample names, but should work either way. Requires Cutadapt:
# https://cutadapt.readthedocs.io/en/stable/installation.html
#
# After removing primers, the remainder of the script runs a custom
# implementation of the DADA2 pipeline on the directory of primer-trimmed
# sequences. Requires the following R packages: dada2, ShortRead, Biostrings

# Initialization
rm(list=ls())
pkgs <- c("dada2", "ShortRead", "Biostrings")
lapply(pkgs, library, character.only=T, quiet=T, warn.conflicts=F)
library(reticulate) ; library(tidyverse)
conda_list()[[1]][2] %>% use_condaenv(required = TRUE)

# Paths of sequences (here I use the sequences renamed by sample) and where
# cutadapt is installed
#path <- "/home/stefan.gavriliuc/projects/ice/only_twenty/seqs"
path <- "/home/stefan/storage_temps/seqs"
cutadapt <- "/home/stefan/miniconda3/envs/equine_nemabiome_pipeline/bin/cutadapt"
list.files(path)

# Create character vector of locations + names of forward seqs
fnFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE))
# and the same for reverse seqs
fnRs <- sort(list.files(path, pattern = "_R2_001.fastq", full.names = TRUE))

# Primers
FWD <- "CCTACGGGNGGCWGCAG"
REV <- "GACTACHVGGGTATCTAATCC"

# Function that returns all orientations of a primer (forward, complement,
# reverse, and reverse complement)
allOrients <- function(primer) {
    require(Biostrings)
    dna <- DNAString(primer)
    orients <- c(Forward = dna, Complement = complement(dna),
                 Reverse = reverse(dna), RevComp = reverseComplement(dna))
    return(sapply(orients, toString))
}

# Get all orientations of both primers
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)

# Specify location + name of reads filtered for ambiguous bases (Ns)
fnFs.filtN <- file.path(path, "filtN", basename(fnFs))
fnRs.filtN <- file.path(path, "filtN", basename(fnRs))

# Filter out all sequences containing Ns
filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN=0,
              compress = FALSE, multithread = FALSE)

# Function to count the number of primer hits in a fastq file
primerHits <- function(primer, fn) {
    nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
    return(sum(nhits > 0))
}

# Count primer hits in the first sample, in every orientation
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]),
      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]),
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]),
      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))

system(paste(cutadapt, "--version")) # ensure that cutadapt is working

# This will be the directory containing reads with primers removed
path.cut <- file.path(path, "cutadapt")
if(!dir.exists(path.cut)) dir.create(path.cut) # Make the directory

# Specify directory + names of reads with primers removed
fnFs.cut <- file.path(path.cut, basename(fnFs))
fnRs.cut <- file.path(path.cut, basename(fnRs))

FWD.RC <- dada2:::rc(FWD)
REV.RC <- dada2:::rc(REV)

# Specify cutadapt parameters:
# -g is the primer expected at the front (5' end) of the read,
# -a is the adapter (the reverse complement of the opposite primer) trimmed
# from the 3' end if the read runs through it;
# capitalized flags (-G/-A) apply to the R2 read
R1.flags <- paste("-g", FWD, "-a", REV.RC)
R2.flags <- paste("-G", REV, "-A", FWD.RC)
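# --- Optional sanity check (sketch; not part of the published pipeline) ---
# Verifies, before running cutadapt below, that every forward file has a
# matching reverse file and that the N-filtered inputs exist on disk. The
# "_R1_"/"_R2_" naming convention is assumed from the Illumina-style file
# names used above.
stopifnot(length(fnFs) == length(fnRs))
stopifnot(all(sub("_R1_", "_R2_", basename(fnFs)) == basename(fnRs)))
stopifnot(all(file.exists(fnFs.filtN)), all(file.exists(fnRs.filtN)))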
# Run cutadapt from R
# -m 1 discards reads that become empty after trimming;
# -n is the number of rounds of primer search and removal, set to 2 so that
# both the 5' primer and the read-through reverse-complement primer can be
# removed from the same read;
# -o is the output file name and -p is the paired-end (R2) output
for (i in seq_along(fnFs)) {
    system(paste(cutadapt, R1.flags, R2.flags, "-m", 1, "-n", 2,
                 "-o", fnFs.cut[i], "-p", fnRs.cut[i],
                 fnFs.filtN[i], fnRs.filtN[i]))
}

# Now count primer hits in the trimmed sequences; all counts should be 0
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
      FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
      REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
      REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))

#rm(list=ls())
pkgs <- c("dada2", "ShortRead", "Biostrings")
lapply(pkgs, library, character.only=T, quiet=T, warn.conflicts=F)
set.seed(1)

# Customization: set the path of the primer-trimmed sequences ("path"),
# the DADA2-formatted taxonomy reference FASTA file ("db_path"),
# and the taxonomy confidence threshold ("taxonomy_confidence_threshold")
#path <- "/home/stefan.gavriliuc/projects/ice/only_twenty/seqs/cutadapt"
path <- "/home/stefan/storage_temps/seqs/cutadapt"
db_path <- "silva_nr_v138_train_set.fa"
taxonomy_confidence_threshold <- 80
list.files(path)

# Extract directory + filenames of primer-trimmed sequences
cutFs <- sort(list.files(path, pattern = "_R1_001.fastq", full.names = TRUE))
cutRs <- sort(list.files(path, pattern = "_R2_001.fastq", full.names = TRUE))

# Extract sample names (remove path & end of sequence name, "_L001_R1_001.fastq")
get.sample.name <- function(fname) strsplit(basename(fname), "_L001")[[1]][1]
sample.names <- unname(sapply(cutFs, get.sample.name))

# Plot forward and reverse quality profiles (get an idea of how the reads look)
jpeg('cutFs_quality_profile.jpg') ; plotQualityProfile(cutFs[1:2]) ; dev.off()
jpeg('cutRs_quality_profile.jpg') ; plotQualityProfile(cutRs[1:2]) ; dev.off()

# Vector of file paths + names where the filtered sequences will be stored
filtFs <- file.path(path, "filtered", basename(cutFs))
filtRs <- file.path(path, "filtered", basename(cutRs))

# Conduct quality filtering
out <- filterAndTrim(cutFs, filtFs, cutRs, filtRs, truncLen=c(250,200),
                     maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                     compress=TRUE, multithread=FALSE)
saveRDS(out, "out.rds")

# Construct error models (mostly defaults)
# MAX_CONSIST: the maximum number of rounds of the self-consistency loop
# used to reach error convergence, increased from 10 to 15
errF <- learnErrors(filtFs, multithread = FALSE, MAX_CONSIST=15)
errR <- learnErrors(filtRs, multithread = FALSE, MAX_CONSIST=15)
saveRDS(errF, "errF.rds") ; saveRDS(errR, "errR.rds")

# Plot error models
# Estimated error rates (black line) should fit the observed error
# rates (points), and error rates should decrease with higher quality scores
jpeg("error-f.jpg") ; plotErrors(errF, nominalQ = TRUE) ; dev.off()
jpeg("error-r.jpg") ; plotErrors(errR, nominalQ = TRUE) ; dev.off()

# Denoise
dadaFsPooled <- dada(filtFs, err = errF, multithread = TRUE, pool = T)
dadaRsPooled <- dada(filtRs, err = errR, multithread = TRUE, pool = T)
saveRDS(dadaFsPooled, "dadaFsPooled.rds") ; saveRDS(dadaRsPooled, "dadaRsPooled.rds")

# Merge paired-end reads
mergers <- mergePairs(dadaFsPooled, filtFs, dadaRsPooled, filtRs, verbose=T)
saveRDS(mergers, "mergers.rds")

# Create dada2-formatted sequence table
seqtab <- makeSequenceTable(mergers) ; dim(seqtab)

# Remove chimeras using the consensus method
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus",
                                    multithread=FALSE, verbose=TRUE)
saveRDS(seqtab.nochim, "seqtab.nochim.rds")
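# --- Optional length screen (sketch; not part of the original pipeline) ---
# For the 341F/805R V3-V4 primer pair used above, merged ASVs are expected to
# fall roughly in a 400-430 bp window after primer removal; that window is an
# assumption here and should be adjusted to your amplicon. This only inspects
# the table and shows how off-target lengths could be dropped.
in_range <- nchar(colnames(seqtab.nochim)) %in% 400:430
table(in_range)
# To actually remove off-target ASVs, one could run:
# seqtab.nochim <- seqtab.nochim[, in_range, drop = FALSE]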
# Read in serialized objects (useful when resuming from a previous run)
out <- readRDS("out.rds")
dadaFsPooled <- readRDS("dadaFsPooled.rds")
dadaRsPooled <- readRDS("dadaRsPooled.rds")
mergers <- readRDS("mergers.rds")
seqtab.nochim <- readRDS("seqtab.nochim.rds")
# assignTax.rds is only created by assignTaxonomy() further below, so load it
# here only when resuming after taxonomy assignment has already been run
#assignTax <- readRDS("assignTax.rds")

# ASV length distributions
table(nchar(getSequences(seqtab.nochim)))

# Create a table showing how many sequences were retained after
# each step of the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFsPooled, getN), sapply(dadaRsPooled, getN),
               sapply(mergers, getN), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR",
                     "merged", "nonchim")
rownames(track) <- sample.names
track

# Create a fasta file, then merge the count and taxonomy tables
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")

# Loop to create headers for the ASV fasta file (>ASV_1, >ASV_2, ...)
for (i in 1:dim(seqtab.nochim)[2]) {
    asv_headers[i] <- paste(">ASV", i, sep="_")
}

# Make and write out a fasta of the final ASV seqs
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "ASVs.fa")

# Taxonomy using assignTaxonomy (RDP naive Bayesian classifier) with the
# pre-specified database and confidence threshold from above
assignTax <- assignTaxonomy(seqtab.nochim, db_path,
                            minBoot = taxonomy_confidence_threshold,
                            multithread=TRUE, tryRC=TRUE, outputBootstraps=TRUE)
saveRDS(assignTax, "assignTax.rds")

# Count table: number of times each ASV is observed per sample
asv_tab <- t(seqtab.nochim)
colnames(asv_tab) <- sample.names
row.names(asv_tab) <- sub(">", "", asv_headers)

# Taxonomy tables: ASV and corresponding taxonomy, plus bootstrap confidences
asv_tax <- assignTax$tax
asv_tax_boot <- assignTax$boot
colnames(asv_tax_boot) <- c("kingdom.conf", "phylum.conf", "class.conf",
                            "order.conf", "family.conf", "genus.conf")

# Formatting
rownames(asv_tax) <- sub(">", "", asv_headers)
rownames(asv_tax_boot) <- sub(">", "", asv_headers)

# Merge taxonomy assignments and bootstrap values
tax_merged <- cbind(asv_tax, asv_tax_boot)

# Rearrange for viewing convenience
tax_merged_rearranged <- data.frame(tax_merged[, c("Kingdom", "kingdom.conf",
                                                   "Phylum", "phylum.conf",
                                                   "Class", "class.conf",
                                                   "Order", "order.conf",
                                                   "Family", "family.conf",
                                                   "Genus", "genus.conf")])

# Merge ASV counts and the rearranged taxonomy table
final_table <- data.frame(cbind(asv_tab, tax_merged_rearranged))
write.csv(final_table, "dada2ASVs.csv", quote=F)

# Append the proportion of sequences per sample assigned at the phylum level
# with greater than 50% and 80% confidence
final_table$phylum.conf <- as.numeric(as.character(final_table$phylum.conf))
fifty <- final_table[as.numeric(as.character(final_table$phylum.conf)) > 50, ]
eighty <- final_table[as.numeric(as.character(final_table$phylum.conf)) > 80, ]
percent_over_fifty <- colSums(fifty[1:length(sample.names)]) * 100 /
    colSums(final_table[1:length(sample.names)])
percent_over_eighty <- colSums(eighty[1:length(sample.names)]) * 100 /
    colSums(final_table[1:length(sample.names)])

track <- as.data.frame(track)
track$sample <- rownames(track)
track$percent_over_fifty <- percent_over_fifty[match(track$sample,
                                                     names(percent_over_fifty))]
track$percent_over_eighty <- percent_over_eighty[match(track$sample,
                                                       names(percent_over_eighty))]
track$sample <- NULL
write.csv(track, "trackSequences.csv", quote=F)
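# --- Optional read-retention summary (sketch; not part of the original script) ---
# Expresses each step recorded in `track` as a percentage of the input reads
# per sample, making it easy to spot samples that lose most of their reads at
# filtering, merging, or chimera removal.
steps <- c("filtered", "denoisedF", "denoisedR", "merged", "nonchim")
retention <- round(100 * as.matrix(track[, steps]) / track$input, 1)
retention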