# load the libraries necessary to process sequence data and query the NCBI database
library(taxize)
library(seqinr)

# set this variable to the name of your .loci file (without the extension)
samp <- 'sampleName'

# set your working directory
setwd('~/path/to/your/working/directory')

# read the .loci file into R as a character vector in which each element is one line of the
# .loci file. The .loci file contains the sequences of every individual that amplified at a
# given locus, with loci separated by a consistent line-break string. The number of sequences
# at each locus is not consistent.
tmp <- readLines(paste0('./', samp, '.loci'))

# the '//' string is used by the author of pyRAD as part of the string that denotes breaks
# between loci in the .loci file. We identify the elements of 'tmp' that contain this string
# so that we can find the breaks between loci
tmp.breakLines <- grep('//', tmp, fixed = TRUE)

# move one line up from each break to select one sequence from each locus for identification
take <- c(tmp.breakLines - 1)

# use the above vector to pull one sequence per locus. Each element is a string with the
# sample name followed by the DNA sequence of the locus
seqs <- tmp[take]

# remove the sample names, keeping only the sequence portion of each string
seqList <- lapply(seqs, function(sq){
  x <- strsplit(sq, " ")
  return(x[[1]][length(x[[1]])])
})

# write out a fasta file that can be passed to the blastn program
write.fasta(seqList, paste0(samp, "_", 1:length(seqList)), paste0("./", samp, ".fa"))

##########################
# to blast the sequences #
##########################

# blastn -task dc-megablast -db nt -query [your header].fa -outfmt "6 qseqid sseqid staxids sskingdoms pident mismatch" -out [your header]_out.txt -num_threads 2 -max_target_seqs 20

#######################
# after the blast run #
#######################

# set the threshold percentage similarity between your sequence and its NCBI match
threshold <- 90

# read in the blast results; tabular blast output (-outfmt 6) is tab-delimited
blast.results <- read.table(paste0("./", samp, "_out.txt"), sep = "\t")

# get all of the NCBI taxon ids from the blast results
ids <- strsplit(as.character(blast.results$V3), ";")

# some hits carry more than one taxon id, separated by semicolons. We take the first id in such cases
ids <- as.numeric(unlist(lapply(ids, function(c){return(c[1])})))

# keep each id only once
ids <- unique(ids)

# use the taxize package to get the verbal classification for each id
# make a directory to hold the verbal classifications
dir.create(paste0("./", samp))

# loop over each id number and write its classification to a file for later use. This is the
# most time-consuming step, so we save time by writing the verbal classifications to files and
# querying that smaller local database rather than calling the 'classification' function for
# every separate id in the blast results. Please note that the 'classification' function
# occasionally times out, so the loop may need to be re-started if you see the following error
# message: 'Error in curl::curl_fetch_memory(url, handle = handle).' If that happens, type 'i'
# into the R terminal, edit the '1' in 'for(i in 1:length(ids))' below to match the value of i,
# and run the for loop again.
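# OPTIONAL: instead of restarting the loop below by hand after a timeout, each query can be
# wrapped in tryCatch() so that failed ids are recorded and re-queried later. This is a minimal
# sketch, not part of the original workflow; the object name 'failed.ids' is introduced here
# only for illustration.
# failed.ids <- c()
# for(i in 1:length(ids)){
#   br.classified <- tryCatch(classification(ids[i], db = "ncbi"), error = function(e) NULL)
#   if(is.null(br.classified)){
#     failed.ids <- c(failed.ids, ids[i])   # remember ids whose query failed
#   }else{
#     write.table(br.classified[[1]], paste0('./', samp, '/', ids[i], '.txt'))
#   }
# }
# # re-run the loop over any ids left in 'failed.ids' until it is empty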
for(i in 1:length(ids)){
  br.classified <- classification(ids[i], db = "ncbi")
  write.table(br.classified[[1]], paste0('./', samp, '/', ids[i], '.txt'))
}

# remove all results that have a lower percent similarity than your threshold
blast.thresh <- blast.results[blast.results$V5 > threshold, ]

# each locus can have more than one hit, so make a vector of the unique locus names
locus.IDs <- unique(blast.thresh$V1)

# getTaxa is a function that returns the most likely taxonomic identity (kingdom and phylum) of one locus
getTaxa <- function(id){
  # get the results for a single locus
  ind.tmp <- blast.thresh[blast.thresh$V1 == id, ]
  # put the highest percent similarity hit at the top
  ind.tmp <- ind.tmp[rev(order(ind.tmp$V5)), ]
  # remove hits that don't have a kingdom affiliation
  ind.tmp <- ind.tmp[ind.tmp$V4 != "N/A", ]
  # see if there are multiple kingdoms listed for the top hit(s)
  ind.max <- ind.tmp[ind.tmp$V5 == max(ind.tmp$V5), ]
  # set up a vector to hold the classification results
  taxon <- rep(NA, 3)
  # put the locus name in the first element of the vector
  taxon[1] <- as.character(id)
  # find the number of kingdoms that match the top hit
  nKing <- length(unique(ind.max$V4))
  # if there is more than one kingdom, report 'multiple' in the second element of the result vector
  if(nKing > 1){
    taxon[2] <- "multiple"
  # if there is one kingdom, fill the second element of the result vector with the kingdom name
  # and the third element with the phylum
  }else if(nKing == 1){
    taxon[2] <- as.character(unique(ind.max$V4))
    # get the taxon id number(s) to find the phylum
    phyla <- as.character(unique(ind.max$V3))
    phyla <- strsplit(as.character(phyla), ";")
    phyla <- as.numeric(unlist(lapply(phyla, function(c){return(c[1])})))
    # query the verbal classification files to find whether there are multiple phyla, no phylum
    # name, or one phylum name, and fill the third element of the vector with this information
    phyNames <- c()
    for(i in 1:length(phyla)){
      phy.tmp <- read.table(paste0('./', samp, '/', phyla[i], '.txt'))
      if(sum(phy.tmp[, 2] == 'phylum') > 0){
        phyNames[i] <- as.matrix(phy.tmp[phy.tmp[, 2] == 'phylum', ])[1, 1]
      }else{
        phyNames[i] <- 'no phylum'
      }
    }
    if(length(unique(phyNames)) > 1){
      taxon[3] <- "multiple"
    }else{
      taxon[3] <- unique(phyNames)
    }
  }
  # return the filled vector of locus name, kingdom, and phylum
  return(taxon)
}

# loop over each locus and store the resulting vector in a list
taxList <- list()
for(i in 1:length(locus.IDs)){
  taxList[[i]] <- getTaxa(locus.IDs[i])
}

# make the list into a matrix
tax <- do.call(rbind, taxList)

# write out the matrix into your directory
write.table(tax, paste0("./", samp, "_tax.txt"))
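# OPTIONAL: a quick way to inspect the result. This sketch (not part of the original workflow)
# tabulates the kingdom assignments in the second column of 'tax' and the phylum assignments in
# the third column, which can help flag loci that look like contaminants.
# table(tax[, 2])             # counts of loci per kingdom (or 'multiple')
# table(tax[, 2], tax[, 3])   # kingdom x phylum cross-tabulation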