#The following script is meant to take multiple output files from the QIIME script assign_taxonomy.py and #consolidate them into a single file. Note that a slightly modified version of assign_taxonomy.py must have #been used. The modified version includes "Percent ID" and "Coverage" (overalp between query and reference) #for each BLAST hit. Those metrics are required for filtering in the script below. setwd("/PATH/TO/WORKING/DIRECTORY") #enter path to R working directory where assign_taxonomy.py taxonomy files (.txt) are located #ENTER FILTERING PARAMETERS Min.Percent.ID <- 95.0 #minimum percent ID for a BLAST match to be retained Min.Coverage <- 100 #minimum query-reference overlap for a BLAST match to be retained Min.Reads.Per.Hit <- 1 #minimum reads (per sample) matching to a reference sequence needed to retain that match #The following code will combine all assign_taxonomy.py output files into a single file ("BLAST Results - Raw Data.csv") #and then consolidate identical reference hits into a single row for easier intrepretation ("BLAST Results - Consolidated Data.csv") results <- list() no.match.results <- list() raw.data <- list() for(f in list.files(pattern = "*.txt")){ sample <- gsub("_tax_assignments.txt","",f) df <- read.csv(file=f, sep="\t", col.names=c("OTU","Identification","E-value","Percent_ID","Coverage","Reference_Process_ID"), header=TRUE) if(nrow(df) == 0){ df <- data.frame("Sample" = sample,"Reads" = 0,"Phylum"= "failed","Class"= "failed","Order"= "failed","Family"= "failed","Genus"= "failed","Species"= "failed") results <- rbind(results,df) } else{ df <- data.frame(df,do.call(rbind, strsplit(as.character(df$OTU),";",fixed=TRUE))) df <- df[,-1] colnames(df) <- c("Identification","E-value","Percent_ID","Coverage","Reference_Process_ID","OTU","Reads") df$Sample <- rep(sample,times=length(df[,1])) df$Reads <- as.numeric(as.character(df$Reads)) no.match <- df[df$Identification == "No blast hit",] temp <- df temp[,2] <- as.numeric(as.character(temp[,2])) temp$Coverage <- as.numeric(as.character(temp$Coverage)) temp$Percent_ID <- as.numeric(as.character(temp$Percent_ID)) raw.data <- rbind(raw.data,temp) df <- df[df$Identification != "No blast hit",] df$Percent_ID <- as.numeric(as.character(df$Percent_ID)) df$Coverage <- as.numeric(as.character(df$Coverage)) df$Reads <- as.numeric(as.character(df$Reads)) df <- df[df$Percent_ID >= Min.Percent.ID,] df <- df[df$Coverage >= Min.Coverage,] df <- df[df$Reads >=Min.Reads.Per.Hit,] if(nrow(df) == 0){ df <- data.frame("Sample" = sample,"Reads" = 0,"Phylum"= "no match","Class"= "no match","Order"= "no match","Family"= "no match","Genus"= "no match","Species"= "no match") results <- rbind(results,df) }else{ df2 <- aggregate(Reads ~ Reference_Process_ID, data=df, FUN=sum) df2$Sample <- rep(sample,times=length(df2[,1])) df2$Identification <- df$Identification[match(df2$Reference_Process_ID, df$Reference_Process_ID)] if(length(df2$Reference_Process_ID == 1) & df2$Reference_Process_ID[1] == "None"){ df2$Identification <- "no match;no match;no match;no match;no match;no match" df2 <- data.frame(df2,do.call(rbind, strsplit(as.character(df2$Identification),";",fixed=TRUE))) if(length(df2) < 10){ df2$temp <- "" } df2 <- df2[,c(3,2,5,6,7,8,9,10)] colnames(df2) <- c("Sample","Reads","Phylum","Class","Order","Family","Genus","Species") results <- rbind(results,df2) no.match.results <- rbind(no.match.results,no.match) } else{ df2 <- data.frame(df2,do.call(rbind, strsplit(as.character(df2$Identification),";",fixed=TRUE))) if(length(df2) < 10){ df2$temp <- "" } df2 <- df2[,c(3,2,5,6,7,8,9,10)] colnames(df2) <- c("Sample","Reads","Phylum","Class","Order","Family","Genus","Species") results <- rbind(results,df2) no.match.results <- rbind(no.match.results,no.match) } } } } #Replace "_" in species names with space results$Species <- gsub("_"," ",results$Species,fixed=TRUE) #Rearrange columns of raw data raw.data <- raw.data[,c(8,6,7,1,2,3,4)] filters <- data.frame("Min ID" = Min.Percent.ID, "Min Coverage" = Min.Coverage, "Min Reads per Hit" = Min.Reads.Per.Hit) #output consolidated and raw data write.csv(results, file = "BLAST Results - Consolidated Data.csv", row.names = F) write.csv(raw.data, file = "BLAST Results - Raw Data.csv", row.names = F)