#The following script is meant to take multiple output files from the QIIME script assign_taxonomy.py and
#consolidate them into a single file. Note that a slightly modified version of assign_taxonomy.py must have
#been used. The modified version includes "Percent ID" and "Coverage" (overalp between query and reference)
#for each BLAST hit. Those metrics are required for filtering in the script below.

setwd("/PATH/TO/WORKING/DIRECTORY") #enter path to R working directory where assign_taxonomy.py taxonomy files (.txt) are located

#ENTER FILTERING PARAMETERS
Min.Percent.ID <- 95.0 #minimum percent ID for a BLAST match to be retained
Min.Coverage <- 100 #minimum query-reference overlap for a BLAST match to be retained
Min.Reads.Per.Hit <- 1 #minimum reads (per sample) matching to a reference sequence needed to retain that match


#The following code will combine all assign_taxonomy.py output files into a single file ("BLAST Results - Raw Data.csv")
#and then consolidate identical reference hits into a single row for easier intrepretation ("BLAST Results - Consolidated Data.csv")

results <- list()
no.match.results <- list()
raw.data <- list()

for(f in list.files(pattern = "*.txt")){
  sample <- gsub("_tax_assignments.txt","",f)
  df <- read.csv(file=f, sep="\t", col.names=c("OTU","Identification","E-value","Percent_ID","Coverage","Reference_Process_ID"), header=TRUE)
  if(nrow(df) == 0){
    df <- data.frame("Sample" = sample,"Reads" = 0,"Phylum"= "failed","Class"= "failed","Order"= "failed","Family"= "failed","Genus"= "failed","Species"= "failed")
    results <- rbind(results,df)
  } else{
    df <- data.frame(df,do.call(rbind, strsplit(as.character(df$OTU),";",fixed=TRUE)))
    df <- df[,-1]
    colnames(df) <- c("Identification","E-value","Percent_ID","Coverage","Reference_Process_ID","OTU","Reads")
    df$Sample <- rep(sample,times=length(df[,1]))
    df$Reads <- as.numeric(as.character(df$Reads))
    no.match <- df[df$Identification == "No blast hit",]
    temp <- df
    temp[,2] <- as.numeric(as.character(temp[,2]))
    temp$Coverage <- as.numeric(as.character(temp$Coverage))
    temp$Percent_ID <- as.numeric(as.character(temp$Percent_ID))
    raw.data <- rbind(raw.data,temp)
    df <- df[df$Identification != "No blast hit",]
    df$Percent_ID <- as.numeric(as.character(df$Percent_ID))
    df$Coverage <- as.numeric(as.character(df$Coverage))
    df$Reads <- as.numeric(as.character(df$Reads))
    df <- df[df$Percent_ID >= Min.Percent.ID,]
    df <- df[df$Coverage >= Min.Coverage,]
    df <- df[df$Reads >=Min.Reads.Per.Hit,]
    if(nrow(df) == 0){
      df <- data.frame("Sample" = sample,"Reads" = 0,"Phylum"= "no match","Class"= "no match","Order"= "no match","Family"= "no match","Genus"= "no match","Species"= "no match")
      results <- rbind(results,df)
    }else{
      df2 <- aggregate(Reads ~ Reference_Process_ID, data=df, FUN=sum)
      df2$Sample <- rep(sample,times=length(df2[,1]))
      df2$Identification <- df$Identification[match(df2$Reference_Process_ID, df$Reference_Process_ID)]
      if(length(df2$Reference_Process_ID == 1) & df2$Reference_Process_ID[1] == "None"){
        df2$Identification <- "no match;no match;no match;no match;no match;no match"
        df2 <- data.frame(df2,do.call(rbind, strsplit(as.character(df2$Identification),";",fixed=TRUE)))
        if(length(df2) < 10){
          df2$temp <- ""
        }
        df2 <- df2[,c(3,2,5,6,7,8,9,10)]
        colnames(df2) <- c("Sample","Reads","Phylum","Class","Order","Family","Genus","Species")
        results <- rbind(results,df2)
        no.match.results <- rbind(no.match.results,no.match)
        } else{
          df2 <- data.frame(df2,do.call(rbind, strsplit(as.character(df2$Identification),";",fixed=TRUE)))
          if(length(df2) < 10){
            df2$temp <- ""
          }
          df2 <- df2[,c(3,2,5,6,7,8,9,10)]
          colnames(df2) <- c("Sample","Reads","Phylum","Class","Order","Family","Genus","Species")
          results <- rbind(results,df2)
          no.match.results <- rbind(no.match.results,no.match)
        }
    }
  }
}

#Replace "_" in species names with space
results$Species <- gsub("_"," ",results$Species,fixed=TRUE)

#Rearrange columns of raw data
raw.data <- raw.data[,c(8,6,7,1,2,3,4)]

filters <- data.frame("Min ID" = Min.Percent.ID, "Min Coverage" = Min.Coverage, "Min Reads per Hit" = Min.Reads.Per.Hit)
#output consolidated and raw data
write.csv(results, file = "BLAST Results - Consolidated Data.csv", row.names = F)
write.csv(raw.data, file = "BLAST Results - Raw Data.csv", row.names = F)