#splittaxfunction to split the taxonomy files on multiple characters using regex
#version 2015
#Kim De Paepe

#### INPUT OF FUNCTION
#taxonomy files generated by the mothur pipeline typically have a column containing the OTUs and one 
#column with the taxonomy, separated by ; and with the probability between (), unless you specified PROBS=F

#### WHAT THIS FUNCTION DOES
# this function splits the columns on the ;,( and ) characters, but introduces an empty column after the )
# for this reason empty columns are removed (colSums statement)

#### ARGUMENTS OF FUNCTION
# taxonomy files with probabilities are expected as input
# x needs to be a dataframe/table generated from reading in the mothur taxonomy file
# both the function classify.seqs and classify.otu can be used in mothur and it should be specified in the function which one you use

splittax <- function(x,classify){
  
  if(classify == "seqs"){  
    # use a regex to split the string on multiple characters
    y <- stri_split_regex(x$V2,"\\;|\\(|\\)", simplify = TRUE)
    # add the first column with the id's of the sample to the dateframe with the taxonomy
    x <- cbind(x[,1],as.data.frame(y))
    
    # remove the empty columns
    x <- x[,colSums(x != "") != 0] 
    # silva only goes till genus level so we need to add empty columns in that case, since it will try to give names in next step
    # and name vector will be to long if not adapted
    if (ncol(x)==13){ 
      x <- mutate(x,x14="NA",x15="NA")
    }
    # add column names
    colnames(x) <-c("id","Kingdom","prob_k","Phylum","prob_p","Class","prob_c","Order","prob_o","Family","prob_f","Genus","prob_g","Species","prob_s")
  }
  
  if(classify == "otu"){
    # remove first row and second column
    x <- x[-1,-2]
    # use a regex to split the string on multiple characters (note that because we deleted V2, second column, we now need to split on V3 instead)
    y <- stri_split_regex(x$V3,"\\;|\\(|\\)", simplify = TRUE)
    # add the first column with the id's of the sample to the dateframe with the taxonomy
    x <- cbind(x[,1],as.data.frame(y))
    # remove the empty columns
    x <- x[,colSums(x != "") != 0]
    # silva only goes till genus level so we need to add empty columns in that case, since it will try to give names in next step
    # and name vector will be to long if not adapted
    if (ncol(x)==13){ 
      x <- mutate(x,x14="NA",x15="NA")
    }
    # add column names
    colnames(x) <-c("id","Kingdom","prob_k","Phylum","prob_p","Class","prob_c","Order","prob_o","Family","prob_f","Genus","prob_g","Species","prob_s")
    
  }
  
  
  return(x)
}