# splittax: split a mothur taxonomy table into one column per rank
# version 2015
# Kim De Paepe
#
#### INPUT OF FUNCTION
# Taxonomy files generated by the mothur pipeline typically have a column
# containing the OTUs and one column with the taxonomy, separated by ";" and
# with the probability between "()", unless you specified PROBS=F.
#
#### WHAT THIS FUNCTION DOES
# The taxonomy string is split on the ";", "(" and ")" characters. Splitting
# leaves an empty token after every ")", so columns that are empty in every
# row are removed afterwards. Only base R is used, so no package has to be
# attached before calling the function.
#
#### ARGUMENTS OF FUNCTION
# x        data frame obtained by reading in the mothur taxonomy file without
#          a header (columns V1, V2, ...). Taxonomy files WITH probabilities
#          are expected.
# classify "seqs" when the file comes from classify.seqs (taxonomy in V2), or
#          "otu" when it comes from classify.otu (first row and the second
#          column are dropped, taxonomy is read from V3).
#
#### VALUE
# A data frame with 15 columns: id, then a name/probability pair for Kingdom,
# Phylum, Class, Order, Family, Genus and Species. When the input only goes
# down to genus level (13 columns after splitting), the two Species columns
# are filled with the literal string "NA".
splittax <- function(x, classify) {
  if (!is.character(classify) || length(classify) != 1L ||
        !classify %in% c("seqs", "otu")) {
    stop("`classify` must be either \"seqs\" or \"otu\".", call. = FALSE)
  }
  if (!is.data.frame(x)) {
    stop("`x` must be a data frame read from a mothur taxonomy file.",
         call. = FALSE)
  }

  rank_names <- c(
    "id",
    "Kingdom", "prob_k", "Phylum", "prob_p", "Class", "prob_c",
    "Order", "prob_o", "Family", "prob_f", "Genus", "prob_g",
    "Species", "prob_s"
  )

  if (classify == "otu") {
    # classify.otu output: remove the first row and the second column, which
    # leaves the taxonomy in V3.
    x <- x[-1, -2, drop = FALSE]
    taxonomy_col <- "V3"
  } else {
    taxonomy_col <- "V2"
  }
  if (!taxonomy_col %in% names(x)) {
    stop("`x` has no column named \"", taxonomy_col,
         "\"; read the taxonomy file without a header.", call. = FALSE)
  }

  # Split every taxonomy string on ';', '(' and ')'.
  pieces <- strsplit(as.character(x[[taxonomy_col]]), "[;()]")

  # Pad the token vectors with "" to a common width so they form a rectangular
  # character matrix (one row per input row).
  width <- max(c(0L, vapply(pieces, length, integer(1))))
  padded <- lapply(pieces, function(p) c(p, rep("", width - length(p))))
  tokens <- matrix(as.character(unlist(padded)),
                   nrow = length(pieces), ncol = width, byrow = TRUE)

  # Put the ids in front of the split taxonomy.
  out <- data.frame(
    id = x[[1]],
    as.data.frame(tokens, stringsAsFactors = FALSE),
    stringsAsFactors = FALSE
  )

  # Remove the columns that are empty in every row (the tokens after each ")").
  # na.rm = TRUE keeps a stray NA from turning the column mask into NA.
  keep <- colSums(out != "", na.rm = TRUE) != 0
  out <- out[, keep, drop = FALSE]

  # Silva only goes down to genus level; add the two Species columns so the
  # name vector below fits. The literal string "NA" (not a missing value) is
  # kept for backward compatibility with existing callers.
  if (ncol(out) == 13L) {
    out$x14 <- "NA"
    out$x15 <- "NA"
  }

  if (ncol(out) != length(rank_names)) {
    stop("Expected 13 or 15 columns after splitting the taxonomy, got ",
         ncol(out), ". Was the taxonomy file written with probabilities?",
         call. = FALSE)
  }

  colnames(out) <- rank_names
  out
}