## KEGG analysis script
# Download the raw data from:
# https://biomedcascz-my.sharepoint.com/:f:/g/personal/daniel_morais_biomed_cas_cz/EglnLwZvyGFCunCJav125m4BwqKlBACLtvQEmxEK2JBcVg?e=8m5UHt
# The current database was downloaded on 27.09.2019 and has the data fields (columns) as described below:
# 1       Entry
# 2       Entry name
# 3       Status
# 4       Protein names
# 5       Gene names
# 6       Organism
# 7       Length
# 8       Cross-reference (KO)
# 9       Taxonomic lineage (PHYLUM)
# 10      Taxonomic lineage (SPECIES) # This field carries current and old* taxonomic classifications.
# 11      Taxonomic lineage (GENUS)
# 12      Taxonomic lineage (KINGDOM)
# 13      Taxonomic lineage (SUPERKINGDOM)
# 14      Cross-reference (OrthoDB)
# 15      Cross-reference (eggNOG)
# *Details about the classification used in UNIPROT can be found at the link: https://www.uniprot.org/help/taxonomy

#Extract only Bacteria and Archaea from the database:
awk -F '\t' '$13 ~ /Bacteria|Archaea/ {print $0}' uniprot-database_\(type_ko\).tab > uniprot_kegg_filt_ba.txt

####
# Checking
####

# Checking how many KEGG accessions each entry has.
# Every KEGG accession ends with a ";" sign, so we counted how many ";" signs each entry had, to know how many KEGG IDs each entry had, using the following command.

cut -f 8 uniprot_kegg_filt_ba.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort -n | uniq -c

#  6503464 1
#    23375 2
#     4232 3

# Generate one table of UNIPROT entries containing the taxonomy information from the column "Organism" (field 6) and one table containing the taxonomic information from the "Taxonomic lineage" (fields 9,10 and 11).

paste <(cut -f 1,8 uniprot_kegg_filt_ba.txt) <(cut -f 6 uniprot_kegg_filt_ba.txt | cut -f 1,2 -d " ") | sort > uniprot_kegg_ba_1_8_6.txt

paste <(cut -f 1,8 uniprot_kegg_filt_ba.txt) <(cut -f 9,10 uniprot_kegg_filt_ba.txt ) <(cut -f 11 uniprot_kegg_filt_ba.txt | cut -f 1,2 -d " ") | sort > uniprot_kegg_ba_1_8_9_10_11.txt

# This search will get the entries from 
grep -w -f uniq_genus_list.txt uniprot_kegg_ba_1_8_6.txt > rep_genus_organism.txt 

grep -w -f uniq_genus_list.txt uniprot_kegg_ba_1_8_9_10_11.txt > rep_genus_tax.txt

grep -w -f uniq_species_list.txt uniprot_kegg_ba_1_8_6.txt > rep_species_organism.txt

grep -w -f uniq_species_list.txt uniprot_kegg_ba_1_8_9_10_11.txt > rep_species_tax.txt

# How many KEGG IDs each entry have?

cut -f 2 rep_species_tax.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq -c
#  243773 1
#    1011 2
#     283 3

# This output is interpreted as, 13424 entries have only 1 KEGG ID. 276875 entries have only 2 KEGG IDs and so on.

cut -f 2 rep_species_organism.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq -c
#  243165 1
#    1003 2
#     283 3
	 
cut -f 2 rep_genus_tax.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq -c
#  813948 1
#    4146 2
#    1447 3

cut -f 2 rep_genus_organism.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq -c
#  771330 1
#    3976 2
#    1408 3

# Now we will organize the tables so each entry will have only one KEGG ID. To do that, the follwing loop was used.

# This loop will go through all the files that had a hit between the genus/species list and the UNIPROT KEGG database. It will then count how many KEGG IDs were found in each entry and will isolate one KEGG ID per entry, putting them all in in a single column, keeping track of the taxonomy information.

for FILE in rep* ; do MULT_KEGGS=`cut -f 2 ${FILE} | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq| tail -n 1` ; echo $FILE $MULT_KEGGS ; FILE_out=${FILE##rep_}; FILE_out=${FILE_out%%.txt} ; mkdir ${FILE_out} ; for NB in $(seq 1 ${MULT_KEGGS}) ; do paste <(cut -f 1 ${FILE}) <(cut -f 2 ${FILE} | cut -f ${NB} -d ";") <(cut -f 3- ${FILE}) > ${FILE_out}/${FILE_out}_${NB}.txt ; done ; cd ${FILE_out}; cat *.txt | cut -f 2- | sed 's/[][]//g' | sort | uniq -c | awk '$2 ~ /K[0-9]/{print $0}' > concatenated_CLEAN_from_missing.txt ; cd ../ ; done


# The previous for loop in a multi line format.

#for FILE in rep*
# do 
# MULT_KEGGS=`cut -f 2 ${FILE} | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq | tail -n 1` # this step stores the maximum number of KEGG IDs per entry
# echo $FILE $MULT_KEGGS # Here we print on the screen which file we are currently checking and the maximum n of KEGG IDs 
# FILE_out=${FILE##rep_}
# FILE_out=${FILE_out%%.txt} # These two steps generate a variable with the name of the output file.
# mkdir ${FILE_out} # Create a directory for each file
# for NB in $(seq 1 ${MULT_KEGGS}) # Start of a loop to extract only one KEGG ID per entry
#  do paste <(cut -f 1 ${FILE}) <(cut -f 2 ${FILE} | cut -f ${NB} -d ";") <(cut -f 3- ${FILE}) > ${FILE_out}/${FILE_out}_${NB}.txt # save all the entries using only one of the KEGG IDs.
#  done
#  cd ${FILE_out} # Entering the folder to concatenate all the entries and remove lines without KEGG ID. 
#  cat *.txt | cut -f 2- | sed 's/[][]//g' |sort | uniq -c | awk '$2 ~ /K[0-9]/{print $0}' > concatenated_CLEAN_from_missing.txt
#  cd ../
#done

# Important note. In the loop approach described above, the second column (KEGG ID) will be empty in some entries for the files "_2.txt" and "_3.txt" (species) or (genus), because they will have no KEGG ID in this position. But this will be solved during concatenations step as the awk '$2 ~ /K[0-9]/{print $0}' is being used to remove those lines without a KEGG ID.

# To check how many KEGG IDs each line have, this loop can be used:
# E.g. if the maximum number of KEGG IDs per entry, in the current file is 3, use this:
for i in {1..3} ; do paste <(echo "${i}") <(egrep -c "K[0-9]{5}" genus_organism_${i}.txt) ; done
#1       786462
#2       5441
#3       1423

# The previous loop steps grouped the taxa by removing intra-species or intra-genus redundancy.
# The next steps will count how many KEGG IDs were present per taxa.
# This loop requires that no folder other than the four ones generated by the KEGG IDs separation loop are present in the current folder.

for FOLDERS in */ ; do ANNOT_SOURCE=${FOLDERS##*_} ; ANNOT_SOURCE=${ANNOT_SOURCE%%/}; TAX=${FOLDERS%%_*}; cd $FOLDERS ; mkdir number_of_EIDs ; cd number_of_EIDs/ ; if [ ${TAX} = "species" ] ; then awk '{print substr($0,index($0,$3))}' ../concatenated_CLEAN_from_missing.txt | sort | uniq -c | sort -nr > counts_per_species_${ANNOT_SOURCE}.txt ; elif [ ${TAX} = "genus" ] ; then awk '{$1=""; print $0}' concatenated_CLEAN_from_missing.txt | sed 's/^ //' | cut -f 1-3 -d " " | sort | uniq | cut -f 2,3 -d " "| sort | uniq -c | sort -nr > counts_per_genus_${ANNOT_SOURCE}.txt ; else break ; fi; cd ../../ ; done

# Previous loop in multi line format.
# for FOLDERS in */ 
#   do 
# ANNOT_SOURCE=${FOLDERS##*_}
#   ANNOT_SOURCE=${ANNOT_SOURCE%%/}
#   TAX=${FOLDERS%%_*}
#   cd $FOLDERS 
#   mkdir number_of_EIDs 
#   cd number_of_EIDs/ 
#     if [ ${TAX} = "species" ] 
#   then awk '{print substr($0,index($0,$3))}' ../concatenated_CLEAN_from_missing.txt | sort | uniq -c | sort -nr > counts_per_species_${ANNOT_SOURCE}.txt 
#   elif [ ${TAX} = "genus" ] 
#	then awk '{$1=""; print $0}' ../concatenated_CLEAN_from_missing.txt | sed 's/^ //' | cut -f 1-3 -d " " | sort | uniq | cut -f 2,3 -d " "| sort | uniq -c | sort -nr > counts_per_genus_${ANNOT_SOURCE}.txt
#   else break 
#   fi 
#   cd ../../ 
#     done

# Here we will count in how many taxa (genus and species) each KEGG ID is present.

for FOLDERS in */ ; do ANNOT_SOURCE=${FOLDERS##*_} ; ANNOT_SOURCE=${ANNOT_SOURCE%%/}; TAX=${FOLDERS%%_*}; cd $FOLDERS ; mkdir number_of_TAXA_per_EID ; cd number_of_TAXA_per_EID/ ; if [ ${TAX} = "species" ] && [ ${ANNOT_SOURCE} = "organism" ] ; then awk '{print $2,$3,$4}' ../concatenated_CLEAN_from_missing.txt | sort -n | uniq | cut -f 1 -d " " | sort | uniq -c > ID_counts_per_species_${ANNOT_SOURCE}.txt ; elif [ ${TAX} = "species" ] && [ ${ANNOT_SOURCE} = "tax" ] ; then awk '{print $2,$3,$4,$5}' ../concatenated_CLEAN_from_missing.txt | sort -n | uniq | cut -f 1 -d " " | sort | uniq -c > ID_counts_per_species_${ANNOT_SOURCE}.txt ; elif [ ${TAX} = "genus" ] && [ ${ANNOT_SOURCE} = "organism" ] ; then awk '{print $2,$3}' ../concatenated_CLEAN_from_missing.txt | sort -n | uniq | cut -f 1 -d " " | sort | uniq -c > ID_counts_per_genus_${ANNOT_SOURCE}.txt ; elif [ ${TAX} = "genus" ] && [ ${ANNOT_SOURCE} = "tax" ] ; then awk '{print $2,$3,$4}' ../concatenated_CLEAN_from_missing.txt | sort -n | uniq | cut -f 1 -d " " | sort | uniq -c > ID_counts_per_genus_${ANNOT_SOURCE}.txt ; else break ; fi; cd ../../ ; done

# Previous loop in multi line format.
# for FOLDERS in */ 
#  do 
#  ANNOT_SOURCE=${FOLDERS##*_}
#  ANNOT_SOURCE=${ANNOT_SOURCE%%/}
#  TAX=${FOLDERS%%_*}
#  cd $FOLDERS 
#  mkdir number_of_TAXA_per_EID 
#  cd number_of_TAXA_per_EID/
#  if [ ${TAX} = "species" ] && [ ${ANNOT_SOURCE} = "organism" ] 
#   then awk '{print $2,$3,$4}' ../concatenated_CLEAN_from_missing.txt | sort -n | uniq | cut -f 1 -d " " | sort | uniq -c > ID_counts_per_species_${ANNOT_SOURCE}.txt 
#  elif [ ${TAX} = "species" ] && [ ${ANNOT_SOURCE} = "tax" ]
#   then awk '{print $2,$3,$4,$5}' ../concatenated_CLEAN_from_missing.txt | sort -n | uniq | cut -f 1 -d " " | sort | uniq -c > ID_counts_per_species_${ANNOT_SOURCE}.txt
#  elif [ ${TAX} = "genus" ] && [ ${ANNOT_SOURCE} = "organism" ] 
#   then awk '{print $2,$3}' ../concatenated_CLEAN_from_missing.txt | sort -n | uniq | cut -f 1 -d " " | sort | uniq -c > ID_counts_per_genus_${ANNOT_SOURCE}.txt 
#  elif [ ${TAX} = "genus" ] && [ ${ANNOT_SOURCE} = "tax" ] 
#   then awk '{print $2,$3,$4}' ../concatenated_CLEAN_from_missing.txt | sort -n | uniq | cut -f 1 -d " " | sort | uniq -c > ID_counts_per_genus_${ANNOT_SOURCE}.txt 
#  else break 
#  fi
#  cd ../../ 
# done

# That is it. All the counts are done.