## EggNOG script
# Download the raw data from:
# https://biomedcascz-my.sharepoint.com/:f:/g/personal/daniel_morais_biomed_cas_cz/EglnLwZvyGFCunCJav125m4BwqKlBACLtvQEmxEK2JBcVg?e=8m5UHt
# The current database was downloaded on 27.09.2019 and has the data fields (columns) as described below:
# 1       Entry
# 2       Entry name
# 3       Status
# 4       Protein names
# 5       Gene names
# 6       Organism
# 7       Length
# 8       Cross-reference (KO)
# 9       Taxonomic lineage (PHYLUM)
# 10      Taxonomic lineage (SPECIES) # This field carries current and old* taxonomic classifications.
# 11      Taxonomic lineage (GENUS)
# 12      Taxonomic lineage (KINGDOM)
# 13      Taxonomic lineage (SUPERKINGDOM)
# 14      Cross-reference (OrthoDB)
# 15      Cross-reference (eggNOG)
# *Details about the classification used in UNIPROT can be found at the link: https://www.uniprot.org/help/taxonomy

#Extract only Bacteria and Archaea from the database:
awk -F '\t' '$13 ~ /Bacteria|Archaea/ {print $0}' uniprot-database_\(type_eggnog\).tab > uniprot_eggnog_filt_ba.txt

####
# Checking
####

# Checking how many EggNOG accessions each entry has.
# Every EggNOG accession ends with a ";" sign, so we counted how many ";" signs each entry had, to know how many EggNOG IDs each entry had, using the following command.

cut -f 15 uniprot_eggnog_filt_ba.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort -n | uniq -c

#  183575 1
# 4467542 2
#   89612 3
#    8163 4
#     464 5
#     203 6
#      36 7
#      18 8
#       6 9
#       2 10
#       1 11

# Generate one table of UNIPROT entries containing the taxonomy information from the column "Organism" (field 6) and one table containing the taxonomic information from the "Taxonomic lineage" (fields 9,10 and 11).
paste <(cut -f 1,15 uniprot_eggnog_filt_ba.txt) <(cut -f 6 uniprot_eggnog_filt_ba.txt | cut -f 1,2 -d " ") | sort > uniprot_eggnog_ba_1_15_6.txt

paste <(cut -f 1,15 uniprot_eggnog_filt_ba.txt) <(cut -f 9,10 uniprot_eggnog_filt_ba.txt ) <(cut -f 11 uniprot_eggnog_filt_ba.txt | cut -f 1,2 -d " ") | sort > uniprot_eggnog_ba_1_15_9_10_11.txt

# This search will get the entries from 
grep -f uniq_genus_list.txt uniprot_eggnog_ba_1_15_6.txt > rep_genus_organism.txt 

grep -f uniq_genus_list.txt uniprot_eggnog_ba_1_15_9_10_11.txt > rep_genus_tax.txt

grep -f uniq_species_list.txt uniprot_eggnog_ba_1_15_6.txt > rep_species_organism.txt

grep -f uniq_species_list.txt uniprot_eggnog_ba_1_15_9_10_11.txt > rep_species_tax.txt

# How many EggNOG IDs each entry have?
cut -f 2 rep_species_tax.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq -c

#  13424 1
# 276875 2
#   6393 3
#    583 4
#      4 5
#     19 6

# This output is interpreted as, 13424 entries have only 1 EggNOG ID. 276875 entries have only 2 EggNOG IDs and so on.

cut -f 2 rep_species_organism.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq -c
  
#  13357 1
# 277575 2
#   6399 3
#    586 4
#      4 5
#     19 6
	 
cut -f 2 rep_genus_tax.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq -c

#  40795 1
# 762824 2
#  16626 3
#   1625 4
#     31 5
#     43 6
#      1 8
#      1 9

cut -f 2 rep_genus_organism.txt | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq -c
  
#  40078 1
# 736249 2
#  16098 3
#   1586 4
#     31 5
#     40 6
#      1 8
#      1 9

# Now we will organize the tables so each entry will have only one EggNOG ID. To do that, the follwing loop was used.

# This loop will go through all the files that had a hit between the genus/species list and the UNIPROT EggNOG database. It will then count how many EggNOG IDs were found in each entry and will isolate one EggNOG ID per entry, putting them all in in a single column, keeping track of the taxonomy information.

for FILE in rep* ; do MULT_EGGNOGS=`cut -f 2 ${FILE} | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq| tail -n 1` ; echo $FILE $MULT_EGGNOGS ; FILE_out=${FILE##rep_}; FILE_out=${FILE_out%%.txt} ; mkdir ${FILE_out} ; for NB in $(seq 1 ${MULT_EGGNOGS}) ; do paste <(cut -f 1 ${FILE}) <(cut -f 2 ${FILE} | cut -f ${NB} -d ";") <(cut -f 3- ${FILE}) > ${FILE_out}/${FILE_out}_${NB}.txt ; done ; cd ${FILE_out}; cat *.txt | cut -f 2- | sed 's/[][]//g' | sort | uniq -c | egrep "COG|KOG|ENOG" > concatenated_CLEAN_from_missing.txt ; cd ../ ; done


# The previous for loop in a multi line format.

#for FILE in rep*
# do MULT_EGGNOGS=`cut -f 2 ${FILE} | grep -n -o ";" | cut -f 1 -d ":" | uniq -c | awk '{print $1}' | sort | uniq | tail -n 1` # this step stores the maximum number of EggNOG IDs per entry
# echo $FILE $MULT_EGGNOGS # Here we print on the screen which file we are currently checking and the maximum n of EggNOG IDs 
# FILE_out=${FILE##rep_}
# FILE_out=${FILE_out%%.txt} # These two steps generate a variable with the name of the output file.
# mkdir ${FILE_out} # Create a directory for each file
# for NB in $(seq 1 ${MULT_EGGNOGS}) # Start of a loop to extract only one EggNOG ID per entry
#  do paste <(cut -f 1 ${FILE}) <(cut -f 2 ${FILE} | cut -f ${NB} -d ";") <(cut -f 3- ${FILE}) > ${FILE_out}/${FILE_out}_${NB}.txt # save all the entries using only one EggNOG ID
#  done
#  cd ${FILE_out} # Entering the folder to concatenate all the entries and remove lines without EggNOG ID. 
#  cat *.txt | cut -f 2- | sed 's/[][]//g' |sort | uniq -c | egrep "COG|KOG|ENOG" > concatenated_CLEAN_from_missing.txt
#  cd ../
#done

# Important note. In the loop approach described above, the second column (EggNOG ID) will be empty in some entries for the files 2-6 (species) or 2-9 (genus), because they will have no EggNOG ID. But this will be solved during concatenations step as the egrep "COG|KOG|ENOG" is being used to remove those lines without an EggNOG ID.

# To check how many EggNOG IDs each line have, this loop can be used:
# E.g. if the maximum number of EggNOG IDs per entry, in the current file is 9, use this:
# for i in {1..9} ; do paste <(echo "${i}") <(grep -c "OG" genus_organism_${i}.txt) ; done
#1       788856
#2       748884
#3       17631
#4       1653
#5       73
#6       42
#7       2
#8       2
#9       1

# The previous loop steps grouped the taxa by removing intra-species or intra-genus redundancy.
# The next steps will count how many EggNOG IDs were present per taxa.
# This loop requires that only the four folders generated from the output of the EggNOG IDs separation loop, are present in the current folder.

#
for FOLDERS in */ ; do ANNOT_SOURCE=${FOLDERS##*_} ; ANNOT_SOURCE=${ANNOT_SOURCE%%/}; TAX=${FOLDERS%%_*}; cd $FOLDERS ; mkdir number_of_EIDs ; cd number_of_EIDs/ ; if [ ${TAX} = "species" ] ; then awk '{print substr($0,index($0,$3))}' ../concatenated_CLEAN_from_missing.txt | sort | uniq -c | sort -nr > counts_per_species_${ANNOT_SOURCE}.txt ; elif [ ${TAX} = "genus" ] ; then awk '{print $2"\t"$3"\t"$4}' ../concatenated_CLEAN_from_missing.txt | sort | uniq | awk '{print $3"\t"$4}' | sort | uniq -c | sort -nr > counts_per_genus_${ANNOT_SOURCE}.txt ; else break ; fi; cd ../../ ; done

# Here we will count in how many taxa each EggNOG ID is present.

for FOLDERS in */ ; do ANNOT_SOURCE=${FOLDERS##*_} ; ANNOT_SOURCE=${ANNOT_SOURCE%%/}; cd $FOLDERS ; mkdir number_of_TAXA_per_EID ; cd number_of_TAXA_per_EID/ ; if [ ${TAX} = "species" ] ; then awk '{print $2}' ../concatenated_CLEAN_from_missing.txt | sort | uniq -c | sort -nr > ID_counts_per_species_${ANNOT_SOURCE}.txt ; elif [ ${TAX} = "genus" ] ; then awk '{print $2}' ../concatenated_CLEAN_from_missing.txt | sort | uniq -c | sort -nr > ID_counts_per_genus_${ANNOT_SOURCE}.txt ; else break ; fi; cd ../../ ; done

#That is it. All the counts are done.