#!/usr/local/bin/python3

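'''
    USAGE: calculate-loss-clam-diff.py [clam-diff-files] <output-file>
    Creates a JSON file with the loss statistics of the given clam-diff files.
    The JSON file has the following structure:

    {
        "<clam-diff file name with '.txt' removed>": {
            "<name captured from '<name>.*.json' in a comparison>": {
                "total_loss_percentage": [<ratio>, ...],
                "loss_percentage_1_gt_than_2": [<ratio>, ...],
                "loss_percentage_1_lt_than_2": [<ratio>, ...]
            }
        }
    }
'''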

import re
import json

# Captures the leading name of a "<name>.<word>.json" token; the name is made
# of word characters only, so a hyphenated prefix is not part of the capture.
regex_name = re.compile(r"\b(\w+)\.(?:main|\w+)\.json\b")

# Captures the brace-enclosed bodies of an "Invariant #1" / "Invariant #2"
# pair on consecutive lines. DOTALL lets a body span several lines.
regex_invariant_couple = re.compile(
    r'Invariant #1: \{(.*?)\}\n\s*Invariant #2: \{(.*?)\}',
    re.DOTALL,
)


def _split_invariants(invariant):
    """Return the set of whitespace-stripped ';'-separated entries."""
    return {entry.strip() for entry in invariant.split(";")}


def extract_stats(comparisions):
    """Compute per-name loss ratios from a list of comparison chunks.

    Each chunk is searched with ``regex_name`` for a "<name>.<word>.json"
    token; the captured name becomes the key of the result. Every
    "Invariant #1 / Invariant #2" pair found in the chunk with
    ``regex_invariant_couple`` contributes one ratio:

        len(larger_set - smaller_set) / len(smaller_set)

    where each set holds the ';'-separated entries of one invariant.

    Args:
        comparisions: iterable of strings, one per comparison.

    Returns:
        dict mapping each name to a dict with three lists of ratios:
        "total_loss_percentage" (every pair),
        "loss_percentage_1_gt_than_2" (invariant #1 has strictly more
        entries) and "loss_percentage_1_lt_than_2" (all other pairs,
        including pairs of equal size).

    Chunks without a recognisable name are reported on stdout and skipped.
    A pair whose two sets are identical is reported and skipped on its own;
    the remaining pairs of the same chunk are still processed.
    """
    stats = {}
    for comparision in comparisions:
        match_name = regex_name.search(comparision)
        if match_name is None:
            # Best effort: report the unparsable chunk and keep going.
            print("EXCEPITION RAISED: "
                  "ERROR: Could not find file name in first line")
            continue

        file_name = match_name.group(1)
        print("File name:", file_name)
        # NOTE(review): regex_name cannot capture a '-', so a name such as
        # "qsort-exam" is presumably seen here as "exam" -- confirm.
        if file_name == "exam":
            file_name = "qsort-exam"

        file_stats = stats.setdefault(file_name, {
            "total_loss_percentage": [],
            "loss_percentage_1_gt_than_2": [],
            "loss_percentage_1_lt_than_2": [],
        })

        matches = regex_invariant_couple.findall(comparision)
        for invariant1, invariant2 in matches:
            invariant1_set = _split_invariants(invariant1)
            invariant2_set = _split_invariants(invariant2)

            # Explicit check instead of `assert`: it must not vanish under
            # `python -O`, and one bad pair must not discard the others.
            if invariant1_set == invariant2_set:
                print("WARNING: invariant pair with identical sets skipped")
                continue

            len1 = len(invariant1_set)
            len2 = len(invariant2_set)
            first_is_larger = len1 > len2
            if first_is_larger:
                diff_set = invariant1_set - invariant2_set
            else:
                diff_set = invariant2_set - invariant1_set

            max_len = max(len1, len2)
            # str.split always yields at least one entry, so min_len >= 1
            # and the division below cannot fail.
            min_len = min(len1, len2)
            print(len(diff_set), min_len, max_len, len2, len1)
            loss_percentage = len(diff_set) / min_len
            print("Loss percentage:", loss_percentage)

            file_stats["total_loss_percentage"].append(loss_percentage)
            if first_is_larger:
                file_stats["loss_percentage_1_gt_than_2"].append(
                    loss_percentage)
            else:
                file_stats["loss_percentage_1_lt_than_2"].append(
                    loss_percentage)

        print("Number of matches:", len(matches))
    return stats


def analyze_stats(file_name, stats):
    """Parse one clam-diff report and store its statistics in ``stats``.

    The file is split on the marker "Comparing precision of "; the text
    before the first marker is discarded and the remaining chunks are
    handed to ``extract_stats``.

    Args:
        file_name: path of the clam-diff report to read.
        stats: dict updated in place; the result is stored under
            ``file_name`` with every ".txt" occurrence removed.

    Returns:
        None.
    """
    # `with` closes the handle even if reading fails.
    with open(file_name, "r") as report:
        contents = report.read()

    # split() always returns at least one element; the first one is the
    # text before any "Comparing precision of " marker and is not needed.
    comparisions = contents.split("Comparing precision of ")[1:]

    # Note: this removes ".txt" anywhere in the path, not only at the end.
    file_name = file_name.replace(".txt", "")
    stats[file_name] = extract_stats(comparisions)


if __name__ == "__main__":
    import sys

    # At least one input report plus the output path are required.
    if len(sys.argv) < 3:
        # Report the name the script was actually invoked with.
        print("USAGE:", sys.argv[0], "[clam-diff-files] <output-file>")
        sys.exit(1)

    # Every argument but the last is an input report; the last is the output.
    stats = {}
    for file_name in sys.argv[1:-1]:
        analyze_stats(file_name, stats)

    # `with` makes sure the JSON is flushed and the file closed on exit.
    with open(sys.argv[-1], "w") as output_file:
        json.dump(stats, output_file, indent=4, sort_keys=True)
