import csv
import math
from os import listdir

import numpy as np
import pandas as pd
import pylab
import scipy.stats as st
import statsmodels.api as sm
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from tika import parser  # pip install tika


wnl = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

positive_report_paths = ['positive_result_reports/{}'.format(f) for f in listdir('positive_result_reports')]
negative_report_paths = ['negative_result_reports/{}'.format(f) for f in listdir('negative_result_reports')]

def remove_ds_store_path(path_array):
    # Filter rather than remove in place: deleting items from a list while
    # iterating over it skips elements.
    return [path for path in path_array if '.DS_Store' not in path]

positive_report_paths = remove_ds_store_path(positive_report_paths)
negative_report_paths = remove_ds_store_path(negative_report_paths)
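
# Assumes the reports (PDFs or other Tika-readable files) live in
# ./positive_result_reports and ./negative_result_reports alongside this script.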

def compress_space(document):
    # Collapse every run of whitespace (spaces, tabs, newlines) into a single
    # space so downstream tokenization sees clean word boundaries.
    return ' '.join(document.split())
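
# Quick sanity check of compress_space (illustrative input, safe to remove):
assert compress_space('foo\n\tbar  baz') == 'foo bar baz'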

def load_raw_text(filepath):
    # Extract the document text with Tika and collapse its whitespace.
    raw = parser.from_file(filepath)
    content = raw['content'] or ''  # Tika returns None when extraction fails
    return compress_space(content)
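
# Note: tika's parser.from_file sends the file to a local Tika server that the
# tika package starts on first use, so a Java runtime must be available.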

positive_report_text = [load_raw_text(path) for path in positive_report_paths]
negative_report_text = [load_raw_text(path) for path in negative_report_paths]

# Use a set for O(1) lookups, and avoid shadowing the nltk.corpus.stopwords module.
stop_words = set(stopwords.words('english'))

def lower(entry):
    res = entry.lower()
    return res


def trim_space(document):
    # Replace newlines with spaces (deleting them outright could glue words
    # together), then strip leading/trailing whitespace and collapse any
    # remaining runs of spaces.
    document = document.replace('\n', ' ')
    return ' '.join(document.split())


def normalize(document):
    res = trim_space(document)
    res = lower(res)
    return res
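
# Quick sanity check of the normalization pipeline (illustrative input, safe to remove):
assert normalize('  Foo\nBar  ') == 'foo bar'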

positive_normalized_corpus = [normalize(entry) for entry in positive_report_text]
negative_normalized_corpus = [normalize(entry) for entry in negative_report_text]

def lemmatize(document_array):
    res = [wnl.lemmatize(term) for term in document_array]
    return res
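
# e.g. lemmatize(['viruses', 'reports']) should yield ['virus', 'report']
# (WordNetLemmatizer defaults to the noun part of speech).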

def filtered_term_array_gen(collection): # Creates one term array per document, lemmatized, minus stopwords
    term_array_converted_documents = []
    for document in collection:
        doc_split = tokenizer.tokenize(document)
        lemmatized = lemmatize(doc_split)
        filtered = [word for word in lemmatized if word not in stop_words]
        term_array_converted_documents.append(filtered)
    return term_array_converted_documents
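
# e.g. filtered_term_array_gen(['the viruses spread']) should yield
# [['virus', 'spread']]: 'the' is a stopword and 'viruses' lemmatizes to 'virus'.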

positive_filtered_term_arrays = filtered_term_array_gen(positive_normalized_corpus)
negative_filtered_term_arrays = filtered_term_array_gen(negative_normalized_corpus)

term_p_values = {}
term_dist_values_positive = {}
term_dist_values_negative = {}

def algorithm1(positive_filtered_term_arrays, negative_filtered_term_arrays):

    # Score every distinct term that appears in the positive corpus.
    universal_terms = []
    for subarray in positive_filtered_term_arrays:
        for term in subarray:
            universal_terms.append(term)

    universal_terms = list(set(universal_terms))

    term_scores_for_correlation_positive = {}

    for term in universal_terms:

        term_count_positive = sum([subarray.count(term) for subarray in positive_filtered_term_arrays])
        term_count_negative = sum([subarray.count(term) for subarray in negative_filtered_term_arrays])

        # Share of the term's total occurrences that fall in the positive corpus.
        # The denominator is >= 1 because every term comes from the positive corpus.
        positive_occurrence_proportion_normalized = (term_count_positive / (term_count_positive + term_count_negative))

        # Fraction of documents in each corpus containing the term at least once.
        distribution_in_positive = ([term in entry for entry in positive_filtered_term_arrays].count(True) / len(positive_filtered_term_arrays))
        distribution_in_negative = ([term in entry for entry in negative_filtered_term_arrays].count(True) / len(negative_filtered_term_arrays))

        term_p_values[term] = positive_occurrence_proportion_normalized
        term_dist_values_positive[term] = distribution_in_positive
        term_dist_values_negative[term] = distribution_in_negative

        # Final score a(t): mean of the proportion and distribution components.
        final_score = (positive_occurrence_proportion_normalized + distribution_in_positive) / 2

        term_scores_for_correlation_positive[term] = final_score

        print('#########\n')
        print(term)
        print('Proportion value: {}'.format(positive_occurrence_proportion_normalized))
        print('Distribution value Cp: {}'.format(distribution_in_positive))
        print('Final score: {}'.format(final_score))
        print('#########\n')

    # Sort terms by score, highest first.
    term_scores_for_correlation_positive = dict(sorted(term_scores_for_correlation_positive.items(), key = lambda kv : kv[1], reverse = True))

    return term_scores_for_correlation_positive
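
# Worked example with toy data (hypothetical, for illustration only):
# positive docs [['fever', 'cough'], ['fever']], negative docs [['cough']].
# 'fever': proportion = 2/(2+0) = 1.0, distribution Cp = 2/2 = 1.0 -> score 1.0
# 'cough': proportion = 1/(1+1) = 0.5, distribution Cp = 1/2 = 0.5 -> score 0.5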


def tf_idf(target_term_array,target_corpus):

    score_mapping = {}

    terms_unique = list(set(target_term_array))
    for term in terms_unique:
        count = target_term_array.count(term)  # raw term frequency in this document
        corpus_length = len(target_corpus)
        membership_length = len([doc for doc in target_corpus if term in doc])  # document frequency
        idf = math.log(corpus_length/membership_length)  # inverse document frequency
        score = count * idf
        score_mapping[term] = score

    # Sort terms by tf-idf score, highest first.
    score_mapping = dict(sorted(score_mapping.items(), key = lambda kv: kv[1], reverse = True))

    return score_mapping
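
# Worked example (hypothetical): with corpus [['a', 'b'], ['a']] and document
# ['a', 'b']: 'a' occurs in 2/2 docs -> idf = log(1) = 0, so score = 0;
# 'b' occurs in 1/2 docs -> idf = log(2) ~= 0.693, so score = 1 * 0.693 ~= 0.693.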


positive_documents_score_mappings_array = []

for entry in positive_filtered_term_arrays:
    tf_idf_result = tf_idf(entry,positive_filtered_term_arrays)
    positive_documents_score_mappings_array.append(tf_idf_result)

negative_documents_score_mappings_array = []

for entry in negative_filtered_term_arrays:
    tf_idf_result = tf_idf(entry,negative_filtered_term_arrays)
    negative_documents_score_mappings_array.append(tf_idf_result)

def get_mean_term_score(term,record_array):
    # Average the term's tf-idf score over the documents whose mappings contain it.
    scores = [entry[term] for entry in record_array if term in entry]
    if not scores:
        print('Term not in one of the corpora')
        return None
    return sum(scores)/len(scores)


def get_mean_term_ranking(term,record_array):
    # Average the term's 0-based rank within each sorted tf-idf mapping that contains it.
    rankings = [list(entry).index(term) for entry in record_array if term in entry]
    if not rankings:
        print('Term not in one of the corpora')
        return None
    return sum(rankings)/len(rankings)
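
# Example usage (hypothetical term): get_mean_term_score('virus',
# positive_documents_score_mappings_array) averages the term's tf-idf score over
# the positive documents that contain it; get_mean_term_ranking reports its mean
# position within each document's sorted mapping.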

keyword_scores = algorithm1(positive_filtered_term_arrays,negative_filtered_term_arrays)

raw_scores_array = [v for k,v in keyword_scores.items()]

print(st.shapiro(raw_scores_array))
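
# shapiro returns (W statistic, p-value); a p-value below ~0.05 suggests the
# scores depart from normality (visualized by the Q-Q plot further down).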

np_raw_data = np.asarray(raw_scores_array, dtype=np.float32)

# Histogram of the a(t) term scores.
fig = plt.figure(figsize=(10, 7))
plt.hist(np_raw_data, bins=15, alpha=0.5)
plt.title('Term Count by Score Range')
plt.xlabel('a(t) Score Range')
plt.ylabel('Term Count')
plt.show()

# Q-Q plot of the term scores against a fitted normal distribution.
sm.qqplot(np_raw_data, line='45', fit=True)
pylab.title('Term Score Distribution Compared to Normal Distribution')
pylab.xlabel('Theoretical Normal Quantiles')
pylab.ylabel('Term Score Quantiles')
pylab.show()

descriptive = pd.DataFrame(raw_scores_array)
d = descriptive.describe()
print(d)
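
# The 85th percentile printed below is presumably a candidate cutoff for
# selecting top-scoring keywords (an assumption; the script does not reuse it).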

print('85th percentile: {}'.format(np.percentile(np_raw_data, 85)))

with open('results2.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    header = ['Term','FinalScore','PScore','DScorePos','DScoreNeg']
    writer.writerow(header)
    for k,v in keyword_scores.items():
        writer.writerow([k,v,term_p_values[k],term_dist_values_positive[k],term_dist_values_negative[k]])

# Interactive tf-idf lookup, kept for reference; uncomment to query the mean
# score and mean ranking of individual terms across both corpora.

#for i in range(6):
    #term = input('Enter the term for tf-idf results: ')

    #positive_mean_tf_idf_score = get_mean_term_score(term,positive_documents_score_mappings_array)
    #negative_mean_tf_idf_score = get_mean_term_score(term,negative_documents_score_mappings_array)

    #positive_mean_ranking = get_mean_term_ranking(term,positive_documents_score_mappings_array)
    #negative_mean_ranking = get_mean_term_ranking(term,negative_documents_score_mappings_array)

    #print('MEAN POSITIVE TF-IDF SCORE FOR {}: {}'.format(term, positive_mean_tf_idf_score))
    #print('MEAN POSITIVE TF-IDF RANKING: {}'.format(positive_mean_ranking))
    #print('MEAN NEGATIVE TF-IDF SCORE FOR {}: {}'.format(term, negative_mean_tf_idf_score))
    #print('MEAN NEGATIVE TF-IDF RANKING: {}'.format(negative_mean_ranking))
