from os import listdir

from bertopic import BERTopic
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from tika import parser  # pip install tika

wnl = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

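# Gather the report files for the positive- and negative-result corpora.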
positive_report_paths = ['positive_result_reports/{}'.format(f) for f in listdir('positive_result_reports')]
negative_report_paths = ['negative_result_reports/{}'.format(f) for f in listdir('negative_result_reports')]


def remove_ds_store_path(path_array):
    # Filter out macOS .DS_Store entries; avoids mutating the list while iterating over it.
    return [path for path in path_array if '.DS_Store' not in path]

positive_report_paths = remove_ds_store_path(positive_report_paths)
negative_report_paths = remove_ds_store_path(negative_report_paths)

def compress_space(document):
    # Collapse runs of whitespace (spaces, tabs, newlines) into single spaces,
    # keeping the final token even when the document has no trailing whitespace.
    return ' '.join(document.split())

def load_raw_text(filepath):
    # Parse the report with Apache Tika and return its whitespace-normalized text.
    raw = parser.from_file(filepath)
    content = raw['content'] or ''  # Tika returns None for unreadable files
    return compress_space(content)

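# Extract and clean the raw text of every report.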
positive_report_text = [load_raw_text(path) for path in positive_report_paths]
negative_report_text = [load_raw_text(path) for path in negative_report_paths]

stop_words = set(stopwords.words('english'))  # set for O(1) lookup; avoids shadowing the nltk module

def lower(entry):
    res = entry.lower()
    return res


def trim_space(document):
    # Drop stray newlines, collapse repeated spaces, and strip leading/trailing whitespace.
    document = document.replace('\n', '')
    return ' '.join(document.split())


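# Normalization applied to each raw document: collapse whitespace, then lower-case.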
def normalize(document):
    res = trim_space(document)
    res = lower(res)
    return res

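# Normalize every document in the positive and negative corpora.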
positive_normalized_corpus = [normalize(entry) for entry in positive_report_text] 
negative_normalized_corpus = [normalize(entry) for entry in negative_report_text] 


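# WordNet lemmatization with the default (noun) part of speech.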
def lemmatize(document_array):
    res = [wnl.lemmatize(term) for term in document_array]
    return res

def filtered_term_array_gen(collection): # Converts each document into an array of lemmatized terms minus stopwords
    term_array_converted_documents = []
    for document in collection:
        doc_split = tokenizer.tokenize(document)
        lemmatized = lemmatize(doc_split)
        filtered = [word for word in lemmatized if word not in stop_words]
        term_array_converted_documents.append(filtered)
    return term_array_converted_documents

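# Tokenize, lemmatize, and stopword-filter every document in both corpora.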
positive_filtered_term_arrays = filtered_term_array_gen(positive_normalized_corpus)
negative_filtered_term_arrays = filtered_term_array_gen(negative_normalized_corpus)

# Re-join the filtered tokens into plain strings for BERTopic.
positive_docs_rejoined = [' '.join(entry) for entry in positive_filtered_term_arrays]

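# Topic modelling: fit BERTopic on the cleaned positive-result documents, using the
# lightweight paraphrase-MiniLM-L3-v2 sentence-transformer for embeddings.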
topic_model = BERTopic(embedding_model="paraphrase-MiniLM-L3-v2", verbose=True)

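# fit_transform returns a topic assignment per document plus the associated probabilities.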
topics, probs = topic_model.fit_transform(positive_docs_rejoined)

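# Summarize the discovered topics (topic id, size, top terms) and write the table to CSV.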
res = topic_model.get_topic_info()

res.to_csv('positive_results_bert.csv')
