import csv
import math
import os
import re
import string
import sys
import time
from datetime import datetime, timedelta
from os import listdir
from os.path import isfile, join

import numpy as np
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import RegexpTokenizer, word_tokenize
from tika import parser  # pip install tika
from top2vec import Top2Vec
from wordcloud import WordCloud

wnl = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

positive_report_paths = ['positive_result_reports/{}'.format(f) for f in listdir('positive_result_reports')]
negative_report_paths = ['negative_result_reports/{}'.format(f) for f in listdir('negative_result_reports')]



def remove_ds_store_path(path_array):
    # Build a filtered copy: the original removed items from the list
    # while iterating over it, which skips elements.
    return [path for path in path_array if '.DS_Store' not in path]

positive_report_paths = remove_ds_store_path(positive_report_paths)
negative_report_paths = remove_ds_store_path(negative_report_paths)


def compress_space(document):
    # Collapse every run of whitespace (newlines, tabs, spaces) into a
    # single space. str.split() with no argument splits on any whitespace
    # run and drops empty strings, which also fixes two bugs in the
    # original index-based loop: the final token was lost when the text
    # did not end in whitespace, and document[i-1] wrapped around to the
    # last character when i == 0. The stray trailing space is gone too.
    return ' '.join(document.split())
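
# Quick sanity check of the behavior (the input string is illustrative,
# not from the corpus): runs of newlines, tabs, and spaces all collapse
# to single spaces.
assert compress_space('Patient\n\treport:  positive\n') == 'Patient report: positive'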

def load_raw_text(filepath):
    # Tika returns a dict whose 'content' entry is None when extraction
    # fails, so fall back to an empty string before normalizing.
    raw = parser.from_file(filepath)
    content = raw['content'] or ''
    return compress_space(content)

positive_report_text = [load_raw_text(path) for path in positive_report_paths] 
negative_report_text = [load_raw_text(path) for path in negative_report_paths] 

docs = positive_report_text + negative_report_text

print(len(negative_report_text))

model = Top2Vec(negative_report_text)
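
# Top2Vec's default doc2vec embedding can be unstable on a small corpus.
# A hedged alternative, left commented out (the parameter values below
# are illustrative assumptions, not tuned for this data); note that
# `docs` above already combines both report sets if modeling the full
# corpus is intended:
# model = Top2Vec(docs, embedding_model='universal-sentence-encoder', min_count=10)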

print(model.get_num_topics())

n = model.get_num_topics()

topic_words, word_scores, topic_nums = model.get_topics(n)

for topic in topic_nums:
    print(topic)

for t in topic_words:
    print(t)
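
# A hedged sketch for inspecting the documents behind each topic, using
# Top2Vec's search_documents_by_topic(); num_docs=3 is an arbitrary
# illustrative choice and must not exceed a topic's document count:
# for topic_num in topic_nums:
#     documents, document_scores, document_ids = model.search_documents_by_topic(
#         topic_num=topic_num, num_docs=3)
#     print(topic_num, document_ids, document_scores)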

for i in range(n):
    # Map each topic's words to their scores; generate_from_frequencies
    # sizes each word by its score.
    word_dict = dict(zip(topic_words[i], word_scores[i]))
    wc = WordCloud(background_color="white", max_font_size=80)
    wc.generate_from_frequencies(word_dict)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
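
# A hedged alternative to displaying each figure interactively: write the
# clouds to disk with WordCloud.to_file(). The output directory name
# 'topic_wordclouds' is an illustrative assumption.
# os.makedirs('topic_wordclouds', exist_ok=True)
# for i in range(n):
#     word_dict = dict(zip(topic_words[i], word_scores[i]))
#     wc = WordCloud(background_color="white", max_font_size=80)
#     wc.generate_from_frequencies(word_dict)
#     wc.to_file('topic_wordclouds/topic_{}.png'.format(i))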