In [None]:
import string
import re
import tensorflow
from os import listdir
from numpy import array
from collections import Counter
from nltk.corpus import stopwords
import unicodedata as ud
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.layers import Flatten,BatchNormalization
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

In [None]:
def load_doc(filename):
    """Read a UTF-8 text file and return its entire contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises
    # (e.g. on a decoding error); a manual open()/close() pair would leak it.
    with open(filename, 'r', encoding="utf8") as file:
        return file.read()

def clean_doc(doc,vocab):
    """Normalise a raw document into a space-separated string of vocabulary words.

    Characters whose Unicode category starts with 'P' (punctuation) or 'Lm'
    (modifier letters) are deleted -- not replaced by a space -- so the text on
    either side of them is fused together. The result is split on whitespace
    and only the words present in ``vocab`` are kept, in their original order.

    Args:
        doc: Raw document text.
        vocab: Container of allowed words (membership is tested with ``in``).

    Returns:
        The surviving words joined by single spaces ('' if none survive).
    """
    def _is_kept(ch):
        # str.startswith accepts a tuple, so both categories are checked at once.
        return not ud.category(ch).startswith(('P', 'Lm'))

    stripped = ''.join(filter(_is_kept, doc))
    return ' '.join(word for word in stripped.split() if word in vocab)



def process_docs(directory, vocab):
    """Load and clean every file found directly inside ``directory``.

    Args:
        directory: Path of the folder holding one document per file.
        vocab: Allowed words, forwarded to ``clean_doc``.

    Returns:
        A list with one cleaned document string per regular file, ordered by
        file name.

    Raises:
        OSError: If ``directory`` cannot be listed or a file cannot be read.
    """
    # Local import keeps this block self-contained; the notebook's import cell
    # only brings in ``listdir`` from ``os``.
    from os.path import isfile, join

    documents = []
    # listdir() returns entries in arbitrary order; sorting makes the document
    # order (and everything derived from it) reproducible across runs.
    for filename in sorted(listdir(directory)):
        path = join(directory, filename)
        # Skip sub-directories such as Jupyter's ``.ipynb_checkpoints``;
        # passing one to load_doc() would raise instead of returning text.
        if not isfile(path):
            continue
        documents.append(clean_doc(load_doc(path), vocab))
    return documents
 
 

def load_clean_dataset(vocab):
    """Build the training set from the urgent and non-urgent folders.

    Args:
        vocab: Allowed words, forwarded to ``process_docs``.

    Returns:
        A ``(docs, labels)`` pair: the cleaned urgent documents followed by the
        cleaned non-urgent ones, and an array holding 1 for each urgent
        document and 0 for each non-urgent one, in the same order.
    """
    urgent_docs = process_docs('DataSet/Urgency/Urgent', vocab)
    non_urgent_docs = process_docs('DataSet/Urgency/NonUrgent', vocab)
    # Labels mirror the concatenation order below: urgent first, then non-urgent.
    targets = [1] * len(urgent_docs) + [0] * len(non_urgent_docs)
    return urgent_docs + non_urgent_docs, array(targets)




def load_clean_dataset_test(vocab):
    """Build the combined test set from the urgent and non-urgent test folders.

    Args:
        vocab: Allowed words, forwarded to ``process_docs``.

    Returns:
        A ``(docs, labels)`` pair: the cleaned urgent test documents followed
        by the cleaned non-urgent ones, and an array holding 1 for each urgent
        document and 0 for each non-urgent one, in the same order.
    """
    urgent_docs = process_docs('DataSet/Urgency/Urgent_Test', vocab)
    non_urgent_docs = process_docs('DataSet/Urgency/NonUrgent_Test', vocab)
    # Labels mirror the concatenation order below: urgent first, then non-urgent.
    targets = [1] * len(urgent_docs) + [0] * len(non_urgent_docs)
    return urgent_docs + non_urgent_docs, array(targets)

def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: Texts to fit the tokenizer on.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted




 
def encode_docs(tokenizer, max_length, docs):
    """Turn documents into integer sequences of a common length.

    Args:
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_length: Length passed to ``pad_sequences`` as ``maxlen``.
        docs: Document strings to encode.

    Returns:
        The result of ``pad_sequences`` applied to the encoded documents, with
        padding added at the end of each sequence (``padding='post'``).
    """
    sequences = tokenizer.texts_to_sequences(docs)
    return pad_sequences(sequences, maxlen=max_length, padding='post')


In [None]:
def load_clean_dataset_testUrgent(vocab):
    """Load only the urgent test documents, all labelled 1.

    Args:
        vocab: Allowed words, forwarded to ``process_docs``.

    Returns:
        A ``(docs, labels)`` pair: the cleaned urgent test documents and an
        array with one 1 per document.
    """
    urgent_docs = process_docs('DataSet/Urgency/Urgent_Test', vocab)
    return urgent_docs, array([1] * len(urgent_docs))


def load_clean_dataset_testNonUrgent(vocab):
    """Load only the non-urgent test documents, all labelled 0.

    Args:
        vocab: Allowed words, forwarded to ``process_docs``.

    Returns:
        A ``(docs, labels)`` pair: the cleaned non-urgent test documents and an
        array with one 0 per document.
    """
    non_urgent_docs = process_docs('DataSet/Urgency/NonUrgent_Test', vocab)
    return non_urgent_docs, array([0] * len(non_urgent_docs))




def define_model(vocab_size, max_length):
    """Build, compile, summarise and plot the 1-D convolutional classifier.

    The stack is: embedding (512 dimensions) -> Conv1D (64 filters, kernel 8,
    ReLU) -> batch normalisation -> max pooling (size 2) -> flatten ->
    dropout (0.1) -> dense (10, ReLU) -> dense (1, sigmoid). The model is
    compiled with binary cross-entropy loss, the Adam optimiser and the
    accuracy metric.

    Side effects: prints the model summary and writes an architecture diagram
    to 'FinalModel450LengthBNDropout15Emedd512.png'.

    Args:
        vocab_size: Input dimension of the embedding layer.
        max_length: ``input_length`` of the embedding layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    # NOTE(review): the diagram file name mentions "Dropout15" while the
    # dropout rate used here is 0.1 -- confirm which one is intended.
    diagram_path = 'FinalModel450LengthBNDropout15Emedd512.png'

    model = Sequential([
        Embedding(vocab_size, 512, input_length=max_length),
        Conv1D(filters=64, kernel_size=8, activation='relu'),
        BatchNormalization(),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dropout(0.1),
        Dense(10, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    plot_model(model, to_file=diagram_path, show_shapes=True)
    return model

In [None]:
# Vocabulary file: whitespace-separated words that clean_doc is allowed to keep.
vocab_filename = 'vocabBalanced_Final.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

train_docs, ytrain = load_clean_dataset(vocab)
test_docs, ytest = load_clean_dataset_test(vocab)
# The tokenizer is fitted on the training documents only and then reused,
# unchanged, to encode every test set below.
tokenizer = create_tokenizer(train_docs)

# +1 because the Keras Tokenizer never assigns index 0 to a word.
vocab_size = len(tokenizer.word_index) + 1
# Fixed sequence length. The data-driven alternative would be
# max(len(s.split()) for s in train_docs).
max_length = 450
Xtrain = encode_docs(tokenizer, max_length, train_docs)
model = define_model(vocab_size, max_length)
model.fit(Xtrain, ytrain, epochs=20, verbose=2)
model.save('FinalModel450LengthBNDropout15Emedd512.h5')

# Accuracy on the combined (urgent + non-urgent) test set.
Xtest = encode_docs(tokenizer, max_length, test_docs)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy (urgent + non-urgent): %f' % (acc*100))

# Per-class accuracy, reported separately so the three numbers can be told
# apart in the output. The evaluation order (non-urgent, then urgent) is kept,
# so test_docs / ytest / Xtest / acc end up holding the urgent-only values.
for subset_name, load_subset in (
    ('non-urgent only', load_clean_dataset_testNonUrgent),
    ('urgent only', load_clean_dataset_testUrgent),
):
    test_docs, ytest = load_subset(vocab)
    Xtest = encode_docs(tokenizer, max_length, test_docs)
    _, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy (%s): %f' % (subset_name, acc*100))
