In [1]:
import string
import re
import tensorflow
from os import listdir
from numpy import array
from collections import Counter
from nltk.corpus import stopwords
import unicodedata as ud
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.layers import Flatten,BatchNormalization
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

In [2]:
def load_doc(filename):

 file = open(filename, 'r',encoding="utf8")
 text = file.read()
 file.close()
 return text

def clean_doc(doc,vocab):
 
 
 tokens = ''.join(c for c in doc if not ud.category(c).startswith('P'))
 tokens = ''.join(c for c in tokens if not ud.category(c).startswith('Lm')) 
 tokens = tokens.split()
 tokens = [w for w in tokens if w in vocab]
 tokens = ' '.join(tokens)
 
 return tokens



def process_docs(directory, vocab):
 documents = list()

 for filename in listdir(directory):
 path = directory + '/' + filename
 doc = load_doc(path)
 tokens = clean_doc(doc, vocab)
 documents.append(tokens)
 return documents
 
 

def load_clean_dataset(vocab):

 finance = process_docs('TopicMerged/Finance', vocab)
 hr = process_docs('TopicMerged/HR', vocab)
 sales = process_docs('TopicMerged/Sales', vocab)
 supp = process_docs('TopicMerged/Support', vocab)
 management = process_docs('TopicMerged/Management', vocab)
 
 docs = finance + hr + sales + supp + management
 
 labels = array([[1,0,0,0,0] for _ in range(len(finance))] + [[0,1,0,0,0] for _ in range(len(hr))] +
 [[0,0,1,0,0] for _ in range(len(sales))] + [[0,0,0,1,0] for _ in range(len(supp))] +
 [[0,0,0,0,1] for _ in range(len(management))])
 
 return docs, labels




def load_clean_dataset_test(vocab):

 finance = process_docs('TopicMerged/Finance_Test', vocab)
 hr = process_docs('TopicMerged/HR_Test', vocab)
 sales = process_docs('TopicMerged/Sales_Test', vocab)
 supp = process_docs('TopicMerged/Support_Test', vocab)
 management = process_docs('TopicMerged/Management_Test', vocab)
 
 docs = finance + hr + sales + supp + management
 
 labels = array([[1,0,0,0,0] for _ in range(len(finance))] + [[0,1,0,0,0] for _ in range(len(hr))] +
 [[0,0,1,0,0] for _ in range(len(sales))] + [[0,0,0,1,0] for _ in range(len(supp))] +
 [[0,0,0,0,1] for _ in range(len(management))])
 
 return docs, labels

def create_tokenizer(lines):
 tokenizer = Tokenizer()
 tokenizer.fit_on_texts(lines)
 return tokenizer




 
def encode_docs(tokenizer, max_length, docs):

 encoded = tokenizer.texts_to_sequences(docs)

 padded = pad_sequences(encoded, maxlen=max_length, padding='post')
 return padded


In [3]:
def define_model(vocab_size, max_length):
 model = Sequential()
 model.add(Embedding(vocab_size, 300, input_length=max_length))
 model.add(Conv1D(filters=64, kernel_size=8, activation='relu'))
 model.add(MaxPooling1D(pool_size=2))
 model.add(Flatten())
 model.add(Dropout(0.1))
 model.add(Dense(256, activation='relu'))
 model.add(Dense(5, activation='sigmoid'))
 model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
 model.summary()
 plot_model(model, to_file='MergedTopicdeNoTrain.png', show_shapes=True)
 return model

In [4]:
vocab_filename = 'MergedTopicVocab.txt'
vocab = load_doc(vocab_filename)
vocab = set(vocab.split())

train_docs, ytrain = load_clean_dataset(vocab)
test_docs, ytest = load_clean_dataset_test(vocab)
tokenizer = create_tokenizer(train_docs)

vocab_size = len(tokenizer.word_index) + 1
#max_length = max([len(s.split()) for s in train_docs])
max_length = 450
Xtrain = encode_docs(tokenizer, max_length, train_docs)
model = define_model(vocab_size, max_length)
model.fit(Xtrain, ytrain, epochs=32, verbose=2)
model.save('MergedTopicdeNoTrain.h5')


Xtest = encode_docs(tokenizer, max_length, test_docs)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc*100))




Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param # 
embedding (Embedding) (None, 450, 300) 3817500 
_________________________________________________________________
conv1d (Conv1D) (None, 443, 64) 153664 
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 221, 64) 0 
_________________________________________________________________
flatten (Flatten) (None, 14144) 0 
_________________________________________________________________
dropout (Dropout) (None, 14144) 0 
_________________________________________________________________
dense (Dense) (None, 256) 3621120 
_________________________________________________________________
dense_1 (Dense) (None, 5) 1285 
Total params: 7,593,569
Trainable params: 7,593,569
Non-trainable params: 0
_________________________________________________________________
Epoch 1/32
144/144 - 23s - loss: 0.4291 - accuracy: 0.4586
Epoch 2/3

In [5]:
def load_clean_dataset_testFinance(vocab):

 finance = process_docs('TopicMerged/Finance_Test', vocab)
 docs = finance
 labels = array([[1,0,0,0,0] for _ in range(len(finance))]) 
 return docs, labels

def load_clean_dataset_testHR(vocab):

 hr = process_docs('TopicMerged/HR_Test', vocab)
 docs = hr

 labels = array([[0,1,0,0,0] for _ in range(len(hr))])
 return docs, labels

def load_clean_dataset_testSales(vocab):

 sales = process_docs('TopicMerged/Sales_Test', vocab)
 docs = sales
 
 labels = array([[0,0,1,0,0] for _ in range(len(sales))])
 
 return docs, labels

def load_clean_dataset_testSupport(vocab):

 supp = process_docs('TopicMerged/Support_Test', vocab)
 docs = supp
 
 labels = array([[0,0,0,1,0] for _ in range(len(supp))])
 
 return docs, labels

def load_clean_dataset_testManagement(vocab):

 management = process_docs('TopicMerged/Management_Test', vocab)
 docs = management
 
 labels = array([[0,0,0,0,1] for _ in range(len(management))])
 
 return docs, labels

model=load_model('MergedTopicdeNoTrain.h5')

test_docs, ytest = load_clean_dataset_testFinance(vocab)
Xtest = encode_docs(tokenizer, max_length, test_docs)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Finance Test Accuracy: %f' % (acc*100))

test_docs, ytest = load_clean_dataset_testHR(vocab)
Xtest = encode_docs(tokenizer, max_length, test_docs)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('HR Test Accuracy: %f' % (acc*100))

test_docs, ytest = load_clean_dataset_testSales(vocab)
Xtest = encode_docs(tokenizer, max_length, test_docs)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Sales Test Accuracy: %f' % (acc*100))

test_docs, ytest = load_clean_dataset_testSupport(vocab)
Xtest = encode_docs(tokenizer, max_length, test_docs)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Support Test Accuracy: %f' % (acc*100))

test_docs, ytest = load_clean_dataset_testManagement(vocab)
Xtest = encode_docs(tokenizer, max_length, test_docs)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Management Test Accuracy: %f' % (acc*100))


Finance Test Accuracy: 80.000001
HR Test Accuracy: 60.869563
Sales Test Accuracy: 76.086956
Support Test Accuracy: 36.904761
Management Test Accuracy: 40.404040
