def load_doc(filename):
    """Read a whole UTF-8 text file and return its contents as one string."""
    # Context manager guarantees the handle is closed even if read() raises
    # (the original open()/close() pair leaked the handle on error).
    with open(filename, 'r', encoding="utf8") as file:
        return file.read()


def clean_doc(doc, vocab):
    """Normalize a raw document into a vocabulary-filtered token string.

    Steps: strip all Unicode punctuation (category 'P*'), strip modifier
    letters (category 'Lm'), split on whitespace, keep only tokens present
    in `vocab`, and re-join with single spaces.

    :param doc: raw document text.
    :param vocab: collection (ideally a set) of allowed tokens.
    :return: cleaned, space-joined token string.
    """
    # Remove every punctuation character first, then modifier letters.
    no_punct = ''.join(c for c in doc if not ud.category(c).startswith('P'))
    no_marks = ''.join(c for c in no_punct if not ud.category(c).startswith('Lm'))
    # Keep only in-vocabulary tokens.
    kept = [w for w in no_marks.split() if w in vocab]
    return ' '.join(kept)


def process_docs(directory, vocab):
    """Load and clean every file directly under `directory`.

    :return: list with one cleaned document string per file (listdir order).
    """
    documents = []
    for filename in listdir(directory):
        path = directory + '/' + filename
        documents.append(clean_doc(load_doc(path), vocab))
    return documents
def _load_labeled_docs(dirnames, vocab):
    """Load cleaned docs from each directory and build matching one-hot labels.

    One class per directory, in list order (class i = dirnames[i]).

    :param dirnames: ordered list of directory paths, one per class.
    :param vocab: allowed-token collection passed through to process_docs.
    :return: (docs, labels) — flat list of cleaned documents, and a numpy
        array of one-hot rows with shape (len(docs), len(dirnames)).
    """
    docs = []
    label_rows = []
    n_classes = len(dirnames)
    for class_idx, dirname in enumerate(dirnames):
        class_docs = process_docs(dirname, vocab)
        docs.extend(class_docs)
        one_hot = [0] * n_classes
        one_hot[class_idx] = 1
        # A fresh list per row so rows never alias each other.
        label_rows.extend(list(one_hot) for _ in class_docs)
    return docs, array(label_rows)


def load_clean_dataset(vocab):
    """Load the five training topic folders; returns (docs, one-hot labels)."""
    return _load_labeled_docs(
        ['TopicMerged/Finance', 'TopicMerged/HR', 'TopicMerged/Sales',
         'TopicMerged/Support', 'TopicMerged/Management'],
        vocab)


def load_clean_dataset_test(vocab):
    """Load the five held-out *_Test topic folders; returns (docs, one-hot labels)."""
    return _load_labeled_docs(
        ['TopicMerged/Finance_Test', 'TopicMerged/HR_Test', 'TopicMerged/Sales_Test',
         'TopicMerged/Support_Test', 'TopicMerged/Management_Test'],
        vocab)


def create_tokenizer(lines):
    """Fit a Keras Tokenizer on the training documents and return it."""
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer


def encode_docs(tokenizer, max_length, docs):
    """Map docs to integer sequences, post-padded/truncated to `max_length`."""
    encoded = tokenizer.texts_to_sequences(docs)
    return pad_sequences(encoded, maxlen=max_length, padding='post')
def define_model(vocab_size, max_length):
    """Build and compile the topic-classification CNN.

    :param vocab_size: number of distinct tokens + 1 (Embedding input dim).
    :param max_length: padded document length in tokens.
    :return: compiled Keras Sequential model.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 300, input_length=max_length))
    model.add(Conv1D(filters=64, kernel_size=8, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dropout(0.1))
    model.add(Dense(256, activation='relu'))
    # The five topics are mutually exclusive (labels built here are one-hot
    # rows), i.e. single-label multi-class classification, so the correct
    # pairing is softmax + categorical cross-entropy. The previous
    # sigmoid + binary_crossentropy pairing treats each topic as an
    # independent binary problem and reports a misleading 'accuracy'.
    model.add(Dense(5, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    plot_model(model, to_file='MergedTopicdeNoTrain.png', show_shapes=True)
    return model
# ---- Train the CNN on the merged-topic corpus, save it, and report test accuracy ----
vocab_filename = 'MergedTopicVocab.txt'
vocab = set(load_doc(vocab_filename).split())

train_docs, ytrain = load_clean_dataset(vocab)
test_docs, ytest = load_clean_dataset_test(vocab)
tokenizer = create_tokenizer(train_docs)

vocab_size = len(tokenizer.word_index) + 1
# Fixed padding length; the data-driven alternative would be
# max(len(s.split()) for s in train_docs).
max_length = 450
Xtrain = encode_docs(tokenizer, max_length, train_docs)
model = define_model(vocab_size, max_length)
model.fit(Xtrain, ytrain, epochs=32, verbose=2)
model.save('MergedTopicdeNoTrain.h5')

Xtest = encode_docs(tokenizer, max_length, test_docs)
_, acc = model.evaluate(Xtest, ytest, verbose=0)
print('Test Accuracy: %f' % (acc * 100))


def _load_single_topic_test(dirname, class_idx, vocab):
    """Load one *_Test folder; label every doc with one-hot class `class_idx`.

    :return: (docs, labels) — cleaned docs and a (len(docs), 5) one-hot array.
    """
    docs = process_docs(dirname, vocab)
    one_hot = [0, 0, 0, 0, 0]
    one_hot[class_idx] = 1
    labels = array([list(one_hot) for _ in range(len(docs))])
    return docs, labels


# Thin backward-compatible wrappers; class index matches the training label order.
def load_clean_dataset_testFinance(vocab):
    """Finance held-out set, class 0."""
    return _load_single_topic_test('TopicMerged/Finance_Test', 0, vocab)


def load_clean_dataset_testHR(vocab):
    """HR held-out set, class 1."""
    return _load_single_topic_test('TopicMerged/HR_Test', 1, vocab)


def load_clean_dataset_testSales(vocab):
    """Sales held-out set, class 2."""
    return _load_single_topic_test('TopicMerged/Sales_Test', 2, vocab)


def load_clean_dataset_testSupport(vocab):
    """Support held-out set, class 3."""
    return _load_single_topic_test('TopicMerged/Support_Test', 3, vocab)


def load_clean_dataset_testManagement(vocab):
    """Management held-out set, class 4."""
    return _load_single_topic_test('TopicMerged/Management_Test', 4, vocab)
# ---- Per-topic evaluation of the saved model on each held-out folder ----
model = load_model('MergedTopicdeNoTrain.h5')

# One (display name, loader) pair per topic, replacing five copy-pasted
# evaluate/print stanzas; the printed lines are identical to before.
per_topic_loaders = [
    ('Finance', load_clean_dataset_testFinance),
    ('HR', load_clean_dataset_testHR),
    ('Sales', load_clean_dataset_testSales),
    ('Support', load_clean_dataset_testSupport),
    ('Management', load_clean_dataset_testManagement),
]

for topic, loader in per_topic_loaders:
    test_docs, ytest = loader(vocab)
    Xtest = encode_docs(tokenizer, max_length, test_docs)
    _, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('%s Test Accuracy: %f' % (topic, acc * 100))