{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import string\n", "import re\n", "import tensorflow\n", "from os import listdir\n", "from numpy import array\n", "from collections import Counter\n", "from nltk.corpus import stopwords\n", "import unicodedata as ud\n", "from keras.preprocessing.text import Tokenizer\n", "from keras.preprocessing.sequence import pad_sequences\n", "from keras.utils.vis_utils import plot_model\n", "from keras.models import Sequential\n", "from keras.layers import Dense,Dropout\n", "from keras.layers import Flatten,BatchNormalization\n", "from keras.layers import Embedding\n", "from keras.layers.convolutional import Conv1D\n", "from keras.layers.convolutional import MaxPooling1D\n", "from keras.models import load_model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def load_doc(filename):\n", "\n", " file = open(filename, 'r',encoding=\"utf8\")\n", " text = file.read()\n", " file.close()\n", " return text\n", "\n", "def clean_doc(doc,vocab):\n", " \n", " \n", " tokens = ''.join(c for c in doc if not ud.category(c).startswith('P'))\n", " tokens = ''.join(c for c in tokens if not ud.category(c).startswith('Lm')) \n", " tokens = tokens.split()\n", " tokens = [w for w in tokens if w in vocab]\n", " tokens = ' '.join(tokens)\n", " \n", " return tokens\n", "\n", "\n", "\n", "def process_docs(directory, vocab):\n", " documents = list()\n", "\n", " for filename in listdir(directory):\n", " path = directory + '/' + filename\n", " doc = load_doc(path)\n", " tokens = clean_doc(doc, vocab)\n", " documents.append(tokens)\n", " return documents\n", " \n", " \n", "\n", "def load_clean_dataset(vocab):\n", "\n", " urgent = process_docs('DataSet/Urgency/Urgent', vocab)\n", " nonUrgent = process_docs('DataSet/Urgency/NonUrgent', vocab)\n", " docs = urgent + nonUrgent\n", " labels = array([1 for _ in range(len(urgent))] + [0 for _ in range(len(nonUrgent))])\n", "\n", "\n", " return docs, labels\n", "\n", "\n", "\n", "\n", "def load_clean_dataset_test(vocab):\n", "\n", " urgent = process_docs('DataSet/Urgency/Urgent_Test', vocab)\n", " nonUrgent = process_docs('DataSet/Urgency/NonUrgent_Test', vocab)\n", " docs = urgent + nonUrgent\n", " labels = array([1 for _ in range(len(urgent))] + [0 for _ in range(len(nonUrgent))])\n", "\n", " return docs, labels\n", "\n", "def create_tokenizer(lines):\n", " tokenizer = Tokenizer()\n", " tokenizer.fit_on_texts(lines)\n", " return tokenizer\n", "\n", "\n", "\n", "\n", " \n", "def encode_docs(tokenizer, max_length, docs):\n", "\n", " encoded = tokenizer.texts_to_sequences(docs)\n", "\n", " padded = pad_sequences(encoded, maxlen=max_length, padding='post')\n", " return padded\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def load_clean_dataset_testUrgent(vocab):\n", "\n", " \n", " urgent = process_docs('DataSet/Urgency/Urgent_Test', vocab)\n", " docs = urgent\n", " labels = array([1 for _ in range(len(urgent))])\n", "\n", " return docs, labels\n", "\n", "\n", "def load_clean_dataset_testNonUrgent(vocab):\n", "\n", " \n", " nonUrgent = process_docs('DataSet/Urgency/NonUrgent_Test', vocab)\n", " docs = nonUrgent\n", " labels = array([0 for _ in range(len(nonUrgent))])\n", "\n", " return docs, labels\n", "\n", "\n", "\n", "\n", "def define_model(vocab_size, max_length):\n", " model = Sequential()\n", " model.add(Embedding(vocab_size, 512, input_length=max_length))\n", " model.add(Conv1D(filters=64, kernel_size=8, activation='relu'))\n", " model.add(BatchNormalization())\n", " model.add(MaxPooling1D(pool_size=2))\n", " model.add(Flatten())\n", " model.add(Dropout(0.1))\n", " model.add(Dense(10, activation='relu'))\n", " model.add(Dense(1, activation='sigmoid'))\n", " model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n", " model.summary()\n", " plot_model(model, to_file='FinalModel450LengthBNDropout15Emedd512.png', show_shapes=True)\n", " return model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "vocab_filename = 'vocabBalanced_Final.txt'\n", "vocab = load_doc(vocab_filename)\n", "vocab = set(vocab.split())\n", "\n", "train_docs, ytrain = load_clean_dataset(vocab)\n", "test_docs, ytest = load_clean_dataset_test(vocab)\n", "tokenizer = create_tokenizer(train_docs)\n", "\n", "vocab_size = len(tokenizer.word_index) + 1\n", "#max_length = max([len(s.split()) for s in train_docs])\n", "max_length = 450\n", "Xtrain = encode_docs(tokenizer, max_length, train_docs)\n", "model = define_model(vocab_size, max_length)\n", "model.fit(Xtrain, ytrain, epochs=20,verbose=2)\n", "model.save('FinalModel450LengthBNDropout15Emedd512.h5')\n", "\n", "\n", "Xtest = encode_docs(tokenizer, max_length, test_docs)\n", "_, acc = model.evaluate(Xtest, ytest, verbose=0)\n", "print('Test Accuracy: %f' % (acc*100))\n", "\n", "\n", "test_docs, ytest = load_clean_dataset_testNonUrgent(vocab)\n", "Xtest = encode_docs(tokenizer, max_length, test_docs)\n", "_, acc = model.evaluate(Xtest, ytest, verbose=0)\n", "print('Test Accuracy: %f' % (acc*100))\n", "\n", "test_docs, ytest = load_clean_dataset_testUrgent(vocab)\n", "Xtest = encode_docs(tokenizer, max_length, test_docs)\n", "_, acc = model.evaluate(Xtest, ytest, verbose=0)\n", "print('Test Accuracy: %f' % (acc*100))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "NoorEnv", "language": "python", "name": "noorenv" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.9" } }, "nbformat": 4, "nbformat_minor": 4 }