In [1]:
import string
import re
import tensorflow
from os import listdir
from numpy import array
from collections import Counter
from nltk.corpus import stopwords
import unicodedata as ud
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense,Dropout
from keras.layers import Flatten,BatchNormalization,LSTM
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D,Conv2D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

In [2]:
def load_doc(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The decoded text of the file.
    """
    # The context manager closes the handle even when read() raises
    # (for example on a decoding error), which the manual close() did not.
    with open(filename, 'r', encoding="utf8") as file:
        return file.read()

def clean_doc(doc,vocab):
    """Strip punctuation/modifier letters from ``doc`` and keep only vocabulary words.

    Characters whose Unicode general category starts with 'P' or 'Lm' are
    deleted outright (they are not replaced by a space). The remaining text
    is split on whitespace, words not contained in ``vocab`` are discarded,
    and the survivors are joined back with single spaces.

    Args:
        doc: Raw document text.
        vocab: Container of allowed words (anything supporting ``in``).

    Returns:
        The cleaned document as a single space-separated string.
    """
    def _is_kept(ch):
        # One category lookup per character covers both exclusion rules.
        category = ud.category(ch)
        return not (category.startswith('P') or category.startswith('Lm'))

    stripped = ''.join(filter(_is_kept, doc))
    kept_words = [word for word in stripped.split() if word in vocab]
    return ' '.join(kept_words)



def process_docs(directory, vocab):
    """Load and clean every regular file found directly inside ``directory``.

    Args:
        directory: Path of the folder holding one document per file.
        vocab: Container of allowed words, forwarded to ``clean_doc``.

    Returns:
        A list with one cleaned document string per file, ordered by
        file name.
    """
    # Local import: the notebook's import cell only brings in ``listdir``.
    import os

    documents = list()
    # listdir() returns entries in arbitrary order; sorting makes the
    # document order reproducible between runs and machines.
    for filename in sorted(listdir(directory)):
        path = os.path.join(directory, filename)
        # Skip sub-directories (e.g. ``.ipynb_checkpoints``), which would
        # otherwise make load_doc() fail when it tries to open them.
        if not os.path.isfile(path):
            continue
        doc = load_doc(path)
        documents.append(clean_doc(doc, vocab))
    return documents
 
 

def load_clean_dataset(vocab):
    """Load the cleaned training documents together with one-hot labels.

    Documents are read from the ``Neutral``, ``Positive`` and ``Negative``
    folders under ``Dataset/Sentiment`` and returned in the order
    negative, neutral, positive.

    Args:
        vocab: Container of allowed words, forwarded to ``process_docs``.

    Returns:
        A ``(docs, labels)`` tuple. ``labels`` is an array with one row per
        document: ``[1,0,0]`` for negative, ``[0,1,0]`` for neutral and
        ``[0,0,1]`` for positive.
    """
    neutral_docs = process_docs('Dataset/Sentiment/Neutral', vocab)
    positive_docs = process_docs('Dataset/Sentiment/Positive', vocab)
    negative_docs = process_docs('Dataset/Sentiment/Negative', vocab)

    docs = []
    label_rows = []
    per_class = (
        (negative_docs, (1, 0, 0)),
        (neutral_docs, (0, 1, 0)),
        (positive_docs, (0, 0, 1)),
    )
    for class_docs, one_hot in per_class:
        docs.extend(class_docs)
        label_rows.extend(list(one_hot) for _ in class_docs)
    return docs, array(label_rows)




def load_clean_dataset_test(vocab):
    """Load the cleaned test documents together with one-hot labels.

    Documents are read from the ``Neutral_Test``, ``Positive_Test`` and
    ``Negative_Test`` folders under ``Dataset/Sentiment`` and returned in
    the order negative, neutral, positive.

    Args:
        vocab: Container of allowed words, forwarded to ``process_docs``.

    Returns:
        A ``(docs, labels)`` tuple. ``labels`` is an array with one row per
        document: ``[1,0,0]`` for negative, ``[0,1,0]`` for neutral and
        ``[0,0,1]`` for positive.
    """
    neutral_docs = process_docs('Dataset/Sentiment/Neutral_Test', vocab)
    positive_docs = process_docs('Dataset/Sentiment/Positive_Test', vocab)
    negative_docs = process_docs('Dataset/Sentiment/Negative_Test', vocab)

    docs = []
    label_rows = []
    per_class = (
        (negative_docs, (1, 0, 0)),
        (neutral_docs, (0, 1, 0)),
        (positive_docs, (0, 0, 1)),
    )
    for class_docs, one_hot in per_class:
        docs.extend(class_docs)
        label_rows.extend(list(one_hot) for _ in class_docs)
    return docs, array(label_rows)

def create_tokenizer(lines):
    """Create a ``Tokenizer`` with default settings and fit it on ``lines``.

    Args:
        lines: The texts passed to ``Tokenizer.fit_on_texts``.

    Returns:
        The fitted ``Tokenizer`` instance.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted




 
def encode_docs(tokenizer, max_length, docs):
    """Convert documents to integer sequences of length ``max_length``.

    Args:
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        max_length: Value passed to ``pad_sequences`` as ``maxlen``.
        docs: The document strings to encode.

    Returns:
        The result of ``pad_sequences`` with padding applied at the end of
        each sequence (``padding='post'``).
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(docs),
        maxlen=max_length,
        padding='post',
    )


In [9]:
def load_clean_dataset_testNeutral(vocab):
    """Load only the neutral test documents with their one-hot labels.

    Args:
        vocab: Container of allowed words, forwarded to ``process_docs``.

    Returns:
        A ``(docs, labels)`` tuple where every label row is ``[0,1,0]``.
    """
    docs = process_docs('Dataset/Sentiment/Neutral_Test', vocab)
    # One identical one-hot row per loaded document.
    labels = array([[0,1,0]] * len(docs))
    return docs, labels


def load_clean_dataset_testNegative(vocab):
    """Load only the negative test documents with their one-hot labels.

    Args:
        vocab: Container of allowed words, forwarded to ``process_docs``.

    Returns:
        A ``(docs, labels)`` tuple where every label row is ``[1,0,0]``.
    """
    docs = process_docs('Dataset/Sentiment/Negative_Test', vocab)
    # One identical one-hot row per loaded document.
    labels = array([[1,0,0]] * len(docs))
    return docs, labels

def load_clean_dataset_testPositive(vocab):
    """Load only the positive test documents with their one-hot labels.

    Args:
        vocab: Container of allowed words, forwarded to ``process_docs``.

    Returns:
        A ``(docs, labels)`` tuple where every label row is ``[0,0,1]``.
    """
    docs = process_docs('Dataset/Sentiment/Positive_Test', vocab)
    # One identical one-hot row per loaded document.
    labels = array([[0,0,1]] * len(docs))
    return docs, labels



def define_model(vocab_size, max_length):
    """Build and compile the two-convolution CNN sentiment classifier.

    The network is an embedding layer, two Conv1D layers (64 filters,
    kernel size 8), max pooling, flatten, dropout, a 10-unit dense layer
    and a 3-way softmax output. It is compiled with categorical
    cross-entropy and the Adam optimizer, tracking accuracy.

    Side effects: prints the model summary and writes an architecture
    diagram to ``BalancedSentimentModVocab2cnn.png``.

    Args:
        vocab_size: Input dimension of the embedding layer.
        max_length: Length of each input sequence.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(vocab_size, 256, input_length=max_length),
        Conv1D(filters=64, kernel_size=8, activation='relu'),
        Conv1D(filters=64, kernel_size=8, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dropout(0.1),
        Dense(10, activation='relu'),
        Dense(3, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    plot_model(model, to_file='BalancedSentimentModVocab2cnn.png', show_shapes=True)
    return model

In [10]:
# --- Configuration ---------------------------------------------------------
vocab_filename = 'SentimentBalancedVocabModified.txt'
# Fixed sequence length fed to the network. The data-driven alternative is
# max([len(s.split()) for s in train_docs]).
max_length = 450

# --- Data ------------------------------------------------------------------
# The vocabulary file is whitespace-separated; keep it as a set for fast lookup.
vocab = set(load_doc(vocab_filename).split())
train_docs, ytrain = load_clean_dataset(vocab)

# The tokenizer is fitted on the training documents only.
tokenizer = create_tokenizer(train_docs)
vocab_size = len(tokenizer.word_index) + 1
Xtrain = encode_docs(tokenizer, max_length, train_docs)

# --- Training --------------------------------------------------------------
model = define_model(vocab_size, max_length)
model.fit(Xtrain, ytrain, epochs=25, verbose=2)
model.save('BalancedSentimentModVocab2cnn.h5')

# --- Evaluation ------------------------------------------------------------
# Evaluate on the full test set first, then on each class separately. Every
# printed line names its subset; previously all four lines read the same
# "Test Accuracy" and could not be told apart in the cell output.
evaluation_sets = [
    ('all classes', load_clean_dataset_test),
    ('neutral', load_clean_dataset_testNeutral),
    ('positive', load_clean_dataset_testPositive),
    ('negative', load_clean_dataset_testNegative),
]
for subset_name, load_subset in evaluation_sets:
    test_docs, ytest = load_subset(vocab)
    Xtest = encode_docs(tokenizer, max_length, test_docs)
    _, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy (%s): %f' % (subset_name, acc*100))


Model: "sequential_5"
_________________________________________________________________
Layer (type) Output Shape Param # 
embedding_5 (Embedding) (None, 450, 256) 2594560 
_________________________________________________________________
conv1d_6 (Conv1D) (None, 443, 64) 131136 
_________________________________________________________________
conv1d_7 (Conv1D) (None, 436, 64) 32832 
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 218, 64) 0 
_________________________________________________________________
flatten_5 (Flatten) (None, 13952) 0 
_________________________________________________________________
dropout_5 (Dropout) (None, 13952) 0 
_________________________________________________________________
dense_10 (Dense) (None, 10) 139530 
_________________________________________________________________
dense_11 (Dense) (None, 3) 33 
Total params: 2,898,091
Trainable params: 2,898,091
Non-trainable params: 0
_________________

InvalidArgumentError:  logits and labels must have the same first dimension, got logits shape [32,3] and labels shape [96]
	 [[node sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at :14) ]] [Op:__inference_train_function_32013]

Function call stack:
train_function


In [8]:
def define_model(vocab_size, max_length):
    """Build and compile the single-convolution CNN sentiment classifier.

    The network is an embedding layer, one Conv1D layer (32 filters,
    kernel size 8), max pooling, flatten, dropout, a 10-unit dense layer
    and a 3-way softmax output. It is compiled with categorical
    cross-entropy and the Adam optimizer, tracking accuracy.

    Side effects: prints the model summary and writes an architecture
    diagram to ``BalancedSentimentModVocab32f.png``.

    Args:
        vocab_size: Input dimension of the embedding layer.
        max_length: Length of each input sequence.

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = [
        Embedding(vocab_size, 256, input_length=max_length),
        Conv1D(filters=32, kernel_size=8, activation='relu'),
        MaxPooling1D(pool_size=2),
        Flatten(),
        Dropout(0.1),
        Dense(10, activation='relu'),
        Dense(3, activation='softmax'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.summary()
    plot_model(model, to_file='BalancedSentimentModVocab32f.png', show_shapes=True)
    return model


# --- Configuration ---------------------------------------------------------
vocab_filename = 'SentimentBalancedVocabModified.txt'
# Fixed sequence length fed to the network. The data-driven alternative is
# max([len(s.split()) for s in train_docs]).
max_length = 450

# --- Data ------------------------------------------------------------------
# The vocabulary file is whitespace-separated; keep it as a set for fast lookup.
vocab = set(load_doc(vocab_filename).split())
train_docs, ytrain = load_clean_dataset(vocab)

# The tokenizer is fitted on the training documents only.
tokenizer = create_tokenizer(train_docs)
vocab_size = len(tokenizer.word_index) + 1
Xtrain = encode_docs(tokenizer, max_length, train_docs)

# --- Training --------------------------------------------------------------
model = define_model(vocab_size, max_length)
model.fit(Xtrain, ytrain, epochs=25, verbose=2)
model.save('BalancedSentimentModVocab32f.h5')

# --- Evaluation ------------------------------------------------------------
# Evaluate on the full test set first, then on each class separately. Every
# printed line names its subset; previously all four lines read the same
# "Test Accuracy" and could not be told apart in the cell output.
evaluation_sets = [
    ('all classes', load_clean_dataset_test),
    ('neutral', load_clean_dataset_testNeutral),
    ('positive', load_clean_dataset_testPositive),
    ('negative', load_clean_dataset_testNegative),
]
for subset_name, load_subset in evaluation_sets:
    test_docs, ytest = load_subset(vocab)
    Xtest = encode_docs(tokenizer, max_length, test_docs)
    _, acc = model.evaluate(Xtest, ytest, verbose=0)
    print('Test Accuracy (%s): %f' % (subset_name, acc*100))



Model: "sequential_4"
_________________________________________________________________
Layer (type) Output Shape Param # 
embedding_4 (Embedding) (None, 450, 256) 2594560 
_________________________________________________________________
conv1d_5 (Conv1D) (None, 443, 32) 65568 
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 221, 32) 0 
_________________________________________________________________
flatten_4 (Flatten) (None, 7072) 0 
_________________________________________________________________
dropout_4 (Dropout) (None, 7072) 0 
_________________________________________________________________
dense_8 (Dense) (None, 10) 70730 
_________________________________________________________________
dense_9 (Dense) (None, 3) 33 
Total params: 2,730,891
Trainable params: 2,730,891
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
118/118 - 14s - loss: 1.0220 - accuracy: 0.4865
Epoch