# -*- coding: utf-8 -*-
"""
Created on Mon Aug  8 22:13:38 2022

@author: Lenovo
"""

import os
####*IMPORTANT*: this line has to run *before* importing tensorflow
os.environ['PYTHONHASHSEED']=str(1)
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers 
import random
import pandas as pd
import numpy as np

def reset_random_seeds():
    """Fix every random seed so results are reproducible across runs."""
    os.environ['PYTHONHASHSEED'] = str(1)
    tf.random.set_seed(1)
    np.random.seed(1)
    random.seed(1)

reset_random_seeds()

from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, auc, roc_curve, roc_auc_score
import sklearn.metrics as metrics
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from sklearn.datasets import make_classification
from sklearn import svm, datasets
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import pyplot
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords
from textblob import TextBlob
import re
import io
from itertools import cycle
from scipy import interp

import keras
from google.colab import files
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.models import Sequential
from keras import layers, regularizers
from keras import backend as K
#from keras.optimizers import RMSprop,Adam
from tensorflow.keras.optimizers import RMSprop, Adam
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

df = pd.read_csv('autismclean.csv')
plt.figure(figsize=(8,6))
sns.set_style("whitegrid")
ax = sns.countplot(x="Sentiment_Type", data=df)
#Data preprocessing
from io import StringIO
col = ['Sentiment_Type', 'clean_tweet']
df = df[col]
df = df[pd.notnull(df['clean_tweet'])]
df.columns = ['Sentiment_Type', 'clean_tweet']
df['sentiment_id'] = df['Sentiment_Type'].factorize()[0]
Class_id_df = df[['Sentiment_Type', 'sentiment_id']].drop_duplicates().sort_values('sentiment_id')
Class_to_id = dict(Class_id_df.values)
id_to_Class = dict(Class_id_df[['sentiment_id', 'Sentiment_Type']].values)
df.head(1719)

from sklearn.feature_extraction.text import TfidfVectorizer
df['clean_tweet'] = df['clean_tweet'].values.astype('unicode')
tfidf = TfidfVectorizer(max_df=0.1, sublinear_tf=True, min_df=3, norm='l2', encoding='latin-1', ngram_range=(1, 3), stop_words=None, use_idf=False, lowercase=False)
features = tfidf.fit_transform(df.clean_tweet)#.toarray()
labels = df['sentiment_id']
features.shape
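
# Optional sanity check (not part of the original pipeline): a minimal sketch
# for inspecting what the fitted TF-IDF vectorizer actually learned.
# `vocabulary_` maps each retained n-gram to its column index in `features`.
print('Vocabulary size:', len(tfidf.vocabulary_))
print('Sample n-grams :', list(tfidf.vocabulary_.keys())[:10])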

X_train, X_test, y_train, y_test = train_test_split(features, labels, train_size=0.75, test_size=0.25, random_state=3)

### **Machine Learning**
#SGD
from sklearn.linear_model import SGDClassifier
classifier5 =  SGDClassifier(loss="hinge", penalty="l2", max_iter=5)      
fit5=classifier5.fit(X_train, y_train)
pred5=fit5.predict(X_test)
#-------------------------------------------------------------
print("Accuracy: ", metrics.accuracy_score(y_test,pred5))
print('\n classification report: \n')
print(classification_report(y_test,pred5,digits=4))
#-------------------------------------------------------------
conf_matrix5 = confusion_matrix(y_test,pred5)
conf_matrix5 = conf_matrix5.astype('float') / conf_matrix5.sum(axis=1)[:, np.newaxis]
conf_matrix5 = pd.DataFrame(conf_matrix5, index = ['negative','neutral','positive'],columns = ['negative','neutral','positive'])
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix5, annot=True, annot_kws={"size": 15})
#KNN
classifier1 = KNeighborsClassifier(n_neighbors = 3)
fit1=classifier1.fit(X_train, y_train)
pred1=fit1.predict(X_test)
#-------------------------------------------------------------
print("Accuracy: ", metrics.accuracy_score(y_test,pred1))
cnf_matrix = confusion_matrix(y_test,pred1)
print('\n classification report: \n')
print(classification_report(y_test,pred1,digits=4))
#-------------------------------------------------------------
conf_matrix1 = confusion_matrix(y_test,pred1)
conf_matrix1 = conf_matrix1.astype('float') / conf_matrix1.sum(axis=1)[:, np.newaxis]
conf_matrix1 = pd.DataFrame(conf_matrix1, index = ['negative','neutral','positive'],columns = ['negative','neutral','positive'])
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix1, annot=True, annot_kws={"size": 15})

#SVM
classifier2 =SVC()
fit2=classifier2.fit(X_train, y_train)
pred2 = fit2.predict(X_test)

#------------------------------------------------------------- 
print("Accuracy: ", metrics.accuracy_score(y_test, pred2))
conf_matrix2 = confusion_matrix(y_test, pred2)
print('\n classification report: \n')
print(classification_report(y_test, pred2,digits=4))
conf_matrix2 = confusion_matrix(y_test, pred2)
conf_matrix2 = conf_matrix2.astype('float') / conf_matrix2.sum(axis=1)[:, np.newaxis]
conf_matrix2 = pd.DataFrame(conf_matrix2, index = ['negative','neutral','positive'],columns = ['negative','neutral','positive'])
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix2, annot=True, annot_kws={"size": 15})

#MultinomialNB
fit3 = MultinomialNB(alpha=0.1).fit(X_train, y_train)
pred3=fit3.predict(X_test)
#-------------------------------------------------------------
print("Accuracy: ", metrics.accuracy_score(y_test,pred3))
print('\n classification report: \n')
print(classification_report(y_test,pred3,digits=4))
#-------------------------------------------------------------
conf_matrix3 = confusion_matrix(y_test,pred3)
conf_matrix3 = conf_matrix3.astype('float') / conf_matrix3.sum(axis=1)[:, np.newaxis]
conf_matrix3 = pd.DataFrame(conf_matrix3, index = ['negative','neutral','positive'],columns = ['negative','neutral','positive'])
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix3, annot=True, annot_kws={"size": 15})

#DT
from sklearn.tree import DecisionTreeClassifier
classifier4 = DecisionTreeClassifier(random_state=0)
fit4=classifier4.fit(X_train, y_train)
pred4=fit4.predict(X_test)
#-------------------------------------------------------------
print("Accuracy: ", metrics.accuracy_score(y_test,pred4))
print('\n classification report: \n')
print(classification_report(y_test,pred4,digits=4))
#-------------------------------------------------------------
conf_matrix4 = confusion_matrix(y_test,pred4)
conf_matrix4 = conf_matrix4.astype('float') / conf_matrix4.sum(axis=1)[:, np.newaxis]
conf_matrix4 = pd.DataFrame(conf_matrix4, index = ['negative','neutral','positive'],columns = ['negative','neutral','positive'])
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix4, annot=True, annot_kws={"size": 15})

#LR
from sklearn.linear_model import LogisticRegression
classifier6 = LogisticRegression(random_state=0)
fit6=classifier6.fit(X_train, y_train)
pred6=fit6.predict(X_test)
#-------------------------------------------------------------
print("Accuracy: ", metrics.accuracy_score(y_test,pred6))
print('\n classification report: \n')
print(classification_report(y_test,pred6,digits=4))
#-------------------------------------------------------------
conf_matrix6 = confusion_matrix(y_test,pred6)
conf_matrix6 = conf_matrix6.astype('float') / conf_matrix6.sum(axis=1)[:, np.newaxis]
conf_matrix6 = pd.DataFrame(conf_matrix6, index = ['negative','neutral','positive'],columns = ['negative','neutral','positive'])
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix6, annot=True, annot_kws={"size": 15})
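
# Optional robustness check (a sketch, not in the original flow): cross_val_score
# is imported above but never used. Assuming the same TF-IDF `features` and
# integer `labels` defined earlier, 5-fold cross-validation gives an accuracy
# estimate that depends less on the single 75/25 split.
cv_scores = cross_val_score(LogisticRegression(random_state=0), features, labels, cv=5, scoring='accuracy')
print('Logistic regression 5-fold CV accuracy: %.4f (+/- %.4f)' % (cv_scores.mean(), cv_scores.std()))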

"""### **Deep Learning**"""

data_to_list = df['clean_tweet'].values.tolist()
data = np.array(data_to_list)
y = np.array(df['sentiment_id'])
labels=tf.keras.utils.to_categorical(y, 3,dtype ="int32")
#---------------------------------------------------------------------

tokenizer = Tokenizer()
tokenizer.fit_on_texts(data)

sequences = tokenizer.texts_to_sequences(data)
tweets = pad_sequences(sequences)

X_train, X_test, y_train, y_test = train_test_split(tweets, labels, train_size=0.75, test_size=0.25, random_state=3)

num_words = len(tokenizer.word_index) + 1
print((num_words))
#25906
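
# Optional sketch (not in the original script): how a single cleaned tweet is
# mapped to a padded integer sequence. `tweets.shape[1]` is the padded length
# chosen by pad_sequences above; the indices come from tokenizer.word_index.
sample_seq = tokenizer.texts_to_sequences([data[0]])
print('Raw text :', data[0])
print('Token ids:', sample_seq[0])
print('Padded   :', pad_sequences(sample_seq, maxlen=tweets.shape[1])[0])
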
"""### RNN"""
reset_random_seeds() 
model1 = Sequential()

model1.add(layers.Embedding(num_words, 1000))
model1.add(layers.SimpleRNN(128,dropout=0.9))
model1.add(Dropout(0.09))
model1.add(Dropout(0.05))
model1.add(layers.Dense(3,activation='softmax'))

#-----------------------------------------------------------------------------------------
model1.compile(optimizer='RMSprop',loss='categorical_crossentropy', metrics=['acc'])#RMSprop
#Use a model checkpoint so the best validation-accuracy weights are not lost during training.
checkpoint1 = ModelCheckpoint("best_model1.hdf5", monitor='val_acc', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model1.fit(X_train, y_train, batch_size= 90,epochs=20,validation_data=(X_test, y_test),callbacks=[checkpoint1])

print("Best accuracy")

#Let's load the best model obtained during training
best_model1 = keras.models.load_model("best_model1.hdf5")

test_loss, test_acc = best_model1.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

predictions1 = best_model1.predict(X_test)
auc1 = roc_auc_score(y_test, predictions1)
print(' auc : ',auc1)
#Model accuracy:  0.9519432783126831
#auc :  0.9882903016413923
matrix1 = confusion_matrix(y_test.argmax(axis=1), np.around(predictions1, decimals=0).argmax(axis=1))
conf_matrix1 = pd.DataFrame(matrix1, index=['Neutral', 'Negative', 'Positive'], columns=['Neutral', 'Negative', 'Positive'])

#Normalizing
conf_matrix1 = conf_matrix1.astype('float') / conf_matrix1.sum(axis=1)[:, np.newaxis]
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix1, annot=True, annot_kws={"size": 15})

pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('Cross Entropy')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='val')
pyplot.legend()
pyplot.show()
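
# Optional sketch (assumes the RNN cell above has just been run): per-class ROC
# curves for the three sentiments, using roc_curve, which is imported above but
# otherwise unused. Column i of the one-hot y_test corresponds to sentiment_id i,
# so Class_id_df supplies the matching class names.
pyplot.figure(figsize=(8, 6))
for i, name in enumerate(Class_id_df['Sentiment_Type'].values):
    fpr, tpr, _ = roc_curve(y_test[:, i], predictions1[:, i])
    pyplot.plot(fpr, tpr, label='%s (AUC = %.3f)' % (name, auc(fpr, tpr)))
pyplot.plot([0, 1], [0, 1], linestyle='--', color='grey')
pyplot.xlabel('False positive rate')
pyplot.ylabel('True positive rate')
pyplot.legend()
pyplot.show()
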
"""### GRU"""
reset_random_seeds() 
model2 = Sequential()

model2.add(layers.Embedding(num_words, 1000))
model2.add(layers.GRU(128,dropout=0.9))
model2.add(Dropout(0.09))
model2.add(Dropout(0.05))
model2.add(layers.Dense(3,activation='softmax'))

#-----------------------------------------------------------------------------------------
model2.compile(optimizer='RMSprop',loss='categorical_crossentropy', metrics=['acc'])#RMSprop
#Use a model checkpoint so the best validation-accuracy weights are not lost during training.
checkpoint1 = ModelCheckpoint("best_model2.hdf5", monitor='val_acc', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model2.fit(X_train, y_train, batch_size= 90,epochs=20,validation_data=(X_test, y_test),callbacks=[checkpoint1])
print("Best accuracy")

#Let's load the best model obtained during training
best_model2 = keras.models.load_model("best_model2.hdf5")

test_loss, test_acc = best_model2.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

predictions2 = best_model2.predict(X_test)
auc2 = roc_auc_score(y_test, predictions2)
print(' auc : ',auc2)
#Model accuracy:  0.9554663896560669
#auc :  0.991093158015676

pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('Cross Entropy')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='val')
pyplot.legend()
pyplot.show()

matrix2 = confusion_matrix(y_test.argmax(axis=1), np.around(predictions2, decimals=0).argmax(axis=1))
conf_matrix2 = pd.DataFrame(matrix2, index=['Neutral', 'Negative', 'Positive'], columns=['Neutral', 'Negative', 'Positive'])

#Normalizing
conf_matrix2 = conf_matrix2.astype('float') / conf_matrix2.sum(axis=1)[:, np.newaxis]
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix2, annot=True, annot_kws={"size": 15})

"""### LSTM"""

reset_random_seeds() 
model3 = Sequential()

model3.add(layers.Embedding(num_words, 1000))
model3.add(layers.LSTM(128,dropout=0.9))
model3.add(Dropout(0.09))
model3.add(Dropout(0.05))
model3.add(layers.Dense(3,activation='softmax'))

#-----------------------------------------------------------------------------------------
model3.compile(optimizer='RMSprop',loss='categorical_crossentropy', metrics=['acc'])#RMSprop
#Use a model checkpoint so the best validation-accuracy weights are not lost during training.
checkpoint1 = ModelCheckpoint("best_model3.hdf5", monitor='val_acc', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model3.fit(X_train, y_train, batch_size= 90,epochs=20,validation_data=(X_test, y_test),callbacks=[checkpoint1])

print("Best accuracy")

#Let's load the best model obtained during training
best_model3 = keras.models.load_model("best_model3.hdf5")

test_loss, test_acc = best_model3.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

predictions3 = best_model3.predict(X_test)
auc3 = roc_auc_score(y_test, predictions3)
print(' auc : ',auc3)

matrix3 = confusion_matrix(y_test.argmax(axis=1), np.around(predictions3, decimals=0).argmax(axis=1))
conf_matrix3 = pd.DataFrame(matrix3, index=['Neutral', 'Negative', 'Positive'], columns=['Neutral', 'Negative', 'Positive'])

#Normalizing
conf_matrix3 = conf_matrix3.astype('float') / conf_matrix3.sum(axis=1)[:, np.newaxis]
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix3, annot=True, annot_kws={"size": 15})

pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('Cross Entropy')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='val')
pyplot.legend()
pyplot.show()"""### LSTM"""

reset_random_seeds() 
model3 = Sequential()

model3.add(layers.Embedding(num_words, 1000))
model3.add(layers.LSTM(128,dropout=0.9))
model3.add(Dropout(0.09))
model3.add(Dropout(0.05))
model3.add(layers.Dense(3,activation='softmax'))

#-----------------------------------------------------------------------------------------
model3.compile(optimizer='RMSprop',loss='categorical_crossentropy', metrics=['acc'])#RMSprop
#Implementing model checkpoins to save the best metric and do not lose it on training.
checkpoint1 = ModelCheckpoint("best_model3.hdf5", monitor='val_acc', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model3.fit(X_train, y_train, batch_size= 90,epochs=20,validation_data=(X_test, y_test),callbacks=[checkpoint1])

print("Best accuracy")

#Let's load the best model obtained during training
best_model3 = keras.models.load_model("best_model3.hdf5")

test_loss, test_acc = best_model3.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

predictions3 = best_model3.predict(X_test)
auc3 = roc_auc_score(y_test, predictions3)
print(' auc : ',auc3)

matrix3 = confusion_matrix(y_test.argmax(axis=1), np.around(predictions3, decimals=0).argmax(axis=1))
conf_matrix3 = pd.DataFrame(matrix3, index = ['Neutral','Negative','Positive'],columns = ['Neutral','Neative','Positive'])

#Normalizing
conf_matrix3 = conf_matrix3.astype('float') / conf_matrix3.sum(axis=1)[:, np.newaxis]
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix3, annot=True, annot_kws={"size": 15})

pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('Cross Entropy')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='val')
pyplot.legend()
pyplot.show()

"""### RNN-GRU"""

reset_random_seeds()
model4 = Sequential()
model4.add(layers.Embedding(num_words, 1000))
model4.add(layers.SimpleRNN(128,return_sequences=True,dropout=0.9))
model4.add(layers.GRU(128,dropout=0.09))
model4.add(Dropout(0.05))

model4.add(layers.Dense(3,activation='softmax'))

#-------------------------------------------------------------
model4.compile(optimizer='RMSprop',loss='categorical_crossentropy', metrics=['acc'])
#Use a model checkpoint so the best validation-accuracy weights are not lost during training.
checkpoint1 = ModelCheckpoint("best_model4.hdf5", monitor='val_acc', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model4.fit(X_train, y_train, batch_size=90 ,epochs=20,validation_data=(X_test, y_test),callbacks=[checkpoint1])
print("Best accuracy")

#Let's load the best model obtained during training
best_model4 = keras.models.load_model("best_model4.hdf5")

test_loss, test_acc = best_model4.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

predictions4 = best_model4.predict(X_test)
auc4 = roc_auc_score(y_test, predictions4)
print(' auc : ',auc4)

#confusion matrix
matrix4 = confusion_matrix(y_test.argmax(axis=1), np.around(predictions4, decimals=0).argmax(axis=1))
conf_matrix4 = pd.DataFrame(matrix4, index=['Neutral', 'Negative', 'Positive'], columns=['Neutral', 'Negative', 'Positive'])

#Normalizing
conf_matrix4 = conf_matrix4.astype('float') / conf_matrix4.sum(axis=1)[:, np.newaxis]
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix4, annot=True, annot_kws={"size": 15})

pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('Cross Entropy')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='val')
pyplot.legend()
pyplot.show()

"""### RNN-LSTM"""

reset_random_seeds()
model5 = Sequential()
model5.add(layers.Embedding(num_words, 1000))
model5.add(layers.SimpleRNN(128,return_sequences=True,dropout=0.9))
model5.add(layers.LSTM(128,dropout=0.09))
model5.add(Dropout(0.05))

model5.add(layers.Dense(3,activation='softmax'))

#-------------------------------------------------------------
model5.compile(optimizer='RMSprop',loss='categorical_crossentropy', metrics=['acc'])
#Use a model checkpoint so the best validation-accuracy weights are not lost during training.
checkpoint1 = ModelCheckpoint("best_model5.hdf5", monitor='val_acc', verbose=1,save_best_only=True, mode='auto', period=1,save_weights_only=False)
history = model5.fit(X_train, y_train, batch_size=90 ,epochs=20,validation_data=(X_test, y_test),callbacks=[checkpoint1])

print("Best accuracy")

#Let's load the best model obtained during training
best_model5 = keras.models.load_model("best_model5.hdf5")

test_loss, test_acc = best_model5.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

predictions5 = best_model5.predict(X_test)
auc5 = roc_auc_score(y_test, predictions5)
print(' auc : ',auc5)

matrix5 = confusion_matrix(y_test.argmax(axis=1), np.around(predictions5, decimals=0).argmax(axis=1))
conf_matrix5 = pd.DataFrame(matrix5, index=['Neutral', 'Negative', 'Positive'], columns=['Neutral', 'Negative', 'Positive'])

#Normalizing
conf_matrix5 = conf_matrix5.astype('float')/ conf_matrix5.sum(axis=1)[:, np.newaxis]
plt.figure(figsize = (8,6))
sns.heatmap(conf_matrix5, annot=True, annot_kws={"size": 15})


pyplot.title('Learning Curves')
pyplot.xlabel('Epoch')
pyplot.ylabel('Cross Entropy')
pyplot.plot(history.history['loss'], label='train')
pyplot.plot(history.history['val_loss'], label='val')
pyplot.legend()
pyplot.show()
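
# Optional summary sketch (assumes all five best_model* checkpoints above were
# trained and loaded in this session): collect test accuracy and macro ROC AUC
# of the deep models into one DataFrame for a side-by-side comparison.
dl_models = {'RNN': best_model1, 'GRU': best_model2, 'LSTM': best_model3,
             'RNN-GRU': best_model4, 'RNN-LSTM': best_model5}
summary_rows = []
for name, m in dl_models.items():
    loss, acc = m.evaluate(X_test, y_test, verbose=0)
    summary_rows.append({'model': name, 'test_acc': acc,
                         'roc_auc': roc_auc_score(y_test, m.predict(X_test))})
print(pd.DataFrame(summary_rows))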




"""# **End code**"""