{"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.10","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":1085454,"sourceType":"datasetVersion","datasetId":605165},{"sourceId":4159888,"sourceType":"datasetVersion","datasetId":2455636},{"sourceId":5415383,"sourceType":"datasetVersion","datasetId":3135989},{"sourceId":5987603,"sourceType":"datasetVersion","datasetId":3431519},{"sourceId":6001808,"sourceType":"datasetVersion","datasetId":3437635},{"sourceId":6503103,"sourceType":"datasetVersion","datasetId":3759264}],"dockerImageVersionId":30513,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n for filename in filenames:\n print(os.path.join(dirname, filename))\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-10-04T11:14:27.375578Z","iopub.execute_input":"2023-10-04T11:14:27.375982Z","iopub.status.idle":"2023-10-04T11:14:27.430090Z","shell.execute_reply.started":"2023-10-04T11:14:27.375943Z","shell.execute_reply":"2023-10-04T11:14:27.428768Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import warnings\nwarnings.filterwarnings('ignore')\n%config Completer.use_jedi = False # if autocompletion doesnot work in kaggle notebook | hit tab","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:14:30.831257Z","iopub.execute_input":"2023-10-04T11:14:30.831613Z","iopub.status.idle":"2023-10-04T11:14:30.839281Z","shell.execute_reply.started":"2023-10-04T11:14:30.831587Z","shell.execute_reply":"2023-10-04T11:14:30.838101Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# importing the dataset \ndf_train = pd.read_csv('../input/emotions-dataset-for-nlp/train.txt', header =None, sep =';', names = ['Input','Sentiment'], encoding='utf-8')\ndf_test = pd.read_csv('../input/emotions-dataset-for-nlp/test.txt', header = None, sep =';', names = 
['Input','Sentiment'],encoding='utf-8')\ndf_val=pd.read_csv('../input/emotions-dataset-for-nlp/val.txt',header=None,sep=';',names=['Input','Sentiment'],encoding='utf-8')","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:14:47.183393Z","iopub.execute_input":"2023-10-04T11:14:47.183789Z","iopub.status.idle":"2023-10-04T11:14:47.300229Z","shell.execute_reply.started":"2023-10-04T11:14:47.183760Z","shell.execute_reply":"2023-10-04T11:14:47.299274Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_train.Sentiment.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:15:49.563531Z","iopub.execute_input":"2023-10-04T11:15:49.563918Z","iopub.status.idle":"2023-10-04T11:15:49.585813Z","shell.execute_reply.started":"2023-10-04T11:15:49.563873Z","shell.execute_reply":"2023-10-04T11:15:49.585028Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_val.Sentiment.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:15:50.673469Z","iopub.execute_input":"2023-10-04T11:15:50.673834Z","iopub.status.idle":"2023-10-04T11:15:50.682505Z","shell.execute_reply.started":"2023-10-04T11:15:50.673803Z","shell.execute_reply":"2023-10-04T11:15:50.681373Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_test.Sentiment.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:15:51.554645Z","iopub.execute_input":"2023-10-04T11:15:51.555090Z","iopub.status.idle":"2023-10-04T11:15:51.563954Z","shell.execute_reply.started":"2023-10-04T11:15:51.555057Z","shell.execute_reply":"2023-10-04T11:15:51.562786Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_train.Sentiment.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:15:52.418102Z","iopub.execute_input":"2023-10-04T11:15:52.418504Z","iopub.status.idle":"2023-10-04T11:15:52.428172Z","shell.execute_reply.started":"2023-10-04T11:15:52.418473Z","shell.execute_reply":"2023-10-04T11:15:52.427125Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_train.head()","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:15:53.598764Z","iopub.execute_input":"2023-10-04T11:15:53.599258Z","iopub.status.idle":"2023-10-04T11:15:53.625099Z","shell.execute_reply.started":"2023-10-04T11:15:53.599219Z","shell.execute_reply":"2023-10-04T11:15:53.624094Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(df_train.shape, df_test.shape, df_val.shape)","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:15:55.033887Z","iopub.execute_input":"2023-10-04T11:15:55.035135Z","iopub.status.idle":"2023-10-04T11:15:55.041888Z","shell.execute_reply.started":"2023-10-04T11:15:55.035084Z","shell.execute_reply":"2023-10-04T11:15:55.040682Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import matplotlib.pyplot as plt\nimport seaborn as sns\nsns.set()\n%config InlineBackend.figure_format = 'retina'","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:15:56.719165Z","iopub.execute_input":"2023-10-04T11:15:56.719555Z","iopub.status.idle":"2023-10-04T11:15:57.551056Z","shell.execute_reply.started":"2023-10-04T11:15:56.719525Z","shell.execute_reply":"2023-10-04T11:15:57.550115Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_train['Length'] = df_train.Input.apply(lambda 
x:len(x))","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:15:58.559234Z","iopub.execute_input":"2023-10-04T11:15:58.560105Z","iopub.status.idle":"2023-10-04T11:15:58.573123Z","shell.execute_reply.started":"2023-10-04T11:15:58.560068Z","shell.execute_reply":"2023-10-04T11:15:58.571971Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"plt.plot(df_train.Length)\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:16:00.002695Z","iopub.execute_input":"2023-10-04T11:16:00.003100Z","iopub.status.idle":"2023-10-04T11:16:01.164235Z","shell.execute_reply.started":"2023-10-04T11:16:00.003069Z","shell.execute_reply":"2023-10-04T11:16:01.163236Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_train.Length.max() # max length of our text body ","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:16:22.550799Z","iopub.execute_input":"2023-10-04T11:16:22.551215Z","iopub.status.idle":"2023-10-04T11:16:22.559483Z","shell.execute_reply.started":"2023-10-04T11:16:22.551183Z","shell.execute_reply":"2023-10-04T11:16:22.558293Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# i'm using a text preprocessing library for this \n!pip install text_hammer \nimport text_hammer as th","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:16:23.781043Z","iopub.execute_input":"2023-10-04T11:16:23.781423Z","iopub.status.idle":"2023-10-04T11:16:53.480566Z","shell.execute_reply.started":"2023-10-04T11:16:23.781397Z","shell.execute_reply":"2023-10-04T11:16:53.479146Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"%%time\n\nfrom tqdm._tqdm_notebook import tqdm_notebook\ntqdm_notebook.pandas()\n\ndef text_preprocessing(df,col_name):\n column = col_name\n df[column] = df[column].progress_apply(lambda x:str(x).lower())\n df[column] = df[column].progress_apply(lambda x: th.cont_exp(x)) #you're -> you are; i'm -> i am\n df[column] = df[column].progress_apply(lambda x: th.remove_emails(x))\n df[column] = df[column].progress_apply(lambda x: th.remove_html_tags(x))\n# df[column] = df[column].progress_apply(lambda x: ps.remove_stopwords(x)) \n# here we can remove stop-words but in this case removing not, and ,can change the meaning of context \n\n df[column] = df[column].progress_apply(lambda x: th.remove_special_chars(x))\n df[column] = df[column].progress_apply(lambda x: th.remove_accented_chars(x))\n df[column] = df[column].progress_apply(lambda x: th.make_base(x)) #ran -> run,\n return(df)","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:17:03.414345Z","iopub.execute_input":"2023-10-04T11:17:03.415151Z","iopub.status.idle":"2023-10-04T11:17:03.427118Z","shell.execute_reply.started":"2023-10-04T11:17:03.415116Z","shell.execute_reply":"2023-10-04T11:17:03.425652Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_cleaned_train = text_preprocessing(df_train, 
'Input')","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:17:05.087336Z","iopub.execute_input":"2023-10-04T11:17:05.088416Z","iopub.status.idle":"2023-10-04T11:19:54.109542Z","shell.execute_reply.started":"2023-10-04T11:17:05.088375Z","shell.execute_reply":"2023-10-04T11:19:54.108252Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_cleaned_train.head()","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:19:56.982667Z","iopub.execute_input":"2023-10-04T11:19:56.983064Z","iopub.status.idle":"2023-10-04T11:19:56.994155Z","shell.execute_reply.started":"2023-10-04T11:19:56.983035Z","shell.execute_reply":"2023-10-04T11:19:56.992786Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_cleaned_train['Sentiment']=df_cleaned_train.Sentiment.replace({'joy':0,'anger':1,'love':2,'sadness':3,'fear':4,'surprise':5})\ndf_test['Sentiment']=df_test.Sentiment.replace({'joy':0,'anger':1,'love':2,'sadness':3,'fear':4,'surprise':5})\ndf_val['Sentiment']=df_val.Sentiment.replace({'joy':0,'anger':1,'love':2,'sadness':3,'fear':4,'surprise':5})","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:19:58.082639Z","iopub.execute_input":"2023-10-04T11:19:58.083405Z","iopub.status.idle":"2023-10-04T11:19:58.106101Z","shell.execute_reply.started":"2023-10-04T11:19:58.083344Z","shell.execute_reply":"2023-10-04T11:19:58.105083Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from keras.preprocessing.text import Tokenizer","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:19:59.185511Z","iopub.execute_input":"2023-10-04T11:19:59.186212Z","iopub.status.idle":"2023-10-04T11:19:59.190956Z","shell.execute_reply.started":"2023-10-04T11:19:59.186177Z","shell.execute_reply":"2023-10-04T11:19:59.189386Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"num_words = 10000 # this means 10000 unique words can be taken \ntokenizer=Tokenizer(num_words,lower=True)\ndf_total = pd.concat([df_cleaned_train['Input'], df_test.Input], axis = 0)\ntokenizer.fit_on_texts(df_total)","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:20:00.401656Z","iopub.execute_input":"2023-10-04T11:20:00.402134Z","iopub.status.idle":"2023-10-04T11:20:00.798687Z","shell.execute_reply.started":"2023-10-04T11:20:00.402105Z","shell.execute_reply":"2023-10-04T11:20:00.797409Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"len(tokenizer.word_index) # this is whole unique words in our corpus\n# but we are taking only 10000 words in our model","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:20:01.988330Z","iopub.execute_input":"2023-10-04T11:20:01.988737Z","iopub.status.idle":"2023-10-04T11:20:01.996597Z","shell.execute_reply.started":"2023-10-04T11:20:01.988706Z","shell.execute_reply":"2023-10-04T11:20:01.994396Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#from keras.preprocessing.sequence import pad_sequences\nfrom keras.utils import pad_sequences\nX_train=tokenizer.texts_to_sequences(df_cleaned_train['Input']) # this converts texts into some numeric sequences \nX_train_pad=pad_sequences(X_train,maxlen=300,padding='post') # this makes the length of all numeric sequences equal \nX_test = tokenizer.texts_to_sequences(df_test.Input)\nX_test_pad = pad_sequences(X_test, maxlen = 300, padding = 'post')\nX_val = tokenizer.texts_to_sequences(df_val.Input)\nX_val_pad = pad_sequences(X_val, 
maxlen = 300, padding = 'post')","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:20:03.041730Z","iopub.execute_input":"2023-10-04T11:20:03.042142Z","iopub.status.idle":"2023-10-04T11:20:03.788514Z","shell.execute_reply.started":"2023-10-04T11:20:03.042115Z","shell.execute_reply":"2023-10-04T11:20:03.787050Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from keras.utils import to_categorical\ny_train = to_categorical(df_cleaned_train.Sentiment.values)\ny_test = to_categorical(df_test.Sentiment.values)\ny_val = to_categorical(df_val.Sentiment.values)","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:20:05.537196Z","iopub.execute_input":"2023-10-04T11:20:05.537830Z","iopub.status.idle":"2023-10-04T11:20:05.545390Z","shell.execute_reply.started":"2023-10-04T11:20:05.537775Z","shell.execute_reply":"2023-10-04T11:20:05.543829Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(X_train_pad.shape, X_val_pad.shape)","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:20:06.799817Z","iopub.execute_input":"2023-10-04T11:20:06.800240Z","iopub.status.idle":"2023-10-04T11:20:06.805895Z","shell.execute_reply.started":"2023-10-04T11:20:06.800210Z","shell.execute_reply":"2023-10-04T11:20:06.804998Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import gensim.downloader as api\nglove_gensim = api.load('glove-wiki-gigaword-100') # 100-dimensional GloVe vectors\n# higher-dimensional vectors capture more semantic detail, but they take longer to download and train with","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:20:07.781084Z","iopub.execute_input":"2023-10-04T11:20:07.781512Z","iopub.status.idle":"2023-10-04T11:20:58.182734Z","shell.execute_reply.started":"2023-10-04T11:20:07.781480Z","shell.execute_reply":"2023-10-04T11:20:58.181331Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"glove_gensim['cat'].shape[0]","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:03.677380Z","iopub.execute_input":"2023-10-04T11:21:03.677783Z","iopub.status.idle":"2023-10-04T11:21:03.685730Z","shell.execute_reply.started":"2023-10-04T11:21:03.677757Z","shell.execute_reply":"2023-10-04T11:21:03.684468Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# build the embedding matrix: row i holds the GloVe vector of the word with tokenizer index i\n# (rows stay all-zero for words outside the top num_words or missing from GloVe)\nvector_size = 100\ngensim_weight_matrix = np.zeros((num_words, vector_size))\ngensim_weight_matrix.shape\n\nfor word, index in tokenizer.word_index.items():\n    if index < num_words:\n        if word in glove_gensim:\n            gensim_weight_matrix[index] = glove_gensim[word]\n        else:\n            gensim_weight_matrix[index] = np.zeros(vector_size)\n","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:04.687723Z","iopub.execute_input":"2023-10-04T11:21:04.688103Z","iopub.status.idle":"2023-10-04T11:21:04.728653Z","shell.execute_reply.started":"2023-10-04T11:21:04.688076Z","shell.execute_reply":"2023-10-04T11:21:04.727250Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"gensim_weight_matrix.shape","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:06.133313Z","iopub.execute_input":"2023-10-04T11:21:06.133687Z","iopub.status.idle":"2023-10-04T11:21:06.142286Z","shell.execute_reply.started":"2023-10-04T11:21:06.133660Z","shell.execute_reply":"2023-10-04T11:21:06.141007Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"'''\nimport os\nos.environ[\"CUDA_VISIBLE_DEVICES\"] = \"\"\n\nfrom tensorflow.keras.models 
import Sequential\nfrom tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Dropout\nfrom tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n\nEMBEDDING_DIM = 100\n\nmodel = Sequential()\nmodel.add(Embedding(input_dim=num_words, output_dim=EMBEDDING_DIM, input_length=X_train_pad.shape[1],\n weights=[gensim_weight_matrix], trainable=False))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(100, return_sequences=True)))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(200, return_sequences=True)))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(100, return_sequences=False)))\nmodel.add(Dense(6, activation='softmax'))\n\nmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n\nes = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)\nmc = ModelCheckpoint('./model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)\n\nhistory_embedding = model.fit(X_train_pad, y_train, epochs=25, batch_size=120, validation_data=(X_val_pad, y_val),\n verbose=1, callbacks=[es, mc])\n'''","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:07.342678Z","iopub.execute_input":"2023-10-04T11:21:07.343381Z","iopub.status.idle":"2023-10-04T11:21:07.352309Z","shell.execute_reply.started":"2023-10-04T11:21:07.343314Z","shell.execute_reply":"2023-10-04T11:21:07.351082Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"'''\nplt.plot(history_embedding.history['accuracy'],c='b',label='train accuracy')\nplt.plot(history_embedding.history['val_accuracy'],c='r',label='validation accuracy')\nplt.legend(loc='lower right')\nplt.show()\n'''","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:08.326298Z","iopub.execute_input":"2023-10-04T11:21:08.327127Z","iopub.status.idle":"2023-10-04T11:21:08.336315Z","shell.execute_reply.started":"2023-10-04T11:21:08.327079Z","shell.execute_reply":"2023-10-04T11:21:08.334779Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#model.evaluate(X_test_pad, y_test) ","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:09.221184Z","iopub.execute_input":"2023-10-04T11:21:09.221845Z","iopub.status.idle":"2023-10-04T11:21:09.225874Z","shell.execute_reply.started":"2023-10-04T11:21:09.221808Z","shell.execute_reply":"2023-10-04T11:21:09.225024Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#y_pred = np.argmax(model.predict(X_test_pad), axis = 1)","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:10.081246Z","iopub.execute_input":"2023-10-04T11:21:10.082162Z","iopub.status.idle":"2023-10-04T11:21:10.088206Z","shell.execute_reply.started":"2023-10-04T11:21:10.082088Z","shell.execute_reply":"2023-10-04T11:21:10.086595Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#y_true = np.argmax(y_test, axis = 1)","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:10.609492Z","iopub.execute_input":"2023-10-04T11:21:10.610193Z","iopub.status.idle":"2023-10-04T11:21:10.615156Z","shell.execute_reply.started":"2023-10-04T11:21:10.610136Z","shell.execute_reply":"2023-10-04T11:21:10.613767Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#from sklearn import metrics\n#print(metrics.classification_report(y_pred, 
y_true))","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:11.137572Z","iopub.execute_input":"2023-10-04T11:21:11.138023Z","iopub.status.idle":"2023-10-04T11:21:11.142695Z","shell.execute_reply.started":"2023-10-04T11:21:11.137992Z","shell.execute_reply":"2023-10-04T11:21:11.141619Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"'''\ndef get_key(value):\n dictionary={'joy':0,'anger':1,'love':2,'sadness':3,'fear':4,'surprise':5}\n for key,val in dictionary.items():\n if (val==value):\n return key\n'''","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:11.669099Z","iopub.execute_input":"2023-10-04T11:21:11.669972Z","iopub.status.idle":"2023-10-04T11:21:11.676311Z","shell.execute_reply.started":"2023-10-04T11:21:11.669887Z","shell.execute_reply":"2023-10-04T11:21:11.675292Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"'''\ndef predict(sentence):\n sentence_lst=[]\n sentence_lst.append(sentence)\n sentence_seq=tokenizer.texts_to_sequences(sentence_lst)\n sentence_padded=pad_sequences(sentence_seq,maxlen=300,padding='post')\n ans=get_key(model.predict_classes(sentence_padded))\n print(\"The emotion predicted is\",ans)\n \n'''","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:12.391427Z","iopub.execute_input":"2023-10-04T11:21:12.392245Z","iopub.status.idle":"2023-10-04T11:21:12.400306Z","shell.execute_reply.started":"2023-10-04T11:21:12.392207Z","shell.execute_reply":"2023-10-04T11:21:12.398808Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"extra kaam","metadata":{}},{"cell_type":"code","source":"from tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Dropout\nfrom tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint\n\nEMBEDDING_DIM = 100\n\nmodel = Sequential()\nmodel.add(Embedding(input_dim=num_words, output_dim=EMBEDDING_DIM, input_length=X_train_pad.shape[1],\n weights=[gensim_weight_matrix], trainable=False))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(100, return_sequences=True)))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(200, return_sequences=True)))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(100, return_sequences=False)))\nmodel.add(Dense(6, activation='softmax'))\n\nmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n\nes = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)\nmc = ModelCheckpoint('./best_model_weights.h5', monitor='val_accuracy', mode='max', verbose=1,\n save_best_only=True)\n\nhistory_embedding = model.fit(X_train_pad, y_train, epochs=35, batch_size=120, validation_data=(X_val_pad, y_val),\n verbose=1, callbacks=[es, mc])\n\n","metadata":{"execution":{"iopub.status.busy":"2023-10-04T11:21:14.438209Z","iopub.execute_input":"2023-10-04T11:21:14.438619Z","iopub.status.idle":"2023-10-04T14:28:12.724849Z","shell.execute_reply.started":"2023-10-04T11:21:14.438591Z","shell.execute_reply":"2023-10-04T14:28:12.724035Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"plt.plot(history_embedding.history['accuracy'],c='b',label='train accuracy')\nplt.plot(history_embedding.history['val_accuracy'],c='r',label='validation accuracy')\nplt.legend(loc='lower 
{"cell_type":"code","source":"model.evaluate(X_test_pad, y_test) ","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:44:49.171194Z","iopub.execute_input":"2023-06-23T09:44:49.171627Z","iopub.status.idle":"2023-06-23T09:45:30.196535Z","shell.execute_reply.started":"2023-06-23T09:44:49.171592Z","shell.execute_reply":"2023-06-23T09:45:30.195438Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"y_pred = np.argmax(model.predict(X_test_pad), axis = 1)","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:46:48.348142Z","iopub.execute_input":"2023-06-23T09:46:48.348547Z","iopub.status.idle":"2023-06-23T09:47:21.684649Z","shell.execute_reply.started":"2023-06-23T09:46:48.348518Z","shell.execute_reply":"2023-06-23T09:47:21.683465Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"y_true = np.argmax(y_test, axis = 1)","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:47:38.596102Z","iopub.execute_input":"2023-06-23T09:47:38.596535Z","iopub.status.idle":"2023-06-23T09:47:38.602810Z","shell.execute_reply.started":"2023-06-23T09:47:38.596501Z","shell.execute_reply":"2023-06-23T09:47:38.601275Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn import metrics\n# classification_report expects the true labels first, then the predictions\nprint(metrics.classification_report(y_true, y_pred))","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:47:42.509860Z","iopub.execute_input":"2023-06-23T09:47:42.510283Z","iopub.status.idle":"2023-06-23T09:47:42.530049Z","shell.execute_reply.started":"2023-06-23T09:47:42.510250Z","shell.execute_reply":"2023-06-23T09:47:42.529247Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def get_key(value):\n    dictionary={'joy':0,'anger':1,'love':2,'sadness':3,'fear':4,'surprise':5}\n    for key,val in dictionary.items():\n        if (val==value):\n            return key","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:47:45.297401Z","iopub.execute_input":"2023-06-23T09:47:45.298305Z","iopub.status.idle":"2023-06-23T09:47:45.305272Z","shell.execute_reply.started":"2023-06-23T09:47:45.298263Z","shell.execute_reply":"2023-06-23T09:47:45.304016Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Dropout\nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\nimport numpy as np\n\n# Assuming you have your test data X_test_pad\n\nEMBEDDING_DIM = 100\nnum_words = 10000 # Example value, adjust according to your data\n\n# Define the model architecture\nmodel = Sequential()\nmodel.add(Embedding(input_dim=num_words, output_dim=EMBEDDING_DIM, input_length=X_test_pad.shape[1]))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(100, return_sequences=True)))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(200, return_sequences=True)))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(100, return_sequences=False)))\nmodel.add(Dense(6, activation='softmax'))\n\n# Load the saved model weights\nmodel.load_weights('./best_model_weights.h5')\n\n# Compile the 
model (if needed)\nmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n\n# Predict sentiment on the test dataset\npredictions = model.predict(X_test_pad)\n\n# Get the predicted sentiment label for each prediction\nsentiment_labels = np.argmax(predictions, axis=1)\n\nlabel_to_sentiment = {\n 0: \"joy\",\n 1: \"anger\",\n 2: \"love\",\n 3: \"sadness\",\n 4: \"fear\",\n 5: \"surprise\"\n}\n# Convert sentiment labels to actual sentiments (assuming you have a label-to-sentiment mapping)\nsentiments = [label_to_sentiment[label] for label in sentiment_labels]\n\n# Print the predicted sentiments\n#print(sentiments)\n","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:47:49.174088Z","iopub.execute_input":"2023-06-23T09:47:49.174535Z","iopub.status.idle":"2023-06-23T09:48:24.160515Z","shell.execute_reply.started":"2023-06-23T09:47:49.174502Z","shell.execute_reply":"2023-06-23T09:48:24.159302Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import pandas as pd\n\n# Assuming you have a DataFrame called 'test_df' with the test data and labels\n\n# Create a new column 'Predicted Label' in the DataFrame\ndf_test['Predicted Label'] = sentiment_labels\n\n# Compare the predicted labels with the true labels\ndf_test['Prediction Correct'] = df_test['Predicted Label'] == df_test['Sentiment']\n\n# Print the DataFrame to see the results\nprint(df_test)\n","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:48:34.677623Z","iopub.execute_input":"2023-06-23T09:48:34.677996Z","iopub.status.idle":"2023-06-23T09:48:34.692913Z","shell.execute_reply.started":"2023-06-23T09:48:34.677966Z","shell.execute_reply":"2023-06-23T09:48:34.691613Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from sklearn.metrics import accuracy_score\n\n# Calculate the accuracy\naccuracy = accuracy_score(df_test['Sentiment'], df_test['Predicted Label'])\n\n# Print the accuracy\nprint(\"Accuracy: {:.2f}%\".format(accuracy * 100))\n","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:48:38.348303Z","iopub.execute_input":"2023-06-23T09:48:38.348696Z","iopub.status.idle":"2023-06-23T09:48:38.357372Z","shell.execute_reply.started":"2023-06-23T09:48:38.348666Z","shell.execute_reply":"2023-06-23T09:48:38.356110Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"len(sentiments)","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:48:40.383289Z","iopub.execute_input":"2023-06-23T09:48:40.383660Z","iopub.status.idle":"2023-06-23T09:48:40.391509Z","shell.execute_reply.started":"2023-06-23T09:48:40.383632Z","shell.execute_reply":"2023-06-23T09:48:40.390377Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"#now trying to get sentiments/emotions on our unlabelled pheme dataset\n\ndf_pheme = 
pd.read_csv('/kaggle/input/pheme-dataset-for-rumour-detection/dataset.csv')","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:48:53.418287Z","iopub.execute_input":"2023-06-23T09:48:53.418681Z","iopub.status.idle":"2023-06-23T09:48:53.681165Z","shell.execute_reply.started":"2023-06-23T09:48:53.418653Z","shell.execute_reply":"2023-06-23T09:48:53.680296Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_pheme","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:48:55.377023Z","iopub.execute_input":"2023-06-23T09:48:55.377425Z","iopub.status.idle":"2023-06-23T09:48:55.398303Z","shell.execute_reply.started":"2023-06-23T09:48:55.377393Z","shell.execute_reply":"2023-06-23T09:48:55.397128Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"\n# Create New DataFrame of Specific column by DataFrame.assign() method.\ndf4 = pd.DataFrame().assign(Input=df_pheme['text'], label=df_pheme['is_rumor'])\nprint(df4)\n","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:51:36.238138Z","iopub.execute_input":"2023-06-23T09:51:36.238578Z","iopub.status.idle":"2023-06-23T09:51:36.253575Z","shell.execute_reply.started":"2023-06-23T09:51:36.238544Z","shell.execute_reply":"2023-06-23T09:51:36.252281Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df4['Length'] = df4.Input.apply(lambda x:len(x))","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:51:53.877895Z","iopub.execute_input":"2023-06-23T09:51:53.878312Z","iopub.status.idle":"2023-06-23T09:51:53.885681Z","shell.execute_reply.started":"2023-06-23T09:51:53.878278Z","shell.execute_reply":"2023-06-23T09:51:53.884721Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"plt.plot(df4.Length)\nplt.show()","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:51:58.672750Z","iopub.execute_input":"2023-06-23T09:51:58.673158Z","iopub.status.idle":"2023-06-23T09:51:59.163009Z","shell.execute_reply.started":"2023-06-23T09:51:58.673127Z","shell.execute_reply":"2023-06-23T09:51:59.161860Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df4.Length.max() # max length of our text body ","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:52:07.277126Z","iopub.execute_input":"2023-06-23T09:52:07.277576Z","iopub.status.idle":"2023-06-23T09:52:07.285657Z","shell.execute_reply.started":"2023-06-23T09:52:07.277539Z","shell.execute_reply":"2023-06-23T09:52:07.284399Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_cleaned_train_4 = text_preprocessing(df4, 'Input')","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:52:53.686513Z","iopub.execute_input":"2023-06-23T09:52:53.686908Z","iopub.status.idle":"2023-06-23T09:53:02.993451Z","shell.execute_reply.started":"2023-06-23T09:52:53.686875Z","shell.execute_reply":"2023-06-23T09:53:02.992012Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df4","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:53:22.716920Z","iopub.execute_input":"2023-06-23T09:53:22.718061Z","iopub.status.idle":"2023-06-23T09:53:22.731065Z","shell.execute_reply.started":"2023-06-23T09:53:22.718016Z","shell.execute_reply":"2023-06-23T09:53:22.729977Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_total_4 = pd.concat([df_cleaned_train_4['Input'], df4.Input], axis = 
0)\n# reuse the tokenizer that was already fitted on the emotion training corpus;\n# refitting it on the PHEME texts would change word_index and misalign the embedding weights the model was trained with","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:53:37.418684Z","iopub.execute_input":"2023-06-23T09:53:37.419827Z","iopub.status.idle":"2023-06-23T09:53:37.502521Z","shell.execute_reply.started":"2023-06-23T09:53:37.419782Z","shell.execute_reply":"2023-06-23T09:53:37.501317Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df_total_4","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:53:45.486225Z","iopub.execute_input":"2023-06-23T09:53:45.486619Z","iopub.status.idle":"2023-06-23T09:53:45.495576Z","shell.execute_reply.started":"2023-06-23T09:53:45.486584Z","shell.execute_reply":"2023-06-23T09:53:45.494415Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"X_train_4=tokenizer.texts_to_sequences(df_cleaned_train_4['Input']) # convert the cleaned PHEME texts into integer sequences with the training tokenizer \nX_train_pad_4=pad_sequences(X_train_4,maxlen=300,padding='post')","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:54:32.355411Z","iopub.execute_input":"2023-06-23T09:54:32.355823Z","iopub.status.idle":"2023-06-23T09:54:32.385246Z","shell.execute_reply.started":"2023-06-23T09:54:32.355792Z","shell.execute_reply":"2023-06-23T09:54:32.384229Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"print(X_train_pad_4.shape)","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:54:38.165077Z","iopub.execute_input":"2023-06-23T09:54:38.166158Z","iopub.status.idle":"2023-06-23T09:54:38.171740Z","shell.execute_reply.started":"2023-06-23T09:54:38.166104Z","shell.execute_reply":"2023-06-23T09:54:38.170269Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from tensorflow.keras.models import Sequential\nfrom tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional, Dropout\nfrom tensorflow.keras.preprocessing.sequence import pad_sequences\nimport numpy as np\n\n# Assuming you have your test data X_test_pad\n\nEMBEDDING_DIM = 100\nnum_words = 10000 # Example value, adjust according to your data\n\n# Define the model architecture\nmodel = Sequential()\nmodel.add(Embedding(input_dim=num_words, output_dim=EMBEDDING_DIM, input_length=X_test_pad.shape[1]))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(100, return_sequences=True)))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(200, return_sequences=True)))\nmodel.add(Dropout(0.2))\nmodel.add(Bidirectional(LSTM(100, return_sequences=False)))\nmodel.add(Dense(6, activation='softmax'))\n\n# Load the saved model weights\nmodel.load_weights('./best_model_weights.h5')\n\n# Compile the model (if needed)\nmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n\n# Predict emotions on the unlabelled PHEME dataset\npredictions = model.predict(X_train_pad_4)\n\n# Get the predicted sentiment label for each prediction\nsentiment_labels = np.argmax(predictions, axis=1)\n\nlabel_to_sentiment = {\n 0: \"joy\",\n 1: \"anger\",\n 2: \"love\",\n 3: \"sadness\",\n 4: \"fear\",\n 5: \"surprise\"\n}\n# Convert sentiment labels to actual sentiments (assuming you have a label-to-sentiment mapping)\nsentiments = [label_to_sentiment[label] for label in sentiment_labels]\n\n# Print the predicted 
sentiments\n#print(sentiments)\n","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:55:15.727761Z","iopub.execute_input":"2023-06-23T09:55:15.728612Z","iopub.status.idle":"2023-06-23T09:55:40.699931Z","shell.execute_reply.started":"2023-06-23T09:55:15.728575Z","shell.execute_reply":"2023-06-23T09:55:40.698944Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# attach the predicted emotion to each PHEME text\ndf4['Predicted_Label'] = sentiments","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:56:16.246572Z","iopub.execute_input":"2023-06-23T09:56:16.246968Z","iopub.status.idle":"2023-06-23T09:56:16.252689Z","shell.execute_reply.started":"2023-06-23T09:56:16.246939Z","shell.execute_reply":"2023-06-23T09:56:16.251540Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df4","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:56:23.437503Z","iopub.execute_input":"2023-06-23T09:56:23.438400Z","iopub.status.idle":"2023-06-23T09:56:23.453434Z","shell.execute_reply.started":"2023-06-23T09:56:23.438363Z","shell.execute_reply":"2023-06-23T09:56:23.452674Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df4.Predicted_Label.value_counts()","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:56:33.707546Z","iopub.execute_input":"2023-06-23T09:56:33.707966Z","iopub.status.idle":"2023-06-23T09:56:33.717179Z","shell.execute_reply.started":"2023-06-23T09:56:33.707933Z","shell.execute_reply":"2023-06-23T09:56:33.716104Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# save the PHEME texts together with their predicted emotions as notebook output\ndf4.to_csv('emotion.csv')","metadata":{"execution":{"iopub.status.busy":"2023-06-23T09:56:41.544165Z","iopub.execute_input":"2023-06-23T09:56:41.544596Z","iopub.status.idle":"2023-06-23T09:56:41.564784Z","shell.execute_reply.started":"2023-06-23T09:56:41.544565Z","shell.execute_reply":"2023-06-23T09:56:41.563671Z"},"trusted":true},"execution_count":null,"outputs":[]},
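{"cell_type":"markdown","source":"Optional helper (not part of the original pipeline): the commented-out `predict` function earlier relied on `model.predict_classes`, which has been removed from recent Keras releases. Below is a small sketch of the same idea using `model.predict` plus `np.argmax`; it assumes `tokenizer`, `model` and `label_to_sentiment` from the cells above are still in memory, and the example sentence is arbitrary.","metadata":{}},{"cell_type":"code","source":"def predict_emotion(sentence):\n    # tokenize and pad a single sentence exactly like the training data\n    seq = tokenizer.texts_to_sequences([sentence])\n    padded = pad_sequences(seq, maxlen=300, padding='post')\n    # predict_classes is gone in recent Keras, so use predict + argmax instead\n    label = int(np.argmax(model.predict(padded), axis=1)[0])\n    return label_to_sentiment[label]\n\n# example usage on an arbitrary sentence\nprint(predict_emotion('i am feeling really happy today'))","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"","metadata":{},"execution_count":null,"outputs":[]}]}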