{"cells":[{"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","collapsed":true},"cell_type":"markdown","source":"# Import the necessary libraries"},{"metadata":{"trusted":true,"_uuid":"d6fb32fd69316596e236eab5fb8cf77c848508c3"},"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.preprocessing import LabelEncoder\nfrom keras.models import Model\nfrom keras.models import Sequential\nfrom keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, GRU, Bidirectional\nfrom keras.optimizers import RMSprop\nfrom keras.preprocessing.text import Tokenizer\nfrom keras.preprocessing import sequence\nfrom keras.utils import to_categorical\nfrom keras.callbacks import EarlyStopping\n%matplotlib inline\nfrom keras.models import Sequential\nfrom keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D, Conv1D ,Activation\nfrom keras.layers import Embedding, Conv1D, GlobalMaxPooling1D\nfrom keras.layers import LeakyReLU\nimport time\nfrom keras import metrics\nimport warnings\nwarnings.filterwarnings(\"ignore\")\nfrom keras import optimizers\nsgd=optimizers.SGD()\nrmsprop=optimizers.RMSprop()\nadagrad=optimizers.Adagrad()\nadadelta=optimizers.Adadelta()\nadam=optimizers.Adam()\nadamax=optimizers.Adamax()\nnadam=optimizers.Nadam()\n","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"**Importing Evaluation Measures Libraries**"},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.metrics import accuracy_score\nfrom sklearn.metrics import precision_score\nfrom sklearn.metrics import recall_score\nfrom sklearn.metrics import f1_score\nfrom sklearn.metrics import cohen_kappa_score\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import 
confusion_matrix","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"f674695f1742479cefdeec0e81ab469f7b6ec90f"},"cell_type":"markdown","source":"### Load the data into Pandas dataframe"},{"metadata":{"trusted":true},"cell_type":"code","source":"dataset = pd.read_csv('../input/yelp-labelled-dataset/Labelled Yelp Dataset.csv', nrows=20000)\ndataset = dataset.rename(columns={\"Product_id\": \"Product_ID\", \"User_id\": \"Customer_ID\", \"Review\": \"Reviews\"}, errors=\"raise\")\n# Re-encode the labels: 1 becomes 0, then -1 becomes 1.\ndataset['Label']=dataset['Label'].replace(1,0)\ndataset['Label']=dataset['Label'].replace(-1,1)\n\n# #Selecting Subset of Original Dataset\n# NegitiveDataset= temp_df.loc[(temp_df['Label'] ==1)]#2054\n# positiveDataset = temp_df.loc[(temp_df['Label'] ==0)]\n# NegitiveDataset = NegitiveDataset.head(2054)\n# positiveDataset =positiveDataset.head(17946)\n# dataset = pd.concat([NegitiveDataset,positiveDataset])","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dataset['Label'].value_counts()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# dataset = dataset.sample(frac=1).reset_index(drop=True)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df = dataset.copy()","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"3c7060084470000f39a2dcc15b656586dcd6e9fd"},"cell_type":"markdown","source":"Understand the distribution better."},{"metadata":{"trusted":true,"_uuid":"a12002f521dd8eaeb0f69a932cbf23815ffd09d7"},"cell_type":"code","source":"# Pass the column as x= explicitly; seaborn 0.12+ treats the first positional argument as `data`.\nsns.countplot(x=df.Label)\nplt.xlabel('Label')\nplt.title('Number of Spam and Non-Spam Reviews')","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"353a8191f86c3a22843a729b5d4a5acefbf94be8"},"cell_type":"markdown","source":"* Create input and output vectors.\n* Process the labels."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Treat empty review strings as missing, then drop those rows from both frames so that\n# `df` (source of the labels) and `dataset` (source of the features) stay row-aligned.\n# The column is reassigned because an in-place replace on a selected column may act on a copy.\ndf['Reviews'] = df['Reviews'].replace('', 
np.nan)\ndf.dropna(subset=['Reviews'], inplace=True)\ndataset['Reviews'] = dataset['Reviews'].replace('', np.nan)\ndataset.dropna(subset=['Reviews'], inplace=True)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"a1a345c1683e2fcc7173ecae867a5da87f2dde24"},"cell_type":"code","source":"X = df.Reviews\nY = df.Label","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"150e244a39b814d8a41bbe0e419bc5f28e457dd6"},"cell_type":"markdown","source":"Compute the average rating per user and per product."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Mean rating given by each user and mean rating received by each product, keyed by ID.\n# A single groupby pass replaces re-filtering the whole frame once per row.\nuser_rating_avg = dataset.groupby('Customer_ID')['Rating'].mean().to_dict()\nprod_rating_avg = dataset.groupby('Product_ID')['Rating'].mean().to_dict()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.feature_extraction.text import TfidfVectorizer\nvect = TfidfVectorizer(min_df=1, 
stop_words=\"english\")\n\ntfidf = vect.fit_transform(dataset['Reviews'])","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from nltk.tokenize import sent_tokenize\n\n#------- Counting words in reviews ---------\n# Number of whitespace-separated tokens in each review.\ndataset['words_counts'] = dataset['Reviews'].astype(str).str.split().str.len()\n\n# ---------------- Share of sentences containing '!' -----------------------\ndef exclamation_ratio(review):\n    \"\"\"Return the fraction of sentences in `review` that contain '!'.\n\n    Returns 0.0 when the tokenizer finds no sentence (for example a\n    whitespace-only review) instead of dividing by zero.\n    \"\"\"\n    sentences = sent_tokenize(review)\n    if not sentences:\n        return 0.0\n    return sum('!' in sent for sent in sentences) / len(sentences)\n\ndataset['pp1'] = dataset['Reviews'].map(exclamation_ratio)\n\n# --------------- Content Similarity ------------------------\n# Mean TF-IDF dot-product similarity of each review against every review (itself included).\n# Row i of (tfidf * tfidf.T) sums to tfidf[i] . (sum of all rows), so the n x n similarity\n# matrix never has to be built; densifying it for 20000 reviews would need about 3.2 GB.\n# `tfidf` is the matrix fitted in the previous cell.\ntfidf_column_sums = np.asarray(tfidf.sum(axis=0)).ravel()\ndataset['content_similarity'] = tfidf.dot(tfidf_column_sums) / tfidf.shape[0]\n\n# ---------------- extremity of rating -----------------------\n# 1 when the rating is above 3, otherwise 0.\ndataset['ext_score'] = (dataset['Rating'] > 3).astype(int)\n\n# --------------- Is Review Singleton ------------------------\n# 1 when the reviewer has no other review in the dataset, otherwise 0.\nreviews_per_user = dataset['Customer_ID'].map(dataset['Customer_ID'].value_counts())\ndataset['IRS_score'] = (~(reviews_per_user > 1)).astype(int)\n\n# --------------- Average Rating Deviation ------------------------\n# Reviewer's mean rating minus the product's mean rating.\ndataset['ARD_Score'] = dataset['Customer_ID'].map(user_rating_avg) - dataset['Product_ID'].map(prod_rating_avg)\n\n# --------------- Rating Deviation ------------------------\n# Signed difference between this rating and the product's mean rating.\ndataset['RD_Score'] = dataset['Rating'] - dataset['Product_ID'].map(prod_rating_avg)\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dataset.head(2)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"split_Data = dataset.copy()\nsplit_Data.drop(['Label'],axis=1,inplace=True)\nsplit_Data['Reviews'].replace('', np.nan, 
inplace=True)\nsplit_Data.dropna(subset=['Reviews'], inplace=True)\n\n# X_train,X_test,Y_train,Y_test = train_test_split(split_Data,Y,test_size=0.15)","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"c5378d55c271e01480c1ac07f94ff99a80f900d6"},"cell_type":"markdown","source":"### Process the data\n* Tokenize the data and convert the text to sequences.\n* Add padding to ensure that all the sequences have the same shape.\n* There are many ways of taking the *max_len* and here an arbitrary length of 231 is chosen."},{"metadata":{"trusted":true},"cell_type":"code","source":"type(split_Data['Reviews'])","execution_count":null,"outputs":[]},{"metadata":{"trusted":true,"_uuid":"bdca14f2b8cd7bd7cb5ee66fd40ea522217c03c6"},"cell_type":"code","source":"max_len = 231\nmax_words = 1000\ntok = Tokenizer(num_words=max_words)\ntok.fit_on_texts(split_Data['Reviews'])\nsequences = tok.texts_to_sequences(split_Data['Reviews'])\nsequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Linguistic Features"},{"metadata":{"trusted":true},"cell_type":"code","source":"sequences_matrix","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"split_Data.head(2)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"max_words = 1000\n\n# Append the absolute value of the seven hand-crafted features after the padded token ids\n# of every review, in this fixed column order.\nlinguistic_columns = ['words_counts', 'pp1', 
'content_similarity', 'ext_score',\n                      'IRS_score', 'ARD_Score', 'RD_Score']\nlinguistic_features = split_Data[linguistic_columns].abs().to_numpy()\nTest_features_with_linguestics = np.hstack([sequences_matrix, linguistic_features])\n\n# Model input width: padded tokens plus appended features (231 + 7 = 238). It is derived from\n# the matrix so it cannot drift when the padding length or the feature list changes.\nmax_len = Test_features_with_linguestics.shape[1]\n\n# NOTE(review): the models below feed this matrix to an Embedding layer, which consumes its\n# inputs as integer indices, so the fractional features are presumably truncated - confirm intent.\nTest_features_with_linguestics[0]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dataset.head(2)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"batch=200","execution_count":null,"outputs":[]},{"metadata":{"_uuid":"ad8706caa7a447fb49b44919fd109129e4082a93"},"cell_type":"markdown","source":"# LSTM"},{"metadata":{"trusted":true},"cell_type":"code","source":"def RNN():\n    \"\"\"Build the LSTM classifier: Embedding -> LSTM(50) -> ReLU -> Dropout -> Dense(1) -> sigmoid.\"\"\"\n    model = Sequential()\n    model.add(Embedding(2000,50,input_length=max_len))\n    model.add(LSTM(50))\n    model.add(Activation('relu'))\n    model.add(Dropout(0.3))\n    model.add(Dense(1,name='out_layer'))\n    # binary_crossentropy expects a probability, so the output unit uses a sigmoid;\n    # a ReLU output is unbounded above and can be exactly 0.\n    model.add(Activation('sigmoid'))\n    return model\n\nmodel = RNN()\n# Use a fresh optimizer instance so that no optimizer state is shared with another model.\nmodel.compile(loss='binary_crossentropy',optimizer=optimizers.Adam(),metrics=['accuracy'])\nmodel.fit(Test_features_with_linguestics,Y,batch_size=batch,epochs=100,\n          validation_split=0.2)\nprint('__________________________________________________________\\n\\n')\n\n\n# NOTE: the scores below are computed over every row, including the rows the model was\n# fitted on, so they are not an estimate of performance on unseen reviews.\n# Threshold the sigmoid output at 0.5 to obtain class labels.\nY_predicted_classes = (model.predict(Test_features_with_linguestics, verbose=0) > 0.5).astype('int32')\nY_predicted_classes = Y_predicted_classes[:, 0]\n# accuracy: (tp + tn) / (p + n)\naccuracy = accuracy_score(Y, Y_predicted_classes)\nprint('Accuracy: %f' % accuracy)\n# precision tp / (tp + fp)\nprecision = precision_score(Y, Y_predicted_classes)\nprint('Precision: %f' % precision)\n# recall: tp / (tp + fn)\nrecall = recall_score(Y, Y_predicted_classes)\nprint('Recall: %f' % recall)\n# f1: 2 tp / (2 tp + fp + fn)\nf1 = f1_score(Y, Y_predicted_classes)\nprint('F1 score: %f' % f1)\n\nprint(\"Confusion Matrix\\n\", 
confusion_matrix(Y, Y_predicted_classes))","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# GRU"},{"metadata":{"trusted":true},"cell_type":"code","source":"def RNN1():\n    \"\"\"Build the GRU classifier: Embedding -> GRU(100) -> Dropout -> Dense(1) -> sigmoid.\"\"\"\n    model = Sequential()\n    model.add(Embedding(2000,50,input_length=max_len))\n    model.add(GRU(100, return_sequences=False))\n    model.add(Dropout(0.2))\n    model.add(Dense(1))\n    model.add(Activation('sigmoid'))\n    return model\n\n\nmodel1 = RNN1()\n# model1.summary()\n# Use a fresh optimizer instance so that no optimizer state is shared with another model.\nmodel1.compile(loss='binary_crossentropy',optimizer=optimizers.Adam(),metrics=['accuracy'])\nmodel1.fit(Test_features_with_linguestics, Y, 
batch_size=batch,epochs=100,\n          validation_split=0.2)\nprint('__________________________________________________________\\n\\n')\n\n# NOTE: the scores below are computed over every row, including the rows the model was\n# fitted on, so they are not an estimate of performance on unseen reviews.\n# Threshold the sigmoid output at 0.5 to obtain class labels.\nY_predicted_classes = (model1.predict(Test_features_with_linguestics, verbose=0) > 0.5).astype('int32')\nY_predicted_classes = Y_predicted_classes[:, 0]\n# accuracy: (tp + tn) / (p + n)\naccuracy = accuracy_score(Y, Y_predicted_classes)\nprint('Accuracy: %f' % accuracy)\n# precision tp / (tp + fp)\nprecision = precision_score(Y, Y_predicted_classes)\nprint('Precision: %f' % precision)\n# recall: tp / (tp + fn)\nrecall = recall_score(Y, Y_predicted_classes)\nprint('Recall: %f' % recall)\n# f1: 2 tp / (2 tp + fp + fn)\nf1 = f1_score(Y, Y_predicted_classes)\nprint('F1 score: %f' % f1)\n\nprint(\"Confusion Matrix\\n\", confusion_matrix(Y, Y_predicted_classes))","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# CNN"},{"metadata":{"trusted":true},"cell_type":"code","source":"def RNN2(): # added filter\n    \"\"\"Build and compile the 1-D CNN classifier.\n\n    Embedding -> Conv1D(200, 4) -> GlobalMaxPooling1D -> Dropout -> Dense(1) -> sigmoid.\n    \"\"\"\n    model = Sequential()\n    # The input rows also carry the appended word-count feature, which can exceed the\n    # 1000-word tokenizer vocabulary, so the index range matches the 2000 used by the\n    # LSTM and GRU models.\n    model.add(Embedding(2000, 20,input_length=max_len))\n    model.add(Conv1D(200,4,padding='valid',activation='relu',strides=1))\n    model.add(GlobalMaxPooling1D())\n    model.add(Dropout(0.3))\n    model.add(Dense(1))\n    model.add(Activation('sigmoid'))\n    model.compile(loss='binary_crossentropy',optimizer='nadam',metrics=['acc'])\n    return model\n\nmodel2 = RNN2()\n# model2.summary()\nmodel2.fit(Test_features_with_linguestics,Y,batch_size=batch,epochs=100,\n          validation_split=0.2)\nprint('__________________________________________________________\\n\\n')\n\n\n# NOTE: the scores below are computed over every row, including the rows the model was\n# fitted on, so they are not an estimate of performance on unseen reviews.\n# Threshold the sigmoid output at 0.5 to obtain class labels.\nY_predicted_classes = (model2.predict(Test_features_with_linguestics, verbose=0) > 0.5).astype('int32')\nY_predicted_classes = Y_predicted_classes[:, 0]\n# accuracy: (tp + tn) / (p + n)\naccuracy = accuracy_score(Y, Y_predicted_classes)\nprint('Accuracy: %f' % accuracy)\n# precision tp / (tp + fp)\nprecision = precision_score(Y, Y_predicted_classes)\nprint('Precision: %f' % precision)\n# recall: tp / (tp + fn)\nrecall = recall_score(Y, Y_predicted_classes)\nprint('Recall: %f' % recall)\n# f1: 2 tp / (2 tp + fp + fn)\nf1 = f1_score(Y, Y_predicted_classes)\nprint('F1 score: %f' % f1)\n\nprint(\"Confusion Matrix\\n\", confusion_matrix(Y, Y_predicted_classes))","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Experimentation for BERT"},{"metadata":{"trusted":true},"cell_type":"code","source":"import torch\nimport torch.nn as nn\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import classification_report\nimport transformers\nfrom transformers import AutoModel, BertTokenizerFast\nimport warnings\nwarnings.filterwarnings(\"ignore\")\n# use the GPU when one is available, otherwise fall back to the CPU\ndevice = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"dataset = pd.read_csv(\"../input/yelp-dataset-final/Yelp_Dataset.csv\", nrows=20000)\n# Re-encode the labels: 1 becomes 0, then -1 becomes 1.\ndataset['Label']=dataset['Label'].replace(1,0)\ndataset['Label']=dataset['Label'].replace(-1,1)\ndataset.head()\ndf= dataset[['Label', 'Review']]\ndf = df.rename({'Label': 'label', 'Review': 'text'}, axis=1) # new method","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# split train dataset into train, validation and test sets\ntrain_text, temp_text, train_labels, temp_labels = 
train_test_split(df['text'], df['label'],\n                 random_state=2018,\n                 test_size=0.3,\n                 stratify=df['label'])\n\n\n# split the held-out 30% evenly into validation and test sets\nval_text, test_text, val_labels, test_labels = train_test_split(temp_text, temp_labels,\n                                                                random_state=2018,\n                                                                test_size=0.5,\n                                                                stratify=temp_labels)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# import BERT-base pretrained model\nbert = AutoModel.from_pretrained('bert-base-uncased')\n\n# Load the BERT tokenizer\ntokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# sample data\ntext = [\"this is a bert model tutorial\", \"we will fine-tune a bert model\"]\n\n# encode text\nsent_id = tokenizer.batch_encode_plus(text, padding=True)\n\n# output\nprint(sent_id)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# get length of all the messages in the train set\nseq_len = [len(i.split()) for i in train_text]\n\npd.Series(seq_len).hist(bins = 30)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Every review is padded or truncated to exactly this many tokens.\nmax_seq_len = 25\n\n# padding='max_length' is the current spelling of the deprecated pad_to_max_length=True.\n\n# tokenize and encode sequences in the training set\ntokens_train = tokenizer.batch_encode_plus(\n    train_text.tolist(),\n    max_length = max_seq_len,\n    padding='max_length',\n    truncation=True\n)\n\n# tokenize and encode sequences in the validation set\ntokens_val = tokenizer.batch_encode_plus(\n    val_text.tolist(),\n    max_length = max_seq_len,\n    padding='max_length',\n    truncation=True\n)\n\n# tokenize and encode sequences in the test set\ntokens_test = tokenizer.batch_encode_plus(\n    test_text.tolist(),\n    max_length = max_seq_len,\n    padding='max_length',\n    truncation=True\n)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"## convert lists to tensors\n\ntrain_seq = torch.tensor(tokens_train['input_ids'])\ntrain_mask = torch.tensor(tokens_train['attention_mask'])\ntrain_y = 
torch.tensor(train_labels.tolist())\n\nval_seq = torch.tensor(tokens_val['input_ids'])\nval_mask = torch.tensor(tokens_val['attention_mask'])\nval_y = torch.tensor(val_labels.tolist())\n\ntest_seq = torch.tensor(tokens_test['input_ids'])\ntest_mask = torch.tensor(tokens_test['attention_mask'])\ntest_y = torch.tensor(test_labels.tolist())","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler\n\n#define a batch size\nbatch_size = 32\n\n# wrap tensors\ntrain_data = TensorDataset(train_seq, train_mask, train_y)\n\n# sampler for sampling the data during training\ntrain_sampler = RandomSampler(train_data)\n\n# dataLoader for train set\ntrain_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)\n\n# wrap tensors\nval_data = TensorDataset(val_seq, val_mask, val_y)\n\n# sampler that walks the validation data in order\nval_sampler = SequentialSampler(val_data)\n\n# dataLoader for validation set\nval_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)\n","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# freeze all the parameters\nfor param in bert.parameters():\n    param.requires_grad = False","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"class BERT_Arch(nn.Module):\n    \"\"\"Classification head on top of a BERT encoder.\n\n    The encoder's pooled output goes through Linear(768, 512) -> Tanh -> Dropout(0.3)\n    -> Linear(512, 2) -> LogSoftmax, so forward() returns log-probabilities for two classes.\n    \"\"\"\n\n    def __init__(self, bert):\n\n        super(BERT_Arch, self).__init__()\n\n        self.bert = bert\n\n        # dropout layer\n        self.dropout = nn.Dropout(0.3)\n\n        # tanh activation function\n        self.tanh = nn.Tanh()\n\n        # dense layer 1\n        self.fc1 = nn.Linear(768,512)\n\n        # dense layer 2 (Output layer)\n        self.fc2 = nn.Linear(512,2)\n\n        # log-softmax activation function (paired with NLLLoss)\n        self.softmax = nn.LogSoftmax(dim=1)\n\n    #define the forward pass\n    def forward(self, sent_id, mask):\n\n        # Take element 1 (the pooled output) by index: this works whether the encoder\n        # returns a plain tuple or a ModelOutput, whereas tuple-unpacking a ModelOutput\n        # yields its keys rather than its tensors.\n        cls_hs = self.bert(sent_id, attention_mask=mask)[1]\n
\n        x = self.fc1(cls_hs)\n\n        x = self.tanh(x)\n\n        x = self.dropout(x)\n\n        # output layer\n        x = self.fc2(x)\n\n        # apply log-softmax activation\n        x = self.softmax(x)\n\n        return x\n\n# pass the pre-trained BERT to our define architecture\nmodel = BERT_Arch(bert)\n\n# push the model to the selected device\nmodel = model.to(device)\n\n\n# optimizer from hugging face transformers\nfrom transformers import AdamW\n\n# define the optimizer\noptimizer = AdamW(model.parameters(),\n                  lr = 1e-5) # learning rate\n\nfrom sklearn.utils.class_weight import compute_class_weight\n\n#compute the class weights\n# keyword arguments: newer scikit-learn releases make `classes` and `y` keyword-only\nclass_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_labels), y=train_labels)\n\nprint(\"Class Weights:\",class_weights)\n\n# converting list of class weights to a tensor\nweights= torch.tensor(class_weights,dtype=torch.float)\n\n# push to the selected device\nweights = weights.to(device)\n\n# define the loss function\ncross_entropy = nn.NLLLoss(weight=weights)\n\n# number of training epochs\nepochs = 10","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# function to train the model\ndef train():\n    \"\"\"Run one pass over train_dataloader, updating the model after every batch.\n\n    Returns (average loss per batch, log-probabilities for every training sample).\n    \"\"\"\n\n    model.train()\n\n    total_loss = 0\n\n    # empty list to save model predictions\n    total_preds=[]\n\n    # iterate over batches\n    for step,batch in enumerate(train_dataloader):\n\n        # progress update after every 50 batches.\n        if step % 50 == 0 and not step == 0:\n            print(' Batch {:>5,} of {:>5,}.'.format(step, len(train_dataloader)))\n\n        # push the batch to the selected device\n        batch = [r.to(device) for r in batch]\n\n        sent_id, mask, labels = batch\n\n        # clear previously calculated gradients\n        model.zero_grad()\n\n        # get model predictions for the current batch\n        preds = model(sent_id, mask)\n\n        # compute the loss between actual and predicted values\n        loss = cross_entropy(preds, labels)\n\n        # add on to the total loss\n        total_loss = total_loss + loss.item()\n\n        # backward pass to calculate the gradients\n        loss.backward()\n\n        # clip the gradients to 1.0. 
It helps in preventing the exploding gradient problem\n        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)\n\n        # update parameters\n        optimizer.step()\n\n        # detach the predictions from the graph and move them to the CPU as a numpy array\n        preds=preds.detach().cpu().numpy()\n\n        # append the model predictions\n        total_preds.append(preds)\n\n    # compute the training loss of the epoch\n    avg_loss = total_loss / len(train_dataloader)\n\n    # predictions are in the form of (no. of batches, size of batch, no. of classes).\n    # reshape the predictions in form of (number of samples, no. of classes)\n    total_preds = np.concatenate(total_preds, axis=0)\n\n    #returns the loss and predictions\n    return avg_loss, total_preds","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# function for evaluating the model\ndef evaluate():\n    \"\"\"Run the model over val_dataloader without gradient tracking.\n\n    Returns (average loss per batch, log-probabilities for every validation sample).\n    \"\"\"\n\n    print(\"\\nEvaluating...\")\n\n    # deactivate dropout layers\n    model.eval()\n\n    total_loss = 0\n\n    # empty list to save the model predictions\n    total_preds = []\n\n    # iterate over batches\n    for step,batch in enumerate(val_dataloader):\n\n        # Progress update every 50 batches. No elapsed time is reported: the notebook\n        # defines neither a format_time helper nor a t0 start time.\n        if step % 50 == 0 and not step == 0:\n            print(' Batch {:>5,} of {:>5,}.'.format(step, len(val_dataloader)))\n\n        # push the batch to the selected device\n        batch = [t.to(device) for t in batch]\n\n        sent_id, mask, labels = batch\n\n        # deactivate autograd\n        with torch.no_grad():\n\n            # model predictions\n            preds = model(sent_id, mask)\n\n            # compute the validation loss between actual and predicted values\n            loss = cross_entropy(preds,labels)\n\n            total_loss = total_loss + loss.item()\n\n            preds = preds.detach().cpu().numpy()\n\n            total_preds.append(preds)\n\n    # compute the validation loss of the epoch\n    avg_loss = total_loss / len(val_dataloader)\n\n    # reshape the predictions in form of (number of samples, no. 
of classes)\n    total_preds = np.concatenate(total_preds, axis=0)\n\n    return avg_loss, total_preds","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# set initial loss to infinite\nbest_valid_loss = float('inf')\n\n# empty lists to store training and validation loss of each epoch\ntrain_losses=[]\nvalid_losses=[]\n\n#for each epoch\nfor epoch in range(epochs):\n\n    print('\\n Epoch {:} / {:}'.format(epoch + 1, epochs))\n\n    #train model\n    train_loss, _ = train()\n\n    #evaluate model\n#     valid_loss, _ = evaluate()\n\n    #save the best model\n#     if valid_loss < best_valid_loss:\n#         best_valid_loss = valid_loss\n#         torch.save(model.state_dict(), 'saved_weights.pt')\n\n    # append training and validation loss\n    train_losses.append(train_loss)\n    #valid_losses.append(valid_loss)\n\n    print(f'\\nTraining Loss: {train_loss:.3f}')\n    # print(f'Validation Loss: {valid_loss:.3f}')","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# #load weights of best model\n# path = 'saved_weights.pt'\n# model.load_state_dict(torch.load(path))\n\n# get predictions for test data\n# train() leaves the model in training mode; switch to eval mode so dropout is\n# disabled while predicting.\nmodel.eval()\nwith torch.no_grad():\n    preds = model(test_seq.to(device), test_mask.to(device))\n    preds = preds.detach().cpu().numpy()\n\n# pick the class with the highest log-probability\npreds = np.argmax(preds, axis = 1)\n# print(classification_report(test_y, preds))","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"from sklearn.metrics import accuracy_score\nfrom sklearn.metrics import precision_score\nfrom sklearn.metrics import recall_score\nfrom sklearn.metrics import f1_score\nfrom sklearn.metrics import cohen_kappa_score\nfrom sklearn.metrics import roc_auc_score\nfrom sklearn.metrics import confusion_matrix\n\n\n# accuracy: (tp + tn) / (p + n)\naccuracy = accuracy_score(test_y, preds)\nprint('Accuracy: %f' % accuracy)\n# precision tp / (tp + fp)\nprecision = precision_score(test_y, preds)\nprint('Precision: %f' % precision)\n# recall: tp / (tp + fn)\nrecall = 
recall_score(test_y, preds)\nprint('Recall: %f' % recall)\n# f1: 2 tp / (2 tp + fp + fn)\nf1 = f1_score(test_y, preds)\nprint('F1 score: %f' % f1)\n\nprint(\"Confusion Matrix\\n\", confusion_matrix(test_y, preds))","execution_count":null,"outputs":[]}],"metadata":{"kernelspec":{"name":"python3","display_name":"Python 3","language":"python"},"language_info":{"name":"python","version":"3.6.6","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":4}