# Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.models import Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, GRU, Bidirectional
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
%matplotlib inline
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Dropout, Flatten, MaxPooling2D, Conv1D ,Activation
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D
from keras.layers import LeakyReLU
import time
from keras import metrics
import warnings
warnings.filterwarnings("ignore")
from keras import optimizers
# One instance of each Keras optimizer, all built with their default arguments.
# NOTE(review): of these, only `adam` is passed to a compile() call in the cells
# visible below, and it is passed to more than one model — confirm that sharing
# a single optimizer instance across models is intended.
sgd=optimizers.SGD()
rmsprop=optimizers.RMSprop()
adagrad=optimizers.Adagrad()
adadelta=optimizers.Adadelta()
adam=optimizers.Adam()
adamax=optimizers.Adamax()
nadam=optimizers.Nadam()


**Importing Evaluation Metric Libraries**

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

### Load the data into Pandas dataframe

In [None]:
# Read the labelled Daraz reviews, discard the `Unnamed: 0` and raw `Review`
# columns, and expose the label / standardized text under the short names
# (`Label`, `Reviews`) that the rest of the notebook uses.
dataset = (
    pd.read_csv('../input/daraz-labeled-standardized-dataset/Daraz_Labeled_Standardized_Dataset.csv')
    .drop(columns=['Unnamed: 0', 'Review'])
    .rename(columns={"New_Label": "Label", "Standardized_Review": "Reviews"}, errors="raise")
)

In [None]:
# Second, independent copy of the same CSV, prepared the same way: two columns
# dropped, two renamed (rename raises if either source column is absent).
_renamed_columns = {"New_Label": "Label", "Standardized_Review": "Reviews"}
df = pd.read_csv('../input/daraz-labeled-standardized-dataset/Daraz_Labeled_Standardized_Dataset.csv')
df = df.drop(columns=['Unnamed: 0', 'Review'])
df = df.rename(columns=_renamed_columns, errors="raise")

Understand the distribution better.

In [None]:
# Bar chart of how many reviews carry each label.
# The label vector is passed as `x=`: seaborn 0.11 deprecated positional data
# vectors, and from 0.12 on the first positional argument is `data`.
sns.countplot(x=df.Label)
plt.xlabel('Label')
plt.title('Number of Spam and Non-Spam Reviews')

* Create input and output vectors.
* Process the labels.

In [None]:
# Treat empty review strings as missing, then drop those rows from both frames.
# The column is reassigned instead of calling replace(..., inplace=True) on
# `frame['Reviews']`: an in-place call on a selected column is chained
# assignment, which does not update the parent frame under pandas Copy-on-Write.
df['Reviews'] = df['Reviews'].replace('', np.nan)
df.dropna(subset=['Reviews'], inplace=True)
dataset['Reviews'] = dataset['Reviews'].replace('', np.nan)
dataset.dropna(subset=['Reviews'], inplace=True)

In [None]:
# Number of rows per distinct `Label` value in `dataset`.
dataset['Label'].value_counts()

In [None]:
# Review text and target labels, both taken from `df`.
# NOTE(review): the feature matrix used for training further down is built from
# `dataset`, not `df`; `Y` lines up with it only if both frames hold the same
# rows in the same order — confirm.
X = df.Reviews
Y = df.Label

Compute the average rating per customer and per product (used later for the rating-deviation features).

In [None]:
# Mean rating per customer and per product, as plain dicts keyed by ID.
# groupby() computes every mean in one pass; the previous loops re-filtered the
# whole frame once per row, which is quadratic in the number of reviews.
# NOTE: groupby().mean() ignores missing ratings and leaves out missing IDs.
user_rating_avg = dataset.groupby('Customer_ID')['Rating'].mean().to_dict()
prod_rating_avg = dataset.groupby('Product_ID')['Rating'].mean().to_dict()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer (English stop words removed, no minimum document
# frequency beyond 1) on the review text.
# NOTE(review): the feature-engineering cell further down rebuilds `vect` and
# `tfidf` with the same arguments, so this cell looks redundant — confirm
# before removing it.
vect = TfidfVectorizer(min_df=1, stop_words="english") 

tfidf = vect.fit_transform(dataset['Reviews']) 

In [None]:
from nltk.tokenize import sent_tokenize

#------- Number of whitespace-separated words in each review ---------
# str() is applied first so that non-string cells are counted by their text form.
word_counts = [len(str(review).split()) for review in dataset['Reviews']]
dataset['words_counts'] = word_counts

# ---------------- Share of sentences containing '!'  -----------------------
# Each review is sentence-tokenized once (previously twice).  A review that
# yields no sentences scores 0.0 instead of raising ZeroDivisionError, and
# str() guards against non-string cells, as the word-count feature does.
pp1 = []

for review in dataset['Reviews']:
    sentences = sent_tokenize(str(review))
    exclaimed = sum('!' in sentence for sentence in sentences)
    pp1.append(exclaimed / len(sentences) if sentences else 0.0)
dataset['pp1'] = pp1

# --------------- Content Similarity ------------------------
# Mean TF-IDF dot-product similarity between each review and every review in
# the dataset (itself included).
#
# Row i of (tfidf @ tfidf.T) averaged over its n columns equals
# tfidf[i] . (column sums of tfidf) / n, so the dense n x n similarity matrix
# and the Python double loop over it are not needed.
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(min_df=1, stop_words="english")
tfidf = vect.fit_transform(dataset['Reviews'])
term_totals = np.asarray(tfidf.sum(axis=0)).ravel()          # one total per vocabulary term
content_similarity = tfidf.dot(term_totals) / tfidf.shape[0]  # one mean per review
dataset['content_similarity'] = content_similarity






# ---------------- extremity of rating  -----------------------
# 1 when the rating is above 3, otherwise 0.
# NOTE(review): only the high end is flagged here; confirm that low ratings
# are meant to score 0 for an "extremity" feature.
ext_score = [1 if rating > 3 else 0 for rating in dataset['Rating']]

dataset['ext_score'] = ext_score

# --------------- Is Review Singleton ------------------------
# 1 when the customer has at most one review in the dataset, 0 otherwise.
# Review counts are computed once with value_counts() instead of re-filtering
# the whole frame for every row (quadratic in the number of reviews).
reviews_per_customer = dataset['Customer_ID'].map(dataset['Customer_ID'].value_counts())
# `~(count > 1)` rather than `count <= 1`: a missing ID maps to a NaN count and
# must still score 1, as it did with the per-row filter.
IRS = (~(reviews_per_customer > 1)).astype(int).tolist()

dataset['IRS_score'] = IRS

# --------------- Average Rating Deviation ------------------------
# Per review: the customer's mean rating minus the product's mean rating,
# looked up in the `user_rating_avg` / `prod_rating_avg` dicts.
ARD_Score = [
    user_rating_avg[customer_id] - prod_rating_avg[product_id]
    for customer_id, product_id in zip(dataset['Customer_ID'], dataset['Product_ID'])
]
dataset['ARD_Score'] = ARD_Score


# --------------- Rating Deviation ------------------------
# Per review: this review's rating minus the product's mean rating.
# The value stored in the column is signed; no abs() is applied here.
RD_Score = [
    review_rating - prod_rating_avg[product_id]
    for review_rating, product_id in zip(dataset['Rating'], dataset['Product_ID'])
]
dataset['RD_Score'] = RD_Score





In [None]:
# Peek at the first two rows of `dataset`.
dataset.head(2)

In [None]:
# Feature frame: every column of `dataset` except the target.
split_Data = dataset.drop(columns=['Label'])
# Empty reviews become NaN and those rows are dropped.  The column is
# reassigned because replace(..., inplace=True) on `frame['Reviews']` is
# chained assignment and does not update the frame under pandas Copy-on-Write.
split_Data['Reviews'] = split_Data['Reviews'].replace('', np.nan)
split_Data = split_Data.dropna(subset=['Reviews'])

# X_train,X_test,Y_train,Y_test = train_test_split(split_Data,Y,test_size=0.15)

### Process the data
* Tokenize the data and convert the text to sequences.
* Add padding to ensure that all the sequences have the same shape.
* There are many ways of choosing *max_len*; the cell below uses a fixed length of 231.

In [None]:
# Show the Python type of the `Reviews` column object.
type(split_Data['Reviews'])

In [None]:
# Length that every tokenized review is padded or truncated to.
max_len = 231
# Vocabulary cap handed to the Tokenizer as `num_words`.
max_words = 1000
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(split_Data['Reviews'])
# Integer-encode each review, then pad to a common length of `max_len`.
sequences = tok.texts_to_sequences(split_Data['Reviews'])
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

Linguistic Features

In [None]:
# Display the padded token-id matrix produced by pad_sequences.
sequences_matrix

In [None]:
# Peek at the first two rows of the feature frame.
split_Data.head(2)

In [None]:
max_words = 1000

# Review-level features appended, in this order, after the padded token ids.
linguistic_columns = [
    'words_counts',
    'pp1',
    'content_similarity',
    'ext_score',
    'IRS_score',
    'ARD_Score',
    'RD_Score',
]

# One row per review: [token ids ..., |feature 1|, ..., |feature 7|].
# np.hstack replaces the manual counter and per-row append loop; the result is
# a float array, as before.
Test_features_with_linguestics = np.hstack([
    sequences_matrix,
    split_Data[linguistic_columns].abs().to_numpy(dtype=float),
])

# Input width for the models below, read from the matrix itself so it cannot
# drift from the padding length or the number of appended features
# (231 + 7 = 238 with the current settings).
max_len = Test_features_with_linguestics.shape[1]

# NOTE(review): the model cells below feed this whole matrix to an Embedding
# layer, so the appended feature values are consumed as vocabulary indices
# alongside the token ids — confirm this is intended.
Test_features_with_linguestics[0]

In [None]:
# Peek at the first two rows of `dataset`, now including the engineered columns.
dataset.head(2)

In [None]:
# Mini-batch size passed as `batch_size` to every Keras fit() call below.
batch=40

# LSTM

In [None]:
def RNN():
    """Build the (uncompiled) LSTM classifier.

    Reads the notebook-level ``max_len`` for the input sequence length.

    Returns:
        A Keras ``Sequential`` model:
        Embedding(2000, 50) -> LSTM(50) -> ReLU -> Dropout(0.3) -> Dense(1) -> sigmoid.
    """
    model = Sequential()
    model.add(Embedding(2000,50,input_length=max_len))
    model.add(LSTM(50))
    model.add(Activation('relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1,name='out_layer'))
    # Sigmoid (previously ReLU) on the output unit: the model is trained with
    # binary_crossentropy, which expects a probability in [0, 1].  This also
    # matches the GRU and CNN builders in this notebook.
    model.add(Activation('sigmoid'))
    return model

model = RNN()
# 'adam' by name gives this model its own default-configured optimizer rather
# than sharing the notebook-level `adam` instance with another model.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.fit(Test_features_with_linguestics,Y,batch_size=batch,epochs=100,
          validation_split=0.2)
print('__________________________________________________________\n\n')


# NOTE: the scores below are computed on the full matrix that was passed to
# fit(), so they include the rows the model was trained on.
# Sequential.predict_classes() is gone from current Keras; thresholding
# predict() at 0.5 is its equivalent for a single-unit output.
Y_predicted_classes = (model.predict(Test_features_with_linguestics, verbose=0) > 0.5).astype("int32")
Y_predicted_classes = Y_predicted_classes[:, 0]
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y, Y_predicted_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(Y, Y_predicted_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(Y, Y_predicted_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y, Y_predicted_classes)
print('F1 score: %f' % f1)

print("Confusion Matrix\n", confusion_matrix(Y, Y_predicted_classes))

# GRU

In [None]:
def RNN1():
    """Build the (uncompiled) GRU classifier.

    Reads the notebook-level ``max_len`` for the input sequence length.  The
    previously created, never-used ``Input`` tensor has been removed.

    Returns:
        A Keras ``Sequential`` model:
        Embedding(2000, 50) -> GRU(100) -> Dropout(0.2) -> Dense(1) -> sigmoid.
    """
    return Sequential([
        Embedding(2000,50,input_length=max_len),
        # return_sequences=False: only the final GRU state is passed on
        GRU(100, return_sequences=False),
        Dropout(0.2),
        Dense(1),
        Activation('sigmoid'),
    ])


model1 = RNN1()
# 'adam' by name gives this model its own default-configured optimizer rather
# than sharing the notebook-level `adam` instance with another model.
model1.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model1.fit(Test_features_with_linguestics,Y,batch_size=batch,epochs=100,
          validation_split=0.2)
print('__________________________________________________________\n\n')

# NOTE: the scores below are computed on the full matrix that was passed to
# fit(), so they include the rows the model was trained on.
# Sequential.predict_classes() is gone from current Keras; thresholding
# predict() at 0.5 is its equivalent for a single sigmoid output.
Y_predicted_classes = (model1.predict(Test_features_with_linguestics, verbose=0) > 0.5).astype("int32")
Y_predicted_classes = Y_predicted_classes[:, 0]
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y, Y_predicted_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(Y, Y_predicted_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(Y, Y_predicted_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y, Y_predicted_classes)
print('F1 score: %f' % f1)

print("Confusion Matrix\n", confusion_matrix(Y, Y_predicted_classes))

# CNN

In [None]:
def RNN2():    # added filter
    """Build and compile the 1-D convolutional classifier.

    Despite the name, this model has no recurrent layer.  Reads the
    notebook-level ``max_len`` for the input sequence length.  The previously
    created, never-used ``Input`` tensor has been removed.

    Returns:
        A compiled Keras ``Sequential`` model:
        Embedding(1000, 20) -> Conv1D(200, 4) -> GlobalMaxPooling1D ->
        Dropout(0.3) -> Dense(1) -> sigmoid, with binary cross-entropy loss
        and the 'nadam' optimizer.
    """
    model = Sequential([
        # NOTE(review): input_dim is 1000 here but 2000 in the LSTM/GRU
        # builders; every input value must be below it — confirm.
        Embedding(1000, 20,input_length=max_len),
        Conv1D(200,4,padding='valid',activation='relu',strides=1),
        GlobalMaxPooling1D(),
        Dropout(0.3),
        Dense(1),
        Activation('sigmoid'),
    ])
    model.compile(loss='binary_crossentropy',optimizer='nadam',metrics=['acc'])
    return model

# RNN2() returns an already-compiled model.
model2 = RNN2()
model2.fit(Test_features_with_linguestics,Y,batch_size=batch,epochs=100,
          validation_split=0.2)
print('__________________________________________________________\n\n')


# NOTE: the scores below are computed on the full matrix that was passed to
# fit(), so they include the rows the model was trained on.
# Sequential.predict_classes() is gone from current Keras; thresholding
# predict() at 0.5 is its equivalent for a single sigmoid output.
Y_predicted_classes = (model2.predict(Test_features_with_linguestics, verbose=0) > 0.5).astype("int32")
Y_predicted_classes = Y_predicted_classes[:, 0]
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y, Y_predicted_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(Y, Y_predicted_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(Y, Y_predicted_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y, Y_predicted_classes)
print('F1 score: %f' % f1)

print("Confusion Matrix\n", confusion_matrix(Y, Y_predicted_classes))

# Experimentation for BERT

In [None]:
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
import warnings
warnings.filterwarnings("ignore")
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# later `.to(device)` calls also work on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# NOTE: this rebinds `dataset` and `df`, which earlier cells used for a
# different CSV.
dataset = pd.read_csv("../input/daraz-dataset/Dataset.csv")
# Keep only the label and the review text, under the names the BERT cells use.
# Rows missing either value are dropped: the split below stratifies on the
# label and the tokenizer is handed the text values as a list.
df = (
    dataset[['Label', 'Reviews']]
    .rename(columns={'Label': 'label', 'Reviews': 'text'})
    .dropna(subset=['label', 'text'])
)

In [None]:
# Two stratified splits with a fixed seed:
#   1) hold out 30% of the rows as a temporary set (70% stays for training);
#   2) cut the temporary set in half -> 15% validation, 15% test.
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    stratify=df['label'],
    random_state=2018,
)

val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    stratify=temp_labels,
    random_state=2018,
)

In [None]:
# Load the pretrained 'bert-base-uncased' encoder
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the fast tokenizer for the same checkpoint name
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Every review is padded or truncated to this many tokens.
BERT_MAX_LENGTH = 25


def _encode_texts(texts):
    """Tokenize a pandas Series of review strings into fixed-length BERT inputs.

    Args:
        texts: Series whose ``tolist()`` yields the review strings.

    Returns:
        The tokenizer's batch encoding; the cells below read its
        ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer.batch_encode_plus(
        texts.tolist(),
        max_length=BERT_MAX_LENGTH,
        # padding='max_length' replaces the deprecated pad_to_max_length=True
        padding='max_length',
        truncation=True,
    )


# tokenize and encode the training, validation and test sets identically
tokens_train = _encode_texts(train_text)
tokens_val = _encode_texts(val_text)
tokens_test = _encode_texts(test_text)

In [None]:
## Convert the tokenizer output and the labels of each split to tensors.
## `*_seq` holds token ids, `*_mask` the attention mask, `*_y` the labels.
_encoding_fields = ('input_ids', 'attention_mask')

train_seq, train_mask = (torch.tensor(tokens_train[field]) for field in _encoding_fields)
train_y = torch.tensor(train_labels.tolist())

val_seq, val_mask = (torch.tensor(tokens_val[field]) for field in _encoding_fields)
val_y = torch.tensor(val_labels.tolist())

test_seq, test_mask = (torch.tensor(tokens_test[field]) for field in _encoding_fields)
test_y = torch.tensor(test_labels.tolist())

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Mini-batch size shared by the training and validation loaders.
batch_size = 32

# Training set: tensors wrapped as (token ids, attention mask, label) triples.
train_data = TensorDataset(train_seq, train_mask, train_y)

# Training batches are drawn in random order.
train_sampler = RandomSampler(train_data)

# dataLoader for train set
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation set, wrapped the same way.
val_data = TensorDataset(val_seq, val_mask, val_y)

# Validation batches are read sequentially, in their stored order.
val_sampler = SequentialSampler(val_data)

# dataLoader for validation set
val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)


In [None]:
# Freeze every parameter of the BERT encoder: with requires_grad off, backward
# passes compute no gradients for them.
for param in bert.parameters():
    param.requires_grad = False

In [None]:
class BERT_Arch(nn.Module):
    """Two-class classification head on top of a BERT encoder.

    The encoder's pooled output is passed through
    Linear -> Tanh -> Dropout(0.5) -> Linear(2) -> LogSoftmax, so ``forward``
    returns per-class log-probabilities.
    """

    def __init__(self, bert):
        """Create the head around an existing encoder.

        Args:
            bert: encoder module, called as ``bert(sent_id, attention_mask=mask)``.
                The second element of its output is used as the sentence
                representation.
        """
        super(BERT_Arch, self).__init__()

        self.bert = bert

        # Width of the encoder output: read from the encoder's config when it
        # has one, 768 otherwise.
        hidden_size = getattr(getattr(bert, 'config', None), 'hidden_size', 768)

        # dropout layer
        self.dropout = nn.Dropout(0.5)

        # tanh activation function
        self.tanh = nn.Tanh()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_size, 512)

        # dense layer 2 (Output layer)
        self.fc2 = nn.Linear(512, 2)

        # log-softmax over the class dimension
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return log-probabilities for a batch.

        Args:
            sent_id: token-id tensor handed to the encoder.
            mask: attention-mask tensor handed to the encoder.

        Returns:
            Tensor with one row per input and two columns (log-probabilities).
        """
        # Index the encoder output instead of tuple-unpacking it: indexing
        # works for a plain tuple and for a dict-like model output, whereas
        # unpacking a dict-like output yields its keys rather than tensors.
        outputs = self.bert(sent_id, attention_mask=mask)
        cls_hs = outputs[1]

        x = self.fc1(cls_hs)
        x = self.tanh(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # apply log-softmax activation
        return self.softmax(x)

# Wrap the pre-trained BERT in the classification architecture and move the
# result to `device`.
# NOTE: this rebinds `model`, which earlier cells used for a Keras model.
model = BERT_Arch(bert)
model = model.to(device)

# AdamW from torch.optim replaces the deprecated transformers.AdamW.  eps and
# weight_decay are set explicitly to the values the transformers
# implementation used by default, so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 1e-5,          # learning rate
                              eps = 1e-6,
                              weight_decay = 0.0)

from sklearn.utils.class_weight import compute_class_weight

# 'balanced' class weights for the training labels.  The arguments are passed
# by keyword because current scikit-learn no longer accepts them positionally.
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_labels),
                                     y=train_labels)

print("Class Weights:",class_weights)

# class weights as a float tensor on `device`
weights = torch.tensor(class_weights,dtype=torch.float)
weights = weights.to(device)

# Weighted negative log-likelihood: pairs with the LogSoftmax output of the model.
cross_entropy  = nn.NLLLoss(weight=weights)

# number of training epochs
epochs = 10


In [None]:
# function to train the model
def train():
    """Run one training epoch over ``train_dataloader``.

    Uses the notebook-level ``model``, ``optimizer``, ``cross_entropy`` and
    ``device``.

    Returns:
        (avg_loss, total_preds): the mean batch loss for the epoch and the
        model outputs for every batch concatenated along axis 0 into one
        numpy array.
    """
    model.train()

    n_batches = len(train_dataloader)
    running_loss = 0
    batch_outputs = []

    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        # move the batch to `device` and unpack it
        sent_id, mask, labels = [tensor.to(device) for tensor in batch]

        # clear gradients left over from the previous step
        model.zero_grad()

        # forward pass and loss
        preds = model(sent_id, mask)
        loss = cross_entropy(preds, labels)
        running_loss += loss.item()

        # backward pass; gradient norm clipped to 1.0 before the update
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # keep this batch's outputs as a detached CPU numpy array
        batch_outputs.append(preds.detach().cpu().numpy())

    # mean batch loss for the epoch
    avg_loss = running_loss / n_batches

    # (batches, batch size, classes) -> (samples, classes)
    total_preds = np.concatenate(batch_outputs, axis=0)

    return avg_loss, total_preds

In [None]:
# function for evaluating the model
def evaluate():
    """Score the model on ``val_dataloader`` without updating any weights.

    Uses the notebook-level ``model``, ``cross_entropy`` and ``device``.

    Returns:
        (avg_loss, total_preds): the mean batch loss over the validation set
        and the model outputs for every batch concatenated along axis 0 into
        one numpy array.
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss = 0

    # model outputs, one array per batch
    total_preds = []

    for step, batch in enumerate(val_dataloader):

        # Progress update every 50 batches.  The elapsed-time computation that
        # used to sit here was removed: it called format_time() and read t0,
        # neither of which is defined in the visible notebook, so it raised
        # NameError at batch 50; its result was never used.
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # move the batch to `device` and unpack it
        batch = [t.to(device) for t in batch]
        sent_id, mask, labels = batch

        # no autograd bookkeeping during evaluation
        with torch.no_grad():

            preds = model(sent_id, mask)

            # validation loss for this batch
            loss = cross_entropy(preds, labels)
            total_loss = total_loss + loss.item()

            total_preds.append(preds.detach().cpu().numpy())

    # mean batch loss over the validation set
    avg_loss = total_loss / len(val_dataloader)

    # (batches, batch size, classes) -> (samples, classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
# Lowest validation loss seen so far; starts at infinity so the first epoch
# always produces a checkpoint.
best_valid_loss = float('inf')

# Loss history, one entry per epoch.
train_losses, valid_losses = [], []

for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    # one pass over the training data, then one over the validation data;
    # the returned predictions are not needed here
    train_loss = train()[0]
    valid_loss = evaluate()[0]

    # checkpoint the weights whenever the validation loss improves
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')

In [None]:
# Load the weights of the best checkpoint; map_location places the tensors on
# `device` even if the checkpoint was written on a different device.
path = 'saved_weights.pt'
model.load_state_dict(torch.load(path, map_location=device))

# Switch to inference mode explicitly so dropout is disabled no matter which
# cell ran last.
model.eval()

# get predictions for test data
with torch.no_grad():
  preds = model(test_seq.to(device), test_mask.to(device))
  preds = preds.detach().cpu().numpy()

# class with the highest log-probability for each test row
preds = np.argmax(preds, axis = 1)
# print(classification_report(test_y, preds))

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


# Test-set scores for the BERT classifier (`preds` vs. the held-out `test_y`).
accuracy = accuracy_score(test_y, preds)      # (tp + tn) / (p + n)
precision = precision_score(test_y, preds)    # tp / (tp + fp)
recall = recall_score(test_y, preds)          # tp / (tp + fn)
f1 = f1_score(test_y, preds)                  # 2 tp / (2 tp + fp + fn)

print(f'Accuracy: {accuracy:f}')
print(f'Precision: {precision:f}')
print(f'Recall: {recall:f}')
print(f'F1 score: {f1:f}')

print("Confusion Matrix\n", confusion_matrix(test_y, preds))