In [1]:
#1. importing libraries
import pandas as pd
import re

In [2]:
#2. Loading Image Text from corresponding training and test files
path11="C://Users//FurqanSaddozai//scrapping_tweets_16_july_2022//Uni-multi-urdu-HS//"
file_path_1 = path11+"MMHS11K_train.xlsx"
file_path_2 = path11+"MMHS11K_test.xlsx"

# Load specific columns into a DataFrame
columns_to_load = ["Image_Text", "Label"]
df_train = pd.read_excel(file_path_1, usecols=columns_to_load)
df_test = pd.read_excel(file_path_2, usecols=columns_to_load)

# Display the DataFrame
print('Training data')
print(df_train[:3])
print('................................')
print('Test data')
print(df_test[:3])

Training data
  Image_Text    Label
0        NIL  No_Hate
1        NIL  No_Hate
2        NIL  No_Hate
................................
Test data
                                          Image_Text Label
0                                                NIL  Hate
1  8ا 8 8\n:۰\n\n2 9 10:51\n358 ۵۱ ۷۸۷۰۱ لھگ\nٹیر...  Hate
2                                                NIL  Hate


In [3]:
#3. printing number of samples in training and test files
print(len(df_train))
print(len(df_test))

8800
2200


In [4]:
# 4. Text Cleansing
#Removing all digits, non-Urdu characters or words from Image text
df_train['Image_Text']=df_train['Image_Text'].apply((lambda x: re.sub(r'[^\u0600-\u06FF\s]+',' ',x)))
df_test['Image_Text']=df_test['Image_Text'].apply((lambda x: re.sub(r'[^\u0600-\u06FF\s]+',' ',x)))
# Removing all Urdu digits  from Image text
df_train['Image_Text']=df_train['Image_Text'].apply((lambda x: re.sub('[۰-۹]',' ',x)))
df_test['Image_Text']=df_test['Image_Text'].apply((lambda x: re.sub('[۰-۹]',' ',x)))
#Removing all punctuations marks from Image text
df_train['Image_Text']=df_train['Image_Text'].apply((lambda x: re.sub('[^\w\s]',' ',x)))
df_test['Image_Text']=df_test['Image_Text'].apply((lambda x: re.sub('[^\w\s]',' ',x)))
#Replace '_' with white space (if any)
df_train['Image_Text']=df_train['Image_Text'].apply((lambda x: re.sub('_',' ',x)))
df_test['Image_Text']=df_test['Image_Text'].apply((lambda x: re.sub('_',' ',x)))
#Tokenization using NLTK
import nltk
def identify_tokens(row):
    text= row['Image_Text'] 
    #print
    tokens = nltk.word_tokenize(text)
    #print
    #print(tokens)
    #aa=input("need more?")
    return tokens
df_train['Image_Text'] = df_train.apply(identify_tokens, axis=1)# apply method has axis column showing:
df_test['Image_Text'] = df_test.apply(identify_tokens, axis=1)# apply method has axis column showing:
#Removing stopwords and characters having lenggth<=1 from Image text
#importing manually compiled Urdu stopwords list having 414 entries
urdu_stop_word_file='C:\\Users\\FurqanSaddozai\\scrapping_tweets_16_july_2022\\Uni-multi-urdu-HS\\DFF_2023\\Urdu_stopwords.txt'
import codecs
f=codecs.open(urdu_stop_word_file,'r','utf-8-sig')
text=f.read()
#print(text)
stops=text.split()
#print("value of aa is equal to========")
#print(stops)
f.close()
#Removing stopwords and single characters from Image Text
def remove_stops(row):
    my_list = row['Image_Text']
    meaningful_words1 = [w for w in my_list if not w in stops]
    meaningful_words2 = [w for w in meaningful_words1 if len(w)>1]
    return (meaningful_words2)
df_train['Image_Text'] = df_train.apply(remove_stops, axis=1)# apply method has axis column showing:
df_test['Image_Text'] = df_test.apply(remove_stops, axis=1)# apply method has axis column showing:
#Rejoining tokenized words into strings/text
def rejoin_words(row):
    my_list = row['Image_Text']
    joined_words = ( " ".join(my_list))
    return joined_words
df_train['Image_Text'] = df_train.apply(rejoin_words, axis=1)# apply method has axis column showing:
df_test['Image_Text'] = df_test.apply(rejoin_words, axis=1)# apply method has axis column showing:
# Replace empty values with 'NIL'
df_train['Image_Text'] = df_train['Image_Text'].replace('', 'NIL')
df_test['Image_Text']=df_test['Image_Text'].replace('', 'NIL')
#print(df)

In [5]:
#5. Dividing training and test dataframes into corresponding variables: X_train, X_test, y_train, y_test
X_train=df_train['Image_Text']
X_test=df_test['Image_Text']
y_train=df_train['Label']
y_test=df_test['Label']

In [6]:
#6. Converting labels into numerical values
from tensorflow.keras.utils import to_categorical
# Convert labels to numerical values
labels1 = y_train.map({'Hate': 1, 'No_Hate': 0})
labels1 = to_categorical(labels1)  # Convert to one-hot encoded format
labels2 = y_test.map({'Hate': 1, 'No_Hate': 0})
labels2 = to_categorical(labels2)  # Convert to one-hot encoded format
y_train=labels1
y_test=labels2

In [7]:
#7. Tokenize Image text, converting it into sequence of integers, and padding the sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#X_train.shape
max_features = 5300
max_len=45
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X_train.values)
X1 = tokenizer.texts_to_sequences(X_train.values)
X1 = pad_sequences(X1,maxlen=max_len)

#tokenizer.fit_on_texts(X_test.values)
X2 = tokenizer.texts_to_sequences(X_test.values)
X2 = pad_sequences(X2,maxlen=max_len)

In [8]:
#8. Presenting the shape of input features and target variables/labels
print(X1.shape)
print(X2.shape)
print(y_train.shape)
print(y_test.shape)

(8800, 45)
(2200, 45)
(8800, 2)
(2200, 2)


In [9]:
#9. Finding the total length of vocabulary
vocab_size = len(tokenizer.word_index) + 1

In [10]:
#10. Printing the total length of vocabulary
print(vocab_size)

12228


In [11]:
#11. Load pre-trained word embeddings (specifically skipgram embeddings) for Urdu language.
#we downloaded skipgram embeddings from link
##https://github.com/samarh/urduvec

from gensim.models import word2vec
model = word2vec.KeyedVectors.load_word2vec_format('urduvec_140M_100K_300d.bin', binary=True)

In [12]:
#12. Create an embedding matrix for words in the tokenizer's vocabulary using pre-trained word embeddings
#only selected first 80 embeddings from 300 embeddings
import numpy as np

embedding_dim = 300  # Adjust based on your pretrained embedding dimension
selected_embedding_dim = 80  # Number of dimensions to select

num_words = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((num_words, selected_embedding_dim))  # Use selected_embedding_dim

for word, index in tokenizer.word_index.items():
    if word in model:
        full_embedding = model[word]  # Assuming model[word] gives you a 300-dimensional embedding
        selected_embedding = full_embedding[:selected_embedding_dim]  # Extract first 50 dimensions
        embedding_matrix[index] = selected_embedding

In [13]:
#13. Define and compile a Bidirectional LSTM neural network model for binary classification using Keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Flatten, Dropout, Dense, Bidirectional
from keras.optimizers import Adam
import tensorflow as tf

# Set random seeds for reproducibility
seed_value = 10
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
model = Sequential()
embedding_dim=80 #output embedding dimensions
model.add(Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False))
# Add an BiLSTM layer with 150 units
model.add(Bidirectional(LSTM(150, return_sequences = False, activation='relu')))
#Dense layer
model.add(Dense(512, activation='relu'))
#Dropout layer 
model.add(Dropout(0.5))
#Classification Layer
model.add(Dense(2, activation='sigmoid'))
#model.summary()
learning_rate = 0.001
optimizer = Adam(learning_rate=learning_rate)

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [14]:
#14. Model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 45, 80)            978240    
                                                                 
 bidirectional (Bidirection  (None, 300)               277200    
 al)                                                             
                                                                 
 dense (Dense)               (None, 512)               154112    
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense_1 (Dense)             (None, 2)                 1026      
                                                                 
Total params: 1410578 (5.38 MB)
Trainable params: 432338 (1.65 MB)
Non-trainable params: 978240 (3.73 MB)
________________

In [15]:
#15. Train a neural network model while monitoring the validation loss and stopping the training process 
#early if the validation loss does not improve for a specified number of epochs.
from tensorflow.keras.callbacks import EarlyStopping

# Define the EarlyStopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
#Training model
model.fit(X1, y_train, epochs=20, batch_size=22, validation_data=(X2, y_test), callbacks=[early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


<keras.src.callbacks.History at 0x1af8000fc10>

In [16]:
#16. Finding & printing Test accuracy & loss
test_loss, test_accuracy = model.evaluate(X2, y_test, batch_size=22)
print(test_accuracy)
print(test_loss)

0.5145454406738281
0.6922673583030701


In [17]:
#17. Importing libraries to calculate accracy, precision, recall and f-measure
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [18]:
yhat_classes = (model.predict(X2) > 0.5).astype("int32")



In [19]:
yhat_classes = yhat_classes[:, 0]
y_test=y_test[:, 0]

In [20]:
accuracy = accuracy_score(y_test, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(y_test, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(y_test, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(y_test, yhat_classes)
print('F1 score: %f' % f1)

Accuracy: 0.516818
Precision: 0.508933
Recall: 0.958182
F1 score: 0.664775
