{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "cilIdrLI_45O" }, "source": [ "**Import Libraries and Data** " ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "JQombcEn_45S" }, "outputs": [], "source": [ "import numpy as np \n", "import pandas as pd\n", "import os" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "id": "DXvzRjJA_45Y" }, "outputs": [], "source": [ "# import BERT tokenization\n", "\n", "!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "w78aCQUjAPrO", "outputId": "059c06ee-6569-44b9-9a8b-3db63408b400" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting tokenization\n", " Downloading tokenization-1.0.7-py3-none-any.whl (10 kB)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.8/dist-packages (from tokenization) (2022.6.2)\n", "Installing collected packages: tokenization\n", "Successfully installed tokenization-1.0.7\n" ] } ], "source": [ "pip install tokenization" ] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "if3e9neA_45a" }, "outputs": [], "source": [ "import tokenization\n", "import tensorflow as tf\n", "import tensorflow_hub as hub\n", "from tensorflow.keras.utils import to_categorical\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "LKDX7i3jAbNz", "outputId": "c68a6631-e570-4c3a-be08-b36c9d354535" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } ], "source": [ "from google.colab import drive\n", "import os\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "id": "lC91JoTC_45c" }, "outputs": [], "source": [ "data_t = pd.read_csv('/content/drive/My Drive/smartphone_preprocessed_sentiments.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "zKckPxr2m1sv" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 243 }, "id": "xFhCh4RZ_45e", "outputId": "af15ca6a-5a8b-4ebe-f6e5-a20ad14c9dcb" }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " " ], "text/plain": [ " Unnamed: 0 Unnamed: 0.1 Date User \\\n", "1 1 1.0 2022-10-30 23:00:47+00:00 GhostlyOwl \n", "2 2 2.0 2022-10-30 22:48:15+00:00 Sensible_George \n", "\n", " Location Tweets \\\n", "1 British Columbia I will go anywhere provided it be forward. ~ D... \n", "2 London, England @them_apples Compel parking companies to retai... \n", "\n", " c_Tweets \\\n", "1 I will go anywhere provided it be forward. ~ D... \n", "2 Compel parking companies to retain pay machine... \n", "\n", " cleanText Subjectivity Polarity \\\n", "1 anywher provid forward david livingston anasta... 0.0 0.0 \n", "2 compel park compani retain pay machin five mil... 0.0 0.0 \n", "\n", " Sentiment \n", "1 Neutral \n", "2 Neutral " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_t[1:3]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "NgJni98DoQLa" }, "outputs": [], "source": [ "data_t=data_t.dropna( axis=0, subset=[\"cleanText\"])" ] }, { "cell_type": "markdown", "metadata": { "id": "-6Q7quLz_45i" }, "source": [ "**Label encoding of labels**" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "_Y6zx-Iv_45j", "outputId": "3b39c47c-000f-404b-f431-ef946750c587" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[[0. 1. 0.]\n", " [0. 1. 0.]\n", " [0. 1. 0.]\n", " [0. 0. 1.]\n", " [0. 1. 0.]]\n" ] } ], "source": [ "#training\n", "label = preprocessing.LabelEncoder()\n", "x = label.fit_transform(data_t['Sentiment'])\n", "x = to_categorical(x)\n", "print(x[:5])" ] }, { "cell_type": "markdown", "metadata": { "id": "0J8ZarJn_45l" }, "source": [ "**Build a BERT layer**" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "id": "yz7hzOOr_45n" }, "outputs": [], "source": [ "m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'\n", "bert_layer = hub.KerasLayer(m_url, trainable=True)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "at8-uI27CmXd", "outputId": "e42b6061-1998-4871-cfc9-a6b2aac3f8f5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting bert-tensorflow\n", " Downloading bert_tensorflow-1.0.4-py2.py3-none-any.whl (64 kB)\n", "\u001b[K |████████████████████████████████| 64 kB 3.1 MB/s \n", "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from bert-tensorflow) (1.15.0)\n", "Installing collected packages: bert-tensorflow\n", "Successfully installed bert-tensorflow-1.0.4\n" ] } ], "source": [ "pip install bert-tensorflow\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "3_uy7j14G9M6" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 12, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "VrIJyrMtG9RI", "outputId": "eae62106-f46e-4042-eef5-74815410d1bc" }, "outputs": [ { "data": { "text/plain": [ "['preserve_unused_tokens=False']" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import sys\n", "from absl import flags\n", "sys.argv=['preserve_unused_tokens=False']\n", "flags.FLAGS(sys.argv)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "_-dyVm8oDj_q" }, "outputs": [], "source": [ "from bert import tokenization" ] }, { 
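"cell_type": "markdown", "metadata": {}, "source": [ "A quick sanity check on the label encoding (illustrative sketch, not part of the original run): `LabelEncoder` assigns integer ids in sorted class order, so Negative = 0, Neutral = 1, Positive = 2, and `to_categorical` turns those ids into the one-hot rows printed above. The cell below maps the first few rows back to class names." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Illustrative round-trip (sketch): argmax undoes to_categorical and\n", "# inverse_transform undoes LabelEncoder, recovering the class names.\n", "print(label.inverse_transform(np.argmax(x[:5], axis=1)))" ] }, {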
"cell_type": "markdown", "metadata": { "id": "aLQ0BE1w_45o" }, "source": [ "**Encoding the text**" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "id": "AR350PJ0_45q" }, "outputs": [], "source": [ "vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\n", "do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\n", "tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)\n", "\n", "def bert_encode(texts, tokenizer, max_len=512):\n", " all_tokens = []\n", " all_masks = []\n", " all_segments = []\n", " \n", " for text in texts:\n", " text = tokenizer.tokenize(text)\n", " \n", " text = text[:max_len-2]\n", " input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n", " pad_len = max_len-len(input_sequence)\n", " \n", " tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len\n", " pad_masks = [1] * len(input_sequence) + [0] * pad_len\n", " segment_ids = [0] * max_len\n", " \n", " all_tokens.append(tokens)\n", " all_masks.append(pad_masks)\n", " all_segments.append(segment_ids)\n", " \n", " return np.array(all_tokens), np.array(all_masks), np.array(all_segments)" ] }, { "cell_type": "markdown", "metadata": { "id": "WIgpLKl__45r" }, "source": [ "**Build The Model**" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "VoTnehxN_45t" }, "outputs": [], "source": [ "def build_model(bert_layer, max_len=512):\n", " input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"input_word_ids\")\n", " input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"input_mask\")\n", " segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"segment_ids\")\n", " \n", " pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n", " \n", " clf_output = sequence_output[:, 0, :]\n", " \n", " lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)\n", " lay = tf.keras.layers.Dropout(0.2)(lay)\n", " lay = tf.keras.layers.Dense(32, activation='relu')(lay)\n", " lay = tf.keras.layers.Dense(16, activation='relu')(lay)\n", " lay = tf.keras.layers.Dropout(0.2)(lay)\n", " out = tf.keras.layers.Dense(3, activation='softmax')(lay)\n", " \n", " model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n", " model.compile(tf.keras.optimizers.Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])\n", " \n", " return model" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "f9DwncVuFqta" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "yW-rgiMmn1yR" }, "outputs": [], "source": [ "data_t=data_t.dropna( axis=0, subset=[\"Sentiment\"])\n" ] }, { "cell_type": "markdown", "metadata": { "id": "DIlVUTpX_45u" }, "source": [ "Here We check only the first 250 characters of each text, and also we set train-test input and train labels" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "id": "i5c_zxlE_45v" }, "outputs": [], "source": [ "max_len = 200\n", "data_input = bert_encode(data_t.cleanText.values, tokenizer, max_len=max_len)\n", "data_labels = x" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1eJv1Up8_45w", "outputId": "aee4987c-bdc7-420f-e124-ce2668f94c16" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Negative' 'Neutral' 'Positive']\n" ] } ], "source": [ "labels = label.classes_\n", "print(labels)" ] }, { "cell_type": "code", 
"execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eSJs7UwK_45w", "outputId": "3da9640c-5719-4e58-fa2e-470588af25ef" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"model\"\n", "__________________________________________________________________________________________________\n", " Layer (type) Output Shape Param # Connected to \n", "==================================================================================================\n", " input_word_ids (InputLayer) [(None, 200)] 0 [] \n", " \n", " input_mask (InputLayer) [(None, 200)] 0 [] \n", " \n", " segment_ids (InputLayer) [(None, 200)] 0 [] \n", " \n", " keras_layer (KerasLayer) [(None, 768), 109482241 ['input_word_ids[0][0]', \n", " (None, 200, 768)] 'input_mask[0][0]', \n", " 'segment_ids[0][0]'] \n", " \n", " tf.__operators__.getitem (Slic (None, 768) 0 ['keras_layer[0][1]'] \n", " ingOpLambda) \n", " \n", " dense (Dense) (None, 64) 49216 ['tf.__operators__.getitem[0][0]'\n", " ] \n", " \n", " dropout (Dropout) (None, 64) 0 ['dense[0][0]'] \n", " \n", " dense_1 (Dense) (None, 32) 2080 ['dropout[0][0]'] \n", " \n", " dense_2 (Dense) (None, 16) 528 ['dense_1[0][0]'] \n", " \n", " dropout_1 (Dropout) (None, 16) 0 ['dense_2[0][0]'] \n", " \n", " dense_3 (Dense) (None, 3) 51 ['dropout_1[0][0]'] \n", " \n", "==================================================================================================\n", "Total params: 109,534,116\n", "Trainable params: 109,534,115\n", "Non-trainable params: 1\n", "__________________________________________________________________________________________________\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.8/dist-packages/keras/optimizers/optimizer_v2/adam.py:110: UserWarning: The `lr` argument is deprecated, use `learning_rate` instead.\n", " super(Adam, self).__init__(name, **kwargs)\n" ] } ], "source": [ "model = build_model(bert_layer, max_len=max_len)\n", "model.summary()" ] }, { "cell_type": "markdown", "metadata": { "id": "eDL4Sy6D_45y" }, "source": [ "**Run the model**" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "yOYHSG4K_45z", "outputId": "a64f3a0b-222c-4a14-d5a4-03ac8391e9a6" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "1614/1614 [==============================] - 1163s 712ms/step - loss: 0.3223 - accuracy: 0.8824 - val_loss: 0.0917 - val_accuracy: 0.9744\n", "Epoch 2/10\n", "1614/1614 [==============================] - 1150s 712ms/step - loss: 0.1087 - accuracy: 0.9701 - val_loss: 0.0536 - val_accuracy: 0.9870\n", "Epoch 3/10\n", "1614/1614 [==============================] - 1151s 713ms/step - loss: 0.0729 - accuracy: 0.9806 - val_loss: 0.0749 - val_accuracy: 0.9808\n", "Epoch 4/10\n", "1614/1614 [==============================] - 1153s 714ms/step - loss: 0.0596 - accuracy: 0.9849 - val_loss: 0.0729 - val_accuracy: 0.9732\n", "Epoch 5/10\n", "1614/1614 [==============================] - 1152s 714ms/step - loss: 0.0478 - accuracy: 0.9876 - val_loss: 0.0503 - val_accuracy: 0.9887\n", "Epoch 6/10\n", "1614/1614 [==============================] - 1151s 713ms/step - loss: 0.0393 - accuracy: 0.9900 - val_loss: 0.0685 - val_accuracy: 0.9856\n", "Epoch 7/10\n", "1614/1614 [==============================] - 1151s 713ms/step - loss: 0.0339 - accuracy: 0.9915 - val_loss: 0.0898 - val_accuracy: 0.9823\n", "Epoch 8/10\n", "1614/1614 
[==============================] - 1150s 713ms/step - loss: 0.0295 - accuracy: 0.9919 - val_loss: 0.0596 - val_accuracy: 0.9873\n", "Epoch 9/10\n", "1614/1614 [==============================] - 1150s 713ms/step - loss: 0.0286 - accuracy: 0.9932 - val_loss: 0.0876 - val_accuracy: 0.9814\n", "Epoch 10/10\n", "1614/1614 [==============================] - 1150s 713ms/step - loss: 0.0188 - accuracy: 0.9945 - val_loss: 0.0771 - val_accuracy: 0.9842\n" ] } ], "source": [ "#checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)\n", "#earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)\n", "#test_labels=y\n", "train_sh = model.fit(\n", " data_input, data_labels,\n", " validation_split=0.20,\n", " epochs=10,\n", " #callbacks=[checkpoint, earlystopping],\n", " batch_size=16\n", ")" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "IMFFZsp8o8mD" }, "outputs": [], "source": [ "data_test = pd.read_csv('/content/drive/My Drive/smartphone_preprocessed_sentiments_test.csv')\n", "data_test = data_test.dropna(axis=0, subset=[\"cleanText\"])\n", "# testing: encode the held-out labels\n", "label = preprocessing.LabelEncoder()\n", "y = label.fit_transform(data_test['Sentiment'])\n", "y = to_categorical(y)\n", "#print(y[:5])\n", "max_len = 200\n", "test_input = bert_encode(data_test.cleanText.values, tokenizer, max_len=max_len)\n", "test_labels = y" ] }, { "cell_type": "code", "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "dVPPMbANo9OF", "outputId": "0dc7d4ce-71c6-4725-81af-c3c884d21c5b" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "202/202 [==============================] - 91s 454ms/step\n", " precision recall f1-score support\n", "\n", " 0 0.99 0.98 0.98 884\n", " 1 1.00 0.99 1.00 3209\n", " 2 0.99 1.00 0.99 2366\n", "\n", " accuracy 0.99 6459\n", " macro avg 0.99 0.99 0.99 6459\n", "weighted avg 0.99 0.99 0.99 6459\n", "\n", "[[ 869 0 15]\n", " [ 8 3189 12]\n", " [ 5 2 2359]]\n", "Accuracy: 0.993497\n", "Precision: 0.993526\n", "Recall: 0.993497\n", "F1 score: 0.993501\n" ] } ], "source": [ "from matplotlib import pyplot as plt\n", "from sklearn.metrics import precision_score\n", "from sklearn.metrics import recall_score\n", "from sklearn.metrics import f1_score\n", "import seaborn as sns\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix\n", "test_predictions = model.predict(test_input)\n", "test_labels = np.argmax(test_labels, axis=1)\n", "print(classification_report(test_labels, np.argmax(test_predictions, axis=1)))\n", "print(confusion_matrix(test_labels, np.argmax(test_predictions, axis=1)))\n", "accuracy = accuracy_score(test_labels, np.argmax(test_predictions, axis=1))\n", "print('Accuracy: %f' % accuracy)\n", "precision = precision_score(test_labels, np.argmax(test_predictions, axis=1), average='weighted')\n", "print('Precision: %f' % precision)\n", "recall = recall_score(test_labels, np.argmax(test_predictions, axis=1), average='weighted')\n", "print('Recall: %f' % recall)\n", "f1 = f1_score(test_labels, np.argmax(test_predictions, axis=1), average='weighted')\n", "print('F1 score: %f' % f1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "mH4tr1yQo9W-" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "EasZOhjwo9Zg" }, "outputs": [],
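"source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The fit call above stores per-epoch metrics in `train_sh` but never uses them. A minimal sketch of plotting the learning curves, assuming only the standard Keras `History` keys `accuracy` and `val_accuracy`:" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sketch, not part of the original run: plot the recorded learning curves.\n", "from matplotlib import pyplot as plt\n", "\n", "plt.plot(train_sh.history['accuracy'], label='train accuracy')\n", "plt.plot(train_sh.history['val_accuracy'], label='val accuracy')\n", "plt.xlabel('epoch')\n", "plt.ylabel('accuracy')\n", "plt.legend()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [],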
"source": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "9StZ_GnWn2lo" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 1, "metadata": { "id": "sDgpnIRd16Tb" }, "outputs": [], "source": [ "import numpy as np \n", "import pandas as pd\n", "import os\n", "\n", "# import BERT tokenization\n", "\n", "!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py\n", "\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6cMy1yQX2uOd", "outputId": "9338c84b-8c2a-45f4-83fc-1f3a7e19c2e5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting tokenization\n", " Downloading tokenization-1.0.7-py3-none-any.whl (10 kB)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.8/dist-packages (from tokenization) (2022.6.2)\n", "Installing collected packages: tokenization\n", "Successfully installed tokenization-1.0.7\n" ] } ], "source": [ "pip install tokenization\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "FH6f2pMP16am", "outputId": "ac8a71b3-f717-4dad-ce5e-9b14ec26cf04" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mounted at /content/drive\n" ] } ], "source": [ "import tokenization\n", "import tensorflow as tf\n", "import tensorflow_hub as hub\n", "from tensorflow.keras.utils import to_categorical\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import train_test_split\n", "\n", "from google.colab import drive\n", "import os\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "ltRKPKiV16fl" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 4, "metadata": { "id": "EEPZeaS0n2sO" }, "outputs": [], "source": [ "data = pd.read_csv('/content/drive/My Drive/smartphone_withoutpreprocessed_sentiments.csv')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "id": "JyQMxy4wn2u1" }, "outputs": [], "source": [ "data=data.dropna( axis=0, subset=[\"Tweets\"])\n", "#training\n", "label = preprocessing.LabelEncoder()\n", "x = label.fit_transform(data['Sentiment'])\n", "x = to_categorical(x)\n", "\n", "#testing\n", "#label = preprocessing.LabelEncoder()\n", "#y = label.fit_transform(test_data['Sentiment'])\n", "#y = to_categorical(y)\n", "\n", "\n", "m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'\n", "bert_layer = hub.KerasLayer(m_url, trainable=True)\n", "\n", "\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vKEd9x3f3c2H", "outputId": "1a8cb53a-8546-4c4f-e536-91e3f841a9bc" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Collecting bert-tensorflow\n", " Downloading bert_tensorflow-1.0.4-py2.py3-none-any.whl (64 kB)\n", "\u001b[K |████████████████████████████████| 64 kB 2.0 MB/s \n", "\u001b[?25hRequirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from bert-tensorflow) (1.15.0)\n", "Installing collected packages: bert-tensorflow\n", "Successfully installed bert-tensorflow-1.0.4\n" ] } ], "source": [ "pip install 
bert-tensorflow" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "id": "tIEF8E_en2xP" }, "outputs": [], "source": [ "import sys\n", "from absl import flags\n", "sys.argv=['preserve_unused_tokens=False']\n", "flags.FLAGS(sys.argv)\n", "\n", "from bert import tokenization" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "id": "iK2j_L-6n2zx" }, "outputs": [], "source": [ "vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\n", "do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\n", "tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)\n", "\n", "def bert_encode(texts, tokenizer, max_len=512):\n", " all_tokens = []\n", " all_masks = []\n", " all_segments = []\n", " \n", " for text in texts:\n", " text = tokenizer.tokenize(text)\n", " \n", " text = text[:max_len-2]\n", " input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n", " pad_len = max_len-len(input_sequence)\n", " \n", " tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len\n", " pad_masks = [1] * len(input_sequence) + [0] * pad_len\n", " segment_ids = [0] * max_len\n", " \n", " all_tokens.append(tokens)\n", " all_masks.append(pad_masks)\n", " all_segments.append(segment_ids)\n", " \n", " return np.array(all_tokens), np.array(all_masks), np.array(all_segments)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "id": "tDfeXu45n22H" }, "outputs": [], "source": [ "def build_model(bert_layer, max_len=512):\n", " input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"input_word_ids\")\n", " input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"input_mask\")\n", " segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"segment_ids\")\n", " \n", " pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n", " \n", " clf_output = sequence_output[:, 0, :]\n", " \n", " lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)\n", " lay = tf.keras.layers.Dropout(0.2)(lay)\n", " lay = tf.keras.layers.Dense(32, activation='relu')(lay)\n", " lay = tf.keras.layers.Dense(16, activation='relu')(lay)\n", " lay = tf.keras.layers.Dropout(0.2)(lay)\n", " out = tf.keras.layers.Dense(3, activation='softmax')(lay)\n", " \n", " model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n", " model.compile(tf.keras.optimizers.Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])\n", " \n", " return model" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4SQy8-M8n24t", "outputId": "d29f1a8e-78e7-4006-b765-86f99147809f" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Negative' 'Neutral' 'Positive']\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.8/dist-packages/keras/optimizers/optimizer_v2/adam.py:110: UserWarning: The `lr` argument is deprecated, use `learning_rate` instead.\n", " super(Adam, self).__init__(name, **kwargs)\n" ] } ], "source": [ "max_len = 200\n", "data_input = bert_encode(data.Tweets.values, tokenizer, max_len=max_len)\n", "data_labels = x\n", "labels = label.classes_\n", "print(labels)\n", "model = build_model(bert_layer, max_len=max_len)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "mEHxaoxXn263", "outputId": "3e967cba-94bf-4aeb-ecbf-d63b57c312b7" }, "outputs": [ { "name": "stdout", 
"output_type": "stream", "text": [ "Epoch 1/10\n", "1621/1621 [==============================] - 1209s 737ms/step - loss: 0.4848 - accuracy: 0.8187 - val_loss: 0.2592 - val_accuracy: 0.9190\n", "Epoch 2/10\n", "1621/1621 [==============================] - 1213s 748ms/step - loss: 0.2104 - accuracy: 0.9370 - val_loss: 0.3539 - val_accuracy: 0.9206\n", "Epoch 3/10\n", "1621/1621 [==============================] - 1210s 747ms/step - loss: 0.1402 - accuracy: 0.9618 - val_loss: 0.2676 - val_accuracy: 0.9388\n", "Epoch 4/10\n", "1621/1621 [==============================] - 1213s 748ms/step - loss: 0.1036 - accuracy: 0.9718 - val_loss: 0.2405 - val_accuracy: 0.9531\n", "Epoch 5/10\n", "1621/1621 [==============================] - 1216s 750ms/step - loss: 0.0742 - accuracy: 0.9808 - val_loss: 0.2299 - val_accuracy: 0.9520\n", "Epoch 6/10\n", "1621/1621 [==============================] - 1216s 750ms/step - loss: 0.0593 - accuracy: 0.9849 - val_loss: 0.3030 - val_accuracy: 0.9540\n", "Epoch 7/10\n", "1621/1621 [==============================] - 1216s 750ms/step - loss: 0.0486 - accuracy: 0.9863 - val_loss: 0.3550 - val_accuracy: 0.9510\n", "Epoch 8/10\n", "1621/1621 [==============================] - 1216s 750ms/step - loss: 0.0379 - accuracy: 0.9890 - val_loss: 0.3728 - val_accuracy: 0.9502\n", "Epoch 9/10\n", "1621/1621 [==============================] - 1217s 751ms/step - loss: 0.0364 - accuracy: 0.9904 - val_loss: 0.3250 - val_accuracy: 0.9488\n", "1621/1621 [==============================] - 1217s 751ms/step - loss: 0.0364 - accuracy: 0.9904 - val_loss: 0.3250 - val_accuracy: 0.9488\n", "Epoch 10/10\n", "Epoch 10/10\n", "1621/1621 [==============================] - 1216s 750ms/step - loss: 0.0275 - accuracy: 0.9922 - val_loss: 0.3902 - val_accuracy: 0.9488\n", "1621/1621 [==============================] - 1216s 750ms/step - loss: 0.0275 - accuracy: 0.9922 - val_loss: 0.3902 - val_accuracy: 0.9488\n" ] } ], "source": [ "#checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)\n", "#earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)\n", "train_sh = model.fit(\n", " data_input, data_labels,\n", " validation_split=0.20,\n", " epochs=10,\n", " #callbacks=[checkpoint, earlystopping],\n", " batch_size=16\n", ")" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "id": "15V1u4u7n29G" }, "outputs": [], "source": [ "data_test = pd.read_csv('/content/drive/My Drive/smartphone_withoutpreprocessed_sentiments_test.csv')\n", "data_test=data_test.dropna( axis=0, subset=[\"Tweets\"])\n", "#training\n", "label = preprocessing.LabelEncoder()\n", "y = label.fit_transform(data_test['Sentiment'])\n", "y = to_categorical(y)\n", "#print(y[:5])\n", "max_len = 200\n", "test_input = bert_encode(data_test.Tweets.values, tokenizer, max_len=max_len)\n", "test_labels = y" ] }, { "cell_type": "code", "execution_count": 14, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "toweEzDCn3Bp", "outputId": "bd3a2541-9fca-4d37-baf5-7797191f6280" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "203/203 [==============================] - 92s 455ms/step\n", " precision recall f1-score support\n", "\n", " 0 0.96 0.98 0.97 1095\n", " 1 1.00 0.98 0.99 2411\n", " 2 0.99 0.99 0.99 2978\n", "\n", " accuracy 0.99 6484\n", " macro avg 0.98 0.98 0.98 6484\n", "weighted avg 0.99 0.99 0.99 6484\n", "\n", "[[1068 4 23]\n", " [ 25 2365 21]\n", " [ 14 5 2959]]\n", "Accuracy: 0.985811\n", 
"Precision: 0.985911\n", "Recall: 0.985811\n", "F1 score: 0.985826\n" ] } ], "source": [ "from matplotlib import pyplot as plt #(matplotblib)\n", "from sklearn.metrics import precision_score\n", "from sklearn.metrics import recall_score\n", "from sklearn.metrics import f1_score\n", "import seaborn as sns #(visualsize)\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix\n", "test_predictions =model.predict(test_input)\n", "test_labels=np.argmax(test_labels, axis=1)\n", "print(classification_report(test_labels,np.argmax(test_predictions,axis=1)))\n", "print(confusion_matrix(test_labels,np.argmax(test_predictions,axis=1)))\n", "accuracy = accuracy_score(test_labels,np.argmax(test_predictions,axis=1))\n", "print('Accuracy: %f' % accuracy)\n", "precision = precision_score(test_labels,np.argmax(test_predictions,axis=1),average='weighted')\n", "print('Precision: %f' % precision)\n", "recall = recall_score(test_labels,np.argmax(test_predictions,axis=1), average='weighted')\n", "print('Recall: %f' % recall)\n", "f1 = f1_score(test_labels,np.argmax(test_predictions,axis=1), average='weighted')\n", "print('F1 score: %f' % f1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "8FYB11mEn3EP" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 15, "metadata": { "id": "zfZHmiV5z_8N" }, "outputs": [], "source": [ "import numpy as np \n", "import pandas as pd\n", "import os" ] }, { "cell_type": "code", "execution_count": 16, "metadata": { "id": "y0o0Nbm_z__P" }, "outputs": [], "source": [ "# import BERT tokenization\n", "\n", "!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "iHMINQxB0QDP", "outputId": "8f40f252-794a-4522-ce4c-44a63da7e832" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: tokenization in /usr/local/lib/python3.8/dist-packages (1.0.7)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.8/dist-packages (from tokenization) (2022.6.2)\n" ] } ], "source": [ "pip install tokenization" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "rG6hmVnb0ACU", "outputId": "e47511a9-931e-49da-a38a-62a2c20f6679" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ] } ], "source": [ "import tokenization\n", "import tensorflow as tf\n", "import tensorflow_hub as hub\n", "from tensorflow.keras.utils import to_categorical\n", "from sklearn import preprocessing\n", "from sklearn.model_selection import train_test_split\n", "\n", "from google.colab import drive\n", "import os\n", "drive.mount('/content/drive')" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "gb7B7SKzy4o9" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 19, "metadata": { "id": "x-QM02jTn3GL" }, "outputs": [], "source": [ "data = pd.read_csv('/content/drive/My Drive/crypto_10k_tweets_preprocessed_sentiments.csv')" ] }, { "cell_type": "code", "execution_count": 20, 
"metadata": { "id": "xN17BFN2alVH" }, "outputs": [], "source": [ "data=data.dropna( axis=0, subset=[\"cleanText\"])\n", "\n", "\n", "#training\n", "label = preprocessing.LabelEncoder()\n", "x = label.fit_transform(data['Sentiment'])\n", "x = to_categorical(x)\n", "\n", "\n", "m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'\n", "bert_layer = hub.KerasLayer(m_url, trainable=True)\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wKNH9QpcalXm", "outputId": "ae344e0b-9b2d-4661-d7a5-49bad94e0a1c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", "Requirement already satisfied: bert-tensorflow in /usr/local/lib/python3.8/dist-packages (1.0.4)\n", "Requirement already satisfied: six in /usr/local/lib/python3.8/dist-packages (from bert-tensorflow) (1.15.0)\n" ] } ], "source": [ "pip install bert-tensorflow" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "id": "GGV3Sx9OalaF" }, "outputs": [], "source": [ "import sys\n", "from absl import flags\n", "sys.argv=['preserve_unused_tokens=False']\n", "flags.FLAGS(sys.argv)\n", "\n", "from bert import tokenization" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "id": "57JO-17calc_" }, "outputs": [], "source": [ "vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\n", "do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\n", "tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)\n", "\n", "def bert_encode(texts, tokenizer, max_len=512):\n", " all_tokens = []\n", " all_masks = []\n", " all_segments = []\n", " \n", " for text in texts:\n", " text = tokenizer.tokenize(text)\n", " \n", " text = text[:max_len-2]\n", " input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n", " pad_len = max_len-len(input_sequence)\n", " \n", " tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len\n", " pad_masks = [1] * len(input_sequence) + [0] * pad_len\n", " segment_ids = [0] * max_len\n", " \n", " all_tokens.append(tokens)\n", " all_masks.append(pad_masks)\n", " all_segments.append(segment_ids)\n", " \n", " return np.array(all_tokens), np.array(all_masks), np.array(all_segments)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "IgRSlL7Pa2pI" }, "outputs": [], "source": [ "def build_model(bert_layer, max_len=512):\n", " input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"input_word_ids\")\n", " input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"input_mask\")\n", " segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"segment_ids\")\n", " \n", " pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n", " \n", " clf_output = sequence_output[:, 0, :]\n", " \n", " lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)\n", " lay = tf.keras.layers.Dropout(0.2)(lay)\n", " lay = tf.keras.layers.Dense(32, activation='relu')(lay)\n", " lay = tf.keras.layers.Dense(16, activation='relu')(lay)\n", " lay = tf.keras.layers.Dropout(0.2)(lay)\n", " out = tf.keras.layers.Dense(3, activation='softmax')(lay)\n", " \n", " model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n", " model.compile(tf.keras.optimizers.Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])\n", " \n", " return model" ] }, { 
"cell_type": "code", "execution_count": 25, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "MFMt1g6Za2su", "outputId": "8ce02379-d656-4f92-c205-5b5d836f6014" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Negative' 'Neutral' 'Positive']\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.8/dist-packages/keras/optimizers/optimizer_v2/adam.py:110: UserWarning: The `lr` argument is deprecated, use `learning_rate` instead.\n", " super(Adam, self).__init__(name, **kwargs)\n" ] } ], "source": [ "max_len = 200\n", "data_input = bert_encode(data.cleanText.values, tokenizer, max_len=max_len)\n", "data_labels = x\n", "\n", "\n", "labels = label.classes_\n", "print(labels)\n", "\n", "model = build_model(bert_layer, max_len=max_len)\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-ncLBQbga2vr", "outputId": "682df943-c08c-4ade-bf16-ba5dd1396f9c" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "459/459 [==============================] - 340s 716ms/step - loss: 0.5579 - accuracy: 0.7723 - val_loss: 0.3138 - val_accuracy: 0.8938\n", "Epoch 2/10\n", "459/459 [==============================] - 340s 742ms/step - loss: 0.2360 - accuracy: 0.9204 - val_loss: 0.1810 - val_accuracy: 0.9597\n", "Epoch 3/10\n", "459/459 [==============================] - 341s 743ms/step - loss: 0.1236 - accuracy: 0.9657 - val_loss: 0.1556 - val_accuracy: 0.9608\n", "Epoch 4/10\n", "459/459 [==============================] - 341s 743ms/step - loss: 0.0798 - accuracy: 0.9786 - val_loss: 0.2027 - val_accuracy: 0.9581\n", "Epoch 5/10\n", "459/459 [==============================] - 341s 743ms/step - loss: 0.0728 - accuracy: 0.9824 - val_loss: 0.1801 - val_accuracy: 0.9662\n", "Epoch 6/10\n", "459/459 [==============================] - 340s 742ms/step - loss: 0.0408 - accuracy: 0.9910 - val_loss: 0.2025 - val_accuracy: 0.9630\n", "Epoch 7/10\n", "459/459 [==============================] - 341s 743ms/step - loss: 0.0491 - accuracy: 0.9884 - val_loss: 0.2965 - val_accuracy: 0.9504\n", "Epoch 8/10\n", "459/459 [==============================] - 341s 743ms/step - loss: 0.0502 - accuracy: 0.9903 - val_loss: 0.1991 - val_accuracy: 0.9684\n", "Epoch 9/10\n", "459/459 [==============================] - 341s 743ms/step - loss: 0.0298 - accuracy: 0.9931 - val_loss: 0.2223 - val_accuracy: 0.9706\n", "Epoch 10/10\n", "459/459 [==============================] - 341s 743ms/step - loss: 0.0248 - accuracy: 0.9955 - val_loss: 0.2224 - val_accuracy: 0.9684\n" ] } ], "source": [ "#checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)\n", "#earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)\n", "train_sh2 = model.fit(\n", " data_input, data_labels,\n", " validation_split=0.20,\n", " epochs=10,\n", " #callbacks=[checkpoint, earlystopping],\n", " batch_size=16\n", ")" ] }, { "cell_type": "code", "execution_count": 28, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nFJquVpha2zI", "outputId": "897bd6c5-ab2a-4449-d498-98438188ad6b" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "58/58 [==============================] - 25s 441ms/step\n", " precision recall f1-score support\n", "\n", " 0 0.95 0.98 0.96 170\n", " 1 1.00 1.00 1.00 1232\n", " 2 1.00 0.98 0.99 438\n", "\n", " accuracy 0.99 1840\n", " macro avg 
0.98 0.99 0.98 1840\n", "weighted avg 0.99 0.99 0.99 1840\n", "\n", "[[ 166 3 1]\n", " [ 3 1229 0]\n", " [ 6 1 431]]\n", "Accuracy: 0.992391\n", "Precision: 0.992525\n", "Recall: 0.992391\n", "F1 score: 0.992428\n" ] } ], "source": [ "data_test = pd.read_csv('/content/drive/My Drive/crypto_10k_tweets_preprocessed_sentiments_test.csv')\n", "data_test = data_test.dropna(axis=0, subset=[\"cleanText\"])\n", "# testing: encode the held-out labels\n", "label = preprocessing.LabelEncoder()\n", "y = label.fit_transform(data_test['Sentiment'])\n", "y = to_categorical(y)\n", "#print(y[:5])\n", "max_len = 200\n", "test_input = bert_encode(data_test.cleanText.values, tokenizer, max_len=max_len)\n", "test_labels = y\n", "\n", "from matplotlib import pyplot as plt\n", "from sklearn.metrics import precision_score\n", "from sklearn.metrics import recall_score\n", "from sklearn.metrics import f1_score\n", "import seaborn as sns\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix\n", "test_predictions = model.predict(test_input)\n", "test_labels = np.argmax(test_labels, axis=1)\n", "print(classification_report(test_labels, np.argmax(test_predictions, axis=1)))\n", "print(confusion_matrix(test_labels, np.argmax(test_predictions, axis=1)))\n", "accuracy = accuracy_score(test_labels, np.argmax(test_predictions, axis=1))\n", "print('Accuracy: %f' % accuracy)\n", "precision = precision_score(test_labels, np.argmax(test_predictions, axis=1), average='weighted')\n", "print('Precision: %f' % precision)\n", "recall = recall_score(test_labels, np.argmax(test_predictions, axis=1), average='weighted')\n", "print('Recall: %f' % recall)\n", "f1 = f1_score(test_labels, np.argmax(test_predictions, axis=1), average='weighted')\n", "print('F1 score: %f' % f1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "7AyD6eVcBGYY" }, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 30, "metadata": { "id": "w_iZnhl3BGiT" }, "outputs": [], "source": [ "data = pd.read_csv('/content/drive/My Drive/crypto_10k_tweets_withoutpreprocessed_sentiments.csv')\n", "data = data.dropna(axis=0, subset=[\"Content\"])\n", "\n", "\n", "# training: encode labels and load the BERT layer\n", "label = preprocessing.LabelEncoder()\n", "x = label.fit_transform(data['Sentiment'])\n", "x = to_categorical(x)\n", "\n", "\n", "m_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'\n", "bert_layer = hub.KerasLayer(m_url, trainable=True)\n" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "id": "SJUm2uKmBGrv" }, "outputs": [], "source": [ "vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()\n", "do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()\n", "tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)\n", "\n", "def bert_encode(texts, tokenizer, max_len=512):\n", " all_tokens = []\n", " all_masks = []\n", " all_segments = []\n", " \n", " for text in texts:\n", " text = tokenizer.tokenize(text)\n", " \n", " text = text[:max_len-2]\n", " input_sequence = [\"[CLS]\"] + text + [\"[SEP]\"]\n", " pad_len = max_len-len(input_sequence)\n", " \n", " tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len\n", " pad_masks = [1] * len(input_sequence) + [0] * pad_len\n", " segment_ids = [0] * max_len\n", " \n", " all_tokens.append(tokens)\n", " all_masks.append(pad_masks)\n", " all_segments.append(segment_ids)\n", " \n", " return np.array(all_tokens), np.array(all_masks),
np.array(all_segments)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "id": "R6o4WvhvBGzW" }, "outputs": [], "source": [ "def build_model(bert_layer, max_len=512):\n", " input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"input_word_ids\")\n", " input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"input_mask\")\n", " segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=\"segment_ids\")\n", " \n", " pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])\n", " \n", " clf_output = sequence_output[:, 0, :]\n", " \n", " lay = tf.keras.layers.Dense(64, activation='relu')(clf_output)\n", " lay = tf.keras.layers.Dropout(0.2)(lay)\n", " lay = tf.keras.layers.Dense(32, activation='relu')(lay)\n", " lay = tf.keras.layers.Dense(16, activation='relu')(lay)\n", " lay = tf.keras.layers.Dropout(0.2)(lay)\n", " out = tf.keras.layers.Dense(3, activation='softmax')(lay)\n", " \n", " model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)\n", " model.compile(tf.keras.optimizers.Adam(lr=2e-5), loss='categorical_crossentropy', metrics=['accuracy'])\n", " \n", " return model" ] }, { "cell_type": "code", "execution_count": 33, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "vnMWi1pOBG8G", "outputId": "035cc106-b13f-4e52-9fe8-b13b5f398497" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['Negative' 'Neutral' 'Positive']\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.8/dist-packages/keras/optimizers/optimizer_v2/adam.py:110: UserWarning: The `lr` argument is deprecated, use `learning_rate` instead.\n", " super(Adam, self).__init__(name, **kwargs)\n" ] } ], "source": [ "max_len = 200\n", "data_input = bert_encode(data.Content.values, tokenizer, max_len=max_len)\n", "data_labels = x\n", "\n", "\n", "labels = label.classes_\n", "print(labels)\n", "\n", "model = build_model(bert_layer, max_len=max_len)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "an9KDmHxBHD1", "outputId": "e3fb5ef0-03fa-487f-91ec-7ae2f6039f80" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/10\n", "500/500 [==============================] - 379s 734ms/step - loss: 0.6074 - accuracy: 0.7531 - val_loss: 0.3676 - val_accuracy: 0.8715\n", "Epoch 2/10\n", "500/500 [==============================] - 376s 753ms/step - loss: 0.3014 - accuracy: 0.8991 - val_loss: 0.3438 - val_accuracy: 0.9110\n", "Epoch 3/10\n", "500/500 [==============================] - 377s 754ms/step - loss: 0.1746 - accuracy: 0.9460 - val_loss: 0.2418 - val_accuracy: 0.9400\n", "Epoch 4/10\n", "500/500 [==============================] - 376s 753ms/step - loss: 0.1101 - accuracy: 0.9641 - val_loss: 0.2686 - val_accuracy: 0.9320\n", "Epoch 5/10\n", "500/500 [==============================] - 377s 754ms/step - loss: 0.0745 - accuracy: 0.9787 - val_loss: 0.3562 - val_accuracy: 0.9380\n", "Epoch 6/10\n", "500/500 [==============================] - 376s 753ms/step - loss: 0.0553 - accuracy: 0.9844 - val_loss: 0.3645 - val_accuracy: 0.9330\n", "Epoch 7/10\n", "500/500 [==============================] - 377s 754ms/step - loss: 0.0439 - accuracy: 0.9879 - val_loss: 0.3589 - val_accuracy: 0.9395\n", "Epoch 8/10\n", "500/500 [==============================] - 377s 754ms/step - loss: 0.0345 - accuracy: 0.9902 - val_loss: 0.4334 - val_accuracy: 0.9250\n", "Epoch 
9/10\n", "500/500 [==============================] - 377s 753ms/step - loss: 0.0366 - accuracy: 0.9918 - val_loss: 0.3511 - val_accuracy: 0.9335\n", "Epoch 10/10\n", "500/500 [==============================] - 376s 753ms/step - loss: 0.0290 - accuracy: 0.9921 - val_loss: 0.4590 - val_accuracy: 0.9350\n" ] } ], "source": [ "#checkpoint = tf.keras.callbacks.ModelCheckpoint('model.h5', monitor='val_accuracy', save_best_only=True, verbose=1)\n", "#earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)\n", "train_sh3 = model.fit(\n", " data_input, data_labels,\n", " validation_split=0.20,\n", " epochs=10,\n", " #callbacks=[checkpoint, earlystopping],\n", " batch_size=16\n", ")" ] }, { "cell_type": "code", "execution_count": 36, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Hx29ZDSmBdre", "outputId": "452efcf9-661f-4a4c-feac-a434d425b197" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "63/63 [==============================] - 26s 416ms/step\n", " precision recall f1-score support\n", "\n", " 0 0.97 0.92 0.95 169\n", " 1 0.99 0.98 0.99 1226\n", " 2 0.97 0.99 0.98 605\n", "\n", " accuracy 0.98 2000\n", " macro avg 0.97 0.96 0.97 2000\n", "weighted avg 0.98 0.98 0.98 2000\n", "\n", "[[ 156 8 5]\n", " [ 4 1207 15]\n", " [ 1 7 597]]\n", "Accuracy: 0.980000\n", "Precision: 0.980046\n", "Recall: 0.980000\n", "F1 score: 0.979946\n" ] } ], "source": [ "data_test = pd.read_csv('/content/drive/My Drive/crypto_10k_tweets_withoutpreprocessed_sentiments_test.csv')\n", "data_test = data_test.dropna(axis=0, subset=[\"Content\"])\n", "# testing: encode the held-out labels\n", "label = preprocessing.LabelEncoder()\n", "y = label.fit_transform(data_test['Sentiment'])\n", "y = to_categorical(y)\n", "#print(y[:5])\n", "max_len = 200\n", "test_input = bert_encode(data_test.Content.values, tokenizer, max_len=max_len)\n", "test_labels = y\n", "\n", "from matplotlib import pyplot as plt\n", "from sklearn.metrics import precision_score\n", "from sklearn.metrics import recall_score\n", "from sklearn.metrics import f1_score\n", "import seaborn as sns\n", "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix\n", "test_predictions = model.predict(test_input)\n", "test_labels = np.argmax(test_labels, axis=1)\n", "print(classification_report(test_labels, np.argmax(test_predictions, axis=1)))\n", "print(confusion_matrix(test_labels, np.argmax(test_predictions, axis=1)))\n", "accuracy = accuracy_score(test_labels, np.argmax(test_predictions, axis=1))\n", "print('Accuracy: %f' % accuracy)\n", "precision = precision_score(test_labels, np.argmax(test_predictions, axis=1), average='weighted')\n", "print('Precision: %f' % precision)\n", "recall = recall_score(test_labels, np.argmax(test_predictions, axis=1), average='weighted')\n", "print('Recall: %f' % recall)\n", "f1 = f1_score(test_labels, np.argmax(test_predictions, axis=1), average='weighted')\n", "print('F1 score: %f' % f1)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "5Vda2kZ2BdxV" }, "outputs": [], "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "machine_shape": "hm", "provenance": [] }, "gpuClass": "standard", "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python",
"nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 1 }