{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "name": "Employee.ipynb", "provenance": [], "collapsed_sections": [] }, "kernelspec": { "name": "python3", "display_name": "Python 3" } }, "cells": [ { "cell_type": "code", "metadata": { "id": "pCNyopPPqhYl" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "sohdybpH-qIe" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "qMycr31tBgRE" }, "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import seaborn as sns" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "KnLIYhusGRWO" }, "source": [ "**Mounting Google Drive**\n" ] }, { "cell_type": "code", "metadata": { "id": "nKqyCo9_xCEC", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "a1c050e1-306f-426a-b30d-44aaa3953979" }, "source": [ "from google.colab import drive\n", "drive.mount('/content/drive')\n" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "C-K07Bx8jHwK" }, "source": [ "dataF = pd.read_csv('/content/drive/MyDrive/sentiments - sentiments.csv')\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 341 }, "id": "A6pLuAnHjQNG", "outputId": "ed13af40-9af3-4fdd-fc32-8f64c7340a80" }, "source": [ "dataF[:2]" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0Unnamed: 0.1companylocationdatesjob-titlesummaryprosconsadvice-to-mgmtoverall-ratingswork-balance-starsculture-values-starscarrer-opportunities-starscomp-benefit-starssenior-mangemnet-starshelpful-countlinktextcleanTextPolarity Scoresentimentsentiments
0035googleDearborn, MINov 20, 2018Current Employee - Google SpecialistGoogleGoogle is one of the beast Company in the worldYou have to work hardnone5344541https://www.glassdoor.com/Reviews/Google-Revie...Google Google is one of the beast Company in t...google google one beast company world work har...-0.291667-1-1
1168googlenoneNov 19, 2018Former Employee - Anonymous EmployeeEngineersalary was fine but in line with other companieswork was boring. I disliked it.none3332320https://www.glassdoor.com/Reviews/Google-Revie...Engineer salary was fine but in line with othe...engineer salary fine line companies work borin...-0.261111-1-1
\n", "
" ], "text/plain": [ " Unnamed: 0 Unnamed: 0.1 company ... Polarity Score sentiment sentiments\n", "0 0 35 google ... -0.291667 -1 -1\n", "1 1 68 google ... -0.261111 -1 -1\n", "\n", "[2 rows x 23 columns]" ] }, "metadata": { "tags": [] }, "execution_count": 8 } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "F8X35HdieN8i", "outputId": "cf40a51f-c4a6-4914-a899-3bf086fc6d42" }, "source": [ "from collections import Counter\n", "Counter(dataF[\"sentiment\"])" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Counter({-1: 5064, 1: 5064})" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "vytJqu8PtdaD" }, "source": [ "#Ensemble Model" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "OqNZIv-zmALk" }, "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(dataF[\"cleanText\"],dataF[\"sentiment\"].astype(int),test_size=0.25, random_state=20,shuffle=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "kz1iMpL5mAOz" }, "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "# learn training data vocabulary, then use it to create a document-term matrix\n", "vect = CountVectorizer()\n", "# 3. fit\n", "# 4. 
transform training data\n", "X_train_dtf = vect.fit_transform(X_train)\n", "X_test_dtf = vect.transform(X_test)\n", "\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nooxgOYTwyKV", "outputId": "2dbd578c-9842-4075-c4d8-ad32897e3336" }, "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "vectorizer = TfidfVectorizer()\n", "X_train_tf = vectorizer.fit_transform(X_train)\n", "X_test_tf = vectorizer.transform(X_test)\n", "X_test_tf.toarray()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "array([[0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " ...,\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.],\n", " [0., 0., 0., ..., 0., 0., 0.]])" ] }, "metadata": { "tags": [] }, "execution_count": 40 } ] }, { "cell_type": "code", "metadata": { "id": "tw864xpB2qtE" }, "source": [ "pip install zeugma\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nGvQaKa82omK", "outputId": "b4b7f76e-f0c4-4793-ba91-b32727cceb48" }, "source": [ "from zeugma.embeddings import EmbeddingTransformer\n", "glove = EmbeddingTransformer('glove')\n", "X_train_g = glove.fit_transform(X_train)\n", "X_test_g = glove.transform(X_test)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[==================================================] 100.0% 104.8/104.8MB downloaded\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "6jMm1KmnmPDy" }, "source": [ "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" }, "id": "Ej44cwSbjMea", "outputId": "b417064a-9787-4489-acd9-c90d36f3467e" }, "source": [ "# Hard-voting ensemble (LogisticRegression + linear SVC + SGD) on the GloVe features\n", "from sklearn.linear_model import LogisticRegression\n", "a=LogisticRegression(random_state=1000, solver='liblinear',multi_class='ovr',C=1.0)\n", "from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n", "from sklearn.linear_model import SGDClassifier\n", "# SVC is otherwise imported only in a later cell, so a fresh Restart & Run All raised NameError here\n", "from sklearn.svm import SVC\n", "clf2= SGDClassifier(max_iter=1000, tol=1e-3)\n", "clf1= SVC(kernel='linear', C=1.0, random_state=500)\n", "#rfc = RandomForestClassifier(n_estimators=300, random_state=2,max_depth=300) \n", "eclf1 = VotingClassifier(estimators=[('a', a),('svc', clf1), ('lr', clf2)], voting='hard')\n", "eclf1.fit(X_train_g,y_train)\n", "etc_pred=eclf1.predict(X_test_g)\n", "print(accuracy_score(y_test,etc_pred))\n", "print(classification_report(y_test,etc_pred))\n", "print(confusion_matrix(y_test,etc_pred))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "0.8274091627172195\n", " precision recall f1-score support\n", "\n", " -1 0.83 0.83 0.83 1289\n", " 1 0.82 0.82 0.82 1243\n", "\n", " accuracy 0.83 2532\n", " macro avg 0.83 0.83 0.83 2532\n", "weighted avg 0.83 0.83 0.83 2532\n", "\n", "[[1072 217]\n", " [ 220 1023]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "F0FIHWCdjMhb" }, "source": [ "# Same hard-voting ensemble, trained on the CountVectorizer features\n", "from sklearn.linear_model import LogisticRegression\n", "a=LogisticRegression(random_state=1000, solver='liblinear',multi_class='ovr',C=1.0)\n", "from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n", "from sklearn.linear_model import SGDClassifier\n", "# Imported here so this cell does not depend on a later cell having been run first\n", "from sklearn.svm import SVC\n", "clf2= SGDClassifier(max_iter=1000, tol=1e-3)\n", "clf1= SVC(kernel='linear', C=1.0, random_state=500)\n", "#rfc = RandomForestClassifier(n_estimators=300, random_state=2,max_depth=300) \n", "eclf1 = VotingClassifier(estimators=[('a', a),('svc', clf1), ('lr', clf2)], voting='hard')\n", "eclf1.fit(X_train_dtf,y_train)\n", "etc_pred=eclf1.predict(X_test_dtf)\n", 
"print(accuracy_score(y_test,etc_pred))\n", "print(classification_report(y_test,etc_pred))\n", "print(confusion_matrix(y_test,etc_pred))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "QpBc1-pWtkvR" }, "source": [ "from sklearn.linear_model import LogisticRegression\n", "a=LogisticRegression(random_state=1000, solver='liblinear',multi_class='ovr',C=1.0)\n", "from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n", "from sklearn.linear_model import SGDClassifier\n", "clf2= SGDClassifier(max_iter=1000, tol=1e-3)\n", "clf1= SVC(kernel='linear', C=1.0, random_state=500)\n", "#rfc = RandomForestClassifier(n_estimators=300, random_state=2,max_depth=300) \n", "eclf1 = VotingClassifier(estimators=[('a', a),('svc', clf1), ('lr', clf2)], voting='hard')\n", "eclf1.fit(X_train_g,y_train)\n", "etc_pred=eclf1.predict(X_test_g)\n", "print(accuracy_score(y_test,etc_pred))\n", "print(classification_report(y_test,etc_pred))\n", "print(confusion_matrix(y_test,etc_pred))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SUsACLJhzjcT", "outputId": "7f3c0921-8f9c-447e-e821-8bb3a7202086" }, "source": [ "from sklearn import linear_model\n", "clf = SVC()\n", "# 3. train the model using X_train_dtm\n", "x=clf.fit(X_train_tf, y_train)\n", "# 4. 
make class predictions for X_test_dtm\n", "y_pred_class = clf.predict(X_test_tf)\n", "# calculate accuracy\n", "print(accuracy_score(y_test, y_pred_class))\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "0.9620853080568721\n", " precision recall f1-score support\n", "\n", " -1 0.95 0.98 0.96 1289\n", " 1 0.98 0.95 0.96 1243\n", "\n", " accuracy 0.96 2532\n", " macro avg 0.96 0.96 0.96 2532\n", "weighted avg 0.96 0.96 0.96 2532\n", "\n", "[[1259 30]\n", " [ 66 1177]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "xAIFuLb-3jRY" }, "source": [ "from sklearn import linear_model\n", "clf = SVC()\n", "# 3. train the model using X_train_dtm\n", "x=clf.fit(X_train_dtf, y_train)\n", "# 4. make class predictions for X_test_dtm\n", "y_pred_class = clf.predict(X_test_dtf)\n", "# calculate accuracy\n", "print(accuracy_score(y_test, y_pred_class))\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "jLkJ8s9K3k3n" }, "source": [ "from sklearn import linear_model\n", "clf = SVC()\n", "# 3. train the model using X_train_dtm\n", "x=clf.fit(X_train_tf, y_train)\n", "# 4. make class predictions for X_test_dtm\n", "y_pred_class = clf.predict(X_test_tf)\n", "# calculate accuracy\n", "print(accuracy_score(y_test, y_pred_class))\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7eO19gow3qjO" }, "source": [ "from sklearn import linear_model\n", "clf = SVC()\n", "# 3. train the model using X_train_dtm\n", "x=clf.fit(X_train_g, y_train)\n", "# 4. 
make class predictions for X_test_dtm\n", "y_pred_class = clf.predict(X_test_g)\n", "# calculate accuracy\n", "print(accuracy_score(y_test, y_pred_class))\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "2O8M-WFm3qpE" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "pB7DB67Azjff" }, "source": [ "from sklearn import linear_model\n", "clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)\n", "# 3. train the model using X_train_dtm\n", "x=clf.fit(X_train_dtf, y_train)\n", "# 4. make class predictions for X_test_dtm\n", "y_pred_class = clf.predict(X_test_dtf)\n", "# calculate accuracy\n", "print(accuracy_score(y_test, y_pred_class))\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "UOZxxFFy3wDB" }, "source": [ "from sklearn import linear_model\n", "clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)\n", "# 3. train the model using X_train_dtm\n", "x=clf.fit(X_train_tf, y_train)\n", "# 4. make class predictions for X_test_dtm\n", "y_pred_class = clf.predict(X_test_tf)\n", "# calculate accuracy\n", "print(accuracy_score(y_test, y_pred_class))\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "ZMDg9y3q3wGM" }, "source": [ "from sklearn import linear_model\n", "clf = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)\n", "# 3. train the model using X_train_dtm\n", "x=clf.fit(X_train_g, y_train)\n", "# 4. 
make class predictions for X_test_dtm\n", "y_pred_class = clf.predict(X_test_g)\n", "# calculate accuracy\n", "print(accuracy_score(y_test, y_pred_class))\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "NuUt4Dtq3wKI" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "tP5hgZfheOGP" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "qGWxgg_UjR7B" }, "source": [ "df.fillna(\" \",inplace = True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "-YIYdmcajgnx" }, "source": [ "df['text'] = df['summary'] + ' ' + df['pros'] + ' ' + df['cons'] + ' ' + df['advice-to-mgmt']" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6jgR1_eqjx7A", "outputId": "3eb39e76-f1e8-4a50-a270-0aebd42ba18e" }, "source": [ "print(df['text'][:2])" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "0 Google Google is one of the beast Company in t...\n", "1 Engineer salary was fine but in line with othe...\n", "Name: text, dtype: object\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gz5Ec4u1j0Yx", "outputId": "95d6b244-859f-40c0-9ee6-de0db565a8f8" }, "source": [ "import nltk\n", "from nltk.tokenize import RegexpTokenizer\n", "from nltk.stem import WordNetLemmatizer,PorterStemmer\n", "from nltk.corpus import stopwords\n", "import re\n", "nltk.download('stopwords')\n", "nltk.download('wordnet')\n", "lemmatizer = WordNetLemmatizer()\n", "stemmer = PorterStemmer() \n", "\n", "# Built once per kernel: the original re-read the stop-word list for every single token.\n", "STOP_WORDS = frozenset(stopwords.words('english'))\n", "HTML_TAG_RE = re.compile('<.*?>')\n", "WORD_TOKENIZER = RegexpTokenizer(r'\\w+')\n", "\n", "def preprocess(sentence):\n", "    \"\"\"Normalise one review into a space-joined string of lower-case word tokens.\n", "\n", "    Removes the literal '{html}', anything matching <...>, substrings starting\n", "    with 'http', and digit runs; keeps tokens longer than 2 characters that are\n", "    not English stop words. Any input is first converted with str().\n", "    \"\"\"\n", "    text = str(sentence).lower().replace('{html}', \"\")\n", "    text = HTML_TAG_RE.sub('', text)\n", "    text = re.sub(r'http\\S+', '', text)\n", "    text = re.sub('[0-9]+', '', text)\n", "    tokens = WORD_TOKENIZER.tokenize(text)\n", "    # The original also stemmed and lemmatised the tokens but never used the result;\n", "    # that dead work is dropped so the returned text stays exactly as before.\n", "    return \" \".join(w for w in tokens if len(w) > 2 and w not in STOP_WORDS)\n", "\n", "\n", "df['cleanText']=df['text'].map(preprocess)" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", "[nltk_data] Package stopwords is already up-to-date!\n", "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "ujO7VbmTanBD" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "QsEOsq8uaolX" }, "source": [ "df4=df4[0:5064]\n", "# DataFrame.append was removed in pandas 2.0; concat keeps the same row order (df5, df4, df3)\n", "dataF=pd.concat([df5, df4, df3], ignore_index=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "PdlEwianarpr" }, "source": [ "dataF.to_csv(\"/content/drive/My Drive/sentiment.csv\")" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "1YTgfDKec9Rb", "outputId": "fa607c42-d55d-4c25-9703-6fc9c5ea71a7" }, "source": [ "df['sentiments'].value_counts()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "-1 5064\n", " 1 5064\n", "Name: sentiments, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 9 } ] }, { "cell_type": "code", "metadata": { "id": "h971RH-pNy_o" }, "source": [ "" ], 
"execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "IMjRmXPs2vkp" }, "source": [ "GloVe\n" ] }, { "cell_type": "code", "metadata": { "id": "wnpo0t1x2uPO" }, "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(dataF['cleanText'], dataF['sentiments'], test_size=0.25)\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2rH3tY3y3Rm6", "outputId": "a918a75a-c503-4556-eed6-bc55d700c749" }, "source": [ "pip install zeugma" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Collecting zeugma\n", " Downloading https://files.pythonhosted.org/packages/59/38/8f57f83719027e36a61238abe1cafa55d257eaaf8e9185b2adbb5a928308/zeugma-0.48.tar.gz\n", "Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.7/dist-packages (from zeugma) (1.19.5)\n", "Requirement already satisfied: Cython>=0.27.3 in /usr/local/lib/python3.7/dist-packages (from zeugma) (0.29.22)\n", "Requirement already satisfied: pandas>=0.20.3 in /usr/local/lib/python3.7/dist-packages (from zeugma) (1.1.5)\n", "Requirement already satisfied: gensim>=3.5.0 in /usr/local/lib/python3.7/dist-packages (from zeugma) (3.6.0)\n", "Requirement already satisfied: scikit_learn>=0.19.1 in /usr/local/lib/python3.7/dist-packages (from zeugma) (0.22.2.post1)\n", "Requirement already satisfied: tensorflow>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from zeugma) (2.4.1)\n", "Requirement already satisfied: keras>=2.1.3 in /usr/local/lib/python3.7/dist-packages (from zeugma) (2.4.3)\n", "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.20.3->zeugma) (2.8.1)\n", "Requirement already satisfied: pytz>=2017.2 in /usr/local/lib/python3.7/dist-packages (from pandas>=0.20.3->zeugma) (2018.9)\n", "Requirement already satisfied: 
smart-open>=1.2.1 in /usr/local/lib/python3.7/dist-packages (from gensim>=3.5.0->zeugma) (5.0.0)\n", "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.7/dist-packages (from gensim>=3.5.0->zeugma) (1.4.1)\n", "Requirement already satisfied: six>=1.5.0 in /usr/local/lib/python3.7/dist-packages (from gensim>=3.5.0->zeugma) (1.15.0)\n", "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.7/dist-packages (from scikit_learn>=0.19.1->zeugma) (1.0.1)\n", "Requirement already satisfied: opt-einsum~=3.3.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (3.3.0)\n", "Requirement already satisfied: wrapt~=1.12.1 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (1.12.1)\n", "Requirement already satisfied: h5py~=2.10.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (2.10.0)\n", "Requirement already satisfied: keras-preprocessing~=1.1.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (1.1.2)\n", "Requirement already satisfied: typing-extensions~=3.7.4 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (3.7.4.3)\n", "Requirement already satisfied: grpcio~=1.32.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (1.32.0)\n", "Requirement already satisfied: astunparse~=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (1.6.3)\n", "Requirement already satisfied: termcolor~=1.1.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (1.1.0)\n", "Requirement already satisfied: wheel~=0.35 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (0.36.2)\n", "Requirement already satisfied: tensorflow-estimator<2.5.0,>=2.4.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (2.4.0)\n", "Requirement already satisfied: google-pasta~=0.2 in /usr/local/lib/python3.7/dist-packages (from 
tensorflow>=1.5.0->zeugma) (0.2.0)\n", "Requirement already satisfied: protobuf>=3.9.2 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (3.12.4)\n", "Requirement already satisfied: gast==0.3.3 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (0.3.3)\n", "Requirement already satisfied: absl-py~=0.10 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (0.12.0)\n", "Requirement already satisfied: tensorboard~=2.4 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (2.4.1)\n", "Requirement already satisfied: flatbuffers~=1.12.0 in /usr/local/lib/python3.7/dist-packages (from tensorflow>=1.5.0->zeugma) (1.12)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.7/dist-packages (from keras>=2.1.3->zeugma) (3.13)\n", "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from protobuf>=3.9.2->tensorflow>=1.5.0->zeugma) (56.0.0)\n", "Requirement already satisfied: markdown>=2.6.8 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (3.3.4)\n", "Requirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (0.4.4)\n", "Requirement already satisfied: werkzeug>=0.11.15 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (1.0.1)\n", "Requirement already satisfied: google-auth<2,>=1.6.3 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (1.28.1)\n", "Requirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (1.8.0)\n", "Requirement already satisfied: requests<3,>=2.21.0 in /usr/local/lib/python3.7/dist-packages (from tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (2.23.0)\n", "Requirement already satisfied: importlib-metadata; 
python_version < \"3.8\" in /usr/local/lib/python3.7/dist-packages (from markdown>=2.6.8->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (3.10.1)\n", "Requirement already satisfied: requests-oauthlib>=0.7.0 in /usr/local/lib/python3.7/dist-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (1.3.0)\n", "Requirement already satisfied: pyasn1-modules>=0.2.1 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (0.2.8)\n", "Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (4.2.1)\n", "Requirement already satisfied: rsa<5,>=3.1.4; python_version >= \"3.6\" in /usr/local/lib/python3.7/dist-packages (from google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (4.7.2)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (2020.12.5)\n", "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (2.10)\n", "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (3.0.4)\n", "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3,>=2.21.0->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (1.24.3)\n", "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata; python_version < \"3.8\"->markdown>=2.6.8->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (3.4.1)\n", "Requirement already satisfied: oauthlib>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from 
requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (3.1.0)\n", "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /usr/local/lib/python3.7/dist-packages (from pyasn1-modules>=0.2.1->google-auth<2,>=1.6.3->tensorboard~=2.4->tensorflow>=1.5.0->zeugma) (0.4.8)\n", "Building wheels for collected packages: zeugma\n", " Building wheel for zeugma (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for zeugma: filename=zeugma-0.48-cp37-none-any.whl size=8778 sha256=d6eeb779f506a10419abe3c6a39d1e3b49e73a0247e231c7c2fffb3de16f97a9\n", " Stored in directory: /root/.cache/pip/wheels/6a/b5/bc/5183ac478b0071d04d3ed0c0dd4a43db94c5c8ffb317b5eb53\n", "Successfully built zeugma\n", "Installing collected packages: zeugma\n", "Successfully installed zeugma-0.48\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "vw9KqlRR2uTM" }, "source": [ "from zeugma.embeddings import EmbeddingTransformer\n", "glove = EmbeddingTransformer('glove')\n", "X_train_tf = glove.fit_transform(X_train)\n", "X_test_tf = glove.transform(X_test)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "OV_sPMTz5GfZ" }, "source": [ "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3SPA-4EG5Gfj", "outputId": "c5d766fb-fad9-4058-ec27-196ee6c0f1ff" }, "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "print(\"RF\")\n", "rfc = RandomForestClassifier(n_estimators=100, random_state=50,max_depth=250) \n", "rfc.fit(X_train_tf, y_train)\n", "# calculate accuracy of class predictions\n", "y_pred_class = rfc.predict(X_test_tf)\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], 
"execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "RF\n", " precision recall f1-score support\n", "\n", " -1 0.81 0.84 0.82 1261\n", " 1 0.83 0.80 0.82 1271\n", "\n", " accuracy 0.82 2532\n", " macro avg 0.82 0.82 0.82 2532\n", "weighted avg 0.82 0.82 0.82 2532\n", "\n", "[[1055 206]\n", " [ 255 1016]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qZWc-4wC5K2t", "outputId": "d043334f-d5c1-409a-ecdc-e6575e5555bb" }, "source": [ "from sklearn.linear_model import LogisticRegression\n", "print(\"LR\")\n", "logreg = LogisticRegression(random_state=100,multi_class='ovr',C=3)\n", "x=logreg.fit(X_train_tf, y_train).predict(X_test_tf)\n", "# calculate accuracy\n", "print(classification_report(y_test, x))\n", "print(confusion_matrix(y_test, x))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "LR\n", " precision recall f1-score support\n", "\n", " -1 0.81 0.83 0.82 1261\n", " 1 0.83 0.81 0.82 1271\n", "\n", " accuracy 0.82 2532\n", " macro avg 0.82 0.82 0.82 2532\n", "weighted avg 0.82 0.82 0.82 2532\n", "\n", "[[1051 210]\n", " [ 245 1026]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3ygL1Pz65K2w", "outputId": "7d49f4b4-8ee9-4715-82f5-eade5c9df81f" }, "source": [ "from sklearn.ensemble import ExtraTreesClassifier\n", "etc = ExtraTreesClassifier(n_estimators=200, random_state=50, max_depth=150)\n", "x=etc.fit(X_train_tf, y_train).predict(X_test_tf)\n", "# calculate accuracy\n", "print(classification_report(y_test, x))\n", "print(confusion_matrix(y_test, x))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " -1 0.81 0.84 0.82 1261\n", " 1 0.83 0.81 0.82 1271\n", "\n", " accuracy 0.82 2532\n", " macro avg 0.82 0.82 0.82 2532\n", "weighted avg 0.82 0.82 0.82 2532\n", "\n", "[[1054 207]\n", 
" [ 243 1028]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "4y1_0ma35Sty", "outputId": "a1990dbb-99e1-47ec-f89b-7031b9ca8798" }, "source": [ "from sklearn.neural_network import MLPClassifier\n", "mlp = MLPClassifier(random_state=20, max_iter=300)\n", "x=mlp.fit(X_train_tf, y_train).predict(X_test_tf)\n", "# calculate accuracy\n", "print(classification_report(y_test, x))\n", "print(confusion_matrix(y_test, x))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " -1 0.81 0.87 0.84 1261\n", " 1 0.86 0.80 0.83 1271\n", "\n", " accuracy 0.83 2532\n", " macro avg 0.84 0.83 0.83 2532\n", "weighted avg 0.84 0.83 0.83 2532\n", "\n", "[[1093 168]\n", " [ 252 1019]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "njIIlrZ15Stz", "outputId": "4e220f20-a0a3-4119-b5af-93eaaa8a7be9" }, "source": [ "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", "from sklearn.svm import SVC\n", "print(\"ADA\")\n", "xgb = AdaBoostClassifier(n_estimators=300, random_state=50)\n", "xgb_pred = xgb.fit(X_train_tf, y_train).predict(X_test_tf) \n", "print(classification_report(y_test,xgb_pred))\n", "print(confusion_matrix(y_test,xgb_pred))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "ADA\n", " precision recall f1-score support\n", "\n", " -1 0.82 0.81 0.81 1261\n", " 1 0.81 0.82 0.81 1271\n", "\n", " accuracy 0.81 2532\n", " macro avg 0.81 0.81 0.81 2532\n", "weighted avg 0.81 0.81 0.81 2532\n", "\n", "[[1016 245]\n", " [ 230 1041]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "6bse-Qm62uWT" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "8n2JJ2JN2uZF" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": 
"code", "metadata": { "id": "LGjpXSmq2ucf" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Fc-5cW-e2ufp" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "oDyQAYJ92ujJ" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "KyI8FtVj2ul9" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "JPlJYLHl2upn" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "_6Q3BTSP2usE" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "LuSaMEgs2uwK" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "nvR46EMt2uyh" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7jtzI86b2u1W" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "kDw7e2Ii2fDz" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "DtcHnWSz2UuF" }, "source": [ "\n", "Bag of Words" ] }, { "cell_type": "code", "metadata": { "id": "0sPu8YKQdNPU" }, "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "X_train, X_test, y_train, y_test = train_test_split(dataF['cleanText'], dataF['sentiments'], test_size=0.25)\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "sMK16CKSk3S3" }, "source": [ "from sklearn.feature_extraction.text import CountVectorizer\n", "cv=CountVectorizer()\n", "X_train_tf = cv.fit_transform(X_train)\n", "X_test_tf=cv.transform(X_test)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "pNQSLIL1l4jP" }, "source": [ "from sklearn.metrics import accuracy_score\n", 
"from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "a74GWWfqlFul", "outputId": "e23b7380-dad3-4f82-de26-dc5b9e787263" }, "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "print(\"RF\")\n", "rfc = RandomForestClassifier(n_estimators=100, random_state=50,max_depth=250) \n", "rfc.fit(X_train_tf, y_train)\n", "# per-class precision/recall/F1 and confusion matrix on the held-out test split\n", "y_pred_class = rfc.predict(X_test_tf)\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "RF\n", " precision recall f1-score support\n", "\n", " -1 0.96 0.92 0.94 1260\n", " 1 0.92 0.96 0.94 1272\n", "\n", " accuracy 0.94 2532\n", " macro avg 0.94 0.94 0.94 2532\n", "weighted avg 0.94 0.94 0.94 2532\n", "\n", "[[1159 101]\n", " [ 51 1221]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nm1wxG_OlT4Q", "outputId": "051ef95c-04bc-4050-e72d-078f9429a948" }, "source": [ "from sklearn.linear_model import LogisticRegression\n", "print(\"LR\")\n", "# multi_class='ovr' is not passed: the target is binary (-1 / 1), where it has no\n", "# effect, and the parameter is deprecated since scikit-learn 1.5.\n", "logreg = LogisticRegression(random_state=100,C=3)\n", "x=logreg.fit(X_train_tf, y_train).predict(X_test_tf)\n", "# per-class precision/recall/F1 and confusion matrix on the held-out test split\n", "print(classification_report(y_test, x))\n", "print(confusion_matrix(y_test, x))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "LR\n", " precision recall f1-score support\n", "\n", " -1 0.96 0.97 0.96 1260\n", " 1 0.97 0.96 0.96 1272\n", "\n", " accuracy 0.96 2532\n", " macro avg 0.96 0.96 0.96 2532\n", "weighted avg 0.96 0.96 0.96 2532\n", "\n", "[[1217 43]\n", " [ 56 1216]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": 
"https://localhost:8080/" }, "id": "4S-Of9SUs9Il", "outputId": "4f9b8007-9a67-44d1-cfc3-b5913903c7fe" }, "source": [ "from sklearn.ensemble import ExtraTreesClassifier\n", "etc = ExtraTreesClassifier(n_estimators=200, random_state=50, max_depth=150)\n", "x=etc.fit(X_train_tf, y_train).predict(X_test_tf)\n", "# per-class precision/recall/F1 and confusion matrix on the held-out test split\n", "print(classification_report(y_test, x))\n", "print(confusion_matrix(y_test, x))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " -1 0.96 0.92 0.94 1260\n", " 1 0.92 0.96 0.94 1272\n", "\n", " accuracy 0.94 2532\n", " macro avg 0.94 0.94 0.94 2532\n", "weighted avg 0.94 0.94 0.94 2532\n", "\n", "[[1160 100]\n", " [ 47 1225]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "lMjUDL3ZtHze", "outputId": "c82dd6c3-6288-44f0-8c6c-3eafa5517ec3" }, "source": [ "from sklearn.neural_network import MLPClassifier\n", "mlp = MLPClassifier(random_state=20, max_iter=300)\n", "x=mlp.fit(X_train_tf, y_train).predict(X_test_tf)\n", "# per-class precision/recall/F1 and confusion matrix on the held-out test split\n", "print(classification_report(y_test, x))\n", "print(confusion_matrix(y_test, x))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " -1 0.96 0.96 0.96 1260\n", " 1 0.96 0.96 0.96 1272\n", "\n", " accuracy 0.96 2532\n", " macro avg 0.96 0.96 0.96 2532\n", "weighted avg 0.96 0.96 0.96 2532\n", "\n", "[[1207 53]\n", " [ 56 1216]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "OrqwKe4MuZvP", "outputId": "1e384d0f-e300-4317-c46a-8234d0baa5da" }, "source": [ "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", "from sklearn.svm import SVC\n", "print(\"ADA\")\n", "# NOTE: despite the name, xgb holds an AdaBoostClassifier, not an XGBoost model.\n", "xgb = AdaBoostClassifier(n_estimators=300, random_state=50)\n", "xgb_pred = xgb.fit(X_train_tf, 
y_train).predict(X_test_tf) \n", "print(classification_report(y_test,xgb_pred))\n", "print(confusion_matrix(y_test,xgb_pred))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "ADA\n", " precision recall f1-score support\n", "\n", " -1 0.95 0.96 0.96 1260\n", " 1 0.96 0.95 0.96 1272\n", "\n", " accuracy 0.96 2532\n", " macro avg 0.96 0.96 0.96 2532\n", "weighted avg 0.96 0.96 0.96 2532\n", "\n", "[[1213 47]\n", " [ 66 1206]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "CNWvze0d4BMj" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "wGozrOrCUiWt" }, "source": [ "2-Classes\n" ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9FIOnrYm1sVj", "outputId": "c93b04c6-5470-4ce6-d8f2-030942d328af" }, "source": [ "import nltk\n", "from textblob import TextBlob\n", "\n", "# Score every document with TextBlob and map the polarity to a label:\n", "# 1 (polarity > 0), 0 (polarity == 0), -1 (polarity < 0).\n", "# Whole-column assignment is used instead of per-row df[col][i] = value writes:\n", "# chained indexing raises SettingWithCopyWarning, may write to a temporary copy,\n", "# and only works when the index is exactly 0..n-1.\n", "df['cleanText'] = df['cleanText'].astype(str)\n", "df['Polarity Score'] = df['cleanText'].apply(lambda text: TextBlob(text).sentiment.polarity)\n", "df['sentiments'] = np.sign(df['Polarity Score']).astype(int)" ], "execution_count": null, "outputs": [] }, 
{ "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "O5Vjd_wJfF50", "outputId": "551dbd74-5fb3-439b-b140-3cfbf6fbf290" }, "source": [ "df5=df.loc[df['sentiments'] == 0]\n", "print(len(df5))\n", "df4=df.loc[df['sentiments'] == 1]\n", "print(len(df4))\n", "df3=df.loc[df['sentiments'] == -1]\n", "print(len(df3))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "1448\n", "61017\n", "5064\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "AnKAEj2d1jbY" }, "source": [ "# Undersample the positive class to the size of the negative class so the\n", "# two-class dataset is balanced (5064 rows each in the recorded run).\n", "df4=df4[0:len(df3)]\n", "# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.\n", "dataF=pd.concat([df3, df4], ignore_index=True)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ew6kqhKLzf7Y", "outputId": "48c43f4b-33ec-4c0d-80b9-3faa1aea1719" }, "source": [ "dataF['sentiments'].value_counts()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { 
"text/plain": [ "-1 5064\n", " 1 5064\n", "Name: sentiments, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 4 } ] }, { "cell_type": "code", "metadata": { "id": "xYK1YTcMQVmy" }, "source": [ "dataF.to_csv(\"/content/drive/My Drive/sentiments.csv\")" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "R8gha7dO4UYO" }, "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "# Fixed seed so the split (and every metric reported below) is reproducible;\n", "# 2 matches the random_state used by the deep-learning splits in this notebook.\n", "X_train, X_test, y_train, y_test = train_test_split(dataF['cleanText'], dataF['sentiments'], test_size=0.25, random_state=2)\n" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Eka4JL884nZD" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "JJOmRCPq4wM2" }, "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "# Fit the TF-IDF weights on the training split only; the test split is encoded with them.\n", "vectorizer = TfidfVectorizer()\n", "X_train_tf = vectorizer.fit_transform(X_train)\n", "X_test_tf=vectorizer.transform(X_test)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "KEsEUOj24wND" }, "source": [ "from sklearn.metrics import accuracy_score\n", "from sklearn.metrics import classification_report\n", "from sklearn.metrics import confusion_matrix" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "9EpGoj13_kgV" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "x0NBGKHR4zYm", "outputId": "3c2c6fb2-83cd-460a-ef06-5cf7fb8aed50" }, "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "print(\"RF\")\n", "rfc = RandomForestClassifier(n_estimators=100, random_state=50,max_depth=250) \n", "rfc.fit(X_train_tf, y_train)\n", "# calculate accuracy of class predictions\n", "y_pred_class = rfc.predict(X_test_tf)\n", "print(classification_report(y_test, y_pred_class))\n", "print(confusion_matrix(y_test, y_pred_class))" ], 
"execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "RF\n", " precision recall f1-score support\n", "\n", " -1 0.95 0.92 0.93 1257\n", " 1 0.92 0.96 0.94 1275\n", "\n", " accuracy 0.94 2532\n", " macro avg 0.94 0.94 0.94 2532\n", "weighted avg 0.94 0.94 0.94 2532\n", "\n", "[[1152 105]\n", " [ 57 1218]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WVJWDpVB4zYo", "outputId": "eaab1951-bc91-442b-e000-975e7b2ebc09" }, "source": [ "from sklearn.linear_model import LogisticRegression\n", "print(\"LR\")\n", "# multi_class='ovr' is not passed: the target is binary (-1 / 1), where it has no\n", "# effect, and the parameter is deprecated since scikit-learn 1.5.\n", "logreg = LogisticRegression(random_state=100,C=3)\n", "x=logreg.fit(X_train_tf, y_train).predict(X_test_tf)\n", "# per-class precision/recall/F1 and confusion matrix on the held-out test split\n", "from sklearn.metrics import precision_recall_fscore_support\n", "print(classification_report(y_test, x))\n", "print(confusion_matrix(y_test, x))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "LR\n", " precision recall f1-score support\n", "\n", " -1 0.96 0.97 0.96 1257\n", " 1 0.97 0.96 0.96 1275\n", "\n", " accuracy 0.96 2532\n", " macro avg 0.96 0.96 0.96 2532\n", "weighted avg 0.96 0.96 0.96 2532\n", "\n", "[[1217 40]\n", " [ 53 1222]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6xhecnL4474p", "outputId": "abf7776b-4ef6-43a7-fbbb-883a06143367" }, "source": [ "from sklearn.ensemble import ExtraTreesClassifier\n", "etc = ExtraTreesClassifier(n_estimators=200, random_state=50, max_depth=150)\n", "x=etc.fit(X_train_tf, y_train).predict(X_test_tf)\n", "# per-class precision/recall/F1 and confusion matrix on the held-out test split\n", "print(classification_report(y_test, x))\n", "print(confusion_matrix(y_test, x))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " -1 0.95 0.92 0.93 1257\n", " 1 0.92 0.95 0.94 1275\n", "\n", " accuracy 0.94 2532\n", " macro avg 0.94 0.94 0.94 2532\n", 
"weighted avg 0.94 0.94 0.94 2532\n", "\n", "[[1152 105]\n", " [ 58 1217]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "eZxOZ8W0474q", "outputId": "0870f616-5fa3-4aac-a6b3-5f999fa31d83" }, "source": [ "from sklearn.neural_network import MLPClassifier\n", "mlp = MLPClassifier(random_state=20, max_iter=300)\n", "x=mlp.fit(X_train_tf, y_train).predict(X_test_tf)\n", "# per-class precision/recall/F1 and confusion matrix on the held-out test split\n", "print(classification_report(y_test, x))\n", "print(confusion_matrix(y_test, x))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ " precision recall f1-score support\n", "\n", " -1 0.95 0.95 0.95 1257\n", " 1 0.95 0.95 0.95 1275\n", "\n", " accuracy 0.95 2532\n", " macro avg 0.95 0.95 0.95 2532\n", "weighted avg 0.95 0.95 0.95 2532\n", "\n", "[[1191 66]\n", " [ 64 1211]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "0SV1-8Sj5Coa", "outputId": "3ae3897d-4039-4864-e382-e0f3215ca02e" }, "source": [ "from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier\n", "from sklearn.svm import SVC\n", "print(\"ADA\")\n", "# NOTE: despite the name, xgb holds an AdaBoostClassifier, not an XGBoost model.\n", "xgb = AdaBoostClassifier(n_estimators=300, random_state=50)\n", "xgb_pred = xgb.fit(X_train_tf, y_train).predict(X_test_tf) \n", "print(classification_report(y_test,xgb_pred))\n", "print(confusion_matrix(y_test,xgb_pred))" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "ADA\n", " precision recall f1-score support\n", "\n", " -1 0.95 0.95 0.95 1257\n", " 1 0.95 0.95 0.95 1275\n", "\n", " accuracy 0.95 2532\n", " macro avg 0.95 0.95 0.95 2532\n", "weighted avg 0.95 0.95 0.95 2532\n", "\n", "[[1194 63]\n", " [ 67 1208]]\n" ], "name": "stdout" } ] }, { "cell_type": "code", "metadata": { "id": "rD2ubU2O5ILk" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "7PZiSNZ66BrI" }, "source": [ "" ], 
"execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "9fcC1rBw72rl" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Q3OV8dDc72wZ" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Qy1Rgjjc72zi" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Oq1iYRcM721w" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "eJyF7Pkm725w" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "FonjsBLn727E" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "cKyPN5i_7294" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "pyFbsSPu73Ba" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "markdown", "metadata": { "id": "wnyzXURVUlis" }, "source": [ "Deep learning Models\n" ] }, { "cell_type": "code", "metadata": { "id": "uSGAQjRJUpkx" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "-Bfj8ErP71-_" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "rUUFyjWv72Fe" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "p8bn7CzX72HW" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "APCR0wHY72Kc" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "1Du-ssSh72Nx" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "iGD_0z5m72PZ" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": 
{ "colab": { "base_uri": "https://localhost:8080/" }, "id": "EjCUpQ4H2Vnm", "outputId": "1ecb9f3a-ab07-4fc3-9bad-909beeaf750d" }, "source": [ "import tensorflow.keras\n", "# Take layers and models from tensorflow.keras, the package that also provides the\n", "# tokenizer and padding helpers below, instead of mixing in the standalone keras package.\n", "from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,Embedding\n", "from tensorflow.keras.models import Sequential\n", "\n", "# Train on dataF, the balanced two-class frame: the 2-unit softmax head and the\n", "# recorded 2532-row validation split (25% of 10128 rows) both correspond to it.\n", "tokenizer = tensorflow.keras.preprocessing.text.Tokenizer(num_words=5000, lower=True,split=' ',filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n')\n", "tokenizer.fit_on_texts(dataF[\"cleanText\"].values)\n", "X = tokenizer.texts_to_sequences(dataF[\"cleanText\"].values)\n", "X = tensorflow.keras.preprocessing.sequence.pad_sequences(X)\n", "\n", "# Embedding -> Conv1D -> MaxPooling1D -> LSTM -> softmax over the two classes.\n", "model_conv = Sequential()\n", "model_conv.add(Embedding(5000, 100, input_length=X.shape[1]))\n", "model_conv.add(Conv1D(128, 5, activation='relu'))\n", "model_conv.add(MaxPooling1D(pool_size=4))\n", "model_conv.add(LSTM(100))\n", "model_conv.add(Dense(2, activation='softmax'))\n", "model_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "\n", "# One-hot targets; pd.get_dummies orders the columns by label value (-1, then 1).\n", "Y = pd.get_dummies(dataF.sentiments).values\n", "\n", "X_train, X_valid, Y_train, Y_valid = train_test_split(X,Y, test_size = 0.25, random_state =2, shuffle=True)\n", "batch_size=128\n", "# Train the network; pred holds the object returned by fit(), whose .history the plotting cell reads.\n", "pred=model_conv.fit(X_train, Y_train, batch_size =batch_size, epochs =10, verbose =2,validation_data=(X_valid,Y_valid))\n", "pred" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Epoch 1/10\n", "60/60 - 6s - loss: 0.4124 - accuracy: 0.7854 - val_loss: 0.1724 - val_accuracy: 0.9384\n", "Epoch 2/10\n", "60/60 - 4s - loss: 0.1054 - accuracy: 0.9614 - val_loss: 0.1479 - val_accuracy: 0.9451\n", "Epoch 3/10\n", "60/60 - 4s - loss: 0.0500 - accuracy: 0.9838 - val_loss: 0.1742 - val_accuracy: 0.9423\n", "Epoch 4/10\n", "60/60 - 4s - loss: 0.0281 - accuracy: 0.9925 - val_loss: 0.2083 - val_accuracy: 0.9388\n", 
"Epoch 5/10\n", "60/60 - 4s - loss: 0.0157 - accuracy: 0.9962 - val_loss: 0.2572 - val_accuracy: 0.9368\n", "Epoch 6/10\n", "60/60 - 4s - loss: 0.0087 - accuracy: 0.9980 - val_loss: 0.3184 - val_accuracy: 0.9313\n", "Epoch 7/10\n", "60/60 - 4s - loss: 0.0050 - accuracy: 0.9989 - val_loss: 0.3725 - val_accuracy: 0.9293\n", "Epoch 8/10\n", "60/60 - 4s - loss: 0.0026 - accuracy: 0.9996 - val_loss: 0.4896 - val_accuracy: 0.9206\n", "Epoch 9/10\n", "60/60 - 4s - loss: 0.0080 - accuracy: 0.9972 - val_loss: 0.5157 - val_accuracy: 0.9155\n", "Epoch 10/10\n", "60/60 - 4s - loss: 0.0241 - accuracy: 0.9921 - val_loss: 0.3922 - val_accuracy: 0.9261\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 25 } ] }, { "cell_type": "code", "metadata": { "id": "Mp0M2mQa2Vqw", "colab": { "base_uri": "https://localhost:8080/", "height": 573 }, "outputId": "f4cfa669-4448-49d6-9df9-80c79353d8cd" }, "source": [ "import matplotlib.pyplot as plt\n", "from google.colab import files\n", "\n", "# Plot training & validation accuracy values\n", "plt.plot(pred.history['accuracy'])\n", "plt.plot(pred.history['val_accuracy'])\n", "plt.title('Model accuracy')\n", "plt.ylabel('Accuracy')\n", "plt.xlabel('Epoch')\n", "plt.legend(['Train', 'Test'], loc='upper left')\n", "plt.savefig('training & validation accuracySST1.pdf')\n", "files.download('training & validation accuracySST1.pdf') \n", "plt.show()\n", "\n", "\n", "# Plot training & validation loss values\n", "plt.plot(pred.history['loss'])\n", "plt.plot(pred.history['val_loss'])\n", "plt.title('Model loss')\n", "plt.ylabel('Loss')\n", "plt.xlabel('Epoch')\n", "plt.legend(['Train', 'Test'], loc='upper left')\n", "plt.savefig('training & validation lossSST1.pdf')\n", "files.download('training & validation lossSST1.pdf') \n", "plt.show()\n" ], "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "application/javascript": [ "\n", " async 
function download(id, filename, size) {\n", " if (!google.colab.kernel.accessAllowed) {\n", " return;\n", " }\n", " const div = document.createElement('div');\n", " const label = document.createElement('label');\n", " label.textContent = `Downloading \"${filename}\": `;\n", " div.appendChild(label);\n", " const progress = document.createElement('progress');\n", " progress.max = size;\n", " div.appendChild(progress);\n", " document.body.appendChild(div);\n", "\n", " const buffers = [];\n", " let downloaded = 0;\n", "\n", " const channel = await google.colab.kernel.comms.open(id);\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", "\n", " for await (const message of channel.messages) {\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", " if (message.buffers) {\n", " for (const buffer of message.buffers) {\n", " buffers.push(buffer);\n", " downloaded += buffer.byteLength;\n", " progress.value = downloaded;\n", " }\n", " }\n", " }\n", " const blob = new Blob(buffers, {type: 'application/binary'});\n", " const a = document.createElement('a');\n", " a.href = window.URL.createObjectURL(blob);\n", " a.download = filename;\n", " div.appendChild(a);\n", " a.click();\n", " div.remove();\n", " }\n", " " ], "text/plain": [ "" ] }, "metadata": { "tags": [] } }, { "output_type": "display_data", "data": { "application/javascript": [ "download(\"download_477ae9fa-be58-46c0-b64d-4230a06741eb\", \"training & validation accuracySST1.pdf\", 12465)" ], "text/plain": [ "" ] }, "metadata": { "tags": [] } }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } }, { "output_type": "display_data", "data": { "application/javascript": [ "\n", " async function download(id, filename, size) {\n", " if (!google.colab.kernel.accessAllowed) {\n", " return;\n", " }\n", " const div = document.createElement('div');\n", " const label = document.createElement('label');\n", " label.textContent = `Downloading \"${filename}\": `;\n", " div.appendChild(label);\n", " const progress = document.createElement('progress');\n", " progress.max = size;\n", " div.appendChild(progress);\n", " document.body.appendChild(div);\n", "\n", " const buffers = [];\n", " let downloaded = 0;\n", "\n", " const channel = await google.colab.kernel.comms.open(id);\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", "\n", " for await (const message of channel.messages) {\n", " // Send a message to notify the kernel that we're ready.\n", " channel.send({})\n", " if (message.buffers) {\n", " for (const buffer of message.buffers) {\n", " buffers.push(buffer);\n", " downloaded += buffer.byteLength;\n", " progress.value = downloaded;\n", " }\n", " }\n", " }\n", " const blob = new Blob(buffers, {type: 'application/binary'});\n", " const a = document.createElement('a');\n", " a.href = window.URL.createObjectURL(blob);\n", " a.download = filename;\n", " div.appendChild(a);\n", " a.click();\n", " div.remove();\n", " }\n", " " ], "text/plain": [ "" ] }, "metadata": { "tags": [] } }, { "output_type": "display_data", "data": { "application/javascript": [ "download(\"download_be8d2354-08a5-48d8-9471-2396676df37f\", \"training & validation lossSST1.pdf\", 11746)" ], "text/plain": [ "" ] }, "metadata": { "tags": [] } }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "id": "Q-T-JjwwTAuG" }, "source": [ "from sklearn.metrics import classification_report,confusion_matrix,accuracy_score\n", "pred=model_conv.predict(X_valid)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Bs-6MEygTAuT", "outputId": "a6433616-a44a-4590-8a2c-99e659687a78" }, "source": [ "# Collapse the one-hot targets and the softmax outputs to class indices\n", "# (column 0 = sentiment -1, column 1 = sentiment 1, the pd.get_dummies order).\n", "# Raveling the one-hot arrays instead would count every sample twice and always\n", "# give a symmetric confusion matrix with identical per-class figures.\n", "y_true = Y_valid.argmax(axis=1)\n", "v = pred.argmax(axis=1)\n", "\n", "print(\"++++++++++++++++++++++=Testing Results++++++++++++++++++++\")\n", "\n", "print(\"Testing Accuracy\")\n", "print(accuracy_score(y_true,v))\n", "print(\"Testing Classification Report\")\n", "print(classification_report(y_true,v))\n", "\n", "print(\"Testing Confusion\")\n", "# labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted.\n", "cm1 = confusion_matrix(y_true,v, labels=[0, 1])\n", "print(cm1)\n", "\n", "# Rows of cm1 are the true classes, so each ratio below is a per-class hit / miss rate.\n", "Accuracy = cm1[0,0]/(cm1[0,1]+cm1[0,0])\n", "print('Accuracy 0 class : ', Accuracy)\n", "\n", "Accuracy = cm1[1,1]/(cm1[1,0]+cm1[1,1])\n", "print('Accuracy 1 class : ', Accuracy)\n", "\n", "total1=cm1.sum()\n", "\n", "Accuracy = cm1[0,1]/(cm1[0,0]+cm1[0,1])\n", "print('Error Rate for 0: ', Accuracy)\n", "\n", "\n", "Accuracy = cm1[1,0]/(cm1[1,1]+cm1[1,0])\n", "print('Error Rate for 1: ', Accuracy)" ], "execution_count": null, "outputs": [] }, 
{ "cell_type": "code", "metadata": { "id": "GuWV1VuIUxvV" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "5vlWvQ-oUxye" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PxwtK5XM604I", "outputId": "f8403fd7-f5ea-4f1a-878d-50748e200947" }, "source": [ "import tensorflow.keras\n", "# Take layers and models from tensorflow.keras, the package that also provides the\n", "# tokenizer and padding helpers below, instead of mixing in the standalone keras package.\n", "from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,Embedding\n", "from tensorflow.keras.models import Sequential\n", "\n", "# Train on dataF, the balanced two-class frame: the 2-unit softmax head and the\n", "# recorded 2532-row validation split (25% of 10128 rows) both correspond to it.\n", "tokenizer = tensorflow.keras.preprocessing.text.Tokenizer(num_words=5000, lower=True,split=' ',filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n')\n", "tokenizer.fit_on_texts(dataF[\"cleanText\"].values)\n", "X = tokenizer.texts_to_sequences(dataF[\"cleanText\"].values)\n", "X = tensorflow.keras.preprocessing.sequence.pad_sequences(X)\n", "\n", "# Embedding -> Conv1D -> MaxPooling1D -> Flatten -> Dense(32) -> softmax over the two classes.\n", "model_conv = Sequential()\n", "model_conv.add(Embedding(5000, 100, input_length=X.shape[1]))\n", "model_conv.add(Conv1D(128, 5, activation='relu'))\n", "model_conv.add(MaxPooling1D(pool_size=4))\n", "model_conv.add(Flatten())\n", "model_conv.add(Dense(32))\n", "model_conv.add(Dense(2, activation='softmax'))\n", "model_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "\n", "# One-hot targets; pd.get_dummies orders the columns by label value (-1, then 1).\n", "Y = pd.get_dummies(dataF.sentiments).values\n", "\n", "X_train, X_valid, Y_train, Y_valid = train_test_split(X,Y, test_size = 0.25, random_state =2)\n", "batch_size=128\n", "# Train the network; pred holds the object returned by fit(), whose .history the plotting cell reads.\n", "\n", "pred=model_conv.fit(X_train, Y_train, batch_size =batch_size, epochs =10, verbose =2,validation_data=(X_valid,Y_valid))\n", "pred" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Epoch 1/10\n", "60/60 - 4s - loss: 
0.5159 - accuracy: 0.7592 - val_loss: 0.2455 - val_accuracy: 0.9036\n", "Epoch 2/10\n", "60/60 - 3s - loss: 0.1218 - accuracy: 0.9550 - val_loss: 0.1776 - val_accuracy: 0.9293\n", "Epoch 3/10\n", "60/60 - 3s - loss: 0.0542 - accuracy: 0.9828 - val_loss: 0.1563 - val_accuracy: 0.9439\n", "Epoch 4/10\n", "60/60 - 3s - loss: 0.0285 - accuracy: 0.9910 - val_loss: 0.1933 - val_accuracy: 0.9380\n", "Epoch 5/10\n", "60/60 - 3s - loss: 0.0166 - accuracy: 0.9953 - val_loss: 0.2084 - val_accuracy: 0.9376\n", "Epoch 6/10\n", "60/60 - 3s - loss: 0.0089 - accuracy: 0.9978 - val_loss: 0.2481 - val_accuracy: 0.9301\n", "Epoch 7/10\n", "60/60 - 3s - loss: 0.0054 - accuracy: 0.9986 - val_loss: 0.2714 - val_accuracy: 0.9325\n", "Epoch 8/10\n", "60/60 - 3s - loss: 0.0028 - accuracy: 0.9996 - val_loss: 0.2947 - val_accuracy: 0.9301\n", "Epoch 9/10\n", "60/60 - 3s - loss: 0.0020 - accuracy: 0.9997 - val_loss: 0.3206 - val_accuracy: 0.9277\n", "Epoch 10/10\n", "60/60 - 3s - loss: 9.5111e-04 - accuracy: 0.9999 - val_loss: 0.3442 - val_accuracy: 0.9285\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 28 } ] }, { "cell_type": "code", "metadata": { "id": "iEoJkkFb607R", "colab": { "base_uri": "https://localhost:8080/", "height": 573 }, "outputId": "1c6bc49b-0112-496b-fd09-eb283a100f67" }, "source": [ "import matplotlib.pyplot as plt\n", "from google.colab import files\n", "\n", "# Plot training & validation accuracy values\n", "plt.plot(pred.history['accuracy'])\n", "plt.plot(pred.history['val_accuracy'])\n", "plt.title('Model accuracy')\n", "plt.ylabel('Accuracy')\n", "plt.xlabel('Epoch')\n", "plt.legend(['Train', 'Test'], loc='upper left')\n", "plt.show()\n", "\n", "\n", "# Plot training & validation loss values\n", "plt.plot(pred.history['loss'])\n", "plt.plot(pred.history['val_loss'])\n", "plt.title('Model loss')\n", "plt.ylabel('Loss')\n", "plt.xlabel('Epoch')\n", "plt.legend(['Train', 'Test'], 
loc='upper left')\n", "plt.show()\n" ], "execution_count": null, "outputs": [ { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } }, { "output_type": "display_data", "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "tags": [], "needs_background": "light" } } ] }, { "cell_type": "code", "metadata": { "id": "wi4Fkbs2TCa1" }, "source": [ "from sklearn.metrics import classification_report,confusion_matrix,accuracy_score\n", "pred=model_conv.predict(X_valid)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "tGbk0uJeTCa3", "outputId": "6d840172-120c-490a-91fa-ca0a39bf364e" }, "source": [ "# Collapse the one-hot targets and the softmax outputs to class indices\n", "# (column 0 = sentiment -1, column 1 = sentiment 1, the pd.get_dummies order).\n", "# Raveling the one-hot arrays instead would count every sample twice and always\n", "# give a symmetric confusion matrix with identical per-class figures.\n", "y_true = Y_valid.argmax(axis=1)\n", "v = pred.argmax(axis=1)\n", "\n", "print(\"++++++++++++++++++++++=Testing Results++++++++++++++++++++\")\n", "\n", "print(\"Testing Accuracy\")\n", "print(accuracy_score(y_true,v))\n", "print(\"Testing Classification Report\")\n", "print(classification_report(y_true,v))\n", "\n", "print(\"Testing Confusion\")\n", "# labels=[0, 1] keeps the matrix 2x2 even if one class is never predicted.\n", "cm1 = confusion_matrix(y_true,v, labels=[0, 1])\n", "print(cm1)\n", "\n", "# Rows of cm1 are the true classes, so each ratio below is a per-class hit / miss rate.\n", "Accuracy = cm1[0,0]/(cm1[0,1]+cm1[0,0])\n", "print('Accuracy 0 class : ', Accuracy)\n", "\n", "Accuracy = cm1[1,1]/(cm1[1,0]+cm1[1,1])\n", "print('Accuracy 1 class : ', Accuracy)\n", "\n", "total1=cm1.sum()\n", "\n", "Accuracy = cm1[0,1]/(cm1[0,0]+cm1[0,1])\n", "print('Error Rate for 0: ', Accuracy)\n", "\n", "\n", "Accuracy = cm1[1,0]/(cm1[1,1]+cm1[1,0])\n", "print('Error Rate for 1: ', Accuracy)" ], "execution_count": null, "outputs": [] }, 
{ "cell_type": "code", "metadata": { "id": "LWOflLHeU3qU" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "fn3yU3eiU3zS" }, "source": [ "" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "id": "Q6v9wXLs75wg", "colab": { "base_uri": "https://localhost:8080/" }, "outputId": "0adeacb2-00d9-4ab9-b3f9-823fa6c7b854" }, "source": [ "# This cell calls tensorflow.keras.* below, so import it here rather than relying on\n", "# an earlier cell having been run, and take every Keras name from that same package.\n", "import tensorflow.keras\n", "from tensorflow.keras import optimizers\n", "from tensorflow.keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,Embedding\n", "from tensorflow.keras.models import Sequential\n", "\n", "# Integer-encode the 5000 most frequent words of the balanced frame and pad the sequences.\n", "tokenizer = tensorflow.keras.preprocessing.text.Tokenizer(num_words=5000, lower=True,split=' ',filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n')\n", "tokenizer.fit_on_texts(dataF[\"cleanText\"].values)\n", "X = tokenizer.texts_to_sequences(dataF[\"cleanText\"].values)\n", "X = tensorflow.keras.preprocessing.sequence.pad_sequences(X)\n", "\n", "# Embedding -> Dropout -> LSTM -> softmax over the two classes.\n", "model_conv = Sequential()\n", "model_conv.add(Embedding(5000,100, input_length=X.shape[1]))\n", "model_conv.add(Dropout(0.5))\n", "model_conv.add(LSTM(100))\n", "model_conv.add(Dense(2, activation='softmax'))\n", "model_conv.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", "\n", "# One-hot targets; pd.get_dummies orders the columns by label value (-1, then 1).\n", "Y = pd.get_dummies(dataF.sentiments).values\n", "\n", "X_train, X_valid, Y_train, Y_valid = train_test_split(X,Y, test_size = 0.20, random_state =2)\n", "batch_size=128\n", "# Train the network; pred holds the object returned by fit().\n", "\n", "pred=model_conv.fit(X_train, Y_train, batch_size =batch_size, epochs =10, verbose =2,validation_data=(X_valid,Y_valid))\n", "pred" ], "execution_count": null, "outputs": [ { "output_type": "stream", "text": [ "Epoch 1/10\n", "64/64 
- 206s - loss: 0.4884 - accuracy: 0.7607 - val_loss: 0.1923 - val_accuracy: 0.9284\n",
"Epoch 2/10\n",
"64/64 - 210s - loss: 0.2005 - accuracy: 0.9268 - val_loss: 0.1298 - val_accuracy: 0.9521\n",
"Epoch 3/10\n",
"64/64 - 212s - loss: 0.0770 - accuracy: 0.9725 - val_loss: 0.1038 - val_accuracy: 0.9650\n",
"Epoch 4/10\n",
"64/64 - 216s - loss: 0.0411 - accuracy: 0.9874 - val_loss: 0.1103 - val_accuracy: 0.9620\n",
"Epoch 5/10\n",
"64/64 - 215s - loss: 0.0324 - accuracy: 0.9904 - val_loss: 0.1139 - val_accuracy: 0.9605\n",
"Epoch 6/10\n",
"64/64 - 214s - loss: 0.0218 - accuracy: 0.9931 - val_loss: 0.1362 - val_accuracy: 0.9556\n",
"Epoch 7/10\n",
"64/64 - 214s - loss: 0.0133 - accuracy: 0.9959 - val_loss: 0.1270 - val_accuracy: 0.9610\n",
"Epoch 8/10\n",
"64/64 - 217s - loss: 0.0095 - accuracy: 0.9973 - val_loss: 0.1474 - val_accuracy: 0.9561\n",
"Epoch 9/10\n",
"64/64 - 212s - loss: 0.0090 - accuracy: 0.9979 - val_loss: 0.1311 - val_accuracy: 0.9635\n",
"Epoch 10/10\n",
"64/64 - 211s - loss: 0.0070 - accuracy: 0.9973 - val_loss: 0.1685 - val_accuracy: 0.9566\n" ], "name": "stdout" }, { "output_type": "execute_result", "data": { "text/plain": [ "" ] }, "metadata": { "tags": [] }, "execution_count": 26 } ] }, { "cell_type": "code", "metadata": { "id": "bRxehtUHR8w5" }, "source": [
"from sklearn.metrics import accuracy_score, classification_report, confusion_matrix\n",
"\n",
"# Softmax outputs of the trained network for the held-out split\n",
"# (one row per validation sample, one column per class).\n",
"pred = model_conv.predict(X_valid)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "KqGV37k0vnVX", "outputId": "93c7891a-4812-4f56-8551-5dc560af12fe" }, "source": [
"# `Y_valid` is one-hot and `pred` holds one softmax row per sample, so reduce\n",
"# both to a single class index per sample before scoring. Scoring the flattened\n",
"# one-hot matrices instead would count every sample twice and always give a\n",
"# symmetric confusion matrix, hiding any difference between the classes.\n",
"# Class index k is the k-th column of the one-hot `Y` built by pd.get_dummies.\n",
"y_true = np.argmax(Y_valid, axis=1)\n",
"y_pred = np.argmax(pred, axis=1)\n",
"\n",
"print(\"++++++++++++++++++++++=Testing Results++++++++++++++++++++\")\n",
"\n",
"print(\"Testing Accuracy\")\n",
"print(accuracy_score(y_true, y_pred))\n",
"print(\"Testing Classification Report\")\n",
"print(classification_report(y_true, y_pred))\n",
"\n",
"print(\"Testing Confusion\")\n",
"# labels=[0, 1] keeps the matrix 2x2 even if one class is absent from the split.\n",
"cm1 = confusion_matrix(y_true, y_pred, labels=[0, 1])\n",
"print(cm1)\n",
"\n",
"# cm1[i, j] counts samples of true class i predicted as class j, so each row\n",
"# sum is the number of validation samples that truly belong to that class.\n",
"class_totals = cm1.sum(axis=1)\n",
"\n",
"print('Accuracy 0 class : ', cm1[0, 0] / class_totals[0])\n",
"print('Accuracy 1 class : ', cm1[1, 1] / class_totals[1])\n",
"\n",
"print('Error Rate for 0: ', cm1[0, 1] / class_totals[0])\n",
"print('Error Rate for 1: ', cm1[1, 0] / class_totals[1])" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Pgr8AFdD18ax", "outputId": "d11057a8-4049-4bbb-b3f6-2f8aff94e2de" }, "source": [
"import nltk\n",
"from textblob import TextBlob\n",
"\n",
"# NOTE(review): this replaces the `sentiments` column that the LSTM cell above\n",
"# one-hot encodes as its training target, so re-running that cell afterwards\n",
"# would train on the TextBlob labels -- confirm the in-place relabel is intended.\n",
"dataF['cleanText'] = dataF['cleanText'].astype(str)\n",
"\n",
"# Score every text in one pass. Assigning whole columns avoids the element-wise\n",
"# chained writes (dataF[col][i] = ...) that trigger SettingWithCopyWarning and\n",
"# only work when the frame has a default 0..n-1 index.\n",
"dataF['Polarity Score'] = dataF['cleanText'].apply(\n",
"    lambda text: TextBlob(text).sentiment.polarity\n",
")\n",
"\n",
"# Strictly positive polarity -> 1; zero or negative polarity -> -1.\n",
"dataF['sentiments'] = np.where(dataF['Polarity Score'] > 0, 1, -1)" ], "execution_count": null, "outputs": [] }, { "cell_type": "code", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8iuZILUM18dp", "outputId": "d91317ff-bfd4-4f0c-f521-2e6dad59cb56" }, "source": [ "dataF['sentiments'].value_counts()" ], "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "-1 5064\n", " 1 5064\n", "Name: sentiments, dtype: int64" ] }, "metadata": { "tags": [] }, "execution_count": 30 } ] } ] }