{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "executionInfo": {
     "elapsed": 1333,
     "status": "ok",
     "timestamp": 1616768427523,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "BRtpZetrlYGA"
   },
   "outputs": [],
   "source": [
    "#Libraries for the \n",
    "import os\n",
    "import pandas as pd\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 83702,
     "status": "ok",
     "timestamp": 1616768514451,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "7OPIvVLoMiga",
    "outputId": "336770f9-e733-43f7-f82b-e348957d2ec2"
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "executionInfo": {
     "elapsed": 11256,
     "status": "ok",
     "timestamp": 1616768516632,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "6iggmdfaHfsF"
   },
   "outputs": [],
   "source": [
    "data=pd.read_csv(\"PreprocessReview.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 112
    },
    "executionInfo": {
     "elapsed": 9976,
     "status": "ok",
     "timestamp": 1616768516641,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "eaY4QIHrN25p",
    "outputId": "b45e96f4-43fb-4541-d98f-3fc2c148e039"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>Unnamed: 0.1</th>\n",
       "      <th>review</th>\n",
       "      <th>label</th>\n",
       "      <th>file</th>\n",
       "      <th>cleanText</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>21939</td>\n",
       "      <td>Gwyneth Paltrow is absolutely great in this mo...</td>\n",
       "      <td>0</td>\n",
       "      <td>7246_4.txt</td>\n",
       "      <td>gwyneth paltrow absolutely great movie story u...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>24113</td>\n",
       "      <td>I own this movie. Not by choice, I do. I was r...</td>\n",
       "      <td>0</td>\n",
       "      <td>9202_1.txt</td>\n",
       "      <td>movie choice really bored day box intrigued po...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Unnamed: 0  Unnamed: 0.1  \\\n",
       "0           0         21939   \n",
       "1           1         24113   \n",
       "\n",
       "                                              review  label        file  \\\n",
       "0  Gwyneth Paltrow is absolutely great in this mo...      0  7246_4.txt   \n",
       "1  I own this movie. Not by choice, I do. I was r...      0  9202_1.txt   \n",
       "\n",
       "                                           cleanText  \n",
       "0  gwyneth paltrow absolutely great movie story u...  \n",
       "1  movie choice really bored day box intrigued po...  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[:2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 7116,
     "status": "ok",
     "timestamp": 1616768516643,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "J9H7ZzSy5ciU",
    "outputId": "5bd3b935-32d8-4d8c-f2a1-b9493e629edb"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "25000"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# number of tweets in CSV file\n",
    "len(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:13: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  del sys.path[0]\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:17: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:21: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "C:\\ProgramData\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:19: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n"
     ]
    }
   ],
   "source": [
    "# textblob library to extract the polarity score to find sentiments\n",
    "\n",
    "import nltk\n",
    "from textblob import  TextBlob\n",
    "data['TextBlob Score']=\"\"\n",
    "data['TextBlob Sentiment']=\"\"\n",
    "#df2 = pd.DataFrame(columns=['text', 'sentiment', 'score'])\n",
    "data['cleanText']=data['cleanText'].astype(str)\n",
    "for i in range(len(data)):\n",
    "    sentiment = TextBlob(data['cleanText'][i])\n",
    "    a=sentiment.sentiment.polarity\n",
    "    #df2.loc[i] = [data['cleanText'][i]]+[str(0)]+ [a]\n",
    "    data[\"TextBlob Score\"][i]=a\n",
    "\n",
    "for i in range(len(data)):\n",
    "    if(data['TextBlob Score'][i]>0):\n",
    "        data['TextBlob Sentiment'][i]=\"Positive\"\n",
    "    elif(data['TextBlob Score'][i]==0):\n",
    "        data['TextBlob Sentiment'][i]=\"Neutral\"\n",
    "    else:\n",
    "        data['TextBlob Sentiment'][i]=\"Negative\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Positve negative extraction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18309\n",
      "6668\n"
     ]
    }
   ],
   "source": [
    "data5=data.loc[data['TextBlob Sentiment'] == \"Positive\"]\n",
    "print(len(data5))\n",
    "data4=data.loc[data['TextBlob Sentiment'] == \"Negative\"]\n",
    "print(len(data4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataF=data5.append(data4,ignore_index = True) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 5753,
     "status": "ok",
     "timestamp": 1616768516645,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "Bp3ZalVnQved",
    "outputId": "008fc640-61de-40a4-d451-ee0c54db9cc5"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({'Positive': 18309, 'Negative': 6668})"
      ]
     },
     "execution_count": 69,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from collections import Counter\n",
    "Counter( dataF['TextBlob Sentiment'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "TbBISAtr_KYE"
   },
   "source": [
    "Supervised machine leanring approach start from **here**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {
    "executionInfo": {
     "elapsed": 1097,
     "status": "ok",
     "timestamp": 1616769068814,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "Yp1gzlOHHU0e"
   },
   "outputs": [],
   "source": [
    "#Data Splitting into training and testing sets for machine leanring models\n",
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(data[\"cleanText\"],data['label'],test_size=0.25,shuffle=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "executionInfo": {
     "elapsed": 1472,
     "status": "ok",
     "timestamp": 1616769073354,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "Ck16zeQNHU0e"
   },
   "outputs": [],
   "source": [
    "#TF-IDF features \n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "vectorizer = TfidfVectorizer(max_features=5000)\n",
    "X = vectorizer.fit_transform(X_train)\n",
    "y = vectorizer.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {
    "id": "YvbLqoT5HU0g"
   },
   "outputs": [],
   "source": [
    "#TF-IDF features \n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vectorizer = CountVectorizer(max_features=5000)\n",
    "X = vectorizer.fit_transform(X_train)\n",
    "y = vectorizer.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Glove features \n",
    "from zeugma.embeddings import EmbeddingTransformer\n",
    "glove = EmbeddingTransformer('glove')\n",
    "X = glove.fit_transform(X_train)\n",
    "y = glove.transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 9296,
     "status": "ok",
     "timestamp": 1616769083545,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "fhz17ZiEHU0g",
    "outputId": "d850c7f8-72d7-4626-bfe1-0fc4f64f643a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.64608\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.65      0.65      0.65      3125\n",
      "           1       0.65      0.65      0.65      3125\n",
      "\n",
      "    accuracy                           0.65      6250\n",
      "   macro avg       0.65      0.65      0.65      6250\n",
      "weighted avg       0.65      0.65      0.65      6250\n",
      "\n",
      "[[2017 1108]\n",
      " [1104 2021]]\n"
     ]
    }
   ],
   "source": [
    "#Decsion tree machine leanirng models\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "dt=DecisionTreeClassifier(max_depth=50)\n",
    "dtPre=dt.fit(X, y_train).predict(y)\n",
    "print(accuracy_score(y_test,dtPre))\n",
    "print(classification_report(y_test,dtPre))\n",
    "print(confusion_matrix(y_test,dtPre))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 128,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 215810,
     "status": "ok",
     "timestamp": 1616769293475,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "LoNl5sH3HU0g",
    "outputId": "7c4e39f2-74d9-4e91-9ddf-d94af2c6cf21"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.79888\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.83      0.75      0.79      3125\n",
      "           1       0.77      0.85      0.81      3125\n",
      "\n",
      "    accuracy                           0.80      6250\n",
      "   macro avg       0.80      0.80      0.80      6250\n",
      "weighted avg       0.80      0.80      0.80      6250\n",
      "\n",
      "[[2340  785]\n",
      " [ 472 2653]]\n"
     ]
    }
   ],
   "source": [
    "#Random forest machine leanring models\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "rfc = RandomForestClassifier(n_estimators=300, random_state=2,max_depth=300) \n",
    "rfc.fit(X, y_train)\n",
    "# calculate accuracy of class predictions\n",
    "y_pred_class = rfc.predict(y)\n",
    "print(accuracy_score(y_test, y_pred_class))\n",
    "print(classification_report(y_test, y_pred_class))\n",
    "print(confusion_matrix(y_test, y_pred_class))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "executionInfo": {
     "elapsed": 218547,
     "status": "ok",
     "timestamp": 1616769299244,
     "user": {
      "displayName": "Furqan Rustam",
      "photoUrl": "https://lh3.googleusercontent.com/a-/AOh14Gg5oat7XG884rtNww-Dgt4sEfDMp6vxfwes8ntbog=s64",
      "userId": "11638408104830397330"
     },
     "user_tz": -300
    },
    "id": "Vz0q3bh_HU0h",
    "outputId": "099cd005-441b-46cd-d1fe-1e6a76be5e53"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.65216\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.65      0.65      0.65      3125\n",
      "           1       0.65      0.65      0.65      3125\n",
      "\n",
      "    accuracy                           0.65      6250\n",
      "   macro avg       0.65      0.65      0.65      6250\n",
      "weighted avg       0.65      0.65      0.65      6250\n",
      "\n",
      "[[2038 1087]\n",
      " [1087 2038]]\n"
     ]
    }
   ],
   "source": [
    "#Random forest machine leanring models\n",
    "from sklearn.ensemble import GradientBoostingClassifier\n",
    "rfc = GradientBoostingClassifier(n_estimators=50, learning_rate=0.2,max_depth=50, random_state=0)\n",
    "rfc.fit(X, y_train)\n",
    "# calculate accuracy of class predictions\n",
    "y_pred_class = rfc.predict(y)\n",
    "print(accuracy_score(y_test, y_pred_class))\n",
    "print(classification_report(y_test, y_pred_class))\n",
    "print(confusion_matrix(y_test, y_pred_class))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "SVC\n",
      "0.87744\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.89      0.86      0.88      3125\n",
      "           1       0.87      0.89      0.88      3125\n",
      "\n",
      "    accuracy                           0.88      6250\n",
      "   macro avg       0.88      0.88      0.88      6250\n",
      "weighted avg       0.88      0.88      0.88      6250\n",
      "\n",
      "[[2690  435]\n",
      " [ 331 2794]]\n"
     ]
    }
   ],
   "source": [
    "from sklearn.svm import SVC\n",
    "print(\"SVC\")\n",
    "svm = SVC(kernel='linear', C=1.0)\n",
    "svm.fit(X, y_train)\n",
    "y_pred_class = svm.predict(y)\n",
    "print(accuracy_score(y_test, y_pred_class))\n",
    "print(classification_report(y_test, y_pred_class))\n",
    "print(confusion_matrix(y_test, y_pred_class))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from gensim.models import Word2Vec\n",
    "from sklearn import preprocessing\n",
    "lab_enc = preprocessing.LabelEncoder()\n",
    "lab_enc.fit(data['label'])\n",
    "newdata=data\n",
    "tokenized_data = newdata['cleanText'].apply(lambda x: x.split())\n",
    "model_w2v = Word2Vec( tokenized_data, vector_size=2000,window=10, min_count=2,sg = 1, hs = 0, negative = 10, workers= 2, seed = 34)\n",
    "model_w2v.train(tokenized_data, total_examples= len(newdata['cleanText']), epochs=20)\n",
    "def word_vector(tokens, size):\n",
    "    vec = np.zeros(size).reshape((1, size))\n",
    "    count = 0.\n",
    "    for word in tokens:\n",
    "        try:\n",
    "            vec += model_w2v.wv[word].reshape((1, size))\n",
    "            count += 1.\n",
    "        except KeyError:\n",
    "            continue\n",
    "    if count != 0:\n",
    "        vec /= count\n",
    "        return vec\n",
    "wordvec_arrays = np.zeros((len(tokenized_data), 2000))\n",
    "for i in range(len(tokenized_data)):\n",
    "    wordvec_arrays[i,:] = word_vector(tokenized_data[i], 2000)\n",
    "data_feature_3 = pd.DataFrame(wordvec_arrays)\n",
    "df_tex = data_feature_3\n",
    "df_sen = lab_enc.transform(data['label'])\n",
    "X, y, y_train, y_test = train_test_split(df_tex, df_sen, test_size=0.25, random_state=2, stratify=df_sen)\n",
    "y_train = y_train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Logistic regression for machine leanring\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "Lr = LogisticRegression(solver=\"sag\",multi_class=\"ovr\",C=1.0)\n",
    "Lr.fit(X, y_train)\n",
    "y_pred_class = Lr.predict(y)\n",
    "print(accuracy_score(y_test, y_pred_class))\n",
    "print(classification_report(y_test, y_pred_class))\n",
    "print(confusion_matrix(y_test, y_pred_class))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Sohail_Comp\\AppData\\Roaming\\Python\\Python37\\site-packages\\sklearn\\linear_model\\_sag.py:329: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge\n",
      "  \"the coef_ did not converge\", ConvergenceWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "    Negative       0.83      0.90      0.86      1675\n",
      "    Positive       0.96      0.93      0.95      4570\n",
      "\n",
      "    accuracy                           0.92      6245\n",
      "   macro avg       0.89      0.92      0.90      6245\n",
      "weighted avg       0.93      0.92      0.92      6245\n",
      "\n",
      "[[1512  163]\n",
      " [ 320 4250]]\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "from sklearn.ensemble import RandomForestClassifier, VotingClassifier\n",
    "svm = SVC(kernel='linear',C=1.0)\n",
    "Lr = LogisticRegression(solver=\"sag\",multi_class=\"ovr\",C=1.0)\n",
    "er = VotingClassifier([('rf', svm),('etc', Lr)],voting=\"hard\")\n",
    "\n",
    "x=er.fit(X, y_train)\n",
    "\n",
    "y_pred_class = er.predict(y)\n",
    "print(classification_report(y_test,y_pred_class))\n",
    "print(confusion_matrix(y_test,y_pred_class))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({0: 12500, 1: 12500})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from collections import Counter\n",
    "Counter(data[\"label\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np # linear algebra\n",
    "\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import matplotlib.pyplot as plt\n",
    "# Input data files are available in the \"../input/\" directory.\n",
    "# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory\n",
    "\n",
    "import os\n",
    "import glob\n",
    "import cv2\n",
    "import tensorflow as tf\n",
    "from keras import layers\n",
    "from keras.layers import Dropout , Input, Add, Dense, Activation, ZeroPadding2D, BatchNormalization, Flatten, Conv2D, AveragePooling2D, MaxPooling2D, GlobalMaxPooling2D\n",
    "from keras.models import Model, load_model\n",
    "from keras.initializers import glorot_uniform\n",
    "from sklearn.model_selection import train_test_split\n",
    "import keras.backend as K\n",
    "from sklearn.utils import shuffle\n",
    "# importing all necessary libraries to run the code\n",
    "import re,string\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import keras_metrics\n",
    "import tensorflow.keras\n",
    "import matplotlib.pyplot as plt\n",
    "from nltk.corpus import stopwords\n",
    "from keras.models import Sequential\n",
    "from sklearn.model_selection import train_test_split\n",
    "from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,Embedding\n",
    "# using the variable sw to hold all stopwords that are in English\n",
    "sw = stopwords.words('english')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/100\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "176/176 - 675s - loss: 0.4575 - accuracy: 0.7900 - mae: 0.2538 - mse: 0.1568 - precision: 0.4874 - recall: 0.1591 - f1_score: 0.2247 - val_loss: 0.2983 - val_accuracy: 0.8815 - val_mae: 0.1354 - val_mse: 0.0982 - val_precision: 0.7023 - val_recall: 0.4034 - val_f1_score: 0.5124\n",
      "Epoch 2/100\n",
      "176/176 - 678s - loss: 0.2310 - accuracy: 0.9066 - mae: 0.1073 - mse: 0.0731 - precision: 0.7565 - recall: 0.5312 - f1_score: 0.6234 - val_loss: 0.2412 - val_accuracy: 0.8939 - val_mae: 0.1150 - val_mse: 0.0788 - val_precision: 0.7845 - val_recall: 0.6106 - val_f1_score: 0.6867\n",
      "Epoch 3/100\n",
      "176/176 - 660s - loss: 0.1647 - accuracy: 0.9379 - mae: 0.0733 - mse: 0.0492 - precision: 0.8085 - recall: 0.6610 - f1_score: 0.7272 - val_loss: 0.2326 - val_accuracy: 0.9047 - val_mae: 0.1029 - val_mse: 0.0749 - val_precision: 0.8243 - val_recall: 0.6966 - val_f1_score: 0.7551\n",
      "Epoch 4/100\n",
      "176/176 - 685s - loss: 0.1315 - accuracy: 0.9503 - mae: 0.0576 - mse: 0.0394 - precision: 0.8376 - recall: 0.7257 - f1_score: 0.7776 - val_loss: 0.2411 - val_accuracy: 0.9131 - val_mae: 0.0946 - val_mse: 0.0744 - val_precision: 0.8463 - val_recall: 0.7470 - val_f1_score: 0.7936\n",
      "Epoch 5/100\n",
      "176/176 - 710s - loss: 0.1069 - accuracy: 0.9593 - mae: 0.0469 - mse: 0.0316 - precision: 0.8553 - recall: 0.7656 - f1_score: 0.8080 - val_loss: 0.2984 - val_accuracy: 0.9059 - val_mae: 0.0971 - val_mse: 0.0803 - val_precision: 0.8619 - val_recall: 0.7802 - val_f1_score: 0.8190\n",
      "Epoch 6/100\n",
      "176/176 - 697s - loss: 0.0981 - accuracy: 0.9636 - mae: 0.0429 - mse: 0.0287 - precision: 0.8688 - recall: 0.7936 - f1_score: 0.8295 - val_loss: 0.2789 - val_accuracy: 0.8867 - val_mae: 0.1285 - val_mse: 0.0916 - val_precision: 0.8724 - val_recall: 0.8036 - val_f1_score: 0.8366\n",
      "Epoch 7/100\n",
      "176/176 - 698s - loss: 0.0963 - accuracy: 0.9646 - mae: 0.0417 - mse: 0.0281 - precision: 0.8763 - recall: 0.8127 - f1_score: 0.8433 - val_loss: 0.2815 - val_accuracy: 0.9023 - val_mae: 0.0996 - val_mse: 0.0836 - val_precision: 0.8797 - val_recall: 0.8205 - val_f1_score: 0.8491\n",
      "Epoch 8/100\n",
      "176/176 - 714s - loss: 0.0742 - accuracy: 0.9733 - mae: 0.0316 - mse: 0.0212 - precision: 0.8836 - recall: 0.8285 - f1_score: 0.8551 - val_loss: 0.3015 - val_accuracy: 0.9135 - val_mae: 0.0900 - val_mse: 0.0752 - val_precision: 0.8874 - val_recall: 0.8353 - val_f1_score: 0.8606\n",
      "Epoch 9/100\n",
      "176/176 - 716s - loss: 0.0636 - accuracy: 0.9762 - mae: 0.0275 - mse: 0.0186 - precision: 0.8913 - recall: 0.8418 - f1_score: 0.8658 - val_loss: 0.3689 - val_accuracy: 0.9051 - val_mae: 0.0974 - val_mse: 0.0861 - val_precision: 0.8943 - val_recall: 0.8469 - val_f1_score: 0.8700\n",
      "Epoch 10/100\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-82-14eb0f0a7b65>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     20\u001b[0m \u001b[1;31m#Here we train the Network.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     21\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m \u001b[0mpred\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_conv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbatch_size\u001b[0m \u001b[1;33m=\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepochs\u001b[0m \u001b[1;33m=\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m \u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_valid\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mY_valid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     23\u001b[0m \u001b[0mpred\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)\u001b[0m\n\u001b[0;32m   1156\u001b[0m                 _r=1):\n\u001b[0;32m   1157\u001b[0m               \u001b[0mcallbacks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mon_train_batch_begin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1158\u001b[1;33m               \u001b[0mtmp_logs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1159\u001b[0m               \u001b[1;32mif\u001b[0m \u001b[0mdata_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshould_sync\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1160\u001b[0m                 \u001b[0mcontext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masync_wait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    887\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    888\u001b[0m       \u001b[1;32mwith\u001b[0m \u001b[0mOptionalXlaContext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jit_compile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 889\u001b[1;33m         \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    890\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    891\u001b[0m       \u001b[0mnew_tracing_count\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexperimental_get_tracing_count\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m_call\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    915\u001b[0m       \u001b[1;31m# In this case we have created variables on the first call, so we run the\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    916\u001b[0m       \u001b[1;31m# defunned version which is guaranteed to never create variables.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 917\u001b[1;33m       \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateless_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m# pylint: disable=not-callable\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    918\u001b[0m     \u001b[1;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateful_fn\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    919\u001b[0m       \u001b[1;31m# Release the lock early so that multiple threads can perform the call\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   3022\u001b[0m        filtered_flat_args) = self._maybe_define_function(args, kwargs)\n\u001b[0;32m   3023\u001b[0m     return graph_function._call_flat(\n\u001b[1;32m-> 3024\u001b[1;33m         filtered_flat_args, captured_inputs=graph_function.captured_inputs)  # pylint: disable=protected-access\n\u001b[0m\u001b[0;32m   3025\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3026\u001b[0m   \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36m_call_flat\u001b[1;34m(self, args, captured_inputs, cancellation_manager)\u001b[0m\n\u001b[0;32m   1959\u001b[0m       \u001b[1;31m# No tape is watching; skip to running the function.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1960\u001b[0m       return self._build_call_outputs(self._inference_function.call(\n\u001b[1;32m-> 1961\u001b[1;33m           ctx, args, cancellation_manager=cancellation_manager))\n\u001b[0m\u001b[0;32m   1962\u001b[0m     forward_backward = self._select_forward_and_backward_functions(\n\u001b[0;32m   1963\u001b[0m         \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36mcall\u001b[1;34m(self, ctx, args, cancellation_manager)\u001b[0m\n\u001b[0;32m    594\u001b[0m               \u001b[0minputs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    595\u001b[0m               \u001b[0mattrs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mattrs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 596\u001b[1;33m               ctx=ctx)\n\u001b[0m\u001b[0;32m    597\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    598\u001b[0m           outputs = execute.execute_with_cancellation(\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\execute.py\u001b[0m in \u001b[0;36mquick_execute\u001b[1;34m(op_name, num_outputs, inputs, attrs, ctx, name)\u001b[0m\n\u001b[0;32m     58\u001b[0m     \u001b[0mctx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mensure_initialized\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     59\u001b[0m     tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,\n\u001b[1;32m---> 60\u001b[1;33m                                         inputs, attrs, num_outputs)\n\u001b[0m\u001b[0;32m     61\u001b[0m   \u001b[1;32mexcept\u001b[0m \u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_NotOkStatusException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     62\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "from keras import optimizers\n",
    "import tensorflow.keras\n",
    "tokenizer = tensorflow.keras.preprocessing.text.Tokenizer(num_words=5000, lower=True,split=' ',filters='!\"#$%&()*+,-./:;<=>?@[\\\\]^_`{|}~\\t\\n')\n",
    "tokenizer.fit_on_texts(dataF[\"cleanText\"].values)\n",
    "#print(tokenizer.word_index)  # To see the dicstionary\n",
    "X = tokenizer.texts_to_sequences(dataF[\"cleanText\"].values)\n",
    "X = tensorflow.keras.preprocessing.sequence.pad_sequences(X)\n",
    "#Deep Learning Network Structure\n",
    "model_conv = Sequential()\n",
    "model_conv.add(Embedding(5000,100, input_length=X.shape[1]))\n",
    "model_conv.add(Dropout(0.5))\n",
    "model_conv.add(LSTM(100))\n",
    "model_conv.add(Dense(2, activation='softmax'))\n",
    "model_conv.compile(loss='binary_crossentropy', optimizer='adam',   metrics=['accuracy','mae','mse',keras_metrics.precision(), keras_metrics.recall(),keras_metrics.f1_score()])\n",
    "#Y = data.Reviewer_Score\n",
    "Y = pd.get_dummies(dataF['TextBlob Sentiment']).values\n",
    "\n",
    "X_train, X_valid, Y_train, Y_valid = train_test_split(X,Y, test_size = 0.25, random_state =2)\n",
    "batch_size=128\n",
    "#Here we train the Network.\n",
    "\n",
    "pred=model_conv.fit(X_train, Y_train, batch_size =batch_size, epochs =100, verbose =2,validation_data=(X_valid,Y_valid))\n",
    "pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Sohail_Comp\\AppData\\Roaming\\Python\\Python37\\site-packages\\keras\\engine\\sequential.py:450: UserWarning: `model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype(\"int32\")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).\n",
      "  warnings.warn('`model.predict_classes()` is deprecated and '\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.83      0.79      0.81       647\n",
      "           1       0.93      0.94      0.94      1851\n",
      "\n",
      "    accuracy                           0.90      2498\n",
      "   macro avg       0.88      0.87      0.87      2498\n",
      "weighted avg       0.90      0.90      0.90      2498\n",
      "\n",
      "[[ 513  134]\n",
      " [ 106 1745]]\n"
     ]
    }
   ],
   "source": [
    "rounded_predictions = model_conv.predict_classes(X_valid, batch_size=128, verbose=0)\n",
    "rounded_labels=np.argmax(Y_valid, axis=1)\n",
    "print(classification_report(rounded_labels,rounded_predictions))\n",
    "print(confusion_matrix(rounded_labels,rounded_predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/100\n",
      "176/176 - 333s - loss: 0.4245 - accuracy: 0.8001 - val_loss: 0.2785 - val_accuracy: 0.8799\n",
      "Epoch 2/100\n",
      "176/176 - 339s - loss: 0.1993 - accuracy: 0.9195 - val_loss: 0.2127 - val_accuracy: 0.9119\n",
      "Epoch 3/100\n",
      "176/176 - 350s - loss: 0.1264 - accuracy: 0.9521 - val_loss: 0.2413 - val_accuracy: 0.9087\n",
      "Epoch 4/100\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-84-878ae9d6a742>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      8\u001b[0m \u001b[0mmodel_conv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0madd\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mDense\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mactivation\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'softmax'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      9\u001b[0m \u001b[0mmodel_conv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcompile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mloss\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'binary_crossentropy'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0moptimizer\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'adam'\u001b[0m\u001b[1;33m,\u001b[0m    \u001b[0mmetrics\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'accuracy'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 10\u001b[1;33m \u001b[0mpred\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_conv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbatch_size\u001b[0m \u001b[1;33m=\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepochs\u001b[0m \u001b[1;33m=\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m \u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_valid\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mY_valid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     11\u001b[0m \u001b[0mpred\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)\u001b[0m\n\u001b[0;32m   1156\u001b[0m                 _r=1):\n\u001b[0;32m   1157\u001b[0m               \u001b[0mcallbacks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mon_train_batch_begin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1158\u001b[1;33m               \u001b[0mtmp_logs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1159\u001b[0m               \u001b[1;32mif\u001b[0m \u001b[0mdata_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshould_sync\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1160\u001b[0m                 \u001b[0mcontext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masync_wait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    887\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    888\u001b[0m       \u001b[1;32mwith\u001b[0m \u001b[0mOptionalXlaContext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jit_compile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 889\u001b[1;33m         \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    890\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    891\u001b[0m       \u001b[0mnew_tracing_count\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexperimental_get_tracing_count\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m_call\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    915\u001b[0m       \u001b[1;31m# In this case we have created variables on the first call, so we run the\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    916\u001b[0m       \u001b[1;31m# defunned version which is guaranteed to never create variables.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 917\u001b[1;33m       \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateless_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m# pylint: disable=not-callable\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    918\u001b[0m     \u001b[1;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateful_fn\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    919\u001b[0m       \u001b[1;31m# Release the lock early so that multiple threads can perform the call\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   3022\u001b[0m        filtered_flat_args) = self._maybe_define_function(args, kwargs)\n\u001b[0;32m   3023\u001b[0m     return graph_function._call_flat(\n\u001b[1;32m-> 3024\u001b[1;33m         filtered_flat_args, captured_inputs=graph_function.captured_inputs)  # pylint: disable=protected-access\n\u001b[0m\u001b[0;32m   3025\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3026\u001b[0m   \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36m_call_flat\u001b[1;34m(self, args, captured_inputs, cancellation_manager)\u001b[0m\n\u001b[0;32m   1959\u001b[0m       \u001b[1;31m# No tape is watching; skip to running the function.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1960\u001b[0m       return self._build_call_outputs(self._inference_function.call(\n\u001b[1;32m-> 1961\u001b[1;33m           ctx, args, cancellation_manager=cancellation_manager))\n\u001b[0m\u001b[0;32m   1962\u001b[0m     forward_backward = self._select_forward_and_backward_functions(\n\u001b[0;32m   1963\u001b[0m         \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36mcall\u001b[1;34m(self, ctx, args, cancellation_manager)\u001b[0m\n\u001b[0;32m    594\u001b[0m               \u001b[0minputs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    595\u001b[0m               \u001b[0mattrs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mattrs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 596\u001b[1;33m               ctx=ctx)\n\u001b[0m\u001b[0;32m    597\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    598\u001b[0m           outputs = execute.execute_with_cancellation(\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\execute.py\u001b[0m in \u001b[0;36mquick_execute\u001b[1;34m(op_name, num_outputs, inputs, attrs, ctx, name)\u001b[0m\n\u001b[0;32m     58\u001b[0m     \u001b[0mctx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mensure_initialized\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     59\u001b[0m     tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,\n\u001b[1;32m---> 60\u001b[1;33m                                         inputs, attrs, num_outputs)\n\u001b[0m\u001b[0;32m     61\u001b[0m   \u001b[1;32mexcept\u001b[0m \u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_NotOkStatusException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     62\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation,Embedding\n",
    "from keras.models import Sequential\n",
    "model_conv = Sequential()\n",
    "model_conv.add(Embedding(5000, 100, input_length=X.shape[1]))\n",
    "model_conv.add(Conv1D(128, 2, activation='relu'))\n",
    "model_conv.add(MaxPooling1D(pool_size=2))\n",
    "model_conv.add(LSTM(100))\n",
    "model_conv.add(Dense(2, activation='softmax'))\n",
    "model_conv.compile(loss='binary_crossentropy', optimizer='adam',    metrics=['accuracy'])\n",
    "pred=model_conv.fit(X_train, Y_train, batch_size =batch_size, epochs =100, verbose =2,validation_data=(X_valid,Y_valid))\n",
    "pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Sohail_Comp\\AppData\\Roaming\\Python\\Python37\\site-packages\\keras\\engine\\sequential.py:450: UserWarning: `model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype(\"int32\")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).\n",
      "  warnings.warn('`model.predict_classes()` is deprecated and '\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.78      0.88      0.83       647\n",
      "           1       0.96      0.91      0.93      1851\n",
      "\n",
      "    accuracy                           0.90      2498\n",
      "   macro avg       0.87      0.90      0.88      2498\n",
      "weighted avg       0.91      0.90      0.91      2498\n",
      "\n",
      "[[ 568   79]\n",
      " [ 160 1691]]\n"
     ]
    }
   ],
   "source": [
    "rounded_predictions = model_conv.predict_classes(X_valid, batch_size=128, verbose=0)\n",
    "rounded_labels=np.argmax(Y_valid, axis=1)\n",
    "print(classification_report(rounded_labels,rounded_predictions))\n",
    "print(confusion_matrix(rounded_labels,rounded_predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1/100\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Sohail_Comp\\AppData\\Roaming\\Python\\Python37\\site-packages\\keras\\engine\\base_layer.py:1307: UserWarning: `layer.updates` will be removed in a future version. This property should not be used in TensorFlow 2.0, as `updates` are applied automatically.\n",
      "  warnings.warn('`layer.updates` will be removed in a future version. '\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "WARNING:tensorflow:`add_update` `inputs` kwarg has been deprecated. You no longer need to pass a value to `inputs` as it is being automatically inferred.\n",
      "1405/1405 - 1024s - loss: 0.5827 - accuracy: 0.7326 - mae: 0.3215 - mse: 0.2178 - precision: 0.3331 - recall: 0.0153 - f1_score: 0.0286 - val_loss: 0.5918 - val_accuracy: 0.7230 - val_mae: 0.3491 - val_mse: 0.2134 - val_precision: 0.3534 - val_recall: 0.0065 - val_f1_score: 0.0128\n",
      "Epoch 2/100\n"
     ]
    },
    {
     "ename": "KeyboardInterrupt",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-86-fcad73064f49>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     11\u001b[0m \u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mX_valid\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY_valid\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtrain_test_split\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mY\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtest_size\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m0.1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     12\u001b[0m \u001b[1;31m#Here we train the Network.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 13\u001b[1;33m \u001b[0mpred\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmodel_conv\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mY_train\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbatch_size\u001b[0m \u001b[1;33m=\u001b[0m\u001b[0mbatch_size\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mepochs\u001b[0m \u001b[1;33m=\u001b[0m\u001b[1;36m100\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mverbose\u001b[0m \u001b[1;33m=\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mvalidation_data\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mX_valid\u001b[0m\u001b[1;33m,\u001b[0m\u001b[0mY_valid\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     14\u001b[0m \u001b[0mpred\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\keras\\engine\\training.py\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)\u001b[0m\n\u001b[0;32m   1156\u001b[0m                 _r=1):\n\u001b[0;32m   1157\u001b[0m               \u001b[0mcallbacks\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mon_train_batch_begin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstep\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1158\u001b[1;33m               \u001b[0mtmp_logs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtrain_function\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0miterator\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1159\u001b[0m               \u001b[1;32mif\u001b[0m \u001b[0mdata_handler\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshould_sync\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1160\u001b[0m                 \u001b[0mcontext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masync_wait\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    887\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    888\u001b[0m       \u001b[1;32mwith\u001b[0m \u001b[0mOptionalXlaContext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jit_compile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 889\u001b[1;33m         \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_call\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    890\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    891\u001b[0m       \u001b[0mnew_tracing_count\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexperimental_get_tracing_count\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\def_function.py\u001b[0m in \u001b[0;36m_call\u001b[1;34m(self, *args, **kwds)\u001b[0m\n\u001b[0;32m    915\u001b[0m       \u001b[1;31m# In this case we have created variables on the first call, so we run the\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    916\u001b[0m       \u001b[1;31m# defunned version which is guaranteed to never create variables.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 917\u001b[1;33m       \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateless_fn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m  \u001b[1;31m# pylint: disable=not-callable\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    918\u001b[0m     \u001b[1;32melif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_stateful_fn\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    919\u001b[0m       \u001b[1;31m# Release the lock early so that multiple threads can perform the call\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m   3022\u001b[0m        filtered_flat_args) = self._maybe_define_function(args, kwargs)\n\u001b[0;32m   3023\u001b[0m     return graph_function._call_flat(\n\u001b[1;32m-> 3024\u001b[1;33m         filtered_flat_args, captured_inputs=graph_function.captured_inputs)  # pylint: disable=protected-access\n\u001b[0m\u001b[0;32m   3025\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   3026\u001b[0m   \u001b[1;33m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36m_call_flat\u001b[1;34m(self, args, captured_inputs, cancellation_manager)\u001b[0m\n\u001b[0;32m   1959\u001b[0m       \u001b[1;31m# No tape is watching; skip to running the function.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1960\u001b[0m       return self._build_call_outputs(self._inference_function.call(\n\u001b[1;32m-> 1961\u001b[1;33m           ctx, args, cancellation_manager=cancellation_manager))\n\u001b[0m\u001b[0;32m   1962\u001b[0m     forward_backward = self._select_forward_and_backward_functions(\n\u001b[0;32m   1963\u001b[0m         \u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\function.py\u001b[0m in \u001b[0;36mcall\u001b[1;34m(self, ctx, args, cancellation_manager)\u001b[0m\n\u001b[0;32m    594\u001b[0m               \u001b[0minputs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0margs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    595\u001b[0m               \u001b[0mattrs\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mattrs\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 596\u001b[1;33m               ctx=ctx)\n\u001b[0m\u001b[0;32m    597\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    598\u001b[0m           outputs = execute.execute_with_cancellation(\n",
      "\u001b[1;32m~\\AppData\\Roaming\\Python\\Python37\\site-packages\\tensorflow\\python\\eager\\execute.py\u001b[0m in \u001b[0;36mquick_execute\u001b[1;34m(op_name, num_outputs, inputs, attrs, ctx, name)\u001b[0m\n\u001b[0;32m     58\u001b[0m     \u001b[0mctx\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mensure_initialized\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     59\u001b[0m     tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,\n\u001b[1;32m---> 60\u001b[1;33m                                         inputs, attrs, num_outputs)\n\u001b[0m\u001b[0;32m     61\u001b[0m   \u001b[1;32mexcept\u001b[0m \u001b[0mcore\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_NotOkStatusException\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     62\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mname\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mKeyboardInterrupt\u001b[0m: "
     ]
    }
   ],
   "source": [
    "from keras import optimizers\n",
    "#Deep Learning Network Structure\n",
    "model_conv = Sequential()\n",
    "model_conv.add(Embedding(5000,100, input_length=X.shape[1]))\n",
    "model_conv.add(Dropout(0.5))\n",
    "model_conv.add(layers.GRU(256, return_sequences=True))\n",
    "model_conv.add(layers.SimpleRNN(128))\n",
    "model_conv.add(Dense(2, activation='softmax'))\n",
    "model_conv.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy','mae','mse',keras_metrics.precision(), keras_metrics.recall(),keras_metrics.f1_score()])\n",
    "batch_size=16\n",
    "X_train, X_valid, Y_train, Y_valid = train_test_split(X,Y, test_size = 0.1)\n",
    "#Here we train the Network.\n",
    "pred=model_conv.fit(X_train, Y_train, batch_size =batch_size, epochs =100, verbose =2,validation_data=(X_valid,Y_valid))\n",
    "pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Sohail_Comp\\AppData\\Roaming\\Python\\Python37\\site-packages\\keras\\engine\\sequential.py:450: UserWarning: `model.predict_classes()` is deprecated and will be removed after 2021-01-01. Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype(\"int32\")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).\n",
      "  warnings.warn('`model.predict_classes()` is deprecated and '\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              precision    recall  f1-score   support\n",
      "\n",
      "           0       0.84      0.88      0.86      3715\n",
      "           1       0.88      0.83      0.85      3785\n",
      "\n",
      "    accuracy                           0.86      7500\n",
      "   macro avg       0.86      0.86      0.86      7500\n",
      "weighted avg       0.86      0.86      0.86      7500\n",
      "\n",
      "[[3272  443]\n",
      " [ 642 3143]]\n"
     ]
    }
   ],
   "source": [
    "rounded_predictions = model_conv.predict_classes(X_valid, batch_size=128, verbose=0)\n",
    "rounded_labels=np.argmax(Y_valid, axis=1)\n",
    "print(classification_report(rounded_labels,rounded_predictions))\n",
    "print(confusion_matrix(rounded_labels,rounded_predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TF-IDF On Sample Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "a=[\"gwyneth paltrow absolute great movie\",\"own movie number movie didnt like choice do\", \"wish show would come back tel\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>absolute</th>\n",
       "      <th>back</th>\n",
       "      <th>choice</th>\n",
       "      <th>come</th>\n",
       "      <th>didnt</th>\n",
       "      <th>do</th>\n",
       "      <th>great</th>\n",
       "      <th>gwyneth</th>\n",
       "      <th>like</th>\n",
       "      <th>movie</th>\n",
       "      <th>number</th>\n",
       "      <th>own</th>\n",
       "      <th>paltrow</th>\n",
       "      <th>show</th>\n",
       "      <th>tel</th>\n",
       "      <th>wish</th>\n",
       "      <th>would</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   absolute  back  choice  come  didnt  do  great  gwyneth  like  movie  \\\n",
       "0         1     0       0     0      0   0      1        1     0      1   \n",
       "1         0     0       1     0      1   1      0        0     1      2   \n",
       "2         0     1       0     1      0   0      0        0     0      0   \n",
       "\n",
       "   number  own  paltrow  show  tel  wish  would  \n",
       "0       0    0        1     0    0     0      0  \n",
       "1       1    1        0     0    0     0      0  \n",
       "2       0    0        0     1    1     1      1  "
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "vectorizer1 = CountVectorizer(max_features=500)\n",
    "X_train_tf1 = vectorizer1.fit_transform(a)\n",
    "df1 = pd.DataFrame(X_train_tf1.toarray(), columns=vectorizer1.get_feature_names())\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>absolute</th>\n",
       "      <th>back</th>\n",
       "      <th>choice</th>\n",
       "      <th>come</th>\n",
       "      <th>didnt</th>\n",
       "      <th>do</th>\n",
       "      <th>great</th>\n",
       "      <th>gwyneth</th>\n",
       "      <th>like</th>\n",
       "      <th>movie</th>\n",
       "      <th>number</th>\n",
       "      <th>own</th>\n",
       "      <th>paltrow</th>\n",
       "      <th>show</th>\n",
       "      <th>tel</th>\n",
       "      <th>wish</th>\n",
       "      <th>would</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.467351</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.467351</td>\n",
       "      <td>0.467351</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.355432</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.467351</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.346821</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.346821</td>\n",
       "      <td>0.346821</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.346821</td>\n",
       "      <td>0.527533</td>\n",
       "      <td>0.346821</td>\n",
       "      <td>0.346821</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.408248</td>\n",
       "      <td>0.408248</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   absolute      back    choice      come     didnt        do     great  \\\n",
       "0  0.467351  0.000000  0.000000  0.000000  0.000000  0.000000  0.467351   \n",
       "1  0.000000  0.000000  0.346821  0.000000  0.346821  0.346821  0.000000   \n",
       "2  0.000000  0.408248  0.000000  0.408248  0.000000  0.000000  0.000000   \n",
       "\n",
       "    gwyneth      like     movie    number       own   paltrow      show  \\\n",
       "0  0.467351  0.000000  0.355432  0.000000  0.000000  0.467351  0.000000   \n",
       "1  0.000000  0.346821  0.527533  0.346821  0.346821  0.000000  0.000000   \n",
       "2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.408248   \n",
       "\n",
       "        tel      wish     would  \n",
       "0  0.000000  0.000000  0.000000  \n",
       "1  0.000000  0.000000  0.000000  \n",
       "2  0.408248  0.408248  0.408248  "
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "vectorizer1 = TfidfVectorizer(max_features=500)\n",
    "X_train_tf1 = vectorizer1.fit_transform(a)\n",
    "df1 = pd.DataFrame(X_train_tf1.toarray(), columns=vectorizer1.get_feature_names())\n",
    "df1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# T Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "t=-0.182, df=9\n",
      "Reject the null hypothesis that the means are not equal.\n"
     ]
    }
   ],
   "source": [
    "# t-test\n",
    "from math import sqrt\n",
    "from numpy.random import seed\n",
    "from numpy.random import randn\n",
    "from numpy import mean\n",
    "from scipy.stats import t\n",
    "\n",
    "# function for calculating the t-test for two dependent samples\n",
    "def dependent_ttest(data1, data2, alpha):\n",
    "\t# calculate means\n",
    "\tmean1, mean2 = mean(data1), mean(data2)\n",
    "\t# number of paired samples\n",
    "\tn = len(data1)\n",
    "\t# sum squared difference between observations\n",
    "\td1 = sum([(data1[i]-data2[i])**2 for i in range(n)])\n",
    "\t# sum difference between observations\n",
    "\td2 = sum([data1[i]-data2[i] for i in range(n)])\n",
    "\t# standard deviation of the difference between means\n",
    "\tsd = sqrt((d1 - (d2**2 / n)) / (n - 1))\n",
    "\t# standard error of the difference between the means\n",
    "\tsed = sd / sqrt(n)\n",
    "\t# calculate the t statistic\n",
    "\tt_stat = (mean1 - mean2) / sed\n",
    "\t# degrees of freedom\n",
    "\tdf = n - 1\n",
    "\t# calculate the critical value\n",
    "\tcv = t.ppf(1.0 - alpha, df)\n",
    "\t# calculate the p-value\n",
    "\tp = (1.0 - t.cdf(abs(t_stat), df)) * 2.0\n",
    "\t# return everything\n",
    "\treturn t_stat, df, cv, p\n",
    "\n",
    "# seed the random number generator\n",
    "seed(1)\n",
    "\n",
    "\n",
    "data1 = [0.92, 0.92, 0.88, 0.90, 0.96, 0.78, 0.87, 0.94, 0.82, 0.88]\n",
    "data2 = [0.89, 0.88, 0.90, 0.89, 0.90, 0.88, 0.89, 0.89, 0.89, 0.89]\n",
    "data3 = [0.92, 0.92, 0.88, 0.90, 0.96, 0.78, 0.87, 0.94, 0.82, 0.88]\n",
    "\n",
    "\n",
    "# calculate the t test\n",
    "alpha =0.5\n",
    "t_stat, df, cv, p = dependent_ttest(data1, data2, alpha)\n",
    "print('t=%.3f, df=%d' % (t_stat, df))\n",
    "# interpret via critical value\n",
    "if abs(t_stat) <= cv:\n",
    "\tprint('Accept null hypothesis that the means are equal.')\n",
    "else:\n",
    "\tprint('Reject the null hypothesis that the means are not equal.')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "IMDB.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}