{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**Importing Libraries**" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" }, "outputs": [], "source": [
"import ast\n",
"import operator\n",
"import re\n",
"\n",
"import numpy as np # linear algebra\n",
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"from textblob import TextBlob" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Loading Dataset**" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
"dataset = pd.read_csv(\"../input/daraz-sef/Daraz_Labeled_Standardized_Sentiments_Featured_Dataset(Updated).csv\")\n",
"# The unnamed first CSV column is used as the review id from here on.\n",
"dataset = dataset.rename(columns={\"Unnamed: 0\": \"Review_Id\"}, errors=\"raise\")\n",
"# keep=False removes every copy of a repeated review text, not only the extra copies.\n",
"dataset = dataset.drop_duplicates(subset=\"Reviews\", keep=False)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Excluding Spam Reviews**\n", "i.e. dropping reviews with Label 1 so that only reviews with Label 0 remain" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
"dataset = dataset.drop(dataset[dataset.Label == 1].index)" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Review_IdProduct_IDCustomer_NameCustomer_IDDateRatingLabelReviewsSentimentFeatures
00TE111EL1KXNGWNAFAMZitzmr767025/5/201710.0mil gya hai acha hai aur toota howa b nai haiPositive['toota']
11MI543EL02ACXSNAFAMZAfridi3963/28/201750.0aram sy pocket may aa jta hai aur quality b a...Positive['aram', 'pocket', 'quality']
22MI543EL02ACXSNAFAMZMuhammed37103/24/201720.0sb sy achi chez apki customer service bhot ac...Positive['customer', 'products', 'orignal']
33TE111EL1KXNGWNAFAMZGhazanfar18604/5/201730.0back cover acha hai sai cuts hai aur soft b h...Positive['back', 'cover', 'cuts', 'soft', 'protector',...
44SO638EL103OUWNAFAMZMurtaza iqbal377811/4/201740.0shru may may thumb grip k khalif thi kyu k ex...Negative['thumb', 'grip', 'chezy', 'stock', 'controlle...
.................................
39153915HA779HL1JDA5CNAFAMZAdny2k3685/29/201820.0iss article mein koi auto car tareeqay se saf...Positive['auto', 'car', 'tareeqay', 'safai']
39183918HA779HL1JDA5CNAFAMZAz11575/27/201830.0tasweer isi terhan tha jaisay tasweer aur tar...Positive['tasweer', 'tasweer', 'haftay', 'takheer', 'm...
39193919HA779HL1JDA5CNAFAMZZeeshan61355/22/201850.0waqt par masool hwa aaccha lagta hai mein iss...Neutral['waqt', 'masool', 'miyaar']
39203920HA779HL1JDA5CNAFAMZshan72185/21/201830.0masnoaat isi terhan ki thi jaisay website par...Positive['masnoaat', 'website', 'asal', 'tasweer', 'da...
39223922HA779HL1JDA5CNAFAMZFeroz18005/20/201810.0aik puranay model bheja gaya tha jo ab bazaar...Positive['puranay', 'model', 'bazaar', 'dastyab', 'plu...
\n", "

3308 rows × 10 columns

\n", "
" ], "text/plain": [ " Review_Id Product_ID Customer_Name Customer_ID Date \\\n", "0 0 TE111EL1KXNGWNAFAMZ itzmr7 6702 5/5/2017 \n", "1 1 MI543EL02ACXSNAFAMZ Afridi 396 3/28/2017 \n", "2 2 MI543EL02ACXSNAFAMZ Muhammed 3710 3/24/2017 \n", "3 3 TE111EL1KXNGWNAFAMZ Ghazanfar 1860 4/5/2017 \n", "4 4 SO638EL103OUWNAFAMZ Murtaza iqbal 3778 11/4/2017 \n", "... ... ... ... ... ... \n", "3915 3915 HA779HL1JDA5CNAFAMZ Adny2k 368 5/29/2018 \n", "3918 3918 HA779HL1JDA5CNAFAMZ Az 1157 5/27/2018 \n", "3919 3919 HA779HL1JDA5CNAFAMZ Zeeshan 6135 5/22/2018 \n", "3920 3920 HA779HL1JDA5CNAFAMZ shan 7218 5/21/2018 \n", "3922 3922 HA779HL1JDA5CNAFAMZ Feroz 1800 5/20/2018 \n", "\n", " Rating Label Reviews \\\n", "0 1 0.0 mil gya hai acha hai aur toota howa b nai hai \n", "1 5 0.0 aram sy pocket may aa jta hai aur quality b a... \n", "2 2 0.0 sb sy achi chez apki customer service bhot ac... \n", "3 3 0.0 back cover acha hai sai cuts hai aur soft b h... \n", "4 4 0.0 shru may may thumb grip k khalif thi kyu k ex... \n", "... ... ... ... \n", "3915 2 0.0 iss article mein koi auto car tareeqay se saf... \n", "3918 3 0.0 tasweer isi terhan tha jaisay tasweer aur tar... \n", "3919 5 0.0 waqt par masool hwa aaccha lagta hai mein iss... \n", "3920 3 0.0 masnoaat isi terhan ki thi jaisay website par... \n", "3922 1 0.0 aik puranay model bheja gaya tha jo ab bazaar... \n", "\n", " Sentiment Features \n", "0 Positive ['toota'] \n", "1 Positive ['aram', 'pocket', 'quality'] \n", "2 Positive ['customer', 'products', 'orignal'] \n", "3 Positive ['back', 'cover', 'cuts', 'soft', 'protector',... \n", "4 Negative ['thumb', 'grip', 'chezy', 'stock', 'controlle... \n", "... ... ... \n", "3915 Positive ['auto', 'car', 'tareeqay', 'safai'] \n", "3918 Positive ['tasweer', 'tasweer', 'haftay', 'takheer', 'm... \n", "3919 Neutral ['waqt', 'masool', 'miyaar'] \n", "3920 Positive ['masnoaat', 'website', 'asal', 'tasweer', 'da... \n", "3922 Positive ['puranay', 'model', 'bazaar', 'dastyab', 'plu... 
\n", "\n", "[3308 rows x 10 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Products and the number of reviews for each**" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1009" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Distinct product ids, in order of first appearance.\n",
"unique_Product_ID = dataset['Product_ID'].unique().tolist()\n",
"len(unique_Product_ID)" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
"# Count the reviews of every product in one pass over the column\n",
"# instead of filtering the whole frame once per product.\n",
"reviews_per_product = dataset['Product_ID'].value_counts(sort=False)\n",
"\n",
"# (product id, review count) pairs, most reviewed first. The sort is stable,\n",
"# so products with equal counts stay in first-appearance order.\n",
"unique_Product_ID_with_Count = sorted(\n",
"    ((product_id, int(reviews_per_product.get(product_id, 0))) for product_id in unique_Product_ID),\n",
"    key=operator.itemgetter(1),\n",
"    reverse=True,\n",
")" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
"Top_Reviewed_Products = unique_Product_ID_with_Count[:10]" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Selecting a product to Diversify**" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Review_IdProduct_IDCustomer_NameCustomer_IDDateRatingLabelReviewsSentimentFeatures
224224MA305FA1KY528NAFAMZAbdullah Shahid2518/31/201720.0achi terhan se iss ke sath mera tajurbah muk...Negative['tajurbah', 'mukammal', 'tor', 'zabardast', '...
225225MA305FA1KY528NAFAMZSuhayb53338/24/201730.0mujhe yeh apne dost ke liye mile hain qmizin...Negative['qmizin', 'qeemat', 'roz', 'pahannay', 'cotto...
226226MA305FA1KY528NAFAMZMalik Arslan29898/22/201730.0mein nai un shirts ka order diya tha aur sirf...Negative['shirts', 'order', 'order', 'haasil', 'safaid...
229229MA305FA1KY528NAFAMZQasim42328/21/201750.0bohat umdah miyaar theek hai waqai iss ko p...Positive['umdah', 'miyaar', 'pasand', 'garmiyon', 'roz...
230230MA305FA1KY528NAFAMZAsad9668/19/201710.0mein nay x ail shirts order ki lekin un ke s...Negative['shirts', 'order', 'size', 'wazeh', 'shirts',...
\n", "
" ], "text/plain": [ " Review_Id Product_ID Customer_Name Customer_ID Date \\\n", "224 224 MA305FA1KY528NAFAMZ Abdullah Shahid 251 8/31/2017 \n", "225 225 MA305FA1KY528NAFAMZ Suhayb 5333 8/24/2017 \n", "226 226 MA305FA1KY528NAFAMZ Malik Arslan 2989 8/22/2017 \n", "229 229 MA305FA1KY528NAFAMZ Qasim 4232 8/21/2017 \n", "230 230 MA305FA1KY528NAFAMZ Asad 966 8/19/2017 \n", "\n", " Rating Label Reviews \\\n", "224 2 0.0 achi terhan se iss ke sath mera tajurbah muk... \n", "225 3 0.0 mujhe yeh apne dost ke liye mile hain qmizin... \n", "226 3 0.0 mein nai un shirts ka order diya tha aur sirf... \n", "229 5 0.0 bohat umdah miyaar theek hai waqai iss ko p... \n", "230 1 0.0 mein nay x ail shirts order ki lekin un ke s... \n", "\n", " Sentiment Features \n", "224 Negative ['tajurbah', 'mukammal', 'tor', 'zabardast', '... \n", "225 Negative ['qmizin', 'qeemat', 'roz', 'pahannay', 'cotto... \n", "226 Negative ['shirts', 'order', 'order', 'haasil', 'safaid... \n", "229 Positive ['umdah', 'miyaar', 'pasand', 'garmiyon', 'roz... \n", "230 Negative ['shirts', 'order', 'size', 'wazeh', 'shirts',... " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Product whose reviews are diversified in the rest of the notebook.\n",
"ProductToDiversify_id = 'MA305FA1KY528NAFAMZ'\n",
"is_selected_product = dataset['Product_ID'] == ProductToDiversify_id\n",
"ProductDataset = dataset.loc[is_selected_product]\n",
"ProductDataset.head()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Separating Reviews according to their Sentiments**" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "27\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Review_IdProduct_IDCustomer_NameCustomer_IDDateRatingLabelReviewsSentimentFeatures
224224MA305FA1KY528NAFAMZAbdullah Shahid2518/31/201720.0achi terhan se iss ke sath mera tajurbah muk...Negative['tajurbah', 'mukammal', 'tor', 'zabardast', '...
225225MA305FA1KY528NAFAMZSuhayb53338/24/201730.0mujhe yeh apne dost ke liye mile hain qmizin...Negative['qmizin', 'qeemat', 'roz', 'pahannay', 'cotto...
226226MA305FA1KY528NAFAMZMalik Arslan29898/22/201730.0mein nai un shirts ka order diya tha aur sirf...Negative['shirts', 'order', 'order', 'haasil', 'safaid...
230230MA305FA1KY528NAFAMZAsad9668/19/201710.0mein nay x ail shirts order ki lekin un ke s...Negative['shirts', 'order', 'size', 'wazeh', 'shirts',...
261261MA305FA1KY528NAFAMZShayam Malik514511/8/201740.0un shirts mein sirf ghalti fitting thi woh ha...Negative['shirts', 'ghalti', 'fitting', 'halki', 'dhee...
\n", "
" ], "text/plain": [ " Review_Id Product_ID Customer_Name Customer_ID Date \\\n", "224 224 MA305FA1KY528NAFAMZ Abdullah Shahid 251 8/31/2017 \n", "225 225 MA305FA1KY528NAFAMZ Suhayb 5333 8/24/2017 \n", "226 226 MA305FA1KY528NAFAMZ Malik Arslan 2989 8/22/2017 \n", "230 230 MA305FA1KY528NAFAMZ Asad 966 8/19/2017 \n", "261 261 MA305FA1KY528NAFAMZ Shayam Malik 5145 11/8/2017 \n", "\n", " Rating Label Reviews \\\n", "224 2 0.0 achi terhan se iss ke sath mera tajurbah muk... \n", "225 3 0.0 mujhe yeh apne dost ke liye mile hain qmizin... \n", "226 3 0.0 mein nai un shirts ka order diya tha aur sirf... \n", "230 1 0.0 mein nay x ail shirts order ki lekin un ke s... \n", "261 4 0.0 un shirts mein sirf ghalti fitting thi woh ha... \n", "\n", " Sentiment Features \n", "224 Negative ['tajurbah', 'mukammal', 'tor', 'zabardast', '... \n", "225 Negative ['qmizin', 'qeemat', 'roz', 'pahannay', 'cotto... \n", "226 Negative ['shirts', 'order', 'order', 'haasil', 'safaid... \n", "230 Negative ['shirts', 'order', 'size', 'wazeh', 'shirts',... \n", "261 Negative ['shirts', 'ghalti', 'fitting', 'halki', 'dhee... " ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Reviews of the selected product whose Sentiment is 'Negative'.\n",
"is_negative = ProductDataset['Sentiment'] == 'Negative'\n",
"NegativeDataset = ProductDataset.loc[is_negative]\n",
"print(len(NegativeDataset))\n",
"NegativeDataset.head()" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "41\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Review_IdProduct_IDCustomer_NameCustomer_IDDateRatingLabelReviewsSentimentFeatures
229229MA305FA1KY528NAFAMZQasim42328/21/201750.0bohat umdah miyaar theek hai waqai iss ko p...Positive['umdah', 'miyaar', 'pasand', 'garmiyon', 'roz...
231231MA305FA1KY528NAFAMZMahnoor Mubashir29628/17/201740.0tea shirts ka packet bohat acha tha mein ye...Positive['tea', 'shirts', 'packet', 'chahta', 'rang', ...
232232MA305FA1KY528NAFAMZumar khattak73558/16/201740.0theek hai iss qeemat ke baarey mein mein yeh...Positive['qeemat', 'qmizin', 'umdah', 'tane', 'miyaar'...
305305MA305FA1KY528NAFAMZMustufa Dilpazir38142/8/201740.0mein daraz ka aksar gahak hon mujhe waqai mei...Positive['aksar', 'tea', 'shirts', 'pasand', 'miyaar',...
387387MA305FA1KY528NAFAMZNo name406010/7/201750.0shirts ki bohat achi quality sirf yeh dekh ka...Positive['shirts', 'quality', 'heran', 'shirts', 'pack...
\n", "
" ], "text/plain": [ " Review_Id Product_ID Customer_Name Customer_ID Date \\\n", "229 229 MA305FA1KY528NAFAMZ Qasim 4232 8/21/2017 \n", "231 231 MA305FA1KY528NAFAMZ Mahnoor Mubashir 2962 8/17/2017 \n", "232 232 MA305FA1KY528NAFAMZ umar khattak 7355 8/16/2017 \n", "305 305 MA305FA1KY528NAFAMZ Mustufa Dilpazir 3814 2/8/2017 \n", "387 387 MA305FA1KY528NAFAMZ No name 4060 10/7/2017 \n", "\n", " Rating Label Reviews \\\n", "229 5 0.0 bohat umdah miyaar theek hai waqai iss ko p... \n", "231 4 0.0 tea shirts ka packet bohat acha tha mein ye... \n", "232 4 0.0 theek hai iss qeemat ke baarey mein mein yeh... \n", "305 4 0.0 mein daraz ka aksar gahak hon mujhe waqai mei... \n", "387 5 0.0 shirts ki bohat achi quality sirf yeh dekh ka... \n", "\n", " Sentiment Features \n", "229 Positive ['umdah', 'miyaar', 'pasand', 'garmiyon', 'roz... \n", "231 Positive ['tea', 'shirts', 'packet', 'chahta', 'rang', ... \n", "232 Positive ['qeemat', 'qmizin', 'umdah', 'tane', 'miyaar'... \n", "305 Positive ['aksar', 'tea', 'shirts', 'pasand', 'miyaar',... \n", "387 Positive ['shirts', 'quality', 'heran', 'shirts', 'pack... " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Reviews of the selected product whose Sentiment is 'Positive'.\n",
"is_positive = ProductDataset['Sentiment'] == 'Positive'\n",
"PositiveDataset = ProductDataset.loc[is_positive]\n",
"print(len(PositiveDataset))\n",
"PositiveDataset.head()" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "12\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Review_IdProduct_IDCustomer_NameCustomer_IDDateRatingLabelReviewsSentimentFeatures
306306MA305FA1KY528NAFAMZMrs33301/8/201750.0ya asan h istemaal may keemat b kaam hai aur ...Neutral['asan', 'keemat', 'saaf']
336336MA305FA1KY528NAFAMZKhan26367/30/201710.0aik shirt khrb hai aur blue wali may whole haiNeutral['khrb', 'blue']
533533MA305FA1KY528NAFAMZAzam11606/17/201720.0mein ne poooray shirts mein poori terhan se ...Neutral['poooray', 'shirts', 'paaya', 'safaid', 'peel...
729729MA305FA1KY528NAFAMZIrfan Bin Hakim23455/26/201720.0mein ne safaid aur gehra surkh rang ke liye ...Neutral['safaid', 'rang', 'order', 'rang', 'safaid', ...
807807MA305FA1KY528NAFAMZhina timothy663211/5/201740.0yeh qmizin iss mausam garma mein meri jane wa...Neutral['qmizin', 'mausam', 'garma', 'rangeen', 'muta...
\n", "
" ], "text/plain": [ " Review_Id Product_ID Customer_Name Customer_ID Date \\\n", "306 306 MA305FA1KY528NAFAMZ Mrs 3330 1/8/2017 \n", "336 336 MA305FA1KY528NAFAMZ Khan 2636 7/30/2017 \n", "533 533 MA305FA1KY528NAFAMZ Azam 1160 6/17/2017 \n", "729 729 MA305FA1KY528NAFAMZ Irfan Bin Hakim 2345 5/26/2017 \n", "807 807 MA305FA1KY528NAFAMZ hina timothy 6632 11/5/2017 \n", "\n", " Rating Label Reviews \\\n", "306 5 0.0 ya asan h istemaal may keemat b kaam hai aur ... \n", "336 1 0.0 aik shirt khrb hai aur blue wali may whole hai \n", "533 2 0.0 mein ne poooray shirts mein poori terhan se ... \n", "729 2 0.0 mein ne safaid aur gehra surkh rang ke liye ... \n", "807 4 0.0 yeh qmizin iss mausam garma mein meri jane wa... \n", "\n", " Sentiment Features \n", "306 Neutral ['asan', 'keemat', 'saaf'] \n", "336 Neutral ['khrb', 'blue'] \n", "533 Neutral ['poooray', 'shirts', 'paaya', 'safaid', 'peel... \n", "729 Neutral ['safaid', 'rang', 'order', 'rang', 'safaid', ... \n", "807 Neutral ['qmizin', 'mausam', 'garma', 'rangeen', 'muta... 
" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Reviews of the selected product whose Sentiment is 'Neutral'.\n",
"is_neutral = ProductDataset['Sentiment'] == 'Neutral'\n",
"NeutralDataset = ProductDataset.loc[is_neutral]\n",
"print(len(NeutralDataset))\n",
"NeutralDataset.head()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Defining Functions**" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
"def Get_Dataset(k):\n",
"    \"\"\"Return the review subset for a diversification key.\n",
"\n",
"    'alpha' selects PositiveDataset and 'beta' selects NegativeDataset;\n",
"    any other key (the main loop passes 'gamma') selects NeutralDataset.\n",
"    \"\"\"\n",
"    if k == 'alpha':\n",
"        return PositiveDataset\n",
"    if k == 'beta':\n",
"        return NegativeDataset\n",
"    return NeutralDataset" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
"def Get_Feature_Set(Features):\n",
"    \"\"\"Count how often each feature occurs across a collection of reviews.\n",
"\n",
"    Features is an iterable with one string per review; each string is the\n",
"    text of a Python list of feature names and is parsed with\n",
"    ast.literal_eval.\n",
"\n",
"    Returns a dict with two entries:\n",
"      'Feature_List'  - the distinct features in order of first appearance\n",
"      'feature_count' - {feature: occurrences}, most frequent first; features\n",
"                        with equal counts keep first-appearance order\n",
"    \"\"\"\n",
"    Feature_List = []\n",
"    Feature_count = {}\n",
"    for featurelist in Features:\n",
"        for feature in ast.literal_eval(featurelist):\n",
"            # The dict doubles as the 'seen' check, so no list scan is needed.\n",
"            if feature not in Feature_count:\n",
"                Feature_count[feature] = 0\n",
"                Feature_List.append(feature)\n",
"            Feature_count[feature] += 1\n",
"\n",
"    # Stable descending sort: ties stay in first-appearance order.\n",
"    ranked = sorted(Feature_count.items(), key=operator.itemgetter(1), reverse=True)\n",
"    return {'feature_count': dict(ranked), 'Feature_List': Feature_List}" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# f = Get_Feature_Set(PositiveDataset.Features)\n", "\n", "# k = f['feature_count']" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
"def Features_Of_Review_wrt_F(F, review):\n",
"    \"\"\"Return the features of one review that are still present in F.\n",
"\n",
"    F is the collection of features not covered yet. review is the text of\n",
"    the review's feature list (parsed with ast.literal_eval), not the review\n",
"    body. The order and the repeats of the review's own list are preserved.\n",
"    \"\"\"\n",
"    remaining = set(F)  # one set build instead of a list scan per feature\n",
"    return [feature for feature in ast.literal_eval(review) if feature in remaining]" ] },
{ "cell_type": "code", "execution_count": 16,
"metadata": {}, "outputs": [], "source": [
"def Utility_of_Review(F, F_Count, f_asteric):\n",
"    \"\"\"Score a review by the weighted count of uncovered features it mentions.\n",
"\n",
"    F is the list of features not covered yet, F_Count maps a feature to its\n",
"    number of occurrences, and f_asteric lists the review's features that are\n",
"    in F (a repeated feature counts every time it appears).\n",
"\n",
"    Each feature weighs F_Count[f] divided by the largest count among the\n",
"    features of F that the review does NOT mention. Returns 0 when f_asteric\n",
"    is empty.\n",
"    \"\"\"\n",
"    mentioned = set(f_asteric)\n",
"    # If the review mentions every remaining feature there is nothing to\n",
"    # normalise against (this used to raise IndexError); dividing by 1 scores\n",
"    # the review by its raw counts instead.\n",
"    top_other_count = max((F_Count[f] for f in F if f not in mentioned), default=1)\n",
"\n",
"    utility_of_review = 0\n",
"    for f in f_asteric:\n",
"        utility_of_review += F_Count[f] / top_other_count\n",
"    return utility_of_review" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# pfd = pd.DataFrame()" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
"def features_of_Max_Reviews(Features, F):\n",
"    \"\"\"Return the features of F that the given review does not mention.\n",
"\n",
"    Features is the text of the selected review's feature list (parsed with\n",
"    ast.literal_eval). The result keeps the order of F.\n",
"    \"\"\"\n",
"    covered = set(ast.literal_eval(Features))\n",
"    return [feature for feature in F if feature not in covered]" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**SeFOD- Main Algorithm Code**" ] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [
"# Number of reviews to pick per subset; Get_Dataset maps 'alpha' to the\n",
"# positive, 'beta' to the negative and 'gamma' to the neutral reviews.\n",
"K = {'alpha': 8, 'beta': 8, 'gamma': 4}\n",
"columns = dataset.columns\n",
"selected_reviews = []\n",
"for k in K:\n",
"    R = Get_Dataset(k)\n",
"    ret = Get_Feature_Set(R.Features)\n",
"    F = ret['Feature_List']\n",
"    F_Count = ret['feature_count']\n",
"    for _ in range(K[k]):\n",
"        if R.empty:\n",
"            # Fewer reviews than requested for this subset: nothing left to pick.\n",
"            break\n",
"        utility_by_review_id = {}\n",
"        for review, Review_Id in zip(R.Features, R.Review_Id):\n",
"            f_asteric = Features_Of_Review_wrt_F(F, review)\n",
"            utility_by_review_id[Review_Id] = Utility_of_Review(F, F_Count, f_asteric)\n",
"        # max() returns the first maximum, so the earliest review wins a tie.\n",
"        max_score_review = max(utility_by_review_id.items(), key=operator.itemgetter(1))[0]\n",
"\n",
"        MaxReviewDataset = R.loc[R['Review_Id'] == max_score_review]\n",
"        R = R.loc[R['Review_Id'] != max_score_review]\n",
"        # The chosen review's features count as covered from now on.\n",
"        for review in MaxReviewDataset.Features:\n",
"            F = features_of_Max_Reviews(review, F)\n",
"        selected_reviews.append(MaxReviewDataset)\n",
"\n",
"# Concatenate once instead of growing the frame inside the loop.\n",
"if selected_reviews:\n",
"    Diversified_Set = pd.concat(selected_reviews)\n",
"else:\n",
"    Diversified_Set = pd.DataFrame(columns=columns)\n" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Displaying Diversified Reviews**" ] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Review_IdCustomer_NameReviewsFeatures
0231Mahnoor Mubashirtea shirts ka packet bohat acha tha mein ye...['tea', 'shirts', 'packet', 'chahta', 'rang', ...
1391NADEEM ABBASjab mein un ki qeemat par ghhor karta hon tu...['qeemat', 'tea', 'shirts', 'qabil', 'neh', 'q...
2919Ahsan Malikyeh tea herat angaiz hain aur mein nay websi...['tea', 'herat', 'angaiz', 'website', 'dekhty'...
3841behroz alimujhe iss package mein mojood tamam shirts bo...['package', 'mojood', 'tamam', 'shirts', 'pasa...
4877Bilal Ali brohitamam shirts mein se yeh pack behtareen hai ...['tamam', 'shirts', 'pack', 'behtareen', 'aram...
51085Abid Zafarperfect size perfect fitting aur perfect colo...['perfect', 'size', 'perfect', 'fitting', 'per...
61040Azlan Khanyeh pack of iss price ke hisaab se acha hai ...['pack', 'hisaab', 'bus', 'masla', 'tshirts', ...
71083Ameen Yousufmunasib keemat mein achi quality ki shirts ey...['munasib', 'keemat', 'quality', 'shirts', 'ba...
8342Adil Aslammein ne un tea shirts ka order diya tha aur ...['tea', 'shirts', 'order', 'vaqata', 'satisfie...
93611Muradtarseel prompt aur presentation achay miyaar...['prompt', 'presentation', 'miyaar', 'darmiyan...
10225Suhaybmujhe yeh apne dost ke liye mile hain qmizin...['qmizin', 'qeemat', 'roz', 'pahannay', 'cotto...
11879Abdullahmein tawaqqa nahi kar raha tha ke un shirts ...['tawaqqa', 'shirts', 'shirts', 'miyaar', 'ara...
121057Abdullahnu doubt quality bht achi ha magar mene yeh w...['doubt', 'quality', 'white', 'order', 'packag...
13261Shayam Malikun shirts mein sirf ghalti fitting thi woh ha...['shirts', 'ghalti', 'fitting', 'halki', 'dhee...
14842Usman Rasheedpakistan mein meri pehli online shopping mei...['online', 'shopping', 'herat', 'umdah', 'sama...
15224Abdullah Shahidachi terhan se iss ke sath mera tajurbah muk...['tajurbah', 'mukammal', 'tor', 'zabardast', '...
16925Ammad Alimeri pehli online shopping mujhe herat zada h...['online', 'shopping', 'herat', 'herat', 'anga...
17729Irfan Bin Hakimmein ne safaid aur gehra surkh rang ke liye ...['safaid', 'rang', 'order', 'rang', 'safaid', ...
18807hina timothyyeh qmizin iss mausam garma mein meri jane wa...['qmizin', 'mausam', 'garma', 'rangeen', 'muta...
19845Aunmein ne jaali saazi ki numayesh karte hue waz...['jaali', 'numayesh', 'wazeh', 'tor', 'tasavee...
\n", "
" ], "text/plain": [ " Review_Id Customer_Name \\\n", "0 231 Mahnoor Mubashir \n", "1 391 NADEEM ABBAS \n", "2 919 Ahsan Malik \n", "3 841 behroz ali \n", "4 877 Bilal Ali brohi \n", "5 1085 Abid Zafar \n", "6 1040 Azlan Khan \n", "7 1083 Ameen Yousuf \n", "8 342 Adil Aslam \n", "9 3611 Murad \n", "10 225 Suhayb \n", "11 879 Abdullah \n", "12 1057 Abdullah \n", "13 261 Shayam Malik \n", "14 842 Usman Rasheed \n", "15 224 Abdullah Shahid \n", "16 925 Ammad Ali \n", "17 729 Irfan Bin Hakim \n", "18 807 hina timothy \n", "19 845 Aun \n", "\n", " Reviews \\\n", "0 tea shirts ka packet bohat acha tha mein ye... \n", "1 jab mein un ki qeemat par ghhor karta hon tu... \n", "2 yeh tea herat angaiz hain aur mein nay websi... \n", "3 mujhe iss package mein mojood tamam shirts bo... \n", "4 tamam shirts mein se yeh pack behtareen hai ... \n", "5 perfect size perfect fitting aur perfect colo... \n", "6 yeh pack of iss price ke hisaab se acha hai ... \n", "7 munasib keemat mein achi quality ki shirts ey... \n", "8 mein ne un tea shirts ka order diya tha aur ... \n", "9 tarseel prompt aur presentation achay miyaar... \n", "10 mujhe yeh apne dost ke liye mile hain qmizin... \n", "11 mein tawaqqa nahi kar raha tha ke un shirts ... \n", "12 nu doubt quality bht achi ha magar mene yeh w... \n", "13 un shirts mein sirf ghalti fitting thi woh ha... \n", "14 pakistan mein meri pehli online shopping mei... \n", "15 achi terhan se iss ke sath mera tajurbah muk... \n", "16 meri pehli online shopping mujhe herat zada h... \n", "17 mein ne safaid aur gehra surkh rang ke liye ... \n", "18 yeh qmizin iss mausam garma mein meri jane wa... \n", "19 mein ne jaali saazi ki numayesh karte hue waz... \n", "\n", " Features \n", "0 ['tea', 'shirts', 'packet', 'chahta', 'rang', ... \n", "1 ['qeemat', 'tea', 'shirts', 'qabil', 'neh', 'q... \n", "2 ['tea', 'herat', 'angaiz', 'website', 'dekhty'... \n", "3 ['package', 'mojood', 'tamam', 'shirts', 'pasa... 
\n", "4 ['tamam', 'shirts', 'pack', 'behtareen', 'aram... \n", "5 ['perfect', 'size', 'perfect', 'fitting', 'per... \n", "6 ['pack', 'hisaab', 'bus', 'masla', 'tshirts', ... \n", "7 ['munasib', 'keemat', 'quality', 'shirts', 'ba... \n", "8 ['tea', 'shirts', 'order', 'vaqata', 'satisfie... \n", "9 ['prompt', 'presentation', 'miyaar', 'darmiyan... \n", "10 ['qmizin', 'qeemat', 'roz', 'pahannay', 'cotto... \n", "11 ['tawaqqa', 'shirts', 'shirts', 'miyaar', 'ara... \n", "12 ['doubt', 'quality', 'white', 'order', 'packag... \n", "13 ['shirts', 'ghalti', 'fitting', 'halki', 'dhee... \n", "14 ['online', 'shopping', 'herat', 'umdah', 'sama... \n", "15 ['tajurbah', 'mukammal', 'tor', 'zabardast', '... \n", "16 ['online', 'shopping', 'herat', 'herat', 'anga... \n", "17 ['safaid', 'rang', 'order', 'rang', 'safaid', ... \n", "18 ['qmizin', 'mausam', 'garma', 'rangeen', 'muta... \n", "19 ['jaali', 'numayesh', 'wazeh', 'tor', 'tasavee... " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [
"Diversified_Set = Diversified_Set.reset_index(drop=True)\n",
"Diversified_Set[['Review_Id', 'Customer_Name', 'Reviews', 'Features']]" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Creating Download Link for Diversified Dataset**" ] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [
"temp_dataset = Diversified_Set[['Review_Id', 'Customer_Name', 'Reviews', 'Features']]" ] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Download created CSV file" ], "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [
"import base64\n",
"from html import escape\n",
"\n",
"from IPython.display import HTML\n",
"\n",
"\n",
"def create_download_link(df, title=\"Download created CSV file\", filename=\"Daraz_Diversified_Dataset.csv\"):\n",
"    \"\"\"Return an HTML link that downloads df as a CSV file.\n",
"\n",
"    The CSV text is embedded in the link as a base64 data URI, so this only\n",
"    works for small frames (roughly below 2MB). title is the link text and\n",
"    filename is the name offered for the downloaded file.\n",
"    \"\"\"\n",
"    payload = base64.b64encode(df.to_csv().encode()).decode()\n",
"    # The template has to be a real anchor: with only '{title}' in it the\n",
"    # payload and the filename were never used and nothing could be downloaded.\n",
"    link_template = '<a download=\"{filename}\" href=\"data:text/csv;base64,{payload}\" target=\"_blank\">{title}</a>'\n",
"    # title and filename are escaped so quotes or angle brackets cannot break the markup.\n",
"    return HTML(link_template.format(payload=payload, title=escape(title), filename=escape(filename)))\n",
"\n",
"\n",
"# create a link to download the dataframe\n",
"create_download_link(temp_dataset)" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "**Calculating Scores for Review**" ] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
"# A feature's score is the inverse of its number of occurrences in the\n",
"# diversified set, so rarely mentioned features weigh more.\n",
"ret = Get_Feature_Set(Diversified_Set.Features)\n",
"features_count = ret['feature_count']\n",
"feature_score = {feature: 1 / count for feature, count in features_count.items()}" ] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [
"# A review's score is the sum of the scores of its features, repeats included.\n",
"review_score_list = []\n",
"for featureSet in Diversified_Set.Features:\n",
"    score = 0\n",
"    for feature in ast.literal_eval(featureSet):\n",
"        score += feature_score[feature]\n",
"    review_score_list.append(score)\n",
"Diversified_Set['Score'] = review_score_list\n",
"\n",
"# Sort a copy: the list above has just been handed to the DataFrame.\n",
"review_score_list = sorted(review_score_list, reverse=True)" ] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
"# Highest review score; 0 when no review was selected.\n",
"max_review_Score = review_score_list[0] if review_score_list else 0" ] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "7.137454201631012" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [
"# Scores relative to the best review, accumulated in row order.\n",
"Normalised_score = []\n",
"normalised_score_sum = 0\n",
"for score in Diversified_Set['Score']:\n",
"    # An all-zero score column stays zero instead of dividing by zero.\n",
"    Nscore = score / max_review_Score if max_review_Score else 0.0\n",
"    Normalised_score.append(Nscore)\n",
"    normalised_score_sum += Nscore\n",
"normalised_score_sum" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3"
}, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 4 }