{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "**Importing Libraries**" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" }, "outputs": [], "source": [ "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "import ast\n", "import operator\n", "from textblob import TextBlob\n", "import pandas as pd\n", "import re\n", "import nltk\n", "from nltk import pos_tag\n", "from nltk.tokenize import word_tokenize\n", "from nltk.corpus import stopwords\n", "from nltk.stem.porter import PorterStemmer\n", "from nltk.stem import LancasterStemmer, WordNetLemmatizer\n", "import re, string, unicodedata\n", "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Loading Dataset**" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User_idProduct_idRatingDateReviewLabel
09230312/8/2014The food at snack is a selection of popular Gr...-1
1924035/16/2013This little place in Soho is wonderful. I had ...-1
2925047/1/2013ordered lunch for 15 from Snack last Friday. Â...-1
3926047/28/2011This is a beautiful quaint little restaurant o...-1
49270411/1/2010Snack is great place for a  casual sit down l...-1
\n", "
" ], "text/plain": [ " User_id Product_id Rating Date \\\n", "0 923 0 3 12/8/2014 \n", "1 924 0 3 5/16/2013 \n", "2 925 0 4 7/1/2013 \n", "3 926 0 4 7/28/2011 \n", "4 927 0 4 11/1/2010 \n", "\n", " Review Label \n", "0 The food at snack is a selection of popular Gr... -1 \n", "1 This little place in Soho is wonderful. I had ... -1 \n", "2 ordered lunch for 15 from Snack last Friday. Â... -1 \n", "3 This is a beautiful quaint little restaurant o... -1 \n", "4 Snack is great place for a  casual sit down l... -1 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset = pd.read_csv(\"../input/yelp-labelled-dataset/Labelled Yelp Dataset.csv\")\n", "dataset.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Data Prepration**" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "Review_Id= []\n", "for i in range(len(dataset)):\n", " Review_Id.append(i)\n", "dataset['Review_Id']=Review_Id" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ " 1 322167\n", "-1 36885\n", "Name: Label, dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset['Label'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Replacing Labels to keep similarity as other notebooks" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 322167\n", "1 36885\n", "Name: Label, dtype: int64" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset['Label']=dataset['Label'].replace(1,0)\n", "dataset['Label']=dataset['Label'].replace(-1,1)\n", "dataset['Label'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Renaming Columns. You can renname other columns like product_id, customer-id too in order to not modify the code." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "dataset = dataset.rename(columns={\"Review\": \"Reviews\", 'Product_id': 'Product_ID'}, errors=\"raise\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Excluding Spam Reviews**\n", "i.e. Review with Label 1" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 322167\n", "Name: Label, dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "dataset.drop(dataset[dataset.Label == 1].index, inplace=True)\n", "dataset['Label'].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Removing Duplicate Reviews" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "dataset.drop_duplicates(subset =\"Reviews\",keep = False, inplace = True)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "321550\n" ] } ], "source": [ "print(len(dataset))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Looking for products with Highest Number of Reviews**" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "923" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unique_Product_ID=[]\n", "for Product_ID in dataset['Product_ID']:\n", " if Product_ID not in unique_Product_ID:\n", " unique_Product_ID.append(Product_ID)\n", "len(unique_Product_ID)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "unique_Product_ID_with_Count = {}\n", "for C_ID in unique_Product_ID:\n", " len_Of_C_ID = len(dataset.loc[dataset['Product_ID'] == C_ID])\n", " unique_Product_ID_with_Count[C_ID]= len_Of_C_ID\n", "\n", "unique_Product_ID_with_Count = sorted(unique_Product_ID_with_Count.items(), key=operator.itemgetter(1) , reverse=True)" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(247, 6648),\n", " (555, 5818),\n", " (542, 3992),\n", " (465, 3382),\n", " (91, 2940),\n", " (468, 2917),\n", " (72, 2800),\n", " (211, 2588),\n", " (668, 2576),\n", " (100, 2493)]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Top_Reviewed_Products = unique_Product_ID_with_Count[:10]\n", "Top_Reviewed_Products" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Selecting a product to Diversify**" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User_idProduct_IDRatingDateReviewsLabelReview_Id
3470725611824751/6/2015So good, if very slightly salty. No outside fo...0347072
3470734539224741/6/2015I'm a fan of Ramen (tried Ippudo first in 2010...0347073
34707415768224741/6/2015I LOVE their pulled pork. It's absolutely deli...0347074
34707510073424731/5/2015I ordered the modern ramen with black garlic s...0347075
3470765650624751/5/2015What? You haven't been to Ippudo yet? *slaps f...0347076
\n", "
" ], "text/plain": [ " User_id Product_ID Rating Date \\\n", "347072 56118 247 5 1/6/2015 \n", "347073 45392 247 4 1/6/2015 \n", "347074 157682 247 4 1/6/2015 \n", "347075 100734 247 3 1/5/2015 \n", "347076 56506 247 5 1/5/2015 \n", "\n", " Reviews Label Review_Id \n", "347072 So good, if very slightly salty. No outside fo... 0 347072 \n", "347073 I'm a fan of Ramen (tried Ippudo first in 2010... 0 347073 \n", "347074 I LOVE their pulled pork. It's absolutely deli... 0 347074 \n", "347075 I ordered the modern ramen with black garlic s... 0 347075 \n", "347076 What? You haven't been to Ippudo yet? *slaps f... 0 347076 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ProductToDiversify_id=Top_Reviewed_Products[0][0]\n", "ProductDataset = dataset.loc[dataset['Product_ID'] == ProductToDiversify_id]\n", "ProductDataset.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Dataset Cleaning**" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def remove_non_ascii(words):\n", " \"\"\"Remove non-ASCII characters from list of tokenized words\"\"\"\n", " new_words = []\n", " for word in words:\n", " new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')\n", " new_words.append(new_word)\n", " return new_words\n", "\n", "def to_lowercase(words):\n", " \"\"\"Convert all characters to lowercase from list of tokenized words\"\"\"\n", " new_words = []\n", " for word in words:\n", " new_word = word.lower()\n", " new_words.append(new_word)\n", " return new_words\n", "\n", "def remove_punctuation(words):\n", " \"\"\"Remove punctuation from list of tokenized words\"\"\"\n", " new_words = []\n", " for word in words:\n", " new_word = re.sub(r'[^\\w\\s]', '', word)\n", " if new_word != '':\n", " new_words.append(new_word)\n", " return new_words\n", "\n", "def replace_numbers(words):\n", " \"\"\"Replace all interger occurrences in list of tokenized words with textual representation\"\"\"\n", " p = inflect.engine()\n", " new_words = []\n", " for word in words:\n", " if word.isdigit():\n", " new_word = p.number_to_words(word)\n", " new_words.append(new_word)\n", " else:\n", " new_words.append(word)\n", " return new_words\n", "\n", "def remove_stopwords(words):\n", " \"\"\"Remove stop words from list of tokenized words\"\"\"\n", " new_words = []\n", " for word in words:\n", " if word not in stopwords.words('english'):\n", " new_words.append(word)\n", " return new_words\n", "\n", "def stem_words(words):\n", " \"\"\"Stem words in list of tokenized words\"\"\"\n", " stemmer = LancasterStemmer()\n", " stems = []\n", " for word in words:\n", " stem = stemmer.stem(word)\n", " stems.append(stem)\n", " return stems\n", "\n", "def lemmatize_verbs(words):\n", " \"\"\"Lemmatize verbs in list of tokenized words\"\"\"\n", " lemmatizer = WordNetLemmatizer()\n", " lemmas = []\n", " for word in words:\n", " lemma = lemmatizer.lemmatize(word, pos='v')\n", " lemmas.append(lemma)\n", " return lemmas\n", "\n", "def normalize(words):\n", " words = remove_non_ascii(words)\n", " words = to_lowercase(words)\n", " words = remove_punctuation(words)\n", " #words = replace_numbers(words)\n", " words = remove_stopwords(words)\n", " return words\n", "\n", "#Steemming and Lemmatization\n", "def stem_and_lemmatize(words):\n", " stems = stem_words(words)\n", " lemmas = lemmatize_verbs(words)\n", " return stems, lemmas\n", "\n", "#stems, lemmas = stem_and_lemmatize(words)\n", "#print('Stemmed:\\n', stems)\n", "#print('\\nLemmatized:\\n', lemmas)\n", "\n", "\n", "# --------------- Cleaning ------------------\n", "def clean_text(text):\n", " wording = nltk.word_tokenize(text)\n", " words = normalize(wording)\n", " string_text = ' '.join(words)\n", " return string_text" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "ProductDataset['Original Reviews'] = ProductDataset['Reviews']" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "cleaned_reviews=[]\n", "for review in ProductDataset['Reviews']:\n", " row=clean_text(review)\n", " cleaned_reviews.append(row)\n", "ProductDataset['Reviews']= cleaned_reviews" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add Sentiments using Textblob" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# --------------- Sentiment ------------------\n", "def get_text_sentiment(text):\n", " # create TextBlob object of passed text \n", " analysis = TextBlob(text)\n", " # set sentiment \n", " if analysis.sentiment.polarity > 0: \n", " return 'Positive'\n", " elif analysis.sentiment.polarity == 0: \n", " return 'Neutral'\n", " else: \n", " return 'Negative' \n", " \n", "# --------------- Features ------------------ \n", "def feature_extraction(text): \n", " blob = TextBlob(text)\n", " return blob.noun_phrases" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Add Features using Textblob" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "features = []\n", "sentiments =[]\n", "for review in ProductDataset['Reviews']:\n", " features.append(feature_extraction(review))\n", " sentiments.append(get_text_sentiment(review)) \n", " \n", "ProductDataset['Features'] = features\n", "ProductDataset['Sentiment'] = sentiments" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User_idProduct_IDRatingDateReviewsLabelReview_IdOriginal ReviewsFeaturesSentiment
3470725611824751/6/2015good slightly salty outside food reservations ...0347072So good, if very slightly salty. No outside fo...[food reservations]Positive
3470734539224741/6/2015fan ramen tried ippudo first 2010 ramen option...0347073I'm a fan of Ramen (tried Ippudo first in 2010...[fan ramen, ramen options, fair portion pay, a...Positive
34707415768224741/6/2015love pulled pork absolutely delicious mustget ...0347074I LOVE their pulled pork. It's absolutely deli...[delicious mustget, service food good eel rice...Positive
34707510073424731/5/2015ordered modern ramen black garlic sauce pretty...0347075I ordered the modern ramen with black garlic s...[modern ramen, black garlic sauce, pretty good...Positive
3470765650624751/5/2015nt ippudo yet slaps face go best restaurant wo...0347076What? You haven't been to Ippudo yet? *slaps f...[nt ippudo, restaurant world bye]Positive
\n", "
" ], "text/plain": [ " User_id Product_ID Rating Date \\\n", "347072 56118 247 5 1/6/2015 \n", "347073 45392 247 4 1/6/2015 \n", "347074 157682 247 4 1/6/2015 \n", "347075 100734 247 3 1/5/2015 \n", "347076 56506 247 5 1/5/2015 \n", "\n", " Reviews Label Review_Id \\\n", "347072 good slightly salty outside food reservations ... 0 347072 \n", "347073 fan ramen tried ippudo first 2010 ramen option... 0 347073 \n", "347074 love pulled pork absolutely delicious mustget ... 0 347074 \n", "347075 ordered modern ramen black garlic sauce pretty... 0 347075 \n", "347076 nt ippudo yet slaps face go best restaurant wo... 0 347076 \n", "\n", " Original Reviews \\\n", "347072 So good, if very slightly salty. No outside fo... \n", "347073 I'm a fan of Ramen (tried Ippudo first in 2010... \n", "347074 I LOVE their pulled pork. It's absolutely deli... \n", "347075 I ordered the modern ramen with black garlic s... \n", "347076 What? You haven't been to Ippudo yet? *slaps f... \n", "\n", " Features Sentiment \n", "347072 [food reservations] Positive \n", "347073 [fan ramen, ramen options, fair portion pay, a... Positive \n", "347074 [delicious mustget, service food good eel rice... Positive \n", "347075 [modern ramen, black garlic sauce, pretty good... Positive \n", "347076 [nt ippudo, restaurant world bye] Positive " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ProductDataset.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Separating Reviews according to their Sentiments**" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "274\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User_idProduct_IDRatingDateReviewsLabelReview_IdOriginal ReviewsFeaturesSentiment
347094157688247112/29/2014well u want quoted 2 hour wait think lot well ...0347094Well if u want to be quoted 2 hour wait and th...[horses bc, awful nt]Negative
34709913270247112/28/2014ippudo rated restaurant nyc area year 2014 sma...0347099IPPUDO is the most rated restaurant in the nyc...[restaurant nyc area year, small review, veget...Negative
347117157697247312/20/2014place overated wait long food pricey running l...0347117This place is overated.  The wait was long an...[food pricey, nt seat friend, party friend lie...Negative
347127157699247312/15/2014place used five star place us recently cut bac...0347127This place used to be a five star place for us...[star place, portion size, rid pork, ramen nt,...Negative
34714875089247312/5/2014came way st marks ippudo ramen average quality...0347148Came all the way to St. marks just for ippudo....[st marks ippudo ramen average quality, ramen ...Negative
\n", "
" ], "text/plain": [ " User_id Product_ID Rating Date \\\n", "347094 157688 247 1 12/29/2014 \n", "347099 13270 247 1 12/28/2014 \n", "347117 157697 247 3 12/20/2014 \n", "347127 157699 247 3 12/15/2014 \n", "347148 75089 247 3 12/5/2014 \n", "\n", " Reviews Label Review_Id \\\n", "347094 well u want quoted 2 hour wait think lot well ... 0 347094 \n", "347099 ippudo rated restaurant nyc area year 2014 sma... 0 347099 \n", "347117 place overated wait long food pricey running l... 0 347117 \n", "347127 place used five star place us recently cut bac... 0 347127 \n", "347148 came way st marks ippudo ramen average quality... 0 347148 \n", "\n", " Original Reviews \\\n", "347094 Well if u want to be quoted 2 hour wait and th... \n", "347099 IPPUDO is the most rated restaurant in the nyc... \n", "347117 This place is overated.  The wait was long an... \n", "347127 This place used to be a five star place for us... \n", "347148 Came all the way to St. marks just for ippudo.... \n", "\n", " Features Sentiment \n", "347094 [horses bc, awful nt] Negative \n", "347099 [restaurant nyc area year, small review, veget... Negative \n", "347117 [food pricey, nt seat friend, party friend lie... Negative \n", "347127 [star place, portion size, rid pork, ramen nt,... Negative \n", "347148 [st marks ippudo ramen average quality, ramen ... Negative " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "NegativeDataset = ProductDataset.loc[ProductDataset['Sentiment'] == 'Negative']\n", "print(len(NegativeDataset))\n", "NegativeDataset.head()" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "6282\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User_idProduct_IDRatingDateReviewsLabelReview_IdOriginal ReviewsFeaturesSentiment
3470725611824751/6/2015good slightly salty outside food reservations ...0347072So good, if very slightly salty. No outside fo...[food reservations]Positive
3470734539224741/6/2015fan ramen tried ippudo first 2010 ramen option...0347073I'm a fan of Ramen (tried Ippudo first in 2010...[fan ramen, ramen options, fair portion pay, a...Positive
34707415768224741/6/2015love pulled pork absolutely delicious mustget ...0347074I LOVE their pulled pork. It's absolutely deli...[delicious mustget, service food good eel rice...Positive
34707510073424731/5/2015ordered modern ramen black garlic sauce pretty...0347075I ordered the modern ramen with black garlic s...[modern ramen, black garlic sauce, pretty good...Positive
3470765650624751/5/2015nt ippudo yet slaps face go best restaurant wo...0347076What? You haven't been to Ippudo yet? *slaps f...[nt ippudo, restaurant world bye]Positive
\n", "
" ], "text/plain": [ " User_id Product_ID Rating Date \\\n", "347072 56118 247 5 1/6/2015 \n", "347073 45392 247 4 1/6/2015 \n", "347074 157682 247 4 1/6/2015 \n", "347075 100734 247 3 1/5/2015 \n", "347076 56506 247 5 1/5/2015 \n", "\n", " Reviews Label Review_Id \\\n", "347072 good slightly salty outside food reservations ... 0 347072 \n", "347073 fan ramen tried ippudo first 2010 ramen option... 0 347073 \n", "347074 love pulled pork absolutely delicious mustget ... 0 347074 \n", "347075 ordered modern ramen black garlic sauce pretty... 0 347075 \n", "347076 nt ippudo yet slaps face go best restaurant wo... 0 347076 \n", "\n", " Original Reviews \\\n", "347072 So good, if very slightly salty. No outside fo... \n", "347073 I'm a fan of Ramen (tried Ippudo first in 2010... \n", "347074 I LOVE their pulled pork. It's absolutely deli... \n", "347075 I ordered the modern ramen with black garlic s... \n", "347076 What? You haven't been to Ippudo yet? *slaps f... \n", "\n", " Features Sentiment \n", "347072 [food reservations] Positive \n", "347073 [fan ramen, ramen options, fair portion pay, a... Positive \n", "347074 [delicious mustget, service food good eel rice... Positive \n", "347075 [modern ramen, black garlic sauce, pretty good... Positive \n", "347076 [nt ippudo, restaurant world bye] Positive " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "PositiveDataset= ProductDataset.loc[ProductDataset['Sentiment'] == 'Positive']\n", "print(len(PositiveDataset))\n", "PositiveDataset.head()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "92\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
User_idProduct_IDRatingDateReviewsLabelReview_IdOriginal ReviewsFeaturesSentiment
3473599858524759/25/2014pork buns ftw0347359pork buns ftw.[pork buns ftw]Neutral
34736415777424759/24/2014holy vegetarian ramen0347364Holy vegetarian ramen.[vegetarian ramen]Neutral
34739915779224749/14/2014pork buns die0347399the pork buns are to die for!![pork buns]Neutral
347418794624749/9/2014pork buns tho0347418those pork buns tho.[pork buns tho]Neutral
347426776524749/7/2014yummmmmmmmm0347426Yummmmmmmmm[]Neutral
\n", "
" ], "text/plain": [ " User_id Product_ID Rating Date Reviews Label \\\n", "347359 98585 247 5 9/25/2014 pork buns ftw 0 \n", "347364 157774 247 5 9/24/2014 holy vegetarian ramen 0 \n", "347399 157792 247 4 9/14/2014 pork buns die 0 \n", "347418 7946 247 4 9/9/2014 pork buns tho 0 \n", "347426 7765 247 4 9/7/2014 yummmmmmmmm 0 \n", "\n", " Review_Id Original Reviews Features \\\n", "347359 347359 pork buns ftw. [pork buns ftw] \n", "347364 347364 Holy vegetarian ramen. [vegetarian ramen] \n", "347399 347399 the pork buns are to die for!! [pork buns] \n", "347418 347418 those pork buns tho. [pork buns tho] \n", "347426 347426 Yummmmmmmmm [] \n", "\n", " Sentiment \n", "347359 Neutral \n", "347364 Neutral \n", "347399 Neutral \n", "347418 Neutral \n", "347426 Neutral " ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "NeutralDataset = ProductDataset.loc[ProductDataset['Sentiment'] == 'Neutral']\n", "print(len(NeutralDataset))\n", "NeutralDataset.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Defining Functions**" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def Get_Dataset(k):\n", " if(k=='alpha'):\n", " R = PositiveDataset\n", " elif(k=='beta'):\n", " R = NegativeDataset\n", " else:\n", " R = NeutralDataset\n", " return R" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "def Get_Feature_Set(Features):\n", " Feature_List = []\n", " Feature_count= {}\n", " for featurelist in Features:\n", " Feature_List_array = (featurelist)\n", " for feature in Feature_List_array:\n", " if feature in Feature_count:\n", " Feature_count[feature]+=1\n", " else: \n", " Feature_count[feature]=1 \n", " if feature not in Feature_List:\n", " Feature_List.append(feature)\n", " \n", " Feature_count = sorted(Feature_count.items(), key=operator.itemgetter(1) , reverse=True)\n", " Feature_count = Feature_count\n", " feature_dict={}\n", " for i in Feature_count:\n", " name = i[0]\n", " value = i[1]\n", " feature_dict[name] = value\n", " ret = {'feature_count':feature_dict, 'Feature_List':Feature_List}\n", " return ret\n", " " ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "# f = Get_Feature_Set(PositiveDataset.Features)\n", "# k = f['feature_count']" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "def Features_Of_Review_wrt_F(F,review): \n", " features_List = []\n", " f_asteric = (review)\n", " for feature in f_asteric:\n", " if feature in F:\n", " features_List.append(feature)\n", " return features_List" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "def Utility_of_Review(F,F_Count,f_asteric):\n", " utility_of_review=0\n", " tempF = []\n", " for f in F:\n", " if f not in f_asteric:\n", " tempF.append(f)\n", " tempF_counts = {}\n", " for f_t in tempF:\n", " feature = f_t\n", " tempF_counts[feature] = F_Count[feature]\n", " tempF_counts = sorted(tempF_counts.items(), key=operator.itemgetter(1) , reverse=True)\n", " \n", " for f in f_asteric:\n", " if(tempF_counts!=[]):\n", " w_of_f = F_Count[f]/tempF_counts[0][1]\n", " utility_of_review+=w_of_f\n", " \n", " \n", " return utility_of_review " ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "# pfd = pd.DataFrame()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "def features_of_Max_Reviews(Features,F):\n", " features_of_Max_Reviews = (Features)\n", " F_list=[]\n", " for feature in F:\n", " if feature not in features_of_Max_Reviews:\n", " F_list.append(feature)\n", " return F_list\n", " " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "\\**SeFOD- Main Algorithm Code**" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "K={'alpha':8,'beta':8,'gamma':4}\n", "columns = dataset.columns\n", "Diversified_Set = pd.DataFrame(columns=columns)\n", "for k in K:\n", " R = Get_Dataset(k)\n", " ret = Get_Feature_Set(R.Features)\n", " F = ret['Feature_List']\n", " F_Count = ret['feature_count']\n", " for i in range(K[k]):\n", " max_score_review = {}\n", " for review,Review_Id, in zip(R.Features,R.Review_Id):\n", " f_asteric = Features_Of_Review_wrt_F(F,review)\n", " utility_review = Utility_of_Review(F,F_Count,f_asteric)\n", " max_score_review[Review_Id] = utility_review\n", " max_score_review = max(max_score_review.items(), key=operator.itemgetter(1))[0]\n", " \n", " MaxReviewDataset = R.loc[R['Review_Id'] == max_score_review]\n", " R = R.loc[R['Review_Id'] != max_score_review]\n", " featurelist = []\n", " for review in MaxReviewDataset.Features:\n", " featurelist = features_of_Max_Reviews(review,F)\n", " F = featurelist\n", " Diversified_Set = pd.concat([Diversified_Set, MaxReviewDataset])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Displaying Diversified Reviews**" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
DateFeaturesLabelOriginal ReviewsProduct_IDRatingReview_IdReviewsSentimentUser_id
08/20/2013[ramen ippudo, amazing things ippudo, great re...0To be honest, I was pretty disappointed by the...2473348937honest pretty disappointed ramen ippudo prior ...Positive28644
14/14/2010[hrs trendy, expensive kind ramen place, side ...0Expect a LONG wait.  Average wait for dinner ...2474352584expect long wait average wait dinner usually 4...Positive29778
211/11/2012[place fav restaurant visit, new york, 630pm f...0I'll add my 2 cents to the other 4000 reviews....2475349848add 2 cents 4000 reviews love place fav restau...Positive158470
312/24/2013[visit ippudo disappoint, thursday afternoon w...0My latest visit to Ippudo did not disappoint. ...2474348461latest visit ippudo disappoint came thursday a...Positive10151
411/8/2013[okay course, ramen places nyc visit, reviews ...0Okay.. Of course I have to try one of the best...2474348626okay course try one best ramen places nyc visi...Positive95663
51/9/2011[reviews speaks, fan ramen broth, smooth thirs...0Hands down, the best ramen I have yet tasted! ...2475351816hands best ramen yet tasted 2000 reviews speak...Positive158880
63/14/2010[new york, recommendation coworker, ramen nood...0I went here while visiting my sister in New Yo...2475352688went visiting sister new york got recommendati...Positive159063
73/6/2011[review place year, accident bat shit, crazy i...0So this place is growing on me.  I swore I wr...2474351591place growing swore wrote review place year ag...Positive11968
89/12/2012[ippudo list, name cringe, previous reviews ex...0Ippudo should be a on a list called \"Yelp's Mo...2472349980ippudo list called yelp overrated reviews ever...Negative18712
94/19/2011[wednesday night, ramen place, worth wait, ca ...0We waited for about an hour to get seats one W...2473351409waited hour get seats one wednesday night last...Negative158807
109/21/2010[ippudo review, ramen places area making decis...0I have been sitting on this Ippudo review, wai...2472352178sitting ippudo review waiting try ramen places...Negative45628
1112/4/2008[email faithful foodie friends notify, concur ...0so. i was craving for ramen yesterday. shit ha...2472353503craving ramen yesterday shit happens know emai...Negative30068
122/12/2014[mmhmm refund, subway system, ippudo ny 310pm,...0Over-hyped? Yes, totally. Overpriced? Mmhmm!!!...2471348240overhyped yes totally overpriced mmhmm refund ...Negative158054
137/26/2011[true ippudo japan hint hintif ippudo japan, p...0I was torn between giving Ippudo 1 star or 2 s...2471351107torn giving ippudo 1 star 2 stars wish could g...Negative115992
144/4/2010[pork bun, pork bun akamaru, pork bun, expecta...0Pork bun! 4**** I got the pork bun and the Aka...2474352621pork bun 4 got pork bun akamaru modern first b...Negative72285
1512/28/2009[trip ippudo monday december, good meal, awful...0Just got back from my first trip to Ippudo (Mo...2471352894got back first trip ippudo monday december 28t...Negative4447
162/15/2014[ramen delicious, pork buns]0Ramen is delicious. Try the pork buns THe wait...2474348232ramen delicious try pork buns wait insaneNeutral127760
1712/6/2013[worth wait, tables empty, sister sister nt, p...0Not worth the wait. I went here few times and ...2471348525worth wait went times waited 2 hours first tim...Neutral79801
188/3/2010[faint webster hall, hot bowl ramen, cold wet ...0What can I say - after seeing Ladytron and the...2475352312say seeing ladytron faint webster hall nothing...Neutral158981
1912/19/2012[line nt, pork buns bowl noodles, 1030am satur...0I don't like waiting in line, and I don't like...2474349740nt like waiting line nt like sitting strangers...Neutral7015
\n", "
" ], "text/plain": [ " Date Features Label \\\n", "0 8/20/2013 [ramen ippudo, amazing things ippudo, great re... 0 \n", "1 4/14/2010 [hrs trendy, expensive kind ramen place, side ... 0 \n", "2 11/11/2012 [place fav restaurant visit, new york, 630pm f... 0 \n", "3 12/24/2013 [visit ippudo disappoint, thursday afternoon w... 0 \n", "4 11/8/2013 [okay course, ramen places nyc visit, reviews ... 0 \n", "5 1/9/2011 [reviews speaks, fan ramen broth, smooth thirs... 0 \n", "6 3/14/2010 [new york, recommendation coworker, ramen nood... 0 \n", "7 3/6/2011 [review place year, accident bat shit, crazy i... 0 \n", "8 9/12/2012 [ippudo list, name cringe, previous reviews ex... 0 \n", "9 4/19/2011 [wednesday night, ramen place, worth wait, ca ... 0 \n", "10 9/21/2010 [ippudo review, ramen places area making decis... 0 \n", "11 12/4/2008 [email faithful foodie friends notify, concur ... 0 \n", "12 2/12/2014 [mmhmm refund, subway system, ippudo ny 310pm,... 0 \n", "13 7/26/2011 [true ippudo japan hint hintif ippudo japan, p... 0 \n", "14 4/4/2010 [pork bun, pork bun akamaru, pork bun, expecta... 0 \n", "15 12/28/2009 [trip ippudo monday december, good meal, awful... 0 \n", "16 2/15/2014 [ramen delicious, pork buns] 0 \n", "17 12/6/2013 [worth wait, tables empty, sister sister nt, p... 0 \n", "18 8/3/2010 [faint webster hall, hot bowl ramen, cold wet ... 0 \n", "19 12/19/2012 [line nt, pork buns bowl noodles, 1030am satur... 0 \n", "\n", " Original Reviews Product_ID Rating \\\n", "0 To be honest, I was pretty disappointed by the... 247 3 \n", "1 Expect a LONG wait.  Average wait for dinner ... 247 4 \n", "2 I'll add my 2 cents to the other 4000 reviews.... 247 5 \n", "3 My latest visit to Ippudo did not disappoint. ... 247 4 \n", "4 Okay.. Of course I have to try one of the best... 247 4 \n", "5 Hands down, the best ramen I have yet tasted! ... 247 5 \n", "6 I went here while visiting my sister in New Yo... 247 5 \n", "7 So this place is growing on me.  I swore I wr... 247 4 \n", "8 Ippudo should be a on a list called \"Yelp's Mo... 247 2 \n", "9 We waited for about an hour to get seats one W... 247 3 \n", "10 I have been sitting on this Ippudo review, wai... 247 2 \n", "11 so. i was craving for ramen yesterday. shit ha... 247 2 \n", "12 Over-hyped? Yes, totally. Overpriced? Mmhmm!!!... 247 1 \n", "13 I was torn between giving Ippudo 1 star or 2 s... 247 1 \n", "14 Pork bun! 4**** I got the pork bun and the Aka... 247 4 \n", "15 Just got back from my first trip to Ippudo (Mo... 247 1 \n", "16 Ramen is delicious. Try the pork buns THe wait... 247 4 \n", "17 Not worth the wait. I went here few times and ... 247 1 \n", "18 What can I say - after seeing Ladytron and the... 247 5 \n", "19 I don't like waiting in line, and I don't like... 247 4 \n", "\n", " Review_Id Reviews Sentiment \\\n", "0 348937 honest pretty disappointed ramen ippudo prior ... Positive \n", "1 352584 expect long wait average wait dinner usually 4... Positive \n", "2 349848 add 2 cents 4000 reviews love place fav restau... Positive \n", "3 348461 latest visit ippudo disappoint came thursday a... Positive \n", "4 348626 okay course try one best ramen places nyc visi... Positive \n", "5 351816 hands best ramen yet tasted 2000 reviews speak... Positive \n", "6 352688 went visiting sister new york got recommendati... Positive \n", "7 351591 place growing swore wrote review place year ag... Positive \n", "8 349980 ippudo list called yelp overrated reviews ever... Negative \n", "9 351409 waited hour get seats one wednesday night last... Negative \n", "10 352178 sitting ippudo review waiting try ramen places... Negative \n", "11 353503 craving ramen yesterday shit happens know emai... Negative \n", "12 348240 overhyped yes totally overpriced mmhmm refund ... Negative \n", "13 351107 torn giving ippudo 1 star 2 stars wish could g... Negative \n", "14 352621 pork bun 4 got pork bun akamaru modern first b... Negative \n", "15 352894 got back first trip ippudo monday december 28t... Negative \n", "16 348232 ramen delicious try pork buns wait insane Neutral \n", "17 348525 worth wait went times waited 2 hours first tim... Neutral \n", "18 352312 say seeing ladytron faint webster hall nothing... Neutral \n", "19 349740 nt like waiting line nt like sitting strangers... Neutral \n", "\n", " User_id \n", "0 28644 \n", "1 29778 \n", "2 158470 \n", "3 10151 \n", "4 95663 \n", "5 158880 \n", "6 159063 \n", "7 11968 \n", "8 18712 \n", "9 158807 \n", "10 45628 \n", "11 30068 \n", "12 158054 \n", "13 115992 \n", "14 72285 \n", "15 4447 \n", "16 127760 \n", "17 79801 \n", "18 158981 \n", "19 7015 " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Diversified_Set = Diversified_Set.reset_index(drop=True)\n", "# Diversified_Set[['Review_Id', 'Customer_Name', 'Reviews', 'Features']]\n", "Diversified_Set" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Creating Download Link for Diversified Dataset**" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "temp_dataset = Diversified_Set[['Review_Id', 'User_id', 'Original Reviews','Reviews', 'Features']]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "Download created CSV file" ], "text/plain": [ "" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from IPython.display import HTML\n", "import pandas as pd\n", "import numpy as np\n", "import base64\n", "\n", "# function that takes in a dataframe and creates a text link to \n", "# download it (will only work for files < 2MB or so)\n", "def create_download_link(df, title = \"Download created CSV file\", filename = \"Yelp_Diversified_Dataset.csv\"): \n", " csv = df.to_csv()\n", " b64 = base64.b64encode(csv.encode())\n", " payload = b64.decode()\n", " html = '{title}'\n", " html = html.format(payload=payload,title=title,filename=filename)\n", " return HTML(html)\n", "\n", "# create a random sample dataframe\n", "\n", "# create a link to download the dataframe\n", "create_download_link(temp_dataset)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "**Calculating Scores for Review**" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "ret = Get_Feature_Set(Diversified_Set.Features)\n", "features_count = ret['feature_count']\n", "feature_score={}\n", "for feature in features_count:\n", " feature_score[feature] = 1/features_count[feature]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "review_score_list=[]\n", "score_sum = 0\n", "for featureSet in Diversified_Set.Features:\n", " score = 0\n", " for feature in featureSet:\n", " score=score+feature_score[feature]\n", " review_score_list.append(score)\n", "Diversified_Set['Score'] = review_score_list\n", "\n", "review_score_list.sort(reverse=True)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "max_review_Score = review_score_list[0]\n", "# max_review_Score" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "10.883720930232556" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Normalised_score=[]\n", "normalised_score_sum = 0\n", "for score in Diversified_Set['Score']:\n", " Nscore = score/max_review_Score\n", " Normalised_score.append(Nscore)\n", " normalised_score_sum=normalised_score_sum+Nscore\n", "normalised_score_sum" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 4 }