notebook edits

5 years ago · 6332008aa7
parent b57fdec684
commit 6332008aa7
3 changed files with 180 additions and 45 deletions
--- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb
@ -10,16 +10,23 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": 3
+   "version": "3.7.0"
  },
-  "orig_nbformat": 4
+  "orig_nbformat": 4,
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.7.0 64-bit ('3.7')"
+  },
+  "interpreter": {
+   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
+  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -30,7 +37,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@ -54,7 +61,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -65,7 +72,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@ -75,7 +82,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -85,7 +92,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -98,7 +105,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@ -118,23 +125,32 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No longer need any of these columns\n",
-    "df.drop([\"Tags\", \"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
+    "df.drop([\"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Saving results to Hotel_Reviews_Filtered.csv\n",
+      "Filtering took 23.74 seconds\n"
+     ]
+    }
+   ],
   "source": [
    "# Saving new data file with calculated columns\n",
    "print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
-    "df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)\n",
+    "df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n",
    "end = time.time()\n",
    "print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
   ]
--- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb
@ -10,26 +10,35 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": 3
+   "version": "3.7.0"
  },
-  "orig_nbformat": 4
+  "orig_nbformat": 4,
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.7.0 64-bit ('3.7')"
+  },
+  "interpreter": {
+   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
+  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the hotel reviews from CSV\n",
+    "import pandas as pd \n",
+    "\n",
    "df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -42,7 +51,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -53,7 +62,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@ -68,7 +77,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@ -78,9 +87,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "The shape of the tags with no filtering: (2514684, 2)\n",
+      "                        index   count\n",
+      "0                Leisure trip  338423\n",
+      "1                      Couple  205305\n",
+      "2               Solo traveler   89779\n",
+      "3               Business trip   68176\n",
+      "4                       Group   51593\n",
+      "5  Family with young children   49318\n",
+      "6  Family with older children   21509\n",
+      "7      Travelers with friends    1610\n",
+      "8                  With a pet    1078\n"
+     ]
+    }
+   ],
   "source": [
    "# Get the value counts\n",
    "tag_vc = df_tags.value.value_counts()\n",
--- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb
@ -10,40 +10,65 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": 3
+   "version": "3.7.0"
  },
-  "orig_nbformat": 4
+  "orig_nbformat": 4,
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.7.0 64-bit ('3.7')"
+  },
+  "interpreter": {
+   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
+  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stderr",
+     "text": [
+      "[nltk_data] Downloading package vader_lexicon to\n[nltk_data]     /Users/jenlooper/nltk_data...\n"
+     ]
+    },
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 9
+    }
+   ],
   "source": [
    "import time\n",
    "import pandas as pd\n",
+    "import nltk as nltk\n",
    "from nltk.corpus import stopwords\n",
-    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n"
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
+    "nltk.download('vader_lexicon')\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
-    "# Create the vader sentiment analyser (there are others in NLTK you can try too)\n",
    "vader_sentiment = SentimentIntensityAnalyzer()\n",
-    "# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. \n",
-    "# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n"
+    "\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
@ -59,7 +84,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
@ -69,7 +94,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@ -85,7 +110,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
@ -96,9 +121,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Removing stop words took 5.77 seconds\n"
+     ]
+    }
+   ],
   "source": [
    "end = time.time()\n",
    "print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
@ -106,9 +139,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Calculating sentiment columns for both positive and negative reviews\n",
+      "Calculating sentiment took 201.07 seconds\n"
+     ]
+    }
+   ],
   "source": [
    "# Add a negative sentiment and positive sentiment column\n",
    "print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
@ -121,9 +163,44 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "                                          Negative_Review  Negative_Sentiment\n",
+      "186584  So bad experience memories I hotel The first n...             -0.9920\n",
+      "129503  First charged twice room booked booking second...             -0.9896\n",
+      "307286  The staff Had bad experience even booking Janu...             -0.9889\n",
+      "452092  No WLAN room Incredibly rude restaurant staff ...             -0.9884\n",
+      "201293  We usually traveling Paris 2 3 times year busi...             -0.9873\n",
+      "...                                                   ...                 ...\n",
+      "26899   I would say however one night expensive even d...              0.9933\n",
+      "138365  Wifi terribly slow I speed test network upload...              0.9938\n",
+      "79215   I find anything hotel first I walked past hote...              0.9938\n",
+      "278506  The property great location There bakery next ...              0.9945\n",
+      "339189  Guys I like hotel I wish return next year Howe...              0.9948\n",
+      "\n",
+      "[515738 rows x 2 columns]\n",
+      "                                          Positive_Review  Positive_Sentiment\n",
+      "137893  Bathroom Shower We going stay twice hotel 2 ni...             -0.9820\n",
+      "5839    I completely disappointed mad since reception ...             -0.9780\n",
+      "64158   get everything extra internet parking breakfas...             -0.9751\n",
+      "124178  I didnt like anythig Room small Asked upgrade ...             -0.9721\n",
+      "489137  Very rude manager abusive staff reception Dirt...             -0.9703\n",
+      "...                                                   ...                 ...\n",
+      "331570  Everything This recently renovated hotel class...              0.9984\n",
+      "322920  From moment stepped doors Guesthouse Hotel sta...              0.9985\n",
+      "293710  This place surprise expected good actually gre...              0.9985\n",
+      "417442  We celebrated wedding night Langham I commend ...              0.9985\n",
+      "132492  We arrived super cute boutique hotel area expl...              0.9987\n",
+      "\n",
+      "[515738 rows x 2 columns]\n"
+     ]
+    }
+   ],
   "source": [
    "df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
    "print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
@ -133,7 +210,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
@ -143,13 +220,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Saving results to Hotel_Reviews_NLP.csv\n"
+     ]
+    }
+   ],
   "source": [
    "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
    "df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ]
 }