From 6332008aa785f5dccbd8655c07021bca6cfbf68c Mon Sep 17 00:00:00 2001
From: Jen Looper <jen.looper@gmail.com>
Date: Tue, 29 Jun 2021 13:55:05 -0400
Subject: [PATCH] notebook edits

---
 ...ebook-filtering.ipynb => 1-notebook.ipynb} |  44 ++++--
 .../{notebook-tags.ipynb => 2-notebook.ipynb} |  45 ++++--
 .../notebook-sentiment-analysis.ipynb         | 136 +++++++++++++++---
 3 files changed, 180 insertions(+), 45 deletions(-)
 rename 6-NLP/5-Hotel-Reviews-2/solution/{notebook-filtering.ipynb => 1-notebook.ipynb} (81%)
 rename 6-NLP/5-Hotel-Reviews-2/solution/{notebook-tags.ipynb => 2-notebook.ipynb} (70%)

diff --git a/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb b/6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb
similarity index 81%
rename from 6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb
rename to 6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb
index 3baa7fc1..43036cde 100644
--- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb
@@ -10,16 +10,23 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": 3
+   "version": "3.7.0"
   },
-  "orig_nbformat": 4
+  "orig_nbformat": 4,
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.7.0 64-bit ('3.7')"
+  },
+  "interpreter": {
+   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2,
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -30,7 +37,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -54,7 +61,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -65,7 +72,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -75,7 +82,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -85,7 +92,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -98,7 +105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -118,23 +125,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
     "# No longer need any of these columns\n",
-    "df.drop([\"Tags\", \"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
+    "df.drop([\"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Saving results to Hotel_Reviews_Filtered.csv\n",
+      "Filtering took 23.74 seconds\n"
+     ]
+    }
+   ],
    "source": [
     "# Saving new data file with calculated columns\n",
     "print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
-    "df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)\n",
+    "df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n",
     "end = time.time()\n",
     "print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
    ]
diff --git a/6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb b/6-NLP/5-Hotel-Reviews-2/solution/2-notebook.ipynb
similarity index 70%
rename from 6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb
rename to 6-NLP/5-Hotel-Reviews-2/solution/2-notebook.ipynb
index 494b7947..18bdf4cf 100644
--- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/solution/2-notebook.ipynb
@@ -10,26 +10,35 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": 3
+   "version": "3.7.0"
   },
-  "orig_nbformat": 4
+  "orig_nbformat": 4,
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.7.0 64-bit ('3.7')"
+  },
+  "interpreter": {
+   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2,
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
     "# Load the hotel reviews from CSV (you can )\n",
+    "import pandas as pd \n",
+    "\n",
     "df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -42,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -53,7 +62,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -68,7 +77,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -78,9 +87,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "The shape of the tags with no filtering: (2514684, 2)\n",
+      "                        index   count\n",
+      "0                Leisure trip  338423\n",
+      "1                      Couple  205305\n",
+      "2               Solo traveler   89779\n",
+      "3               Business trip   68176\n",
+      "4                       Group   51593\n",
+      "5  Family with young children   49318\n",
+      "6  Family with older children   21509\n",
+      "7      Travelers with friends    1610\n",
+      "8                  With a pet    1078\n"
+     ]
+    }
+   ],
    "source": [
     "# Get the value counts\n",
     "tag_vc = df_tags.value.value_counts()\n",
diff --git a/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb b/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb
index 9cc99703..90b44644 100644
--- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb
@@ -10,40 +10,65 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": 3
+   "version": "3.7.0"
   },
-  "orig_nbformat": 4
+  "orig_nbformat": 4,
+  "kernelspec": {
+   "name": "python3",
+   "display_name": "Python 3.7.0 64-bit ('3.7')"
+  },
+  "interpreter": {
+   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
+  }
  },
  "nbformat": 4,
  "nbformat_minor": 2,
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stderr",
+     "text": [
+      "[nltk_data] Downloading package vader_lexicon to\n[nltk_data]     /Users/jenlooper/nltk_data...\n"
+     ]
+    },
+    {
+     "output_type": "execute_result",
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "metadata": {},
+     "execution_count": 9
+    }
+   ],
    "source": [
     "import time\n",
     "import pandas as pd\n",
+    "import nltk as nltk\n",
     "from nltk.corpus import stopwords\n",
-    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n"
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
+    "nltk.download('vader_lexicon')\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Create the vader sentiment analyser (there are others in NLTK you can try too)\n",
     "vader_sentiment = SentimentIntensityAnalyzer()\n",
-    "# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. \n",
-    "# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n"
+    "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -59,7 +84,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -69,7 +94,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -85,7 +110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -96,9 +121,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Removing stop words took 5.77 seconds\n"
+     ]
+    }
+   ],
    "source": [
     "end = time.time()\n",
     "print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
@@ -106,9 +139,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Calculating sentiment columns for both positive and negative reviews\n",
+      "Calculating sentiment took 201.07 seconds\n"
+     ]
+    }
+   ],
    "source": [
     "# Add a negative sentiment and positive sentiment column\n",
     "print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
@@ -121,9 +163,44 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "                                          Negative_Review  Negative_Sentiment\n",
+      "186584  So bad experience memories I hotel The first n...             -0.9920\n",
+      "129503  First charged twice room booked booking second...             -0.9896\n",
+      "307286  The staff Had bad experience even booking Janu...             -0.9889\n",
+      "452092  No WLAN room Incredibly rude restaurant staff ...             -0.9884\n",
+      "201293  We usually traveling Paris 2 3 times year busi...             -0.9873\n",
+      "...                                                   ...                 ...\n",
+      "26899   I would say however one night expensive even d...              0.9933\n",
+      "138365  Wifi terribly slow I speed test network upload...              0.9938\n",
+      "79215   I find anything hotel first I walked past hote...              0.9938\n",
+      "278506  The property great location There bakery next ...              0.9945\n",
+      "339189  Guys I like hotel I wish return next year Howe...              0.9948\n",
+      "\n",
+      "[515738 rows x 2 columns]\n",
+      "                                          Positive_Review  Positive_Sentiment\n",
+      "137893  Bathroom Shower We going stay twice hotel 2 ni...             -0.9820\n",
+      "5839    I completely disappointed mad since reception ...             -0.9780\n",
+      "64158   get everything extra internet parking breakfas...             -0.9751\n",
+      "124178  I didnt like anythig Room small Asked upgrade ...             -0.9721\n",
+      "489137  Very rude manager abusive staff reception Dirt...             -0.9703\n",
+      "...                                                   ...                 ...\n",
+      "331570  Everything This recently renovated hotel class...              0.9984\n",
+      "322920  From moment stepped doors Guesthouse Hotel sta...              0.9985\n",
+      "293710  This place surprise expected good actually gre...              0.9985\n",
+      "417442  We celebrated wedding night Langham I commend ...              0.9985\n",
+      "132492  We arrived super cute boutique hotel area expl...              0.9987\n",
+      "\n",
+      "[515738 rows x 2 columns]\n"
+     ]
+    }
+   ],
    "source": [
     "df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
     "print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
@@ -133,7 +210,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -143,13 +220,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "output_type": "stream",
+     "name": "stdout",
+     "text": [
+      "Saving results to Hotel_Reviews_NLP.csv\n"
+     ]
+    }
+   ],
    "source": [
     "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
     "df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ]
 }
\ No newline at end of file