notebook edits

4 years ago · 6332008aa7
parent b57fdec684
commit 6332008aa7
3 changed files with 180 additions and 45 deletions
--- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb
@ -10,16 +10,23 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": 3
+   "version": "3.7.0"
  },
  "orig_nbformat": 4,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.0 64-bit ('3.7')"
  },
-  "orig_nbformat": 4
+  "interpreter": {
   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -30,7 +37,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
@ -54,7 +61,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
@ -65,7 +72,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
@ -75,7 +82,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -85,7 +92,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -98,7 +105,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@ -118,23 +125,32 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No longer need any of these columns\n",
-    "df.drop([\"Tags\", \"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
+    "df.drop([\"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Saving results to Hotel_Reviews_Filtered.csv\n",
      "Filtering took 23.74 seconds\n"
     ]
    }
   ],
   "source": [
    "# Saving new data file with calculated columns\n",
    "print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
-    "df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)\n",
+    "df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n",
    "end = time.time()\n",
    "print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
   ]
--- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb
@ -10,26 +10,35 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": 3
+   "version": "3.7.0"
  },
  "orig_nbformat": 4,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.0 64-bit ('3.7')"
  },
-  "orig_nbformat": 4
+  "interpreter": {
   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the hotel reviews from CSV (you can )\n",
    "import pandas as pd \n",
    "\n",
    "df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
@ -42,7 +51,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
@ -53,7 +62,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@ -68,7 +77,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
@ -78,9 +87,27 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "The shape of the tags with no filtering: (2514684, 2)\n",
      "                        index   count\n",
      "0                Leisure trip  338423\n",
      "1                      Couple  205305\n",
      "2               Solo traveler   89779\n",
      "3               Business trip   68176\n",
      "4                       Group   51593\n",
      "5  Family with young children   49318\n",
      "6  Family with older children   21509\n",
      "7      Travelers with friends    1610\n",
      "8                  With a pet    1078\n"
     ]
    }
   ],
   "source": [
    "# Get the value counts\n",
    "tag_vc = df_tags.value.value_counts()\n",
--- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb
@ -10,40 +10,65 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": 3
+   "version": "3.7.0"
  },
  "orig_nbformat": 4,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.0 64-bit ('3.7')"
  },
-  "orig_nbformat": 4
+  "interpreter": {
   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "[nltk_data] Downloading package vader_lexicon to\n[nltk_data]     /Users/jenlooper/nltk_data...\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "source": [
    "import time\n",
    "import pandas as pd\n",
    "import nltk as nltk\n",
    "from nltk.corpus import stopwords\n",
-    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n"
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "nltk.download('vader_lexicon')\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the vader sentiment analyser (there are others in NLTK you can try too)\n",
    "vader_sentiment = SentimentIntensityAnalyzer()\n",
-    "# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. \n",
+    "\n"
    "# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
@ -59,7 +84,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
@ -69,7 +94,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
@ -85,7 +110,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
@ -96,9 +121,17 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 15,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Removing stop words took 5.77 seconds\n"
     ]
    }
   ],
   "source": [
    "end = time.time()\n",
    "print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
@ -106,9 +139,18 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 16,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Calculating sentiment columns for both positive and negative reviews\n",
      "Calculating sentiment took 201.07 seconds\n"
     ]
    }
   ],
   "source": [
    "# Add a negative sentiment and positive sentiment column\n",
    "print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
@ -121,9 +163,44 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "                                          Negative_Review  Negative_Sentiment\n",
      "186584  So bad experience memories I hotel The first n...             -0.9920\n",
      "129503  First charged twice room booked booking second...             -0.9896\n",
      "307286  The staff Had bad experience even booking Janu...             -0.9889\n",
      "452092  No WLAN room Incredibly rude restaurant staff ...             -0.9884\n",
      "201293  We usually traveling Paris 2 3 times year busi...             -0.9873\n",
      "...                                                   ...                 ...\n",
      "26899   I would say however one night expensive even d...              0.9933\n",
      "138365  Wifi terribly slow I speed test network upload...              0.9938\n",
      "79215   I find anything hotel first I walked past hote...              0.9938\n",
      "278506  The property great location There bakery next ...              0.9945\n",
      "339189  Guys I like hotel I wish return next year Howe...              0.9948\n",
      "\n",
      "[515738 rows x 2 columns]\n",
      "                                          Positive_Review  Positive_Sentiment\n",
      "137893  Bathroom Shower We going stay twice hotel 2 ni...             -0.9820\n",
      "5839    I completely disappointed mad since reception ...             -0.9780\n",
      "64158   get everything extra internet parking breakfas...             -0.9751\n",
      "124178  I didnt like anythig Room small Asked upgrade ...             -0.9721\n",
      "489137  Very rude manager abusive staff reception Dirt...             -0.9703\n",
      "...                                                   ...                 ...\n",
      "331570  Everything This recently renovated hotel class...              0.9984\n",
      "322920  From moment stepped doors Guesthouse Hotel sta...              0.9985\n",
      "293710  This place surprise expected good actually gre...              0.9985\n",
      "417442  We celebrated wedding night Langham I commend ...              0.9985\n",
      "132492  We arrived super cute boutique hotel area expl...              0.9987\n",
      "\n",
      "[515738 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
    "print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
@ -133,7 +210,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
@ -143,13 +220,28 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 19,
   "metadata": {},
-   "outputs": [],
+   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Saving results to Hotel_Reviews_NLP.csv\n"
     ]
    }
   ],
   "source": [
    "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
    "df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
 }