From 6332008aa785f5dccbd8655c07021bca6cfbf68c Mon Sep 17 00:00:00 2001 From: Jen Looper Date: Tue, 29 Jun 2021 13:55:05 -0400 Subject: [PATCH] notebook edits --- ...ebook-filtering.ipynb => 1-notebook.ipynb} | 44 ++++-- .../{notebook-tags.ipynb => 2-notebook.ipynb} | 45 ++++-- .../notebook-sentiment-analysis.ipynb | 136 +++++++++++++++--- 3 files changed, 180 insertions(+), 45 deletions(-) rename 6-NLP/5-Hotel-Reviews-2/solution/{notebook-filtering.ipynb => 1-notebook.ipynb} (81%) rename 6-NLP/5-Hotel-Reviews-2/solution/{notebook-tags.ipynb => 2-notebook.ipynb} (70%) diff --git a/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb b/6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb similarity index 81% rename from 6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb rename to 6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb index 3baa7fc1..43036cde 100644 --- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-filtering.ipynb +++ b/6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb @@ -10,16 +10,23 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": 3 + "version": "3.7.0" }, - "orig_nbformat": 4 + "orig_nbformat": 4, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.7.0 64-bit ('3.7')" + }, + "interpreter": { + "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d" + } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -30,7 +37,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -54,7 +61,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -65,7 +72,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -75,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -85,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -98,7 +105,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -118,23 +125,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# No longer need any of these columns\n", - "df.drop([\"Tags\", \"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n" + "df.drop([\"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving results to Hotel_Reviews_Filtered.csv\n", + "Filtering took 23.74 seconds\n" + ] + } + ], "source": [ "# Saving new data file with calculated columns\n", "print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n", - "df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)\n", + "df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n", "end = time.time()\n", "print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n" ] diff --git a/6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb b/6-NLP/5-Hotel-Reviews-2/solution/2-notebook.ipynb similarity index 70% rename from 6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb rename to 6-NLP/5-Hotel-Reviews-2/solution/2-notebook.ipynb index 494b7947..18bdf4cf 100644 --- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-tags.ipynb +++ b/6-NLP/5-Hotel-Reviews-2/solution/2-notebook.ipynb @@ -10,26 +10,35 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": 3 + "version": "3.7.0" }, - "orig_nbformat": 4 + "orig_nbformat": 4, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.7.0 64-bit ('3.7')" + }, + "interpreter": { + "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d" + } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Load the hotel reviews from CSV (you can )\n", + "import pandas as pd \n", + "\n", "df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -42,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -53,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -68,7 +77,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -78,9 +87,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "The shape of the tags with no filtering: (2514684, 2)\n", + " index count\n", + "0 Leisure trip 338423\n", + "1 Couple 205305\n", + "2 Solo traveler 89779\n", + "3 Business trip 68176\n", + "4 Group 51593\n", + "5 Family with young children 49318\n", + "6 Family with older children 21509\n", + "7 Travelers with friends 1610\n", + "8 With a pet 1078\n" + ] + } + ], "source": [ "# Get the value counts\n", "tag_vc = df_tags.value.value_counts()\n", diff --git a/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb b/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb index 9cc99703..90b44644 100644 --- a/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb +++ b/6-NLP/5-Hotel-Reviews-2/solution/notebook-sentiment-analysis.ipynb @@ -10,40 +10,65 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": 3 + "version": "3.7.0" }, - "orig_nbformat": 4 + "orig_nbformat": 4, + "kernelspec": { + "name": "python3", + "display_name": "Python 3.7.0 64-bit ('3.7')" + }, + "interpreter": { + "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d" + } }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "[nltk_data] Downloading package vader_lexicon to\n[nltk_data] /Users/jenlooper/nltk_data...\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "True" + ] + }, + "metadata": {}, + "execution_count": 9 + } + ], "source": [ "import time\n", "import pandas as pd\n", + "import nltk as nltk\n", "from nltk.corpus import stopwords\n", - "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n" + "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", + "nltk.download('vader_lexicon')\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "# Create the vader sentiment analyser (there are others in NLTK you can try too)\n", "vader_sentiment = SentimentIntensityAnalyzer()\n", - "# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. \n", - "# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n" + "\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -59,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +94,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -85,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -96,9 +121,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Removing stop words took 5.77 seconds\n" + ] + } + ], "source": [ "end = time.time()\n", "print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n" @@ -106,9 +139,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Calculating sentiment columns for both positive and negative reviews\n", + "Calculating sentiment took 201.07 seconds\n" + ] + } + ], "source": [ "# Add a negative sentiment and positive sentiment column\n", "print(\"Calculating sentiment columns for both positive and negative reviews\")\n", @@ -121,9 +163,44 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + " Negative_Review Negative_Sentiment\n", + "186584 So bad experience memories I hotel The first n... -0.9920\n", + "129503 First charged twice room booked booking second... -0.9896\n", + "307286 The staff Had bad experience even booking Janu... -0.9889\n", + "452092 No WLAN room Incredibly rude restaurant staff ... -0.9884\n", + "201293 We usually traveling Paris 2 3 times year busi... -0.9873\n", + "... ... ...\n", + "26899 I would say however one night expensive even d... 0.9933\n", + "138365 Wifi terribly slow I speed test network upload... 0.9938\n", + "79215 I find anything hotel first I walked past hote... 0.9938\n", + "278506 The property great location There bakery next ... 0.9945\n", + "339189 Guys I like hotel I wish return next year Howe... 0.9948\n", + "\n", + "[515738 rows x 2 columns]\n", + " Positive_Review Positive_Sentiment\n", + "137893 Bathroom Shower We going stay twice hotel 2 ni... -0.9820\n", + "5839 I completely disappointed mad since reception ... -0.9780\n", + "64158 get everything extra internet parking breakfas... -0.9751\n", + "124178 I didnt like anythig Room small Asked upgrade ... -0.9721\n", + "489137 Very rude manager abusive staff reception Dirt... -0.9703\n", + "... ... ...\n", + "331570 Everything This recently renovated hotel class... 0.9984\n", + "322920 From moment stepped doors Guesthouse Hotel sta... 0.9985\n", + "293710 This place surprise expected good actually gre... 0.9985\n", + "417442 We celebrated wedding night Langham I commend ... 0.9985\n", + "132492 We arrived super cute boutique hotel area expl... 0.9987\n", + "\n", + "[515738 rows x 2 columns]\n" + ] + } + ], "source": [ "df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n", "print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n", @@ -133,7 +210,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -143,13 +220,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Saving results to Hotel_Reviews_NLP.csv\n" + ] + } + ], "source": [ "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n", "df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ] } \ No newline at end of file