notebook edits

pull/73/head
Jen Looper 4 years ago
parent b57fdec684
commit 6332008aa7

@ -10,16 +10,23 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": 3 "version": "3.7.0"
}, },
"orig_nbformat": 4 "orig_nbformat": 4,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.7.0 64-bit ('3.7')"
},
"interpreter": {
"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
}
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 2, "nbformat_minor": 2,
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 1,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -30,7 +37,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -54,7 +61,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -65,7 +72,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -75,7 +82,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -85,7 +92,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -98,7 +105,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -118,23 +125,32 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# No longer need any of these columns\n", "# No longer need any of these columns\n",
"df.drop([\"Tags\", \"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n" "df.drop([\"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Saving results to Hotel_Reviews_Filtered.csv\n",
"Filtering took 23.74 seconds\n"
]
}
],
"source": [ "source": [
"# Saving new data file with calculated columns\n", "# Saving new data file with calculated columns\n",
"print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n", "print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
"df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)\n", "df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n",
"end = time.time()\n", "end = time.time()\n",
"print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n" "print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
] ]

@ -10,26 +10,35 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": 3 "version": "3.7.0"
}, },
"orig_nbformat": 4 "orig_nbformat": 4,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.7.0 64-bit ('3.7')"
},
"interpreter": {
"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
}
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 2, "nbformat_minor": 2,
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 4,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Load the hotel reviews from CSV (you can )\n", "# Load the hotel reviews from CSV (you can )\n",
"import pandas as pd \n",
"\n",
"df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n" "df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -42,7 +51,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 6,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -53,7 +62,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -68,7 +77,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -78,9 +87,27 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"The shape of the tags with no filtering: (2514684, 2)\n",
" index count\n",
"0 Leisure trip 338423\n",
"1 Couple 205305\n",
"2 Solo traveler 89779\n",
"3 Business trip 68176\n",
"4 Group 51593\n",
"5 Family with young children 49318\n",
"6 Family with older children 21509\n",
"7 Travelers with friends 1610\n",
"8 With a pet 1078\n"
]
}
],
"source": [ "source": [
"# Get the value counts\n", "# Get the value counts\n",
"tag_vc = df_tags.value.value_counts()\n", "tag_vc = df_tags.value.value_counts()\n",

@ -10,40 +10,65 @@
"name": "python", "name": "python",
"nbconvert_exporter": "python", "nbconvert_exporter": "python",
"pygments_lexer": "ipython3", "pygments_lexer": "ipython3",
"version": 3 "version": "3.7.0"
}, },
"orig_nbformat": 4 "orig_nbformat": 4,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.7.0 64-bit ('3.7')"
},
"interpreter": {
"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
}
}, },
"nbformat": 4, "nbformat": 4,
"nbformat_minor": 2, "nbformat_minor": 2,
"cells": [ "cells": [
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 9,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"[nltk_data] Downloading package vader_lexicon to\n[nltk_data] /Users/jenlooper/nltk_data...\n"
]
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {},
"execution_count": 9
}
],
"source": [ "source": [
"import time\n", "import time\n",
"import pandas as pd\n", "import pandas as pd\n",
"import nltk as nltk\n",
"from nltk.corpus import stopwords\n", "from nltk.corpus import stopwords\n",
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n" "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
"nltk.download('vader_lexicon')\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# Create the vader sentiment analyser (there are others in NLTK you can try too)\n",
"vader_sentiment = SentimentIntensityAnalyzer()\n", "vader_sentiment = SentimentIntensityAnalyzer()\n",
"# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. \n", "\n"
"# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 11,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -59,7 +84,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 12,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -69,7 +94,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 13,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -85,7 +110,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 14,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -96,9 +121,17 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 15,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Removing stop words took 5.77 seconds\n"
]
}
],
"source": [ "source": [
"end = time.time()\n", "end = time.time()\n",
"print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n" "print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
@ -106,9 +139,18 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 16,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Calculating sentiment columns for both positive and negative reviews\n",
"Calculating sentiment took 201.07 seconds\n"
]
}
],
"source": [ "source": [
"# Add a negative sentiment and positive sentiment column\n", "# Add a negative sentiment and positive sentiment column\n",
"print(\"Calculating sentiment columns for both positive and negative reviews\")\n", "print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
@ -121,9 +163,44 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 17,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
" Negative_Review Negative_Sentiment\n",
"186584 So bad experience memories I hotel The first n... -0.9920\n",
"129503 First charged twice room booked booking second... -0.9896\n",
"307286 The staff Had bad experience even booking Janu... -0.9889\n",
"452092 No WLAN room Incredibly rude restaurant staff ... -0.9884\n",
"201293 We usually traveling Paris 2 3 times year busi... -0.9873\n",
"... ... ...\n",
"26899 I would say however one night expensive even d... 0.9933\n",
"138365 Wifi terribly slow I speed test network upload... 0.9938\n",
"79215 I find anything hotel first I walked past hote... 0.9938\n",
"278506 The property great location There bakery next ... 0.9945\n",
"339189 Guys I like hotel I wish return next year Howe... 0.9948\n",
"\n",
"[515738 rows x 2 columns]\n",
" Positive_Review Positive_Sentiment\n",
"137893 Bathroom Shower We going stay twice hotel 2 ni... -0.9820\n",
"5839 I completely disappointed mad since reception ... -0.9780\n",
"64158 get everything extra internet parking breakfas... -0.9751\n",
"124178 I didnt like anythig Room small Asked upgrade ... -0.9721\n",
"489137 Very rude manager abusive staff reception Dirt... -0.9703\n",
"... ... ...\n",
"331570 Everything This recently renovated hotel class... 0.9984\n",
"322920 From moment stepped doors Guesthouse Hotel sta... 0.9985\n",
"293710 This place surprise expected good actually gre... 0.9985\n",
"417442 We celebrated wedding night Langham I commend ... 0.9985\n",
"132492 We arrived super cute boutique hotel area expl... 0.9987\n",
"\n",
"[515738 rows x 2 columns]\n"
]
}
],
"source": [ "source": [
"df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n", "df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
"print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n", "print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
@ -133,7 +210,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 18,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -143,13 +220,28 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 19,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Saving results to Hotel_Reviews_NLP.csv\n"
]
}
],
"source": [ "source": [
"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n", "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
"df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n" "df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
] ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
} }
] ]
} }
Loading…
Cancel
Save