|
|
@ -10,40 +10,65 @@
|
|
|
|
"name": "python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": 3
|
|
|
|
"version": "3.7.0"
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"orig_nbformat": 4,
|
|
|
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
|
|
|
"name": "python3",
|
|
|
|
|
|
|
|
"display_name": "Python 3.7.0 64-bit ('3.7')"
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"orig_nbformat": 4
|
|
|
|
"interpreter": {
|
|
|
|
|
|
|
|
"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
|
|
|
|
|
|
|
|
}
|
|
|
|
},
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 2,
|
|
|
|
"nbformat_minor": 2,
|
|
|
|
"cells": [
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 9,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"[nltk_data] Downloading package vader_lexicon to\n[nltk_data] /Users/jenlooper/nltk_data...\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"output_type": "execute_result",
|
|
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
|
|
"text/plain": [
|
|
|
|
|
|
|
|
"True"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"execution_count": 9
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"import time\n",
|
|
|
|
"import time\n",
|
|
|
|
"import pandas as pd\n",
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
|
|
|
"import nltk as nltk\n",
|
|
|
|
"from nltk.corpus import stopwords\n",
|
|
|
|
"from nltk.corpus import stopwords\n",
|
|
|
|
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n"
|
|
|
|
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
|
|
|
|
|
|
|
|
"nltk.download('vader_lexicon')\n"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 10,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"# Create the vader sentiment analyser (there are others in NLTK you can try too)\n",
|
|
|
|
|
|
|
|
"vader_sentiment = SentimentIntensityAnalyzer()\n",
|
|
|
|
"vader_sentiment = SentimentIntensityAnalyzer()\n",
|
|
|
|
"# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. \n",
|
|
|
|
"\n"
|
|
|
|
"# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
]
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 11,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
@ -59,7 +84,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 12,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
@ -69,7 +94,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 13,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
@ -85,7 +110,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 14,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
@ -96,9 +121,17 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 15,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"Removing stop words took 5.77 seconds\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"end = time.time()\n",
|
|
|
|
"end = time.time()\n",
|
|
|
|
"print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
|
|
|
|
"print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
|
|
|
@ -106,9 +139,18 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 16,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"Calculating sentiment columns for both positive and negative reviews\n",
|
|
|
|
|
|
|
|
"Calculating sentiment took 201.07 seconds\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"# Add a negative sentiment and positive sentiment column\n",
|
|
|
|
"# Add a negative sentiment and positive sentiment column\n",
|
|
|
|
"print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
|
|
|
|
"print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
|
|
|
@ -121,9 +163,44 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 17,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
" Negative_Review Negative_Sentiment\n",
|
|
|
|
|
|
|
|
"186584 So bad experience memories I hotel The first n... -0.9920\n",
|
|
|
|
|
|
|
|
"129503 First charged twice room booked booking second... -0.9896\n",
|
|
|
|
|
|
|
|
"307286 The staff Had bad experience even booking Janu... -0.9889\n",
|
|
|
|
|
|
|
|
"452092 No WLAN room Incredibly rude restaurant staff ... -0.9884\n",
|
|
|
|
|
|
|
|
"201293 We usually traveling Paris 2 3 times year busi... -0.9873\n",
|
|
|
|
|
|
|
|
"... ... ...\n",
|
|
|
|
|
|
|
|
"26899 I would say however one night expensive even d... 0.9933\n",
|
|
|
|
|
|
|
|
"138365 Wifi terribly slow I speed test network upload... 0.9938\n",
|
|
|
|
|
|
|
|
"79215 I find anything hotel first I walked past hote... 0.9938\n",
|
|
|
|
|
|
|
|
"278506 The property great location There bakery next ... 0.9945\n",
|
|
|
|
|
|
|
|
"339189 Guys I like hotel I wish return next year Howe... 0.9948\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"[515738 rows x 2 columns]\n",
|
|
|
|
|
|
|
|
" Positive_Review Positive_Sentiment\n",
|
|
|
|
|
|
|
|
"137893 Bathroom Shower We going stay twice hotel 2 ni... -0.9820\n",
|
|
|
|
|
|
|
|
"5839 I completely disappointed mad since reception ... -0.9780\n",
|
|
|
|
|
|
|
|
"64158 get everything extra internet parking breakfas... -0.9751\n",
|
|
|
|
|
|
|
|
"124178 I didnt like anythig Room small Asked upgrade ... -0.9721\n",
|
|
|
|
|
|
|
|
"489137 Very rude manager abusive staff reception Dirt... -0.9703\n",
|
|
|
|
|
|
|
|
"... ... ...\n",
|
|
|
|
|
|
|
|
"331570 Everything This recently renovated hotel class... 0.9984\n",
|
|
|
|
|
|
|
|
"322920 From moment stepped doors Guesthouse Hotel sta... 0.9985\n",
|
|
|
|
|
|
|
|
"293710 This place surprise expected good actually gre... 0.9985\n",
|
|
|
|
|
|
|
|
"417442 We celebrated wedding night Langham I commend ... 0.9985\n",
|
|
|
|
|
|
|
|
"132492 We arrived super cute boutique hotel area expl... 0.9987\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"[515738 rows x 2 columns]\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
|
|
|
|
"df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
|
|
|
|
"print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
|
|
|
|
"print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
|
|
|
@ -133,7 +210,7 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 18,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
@ -143,13 +220,28 @@
|
|
|
|
},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"execution_count": 19,
|
|
|
|
"metadata": {},
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"Saving results to Hotel_Reviews_NLP.csv\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"source": [
|
|
|
|
"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
|
|
|
|
"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
|
|
|
|
"df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
|
|
|
|
"df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
|
|
|
|
]
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": []
|
|
|
|
}
|
|
|
|
}
|
|
|
|
]
|
|
|
|
]
|
|
|
|
}
|
|
|
|
}
|