You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
155 lines
4.9 KiB
155 lines
4.9 KiB
4 years ago
|
{
|
||
|
"metadata": {
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
|
"version": 3
|
||
|
},
|
||
|
"orig_nbformat": 4
|
||
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2,
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import time\n",
|
||
|
"import pandas as pd\n",
|
||
|
"from nltk.corpus import stopwords\n",
|
||
|
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Create the VADER sentiment analyser (there are others in NLTK you can try too)\n",
|
||
|
"vader_sentiment = SentimentIntensityAnalyzer()\n",
|
||
|
"# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. \n",
|
||
|
"# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# There are 3 possibilities of input for a review:\n",
|
||
|
"# It could be \"No Negative\", in which case, return 0\n",
|
||
|
"# It could be \"No Positive\", in which case, return 0\n",
|
||
|
"# It could be a review, in which case calculate the sentiment\n",
|
||
|
"def calc_sentiment(review): \n",
|
||
|
" if review == \"No Negative\" or review == \"No Positive\":\n",
|
||
|
" return 0\n",
|
||
|
" return vader_sentiment.polarity_scores(review)[\"compound\"] \n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Load the hotel reviews from CSV\n",
|
||
|
"df = pd.read_csv(\"../../data/Hotel_Reviews_Filtered.csv\")\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Remove stop words - can be slow for a lot of text!\n",
|
||
|
"# Ryan Han (ryanxjhan on Kaggle) has a great post measuring the performance of different stop-word removal approaches:\n",
|
||
|
"# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal (this cell uses the approach that Ryan recommends)\n",
|
||
|
"start = time.time()\n",
|
||
|
"cache = set(stopwords.words(\"english\"))\n",
|
||
|
"def remove_stopwords(review):\n",
|
||
|
" text = \" \".join([word for word in review.split() if word not in cache])\n",
|
||
|
" return text\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Remove the stop words from both columns\n",
|
||
|
"df.Negative_Review = df.Negative_Review.apply(remove_stopwords) \n",
|
||
|
"df.Positive_Review = df.Positive_Review.apply(remove_stopwords)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"end = time.time()\n",
|
||
|
"print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Add a negative sentiment and positive sentiment column\n",
|
||
|
"print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
|
||
|
"start = time.time()\n",
|
||
|
"df[\"Negative_Sentiment\"] = df.Negative_Review.apply(calc_sentiment)\n",
|
||
|
"df[\"Positive_Sentiment\"] = df.Positive_Review.apply(calc_sentiment)\n",
|
||
|
"end = time.time()\n",
|
||
|
"print(\"Calculating sentiment took \" + str(round(end - start, 2)) + \" seconds\")\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
|
||
|
"print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
|
||
|
"df = df.sort_values(by=[\"Positive_Sentiment\"], ascending=True)\n",
|
||
|
"print(df[[\"Positive_Review\", \"Positive_Sentiment\"]])\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Reorder the columns (this is purely cosmetic, but it makes the data easier to explore later)\n",
|
||
|
"df = df.reindex([\"Hotel_Name\", \"Hotel_Address\", \"Total_Number_of_Reviews\", \"Average_Score\", \"Reviewer_Score\", \"Negative_Sentiment\", \"Positive_Sentiment\", \"Reviewer_Nationality\", \"Leisure_trip\", \"Couple\", \"Solo_traveler\", \"Business_trip\", \"Group\", \"Family_with_young_children\", \"Family_with_older_children\", \"With_a_pet\", \"Negative_Review\", \"Positive_Review\"], axis=1)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
|
"execution_count": null,
|
||
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
|
||
|
"df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
|
||
|
]
|
||
|
}
|
||
|
]
|
||
|
}
|