parent
ad372f4863
commit
05dcd88fa2
@ -1,57 +0,0 @@
|
|||||||
import pandas as pd
|
|
||||||
import time
|
|
||||||
import ast
|
|
||||||
|
|
||||||
def replace_address(row):
    """Shorten a raw hotel address to a "City, Country" label.

    The dataset contains hotels in exactly six cities, so a substring
    match on a city or country name is enough to identify each one.
    "Netherlands", "United Kingdom" and "France" are matched on the
    country because every hotel in those countries is in a single city.

    Args:
        row: a DataFrame row (or any mapping) with a "Hotel_Address" key.

    Returns:
        The shortened "City, Country" string, or the original address
        unchanged when no known city/country substring matches.
    """
    # Ordered (substring, short form) pairs; first match wins, matching
    # the precedence of the original if/elif chain.
    replacements = (
        ("Netherlands", "Amsterdam, Netherlands"),
        ("Barcelona", "Barcelona, Spain"),
        ("United Kingdom", "London, United Kingdom"),
        ("Milan", "Milan, Italy"),
        ("France", "Paris, France"),
        ("Vienna", "Vienna, Austria"),
    )
    # Uniform subscript access: the original mixed row["Hotel_Address"]
    # with row.Hotel_Address, which breaks for plain mappings.
    address = row["Hotel_Address"]
    for needle, short_form in replacements:
        if needle in address:
            return short_form
    return address
|
|
||||||
|
|
||||||
# Load the hotel reviews from CSV and time the whole filtering pass.
start = time.time()
df = pd.read_csv('Hotel_Reviews.csv')

# dropping columns we will not use:
df.drop(["lat", "lng"], axis = 1, inplace=True)

# Replace all the addresses with a shortened, more useful form
df["Hotel_Address"] = df.apply(replace_address, axis = 1)

# Drop `Additional_Number_of_Scoring`
df.drop(["Additional_Number_of_Scoring"], axis = 1, inplace=True)

# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values.
# Fix: the original `df.groupby('Hotel_Name').transform('count')` returns a
# whole DataFrame (one count column per remaining column), which cannot be
# assigned to a single column; selecting one column first yields the
# per-hotel row count as a Series.
df.Total_Number_of_Reviews = df.groupby('Hotel_Name')['Hotel_Name'].transform('count')
df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)

# Process the Tags into new columns
# The file Hotel_Reviews_Tags.py, identifies the most important tags
# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends,
# Family with young children, Family with older children, With a pet
df["Leisure_trip"] = df.Tags.apply(lambda tag: 1 if "Leisure trip" in tag else 0)
df["Couple"] = df.Tags.apply(lambda tag: 1 if "Couple" in tag else 0)
df["Solo_traveler"] = df.Tags.apply(lambda tag: 1 if "Solo traveler" in tag else 0)
df["Business_trip"] = df.Tags.apply(lambda tag: 1 if "Business trip" in tag else 0)
df["Group"] = df.Tags.apply(lambda tag: 1 if "Group" in tag or "Travelers with friends" in tag else 0)
df["Family_with_young_children"] = df.Tags.apply(lambda tag: 1 if "Family with young children" in tag else 0)
df["Family_with_older_children"] = df.Tags.apply(lambda tag: 1 if "Family with older children" in tag else 0)
df["With_a_pet"] = df.Tags.apply(lambda tag: 1 if "With a pet" in tag else 0)

# No longer need any of these columns
df.drop(["Tags", "Review_Date", "Review_Total_Negative_Word_Counts", "Review_Total_Positive_Word_Counts", "days_since_review", "Total_Number_of_Reviews_Reviewer_Has_Given"], axis = 1, inplace=True)

# Saving new data file with calculated columns
print("Saving results to Hotel_Reviews_Filtered.csv")
df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)
end = time.time()
print("Filtering took " + str(round(end - start, 2)) + " seconds")
|
|
@ -1,56 +0,0 @@
|
|||||||
import time
|
|
||||||
import pandas as pd
|
|
||||||
from nltk.corpus import stopwords
|
|
||||||
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
|
||||||
|
|
||||||
# Create the vader sentiment analyser (there are others in NLTK you can try too)
vader_sentiment = SentimentIntensityAnalyzer()
# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.
# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.

# A review arrives in one of three forms:
#   - the placeholder "No Negative"  -> score 0
#   - the placeholder "No Positive"  -> score 0
#   - actual review text             -> VADER compound score
def calc_sentiment(review):
    """Return the VADER compound sentiment of *review*.

    The dataset marks an absent review with the placeholder strings
    "No Negative" / "No Positive"; those score a neutral 0. Any other
    text gets the analyser's compound score.
    """
    if review in ("No Negative", "No Positive"):
        return 0
    return vader_sentiment.polarity_scores(review)["compound"]
|
|
||||||
|
|
||||||
# Load the filtered hotel reviews from CSV
df = pd.read_csv("Hotel_Reviews_Filtered.csv")

# Remove stop words - can be slow for a lot of text!
# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches
# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends
start = time.time()
# Build the stop-word set once up front; set membership keeps the
# per-word test O(1) across the whole corpus.
cache = set(stopwords.words("english"))

def remove_stopwords(review):
    """Return *review* with English stop words dropped (whitespace-normalised)."""
    kept = (word for word in review.split() if word not in cache)
    return " ".join(kept)

# Remove the stop words from both columns
df.Negative_Review = df.Negative_Review.apply(remove_stopwords)
df.Positive_Review = df.Positive_Review.apply(remove_stopwords)

end = time.time()
print(f"Removing stop words took {round(end - start, 2)} seconds")

# Add a negative sentiment and positive sentiment column
print("Calculating sentiment columns for both positive and negative reviews")
start = time.time()
df["Negative_Sentiment"] = df.Negative_Review.apply(calc_sentiment)
df["Positive_Sentiment"] = df.Positive_Review.apply(calc_sentiment)
end = time.time()
print(f"Calculating sentiment took {round(end - start, 2)} seconds")

# Print the extremes first so they are easy to eyeball in the output
df = df.sort_values(by=["Negative_Sentiment"], ascending=True)
print(df[["Negative_Review", "Negative_Sentiment"]])
df = df.sort_values(by=["Positive_Sentiment"], ascending=True)
print(df[["Positive_Review", "Positive_Sentiment"]])

# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)
column_order = [
    "Hotel_Name", "Hotel_Address", "Total_Number_of_Reviews", "Average_Score",
    "Reviewer_Score", "Negative_Sentiment", "Positive_Sentiment",
    "Reviewer_Nationality", "Leisure_trip", "Couple", "Solo_traveler",
    "Business_trip", "Group", "Family_with_young_children",
    "Family_with_older_children", "With_a_pet", "Negative_Review",
    "Positive_Review",
]
df = df.reindex(column_order, axis=1)

print("Saving results to Hotel_Reviews_NLP.csv")
df.to_csv(r"Hotel_Reviews_NLP.csv", index = False)
|
|
@ -0,0 +1,143 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2,
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import time\n",
|
||||||
|
"import ast"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def replace_address(row):\n",
|
||||||
|
" if \"Netherlands\" in row[\"Hotel_Address\"]:\n",
|
||||||
|
" return \"Amsterdam, Netherlands\"\n",
|
||||||
|
" elif \"Barcelona\" in row[\"Hotel_Address\"]:\n",
|
||||||
|
" return \"Barcelona, Spain\"\n",
|
||||||
|
" elif \"United Kingdom\" in row[\"Hotel_Address\"]:\n",
|
||||||
|
" return \"London, United Kingdom\"\n",
|
||||||
|
" elif \"Milan\" in row[\"Hotel_Address\"]: \n",
|
||||||
|
" return \"Milan, Italy\"\n",
|
||||||
|
" elif \"France\" in row[\"Hotel_Address\"]:\n",
|
||||||
|
" return \"Paris, France\"\n",
|
||||||
|
" elif \"Vienna\" in row[\"Hotel_Address\"]:\n",
|
||||||
|
" return \"Vienna, Austria\" \n",
|
||||||
|
" else:\n",
|
||||||
|
" return row.Hotel_Address\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Load the hotel reviews from CSV\n",
|
||||||
|
"start = time.time()\n",
|
||||||
|
"df = pd.read_csv('../../data/Hotel_Reviews.csv')\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# dropping columns we will not use:\n",
|
||||||
|
"df.drop([\"lat\", \"lng\"], axis = 1, inplace=True)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Replace all the addresses with a shortened, more useful form\n",
|
||||||
|
"df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Drop `Additional_Number_of_Scoring`\n",
|
||||||
|
"df.drop([\"Additional_Number_of_Scoring\"], axis = 1, inplace=True)\n",
|
||||||
|
"# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values\n",
|
||||||
|
"df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')\n",
|
||||||
|
"df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Process the Tags into new columns\n",
|
||||||
|
"# The file Hotel_Reviews_Tags.py, identifies the most important tags\n",
|
||||||
|
"# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, \n",
|
||||||
|
"# Family with young children, Family with older children, With a pet\n",
|
||||||
|
"df[\"Leisure_trip\"] = df.Tags.apply(lambda tag: 1 if \"Leisure trip\" in tag else 0)\n",
|
||||||
|
"df[\"Couple\"] = df.Tags.apply(lambda tag: 1 if \"Couple\" in tag else 0)\n",
|
||||||
|
"df[\"Solo_traveler\"] = df.Tags.apply(lambda tag: 1 if \"Solo traveler\" in tag else 0)\n",
|
||||||
|
"df[\"Business_trip\"] = df.Tags.apply(lambda tag: 1 if \"Business trip\" in tag else 0)\n",
|
||||||
|
"df[\"Group\"] = df.Tags.apply(lambda tag: 1 if \"Group\" in tag or \"Travelers with friends\" in tag else 0)\n",
|
||||||
|
"df[\"Family_with_young_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with young children\" in tag else 0)\n",
|
||||||
|
"df[\"Family_with_older_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with older children\" in tag else 0)\n",
|
||||||
|
"df[\"With_a_pet\"] = df.Tags.apply(lambda tag: 1 if \"With a pet\" in tag else 0)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# No longer need any of these columns\n",
|
||||||
|
"df.drop([\"Tags\", \"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Saving new data file with calculated columns\n",
|
||||||
|
"print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
|
||||||
|
"df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)\n",
|
||||||
|
"end = time.time()\n",
|
||||||
|
"print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
@ -0,0 +1,155 @@
|
|||||||
|
{
|
||||||
|
"metadata": {
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2,
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import time\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"from nltk.corpus import stopwords\n",
|
||||||
|
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Create the vader sentiment analyser (there are others in NLTK you can try too)\n",
|
||||||
|
"vader_sentiment = SentimentIntensityAnalyzer()\n",
|
||||||
|
"# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. \n",
|
||||||
|
"# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# There are 3 possibilities of input for a review:\n",
|
||||||
|
"# It could be \"No Negative\", in which case, return 0\n",
|
||||||
|
"# It could be \"No Positive\", in which case, return 0\n",
|
||||||
|
"# It could be a review, in which case calculate the sentiment\n",
|
||||||
|
"def calc_sentiment(review): \n",
|
||||||
|
" if review == \"No Negative\" or review == \"No Positive\":\n",
|
||||||
|
" return 0\n",
|
||||||
|
" return vader_sentiment.polarity_scores(review)[\"compound\"] \n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Load the hotel reviews from CSV\n",
|
||||||
|
"df = pd.read_csv(\"../../data/Hotel_Reviews_Filtered.csv\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Remove stop words - can be slow for a lot of text!\n",
|
||||||
|
"# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches\n",
|
||||||
|
"# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends\n",
|
||||||
|
"start = time.time()\n",
|
||||||
|
"cache = set(stopwords.words(\"english\"))\n",
|
||||||
|
"def remove_stopwords(review):\n",
|
||||||
|
" text = \" \".join([word for word in review.split() if word not in cache])\n",
|
||||||
|
" return text\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Remove the stop words from both columns\n",
|
||||||
|
"df.Negative_Review = df.Negative_Review.apply(remove_stopwords) \n",
|
||||||
|
"df.Positive_Review = df.Positive_Review.apply(remove_stopwords)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"end = time.time()\n",
|
||||||
|
"print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Add a negative sentiment and positive sentiment column\n",
|
||||||
|
"print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
|
||||||
|
"start = time.time()\n",
|
||||||
|
"df[\"Negative_Sentiment\"] = df.Negative_Review.apply(calc_sentiment)\n",
|
||||||
|
"df[\"Positive_Sentiment\"] = df.Positive_Review.apply(calc_sentiment)\n",
|
||||||
|
"end = time.time()\n",
|
||||||
|
"print(\"Calculating sentiment took \" + str(round(end - start, 2)) + \" seconds\")\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
|
||||||
|
"print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
|
||||||
|
"df = df.sort_values(by=[\"Positive_Sentiment\"], ascending=True)\n",
|
||||||
|
"print(df[[\"Positive_Review\", \"Positive_Sentiment\"]])\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)\n",
|
||||||
|
"df = df.reindex([\"Hotel_Name\", \"Hotel_Address\", \"Total_Number_of_Reviews\", \"Average_Score\", \"Reviewer_Score\", \"Negative_Sentiment\", \"Positive_Sentiment\", \"Reviewer_Nationality\", \"Leisure_trip\", \"Couple\", \"Solo_traveler\", \"Business_trip\", \"Group\", \"Family_with_young_children\", \"Family_with_older_children\", \"With_a_pet\", \"Negative_Review\", \"Positive_Review\"], axis=1)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
|
||||||
|
"df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
Loading…
Reference in new issue