parent
7a50781922
commit
8fcb80b6a7
@ -0,0 +1,396 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "df76afb5",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Hotel_Address\n",
|
||||
"London, United Kingdom 262301\n",
|
||||
"Barcelona, Spain 60149\n",
|
||||
"Paris, France 59928\n",
|
||||
"Amsterdam, Netherlands 57214\n",
|
||||
"Vienna, Austria 38939\n",
|
||||
"Milan, Italy 37207\n",
|
||||
"Name: count, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"import ast\n",
|
||||
"df = pd.read_csv('../data/Hotel_Reviews.csv')\n",
|
# Ordered (keyword, short label) pairs. The first keyword found in the raw
# address wins, so the order mirrors the precedence of the original
# if/elif chain.
_CITY_BY_KEYWORD = (
    ("Netherlands", "Amsterdam, Netherlands"),
    ("Barcelona", "Barcelona, Spain"),
    ("United Kingdom", "London, United Kingdom"),
    ("Milan", "Milan, Italy"),
    ("France", "Paris, France"),
    ("Vienna", "Vienna, Austria"),
)


def replace_address(row):
    """Collapse a hotel's full address into a short "City, Country" label.

    Args:
        row: Mapping-like object (for example a DataFrame row passed by
            ``df.apply(..., axis=1)``) exposing a "Hotel_Address" entry.

    Returns:
        The label paired with the first keyword found in the address.
        An address that is not a string, or that contains none of the
        keywords, is returned unchanged rather than being turned into
        ``None``; this keeps unmatched rows visible in ``value_counts()``
        instead of silently dropping them.
    """
    address = row["Hotel_Address"]
    # Missing values (e.g. NaN) do not support substring checks.
    if not isinstance(address, str):
        return address
    for keyword, label in _CITY_BY_KEYWORD:
        if keyword in address:
            return label
    return address
|
||||
"\n",
|
||||
"# Replace all the addresses with a shortened, more useful form\n",
|
||||
"df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)\n",
|
||||
"# The sum of the value_counts() should add up to the total number of reviews\n",
|
||||
"print(df[\"Hotel_Address\"].value_counts())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "e8d5be1d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Hotel_Name</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Hotel_Address</th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>Amsterdam, Netherlands</th>\n",
|
||||
" <td>105</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Barcelona, Spain</th>\n",
|
||||
" <td>211</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>London, United Kingdom</th>\n",
|
||||
" <td>400</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Milan, Italy</th>\n",
|
||||
" <td>162</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Paris, France</th>\n",
|
||||
" <td>458</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>Vienna, Austria</th>\n",
|
||||
" <td>158</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Hotel_Name\n",
|
||||
"Hotel_Address \n",
|
||||
"Amsterdam, Netherlands 105\n",
|
||||
"Barcelona, Spain 211\n",
|
||||
"London, United Kingdom 400\n",
|
||||
"Milan, Italy 162\n",
|
||||
"Paris, France 458\n",
|
||||
"Vienna, Austria 158"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"display(df.groupby(\"Hotel_Address\").agg({\"Hotel_Name\":\"nunique\"}))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "3589cc50",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import time\n",
|
||||
"import pandas as pd\n",
|
||||
"start = time.time()\n",
|
||||
"df = pd.read_csv('../data/Hotel_Reviews.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "00950a13",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df.drop([\"lat\", \"lng\"], axis = 1, inplace = True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "9fb154fe",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "b3119eab",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Remove the Additional_Number_of_Scoring column from the frame.
df.drop(columns=["Additional_Number_of_Scoring"], inplace=True)

# Recompute the per-hotel statistics from the rows present in this frame.
# Bracket assignment is used on purpose: attribute-style assignment
# (df.col = ...) only updates a column that already exists and otherwise
# creates a plain Python attribute on the DataFrame object.
# NOTE(review): grouping is by Hotel_Name only, so distinct hotels that
# share a name would be pooled together - confirm names are unique.
df["Total_Number_of_Reviews"] = (
    df.groupby("Hotel_Name")["Total_Number_of_Reviews"].transform("count")
)
df["Average_Score"] = (
    df.groupby("Hotel_Name")["Reviewer_Score"].transform("mean").round(1)
)
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "5316be2d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
# Process the Tags into new indicator columns.
# The file Hotel_Reviews_Tags.py identifies the most important tags:
# Leisure trip, Couple, Solo traveler, Business trip, Group combined with
# Travelers with friends, Family with young children,
# Family with older children, With a pet.

# New column name -> phrases to look for in the raw Tags string.
# A row gets 1 when ANY of the phrases occurs as a substring, else 0.
TAG_COLUMNS = {
    "Leisure_trip": ("Leisure trip",),
    "Couple": ("Couple",),
    "Solo_traveler": ("Solo traveler",),
    "Business_trip": ("Business trip",),
    "Group": ("Group", "Travelers with friends"),
    "Family_with_young_children": ("Family with young children",),
    "Family_with_older_children": ("Family with older children",),
    "With_a_pet": ("With a pet",),
}


def _tag_indicator(tags, phrases):
    """Return 1 if any of ``phrases`` is a substring of ``tags``, else 0."""
    return int(any(phrase in tags for phrase in phrases))


# One pass per output column; dict order keeps the columns in the same
# order in which the original statements created them.
for column, phrases in TAG_COLUMNS.items():
    df[column] = df.Tags.apply(_tag_indicator, args=(phrases,))
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "3c52267c",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[nltk_data] Downloading package vader_lexicon to\n",
|
||||
"[nltk_data] C:\\Users\\user\\AppData\\Roaming\\nltk_data...\n",
|
||||
"[nltk_data] Package vader_lexicon is already up-to-date!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Saving results to Hotel_Reviews_NLP.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import time \n",
|
||||
"import pandas as pd\n",
|
||||
"import nltk as nltk\n",
|
||||
"from nltk.corpus import stopwords\n",
|
||||
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
|
||||
"nltk.download('vader_lexicon')\n",
|
||||
"df = pd.read_csv('../data/Hotel_Reviews_Filtered.csv')\n",
|
||||
"sia = SentimentIntensityAnalyzer()\n",
|
||||
"stop_words = stopwords.words('english')\n",
|
def clean_text(text):
    """Lower-case ``text`` and keep only alphabetic, non-stop-word tokens.

    Args:
        text: Raw review text, or a null value.

    Returns:
        An empty string when ``pd.isnull(text)`` is true; otherwise the
        surviving tokens joined by single spaces.

    Relies on the notebook-level ``stop_words`` collection and on
    ``nltk.word_tokenize`` for tokenisation.
    """
    if pd.isnull(text):
        return ""
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Drop punctuation/number tokens and anything listed as a stop word.
        if token.isalpha() and token not in stop_words:
            kept.append(token)
    return " ".join(kept)
|
||||
" # Apply sentiment analysis using VADER (on raw text here)\n",
|
||||
"df['Positive_Review_Sentiment'] = df['Positive_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])\n",
|
||||
"df['Negative_Review_Sentiment'] = df['Negative_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])\n",
|
||||
"\n",
|
||||
"# Optional: combine positive and negative sentiments for an overall score\n",
|
||||
"df['Overall_Sentiment'] = df[['Positive_Review_Sentiment', 'Negative_Review_Sentiment']].mean(axis=1)\n",
|
||||
"\n",
|
||||
"# Save the dataframe with new NLP data\n",
|
||||
"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
|
||||
"df.to_csv('../data/Hotel_Reviews_NLP.csv', index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"id": "736c9d54",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk.corpus import stopwords\n",
|
||||
"df = pd.read_csv(\"../data/Hotel_Reviews_Filtered.csv\")\n",
|
||||
"start = time.time()\n",
|
||||
"cache = set(stopwords.words('english'))\n",
|
def remove_stopwords(review, stop_words=None):
    """Strip stop words from a review while keeping the words separated.

    Args:
        review: Review text. Anything that is not a string (for example a
            NaN for a missing review) yields an empty string.
        stop_words: Optional collection of words to remove. When omitted,
            the notebook-level ``cache`` set is used.

    Returns:
        The whitespace-split tokens of ``review`` that are not in the
        stop-word collection, joined by single spaces. Comparison is
        case-sensitive and tokens are left in their original case.
    """
    if not isinstance(review, str):
        return ""
    if stop_words is None:
        stop_words = cache
    # Join with a space: joining with "" would fuse all remaining words into
    # one unrecognisable token and also break exact-match checks on
    # multi-word strings such as "No Negative".
    return " ".join(word for word in review.split() if word not in stop_words)
|
||||
"df.Negative_Review = df.Negative_Review.apply(remove_stopwords)\n",
|
||||
"df.Positive_Review = df.Positive_Review.apply(remove_stopwords)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"id": "aa8329d7",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
|
||||
"vader_sentiment = SentimentIntensityAnalyzer()\n",
|
def calc_sentiment(review):
    """Score a review with the notebook-level VADER analyzer.

    Args:
        review: Review text to score.

    Returns:
        0 when ``review`` is exactly "No Negative" or "No Positive"
        (presumably the dataset's markers for an absent review - confirm);
        otherwise the "compound" entry of
        ``vader_sentiment.polarity_scores(review)``.
    """
    if review in ("No Negative", "No Positive"):
        return 0
    scores = vader_sentiment.polarity_scores(review)
    return scores["compound"]
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"id": "944cabbc",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Calculating sentiment columns for both positive and negative reviews\n",
|
||||
"Calculationg sentiment took27.21 Seconds\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
# Score both review columns with calc_sentiment and report the wall-clock
# time the two passes took.
print("Calculating sentiment columns for both positive and negative reviews")
start = time.time()
df["Negative_Sentiment"] = df.Negative_Review.apply(calc_sentiment)
df["Positive_Sentiment"] = df.Positive_Review.apply(calc_sentiment)
end = time.time()
# Message fixed: "Calculationg" typo and the missing space before the number.
print("Calculating sentiment took " + str(round(end - start, 2)) + " Seconds")
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"id": "1b530472",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Negative_Review Negative_Sentiment\n",
|
||||
"349307 Cheating -0.5574\n",
|
||||
"52956 Horrible -0.5423\n",
|
||||
"114472 bad -0.5423\n",
|
||||
"478167 Bad -0.5423\n",
|
||||
"147464 bad -0.5423\n",
|
||||
"... ... ...\n",
|
||||
"317800 Great 0.6249\n",
|
||||
"397663 Awesome 0.6249\n",
|
||||
"482922 great 0.6249\n",
|
||||
"478966 Great 0.6249\n",
|
||||
"119192 great 0.6249\n",
|
||||
"\n",
|
||||
"[515738 rows x 2 columns]\n",
|
||||
" Positive_Review Positive_Sentiment\n",
|
||||
"235836 disaster -0.6249\n",
|
||||
"501482 Bad -0.5423\n",
|
||||
"409738 bad -0.5423\n",
|
||||
"427819 bad -0.5423\n",
|
||||
"209542 bad -0.5423\n",
|
||||
"... ... ...\n",
|
||||
"429765 Love 0.6369\n",
|
||||
"36292 BEST 0.6369\n",
|
||||
"326446 best 0.6369\n",
|
||||
"362320 Love 0.6369\n",
|
||||
"232187 Love 0.6369\n",
|
||||
"\n",
|
||||
"[515738 rows x 2 columns]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = df.sort_values(by = [\"Negative_Sentiment\"], ascending=True)\n",
|
||||
"print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
|
||||
"df = df.sort_values(by=[\"Positive_Sentiment\"], ascending = True)\n",
|
||||
"print(df[[\"Positive_Review\", \"Positive_Sentiment\"]])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"id": "62c07395",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Saving results to Hotel_Reviews_NLP.csv\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = df.reindex([\"Hotel_Name\", \"Hotel_Address\", \"Total_Number_of_Reviews\", \"Average_Score\", \"Reviewer_Score\", \"Negative_Sentiment\", \"Positive_Sentiment\", \"Reviewer_Nationality\", \"Leisure_trip\", \"Couple\", \"Solo_traveler\", \"Business_trip\", \"Group\", \"Family_with_young_children\", \"Family_with_older_children\", \"With_a_pet\", \"Negative_Review\", \"Positive_Review\"], axis = 1)\n",
|
||||
"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
|
||||
"df.to_csv(r\"../data/Hotel_Reviews_NLP.csv\", index= False)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Loading…
Reference in new issue