pull/915/head
Zarathushtra 2 weeks ago
parent 7a50781922
commit 8fcb80b6a7

@ -0,0 +1,396 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"id": "df76afb5",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hotel_Address\n",
"London, United Kingdom 262301\n",
"Barcelona, Spain 60149\n",
"Paris, France 59928\n",
"Amsterdam, Netherlands 57214\n",
"Vienna, Austria 38939\n",
"Milan, Italy 37207\n",
"Name: count, dtype: int64\n"
]
}
],
"source": [
"import pandas as pd\n",
"import time\n",
"import ast\n",
"# Load the raw hotel reviews from the data folder (relative path)\n",
"df = pd.read_csv(filepath_or_buffer='../data/Hotel_Reviews.csv')\n",
"def replace_address(row):\n",
"    \"\"\"Shorten a full hotel address to a 'City, Country' label.\n",
"\n",
"    `row` is any mapping-like object (for example a DataFrame row) that\n",
"    exposes a \"Hotel_Address\" string. The first keyword found in that\n",
"    address selects the label; keywords are checked in the order listed\n",
"    below. An address that matches no keyword is returned unchanged\n",
"    instead of None, so unexpected rows stay visible in value_counts().\n",
"    \"\"\"\n",
"    address = row[\"Hotel_Address\"]\n",
"    # (keyword to look for, replacement label) - checked top to bottom\n",
"    known_cities = (\n",
"        (\"Netherlands\", \"Amsterdam, Netherlands\"),\n",
"        (\"Barcelona\", \"Barcelona, Spain\"),\n",
"        (\"United Kingdom\", \"London, United Kingdom\"),\n",
"        (\"Milan\", \"Milan, Italy\"),\n",
"        (\"France\", \"Paris, France\"),\n",
"        (\"Vienna\", \"Vienna, Austria\"),\n",
"    )\n",
"    for keyword, label in known_cities:\n",
"        if keyword in address:\n",
"            return label\n",
"    return address\n",
"\n",
"# Swap every full address for its short 'City, Country' label (row-wise apply)\n",
"short_addresses = df.apply(replace_address, axis=\"columns\")\n",
"df[\"Hotel_Address\"] = short_addresses\n",
"# Reviews per city; the counts should sum to the total number of reviews\n",
"reviews_per_city = df[\"Hotel_Address\"].value_counts()\n",
"print(reviews_per_city)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e8d5be1d",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hotel_Name</th>\n",
" </tr>\n",
" <tr>\n",
" <th>Hotel_Address</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>Amsterdam, Netherlands</th>\n",
" <td>105</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Barcelona, Spain</th>\n",
" <td>211</td>\n",
" </tr>\n",
" <tr>\n",
" <th>London, United Kingdom</th>\n",
" <td>400</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Milan, Italy</th>\n",
" <td>162</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Paris, France</th>\n",
" <td>458</td>\n",
" </tr>\n",
" <tr>\n",
" <th>Vienna, Austria</th>\n",
" <td>158</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Hotel_Name\n",
"Hotel_Address \n",
"Amsterdam, Netherlands 105\n",
"Barcelona, Spain 211\n",
"London, United Kingdom 400\n",
"Milan, Italy 162\n",
"Paris, France 458\n",
"Vienna, Austria 158"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Number of distinct hotel names within each shortened address\n",
"hotels_per_city = df.groupby(\"Hotel_Address\").agg(Hotel_Name=(\"Hotel_Name\", \"nunique\"))\n",
"display(hotels_per_city)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "3589cc50",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import pandas as pd\n",
"# Start a timer, then reload the raw reviews file into df\n",
"start = time.time()\n",
"df = pd.read_csv(filepath_or_buffer='../data/Hotel_Reviews.csv')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "00950a13",
"metadata": {},
"outputs": [],
"source": [
"# Remove the lat and lng columns from df in place\n",
"df.drop(columns=[\"lat\", \"lng\"], inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "9fb154fe",
"metadata": {},
"outputs": [],
"source": [
"# Shorten the addresses again, because df was reloaded from the raw file\n",
"df[\"Hotel_Address\"] = df.apply(replace_address, axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "b3119eab",
"metadata": {},
"outputs": [],
"source": [
"# Drop the Additional_Number_of_Scoring column in place\n",
"df.drop(columns=[\"Additional_Number_of_Scoring\"], inplace=True)\n",
"# Recompute the per-hotel review count and mean reviewer score (1 decimal)\n",
"# from the rows present in df, broadcasting each value back to every row\n",
"df[\"Total_Number_of_Reviews\"] = df.groupby(\"Hotel_Name\")[\"Total_Number_of_Reviews\"].transform(\"count\")\n",
"df[\"Average_Score\"] = df.groupby(\"Hotel_Name\")[\"Reviewer_Score\"].transform(\"mean\").round(1)\n"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "5316be2d",
"metadata": {},
"outputs": [],
"source": [
"# Turn the most useful Tags values into 0/1 indicator columns.\n",
"# The file Hotel_Reviews_Tags.py identifies the most important tags:\n",
"# Leisure trip, Couple, Solo traveler, Business trip, Group (combined with\n",
"# Travelers with friends), Family with young children,\n",
"# Family with older children, With a pet\n",
"tag_columns = {\n",
"    \"Leisure_trip\": (\"Leisure trip\",),\n",
"    \"Couple\": (\"Couple\",),\n",
"    \"Solo_traveler\": (\"Solo traveler\",),\n",
"    \"Business_trip\": (\"Business trip\",),\n",
"    \"Group\": (\"Group\", \"Travelers with friends\"),\n",
"    \"Family_with_young_children\": (\"Family with young children\",),\n",
"    \"Family_with_older_children\": (\"Family with older children\",),\n",
"    \"With_a_pet\": (\"With a pet\",),\n",
"}\n",
"# A column gets 1 when any of its phrases occurs in the row's Tags text, else 0\n",
"for column_name, phrases in tag_columns.items():\n",
"    df[column_name] = df.Tags.apply(lambda tag: int(any(phrase in tag for phrase in phrases)))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "3c52267c",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package vader_lexicon to\n",
"[nltk_data] C:\\Users\\user\\AppData\\Roaming\\nltk_data...\n",
"[nltk_data] Package vader_lexicon is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saving results to Hotel_Reviews_NLP.csv\n"
]
}
],
"source": [
"import time \n",
"import pandas as pd\n",
"import nltk as nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
"# Fetch the NLTK resources this notebook relies on: the VADER lexicon for\n",
"# SentimentIntensityAnalyzer and the stopwords corpus for stopwords.words().\n",
"# Without the stopwords download, stopwords.words('english') raises\n",
"# LookupError on a machine that has never fetched that corpus.\n",
"nltk.download('vader_lexicon')\n",
"nltk.download('stopwords')\n",
"# NOTE(review): no visible cell writes Hotel_Reviews_Filtered.csv - confirm it is produced elsewhere\n",
"df = pd.read_csv('../data/Hotel_Reviews_Filtered.csv')\n",
"# Shared analyzer and stopword list used by the rest of this cell\n",
"sia = SentimentIntensityAnalyzer()\n",
"stop_words = stopwords.words('english')\n",
"def clean_text(text):\n",
"    \"\"\"Lower-case `text`, tokenize it, and drop stopwords and non-alphabetic tokens.\n",
"\n",
"    Returns the surviving tokens joined by single spaces, or \"\" when\n",
"    `text` is null according to pd.isnull.\n",
"    \"\"\"\n",
"    if pd.isnull(text):\n",
"        return \"\"\n",
"    lowered_tokens = nltk.word_tokenize(text.lower())\n",
"    kept_tokens = (\n",
"        token for token in lowered_tokens\n",
"        if token.isalpha() and token not in stop_words\n",
"    )\n",
"    return \" \".join(kept_tokens)\n",
"# Score the raw (uncleaned) review text with VADER and keep the compound value\n",
"for source_column, score_column in (\n",
"    ('Positive_Review', 'Positive_Review_Sentiment'),\n",
"    ('Negative_Review', 'Negative_Review_Sentiment'),\n",
"):\n",
"    df[score_column] = df[source_column].apply(lambda x: sia.polarity_scores(str(x))['compound'])\n",
"\n",
"# Optional: average the two compound scores into a single overall score\n",
"sentiment_columns = ['Positive_Review_Sentiment', 'Negative_Review_Sentiment']\n",
"df['Overall_Sentiment'] = df[sentiment_columns].mean(axis=1)\n",
"\n",
"# Write the dataframe, including the new sentiment columns, to disk\n",
"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
"df.to_csv('../data/Hotel_Reviews_NLP.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "736c9d54",
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"# Reload the filtered reviews and start a timer\n",
"df = pd.read_csv(filepath_or_buffer=\"../data/Hotel_Reviews_Filtered.csv\")\n",
"start = time.time()\n",
"# Hold the English stopwords in a set for fast membership tests\n",
"cache = set(stopwords.words('english'))\n",
"def remove_stopwords(review):\n",
"    \"\"\"Return `review` with every stopword removed.\n",
"\n",
"    The review is split on whitespace, words found in the module-level\n",
"    `cache` set are dropped (the comparison is case-sensitive), and the\n",
"    remaining words are re-joined with single spaces. Joining with a\n",
"    space keeps the words separate; joining with an empty string would\n",
"    fuse them into one token and would also turn a placeholder such as\n",
"    \"No Negative\" into \"NoNegative\".\n",
"    \"\"\"\n",
"    kept_words = [word for word in review.split() if word not in cache]\n",
"    return \" \".join(kept_words)\n",
"# Strip stopwords from both review text columns\n",
"for review_column in (\"Negative_Review\", \"Positive_Review\"):\n",
"    df[review_column] = df[review_column].apply(remove_stopwords)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "aa8329d7",
"metadata": {},
"outputs": [],
"source": [
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
"# Single VADER analyzer instance; calc_sentiment below reads it as a global\n",
"vader_sentiment = SentimentIntensityAnalyzer()\n",
"def calc_sentiment(review):\n",
"    \"\"\"Return the VADER compound score for `review`.\n",
"\n",
"    The placeholder texts \"No Negative\" and \"No Positive\" are scored\n",
"    as 0 without calling the analyzer.\n",
"    \"\"\"\n",
"    placeholders = (\"No Negative\", \"No Positive\")\n",
"    if review in placeholders:\n",
"        return 0\n",
"    scores = vader_sentiment.polarity_scores(review)\n",
"    return scores[\"compound\"]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "944cabbc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Calculating sentiment columns for both positive and negative reviews\n",
"Calculationg sentiment took27.21 Seconds\n"
]
}
],
"source": [
"print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
"# Time the two apply() passes that score each review column\n",
"start = time.time()\n",
"df[\"Negative_Sentiment\"] = df.Negative_Review.apply(calc_sentiment)\n",
"df[\"Positive_Sentiment\"] = df.Positive_Review.apply(calc_sentiment)\n",
"end = time.time()\n",
"# Elapsed wall-clock time, rounded to 2 decimals\n",
"print(\"Calculating sentiment took \" + str(round(end - start, 2)) + \" Seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "1b530472",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Negative_Review Negative_Sentiment\n",
"349307 Cheating -0.5574\n",
"52956 Horrible -0.5423\n",
"114472 bad -0.5423\n",
"478167 Bad -0.5423\n",
"147464 bad -0.5423\n",
"... ... ...\n",
"317800 Great 0.6249\n",
"397663 Awesome 0.6249\n",
"482922 great 0.6249\n",
"478966 Great 0.6249\n",
"119192 great 0.6249\n",
"\n",
"[515738 rows x 2 columns]\n",
" Positive_Review Positive_Sentiment\n",
"235836 disaster -0.6249\n",
"501482 Bad -0.5423\n",
"409738 bad -0.5423\n",
"427819 bad -0.5423\n",
"209542 bad -0.5423\n",
"... ... ...\n",
"429765 Love 0.6369\n",
"36292 BEST 0.6369\n",
"326446 best 0.6369\n",
"362320 Love 0.6369\n",
"232187 Love 0.6369\n",
"\n",
"[515738 rows x 2 columns]\n"
]
}
],
"source": [
"# Show the reviews ordered from lowest to highest score, first by the\n",
"# negative-review score and then by the positive-review score.\n",
"# df itself is re-sorted each time, so it ends up ordered by Positive_Sentiment.\n",
"df = df.sort_values(\"Negative_Sentiment\")\n",
"print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
"df = df.sort_values(\"Positive_Sentiment\")\n",
"print(df[[\"Positive_Review\", \"Positive_Sentiment\"]])"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "62c07395",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Saving results to Hotel_Reviews_NLP.csv\n"
]
}
],
"source": [
"# Final column order for the exported file\n",
"output_columns = [\n",
"    \"Hotel_Name\",\n",
"    \"Hotel_Address\",\n",
"    \"Total_Number_of_Reviews\",\n",
"    \"Average_Score\",\n",
"    \"Reviewer_Score\",\n",
"    \"Negative_Sentiment\",\n",
"    \"Positive_Sentiment\",\n",
"    \"Reviewer_Nationality\",\n",
"    \"Leisure_trip\",\n",
"    \"Couple\",\n",
"    \"Solo_traveler\",\n",
"    \"Business_trip\",\n",
"    \"Group\",\n",
"    \"Family_with_young_children\",\n",
"    \"Family_with_older_children\",\n",
"    \"With_a_pet\",\n",
"    \"Negative_Review\",\n",
"    \"Positive_Review\",\n",
"]\n",
"df = df.reindex(columns=output_columns)\n",
"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
"df.to_csv(r\"../data/Hotel_Reviews_NLP.csv\", index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading…
Cancel
Save