diff --git a/6-NLP/5-Hotel-Reviews-2/notebook.ipynb b/6-NLP/5-Hotel-Reviews-2/notebook.ipynb index e69de29bb..2ebbfed03 100644 --- a/6-NLP/5-Hotel-Reviews-2/notebook.ipynb +++ b/6-NLP/5-Hotel-Reviews-2/notebook.ipynb @@ -0,0 +1,396 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "df76afb5", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hotel_Address\n", + "London, United Kingdom 262301\n", + "Barcelona, Spain 60149\n", + "Paris, France 59928\n", + "Amsterdam, Netherlands 57214\n", + "Vienna, Austria 38939\n", + "Milan, Italy 37207\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import time\n", + "import ast\n", + "df = pd.read_csv('../data/Hotel_Reviews.csv')\n", + "def replace_address(row):\n", + "    \"\"\"Return a short 'City, Country' label for the row's Hotel_Address.\n", + "\n", + "    Keywords are tested in order and the first one found in the address\n", + "    wins. An address that matches none of them is returned unchanged.\n", + "    \"\"\"\n", + "    labels = (\n", + "        (\"Netherlands\", \"Amsterdam, Netherlands\"),\n", + "        (\"Barcelona\", \"Barcelona, Spain\"),\n", + "        (\"United Kingdom\", \"London, United Kingdom\"),\n", + "        (\"Milan\", \"Milan, Italy\"),\n", + "        (\"France\", \"Paris, France\"),\n", + "        (\"Vienna\", \"Vienna, Austria\"),\n", + "    )\n", + "    address = row[\"Hotel_Address\"]\n", + "    for keyword, label in labels:\n", + "        if keyword in address:\n", + "            return label\n", + "    # Fall back to the raw address: an implicit None would be stored as a\n", + "    # missing value, which value_counts() leaves out of its totals by default.\n", + "    return address\n", + "\n", + "# Replace all the addresses with a shortened, more useful form\n", + "df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)\n", + "# The sum of the value_counts() should add up to the total number of reviews\n", + "print(df[\"Hotel_Address\"].value_counts())" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e8d5be1d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Hotel_Name
Hotel_Address
Amsterdam, Netherlands105
Barcelona, Spain211
London, United Kingdom400
Milan, Italy162
Paris, France458
Vienna, Austria158
\n", + "
" + ], + "text/plain": [ + " Hotel_Name\n", + "Hotel_Address \n", + "Amsterdam, Netherlands 105\n", + "Barcelona, Spain 211\n", + "London, United Kingdom 400\n", + "Milan, Italy 162\n", + "Paris, France 458\n", + "Vienna, Austria 158" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "display(df.groupby(\"Hotel_Address\").agg({\"Hotel_Name\":\"nunique\"}))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "3589cc50", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "import pandas as pd\n", + "start = time.time()\n", + "df = pd.read_csv('../data/Hotel_Reviews.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "00950a13", + "metadata": {}, + "outputs": [], + "source": [ + "df.drop([\"lat\", \"lng\"], axis = 1, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9fb154fe", + "metadata": {}, + "outputs": [], + "source": [ + "df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b3119eab", + "metadata": {}, + "outputs": [], + "source": [ + "df.drop([\"Additional_Number_of_Scoring\"], axis = 1, inplace = True)\n", + "df.Total_Number_of_Reviews = df.groupby(\"Hotel_Name\")[\"Total_Number_of_Reviews\"].transform(\"count\")\n", + "df.Average_Score = round(df.groupby(\"Hotel_Name\")[\"Reviewer_Score\"].transform(\"mean\"), 1)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "5316be2d", + "metadata": {}, + "outputs": [], + "source": [ + "# Process the Tags into new columns\n", + "# The file Hotel_Reviews_Tags.py, identifies the most important tags\n", + "# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, \n", + "# Family with young children, Family with older children, With a pet\n", + "df[\"Leisure_trip\"] = df.Tags.apply(lambda tag: 1 if \"Leisure trip\" in tag else 0)\n", + "df[\"Couple\"] = 
df.Tags.apply(lambda tag: 1 if \"Couple\" in tag else 0)\n", + "df[\"Solo_traveler\"] = df.Tags.apply(lambda tag: 1 if \"Solo traveler\" in tag else 0)\n", + "df[\"Business_trip\"] = df.Tags.apply(lambda tag: 1 if \"Business trip\" in tag else 0)\n", + "df[\"Group\"] = df.Tags.apply(lambda tag: 1 if \"Group\" in tag or \"Travelers with friends\" in tag else 0)\n", + "df[\"Family_with_young_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with young children\" in tag else 0)\n", + "df[\"Family_with_older_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with older children\" in tag else 0)\n", + "df[\"With_a_pet\"] = df.Tags.apply(lambda tag: 1 if \"With a pet\" in tag else 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3c52267c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package vader_lexicon to\n", + "[nltk_data] C:\\Users\\user\\AppData\\Roaming\\nltk_data...\n", + "[nltk_data] Package vader_lexicon is already up-to-date!\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving results to Hotel_Reviews_NLP.csv\n" + ] + } + ], + "source": [ + "import time \n", + "import pandas as pd\n", + "import nltk as nltk\n", + "from nltk.corpus import stopwords\n", + "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", + "nltk.download('vader_lexicon')\n", + "df = pd.read_csv('../data/Hotel_Reviews_Filtered.csv')\n", + "sia = SentimentIntensityAnalyzer()\n", + "# A set keeps the per-token membership test in clean_text constant-time\n", + "stop_words = set(stopwords.words('english'))\n", + "def clean_text(text):\n", + "    \"\"\"Return text lower-cased and reduced to its alphabetic, non-stop-word tokens.\n", + "\n", + "    Tokens are joined with single spaces; a null value yields an empty string.\n", + "    \"\"\"\n", + "    if pd.isnull(text):\n", + "        return \"\"\n", + "    tokens = nltk.word_tokenize(text.lower())\n", + "    return \" \".join(t for t in tokens if t.isalpha() and t not in stop_words)\n", + "# Apply sentiment analysis using VADER (on raw text here)\n", + "df['Positive_Review_Sentiment'] = df['Positive_Review'].apply(lambda x: 
sia.polarity_scores(str(x))['compound'])\n", + "df['Negative_Review_Sentiment'] = df['Negative_Review'].apply(lambda x: sia.polarity_scores(str(x))['compound'])\n", + "\n", + "# Optional: combine positive and negative sentiments for an overall score\n", + "df['Overall_Sentiment'] = df[['Positive_Review_Sentiment', 'Negative_Review_Sentiment']].mean(axis=1)\n", + "\n", + "# Save the dataframe with new NLP data\n", + "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n", + "df.to_csv('../data/Hotel_Reviews_NLP.csv', index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "736c9d54", + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import stopwords\n", + "df = pd.read_csv(\"../data/Hotel_Reviews_Filtered.csv\")\n", + "start = time.time()\n", + "cache = set(stopwords.words('english'))\n", + "def remove_stopwords(review):\n", + "    \"\"\"Return review with the words found in `cache` removed, space-separated.\"\"\"\n", + "    # Join on a space, not on \"\": gluing the words together produced one long\n", + "    # token per review and turned \"No Negative\" into \"NoNegative\", so the\n", + "    # placeholder check in calc_sentiment could never match.\n", + "    return \" \".join(word for word in review.split() if word not in cache)\n", + "df.Negative_Review = df.Negative_Review.apply(remove_stopwords)\n", + "df.Positive_Review = df.Positive_Review.apply(remove_stopwords)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "aa8329d7", + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", + "vader_sentiment = SentimentIntensityAnalyzer()\n", + "def calc_sentiment(review):\n", + "    \"\"\"Return the VADER compound score for review, or 0 for the placeholder texts.\"\"\"\n", + "    # \"No Negative\" / \"No Positive\" are placeholder texts, scored as neutral\n", + "    if review == \"No Negative\" or review == \"No Positive\":\n", + "        return 0\n", + "    return vader_sentiment.polarity_scores(review)[\"compound\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "944cabbc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculating sentiment columns for both positive and negative reviews\n", + "Calculationg sentiment took27.21 Seconds\n" + ] + } + ], + "source": [ + "print(\"Calculating sentiment columns for both positive and negative reviews\")\n", + "start = time.time()\n", + 
"df[\"Negative_Sentiment\"] = df.Negative_Review.apply(calc_sentiment)\n", + "df[\"Positive_Sentiment\"] = df.Positive_Review.apply(calc_sentiment)\n", + "end = time.time()\n", + "print(f\"Calculating sentiment took {round(end - start, 2)} Seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "1b530472", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Negative_Review Negative_Sentiment\n", + "349307 Cheating -0.5574\n", + "52956 Horrible -0.5423\n", + "114472 bad -0.5423\n", + "478167 Bad -0.5423\n", + "147464 bad -0.5423\n", + "... ... ...\n", + "317800 Great 0.6249\n", + "397663 Awesome 0.6249\n", + "482922 great 0.6249\n", + "478966 Great 0.6249\n", + "119192 great 0.6249\n", + "\n", + "[515738 rows x 2 columns]\n", + " Positive_Review Positive_Sentiment\n", + "235836 disaster -0.6249\n", + "501482 Bad -0.5423\n", + "409738 bad -0.5423\n", + "427819 bad -0.5423\n", + "209542 bad -0.5423\n", + "... ... 
...\n", + "429765 Love 0.6369\n", + "36292 BEST 0.6369\n", + "326446 best 0.6369\n", + "362320 Love 0.6369\n", + "232187 Love 0.6369\n", + "\n", + "[515738 rows x 2 columns]\n" + ] + } + ], + "source": [ + "df = df.sort_values(by = [\"Negative_Sentiment\"], ascending=True)\n", + "print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n", + "df = df.sort_values(by=[\"Positive_Sentiment\"], ascending = True)\n", + "print(df[[\"Positive_Review\", \"Positive_Sentiment\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "62c07395", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saving results to Hotel_Reviews_NLP.csv\n" + ] + } + ], + "source": [ + "df = df.reindex([\"Hotel_Name\", \"Hotel_Address\", \"Total_Number_of_Reviews\", \"Average_Score\", \"Reviewer_Score\", \"Negative_Sentiment\", \"Positive_Sentiment\", \"Reviewer_Nationality\", \"Leisure_trip\", \"Couple\", \"Solo_traveler\", \"Business_trip\", \"Group\", \"Family_with_young_children\", \"Family_with_older_children\", \"With_a_pet\", \"Negative_Review\", \"Positive_Review\"], axis = 1)\n", + "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n", + "df.to_csv(r\"../data/Hotel_Reviews_NLP.csv\", index= False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}