From 74ab6ac0d71620ecc6a228db5d45e58cb196f4e5 Mon Sep 17 00:00:00 2001 From: raygaeta Date: Tue, 6 Feb 2024 09:38:53 -0800 Subject: [PATCH] NLP Hotel Reviews Data Analysis PD --- 6-NLP/4-Hotel-Reviews-1/notebook.ipynb | 586 +++++++++++++++++++++++++ 1 file changed, 586 insertions(+) diff --git a/6-NLP/4-Hotel-Reviews-1/notebook.ipynb b/6-NLP/4-Hotel-Reviews-1/notebook.ipynb index e69de29b..1449297a 100644 --- a/6-NLP/4-Hotel-Reviews-1/notebook.ipynb +++ b/6-NLP/4-Hotel-Reviews-1/notebook.ipynb @@ -0,0 +1,586 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading data file now, this could take a while depending on file size\n", + "Loading took 1.97 seconds\n" + ] + } + ], + "source": [ + "print(\"Loading data file now, this could take a while depending on file size\")\n", + "start = time.time()\n", + "# df is 'DataFrame' - make sure you downloaded the file to the data folder\n", + "df = pd.read_csv('/Users/ray/Desktop/ML-For-Beginners/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews.csv')\n", + "end = time.time()\n", + "print(\"Loading took \" + str(round(end - start, 2)) + \" seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The shape of the data (rows, cols) is (515738, 17)\n" + ] + } + ], + "source": [ + "print(\"The shape of the data (rows, cols) is \" + str(df.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 227 different nationalities\n" + ] + } + ], + "source": [ + "nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n", + "print(\"There are \" + str(nationality_freq.size) + \" different nationalities\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reviewer_Nationality\n", + " United Kingdom 245246\n", + " United States of America 35437\n", + " Australia 21686\n", + " Ireland 14827\n", + " United Arab Emirates 10235\n", + " ... \n", + " Cape Verde 1\n", + " Northern Mariana Islands 1\n", + " Tuvalu 1\n", + " Guinea 1\n", + " Palau 1\n", + "Name: count, Length: 227, dtype: int64\n" + ] + } + ], + "source": [ + "print(nationality_freq) " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The highest frequency reviewer nationality is United Kingdom with 245246 reviews.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/1920668807.py:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")\n" + ] + } + ], + "source": [ + "print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The next 10 highest frequency reviewer nationalities are:\n", + "Reviewer_Nationality\n", + " United States of America 35437\n", + " Australia 21686\n", + " Ireland 14827\n", + " United Arab Emirates 10235\n", + " Saudi Arabia 8951\n", + " Netherlands 8772\n", + " Switzerland 8678\n", + " Germany 7941\n", + " Canada 7894\n", + " France 7296\n" + ] + } + ], + "source": [ + "print(\"The next 10 highest frequency reviewer nationalities are:\")\n", + "print(nationality_freq[1:11].to_string())" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.\n", + "The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.\n", + "The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.\n", + "The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.\n", + "The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.\n", + "The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.\n", + "The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.\n", + "The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.\n", + "The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.\n", + "The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n" + ] + } + ], + "source": [ + "for nat in nationality_freq[:10].index:\n", + " # First, extract all the rows that match the criteria into a new dataframe\n", + " nat_df = df[df[\"Reviewer_Nationality\"] == nat] \n", + " # Now get the hotel freq\n", + " freq = nat_df[\"Hotel_Name\"].value_counts()\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\") \n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "hotel_freq_df = df.drop([\"Hotel_Address\", \"Additional_Number_of_Scoring\", \"Review_Date\", \"Average_Score\", \"Reviewer_Nationality\", \"Negative_Review\", \"Review_Total_Negative_Word_Counts\", \"Positive_Review\", \"Review_Total_Positive_Word_Counts\", \"Total_Number_of_Reviews_Reviewer_Has_Given\", \"Reviewer_Score\", \"Tags\", \"days_since_review\", \"lat\", \"lng\"], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Hotel_NameTotal_Number_of_ReviewsTotal_Reviews_Found
0Hotel Arena1403405
405K K Hotel George1831566
971Apex Temple Court Hotel26191037
2008The Park Grand London Paddington43801770
3778Monhotel Lounge SPA17135
............
511962Suite Hotel 900 m zur Oper3461439
512401Hotel Amadeus717144
512545The Berkeley232100
512645Holiday Inn London Kensington59452768
515413Atlantis Hotel Vienna2823325
\n", + "

1492 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " Hotel_Name Total_Number_of_Reviews \\\n", + "0 Hotel Arena 1403 \n", + "405 K K Hotel George 1831 \n", + "971 Apex Temple Court Hotel 2619 \n", + "2008 The Park Grand London Paddington 4380 \n", + "3778 Monhotel Lounge SPA 171 \n", + "... ... ... \n", + "511962 Suite Hotel 900 m zur Oper 3461 \n", + "512401 Hotel Amadeus 717 \n", + "512545 The Berkeley 232 \n", + "512645 Holiday Inn London Kensington 5945 \n", + "515413 Atlantis Hotel Vienna 2823 \n", + "\n", + " Total_Reviews_Found \n", + "0 405 \n", + "405 566 \n", + "971 1037 \n", + "2008 1770 \n", + "3778 35 \n", + "... ... \n", + "511962 439 \n", + "512401 144 \n", + "512545 100 \n", + "512645 2768 \n", + "515413 325 \n", + "\n", + "[1492 rows x 3 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')\n", + " \n", + " # Get rid of all the duplicated rows\n", + "hotel_freq_df = hotel_freq_df.drop_duplicates(subset = [\"Hotel_Name\"])\n", + "display(hotel_freq_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Average_Score_DifferenceAverage_ScoreCalc_Average_ScoreHotel_Name
495945-0.87.78.5Best Western Hotel Astoria
111027-0.78.89.5Hotel Stendhal Place Vend me Paris MGallery by...
43688-0.77.58.2Mercure Paris Porte d Orleans
178253-0.77.98.6Renaissance Paris Vendome Hotel
218258-0.57.07.5Hotel Royal Elys es
...............
1514160.77.87.1Best Western Allegro Nation
221890.87.16.3Holiday Inn Paris Montparnasse Pasteur
2503080.98.67.7MARQUIS Faubourg St Honor Relais Ch teaux
689360.96.85.9Villa Eugenie
38131.37.25.9Kube Hotel Ice Bar
\n", + "

1492 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " Average_Score_Difference Average_Score Calc_Average_Score \\\n", + "495945 -0.8 7.7 8.5 \n", + "111027 -0.7 8.8 9.5 \n", + "43688 -0.7 7.5 8.2 \n", + "178253 -0.7 7.9 8.6 \n", + "218258 -0.5 7.0 7.5 \n", + "... ... ... ... \n", + "151416 0.7 7.8 7.1 \n", + "22189 0.8 7.1 6.3 \n", + "250308 0.9 8.6 7.7 \n", + "68936 0.9 6.8 5.9 \n", + "3813 1.3 7.2 5.9 \n", + "\n", + " Hotel_Name \n", + "495945 Best Western Hotel Astoria \n", + "111027 Hotel Stendhal Place Vend me Paris MGallery by... \n", + "43688 Mercure Paris Porte d Orleans \n", + "178253 Renaissance Paris Vendome Hotel \n", + "218258 Hotel Royal Elys es \n", + "... ... \n", + "151416 Best Western Allegro Nation \n", + "22189 Holiday Inn Paris Montparnasse Pasteur \n", + "250308 MARQUIS Faubourg St Honor Relais Ch teaux \n", + "68936 Villa Eugenie \n", + "3813 Kube Hotel Ice Bar \n", + "\n", + "[1492 rows x 4 columns]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "def get_difference_review_avg(row):\n", + " return row[\"Average_Score\"] - row[\"Calc_Average_Score\"]\n", + " \n", + " # 'mean' is mathematical word for 'average'\n", + "df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n", + "\n", + "# Add a new column with the difference between the two average scores\n", + "df[\"Average_Score_Difference\"] = df.apply(get_difference_review_avg, axis = 1)\n", + "\n", + "# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n", + "review_scores_df = df.drop_duplicates(subset = [\"Hotel_Name\"])\n", + "\n", + "# Sort the dataframe to find the lowest and highest average score difference\n", + "review_scores_df = review_scores_df.sort_values(by=[\"Average_Score_Difference\"])\n", + "\n", + "display(review_scores_df[[\"Average_Score_Difference\", \"Average_Score\", \"Calc_Average_Score\", \"Hotel_Name\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of No Negative reviews: 127890\n", + "Number of No Positive reviews: 35946\n", + "Number of both No Negative and No Positive reviews: 127\n", + "Sum took 0.17 seconds\n" + ] + } + ], + "source": [ + "start = time.time()\n", + "no_negative_reviews = sum(df.Negative_Review == \"No Negative\")\n", + "print(\"Number of No Negative reviews: \" + str(no_negative_reviews))\n", + "\n", + "no_positive_reviews = sum(df[\"Positive_Review\"] == \"No Positive\")\n", + "print(\"Number of No Positive reviews: \" + str(no_positive_reviews))\n", + "\n", + "both_no_reviews = sum((df.Negative_Review == \"No Negative\") & (df.Positive_Review == \"No Positive\"))\n", + "print(\"Number of both No Negative and No Positive reviews: \" + str(both_no_reviews))\n", + "\n", + "end = time.time()\n", + "print(\"Sum took \" + str(round(end - start, 2)) + \" seconds\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}