diff --git a/6-NLP/4-Hotel-Reviews-1/notebook.ipynb b/6-NLP/4-Hotel-Reviews-1/notebook.ipynb index e69de29b..1449297a 100644 --- a/6-NLP/4-Hotel-Reviews-1/notebook.ipynb +++ b/6-NLP/4-Hotel-Reviews-1/notebook.ipynb @@ -0,0 +1,586 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading data file now, this could take a while depending on file size\n", + "Loading took 1.97 seconds\n" + ] + } + ], + "source": [ + "print(\"Loading data file now, this could take a while depending on file size\")\n", + "start = time.time()\n", + "# df is 'DataFrame' - make sure you downloaded the file to the data folder\n", + "df = pd.read_csv('/Users/ray/Desktop/ML-For-Beginners/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews.csv')\n", + "end = time.time()\n", + "print(\"Loading took \" + str(round(end - start, 2)) + \" seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The shape of the data (rows, cols) is (515738, 17)\n" + ] + } + ], + "source": [ + "print(\"The shape of the data (rows, cols) is \" + str(df.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 227 different nationalities\n" + ] + } + ], + "source": [ + "nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n", + "print(\"There are \" + str(nationality_freq.size) + \" different nationalities\")" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reviewer_Nationality\n", + " United Kingdom 245246\n", + " United States of America 35437\n", + " Australia 21686\n", + " Ireland 14827\n", + " United Arab Emirates 10235\n", + " ... \n", + " Cape Verde 1\n", + " Northern Mariana Islands 1\n", + " Tuvalu 1\n", + " Guinea 1\n", + " Palau 1\n", + "Name: count, Length: 227, dtype: int64\n" + ] + } + ], + "source": [ + "print(nationality_freq) " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The highest frequency reviewer nationality is United Kingdom with 245246 reviews.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/1920668807.py:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")\n" + ] + } + ], + "source": [ + "print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The next 10 highest frequency reviewer nationalities are:\n", + "Reviewer_Nationality\n", + " United States of America 35437\n", + " Australia 21686\n", + " Ireland 14827\n", + " United Arab Emirates 10235\n", + " Saudi Arabia 8951\n", + " Netherlands 8772\n", + " Switzerland 8678\n", + " Germany 7941\n", + " Canada 7894\n", + " France 7296\n" + ] + } + ], + "source": [ + "print(\"The next 10 highest frequency reviewer nationalities are:\")\n", + "print(nationality_freq[1:11].to_string())" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.\n", + "The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.\n", + "The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.\n", + "The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.\n", + "The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.\n", + "The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.\n", + "The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.\n", + "The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.\n", + "The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.\n", + "The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n" + ] + } + ], + "source": [ + "for nat in nationality_freq[:10].index:\n", + " # First, extract all the rows that match the criteria into a new dataframe\n", + " nat_df = df[df[\"Reviewer_Nationality\"] == nat] \n", + " # Now get the hotel freq\n", + " freq = nat_df[\"Hotel_Name\"].value_counts()\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\") \n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "hotel_freq_df = df.drop([\"Hotel_Address\", \"Additional_Number_of_Scoring\", \"Review_Date\", \"Average_Score\", \"Reviewer_Nationality\", \"Negative_Review\", \"Review_Total_Negative_Word_Counts\", \"Positive_Review\", \"Review_Total_Positive_Word_Counts\", \"Total_Number_of_Reviews_Reviewer_Has_Given\", \"Reviewer_Score\", \"Tags\", \"days_since_review\", \"lat\", \"lng\"], axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | Hotel_Name | \n", + "Total_Number_of_Reviews | \n", + "Total_Reviews_Found | \n", + "
---|---|---|---|
0 | \n", + "Hotel Arena | \n", + "1403 | \n", + "405 | \n", + "
405 | \n", + "K K Hotel George | \n", + "1831 | \n", + "566 | \n", + "
971 | \n", + "Apex Temple Court Hotel | \n", + "2619 | \n", + "1037 | \n", + "
2008 | \n", + "The Park Grand London Paddington | \n", + "4380 | \n", + "1770 | \n", + "
3778 | \n", + "Monhotel Lounge SPA | \n", + "171 | \n", + "35 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "
511962 | \n", + "Suite Hotel 900 m zur Oper | \n", + "3461 | \n", + "439 | \n", + "
512401 | \n", + "Hotel Amadeus | \n", + "717 | \n", + "144 | \n", + "
512545 | \n", + "The Berkeley | \n", + "232 | \n", + "100 | \n", + "
512645 | \n", + "Holiday Inn London Kensington | \n", + "5945 | \n", + "2768 | \n", + "
515413 | \n", + "Atlantis Hotel Vienna | \n", + "2823 | \n", + "325 | \n", + "
1492 rows × 3 columns
\n", + "\n", + " | Average_Score_Difference | \n", + "Average_Score | \n", + "Calc_Average_Score | \n", + "Hotel_Name | \n", + "
---|---|---|---|---|
495945 | \n", + "-0.8 | \n", + "7.7 | \n", + "8.5 | \n", + "Best Western Hotel Astoria | \n", + "
111027 | \n", + "-0.7 | \n", + "8.8 | \n", + "9.5 | \n", + "Hotel Stendhal Place Vend me Paris MGallery by... | \n", + "
43688 | \n", + "-0.7 | \n", + "7.5 | \n", + "8.2 | \n", + "Mercure Paris Porte d Orleans | \n", + "
178253 | \n", + "-0.7 | \n", + "7.9 | \n", + "8.6 | \n", + "Renaissance Paris Vendome Hotel | \n", + "
218258 | \n", + "-0.5 | \n", + "7.0 | \n", + "7.5 | \n", + "Hotel Royal Elys es | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
151416 | \n", + "0.7 | \n", + "7.8 | \n", + "7.1 | \n", + "Best Western Allegro Nation | \n", + "
22189 | \n", + "0.8 | \n", + "7.1 | \n", + "6.3 | \n", + "Holiday Inn Paris Montparnasse Pasteur | \n", + "
250308 | \n", + "0.9 | \n", + "8.6 | \n", + "7.7 | \n", + "MARQUIS Faubourg St Honor Relais Ch teaux | \n", + "
68936 | \n", + "0.9 | \n", + "6.8 | \n", + "5.9 | \n", + "Villa Eugenie | \n", + "
3813 | \n", + "1.3 | \n", + "7.2 | \n", + "5.9 | \n", + "Kube Hotel Ice Bar | \n", + "
1492 rows × 4 columns
\n", + "