NLP Hotel Reviews Data Analysis PD

pull/719/head
raygaeta 2 years ago
parent 8979d09fef
commit 74ab6ac0d7

@ -0,0 +1,586 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading data file now, this could take a while depending on file size\n",
"Loading took 1.97 seconds\n"
]
}
],
"source": [
"# Location of the Hotel Reviews CSV. Set the HOTEL_REVIEWS_CSV environment variable to point\n",
"# at your own copy; the fallback is the path this notebook was originally executed with.\n",
"# TODO(review): replace the fallback with a repository-relative path before merging.\n",
"DATA_PATH = os.environ.get(\n",
"    \"HOTEL_REVIEWS_CSV\",\n",
"    \"/Users/ray/Desktop/ML-For-Beginners/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews.csv\",\n",
")\n",
"\n",
"print(\"Loading data file now, this could take a while depending on file size\")\n",
"# perf_counter() is a monotonic clock, so the elapsed time is unaffected by wall-clock adjustments\n",
"start = time.perf_counter()\n",
"# df is 'DataFrame' - make sure you downloaded the file to the data folder\n",
"df = pd.read_csv(DATA_PATH)\n",
"end = time.perf_counter()\n",
"print(f\"Loading took {round(end - start, 2)} seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The shape of the data (rows, cols) is (515738, 17)\n"
]
}
],
"source": [
"# df.shape is a (rows, cols) tuple; the f-string renders it exactly as str() would\n",
"print(f\"The shape of the data (rows, cols) is {df.shape}\")"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are 227 different nationalities\n"
]
}
],
"source": [
"# value_counts() returns one entry per distinct nationality together with its number of rows\n",
"nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n",
"print(f\"There are {nationality_freq.size} different nationalities\")"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reviewer_Nationality\n",
" United Kingdom 245246\n",
" United States of America 35437\n",
" Australia 21686\n",
" Ireland 14827\n",
" United Arab Emirates 10235\n",
" ... \n",
" Cape Verde 1\n",
" Northern Mariana Islands 1\n",
" Tuvalu 1\n",
" Guinea 1\n",
" Palau 1\n",
"Name: count, Length: 227, dtype: int64\n"
]
}
],
"source": [
"# Print the nationality frequency table (pandas abbreviates the middle rows of a long Series)\n",
"print(nationality_freq)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The highest frequency reviewer nationality is United Kingdom with 245246 reviews.\n"
]
}
],
"source": [
"# value_counts() sorts by descending count, so position 0 holds the most frequent nationality.\n",
"# Use .iloc for positional access: nationality_freq[0] on this label-indexed Series is\n",
"# deprecated (pandas emits a FutureWarning) and will become a label lookup in a future version.\n",
"top_nationality = str(nationality_freq.index[0]).strip()\n",
"top_review_count = nationality_freq.iloc[0]\n",
"print(f\"The highest frequency reviewer nationality is {top_nationality} with {top_review_count} reviews.\")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The next 10 highest frequency reviewer nationalities are:\n",
"Reviewer_Nationality\n",
" United States of America 35437\n",
" Australia 21686\n",
" Ireland 14827\n",
" United Arab Emirates 10235\n",
" Saudi Arabia 8951\n",
" Netherlands 8772\n",
" Switzerland 8678\n",
" Germany 7941\n",
" Canada 7894\n",
" France 7296\n"
]
}
],
"source": [
"print(\"The next 10 highest frequency reviewer nationalities are:\")\n",
"# Positions 1 through 10, i.e. the ten entries that follow the top one\n",
"next_ten = nationality_freq.iloc[1:11]\n",
"print(next_ten.to_string())"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.\n",
"The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.\n",
"The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.\n",
"The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.\n",
"The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.\n",
"The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.\n",
"The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.\n",
"The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.\n",
"The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.\n",
"The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.\n"
]
}
],
"source": [
"# For each of the 10 most frequent reviewer nationalities, report the hotel they reviewed most\n",
"for nat in nationality_freq.iloc[:10].index:\n",
"    # First, extract all the rows that match the criteria into a new dataframe\n",
"    nat_df = df[df[\"Reviewer_Nationality\"] == nat]\n",
"    # Now get the hotel freq (sorted by descending count, so position 0 is the most reviewed)\n",
"    freq = nat_df[\"Hotel_Name\"].value_counts()\n",
"    # .iloc[0] is explicit positional access; freq[0] is deprecated on a label-indexed Series\n",
"    print(f\"The most reviewed hotel for {str(nat).strip()} was {freq.index[0]} with {freq.iloc[0]} reviews.\")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"# Build hotel_freq_df from df by removing the columns listed below\n",
"unused_columns = [\n",
"    \"Hotel_Address\",\n",
"    \"Additional_Number_of_Scoring\",\n",
"    \"Review_Date\",\n",
"    \"Average_Score\",\n",
"    \"Reviewer_Nationality\",\n",
"    \"Negative_Review\",\n",
"    \"Review_Total_Negative_Word_Counts\",\n",
"    \"Positive_Review\",\n",
"    \"Review_Total_Positive_Word_Counts\",\n",
"    \"Total_Number_of_Reviews_Reviewer_Has_Given\",\n",
"    \"Reviewer_Score\",\n",
"    \"Tags\",\n",
"    \"days_since_review\",\n",
"    \"lat\",\n",
"    \"lng\",\n",
"]\n",
"hotel_freq_df = df.drop(columns=unused_columns)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Hotel_Name</th>\n",
" <th>Total_Number_of_Reviews</th>\n",
" <th>Total_Reviews_Found</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Hotel Arena</td>\n",
" <td>1403</td>\n",
" <td>405</td>\n",
" </tr>\n",
" <tr>\n",
" <th>405</th>\n",
" <td>K K Hotel George</td>\n",
" <td>1831</td>\n",
" <td>566</td>\n",
" </tr>\n",
" <tr>\n",
" <th>971</th>\n",
" <td>Apex Temple Court Hotel</td>\n",
" <td>2619</td>\n",
" <td>1037</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2008</th>\n",
" <td>The Park Grand London Paddington</td>\n",
" <td>4380</td>\n",
" <td>1770</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3778</th>\n",
" <td>Monhotel Lounge SPA</td>\n",
" <td>171</td>\n",
" <td>35</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>511962</th>\n",
" <td>Suite Hotel 900 m zur Oper</td>\n",
" <td>3461</td>\n",
" <td>439</td>\n",
" </tr>\n",
" <tr>\n",
" <th>512401</th>\n",
" <td>Hotel Amadeus</td>\n",
" <td>717</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>512545</th>\n",
" <td>The Berkeley</td>\n",
" <td>232</td>\n",
" <td>100</td>\n",
" </tr>\n",
" <tr>\n",
" <th>512645</th>\n",
" <td>Holiday Inn London Kensington</td>\n",
" <td>5945</td>\n",
" <td>2768</td>\n",
" </tr>\n",
" <tr>\n",
" <th>515413</th>\n",
" <td>Atlantis Hotel Vienna</td>\n",
" <td>2823</td>\n",
" <td>325</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1492 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" Hotel_Name Total_Number_of_Reviews \\\n",
"0 Hotel Arena 1403 \n",
"405 K K Hotel George 1831 \n",
"971 Apex Temple Court Hotel 2619 \n",
"2008 The Park Grand London Paddington 4380 \n",
"3778 Monhotel Lounge SPA 171 \n",
"... ... ... \n",
"511962 Suite Hotel 900 m zur Oper 3461 \n",
"512401 Hotel Amadeus 717 \n",
"512545 The Berkeley 232 \n",
"512645 Holiday Inn London Kensington 5945 \n",
"515413 Atlantis Hotel Vienna 2823 \n",
"\n",
" Total_Reviews_Found \n",
"0 405 \n",
"405 566 \n",
"971 1037 \n",
"2008 1770 \n",
"3778 35 \n",
"... ... \n",
"511962 439 \n",
"512401 144 \n",
"512545 100 \n",
"512645 2768 \n",
"515413 325 \n",
"\n",
"[1492 rows x 3 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Count the rows present in the dataset for each hotel. Selecting a single column before\n",
"# transform() yields a Series; assigning the DataFrame returned by an unselected groupby to\n",
"# one column only works while exactly one non-key column exists in the frame.\n",
"hotel_freq_df[\"Total_Reviews_Found\"] = (\n",
"    hotel_freq_df.groupby(\"Hotel_Name\")[\"Total_Number_of_Reviews\"].transform(\"count\")\n",
")\n",
"\n",
"# Get rid of all the duplicated rows (keeps the first row seen for each hotel)\n",
"hotel_freq_df = hotel_freq_df.drop_duplicates(subset=[\"Hotel_Name\"])\n",
"display(hotel_freq_df)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Average_Score_Difference</th>\n",
" <th>Average_Score</th>\n",
" <th>Calc_Average_Score</th>\n",
" <th>Hotel_Name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>495945</th>\n",
" <td>-0.8</td>\n",
" <td>7.7</td>\n",
" <td>8.5</td>\n",
" <td>Best Western Hotel Astoria</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111027</th>\n",
" <td>-0.7</td>\n",
" <td>8.8</td>\n",
" <td>9.5</td>\n",
" <td>Hotel Stendhal Place Vend me Paris MGallery by...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43688</th>\n",
" <td>-0.7</td>\n",
" <td>7.5</td>\n",
" <td>8.2</td>\n",
" <td>Mercure Paris Porte d Orleans</td>\n",
" </tr>\n",
" <tr>\n",
" <th>178253</th>\n",
" <td>-0.7</td>\n",
" <td>7.9</td>\n",
" <td>8.6</td>\n",
" <td>Renaissance Paris Vendome Hotel</td>\n",
" </tr>\n",
" <tr>\n",
" <th>218258</th>\n",
" <td>-0.5</td>\n",
" <td>7.0</td>\n",
" <td>7.5</td>\n",
" <td>Hotel Royal Elys es</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>151416</th>\n",
" <td>0.7</td>\n",
" <td>7.8</td>\n",
" <td>7.1</td>\n",
" <td>Best Western Allegro Nation</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22189</th>\n",
" <td>0.8</td>\n",
" <td>7.1</td>\n",
" <td>6.3</td>\n",
" <td>Holiday Inn Paris Montparnasse Pasteur</td>\n",
" </tr>\n",
" <tr>\n",
" <th>250308</th>\n",
" <td>0.9</td>\n",
" <td>8.6</td>\n",
" <td>7.7</td>\n",
" <td>MARQUIS Faubourg St Honor Relais Ch teaux</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68936</th>\n",
" <td>0.9</td>\n",
" <td>6.8</td>\n",
" <td>5.9</td>\n",
" <td>Villa Eugenie</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3813</th>\n",
" <td>1.3</td>\n",
" <td>7.2</td>\n",
" <td>5.9</td>\n",
" <td>Kube Hotel Ice Bar</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1492 rows × 4 columns</p>\n",
"</div>"
],
"text/plain": [
" Average_Score_Difference Average_Score Calc_Average_Score \\\n",
"495945 -0.8 7.7 8.5 \n",
"111027 -0.7 8.8 9.5 \n",
"43688 -0.7 7.5 8.2 \n",
"178253 -0.7 7.9 8.6 \n",
"218258 -0.5 7.0 7.5 \n",
"... ... ... ... \n",
"151416 0.7 7.8 7.1 \n",
"22189 0.8 7.1 6.3 \n",
"250308 0.9 8.6 7.7 \n",
"68936 0.9 6.8 5.9 \n",
"3813 1.3 7.2 5.9 \n",
"\n",
" Hotel_Name \n",
"495945 Best Western Hotel Astoria \n",
"111027 Hotel Stendhal Place Vend me Paris MGallery by... \n",
"43688 Mercure Paris Porte d Orleans \n",
"178253 Renaissance Paris Vendome Hotel \n",
"218258 Hotel Royal Elys es \n",
"... ... \n",
"151416 Best Western Allegro Nation \n",
"22189 Holiday Inn Paris Montparnasse Pasteur \n",
"250308 MARQUIS Faubourg St Honor Relais Ch teaux \n",
"68936 Villa Eugenie \n",
"3813 Kube Hotel Ice Bar \n",
"\n",
"[1492 rows x 4 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def get_difference_review_avg(row):\n",
"    \"\"\"Return Average_Score minus Calc_Average_Score.\n",
"\n",
"    Accepts a single row or, because the subtraction is element-wise, a whole DataFrame.\n",
"    \"\"\"\n",
"    return row[\"Average_Score\"] - row[\"Calc_Average_Score\"]\n",
"\n",
"\n",
"# 'mean' is mathematical word for 'average'\n",
"df[\"Calc_Average_Score\"] = df.groupby(\"Hotel_Name\")[\"Reviewer_Score\"].transform(\"mean\").round(1)\n",
"\n",
"# Add a new column with the difference between the two average scores.\n",
"# Passing the whole frame subtracts the two columns in one vectorized step instead of calling\n",
"# the function once per row through df.apply(..., axis=1); the resulting values are the same.\n",
"df[\"Average_Score_Difference\"] = get_difference_review_avg(df)\n",
"\n",
"# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n",
"review_scores_df = df.drop_duplicates(subset=[\"Hotel_Name\"])\n",
"\n",
"# Sort the dataframe to find the lowest and highest average score difference\n",
"review_scores_df = review_scores_df.sort_values(by=[\"Average_Score_Difference\"])\n",
"\n",
"display(review_scores_df[[\"Average_Score_Difference\", \"Average_Score\", \"Calc_Average_Score\", \"Hotel_Name\"]])"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of No Negative reviews: 127890\n",
"Number of No Positive reviews: 35946\n",
"Number of both No Negative and No Positive reviews: 127\n",
"Sum took 0.17 seconds\n"
]
}
],
"source": [
"start = time.perf_counter()\n",
"\n",
"# Boolean masks: True where the review text equals the literal \"No Negative\" / \"No Positive\"\n",
"is_no_negative = df[\"Negative_Review\"] == \"No Negative\"\n",
"is_no_positive = df[\"Positive_Review\"] == \"No Positive\"\n",
"\n",
"# Series.sum() counts the True values in vectorized code; the built-in sum() would iterate\n",
"# over the Series element by element in Python.\n",
"no_negative_reviews = is_no_negative.sum()\n",
"print(f\"Number of No Negative reviews: {no_negative_reviews}\")\n",
"\n",
"no_positive_reviews = is_no_positive.sum()\n",
"print(f\"Number of No Positive reviews: {no_positive_reviews}\")\n",
"\n",
"# Rows where both masks are True\n",
"both_no_reviews = (is_no_negative & is_no_positive).sum()\n",
"print(f\"Number of both No Negative and No Positive reviews: {both_no_reviews}\")\n",
"\n",
"end = time.perf_counter()\n",
"print(f\"Sum took {round(end - start, 2)} seconds\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading…
Cancel
Save