|
|
@ -0,0 +1,586 @@
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cells": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 25,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
|
|
|
"import time"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 26,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"Loading data file now, this could take a while depending on file size\n",
|
|
|
|
|
|
|
|
"Loading took 1.97 seconds\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"print(\"Loading data file now, this could take a while depending on file size\")\n",
|
|
|
|
|
|
|
|
"start = time.time()\n",
|
|
|
|
|
|
|
|
"# df is a pandas 'DataFrame' - the path below is absolute and machine-specific, so point it at your local copy of Hotel_Reviews.csv\n",
|
|
|
|
|
|
|
|
"df = pd.read_csv('/Users/ray/Desktop/ML-For-Beginners/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews.csv')\n",
|
|
|
|
|
|
|
|
"end = time.time()\n",
|
|
|
|
|
|
|
|
"print(\"Loading took \" + str(round(end - start, 2)) + \" seconds\")"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 27,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"The shape of the data (rows, cols) is (515738, 17)\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"print(\"The shape of the data (rows, cols) is \" + str(df.shape))"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 28,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"There are 227 different nationalities\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n",
|
|
|
|
|
|
|
|
"print(\"There are \" + str(nationality_freq.size) + \" different nationalities\")"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 29,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"Reviewer_Nationality\n",
|
|
|
|
|
|
|
|
" United Kingdom 245246\n",
|
|
|
|
|
|
|
|
" United States of America 35437\n",
|
|
|
|
|
|
|
|
" Australia 21686\n",
|
|
|
|
|
|
|
|
" Ireland 14827\n",
|
|
|
|
|
|
|
|
" United Arab Emirates 10235\n",
|
|
|
|
|
|
|
|
" ... \n",
|
|
|
|
|
|
|
|
" Cape Verde 1\n",
|
|
|
|
|
|
|
|
" Northern Mariana Islands 1\n",
|
|
|
|
|
|
|
|
" Tuvalu 1\n",
|
|
|
|
|
|
|
|
" Guinea 1\n",
|
|
|
|
|
|
|
|
" Palau 1\n",
|
|
|
|
|
|
|
|
"Name: count, Length: 227, dtype: int64\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"print(nationality_freq) "
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 30,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"The highest frequency reviewer nationality is United Kingdom with 245246 reviews.\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/1920668807.py:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq.iloc[0]) + \" reviews.\")"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 31,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"The next 10 highest frequency reviewer nationalities are:\n",
|
|
|
|
|
|
|
|
"Reviewer_Nationality\n",
|
|
|
|
|
|
|
|
" United States of America 35437\n",
|
|
|
|
|
|
|
|
" Australia 21686\n",
|
|
|
|
|
|
|
|
" Ireland 14827\n",
|
|
|
|
|
|
|
|
" United Arab Emirates 10235\n",
|
|
|
|
|
|
|
|
" Saudi Arabia 8951\n",
|
|
|
|
|
|
|
|
" Netherlands 8772\n",
|
|
|
|
|
|
|
|
" Switzerland 8678\n",
|
|
|
|
|
|
|
|
" Germany 7941\n",
|
|
|
|
|
|
|
|
" Canada 7894\n",
|
|
|
|
|
|
|
|
" France 7296\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"print(\"The next 10 highest frequency reviewer nationalities are:\")\n",
|
|
|
|
|
|
|
|
"print(nationality_freq[1:11].to_string())"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 32,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.\n",
|
|
|
|
|
|
|
|
"The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.\n",
|
|
|
|
|
|
|
|
"The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.\n",
|
|
|
|
|
|
|
|
"The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.\n",
|
|
|
|
|
|
|
|
"The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.\n",
|
|
|
|
|
|
|
|
"The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.\n",
|
|
|
|
|
|
|
|
"The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.\n",
|
|
|
|
|
|
|
|
"The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.\n",
|
|
|
|
|
|
|
|
"The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.\n",
|
|
|
|
|
|
|
|
"The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
|
|
|
|
|
|
|
"/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"for nat in nationality_freq[:10].index:\n",
|
|
|
|
|
|
|
|
" # First, extract all the rows that match the criteria into a new dataframe\n",
|
|
|
|
|
|
|
|
" nat_df = df[df[\"Reviewer_Nationality\"] == nat] \n",
|
|
|
|
|
|
|
|
" # Now get the hotel freq\n",
|
|
|
|
|
|
|
|
" freq = nat_df[\"Hotel_Name\"].value_counts()\n",
|
|
|
|
|
|
|
|
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq.iloc[0]) + \" reviews.\") \n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 33,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"hotel_freq_df = df.drop([\"Hotel_Address\", \"Additional_Number_of_Scoring\", \"Review_Date\", \"Average_Score\", \"Reviewer_Nationality\", \"Negative_Review\", \"Review_Total_Negative_Word_Counts\", \"Positive_Review\", \"Review_Total_Positive_Word_Counts\", \"Total_Number_of_Reviews_Reviewer_Has_Given\", \"Reviewer_Score\", \"Tags\", \"days_since_review\", \"lat\", \"lng\"], axis = 1)"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 34,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
|
|
"text/html": [
|
|
|
|
|
|
|
|
"<div>\n",
|
|
|
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"</style>\n",
|
|
|
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
|
|
|
" <th>Hotel_Name</th>\n",
|
|
|
|
|
|
|
|
" <th>Total_Number_of_Reviews</th>\n",
|
|
|
|
|
|
|
|
" <th>Total_Reviews_Found</th>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
|
|
|
" <td>Hotel Arena</td>\n",
|
|
|
|
|
|
|
|
" <td>1403</td>\n",
|
|
|
|
|
|
|
|
" <td>405</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>405</th>\n",
|
|
|
|
|
|
|
|
" <td>K K Hotel George</td>\n",
|
|
|
|
|
|
|
|
" <td>1831</td>\n",
|
|
|
|
|
|
|
|
" <td>566</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>971</th>\n",
|
|
|
|
|
|
|
|
" <td>Apex Temple Court Hotel</td>\n",
|
|
|
|
|
|
|
|
" <td>2619</td>\n",
|
|
|
|
|
|
|
|
" <td>1037</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>2008</th>\n",
|
|
|
|
|
|
|
|
" <td>The Park Grand London Paddington</td>\n",
|
|
|
|
|
|
|
|
" <td>4380</td>\n",
|
|
|
|
|
|
|
|
" <td>1770</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>3778</th>\n",
|
|
|
|
|
|
|
|
" <td>Monhotel Lounge SPA</td>\n",
|
|
|
|
|
|
|
|
" <td>171</td>\n",
|
|
|
|
|
|
|
|
" <td>35</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>511962</th>\n",
|
|
|
|
|
|
|
|
" <td>Suite Hotel 900 m zur Oper</td>\n",
|
|
|
|
|
|
|
|
" <td>3461</td>\n",
|
|
|
|
|
|
|
|
" <td>439</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>512401</th>\n",
|
|
|
|
|
|
|
|
" <td>Hotel Amadeus</td>\n",
|
|
|
|
|
|
|
|
" <td>717</td>\n",
|
|
|
|
|
|
|
|
" <td>144</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>512545</th>\n",
|
|
|
|
|
|
|
|
" <td>The Berkeley</td>\n",
|
|
|
|
|
|
|
|
" <td>232</td>\n",
|
|
|
|
|
|
|
|
" <td>100</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>512645</th>\n",
|
|
|
|
|
|
|
|
" <td>Holiday Inn London Kensington</td>\n",
|
|
|
|
|
|
|
|
" <td>5945</td>\n",
|
|
|
|
|
|
|
|
" <td>2768</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>515413</th>\n",
|
|
|
|
|
|
|
|
" <td>Atlantis Hotel Vienna</td>\n",
|
|
|
|
|
|
|
|
" <td>2823</td>\n",
|
|
|
|
|
|
|
|
" <td>325</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
|
|
|
"</table>\n",
|
|
|
|
|
|
|
|
"<p>1492 rows × 3 columns</p>\n",
|
|
|
|
|
|
|
|
"</div>"
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"text/plain": [
|
|
|
|
|
|
|
|
" Hotel_Name Total_Number_of_Reviews \\\n",
|
|
|
|
|
|
|
|
"0 Hotel Arena 1403 \n",
|
|
|
|
|
|
|
|
"405 K K Hotel George 1831 \n",
|
|
|
|
|
|
|
|
"971 Apex Temple Court Hotel 2619 \n",
|
|
|
|
|
|
|
|
"2008 The Park Grand London Paddington 4380 \n",
|
|
|
|
|
|
|
|
"3778 Monhotel Lounge SPA 171 \n",
|
|
|
|
|
|
|
|
"... ... ... \n",
|
|
|
|
|
|
|
|
"511962 Suite Hotel 900 m zur Oper 3461 \n",
|
|
|
|
|
|
|
|
"512401 Hotel Amadeus 717 \n",
|
|
|
|
|
|
|
|
"512545 The Berkeley 232 \n",
|
|
|
|
|
|
|
|
"512645 Holiday Inn London Kensington 5945 \n",
|
|
|
|
|
|
|
|
"515413 Atlantis Hotel Vienna 2823 \n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" Total_Reviews_Found \n",
|
|
|
|
|
|
|
|
"0 405 \n",
|
|
|
|
|
|
|
|
"405 566 \n",
|
|
|
|
|
|
|
|
"971 1037 \n",
|
|
|
|
|
|
|
|
"2008 1770 \n",
|
|
|
|
|
|
|
|
"3778 35 \n",
|
|
|
|
|
|
|
|
"... ... \n",
|
|
|
|
|
|
|
|
"511962 439 \n",
|
|
|
|
|
|
|
|
"512401 144 \n",
|
|
|
|
|
|
|
|
"512545 100 \n",
|
|
|
|
|
|
|
|
"512645 2768 \n",
|
|
|
|
|
|
|
|
"515413 325 \n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"[1492 rows x 3 columns]"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"output_type": "display_data"
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
"# Keep a single row per hotel by dropping the rows that repeat a Hotel_Name\n",
|
|
|
|
|
|
|
|
"hotel_freq_df = hotel_freq_df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
|
|
|
|
|
|
|
|
"display(hotel_freq_df)"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 37,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"data": {
|
|
|
|
|
|
|
|
"text/html": [
|
|
|
|
|
|
|
|
"<div>\n",
|
|
|
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
|
|
|
" }\n",
|
|
|
|
|
|
|
|
"</style>\n",
|
|
|
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
|
|
|
" <th>Average_Score_Difference</th>\n",
|
|
|
|
|
|
|
|
" <th>Average_Score</th>\n",
|
|
|
|
|
|
|
|
" <th>Calc_Average_Score</th>\n",
|
|
|
|
|
|
|
|
" <th>Hotel_Name</th>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>495945</th>\n",
|
|
|
|
|
|
|
|
" <td>-0.8</td>\n",
|
|
|
|
|
|
|
|
" <td>7.7</td>\n",
|
|
|
|
|
|
|
|
" <td>8.5</td>\n",
|
|
|
|
|
|
|
|
" <td>Best Western Hotel Astoria</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>111027</th>\n",
|
|
|
|
|
|
|
|
" <td>-0.7</td>\n",
|
|
|
|
|
|
|
|
" <td>8.8</td>\n",
|
|
|
|
|
|
|
|
" <td>9.5</td>\n",
|
|
|
|
|
|
|
|
" <td>Hotel Stendhal Place Vend me Paris MGallery by...</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>43688</th>\n",
|
|
|
|
|
|
|
|
" <td>-0.7</td>\n",
|
|
|
|
|
|
|
|
" <td>7.5</td>\n",
|
|
|
|
|
|
|
|
" <td>8.2</td>\n",
|
|
|
|
|
|
|
|
" <td>Mercure Paris Porte d Orleans</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>178253</th>\n",
|
|
|
|
|
|
|
|
" <td>-0.7</td>\n",
|
|
|
|
|
|
|
|
" <td>7.9</td>\n",
|
|
|
|
|
|
|
|
" <td>8.6</td>\n",
|
|
|
|
|
|
|
|
" <td>Renaissance Paris Vendome Hotel</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>218258</th>\n",
|
|
|
|
|
|
|
|
" <td>-0.5</td>\n",
|
|
|
|
|
|
|
|
" <td>7.0</td>\n",
|
|
|
|
|
|
|
|
" <td>7.5</td>\n",
|
|
|
|
|
|
|
|
" <td>Hotel Royal Elys es</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>151416</th>\n",
|
|
|
|
|
|
|
|
" <td>0.7</td>\n",
|
|
|
|
|
|
|
|
" <td>7.8</td>\n",
|
|
|
|
|
|
|
|
" <td>7.1</td>\n",
|
|
|
|
|
|
|
|
" <td>Best Western Allegro Nation</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>22189</th>\n",
|
|
|
|
|
|
|
|
" <td>0.8</td>\n",
|
|
|
|
|
|
|
|
" <td>7.1</td>\n",
|
|
|
|
|
|
|
|
" <td>6.3</td>\n",
|
|
|
|
|
|
|
|
" <td>Holiday Inn Paris Montparnasse Pasteur</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>250308</th>\n",
|
|
|
|
|
|
|
|
" <td>0.9</td>\n",
|
|
|
|
|
|
|
|
" <td>8.6</td>\n",
|
|
|
|
|
|
|
|
" <td>7.7</td>\n",
|
|
|
|
|
|
|
|
" <td>MARQUIS Faubourg St Honor Relais Ch teaux</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>68936</th>\n",
|
|
|
|
|
|
|
|
" <td>0.9</td>\n",
|
|
|
|
|
|
|
|
" <td>6.8</td>\n",
|
|
|
|
|
|
|
|
" <td>5.9</td>\n",
|
|
|
|
|
|
|
|
" <td>Villa Eugenie</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
|
|
|
" <th>3813</th>\n",
|
|
|
|
|
|
|
|
" <td>1.3</td>\n",
|
|
|
|
|
|
|
|
" <td>7.2</td>\n",
|
|
|
|
|
|
|
|
" <td>5.9</td>\n",
|
|
|
|
|
|
|
|
" <td>Kube Hotel Ice Bar</td>\n",
|
|
|
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
|
|
|
"</table>\n",
|
|
|
|
|
|
|
|
"<p>1492 rows × 4 columns</p>\n",
|
|
|
|
|
|
|
|
"</div>"
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"text/plain": [
|
|
|
|
|
|
|
|
" Average_Score_Difference Average_Score Calc_Average_Score \\\n",
|
|
|
|
|
|
|
|
"495945 -0.8 7.7 8.5 \n",
|
|
|
|
|
|
|
|
"111027 -0.7 8.8 9.5 \n",
|
|
|
|
|
|
|
|
"43688 -0.7 7.5 8.2 \n",
|
|
|
|
|
|
|
|
"178253 -0.7 7.9 8.6 \n",
|
|
|
|
|
|
|
|
"218258 -0.5 7.0 7.5 \n",
|
|
|
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
|
|
|
"151416 0.7 7.8 7.1 \n",
|
|
|
|
|
|
|
|
"22189 0.8 7.1 6.3 \n",
|
|
|
|
|
|
|
|
"250308 0.9 8.6 7.7 \n",
|
|
|
|
|
|
|
|
"68936 0.9 6.8 5.9 \n",
|
|
|
|
|
|
|
|
"3813 1.3 7.2 5.9 \n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
" Hotel_Name \n",
|
|
|
|
|
|
|
|
"495945 Best Western Hotel Astoria \n",
|
|
|
|
|
|
|
|
"111027 Hotel Stendhal Place Vend me Paris MGallery by... \n",
|
|
|
|
|
|
|
|
"43688 Mercure Paris Porte d Orleans \n",
|
|
|
|
|
|
|
|
"178253 Renaissance Paris Vendome Hotel \n",
|
|
|
|
|
|
|
|
"218258 Hotel Royal Elys es \n",
|
|
|
|
|
|
|
|
"... ... \n",
|
|
|
|
|
|
|
|
"151416 Best Western Allegro Nation \n",
|
|
|
|
|
|
|
|
"22189 Holiday Inn Paris Montparnasse Pasteur \n",
|
|
|
|
|
|
|
|
"250308 MARQUIS Faubourg St Honor Relais Ch teaux \n",
|
|
|
|
|
|
|
|
"68936 Villa Eugenie \n",
|
|
|
|
|
|
|
|
"3813 Kube Hotel Ice Bar \n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"[1492 rows x 4 columns]"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"output_type": "display_data"
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"def get_difference_review_avg(row):\n",
|
|
|
|
|
|
|
|
" return row[\"Average_Score\"] - row[\"Calc_Average_Score\"]\n",
|
|
|
|
|
|
|
|
" \n",
|
|
|
|
|
|
|
|
"# 'mean' is the mathematical word for 'average'\n",
|
|
|
|
|
|
|
|
"df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# Add a new column with the difference between the two average scores\n",
|
|
|
|
|
|
|
|
"df[\"Average_Score_Difference\"] = df.apply(get_difference_review_avg, axis = 1)\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n",
|
|
|
|
|
|
|
|
"review_scores_df = df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"# Sort the dataframe to find the lowest and highest average score difference\n",
|
|
|
|
|
|
|
|
"review_scores_df = review_scores_df.sort_values(by=[\"Average_Score_Difference\"])\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"display(review_scores_df[[\"Average_Score_Difference\", \"Average_Score\", \"Calc_Average_Score\", \"Hotel_Name\"]])"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
|
|
|
"execution_count": 38,
|
|
|
|
|
|
|
|
"metadata": {},
|
|
|
|
|
|
|
|
"outputs": [
|
|
|
|
|
|
|
|
{
|
|
|
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
|
|
|
"text": [
|
|
|
|
|
|
|
|
"Number of No Negative reviews: 127890\n",
|
|
|
|
|
|
|
|
"Number of No Positive reviews: 35946\n",
|
|
|
|
|
|
|
|
"Number of both No Negative and No Positive reviews: 127\n",
|
|
|
|
|
|
|
|
"Sum took 0.17 seconds\n"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"source": [
|
|
|
|
|
|
|
|
"start = time.time()\n",
|
|
|
|
|
|
|
|
"no_negative_reviews = sum(df.Negative_Review == \"No Negative\")\n",
|
|
|
|
|
|
|
|
"print(\"Number of No Negative reviews: \" + str(no_negative_reviews))\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"no_positive_reviews = sum(df[\"Positive_Review\"] == \"No Positive\")\n",
|
|
|
|
|
|
|
|
"print(\"Number of No Positive reviews: \" + str(no_positive_reviews))\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"both_no_reviews = sum((df.Negative_Review == \"No Negative\") & (df.Positive_Review == \"No Positive\"))\n",
|
|
|
|
|
|
|
|
"print(\"Number of both No Negative and No Positive reviews: \" + str(both_no_reviews))\n",
|
|
|
|
|
|
|
|
"\n",
|
|
|
|
|
|
|
|
"end = time.time()\n",
|
|
|
|
|
|
|
|
"print(\"Sum took \" + str(round(end - start, 2)) + \" seconds\")"
|
|
|
|
|
|
|
|
]
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
],
|
|
|
|
|
|
|
|
"metadata": {
|
|
|
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
|
|
|
"display_name": ".venv",
|
|
|
|
|
|
|
|
"language": "python",
|
|
|
|
|
|
|
|
"name": "python3"
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"language_info": {
|
|
|
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
|
|
|
"version": 3
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
|
|
|
"name": "python",
|
|
|
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
|
|
|
"version": "3.12.0"
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
},
|
|
|
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
|
|
|
"nbformat_minor": 2
|
|
|
|
|
|
|
|
}
|