pull/915/head
Zarathushtra 4 weeks ago
parent bbfa03c2d1
commit 7a50781922

@ -0,0 +1,254 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "3b3e3368",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading the data file now, could take a while...\n",
"Loading took5.67 seconds\n"
]
}
],
"source": [
"import pandas as pd\n",
"import time\n",
"print(\"Loading the data file now, could take a while...\")\n",
"start_time = time.time()\n",
"df = pd.read_csv('../data/Hotel_Reviews.csv')\n",
"end = time.time()\n",
"print(\"Loading took\" + str(round(end - start_time, 2)) + \" seconds\")"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f78a15e0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The shap of the data (rows, cols) is (515738, 17)\n"
]
}
],
"source": [
"print (\"The shap of the data (rows, cols) is \" + str(df.shape))"
]
},
{
"cell_type": "markdown",
"id": "e244a843",
"metadata": {},
"source": [
"### Calculate the frequency count for reviewer nationalities:\n",
"\n",
"How many distinct values are there for the column Reviewer_Nationality and what are they?\n",
"What reviewer nationality is the most common in the dataset (print country and number of reviews)?"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a9d66aa9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"There are 227different nationalities in the dataset\n",
"Reviewer_Nationality\n",
"United Kingdom 245246\n",
"United States of America 35437\n",
"Australia 21686\n",
"Ireland 14827\n",
"United Arab Emirates 10235\n",
" ... \n",
"Cape Verde 1\n",
"Northern Mariana Islands 1\n",
"Tuvalu 1\n",
"Guinea 1\n",
"Palau 1\n",
"Name: count, Length: 227, dtype: int64\n"
]
}
],
"source": [
"nationality_freq = df['Reviewer_Nationality'].value_counts()\n",
"print(\"There are \" + str(nationality_freq.size) + \" different nationalities in the dataset\")\n",
"print(nationality_freq)"
]
},
{
"cell_type": "markdown",
"id": "770fa9b5",
"metadata": {},
"source": [
"### What are the next top 10 most frequently found nationalities, and their frequency count?"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "908dc6d0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The next 10 highest frequency reviewer nationalities are:\n",
"Reviewer_Nationality\n",
"United States of America 35437\n",
"Australia 21686\n",
"Ireland 14827\n",
"United Arab Emirates 10235\n",
"Saudi Arabia 8951\n",
"Netherlands 8772\n",
"Switzerland 8678\n",
"Germany 7941\n",
"Canada 7894\n",
"France 7296\n"
]
}
],
"source": [
"print(\"The next 10 highest frequency reviewer nationalities are:\")\n",
"print(nationality_freq[1:11].to_string())"
]
},
{
"cell_type": "markdown",
"id": "cbde61cf",
"metadata": {},
"source": [
"### What was the most frequently reviewed hotel for each of the top 10 most reviewer nationalities?"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "f8c4d995",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.\n",
"The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.\n",
"The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.\n",
"The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.\n",
"The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.\n",
"The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.\n",
"The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.\n",
"The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
]
}
],
"source": [
"for nat in nationality_freq[:10].index:\n",
" # First, extract all the rows that match the criteria into a new dataframe\n",
" nat_df = df[df[\"Reviewer_Nationality\"] == nat] \n",
" # Now get the hotel freq\n",
" freq = nat_df[\"Hotel_Name\"].value_counts()\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\") "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9dbc5464",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading…
Cancel
Save