parent
bbfa03c2d1
commit
7a50781922
@ -0,0 +1,254 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "3b3e3368",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading the data file now, could take a while...\n",
|
||||
"Loading took5.67 seconds\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import time\n",
|
||||
"print(\"Loading the data file now, could take a while...\")\n",
|
||||
"start_time = time.time()\n",
|
||||
"df = pd.read_csv('../data/Hotel_Reviews.csv')\n",
|
||||
"end = time.time()\n",
|
||||
"print(\"Loading took\" + str(round(end - start_time, 2)) + \" seconds\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "f78a15e0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The shap of the data (rows, cols) is (515738, 17)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print (\"The shap of the data (rows, cols) is \" + str(df.shape))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e244a843",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Calculate the frequency count for reviewer nationalities:\n",
|
||||
"\n",
|
||||
"How many distinct values are there for the column Reviewer_Nationality and what are they?\n",
|
||||
"What reviewer nationality is the most common in the dataset (print country and number of reviews)?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a9d66aa9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"There are 227different nationalities in the dataset\n",
|
||||
"Reviewer_Nationality\n",
|
||||
"United Kingdom 245246\n",
|
||||
"United States of America 35437\n",
|
||||
"Australia 21686\n",
|
||||
"Ireland 14827\n",
|
||||
"United Arab Emirates 10235\n",
|
||||
" ... \n",
|
||||
"Cape Verde 1\n",
|
||||
"Northern Mariana Islands 1\n",
|
||||
"Tuvalu 1\n",
|
||||
"Guinea 1\n",
|
||||
"Palau 1\n",
|
||||
"Name: count, Length: 227, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"nationality_freq = df['Reviewer_Nationality'].value_counts()\n",
|
||||
"print(\"There are \" + str(nationality_freq.size) + \" different nationalities in the dataset\")\n",
|
||||
"print(nationality_freq)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "770fa9b5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### What are the next top 10 most frequently found nationalities, and their frequency count?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "908dc6d0",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The next 10 highest frequency reviewer nationalities are:\n",
|
||||
"Reviewer_Nationality\n",
|
||||
"United States of America 35437\n",
|
||||
"Australia 21686\n",
|
||||
"Ireland 14827\n",
|
||||
"United Arab Emirates 10235\n",
|
||||
"Saudi Arabia 8951\n",
|
||||
"Netherlands 8772\n",
|
||||
"Switzerland 8678\n",
|
||||
"Germany 7941\n",
|
||||
"Canada 7894\n",
|
||||
"France 7296\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(\"The next 10 highest frequency reviewer nationalities are:\")\n",
|
||||
"print(nationality_freq[1:11].to_string())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cbde61cf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### What was the most frequently reviewed hotel for each of the top 10 most reviewer nationalities?"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "f8c4d995",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.\n",
|
||||
"The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.\n",
|
||||
"The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.\n",
|
||||
"The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.\n",
|
||||
"The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.\n",
|
||||
"The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.\n",
|
||||
"The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.\n",
|
||||
"The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"for nat in nationality_freq[:10].index:\n",
|
||||
" # First, extract all the rows that match the criteria into a new dataframe\n",
|
||||
" nat_df = df[df[\"Reviewer_Nationality\"] == nat] \n",
|
||||
" # Now get the hotel freq\n",
|
||||
" freq = nat_df[\"Hotel_Name\"].value_counts()\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\") "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "9dbc5464",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": ".venv",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.13.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
Loading…
Reference in new issue