diff --git a/6-NLP/4-Hotel-Reviews-1/notebook.ipynb b/6-NLP/4-Hotel-Reviews-1/notebook.ipynb index e69de29bb..e7214990b 100644 --- a/6-NLP/4-Hotel-Reviews-1/notebook.ipynb +++ b/6-NLP/4-Hotel-Reviews-1/notebook.ipynb @@ -0,0 +1,254 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "3b3e3368", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading the data file now, could take a while...\n", + "Loading took 5.67 seconds\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import time\n", + "print(\"Loading the data file now, could take a while...\")\n", + "start_time = time.time()\n", + "df = pd.read_csv('../data/Hotel_Reviews.csv')\n", + "end = time.time()\n", + "print(\"Loading took \" + str(round(end - start_time, 2)) + \" seconds\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f78a15e0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The shape of the data (rows, cols) is (515738, 17)\n" + ] + } + ], + "source": [ + "print(\"The shape of the data (rows, cols) is \" + str(df.shape))" + ] + }, + { + "cell_type": "markdown", + "id": "e244a843", + "metadata": {}, + "source": [ + "### Calculate the frequency count for reviewer nationalities:\n", + "\n", + "How many distinct values are there for the column Reviewer_Nationality and what are they?\n", + "What reviewer nationality is the most common in the dataset (print country and number of reviews)?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a9d66aa9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 227 different nationalities in the dataset\n", + "Reviewer_Nationality\n", + "United Kingdom 245246\n", + "United States of America 35437\n", + "Australia 21686\n", + "Ireland 14827\n", + "United Arab Emirates 10235\n", + " ... 
\n", + "Cape Verde 1\n", + "Northern Mariana Islands 1\n", + "Tuvalu 1\n", + "Guinea 1\n", + "Palau 1\n", + "Name: count, Length: 227, dtype: int64\n" + ] + } + ], + "source": [ + "nationality_freq = df['Reviewer_Nationality'].value_counts()\n", + "print(\"There are \" + str(nationality_freq.size) + \" different nationalities in the dataset\")\n", + "print(nationality_freq)" + ] + }, + { + "cell_type": "markdown", + "id": "770fa9b5", + "metadata": {}, + "source": [ + "### What are the next top 10 most frequently found nationalities, and their frequency count?" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "908dc6d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The next 10 highest frequency reviewer nationalities are:\n", + "Reviewer_Nationality\n", + "United States of America 35437\n", + "Australia 21686\n", + "Ireland 14827\n", + "United Arab Emirates 10235\n", + "Saudi Arabia 8951\n", + "Netherlands 8772\n", + "Switzerland 8678\n", + "Germany 7941\n", + "Canada 7894\n", + "France 7296\n" + ] + } + ], + "source": [ + "print(\"The next 10 highest frequency reviewer nationalities are:\")\n", + "print(nationality_freq.iloc[1:11].to_string())" + ] + }, + { + "cell_type": "markdown", + "id": "cbde61cf", + "metadata": {}, + "source": [ + "### What was the most frequently reviewed hotel for each of the top 10 reviewer nationalities?" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f8c4d995", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.\n", + "The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.\n", + "The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.\n", + "The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.\n", + "The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n", + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.\n", + "The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.\n", + "The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.\n", + "The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n" + ] + } + ], + "source": [ + "for nat in nationality_freq.index[:10]:\n", + "    # Keep only the reviews written by this nationality\n", + "    nat_df = df[df[\"Reviewer_Nationality\"] == nat]\n", + "    # value_counts() sorts descending, so position 0 is the most reviewed hotel\n", + "    freq = nat_df[\"Hotel_Name\"].value_counts()\n", + "    print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq.iloc[0]) + \" reviews.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9dbc5464", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}