change

2 months ago · 7a50781922
parent bbfa03c2d1
commit 7a50781922
1 changed files with 254 additions and 0 deletions
--- a/6-NLP/4-Hotel-Reviews-1/notebook.ipynb
+++ b/6-NLP/4-Hotel-Reviews-1/notebook.ipynb
@ -0,0 +1,254 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "3b3e3368",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading the data file now, could take a while...\n",
+      "Loading took5.67 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import time\n",
+    "print(\"Loading the data file now, could take a while...\")\n",
+    "start_time = time.time()\n",
+    "df = pd.read_csv('../data/Hotel_Reviews.csv')\n",
+    "end = time.time()\n",
+    "print(\"Loading took\" + str(round(end - start_time, 2)) + \" seconds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f78a15e0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The shap of the data (rows, cols) is (515738, 17)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print (\"The shap of the data (rows, cols) is \" + str(df.shape))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e244a843",
+   "metadata": {},
+   "source": [
+    "### Calculate the frequency count for reviewer nationalities:\n",
+    "\n",
+    "- How many distinct values are there for the column `Reviewer_Nationality`, and what are they?\n",
+    "- Which reviewer nationality is the most common in the dataset (print the country and its number of reviews)?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a9d66aa9",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There are 227 different nationalities in the dataset\n",
+      "Reviewer_Nationality\n",
+      "United Kingdom               245246\n",
+      "United States of America      35437\n",
+      "Australia                     21686\n",
+      "Ireland                       14827\n",
+      "United Arab Emirates          10235\n",
+      "                              ...  \n",
+      "Cape Verde                        1\n",
+      "Northern Mariana Islands          1\n",
+      "Tuvalu                            1\n",
+      "Guinea                            1\n",
+      "Palau                             1\n",
+      "Name: count, Length: 227, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Count reviews per nationality; the resulting Series is indexed by nationality name.\n",
+    "nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n",
+    "print(f\"There are {nationality_freq.size} different nationalities in the dataset\")\n",
+    "print(nationality_freq)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "770fa9b5",
+   "metadata": {},
+   "source": [
+    "### What are the next 10 most frequent nationalities, and what are their frequency counts?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "908dc6d0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The next 10 highest frequency reviewer nationalities are:\n",
+      "Reviewer_Nationality\n",
+      "United States of America     35437\n",
+      "Australia                    21686\n",
+      "Ireland                      14827\n",
+      "United Arab Emirates         10235\n",
+      "Saudi Arabia                  8951\n",
+      "Netherlands                   8772\n",
+      "Switzerland                   8678\n",
+      "Germany                       7941\n",
+      "Canada                        7894\n",
+      "France                        7296\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Positions 1-10 of the counts, i.e. skipping the first (most common) entry.\n",
+    "print(\n",
+    "    \"The next 10 highest frequency reviewer nationalities are:\",\n",
+    "    nationality_freq.iloc[1:11].to_string(),\n",
+    "    sep=\"\\n\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cbde61cf",
+   "metadata": {},
+   "source": [
+    "### What was the most frequently reviewed hotel for each of the top 10 reviewer nationalities?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f8c4d995",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.\n",
+      "The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.\n",
+      "The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.\n",
+      "The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.\n",
+      "The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.\n",
+      "The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.\n",
+      "The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.\n",
+      "The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\user\\AppData\\Local\\Temp\\ipykernel_23868\\2970136166.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "for nat in nationality_freq[:10].index:\n",
+    "   # First, extract all the rows that match the criteria into a new dataframe\n",
+    "   nat_df = df[df[\"Reviewer_Nationality\"] == nat]   \n",
+    "   # Now get the hotel freq\n",
+    "   freq = nat_df[\"Hotel_Name\"].value_counts()\n",
+    "   print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\") "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9dbc5464",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}