NLP Hotel Reviews Data Analysis PD

2 years ago · 74ab6ac0d7
parent 8979d09fef
commit 74ab6ac0d7
1 changed files with 586 additions and 0 deletions
--- a/6-NLP/4-Hotel-Reviews-1/notebook.ipynb
+++ b/6-NLP/4-Hotel-Reviews-1/notebook.ipynb
@ -0,0 +1,586 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loading data file now, this could take a while depending on file size\n",
+      "Loading took 1.97 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Loading data file now, this could take a while depending on file size\")\n",
+    "start = time.time()\n",
+    "# df is 'DataFrame'. The CSV is read with a path relative to the working directory\n",
+    "# (expected to sit next to this notebook) rather than an absolute, machine-specific path.\n",
+    "df = pd.read_csv('Hotel_Reviews.csv')\n",
+    "end = time.time()\n",
+    "print(\"Loading took \" + str(round(end - start, 2)) + \" seconds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The shape of the data (rows, cols) is (515738, 17)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Report the DataFrame dimensions as a (rows, cols) tuple\n",
+    "print(f\"The shape of the data (rows, cols) is {df.shape}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "There are 227 different nationalities\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tally the number of reviews per reviewer nationality\n",
+    "nationality_freq = df.Reviewer_Nationality.value_counts()\n",
+    "print(f\"There are {nationality_freq.size} different nationalities\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reviewer_Nationality\n",
+      " United Kingdom               245246\n",
+      " United States of America      35437\n",
+      " Australia                     21686\n",
+      " Ireland                       14827\n",
+      " United Arab Emirates          10235\n",
+      "                               ...  \n",
+      " Cape Verde                        1\n",
+      " Northern Mariana Islands          1\n",
+      " Tuvalu                            1\n",
+      " Guinea                            1\n",
+      " Palau                             1\n",
+      "Name: count, Length: 227, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(nationality_freq) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The highest frequency reviewer nationality is United Kingdom with 245246 reviews.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/1920668807.py:1: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "# The first entry of nationality_freq is read by position. Use .iloc[0] for the count:\n",
+    "# plain [0] on this label-indexed Series is deprecated and will be treated as a label lookup.\n",
+    "print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq.iloc[0]) + \" reviews.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The next 10 highest frequency reviewer nationalities are:\n",
+      "Reviewer_Nationality\n",
+      " United States of America     35437\n",
+      " Australia                    21686\n",
+      " Ireland                      14827\n",
+      " United Arab Emirates         10235\n",
+      " Saudi Arabia                  8951\n",
+      " Netherlands                   8772\n",
+      " Switzerland                   8678\n",
+      " Germany                       7941\n",
+      " Canada                        7894\n",
+      " France                        7296\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Positions 1 through 10: the ten entries that follow the first one\n",
+    "print(\n",
+    "    \"The next 10 highest frequency reviewer nationalities are:\",\n",
+    "    nationality_freq.iloc[1:11].to_string(),\n",
+    "    sep=\"\\n\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The most reviewed hotel for United Kingdom was Britannia International Hotel Canary Wharf with 3833 reviews.\n",
+      "The most reviewed hotel for United States of America was Hotel Esther a with 423 reviews.\n",
+      "The most reviewed hotel for Australia was Park Plaza Westminster Bridge London with 167 reviews.\n",
+      "The most reviewed hotel for Ireland was Copthorne Tara Hotel London Kensington with 239 reviews.\n",
+      "The most reviewed hotel for United Arab Emirates was Millennium Hotel London Knightsbridge with 129 reviews.\n",
+      "The most reviewed hotel for Saudi Arabia was The Cumberland A Guoman Hotel with 142 reviews.\n",
+      "The most reviewed hotel for Netherlands was Jaz Amsterdam with 97 reviews.\n",
+      "The most reviewed hotel for Switzerland was Hotel Da Vinci with 97 reviews.\n",
+      "The most reviewed hotel for Germany was Hotel Da Vinci with 86 reviews.\n",
+      "The most reviewed hotel for Canada was St James Court A Taj Hotel London with 61 reviews.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n",
+      "/var/folders/3m/kq3blxtj7gj8n_v2dv5lns700000gp/T/ipykernel_23341/3364414134.py:6: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+      "  print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "# For each of the first 10 nationalities in nationality_freq, report the hotel they reviewed most\n",
+    "for nat in nationality_freq.index[:10]:\n",
+    "    # Keep only the rows written by this nationality\n",
+    "    nat_df = df[df[\"Reviewer_Nationality\"] == nat]\n",
+    "    # Count the reviews per hotel within that subset\n",
+    "    freq = nat_df[\"Hotel_Name\"].value_counts()\n",
+    "    # .iloc[0] reads the first count by position; freq[0] on this label-indexed Series is deprecated\n",
+    "    print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq.iloc[0]) + \" reviews.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Build a slimmer frame by discarding the columns not needed for counting reviews per hotel\n",
+    "hotel_freq_df = df.drop(\n",
+    "    columns=[\n",
+    "        \"Hotel_Address\",\n",
+    "        \"Additional_Number_of_Scoring\",\n",
+    "        \"Review_Date\",\n",
+    "        \"Average_Score\",\n",
+    "        \"Reviewer_Nationality\",\n",
+    "        \"Negative_Review\",\n",
+    "        \"Review_Total_Negative_Word_Counts\",\n",
+    "        \"Positive_Review\",\n",
+    "        \"Review_Total_Positive_Word_Counts\",\n",
+    "        \"Total_Number_of_Reviews_Reviewer_Has_Given\",\n",
+    "        \"Reviewer_Score\",\n",
+    "        \"Tags\",\n",
+    "        \"days_since_review\",\n",
+    "        \"lat\",\n",
+    "        \"lng\",\n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Hotel_Name</th>\n",
+       "      <th>Total_Number_of_Reviews</th>\n",
+       "      <th>Total_Reviews_Found</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Hotel Arena</td>\n",
+       "      <td>1403</td>\n",
+       "      <td>405</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>405</th>\n",
+       "      <td>K K Hotel George</td>\n",
+       "      <td>1831</td>\n",
+       "      <td>566</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>971</th>\n",
+       "      <td>Apex Temple Court Hotel</td>\n",
+       "      <td>2619</td>\n",
+       "      <td>1037</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2008</th>\n",
+       "      <td>The Park Grand London Paddington</td>\n",
+       "      <td>4380</td>\n",
+       "      <td>1770</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3778</th>\n",
+       "      <td>Monhotel Lounge SPA</td>\n",
+       "      <td>171</td>\n",
+       "      <td>35</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>511962</th>\n",
+       "      <td>Suite Hotel 900 m zur Oper</td>\n",
+       "      <td>3461</td>\n",
+       "      <td>439</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>512401</th>\n",
+       "      <td>Hotel Amadeus</td>\n",
+       "      <td>717</td>\n",
+       "      <td>144</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>512545</th>\n",
+       "      <td>The Berkeley</td>\n",
+       "      <td>232</td>\n",
+       "      <td>100</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>512645</th>\n",
+       "      <td>Holiday Inn London Kensington</td>\n",
+       "      <td>5945</td>\n",
+       "      <td>2768</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>515413</th>\n",
+       "      <td>Atlantis Hotel Vienna</td>\n",
+       "      <td>2823</td>\n",
+       "      <td>325</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1492 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                              Hotel_Name  Total_Number_of_Reviews  \\\n",
+       "0                            Hotel Arena                     1403   \n",
+       "405                     K K Hotel George                     1831   \n",
+       "971              Apex Temple Court Hotel                     2619   \n",
+       "2008    The Park Grand London Paddington                     4380   \n",
+       "3778                 Monhotel Lounge SPA                      171   \n",
+       "...                                  ...                      ...   \n",
+       "511962        Suite Hotel 900 m zur Oper                     3461   \n",
+       "512401                     Hotel Amadeus                      717   \n",
+       "512545                      The Berkeley                      232   \n",
+       "512645     Holiday Inn London Kensington                     5945   \n",
+       "515413             Atlantis Hotel Vienna                     2823   \n",
+       "\n",
+       "        Total_Reviews_Found  \n",
+       "0                       405  \n",
+       "405                     566  \n",
+       "971                    1037  \n",
+       "2008                   1770  \n",
+       "3778                     35  \n",
+       "...                     ...  \n",
+       "511962                  439  \n",
+       "512401                  144  \n",
+       "512545                  100  \n",
+       "512645                 2768  \n",
+       "515413                  325  \n",
+       "\n",
+       "[1492 rows x 3 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Count the rows found for each hotel. Selecting one column makes transform() return a\n",
+    "# Series, so the assignment still works when the frame holds more than one non-key column\n",
+    "# (for example when this cell is executed again).\n",
+    "hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name')['Total_Number_of_Reviews'].transform('count')\n",
+    "\n",
+    "# Get rid of all the duplicated rows (keep one row per hotel)\n",
+    "hotel_freq_df = hotel_freq_df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
+    "display(hotel_freq_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Average_Score_Difference</th>\n",
+       "      <th>Average_Score</th>\n",
+       "      <th>Calc_Average_Score</th>\n",
+       "      <th>Hotel_Name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>495945</th>\n",
+       "      <td>-0.8</td>\n",
+       "      <td>7.7</td>\n",
+       "      <td>8.5</td>\n",
+       "      <td>Best Western Hotel Astoria</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>111027</th>\n",
+       "      <td>-0.7</td>\n",
+       "      <td>8.8</td>\n",
+       "      <td>9.5</td>\n",
+       "      <td>Hotel Stendhal Place Vend me Paris MGallery by...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43688</th>\n",
+       "      <td>-0.7</td>\n",
+       "      <td>7.5</td>\n",
+       "      <td>8.2</td>\n",
+       "      <td>Mercure Paris Porte d Orleans</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>178253</th>\n",
+       "      <td>-0.7</td>\n",
+       "      <td>7.9</td>\n",
+       "      <td>8.6</td>\n",
+       "      <td>Renaissance Paris Vendome Hotel</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>218258</th>\n",
+       "      <td>-0.5</td>\n",
+       "      <td>7.0</td>\n",
+       "      <td>7.5</td>\n",
+       "      <td>Hotel Royal Elys es</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>151416</th>\n",
+       "      <td>0.7</td>\n",
+       "      <td>7.8</td>\n",
+       "      <td>7.1</td>\n",
+       "      <td>Best Western Allegro Nation</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>22189</th>\n",
+       "      <td>0.8</td>\n",
+       "      <td>7.1</td>\n",
+       "      <td>6.3</td>\n",
+       "      <td>Holiday Inn Paris Montparnasse Pasteur</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>250308</th>\n",
+       "      <td>0.9</td>\n",
+       "      <td>8.6</td>\n",
+       "      <td>7.7</td>\n",
+       "      <td>MARQUIS Faubourg St Honor Relais Ch teaux</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>68936</th>\n",
+       "      <td>0.9</td>\n",
+       "      <td>6.8</td>\n",
+       "      <td>5.9</td>\n",
+       "      <td>Villa Eugenie</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3813</th>\n",
+       "      <td>1.3</td>\n",
+       "      <td>7.2</td>\n",
+       "      <td>5.9</td>\n",
+       "      <td>Kube Hotel Ice Bar</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>1492 rows × 4 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "        Average_Score_Difference  Average_Score  Calc_Average_Score  \\\n",
+       "495945                      -0.8            7.7                 8.5   \n",
+       "111027                      -0.7            8.8                 9.5   \n",
+       "43688                       -0.7            7.5                 8.2   \n",
+       "178253                      -0.7            7.9                 8.6   \n",
+       "218258                      -0.5            7.0                 7.5   \n",
+       "...                          ...            ...                 ...   \n",
+       "151416                       0.7            7.8                 7.1   \n",
+       "22189                        0.8            7.1                 6.3   \n",
+       "250308                       0.9            8.6                 7.7   \n",
+       "68936                        0.9            6.8                 5.9   \n",
+       "3813                         1.3            7.2                 5.9   \n",
+       "\n",
+       "                                               Hotel_Name  \n",
+       "495945                         Best Western Hotel Astoria  \n",
+       "111027  Hotel Stendhal Place Vend me Paris MGallery by...  \n",
+       "43688                       Mercure Paris Porte d Orleans  \n",
+       "178253                    Renaissance Paris Vendome Hotel  \n",
+       "218258                                Hotel Royal Elys es  \n",
+       "...                                                   ...  \n",
+       "151416                        Best Western Allegro Nation  \n",
+       "22189              Holiday Inn Paris Montparnasse Pasteur  \n",
+       "250308          MARQUIS Faubourg St Honor Relais Ch teaux  \n",
+       "68936                                       Villa Eugenie  \n",
+       "3813                                   Kube Hotel Ice Bar  \n",
+       "\n",
+       "[1492 rows x 4 columns]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "def get_difference_review_avg(row):\n",
+    "    \"\"\"Return Average_Score minus Calc_Average_Score.\n",
+    "\n",
+    "    `row` may be a single row or a whole DataFrame: the body only uses column\n",
+    "    lookup and subtraction, so passing the frame computes every row at once.\n",
+    "    \"\"\"\n",
+    "    return row[\"Average_Score\"] - row[\"Calc_Average_Score\"]\n",
+    "\n",
+    "# 'mean' is the mathematical word for 'average'; rounded to one decimal place\n",
+    "df['Calc_Average_Score'] = df.groupby('Hotel_Name')['Reviewer_Score'].transform('mean').round(1)\n",
+    "\n",
+    "# Add a new column with the difference between the two average scores.\n",
+    "# The helper is applied to the whole frame (vectorized) instead of row by row with apply(axis=1).\n",
+    "df[\"Average_Score_Difference\"] = get_difference_review_avg(df)\n",
+    "\n",
+    "# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n",
+    "review_scores_df = df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
+    "\n",
+    "# Sort the dataframe to find the lowest and highest average score difference\n",
+    "review_scores_df = review_scores_df.sort_values(by=[\"Average_Score_Difference\"])\n",
+    "\n",
+    "display(review_scores_df[[\"Average_Score_Difference\", \"Average_Score\", \"Calc_Average_Score\", \"Hotel_Name\"]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 38,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of No Negative reviews: 127890\n",
+      "Number of No Positive reviews: 35946\n",
+      "Number of both No Negative and No Positive reviews: 127\n",
+      "Sum took 0.17 seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "start = time.time()\n",
+    "# Boolean masks: True where the review text is exactly the given literal\n",
+    "is_no_negative = df[\"Negative_Review\"] == \"No Negative\"\n",
+    "is_no_positive = df[\"Positive_Review\"] == \"No Positive\"\n",
+    "\n",
+    "# Series.sum() counts the True values in one vectorized pass; the builtin sum()\n",
+    "# would iterate over the Series element by element.\n",
+    "no_negative_reviews = is_no_negative.sum()\n",
+    "print(\"Number of No Negative reviews: \" + str(no_negative_reviews))\n",
+    "\n",
+    "no_positive_reviews = is_no_positive.sum()\n",
+    "print(\"Number of No Positive reviews: \" + str(no_positive_reviews))\n",
+    "\n",
+    "# Rows where both masks are True, reusing the masks computed above\n",
+    "both_no_reviews = (is_no_negative & is_no_positive).sum()\n",
+    "print(\"Number of both No Negative and No Positive reviews: \" + str(both_no_reviews))\n",
+    "\n",
+    "end = time.time()\n",
+    "print(\"Sum took \" + str(round(end - start, 2)) + \" seconds\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}