{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": 3
  },
  "orig_nbformat": 4,
  "coopTranslator": {
   "original_hash": "2d05e7db439376aa824f4b387f8324ca",
   "translation_date": "2025-08-29T15:43:43+00:00",
   "source_file": "6-NLP/4-Hotel-Reviews-1/solution/notebook.ipynb",
   "language_code": "tl"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# EDA\n",
    "import pandas as pd\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_difference_review_avg(row):\n",
    "    return row[\"Average_Score\"] - row[\"Calc_Average_Score\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the hotel reviews from CSV\n",
    "print(\"Loading data file now, this could take a while depending on file size\")\n",
    "start = time.time()\n",
    "df = pd.read_csv('../../data/Hotel_Reviews.csv')\n",
    "end = time.time()\n",
    "print(\"Loading took \" + str(round(end - start, 2)) + \" seconds\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What shape is the data (rows, columns)?\n",
    "print(\"The shape of the data (rows, cols) is \" + str(df.shape))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# value_counts() creates a Series object that has index and values\n",
    "#                in this case, the country and the frequency they occur in reviewer nationality\n",
    "nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What reviewer nationality is the most common in the dataset?\n",
    "print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What is the top 10 most common nationalities and their frequencies?\n",
    "print(\"The top 10 highest frequency reviewer nationalities are:\")\n",
    "print(nationality_freq[0:10].to_string())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many unique nationalities are there?\n",
    "print(\"There are \" + str(nationality_freq.index.size) + \" unique nationalities in the dataset\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# What was the most frequently reviewed hotel for the top 10 nationalities - print the hotel and number of reviews\n",
    "for nat in nationality_freq[:10].index:\n",
    "   # First, extract all the rows that match the criteria into a new dataframe\n",
    "   nat_df = df[df[\"Reviewer_Nationality\"] == nat]   \n",
    "   # Now get the hotel freq\n",
    "   freq = nat_df[\"Hotel_Name\"].value_counts()\n",
    "   print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\") \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How many reviews are there per hotel (frequency count of hotel) and do the results match the value in `Total_Number_of_Reviews`?\n",
    "# First create a new dataframe based on the old one, removing the uneeded columns\n",
    "hotel_freq_df = df.drop([\"Hotel_Address\", \"Additional_Number_of_Scoring\", \"Review_Date\", \"Average_Score\", \"Reviewer_Nationality\", \"Negative_Review\", \"Review_Total_Negative_Word_Counts\", \"Positive_Review\", \"Review_Total_Positive_Word_Counts\", \"Total_Number_of_Reviews_Reviewer_Has_Given\", \"Reviewer_Score\", \"Tags\", \"days_since_review\", \"lat\", \"lng\"], axis = 1)\n",
    "# Group the rows by Hotel_Name, count them and put the result in a new column Total_Reviews_Found\n",
    "hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')\n",
    "# Get rid of all the duplicated rows\n",
    "hotel_freq_df = hotel_freq_df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
    "print()\n",
    "print(hotel_freq_df.to_string())\n",
    "print(str(hotel_freq_df.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# While there is an `Average_Score` for each hotel according to the dataset, \n",
    "# you can also calculate an average score (getting the average of all reviewer scores in the dataset for each hotel)\n",
    "# Add a new column to your dataframe with the column header `Calc_Average_Score` that contains that calculated average. \n",
    "df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n",
    "# Add a new column with the difference between the two average scores\n",
    "df[\"Average_Score_Difference\"] = df.apply(get_difference_review_avg, axis = 1)\n",
    "# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n",
    "review_scores_df = df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
    "# Sort the dataframe to find the lowest and highest average score difference\n",
    "review_scores_df = review_scores_df.sort_values(by=[\"Average_Score_Difference\"])\n",
    "print(review_scores_df[[\"Average_Score_Difference\", \"Average_Score\", \"Calc_Average_Score\", \"Hotel_Name\"]])\n",
    "# Do any hotels have the same (rounded to 1 decimal place) `Average_Score` and `Calc_Average_Score`?\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n---\n\n**Paunawa**:  \nAng dokumentong ito ay isinalin gamit ang AI translation service na [Co-op Translator](https://github.com/Azure/co-op-translator). Bagama't sinisikap naming maging tumpak, tandaan na ang mga awtomatikong pagsasalin ay maaaring maglaman ng mga pagkakamali o hindi pagkakatugma. Ang orihinal na dokumento sa kanyang katutubong wika ang dapat ituring na opisyal na sanggunian. Para sa mahalagang impormasyon, inirerekomenda ang propesyonal na pagsasalin ng tao. Hindi kami mananagot sa anumang hindi pagkakaunawaan o maling interpretasyon na maaaring magmula sa paggamit ng pagsasaling ito.\n"
   ]
  }
 ]
}