ML-For-Beginners/translations/vi/6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb

{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "orig_nbformat": 4,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.0 64-bit ('3.7')"
  },
  "interpreter": {
   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
  },
  "coopTranslator": {
   "original_hash": "033cb89c85500224b3c63fd04f49b4aa",
   "translation_date": "2025-09-06T15:22:44+00:00",
   "source_file": "6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb",
   "language_code": "vi"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import time\n",
    "import ast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def replace_address(row):\n",
    "    if \"Netherlands\" in row[\"Hotel_Address\"]:\n",
    "        return \"Amsterdam, Netherlands\"\n",
    "    elif \"Barcelona\" in row[\"Hotel_Address\"]:\n",
    "        return \"Barcelona, Spain\"\n",
    "    elif \"United Kingdom\" in row[\"Hotel_Address\"]:\n",
    "        return \"London, United Kingdom\"\n",
    "    elif \"Milan\" in row[\"Hotel_Address\"]:        \n",
    "        return \"Milan, Italy\"\n",
    "    elif \"France\" in row[\"Hotel_Address\"]:\n",
    "        return \"Paris, France\"\n",
    "    elif \"Vienna\" in row[\"Hotel_Address\"]:\n",
    "        return \"Vienna, Austria\" \n",
    "    else:\n",
    "        return row.Hotel_Address\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the hotel reviews from CSV\n",
    "start = time.time()\n",
    "df = pd.read_csv('../../data/Hotel_Reviews.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# dropping columns we will not use:\n",
    "df.drop([\"lat\", \"lng\"], axis = 1, inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace all the addresses with a shortened, more useful form\n",
    "df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop `Additional_Number_of_Scoring`\n",
    "df.drop([\"Additional_Number_of_Scoring\"], axis = 1, inplace=True)\n",
    "# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values\n",
    "df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')\n",
    "df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process the Tags into new columns\n",
    "# The file Hotel_Reviews_Tags.py, identifies the most important tags\n",
    "# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, \n",
    "# Family with young children, Family with older children, With a pet\n",
    "df[\"Leisure_trip\"] = df.Tags.apply(lambda tag: 1 if \"Leisure trip\" in tag else 0)\n",
    "df[\"Couple\"] = df.Tags.apply(lambda tag: 1 if \"Couple\" in tag else 0)\n",
    "df[\"Solo_traveler\"] = df.Tags.apply(lambda tag: 1 if \"Solo traveler\" in tag else 0)\n",
    "df[\"Business_trip\"] = df.Tags.apply(lambda tag: 1 if \"Business trip\" in tag else 0)\n",
    "df[\"Group\"] = df.Tags.apply(lambda tag: 1 if \"Group\" in tag or \"Travelers with friends\" in tag else 0)\n",
    "df[\"Family_with_young_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with young children\" in tag else 0)\n",
    "df[\"Family_with_older_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with older children\" in tag else 0)\n",
    "df[\"With_a_pet\"] = df.Tags.apply(lambda tag: 1 if \"With a pet\" in tag else 0)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No longer need any of these columns\n",
    "df.drop([\"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Saving results to Hotel_Reviews_Filtered.csv\n",
      "Filtering took 23.74 seconds\n"
     ]
    }
   ],
   "source": [
    "# Saving new data file with calculated columns\n",
    "print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
    "df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n",
    "end = time.time()\n",
    "print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n---\n\n**Tuyên bố miễn trừ trách nhiệm**:  \nTài liệu này đã được dịch bằng dịch vụ dịch thuật AI [Co-op Translator](https://github.com/Azure/co-op-translator). Mặc dù chúng tôi cố gắng đảm bảo độ chính xác, xin lưu ý rằng các bản dịch tự động có thể chứa lỗi hoặc không chính xác. Tài liệu gốc bằng ngôn ngữ bản địa nên được coi là nguồn thông tin chính thức. Đối với các thông tin quan trọng, khuyến nghị sử dụng dịch vụ dịch thuật chuyên nghiệp bởi con người. Chúng tôi không chịu trách nhiệm cho bất kỳ sự hiểu lầm hoặc diễn giải sai nào phát sinh từ việc sử dụng bản dịch này.\n"
   ]
  }
 ]
}