You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ML-For-Beginners/translations/hk/6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb

172 lines
5.7 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
},
"orig_nbformat": 4,
"kernelspec": {
"name": "python3",
"display_name": "Python 3.7.0 64-bit ('3.7')"
},
"interpreter": {
"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
},
"coopTranslator": {
"original_hash": "033cb89c85500224b3c63fd04f49b4aa",
"translation_date": "2025-09-03T20:58:29+00:00",
"source_file": "6-NLP/5-Hotel-Reviews-2/solution/1-notebook.ipynb",
"language_code": "hk"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import time\n",
"import ast"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def replace_address(row):\n",
" if \"Netherlands\" in row[\"Hotel_Address\"]:\n",
" return \"Amsterdam, Netherlands\"\n",
" elif \"Barcelona\" in row[\"Hotel_Address\"]:\n",
" return \"Barcelona, Spain\"\n",
" elif \"United Kingdom\" in row[\"Hotel_Address\"]:\n",
" return \"London, United Kingdom\"\n",
" elif \"Milan\" in row[\"Hotel_Address\"]: \n",
" return \"Milan, Italy\"\n",
" elif \"France\" in row[\"Hotel_Address\"]:\n",
" return \"Paris, France\"\n",
" elif \"Vienna\" in row[\"Hotel_Address\"]:\n",
" return \"Vienna, Austria\" \n",
" else:\n",
" return row.Hotel_Address\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Load the hotel reviews from CSV\n",
"start = time.time()\n",
"df = pd.read_csv('../../data/Hotel_Reviews.csv')\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# dropping columns we will not use:\n",
"df.drop([\"lat\", \"lng\"], axis = 1, inplace=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Replace all the addresses with a shortened, more useful form\n",
"df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Drop `Additional_Number_of_Scoring`\n",
"df.drop([\"Additional_Number_of_Scoring\"], axis = 1, inplace=True)\n",
"# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values\n",
"df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')\n",
"df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Process the Tags into new columns\n",
"# The file Hotel_Reviews_Tags.py, identifies the most important tags\n",
"# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, \n",
"# Family with young children, Family with older children, With a pet\n",
"df[\"Leisure_trip\"] = df.Tags.apply(lambda tag: 1 if \"Leisure trip\" in tag else 0)\n",
"df[\"Couple\"] = df.Tags.apply(lambda tag: 1 if \"Couple\" in tag else 0)\n",
"df[\"Solo_traveler\"] = df.Tags.apply(lambda tag: 1 if \"Solo traveler\" in tag else 0)\n",
"df[\"Business_trip\"] = df.Tags.apply(lambda tag: 1 if \"Business trip\" in tag else 0)\n",
"df[\"Group\"] = df.Tags.apply(lambda tag: 1 if \"Group\" in tag or \"Travelers with friends\" in tag else 0)\n",
"df[\"Family_with_young_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with young children\" in tag else 0)\n",
"df[\"Family_with_older_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with older children\" in tag else 0)\n",
"df[\"With_a_pet\"] = df.Tags.apply(lambda tag: 1 if \"With a pet\" in tag else 0)\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# No longer need any of these columns\n",
"df.drop([\"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Saving results to Hotel_Reviews_Filtered.csv\n",
"Filtering took 23.74 seconds\n"
]
}
],
"source": [
"# Saving new data file with calculated columns\n",
"print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
"df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n",
"end = time.time()\n",
"print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n---\n\n**免責聲明** \n本文件已使用人工智能翻譯服務 [Co-op Translator](https://github.com/Azure/co-op-translator) 進行翻譯。雖然我們致力於提供準確的翻譯,但請注意,自動翻譯可能包含錯誤或不準確之處。原始語言的文件應被視為權威來源。對於重要信息,建議使用專業人工翻譯。我們對因使用此翻譯而引起的任何誤解或錯誤解釋概不負責。\n"
]
}
]
}