You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
159 lines
4.9 KiB
159 lines
4.9 KiB
4 years ago
|
{
|
||
|
"metadata": {
|
||
|
"language_info": {
|
||
|
"codemirror_mode": {
|
||
|
"name": "ipython",
|
||
|
"version": 3
|
||
|
},
|
||
|
"file_extension": ".py",
|
||
|
"mimetype": "text/x-python",
|
||
|
"name": "python",
|
||
|
"nbconvert_exporter": "python",
|
||
|
"pygments_lexer": "ipython3",
|
||
4 years ago
|
"version": "3.7.0"
|
||
4 years ago
|
},
|
||
4 years ago
|
"orig_nbformat": 4,
|
||
|
"kernelspec": {
|
||
|
"name": "python3",
|
||
|
"display_name": "Python 3.7.0 64-bit ('3.7')"
|
||
|
},
|
||
|
"interpreter": {
|
||
|
"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
|
||
|
}
|
||
4 years ago
|
},
|
||
|
"nbformat": 4,
|
||
|
"nbformat_minor": 2,
|
||
|
"cells": [
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 1,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"import pandas as pd\n",
|
||
|
"import time\n",
|
||
|
"import ast"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 2,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"def replace_address(row):\n",
|
||
|
" if \"Netherlands\" in row[\"Hotel_Address\"]:\n",
|
||
|
" return \"Amsterdam, Netherlands\"\n",
|
||
|
" elif \"Barcelona\" in row[\"Hotel_Address\"]:\n",
|
||
|
" return \"Barcelona, Spain\"\n",
|
||
|
" elif \"United Kingdom\" in row[\"Hotel_Address\"]:\n",
|
||
|
" return \"London, United Kingdom\"\n",
|
||
|
" elif \"Milan\" in row[\"Hotel_Address\"]: \n",
|
||
|
" return \"Milan, Italy\"\n",
|
||
|
" elif \"France\" in row[\"Hotel_Address\"]:\n",
|
||
|
" return \"Paris, France\"\n",
|
||
|
" elif \"Vienna\" in row[\"Hotel_Address\"]:\n",
|
||
|
" return \"Vienna, Austria\" \n",
|
||
|
" else:\n",
|
||
|
" return row.Hotel_Address\n",
|
||
|
" "
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 3,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Load the hotel reviews from CSV\n",
|
||
|
"start = time.time()\n",
|
||
|
"df = pd.read_csv('../../data/Hotel_Reviews.csv')\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 4,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# dropping columns we will not use:\n",
|
||
|
"df.drop([\"lat\", \"lng\"], axis = 1, inplace=True)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 5,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Replace all the addresses with a shortened, more useful form\n",
|
||
|
"df[\"Hotel_Address\"] = df.apply(replace_address, axis = 1)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 6,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Drop `Additional_Number_of_Scoring`\n",
|
||
|
"df.drop([\"Additional_Number_of_Scoring\"], axis = 1, inplace=True)\n",
|
||
|
"# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values\n",
|
||
|
"df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')\n",
|
||
|
"df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 7,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# Process the Tags into new columns\n",
|
||
|
"# The file Hotel_Reviews_Tags.py identifies the most important tags\n",
|
||
|
"# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, \n",
|
||
|
"# Family with young children, Family with older children, With a pet\n",
|
||
|
"df[\"Leisure_trip\"] = df.Tags.apply(lambda tag: 1 if \"Leisure trip\" in tag else 0)\n",
|
||
|
"df[\"Couple\"] = df.Tags.apply(lambda tag: 1 if \"Couple\" in tag else 0)\n",
|
||
|
"df[\"Solo_traveler\"] = df.Tags.apply(lambda tag: 1 if \"Solo traveler\" in tag else 0)\n",
|
||
|
"df[\"Business_trip\"] = df.Tags.apply(lambda tag: 1 if \"Business trip\" in tag else 0)\n",
|
||
|
"df[\"Group\"] = df.Tags.apply(lambda tag: 1 if \"Group\" in tag or \"Travelers with friends\" in tag else 0)\n",
|
||
|
"df[\"Family_with_young_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with young children\" in tag else 0)\n",
|
||
|
"df[\"Family_with_older_children\"] = df.Tags.apply(lambda tag: 1 if \"Family with older children\" in tag else 0)\n",
|
||
|
"df[\"With_a_pet\"] = df.Tags.apply(lambda tag: 1 if \"With a pet\" in tag else 0)\n"
|
||
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 8,
|
||
4 years ago
|
"metadata": {},
|
||
|
"outputs": [],
|
||
|
"source": [
|
||
|
"# No longer need any of these columns\n",
|
||
4 years ago
|
"df.drop([\"Review_Date\", \"Review_Total_Negative_Word_Counts\", \"Review_Total_Positive_Word_Counts\", \"days_since_review\", \"Total_Number_of_Reviews_Reviewer_Has_Given\"], axis = 1, inplace=True)\n"
|
||
4 years ago
|
]
|
||
|
},
|
||
|
{
|
||
|
"cell_type": "code",
|
||
4 years ago
|
"execution_count": 9,
|
||
4 years ago
|
"metadata": {},
|
||
4 years ago
|
"outputs": [
|
||
|
{
|
||
|
"output_type": "stream",
|
||
|
"name": "stdout",
|
||
|
"text": [
|
||
|
"Saving results to Hotel_Reviews_Filtered.csv\n",
|
||
|
"Filtering took 23.74 seconds\n"
|
||
|
]
|
||
|
}
|
||
|
],
|
||
4 years ago
|
"source": [
|
||
|
"# Saving new data file with calculated columns\n",
|
||
|
"print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
|
||
4 years ago
|
"df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n",
|
||
4 years ago
|
"end = time.time()\n",
|
||
|
"print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
|
||
|
]
|
||
|
}
|
||
|
]
|
||
|
}
|