{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "orig_nbformat": 4,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.0 64-bit ('3.7')"
  },
  "interpreter": {
   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Filter the hotel reviews dataset\n",
    "\n",
    "Loads `Hotel_Reviews.csv`, drops the columns that are not used, shortens each hotel address to a `City, Country` form, recalculates the per-hotel review count and average score from the rows in the file, turns the most important tags into 0/1 indicator columns, and saves the result as `Hotel_Reviews_Filtered.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import time\n",
    "import ast"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ordered (keyword, replacement) pairs; the first keyword found in the address wins\n",
    "ADDRESS_REPLACEMENTS = (\n",
    "    (\"Netherlands\", \"Amsterdam, Netherlands\"),\n",
    "    (\"Barcelona\", \"Barcelona, Spain\"),\n",
    "    (\"United Kingdom\", \"London, United Kingdom\"),\n",
    "    (\"Milan\", \"Milan, Italy\"),\n",
    "    (\"France\", \"Paris, France\"),\n",
    "    (\"Vienna\", \"Vienna, Austria\"),\n",
    ")\n",
    "\n",
    "\n",
    "def replace_address(row):\n",
    "    \"\"\"Return the shortened `City, Country` form of a row's hotel address.\n",
    "\n",
    "    `row` is indexed with \"Hotel_Address\"; that value is searched for each\n",
    "    known keyword in order. When none of the keywords occur in the address,\n",
    "    the original address is returned unchanged.\n",
    "    \"\"\"\n",
    "    address = row[\"Hotel_Address\"]\n",
    "    for keyword, replacement in ADDRESS_REPLACEMENTS:\n",
    "        if keyword in address:\n",
    "            return replacement\n",
    "    return address\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the hotel reviews from CSV\n",
    "# `start` is read again in the last cell to report the total run time\n",
    "start = time.time()\n",
    "df = pd.read_csv('../../data/Hotel_Reviews.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the columns we will not use\n",
    "df = df.drop(columns=[\"lat\", \"lng\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace all the addresses with a shortened, more useful form\n",
    "df[\"Hotel_Address\"] = df.apply(replace_address, axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop `Additional_Number_of_Scoring`\n",
    "df = df.drop(columns=[\"Additional_Number_of_Scoring\"])\n",
    "# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values.\n",
    "# A single column is selected before `transform`: without the selection the result is a\n",
    "# whole DataFrame (one count per remaining column), which cannot be assigned to one column.\n",
    "df[\"Total_Number_of_Reviews\"] = df.groupby(\"Hotel_Name\")[\"Hotel_Name\"].transform(\"count\")\n",
    "df[\"Average_Score\"] = df.groupby(\"Hotel_Name\")[\"Reviewer_Score\"].transform(\"mean\").round(1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process the Tags into new columns\n",
    "# The file Hotel_Reviews_Tags.py identifies the most important tags:\n",
    "# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends,\n",
    "# Family with young children, Family with older children, With a pet\n",
    "# Maps each new column to the phrases that set it to 1 when any of them occurs in `Tags`\n",
    "TAG_COLUMNS = {\n",
    "    \"Leisure_trip\": (\"Leisure trip\",),\n",
    "    \"Couple\": (\"Couple\",),\n",
    "    \"Solo_traveler\": (\"Solo traveler\",),\n",
    "    \"Business_trip\": (\"Business trip\",),\n",
    "    \"Group\": (\"Group\", \"Travelers with friends\"),\n",
    "    \"Family_with_young_children\": (\"Family with young children\",),\n",
    "    \"Family_with_older_children\": (\"Family with older children\",),\n",
    "    \"With_a_pet\": (\"With a pet\",),\n",
    "}\n",
    "for column, phrases in TAG_COLUMNS.items():\n",
    "    df[column] = df[\"Tags\"].apply(\n",
    "        lambda tags: int(any(phrase in tags for phrase in phrases))\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# No longer need any of these columns\n",
    "df = df.drop(\n",
    "    columns=[\n",
    "        \"Review_Date\",\n",
    "        \"Review_Total_Negative_Word_Counts\",\n",
    "        \"Review_Total_Positive_Word_Counts\",\n",
    "        \"days_since_review\",\n",
    "        \"Total_Number_of_Reviews_Reviewer_Has_Given\",\n",
    "    ]\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Saving results to Hotel_Reviews_Filtered.csv\n",
      "Filtering took 23.74 seconds\n"
     ]
    }
   ],
   "source": [
    "# Save the new data file with the calculated columns\n",
    "print(\"Saving results to Hotel_Reviews_Filtered.csv\")\n",
    "df.to_csv(r'../../data/Hotel_Reviews_Filtered.csv', index = False)\n",
    "# Elapsed time is measured from `start`, set just before the CSV was loaded\n",
    "end = time.time()\n",
    "print(\"Filtering took \" + str(round(end - start, 2)) + \" seconds\")\n"
   ]
  }
 ]
}