parent
5e452709e3
commit
b1db678757
@ -0,0 +1,57 @@
|
||||
import pandas as pd
|
||||
import time
|
||||
import ast
|
||||
|
||||
def replace_address(row):
|
||||
if "Netherlands" in row["Hotel_Address"]:
|
||||
return "Amsterdam, Netherlands"
|
||||
elif "Barcelona" in row["Hotel_Address"]:
|
||||
return "Barcelona, Spain"
|
||||
elif "United Kingdom" in row["Hotel_Address"]:
|
||||
return "London, United Kingdom"
|
||||
elif "Milan" in row["Hotel_Address"]:
|
||||
return "Milan, Italy"
|
||||
elif "France" in row["Hotel_Address"]:
|
||||
return "Paris, France"
|
||||
elif "Vienna" in row["Hotel_Address"]:
|
||||
return "Vienna, Austria"
|
||||
else:
|
||||
return row.Hotel_Address
|
||||
|
||||
# Load the hotel reviews from CSV
|
||||
start = time.time()
|
||||
df = pd.read_csv('Hotel_Reviews.csv')
|
||||
|
||||
# dropping columns we will not use:
|
||||
df.drop(["lat", "lng"], axis = 1, inplace=True)
|
||||
|
||||
# Replace all the addresses with a shortened, more useful form
|
||||
df["Hotel_Address"] = df.apply(replace_address, axis = 1)
|
||||
|
||||
# Drop `Additional_Number_of_Scoring`
|
||||
df.drop(["Additional_Number_of_Scoring"], axis = 1, inplace=True)
|
||||
# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values
|
||||
df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count')
|
||||
df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)
|
||||
|
||||
# Process the Tags into new columns
|
||||
# The file Hotel_Reviews_Tags.py, identifies the most important tags
|
||||
# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends,
|
||||
# Family with young children, Family with older children, With a pet
|
||||
df["Leisure_trip"] = df.Tags.apply(lambda tag: 1 if "Leisure trip" in tag else 0)
|
||||
df["Couple"] = df.Tags.apply(lambda tag: 1 if "Couple" in tag else 0)
|
||||
df["Solo_traveler"] = df.Tags.apply(lambda tag: 1 if "Solo traveler" in tag else 0)
|
||||
df["Business_trip"] = df.Tags.apply(lambda tag: 1 if "Business trip" in tag else 0)
|
||||
df["Group"] = df.Tags.apply(lambda tag: 1 if "Group" in tag or "Travelers with friends" in tag else 0)
|
||||
df["Family_with_young_children"] = df.Tags.apply(lambda tag: 1 if "Family with young children" in tag else 0)
|
||||
df["Family_with_older_children"] = df.Tags.apply(lambda tag: 1 if "Family with older children" in tag else 0)
|
||||
df["With_a_pet"] = df.Tags.apply(lambda tag: 1 if "With a pet" in tag else 0)
|
||||
|
||||
# No longer need any of these columns
|
||||
df.drop(["Tags", "Review_Date", "Review_Total_Negative_Word_Counts", "Review_Total_Positive_Word_Counts", "days_since_review", "Total_Number_of_Reviews_Reviewer_Has_Given"], axis = 1, inplace=True)
|
||||
|
||||
# Saving new data file with calculated columns
|
||||
print("Saving results to Hotel_Reviews_Filtered.csv")
|
||||
df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False)
|
||||
end = time.time()
|
||||
print("Filtering took " + str(round(end - start, 2)) + " seconds")
|
Loading…
Reference in new issue