diff --git a/6-NLP/5-Hotel-Reviews-2/Hotel_Dataset_Filtering.py b/6-NLP/5-Hotel-Reviews-2/Hotel_Dataset_Filtering.py new file mode 100644 index 000000000..d11f15948 --- /dev/null +++ b/6-NLP/5-Hotel-Reviews-2/Hotel_Dataset_Filtering.py @@ -0,0 +1,57 @@ +import pandas as pd +import time +import ast + +def replace_address(row): + if "Netherlands" in row["Hotel_Address"]: + return "Amsterdam, Netherlands" + elif "Barcelona" in row["Hotel_Address"]: + return "Barcelona, Spain" + elif "United Kingdom" in row["Hotel_Address"]: + return "London, United Kingdom" + elif "Milan" in row["Hotel_Address"]: + return "Milan, Italy" + elif "France" in row["Hotel_Address"]: + return "Paris, France" + elif "Vienna" in row["Hotel_Address"]: + return "Vienna, Austria" + else: + return row.Hotel_Address + +# Load the hotel reviews from CSV +start = time.time() +df = pd.read_csv('Hotel_Reviews.csv') + +# dropping columns we will not use: +df.drop(["lat", "lng"], axis = 1, inplace=True) + +# Replace all the addresses with a shortened, more useful form +df["Hotel_Address"] = df.apply(replace_address, axis = 1) + +# Drop `Additional_Number_of_Scoring` +df.drop(["Additional_Number_of_Scoring"], axis = 1, inplace=True) +# Replace `Total_Number_of_Reviews` and `Average_Score` with our own calculated values +df.Total_Number_of_Reviews = df.groupby('Hotel_Name').transform('count') +df.Average_Score = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1) + +# Process the Tags into new columns +# The file Hotel_Reviews_Tags.py, identifies the most important tags +# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends, +# Family with young children, Family with older children, With a pet +df["Leisure_trip"] = df.Tags.apply(lambda tag: 1 if "Leisure trip" in tag else 0) +df["Couple"] = df.Tags.apply(lambda tag: 1 if "Couple" in tag else 0) +df["Solo_traveler"] = df.Tags.apply(lambda tag: 1 if "Solo traveler" in tag else 0) +df["Business_trip"] = df.Tags.apply(lambda tag: 1 if "Business trip" in tag else 0) +df["Group"] = df.Tags.apply(lambda tag: 1 if "Group" in tag or "Travelers with friends" in tag else 0) +df["Family_with_young_children"] = df.Tags.apply(lambda tag: 1 if "Family with young children" in tag else 0) +df["Family_with_older_children"] = df.Tags.apply(lambda tag: 1 if "Family with older children" in tag else 0) +df["With_a_pet"] = df.Tags.apply(lambda tag: 1 if "With a pet" in tag else 0) + +# No longer need any of these columns +df.drop(["Tags", "Review_Date", "Review_Total_Negative_Word_Counts", "Review_Total_Positive_Word_Counts", "days_since_review", "Total_Number_of_Reviews_Reviewer_Has_Given"], axis = 1, inplace=True) + +# Saving new data file with calculated columns +print("Saving results to Hotel_Reviews_Filtered.csv") +df.to_csv(r'Hotel_Reviews_Filtered.csv', index = False) +end = time.time() +print("Filtering took " + str(round(end - start, 2)) + " seconds")