Create Hotel_Dataset_Filtering.py

4 years ago · b1db678757
parent 5e452709e3
commit b1db678757
1 changed files with 57 additions and 0 deletions
--- a/6-NLP/5-Hotel-Reviews-2/Hotel_Dataset_Filtering.py
+++ b/6-NLP/5-Hotel-Reviews-2/Hotel_Dataset_Filtering.py
@@ -0,0 +1,57 @@
+import pandas as pd
+import time
+import ast
+
+# Ordered (marker, replacement) pairs. The first marker found in an address
+# wins, so the order of this table is part of the behavior.
+_ADDRESS_REPLACEMENTS = (
+    ("Netherlands", "Amsterdam, Netherlands"),
+    ("Barcelona", "Barcelona, Spain"),
+    ("United Kingdom", "London, United Kingdom"),
+    ("Milan", "Milan, Italy"),
+    ("France", "Paris, France"),
+    ("Vienna", "Vienna, Austria"),
+)
+
+
+def replace_address(row):
+    """Return a shortened "City, Country" form of a row's hotel address.
+
+    Args:
+        row: Any object supporting ``row["Hotel_Address"]`` (a pandas Series
+            as passed by ``DataFrame.apply(..., axis=1)``, or a plain dict).
+
+    Returns:
+        The replacement string for the first marker of
+        ``_ADDRESS_REPLACEMENTS`` contained in the address. If no marker is
+        found, or the address is not a string (e.g. a missing value), the
+        address is returned unchanged.
+    """
+    address = row["Hotel_Address"]
+    # A missing address (NaN/None) cannot be searched; pass it through as-is.
+    if not isinstance(address, str):
+        return address
+    for marker, replacement in _ADDRESS_REPLACEMENTS:
+        if marker in address:
+            return replacement
+    return address
+    
+# Load the hotel reviews from CSV.
+# perf_counter() is the clock intended for measuring elapsed time.
+start = time.perf_counter()
+df = pd.read_csv("Hotel_Reviews.csv")
+
+# Drop the coordinate columns, which are not used.
+df = df.drop(columns=["lat", "lng"])
+
+# Replace all the addresses with a shortened, more useful form
+df["Hotel_Address"] = df.apply(replace_address, axis=1)
+
+# Drop `Additional_Number_of_Scoring`
+df = df.drop(columns=["Additional_Number_of_Scoring"])
+
+# Replace `Total_Number_of_Reviews` and `Average_Score` with our own values,
+# calculated from the reviews present in this dataset.
+# A single column is selected before transforming: transform("count") on the
+# whole grouped frame returns one count column per remaining column, which
+# cannot be assigned to a single column.
+scores_by_hotel = df.groupby("Hotel_Name")["Reviewer_Score"]
+df["Total_Number_of_Reviews"] = scores_by_hotel.transform("count")
+df["Average_Score"] = scores_by_hotel.transform("mean").round(1)
+
+
+def _has_any_phrase(tags, phrases):
+    """Return 1 if any of `phrases` occurs in the `tags` string, else 0."""
+    return int(any(phrase in tags for phrase in phrases))
+
+
+# Process the Tags into new 0/1 indicator columns.
+# The file Hotel_Reviews_Tags.py identifies the most important tags:
+# Leisure trip, Couple, Solo traveler, Business trip, Group combined with
+# Travelers with friends, Family with young children,
+# Family with older children, With a pet
+tag_columns = {
+    "Leisure_trip": ("Leisure trip",),
+    "Couple": ("Couple",),
+    "Solo_traveler": ("Solo traveler",),
+    "Business_trip": ("Business trip",),
+    # "Group" and "Travelers with friends" are merged into one column.
+    "Group": ("Group", "Travelers with friends"),
+    "Family_with_young_children": ("Family with young children",),
+    "Family_with_older_children": ("Family with older children",),
+    "With_a_pet": ("With a pet",),
+}
+for column, phrases in tag_columns.items():
+    df[column] = df["Tags"].apply(_has_any_phrase, args=(phrases,))
+
+# No longer need any of these columns
+df = df.drop(
+    columns=[
+        "Tags",
+        "Review_Date",
+        "Review_Total_Negative_Word_Counts",
+        "Review_Total_Positive_Word_Counts",
+        "days_since_review",
+        "Total_Number_of_Reviews_Reviewer_Has_Given",
+    ]
+)
+
+# Saving new data file with calculated columns
+print("Saving results to Hotel_Reviews_Filtered.csv")
+df.to_csv("Hotel_Reviews_Filtered.csv", index=False)
+end = time.perf_counter()
+print(f"Filtering took {round(end - start, 2)} seconds")