"""Explore the Tags column of the Hotel_Reviews dataset.

This script is not integral to the NLP aspect of the lesson, but it is useful
for learning pandas and exploratory data analysis (EDA). The goal is to
identify which tags are worth keeping.

Originally added to the repository as
6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Tags.py.
"""
import pandas as pd

# CSV file holding the filtered hotel reviews; it must contain a "Tags" column.
REVIEWS_CSV = 'Hotel_Reviews_Filtered.csv'

# Case-insensitive regular expression matching the tags we do NOT want to keep:
# rooms, suites, length of stay and mobile-device tags.
EXCLUDED_TAG_PATTERN = "Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double"

# Tags must occur strictly more often than this to be reported.
MIN_TAG_COUNT = 1000


def melt_tags(df: pd.DataFrame) -> pd.DataFrame:
    """Return every tag of every review as one long "value" column.

    Each entry of ``df["Tags"]`` is expected to be the string form of a list,
    e.g. ``"[' Leisure trip ', ' Couple ']"``. The entry is cleaned, split on
    commas and spread over ``Tag_1 .. Tag_N`` columns, which are then melted
    into a single column.

    Args:
        df: Reviews frame with a string column named ``"Tags"``. It is not
            modified.

    Returns:
        A frame with the columns ``"variable"`` (``"Tag_1"``, ``"Tag_2"``, ...)
        and ``"value"`` (the tag text, missing where a review has fewer tags
        than the longest review).
    """
    # Remove the opening and closing brackets (and the outermost quotes).
    tags = df["Tags"].str.strip("[']")
    # Collapse the quoted separators between tags into a plain comma.
    tags = tags.str.replace(" ', '", ",", regex=False)

    # Split on commas only: the dataset already stores each tag as a phrase,
    # so the phrases are kept whole rather than broken into words.
    split_tags = tags.str.split(',', expand=True)

    # Build one Tag_N column per split position, however many there are,
    # instead of assuming exactly six. Leading/trailing spaces are removed.
    tag_columns = pd.DataFrame({
        f"Tag_{position + 1}": split_tags[position].str.strip()
        for position in split_tags.columns
    })

    # Merge all the tag columns into one with melt.
    return tag_columns.melt()


def count_frequent_tags(
    df_tags: pd.DataFrame,
    exclude_pattern: str = EXCLUDED_TAG_PATTERN,
    min_count: int = MIN_TAG_COUNT,
) -> pd.DataFrame:
    """Count the tags worth keeping.

    Args:
        df_tags: Long-format frame with a ``"value"`` column of tag strings,
            as produced by :func:`melt_tags`.
        exclude_pattern: Regular expression (matched case-insensitively);
            tags containing a match are dropped before counting.
        min_count: Only tags occurring strictly more than this many times
            are returned.

    Returns:
        A frame with one row per remaining tag, ordered by descending
        frequency, whose ``"count"`` column holds the number of occurrences.
    """
    # Missing tags (na=False) never match the pattern; value_counts() then
    # ignores them anyway.
    unwanted = df_tags["value"].str.contains(exclude_pattern, na=False, case=False)
    counts = df_tags.loc[~unwanted, "value"].value_counts().reset_index(name="count")
    return counts[counts["count"] > min_count]


def main() -> None:
    """Load the reviews and print the most useful tags."""
    # Load the hotel reviews from CSV.
    df = pd.read_csv(REVIEWS_CSV)

    # We want to find the most useful tags to keep.
    df_tags = melt_tags(df)
    print("The shape of the tags with no filtering:", str(df_tags.shape))

    # Drop rooms, suites, length of stay and mobile device tags, and anything
    # with a count of 1000 or less.
    tag_vc = count_frequent_tags(df_tags)

    # Print the top 10 (there should only be 9 and we'll use these in the
    # filtering section).
    print(tag_vc[:10])

    # Output recorded by the original author (on pandas >= 2.0 the first
    # column is labelled "value" instead of "index"):
    #
    #    index                        count
    # 0  Leisure trip                 417778
    # 1  Couple                       252294
    # 2  Solo traveler                108545
    # 3  Business trip                 82939
    # 4  Group                         65392
    # 5  Family with young children    61015
    # 6  Family with older children    26349
    # 7  Travelers with friends         2143
    # 8  With a pet                     1405


if __name__ == "__main__":
    main()