parent
145a42e330
commit
ad372f4863
@ -1,62 +0,0 @@
|
||||
# EDA
|
||||
import pandas as pd
|
||||
import time
|
||||
|
||||
def get_difference_review_avg(row):
    """Return how much the dataset's Average_Score exceeds the recalculated one.

    Expects a row (or mapping) with "Average_Score" and "Calc_Average_Score"
    entries; used row-wise via DataFrame.apply.
    """
    dataset_avg = row["Average_Score"]
    recalculated_avg = row["Calc_Average_Score"]
    return dataset_avg - recalculated_avg
|
||||
|
||||
# Load the hotel reviews from CSV, timing the read since the file is large
print("Loading data file now, this could take a while depending on file size")
start = time.time()
df = pd.read_csv('Hotel_Reviews.csv')
end = time.time()
# f-strings instead of "+ str(...)" concatenation (same output, idiomatic)
print(f"Loading took {round(end - start, 2)} seconds")

# What shape is the data (rows, columns)?
print(f"The shape of the data (rows, cols) is {df.shape}")
|
||||
|
||||
# value_counts() creates a Series object that has index and values
# in this case, the country and the frequency they occur in reviewer nationality
nationality_freq = df["Reviewer_Nationality"].value_counts()

# What reviewer nationality is the most common in the dataset?
# Use .iloc for positional access: plain integer [] on a Series is deprecated
# in pandas (it is treated as a label lookup, not a position).
print("The highest frequency reviewer nationality is " + str(nationality_freq.index[0]).strip() + " with " + str(nationality_freq.iloc[0]) + " reviews.")

# What is the top 10 most common nationalities and their frequencies?
print("The top 10 highest frequency reviewer nationalities are:")
print(nationality_freq.iloc[0:10].to_string())

# How many unique nationalities are there?
print("There are " + str(nationality_freq.index.size) + " unique nationalities in the dataset")
|
||||
|
||||
# What was the most frequently reviewed hotel for the top 10 nationalities - print the hotel and number of reviews
for nat in nationality_freq[:10].index:
    # First, extract all the rows that match the criteria into a new dataframe
    nat_df = df[df["Reviewer_Nationality"] == nat]
    # Now get the hotel freq
    freq = nat_df["Hotel_Name"].value_counts()
    # .iloc[0] for positional access (integer [] on a Series is deprecated)
    print("The most reviewed hotel for " + str(nat).strip() + " was " + str(freq.index[0]) + " with " + str(freq.iloc[0]) + " reviews.")
|
||||
|
||||
# How many reviews are there per hotel (frequency count of hotel) and do the results match the value in `Total_Number_of_Reviews`?
# Keep only Hotel_Name and Total_Number_of_Reviews by dropping everything else
unneeded_cols = [
    "Hotel_Address", "Additional_Number_of_Scoring", "Review_Date",
    "Average_Score", "Reviewer_Nationality", "Negative_Review",
    "Review_Total_Negative_Word_Counts", "Positive_Review",
    "Review_Total_Positive_Word_Counts",
    "Total_Number_of_Reviews_Reviewer_Has_Given", "Reviewer_Score",
    "Tags", "days_since_review", "lat", "lng",
]
hotel_freq_df = df.drop(unneeded_cols, axis=1)
# Group the rows by Hotel_Name, count them and put the result in a new column Total_Reviews_Found
hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')
# Get rid of all the duplicated rows so there is one row per hotel
hotel_freq_df = hotel_freq_df.drop_duplicates(subset=["Hotel_Name"])
print()
print(hotel_freq_df.to_string())
print(str(hotel_freq_df.shape))
|
||||
|
||||
# While there is an `Average_Score` for each hotel according to the dataset,
# you can also calculate an average score (getting the average of all reviewer scores in the dataset for each hotel)
# Add a new column to your dataframe with the column header `Calc_Average_Score` that contains that calculated average.
df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)
# Add a new column with the difference between the two average scores
# (vectorized column subtraction instead of a row-wise apply — identical
# result, but runs at C speed rather than one Python call per row)
df["Average_Score_Difference"] = df["Average_Score"] - df["Calc_Average_Score"]
# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)
review_scores_df = df.drop_duplicates(subset=["Hotel_Name"])
# Sort the dataframe to find the lowest and highest average score difference
review_scores_df = review_scores_df.sort_values(by=["Average_Score_Difference"])
print(review_scores_df[["Average_Score_Difference", "Average_Score", "Calc_Average_Score", "Hotel_Name"]])
# Do any hotels have the same (rounded to 1 decimal place) `Average_Score` and `Calc_Average_Score`?
|
@ -1,49 +0,0 @@
|
||||
# This code explores the Tag column of the Hotel_Reviews dataset.
|
||||
# It is not integral to the NLP aspect of the lesson, but useful for learning pandas and EDA
|
||||
# The goal was to identify what tags were worth keeping
|
||||
import pandas as pd
|
||||
|
||||
# Load the filtered hotel reviews from CSV
df = pd.read_csv('Hotel_Reviews_Filtered.csv')

# We want to find the most useful tags to keep
# The Tags column holds a stringified list like "[' Leisure trip ', ' Couple ']"
# Remove opening and closing brackets
df.Tags = df.Tags.str.strip("[']")
# remove all quotes too, collapsing the "', '" separators down to commas
df.Tags = df.Tags.str.replace(" ', '", ",", regex=False)
|
||||
|
||||
# removing this to take advantage of the 'already a phrase' fact of the dataset
# Now split the strings into a list, one column per tag position
tag_list_df = df.Tags.str.split(',', expand=True)

# Remove leading and trailing spaces from each tag column.
# Guard the column lookup: split(expand=True) only produces as many columns
# as the longest tag list, so tag_list_df[i] would raise a KeyError if no
# review has i+1 tags. Missing positions become an all-None column so the
# Tag_1..Tag_6 columns always exist for the melt step below.
for i in range(6):
    if i in tag_list_df.columns:
        df["Tag_" + str(i + 1)] = tag_list_df[i].str.strip()
    else:
        df["Tag_" + str(i + 1)] = None
|
||||
|
||||
# Merge the 6 columns into one with melt
df_tags = df.melt(value_vars=["Tag_1", "Tag_2", "Tag_3", "Tag_4", "Tag_5", "Tag_6"])

# Get the value counts
tag_vc = df_tags["value"].value_counts()
# print(tag_vc)
print("The shape of the tags with no filtering:", str(df_tags.shape))

# Drop rooms, suites, and length of stay, mobile device and anything with less count than a 1000
room_pattern = "Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double"
df_tags = df_tags[~df_tags["value"].str.contains(room_pattern, na=False, case=False)]
tag_vc = df_tags["value"].value_counts().reset_index(name="count").query("count > 1000")
# Print the top 10 (there should only be 9 and we'll use these in the filtering section)
print(tag_vc[:10])
|
||||
|
||||
# index count
|
||||
# 0 Leisure trip 417778
|
||||
# 1 Couple 252294
|
||||
# 2 Solo traveler 108545
|
||||
# 3 Business trip 82939
|
||||
# 4 Group 65392
|
||||
# 5 Family with young children 61015
|
||||
# 6 Family with older children 26349
|
||||
# 7 Travelers with friends 2143
|
||||
# 8 With a pet 1405
|
@ -0,0 +1,161 @@
|
||||
{
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": 3
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2,
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# EDA\n",
|
||||
"import pandas as pd\n",
|
||||
"import time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_difference_review_avg(row):\n",
|
||||
" return row[\"Average_Score\"] - row[\"Calc_Average_Score\"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the hotel reviews from CSV\n",
|
||||
"print(\"Loading data file now, this could take a while depending on file size\")\n",
|
||||
"start = time.time()\n",
|
||||
"df = pd.read_csv('../../data/Hotel_Reviews.csv')\n",
|
||||
"end = time.time()\n",
|
||||
"print(\"Loading took \" + str(round(end - start, 2)) + \" seconds\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# What shape is the data (rows, columns)?\n",
|
||||
"print(\"The shape of the data (rows, cols) is \" + str(df.shape))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# value_counts() creates a Series object that has index and values\n",
|
||||
"# in this case, the country and the frequency they occur in reviewer nationality\n",
|
||||
"nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# What reviewer nationality is the most common in the dataset?\n",
|
||||
"print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# What is the top 10 most common nationalities and their frequencies?\n",
|
||||
"print(\"The top 10 highest frequency reviewer nationalities are:\")\n",
|
||||
"print(nationality_freq[0:10].to_string())\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# How many unique nationalities are there?\n",
|
||||
"print(\"There are \" + str(nationality_freq.index.size) + \" unique nationalities in the dataset\")\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# What was the most frequently reviewed hotel for the top 10 nationalities - print the hotel and number of reviews\n",
|
||||
"for nat in nationality_freq[:10].index:\n",
|
||||
" # First, extract all the rows that match the criteria into a new dataframe\n",
|
||||
" nat_df = df[df[\"Reviewer_Nationality\"] == nat] \n",
|
||||
" # Now get the hotel freq\n",
|
||||
" freq = nat_df[\"Hotel_Name\"].value_counts()\n",
|
||||
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\") \n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# How many reviews are there per hotel (frequency count of hotel) and do the results match the value in `Total_Number_of_Reviews`?\n",
|
||||
"# First create a new dataframe based on the old one, removing the uneeded columns\n",
|
||||
"hotel_freq_df = df.drop([\"Hotel_Address\", \"Additional_Number_of_Scoring\", \"Review_Date\", \"Average_Score\", \"Reviewer_Nationality\", \"Negative_Review\", \"Review_Total_Negative_Word_Counts\", \"Positive_Review\", \"Review_Total_Positive_Word_Counts\", \"Total_Number_of_Reviews_Reviewer_Has_Given\", \"Reviewer_Score\", \"Tags\", \"days_since_review\", \"lat\", \"lng\"], axis = 1)\n",
|
||||
"# Group the rows by Hotel_Name, count them and put the result in a new column Total_Reviews_Found\n",
|
||||
"hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')\n",
|
||||
"# Get rid of all the duplicated rows\n",
|
||||
"hotel_freq_df = hotel_freq_df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
|
||||
"print()\n",
|
||||
"print(hotel_freq_df.to_string())\n",
|
||||
"print(str(hotel_freq_df.shape))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# While there is an `Average_Score` for each hotel according to the dataset, \n",
|
||||
"# you can also calculate an average score (getting the average of all reviewer scores in the dataset for each hotel)\n",
|
||||
"# Add a new column to your dataframe with the column header `Calc_Average_Score` that contains that calculated average. \n",
|
||||
"df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n",
|
||||
"# Add a new column with the difference between the two average scores\n",
|
||||
"df[\"Average_Score_Difference\"] = df.apply(get_difference_review_avg, axis = 1)\n",
|
||||
"# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n",
|
||||
"review_scores_df = df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
|
||||
"# Sort the dataframe to find the lowest and highest average score difference\n",
|
||||
"review_scores_df = review_scores_df.sort_values(by=[\"Average_Score_Difference\"])\n",
|
||||
"print(review_scores_df[[\"Average_Score_Difference\", \"Average_Score\", \"Calc_Average_Score\", \"Hotel_Name\"]])\n",
|
||||
"# Do any hotels have the same (rounded to 1 decimal place) `Average_Score` and `Calc_Average_Score`?\n"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
@ -0,0 +1,97 @@
|
||||
{
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": 3
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2,
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Load the filtered hotel reviews from CSV\n",
"df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# We want to find the most useful tags to keep\n",
|
||||
"# Remove opening and closing brackets\n",
|
||||
"df.Tags = df.Tags.str.strip(\"[']\")\n",
|
||||
"# remove all quotes too\n",
|
||||
"df.Tags = df.Tags.str.replace(\" ', '\", \",\", regex = False)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# removing this to take advantage of the 'already a phrase' fact of the dataset \n",
|
||||
"# Now split the strings into a list\n",
|
||||
"tag_list_df = df.Tags.str.split(',', expand = True)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Remove leading and trailing spaces\n",
|
||||
"df[\"Tag_1\"] = tag_list_df[0].str.strip()\n",
|
||||
"df[\"Tag_2\"] = tag_list_df[1].str.strip()\n",
|
||||
"df[\"Tag_3\"] = tag_list_df[2].str.strip()\n",
|
||||
"df[\"Tag_4\"] = tag_list_df[3].str.strip()\n",
|
||||
"df[\"Tag_5\"] = tag_list_df[4].str.strip()\n",
|
||||
"df[\"Tag_6\"] = tag_list_df[5].str.strip()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Merge the 6 columns into one with melt\n",
|
||||
"df_tags = df.melt(value_vars=[\"Tag_1\", \"Tag_2\", \"Tag_3\", \"Tag_4\", \"Tag_5\", \"Tag_6\"])\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Get the value counts\n",
|
||||
"tag_vc = df_tags.value.value_counts()\n",
|
||||
"# print(tag_vc)\n",
|
||||
"print(\"The shape of the tags with no filtering:\", str(df_tags.shape))\n",
|
||||
"# Drop rooms, suites, and length of stay, mobile device and anything with less count than a 1000\n",
|
||||
"df_tags = df_tags[~df_tags.value.str.contains(\"Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double\", na=False, case=False)]\n",
|
||||
"tag_vc = df_tags.value.value_counts().reset_index(name=\"count\").query(\"count > 1000\")\n",
|
||||
"# Print the top 10 (there should only be 9 and we'll use these in the filtering section)\n",
|
||||
"print(tag_vc[:10])"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
Loading…
Reference in new issue