From 3772821f6d486684bcbf04538e82cdf1f90c202d Mon Sep 17 00:00:00 2001 From: Jen Looper Date: Thu, 24 Jun 2021 21:23:11 -0400 Subject: [PATCH] NLP lesson 4 - convert to notebook --- .../Hotel_Reviews_Explorer.py | 62 ------- 6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Tags.py | 49 ------ 6-NLP/4-Hotel-Reviews-1/README.md | 2 +- .../solution/notebook-explorer.ipynb | 161 ++++++++++++++++++ .../solution/notebook-tags.ipynb | 97 +++++++++++ .../4-Hotel-Reviews-1/solution/notebook.ipynb | 0 6 files changed, 259 insertions(+), 112 deletions(-) delete mode 100644 6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Explorer.py delete mode 100644 6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Tags.py create mode 100644 6-NLP/4-Hotel-Reviews-1/solution/notebook-explorer.ipynb create mode 100644 6-NLP/4-Hotel-Reviews-1/solution/notebook-tags.ipynb delete mode 100644 6-NLP/4-Hotel-Reviews-1/solution/notebook.ipynb diff --git a/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Explorer.py b/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Explorer.py deleted file mode 100644 index b377bcdb8..000000000 --- a/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Explorer.py +++ /dev/null @@ -1,62 +0,0 @@ -# EDA -import pandas as pd -import time - -def get_difference_review_avg(row): - return row["Average_Score"] - row["Calc_Average_Score"] - -# Load the hotel reviews from CSV -print("Loading data file now, this could take a while depending on file size") -start = time.time() -df = pd.read_csv('Hotel_Reviews.csv') -end = time.time() -print("Loading took " + str(round(end - start, 2)) + " seconds") - -# What shape is the data (rows, columns)? -print("The shape of the data (rows, cols) is " + str(df.shape)) - -# value_counts() creates a Series object that has index and values -# in this case, the country and the frequency they occur in reviewer nationality -nationality_freq = df["Reviewer_Nationality"].value_counts() - -# What reviewer nationality is the most common in the dataset? 
-print("The highest frequency reviewer nationality is " + str(nationality_freq.index[0]).strip() + " with " + str(nationality_freq[0]) + " reviews.") - -# What is the top 10 most common nationalities and their frequencies? -print("The top 10 highest frequency reviewer nationalities are:") -print(nationality_freq[0:10].to_string()) - -# How many unique nationalities are there? -print("There are " + str(nationality_freq.index.size) + " unique nationalities in the dataset") - -# What was the most frequently reviewed hotel for the top 10 nationalities - print the hotel and number of reviews -for nat in nationality_freq[:10].index: - # First, extract all the rows that match the criteria into a new dataframe - nat_df = df[df["Reviewer_Nationality"] == nat] - # Now get the hotel freq - freq = nat_df["Hotel_Name"].value_counts() - print("The most reviewed hotel for " + str(nat).strip() + " was " + str(freq.index[0]) + " with " + str(freq[0]) + " reviews.") - -# How many reviews are there per hotel (frequency count of hotel) and do the results match the value in `Total_Number_of_Reviews`? 
-# First create a new dataframe based on the old one, removing the uneeded columns -hotel_freq_df = df.drop(["Hotel_Address", "Additional_Number_of_Scoring", "Review_Date", "Average_Score", "Reviewer_Nationality", "Negative_Review", "Review_Total_Negative_Word_Counts", "Positive_Review", "Review_Total_Positive_Word_Counts", "Total_Number_of_Reviews_Reviewer_Has_Given", "Reviewer_Score", "Tags", "days_since_review", "lat", "lng"], axis = 1) -# Group the rows by Hotel_Name, count them and put the result in a new column Total_Reviews_Found -hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count') -# Get rid of all the duplicated rows -hotel_freq_df = hotel_freq_df.drop_duplicates(subset = ["Hotel_Name"]) -print() -print(hotel_freq_df.to_string()) -print(str(hotel_freq_df.shape)) - -# While there is an `Average_Score` for each hotel according to the dataset, -# you can also calculate an average score (getting the average of all reviewer scores in the dataset for each hotel) -# Add a new column to your dataframe with the column header `Calc_Average_Score` that contains that calculated average. -df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1) -# Add a new column with the difference between the two average scores -df["Average_Score_Difference"] = df.apply(get_difference_review_avg, axis = 1) -# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel) -review_scores_df = df.drop_duplicates(subset = ["Hotel_Name"]) -# Sort the dataframe to find the lowest and highest average score difference -review_scores_df = review_scores_df.sort_values(by=["Average_Score_Difference"]) -print(review_scores_df[["Average_Score_Difference", "Average_Score", "Calc_Average_Score", "Hotel_Name"]]) -# Do any hotels have the same (rounded to 1 decimal place) `Average_Score` and `Calc_Average_Score`? 
diff --git a/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Tags.py b/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Tags.py deleted file mode 100644 index 26049c864..000000000 --- a/6-NLP/4-Hotel-Reviews-1/Hotel_Reviews_Tags.py +++ /dev/null @@ -1,49 +0,0 @@ -# This code explores the Tag column of the Hotel_Reviews dataset. -# It is not integral to the NLP aspect of the lesson, but useful for learning pandas and EDA -# The goal was to identify what tags were worth keeping -import pandas as pd - -# Load the hotel reviews from CSV (you can ) -df = pd.read_csv('Hotel_Reviews_Filtered.csv') - -# We want to find the most useful tags to keep -# Remove opening and closing brackets -df.Tags = df.Tags.str.strip("[']") -# remove all quotes too -df.Tags = df.Tags.str.replace(" ', '", ",", regex = False) - -# removing this to take advantage of the 'already a phrase' fact of the dataset -# Now split the strings into a list -tag_list_df = df.Tags.str.split(',', expand = True) - -# Remove leading and trailing spaces -df["Tag_1"] = tag_list_df[0].str.strip() -df["Tag_2"] = tag_list_df[1].str.strip() -df["Tag_3"] = tag_list_df[2].str.strip() -df["Tag_4"] = tag_list_df[3].str.strip() -df["Tag_5"] = tag_list_df[4].str.strip() -df["Tag_6"] = tag_list_df[5].str.strip() - -# Merge the 6 columns into one with melt -df_tags = df.melt(value_vars=["Tag_1", "Tag_2", "Tag_3", "Tag_4", "Tag_5", "Tag_6"]) - -# Get the value counts -tag_vc = df_tags.value.value_counts() -# print(tag_vc) -print("The shape of the tags with no filtering:", str(df_tags.shape)) -# Drop rooms, suites, and length of stay, mobile device and anything with less count than a 1000 -df_tags = df_tags[~df_tags.value.str.contains("Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double", na=False, case=False)] -tag_vc = df_tags.value.value_counts().reset_index(name="count").query("count > 1000") -# Print the top 10 (there should only be 9 and we'll use these in the filtering section) -print(tag_vc[:10]) - -# index count -# 0 Leisure 
trip 417778 -# 1 Couple 252294 -# 2 Solo traveler 108545 -# 3 Business trip 82939 -# 4 Group 65392 -# 5 Family with young children 61015 -# 6 Family with older children 26349 -# 7 Travelers with friends 2143 -# 8 With a pet 1405 diff --git a/6-NLP/4-Hotel-Reviews-1/README.md b/6-NLP/4-Hotel-Reviews-1/README.md index 7876ce93d..c6247a176 100644 --- a/6-NLP/4-Hotel-Reviews-1/README.md +++ b/6-NLP/4-Hotel-Reviews-1/README.md @@ -347,7 +347,7 @@ Treat the following questions as coding tasks and attempt to answer them without both_no_reviews = df.apply(lambda x: True if x['Negative_Review'] == "No Negative" and x['Positive_Review'] == "No Positive" else False , axis=1) print("Number of both No Negative and No Positive reviews: " + str(len(both_no_reviews[both_no_reviews == True].index))) end = time.time() - print("Lamdas took " + str(round(end - start, 2)) + " seconds") + print("Lambdas took " + str(round(end - start, 2)) + " seconds") Number of No Negative reviews: 127890 Number of No Positive reviews: 35946 diff --git a/6-NLP/4-Hotel-Reviews-1/solution/notebook-explorer.ipynb b/6-NLP/4-Hotel-Reviews-1/solution/notebook-explorer.ipynb new file mode 100644 index 000000000..f4536e1fb --- /dev/null +++ b/6-NLP/4-Hotel-Reviews-1/solution/notebook-explorer.ipynb @@ -0,0 +1,161 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# EDA\n", + "import pandas as pd\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_difference_review_avg(row):\n", + " return row[\"Average_Score\"] - 
row[\"Calc_Average_Score\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the hotel reviews from CSV\n", + "print(\"Loading data file now, this could take a while depending on file size\")\n", + "start = time.time()\n", + "df = pd.read_csv('../../data/Hotel_Reviews.csv')\n", + "end = time.time()\n", + "print(\"Loading took \" + str(round(end - start, 2)) + \" seconds\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What shape is the data (rows, columns)?\n", + "print(\"The shape of the data (rows, cols) is \" + str(df.shape))\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# value_counts() creates a Series object that has index and values\n", + "# in this case, the country and the frequency they occur in reviewer nationality\n", + "nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What reviewer nationality is the most common in the dataset?\n", + "print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq.iloc[0]) + \" reviews.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What is the top 10 most common nationalities and their frequencies?\n", + "print(\"The top 10 highest frequency reviewer nationalities are:\")\n", + "print(nationality_freq[0:10].to_string())\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many unique nationalities are there?\n", + "print(\"There are \" + str(nationality_freq.index.size) + \" unique nationalities in the dataset\")\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# What was the most frequently reviewed hotel for the top 10 nationalities - print the hotel and number of reviews\n", + "for nat in nationality_freq[:10].index:\n", + " # First, extract all the rows that match the criteria into a new dataframe\n", + " nat_df = df[df[\"Reviewer_Nationality\"] == nat] \n", + " # Now get the hotel freq\n", + " freq = nat_df[\"Hotel_Name\"].value_counts()\n", + " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq.iloc[0]) + \" reviews.\") \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# How many reviews are there per hotel (frequency count of hotel) and do the results match the value in `Total_Number_of_Reviews`?\n", + "# First create a new dataframe based on the old one, removing the unneeded columns\n", + "hotel_freq_df = df.drop([\"Hotel_Address\", \"Additional_Number_of_Scoring\", \"Review_Date\", \"Average_Score\", \"Reviewer_Nationality\", \"Negative_Review\", \"Review_Total_Negative_Word_Counts\", \"Positive_Review\", \"Review_Total_Positive_Word_Counts\", \"Total_Number_of_Reviews_Reviewer_Has_Given\", \"Reviewer_Score\", \"Tags\", \"days_since_review\", \"lat\", \"lng\"], axis = 1)\n", + "# Group the rows by Hotel_Name, count them and put the result in a new column Total_Reviews_Found\n", + "hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')\n", + "# Get rid of all the duplicated rows\n", + "hotel_freq_df = hotel_freq_df.drop_duplicates(subset = [\"Hotel_Name\"])\n", + "print()\n", + "print(hotel_freq_df.to_string())\n", + "print(str(hotel_freq_df.shape))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# While there is an `Average_Score` for each hotel according to the dataset, \n", + "# you can also calculate an 
average score (getting the average of all reviewer scores in the dataset for each hotel)\n", + "# Add a new column to your dataframe with the column header `Calc_Average_Score` that contains that calculated average. \n", + "df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n", + "# Add a new column with the difference between the two average scores\n", + "df[\"Average_Score_Difference\"] = df.apply(get_difference_review_avg, axis = 1)\n", + "# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n", + "review_scores_df = df.drop_duplicates(subset = [\"Hotel_Name\"])\n", + "# Sort the dataframe to find the lowest and highest average score difference\n", + "review_scores_df = review_scores_df.sort_values(by=[\"Average_Score_Difference\"])\n", + "print(review_scores_df[[\"Average_Score_Difference\", \"Average_Score\", \"Calc_Average_Score\", \"Hotel_Name\"]])\n", + "# Do any hotels have the same (rounded to 1 decimal place) `Average_Score` and `Calc_Average_Score`?\n" + ] + } + ] +} \ No newline at end of file diff --git a/6-NLP/4-Hotel-Reviews-1/solution/notebook-tags.ipynb b/6-NLP/4-Hotel-Reviews-1/solution/notebook-tags.ipynb new file mode 100644 index 000000000..494b79479 --- /dev/null +++ b/6-NLP/4-Hotel-Reviews-1/solution/notebook-tags.ipynb @@ -0,0 +1,98 @@ +{ + "metadata": { + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": 3 + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "# Load the filtered hotel reviews from CSV\n", + "df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, 
+ "outputs": [], + "source": [ + "# We want to find the most useful tags to keep\n", + "# Remove opening and closing brackets\n", + "df.Tags = df.Tags.str.strip(\"[']\")\n", + "# remove all quotes too\n", + "df.Tags = df.Tags.str.replace(\" ', '\", \",\", regex = False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# removing this to take advantage of the 'already a phrase' fact of the dataset \n", + "# Now split the strings into a list\n", + "tag_list_df = df.Tags.str.split(',', expand = True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove leading and trailing spaces\n", + "df[\"Tag_1\"] = tag_list_df[0].str.strip()\n", + "df[\"Tag_2\"] = tag_list_df[1].str.strip()\n", + "df[\"Tag_3\"] = tag_list_df[2].str.strip()\n", + "df[\"Tag_4\"] = tag_list_df[3].str.strip()\n", + "df[\"Tag_5\"] = tag_list_df[4].str.strip()\n", + "df[\"Tag_6\"] = tag_list_df[5].str.strip()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Merge the 6 columns into one with melt\n", + "df_tags = df.melt(value_vars=[\"Tag_1\", \"Tag_2\", \"Tag_3\", \"Tag_4\", \"Tag_5\", \"Tag_6\"])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the value counts\n", + "tag_vc = df_tags.value.value_counts()\n", + "# print(tag_vc)\n", + "print(\"The shape of the tags with no filtering:\", str(df_tags.shape))\n", + "# Drop rooms, suites, and length of stay, mobile device and anything with less count than a 1000\n", + "df_tags = df_tags[~df_tags.value.str.contains(\"Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double\", na=False, case=False)]\n", + "tag_vc = df_tags.value.value_counts().reset_index(name=\"count\").query(\"count > 1000\")\n", + "# Print the top 10 (there should only be 9 and we'll use these in 
the filtering section)\n", + "print(tag_vc[:10])" + ] + } + ] +} \ No newline at end of file diff --git a/6-NLP/4-Hotel-Reviews-1/solution/notebook.ipynb b/6-NLP/4-Hotel-Reviews-1/solution/notebook.ipynb deleted file mode 100644 index e69de29bb..000000000