{ "metadata": { "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": 3 }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2, "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# EDA\n", "import pandas as pd\n", "import time" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def get_difference_review_avg(row):\n", " return row[\"Average_Score\"] - row[\"Calc_Average_Score\"]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Load the hotel reviews from CSV\n", "print(\"Loading data file now, this could take a while depending on file size\")\n", "start = time.time()\n", "df = pd.read_csv('../../data/Hotel_Reviews.csv')\n", "end = time.time()\n", "print(\"Loading took \" + str(round(end - start, 2)) + \" seconds\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What shape is the data (rows, columns)?\n", "print(\"The shape of the data (rows, cols) is \" + str(df.shape))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# value_counts() creates a Series object that has index and values\n", "# in this case, the country and the frequency they occur in reviewer nationality\n", "nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What reviewer nationality is the most common in the dataset?\n", "print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What is the top 10 most common nationalities and their frequencies?\n", "print(\"The top 10 highest frequency reviewer nationalities are:\")\n", "print(nationality_freq[0:10].to_string())\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# How many unique nationalities are there?\n", "print(\"There are \" + str(nationality_freq.index.size) + \" unique nationalities in the dataset\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# What was the most frequently reviewed hotel for the top 10 nationalities - print the hotel and number of reviews\n", "for nat in nationality_freq[:10].index:\n", " # First, extract all the rows that match the criteria into a new dataframe\n", " nat_df = df[df[\"Reviewer_Nationality\"] == nat] \n", " # Now get the hotel freq\n", " freq = nat_df[\"Hotel_Name\"].value_counts()\n", " print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\") \n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# How many reviews are there per hotel (frequency count of hotel) and do the results match the value in `Total_Number_of_Reviews`?\n", "# First create a new dataframe based on the old one, removing the uneeded columns\n", "hotel_freq_df = df.drop([\"Hotel_Address\", \"Additional_Number_of_Scoring\", \"Review_Date\", \"Average_Score\", \"Reviewer_Nationality\", \"Negative_Review\", \"Review_Total_Negative_Word_Counts\", \"Positive_Review\", \"Review_Total_Positive_Word_Counts\", \"Total_Number_of_Reviews_Reviewer_Has_Given\", \"Reviewer_Score\", \"Tags\", \"days_since_review\", \"lat\", \"lng\"], axis = 1)\n", "# Group the rows by Hotel_Name, count them and put the result in a new column Total_Reviews_Found\n", "hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')\n", "# Get rid of all the duplicated rows\n", "hotel_freq_df = hotel_freq_df.drop_duplicates(subset = [\"Hotel_Name\"])\n", "print()\n", "print(hotel_freq_df.to_string())\n", "print(str(hotel_freq_df.shape))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# While there is an `Average_Score` for each hotel according to the dataset, \n", "# you can also calculate an average score (getting the average of all reviewer scores in the dataset for each hotel)\n", "# Add a new column to your dataframe with the column header `Calc_Average_Score` that contains that calculated average. \n", "df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n", "# Add a new column with the difference between the two average scores\n", "df[\"Average_Score_Difference\"] = df.apply(get_difference_review_avg, axis = 1)\n", "# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n", "review_scores_df = df.drop_duplicates(subset = [\"Hotel_Name\"])\n", "# Sort the dataframe to find the lowest and highest average score difference\n", "review_scores_df = review_scores_df.sort_values(by=[\"Average_Score_Difference\"])\n", "print(review_scores_df[[\"Average_Score_Difference\", \"Average_Score\", \"Calc_Average_Score\", \"Hotel_Name\"]])\n", "# Do any hotels have the same (rounded to 1 decimal place) `Average_Score` and `Calc_Average_Score`?\n" ] } ] }