You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
ML-For-Beginners/translations/hr/6-NLP/4-Hotel-Reviews-1/solution/notebook.ipynb

174 lines
6.7 KiB

{
"metadata": {
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": 3
},
"orig_nbformat": 4,
"coopTranslator": {
"original_hash": "2d05e7db439376aa824f4b387f8324ca",
"translation_date": "2025-09-04T09:29:35+00:00",
"source_file": "6-NLP/4-Hotel-Reviews-1/solution/notebook.ipynb",
"language_code": "hr"
}
},
"nbformat": 4,
"nbformat_minor": 2,
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# EDA\n",
"import pandas as pd\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_difference_review_avg(row):\n",
" return row[\"Average_Score\"] - row[\"Calc_Average_Score\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the hotel reviews from CSV\n",
"print(\"Loading data file now, this could take a while depending on file size\")\n",
"start = time.time()\n",
"df = pd.read_csv('../../data/Hotel_Reviews.csv')\n",
"end = time.time()\n",
"print(\"Loading took \" + str(round(end - start, 2)) + \" seconds\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# What shape is the data (rows, columns)?\n",
"print(\"The shape of the data (rows, cols) is \" + str(df.shape))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# value_counts() creates a Series object that has index and values\n",
"# in this case, the country and the frequency they occur in reviewer nationality\n",
"nationality_freq = df[\"Reviewer_Nationality\"].value_counts()\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# What reviewer nationality is the most common in the dataset?\n",
"print(\"The highest frequency reviewer nationality is \" + str(nationality_freq.index[0]).strip() + \" with \" + str(nationality_freq[0]) + \" reviews.\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# What is the top 10 most common nationalities and their frequencies?\n",
"print(\"The top 10 highest frequency reviewer nationalities are:\")\n",
"print(nationality_freq[0:10].to_string())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# How many unique nationalities are there?\n",
"print(\"There are \" + str(nationality_freq.index.size) + \" unique nationalities in the dataset\")\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# What was the most frequently reviewed hotel for the top 10 nationalities - print the hotel and number of reviews\n",
"for nat in nationality_freq[:10].index:\n",
" # First, extract all the rows that match the criteria into a new dataframe\n",
" nat_df = df[df[\"Reviewer_Nationality\"] == nat] \n",
" # Now get the hotel freq\n",
" freq = nat_df[\"Hotel_Name\"].value_counts()\n",
" print(\"The most reviewed hotel for \" + str(nat).strip() + \" was \" + str(freq.index[0]) + \" with \" + str(freq[0]) + \" reviews.\") \n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# How many reviews are there per hotel (frequency count of hotel) and do the results match the value in `Total_Number_of_Reviews`?\n",
"# First create a new dataframe based on the old one, removing the uneeded columns\n",
"hotel_freq_df = df.drop([\"Hotel_Address\", \"Additional_Number_of_Scoring\", \"Review_Date\", \"Average_Score\", \"Reviewer_Nationality\", \"Negative_Review\", \"Review_Total_Negative_Word_Counts\", \"Positive_Review\", \"Review_Total_Positive_Word_Counts\", \"Total_Number_of_Reviews_Reviewer_Has_Given\", \"Reviewer_Score\", \"Tags\", \"days_since_review\", \"lat\", \"lng\"], axis = 1)\n",
"# Group the rows by Hotel_Name, count them and put the result in a new column Total_Reviews_Found\n",
"hotel_freq_df['Total_Reviews_Found'] = hotel_freq_df.groupby('Hotel_Name').transform('count')\n",
"# Get rid of all the duplicated rows\n",
"hotel_freq_df = hotel_freq_df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
"print()\n",
"print(hotel_freq_df.to_string())\n",
"print(str(hotel_freq_df.shape))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# While there is an `Average_Score` for each hotel according to the dataset, \n",
"# you can also calculate an average score (getting the average of all reviewer scores in the dataset for each hotel)\n",
"# Add a new column to your dataframe with the column header `Calc_Average_Score` that contains that calculated average. \n",
"df['Calc_Average_Score'] = round(df.groupby('Hotel_Name').Reviewer_Score.transform('mean'), 1)\n",
"# Add a new column with the difference between the two average scores\n",
"df[\"Average_Score_Difference\"] = df.apply(get_difference_review_avg, axis = 1)\n",
"# Create a df without all the duplicates of Hotel_Name (so only 1 row per hotel)\n",
"review_scores_df = df.drop_duplicates(subset = [\"Hotel_Name\"])\n",
"# Sort the dataframe to find the lowest and highest average score difference\n",
"review_scores_df = review_scores_df.sort_values(by=[\"Average_Score_Difference\"])\n",
"print(review_scores_df[[\"Average_Score_Difference\", \"Average_Score\", \"Calc_Average_Score\", \"Hotel_Name\"]])\n",
"# Do any hotels have the same (rounded to 1 decimal place) `Average_Score` and `Calc_Average_Score`?\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n---\n\n**Odricanje od odgovornosti**: \nOvaj dokument je preveden korištenjem AI usluge za prevođenje [Co-op Translator](https://github.com/Azure/co-op-translator). Iako nastojimo osigurati točnost, imajte na umu da automatski prijevodi mogu sadržavati pogreške ili netočnosti. Izvorni dokument na izvornom jeziku treba smatrati mjerodavnim izvorom. Za ključne informacije preporučuje se profesionalni prijevod od strane ljudskog prevoditelja. Ne preuzimamo odgovornost za bilo kakva nesporazuma ili pogrešna tumačenja koja proizlaze iz korištenja ovog prijevoda.\n"
]
}
]
}