ML-For-Beginners/translations/pl/6-NLP/5-Hotel-Reviews-2/solution/2-notebook.ipynb

{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "orig_nbformat": 4,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.0 64-bit ('3.7')"
  },
  "interpreter": {
   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
  },
  "coopTranslator": {
   "original_hash": "341efc86325ec2a214f682f57a189dfd",
   "translation_date": "2025-09-03T20:58:38+00:00",
   "source_file": "6-NLP/5-Hotel-Reviews-2/solution/2-notebook.ipynb",
   "language_code": "pl"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the hotel reviews from CSV (you can )\n",
    "import pandas as pd \n",
    "\n",
    "df = pd.read_csv('../../data/Hotel_Reviews_Filtered.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We want to find the most useful tags to keep\n",
    "# Remove opening and closing brackets\n",
    "df.Tags = df.Tags.str.strip(\"[']\")\n",
    "# remove all quotes too\n",
    "df.Tags = df.Tags.str.replace(\" ', '\", \",\", regex = False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# removing this to take advantage of the 'already a phrase' fact of the dataset \n",
    "# Now split the strings into a list\n",
    "tag_list_df = df.Tags.str.split(',', expand = True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove leading and trailing spaces\n",
    "df[\"Tag_1\"] = tag_list_df[0].str.strip()\n",
    "df[\"Tag_2\"] = tag_list_df[1].str.strip()\n",
    "df[\"Tag_3\"] = tag_list_df[2].str.strip()\n",
    "df[\"Tag_4\"] = tag_list_df[3].str.strip()\n",
    "df[\"Tag_5\"] = tag_list_df[4].str.strip()\n",
    "df[\"Tag_6\"] = tag_list_df[5].str.strip()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the 6 columns into one with melt\n",
    "df_tags = df.melt(value_vars=[\"Tag_1\", \"Tag_2\", \"Tag_3\", \"Tag_4\", \"Tag_5\", \"Tag_6\"])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "The shape of the tags with no filtering: (2514684, 2)\n",
      "                        index   count\n",
      "0                Leisure trip  338423\n",
      "1                      Couple  205305\n",
      "2               Solo traveler   89779\n",
      "3               Business trip   68176\n",
      "4                       Group   51593\n",
      "5  Family with young children   49318\n",
      "6  Family with older children   21509\n",
      "7      Travelers with friends    1610\n",
      "8                  With a pet    1078\n"
     ]
    }
   ],
   "source": [
    "# Get the value counts\n",
    "tag_vc = df_tags.value.value_counts()\n",
    "# print(tag_vc)\n",
    "print(\"The shape of the tags with no filtering:\", str(df_tags.shape))\n",
    "# Drop rooms, suites, and length of stay, mobile device and anything with less count than a 1000\n",
    "df_tags = df_tags[~df_tags.value.str.contains(\"Standard|room|Stayed|device|Beds|Suite|Studio|King|Superior|Double\", na=False, case=False)]\n",
    "tag_vc = df_tags.value.value_counts().reset_index(name=\"count\").query(\"count > 1000\")\n",
    "# Print the top 10 (there should only be 9 and we'll use these in the filtering section)\n",
    "print(tag_vc[:10])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n---\n\n**Zastrzeżenie**:  \nTen dokument został przetłumaczony za pomocą usługi tłumaczenia AI [Co-op Translator](https://github.com/Azure/co-op-translator). Chociaż dokładamy wszelkich starań, aby zapewnić poprawność tłumaczenia, prosimy pamiętać, że automatyczne tłumaczenia mogą zawierać błędy lub nieścisłości. Oryginalny dokument w jego rodzimym języku powinien być uznawany za autorytatywne źródło. W przypadku informacji o kluczowym znaczeniu zaleca się skorzystanie z profesjonalnego tłumaczenia przez człowieka. Nie ponosimy odpowiedzialności za jakiekolwiek nieporozumienia lub błędne interpretacje wynikające z użycia tego tłumaczenia.\n"
   ]
  }
 ]
}