nlp

1 month ago · 8fcb80b6a7
parent 7a50781922
commit 8fcb80b6a7
1 changed files with 396 additions and 0 deletions
--- a/6-NLP/5-Hotel-Reviews-2/notebook.ipynb
+++ b/6-NLP/5-Hotel-Reviews-2/notebook.ipynb
@ -0,0 +1,396 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "df76afb5",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Hotel_Address\n",
+      "London, United Kingdom    262301\n",
+      "Barcelona, Spain           60149\n",
+      "Paris, France              59928\n",
+      "Amsterdam, Netherlands     57214\n",
+      "Vienna, Austria            38939\n",
+      "Milan, Italy               37207\n",
+      "Name: count, dtype: int64\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import time\n",
+    "import ast\n",
+    "df = pd.read_csv('../data/Hotel_Reviews.csv')\n",
+    "def replace_address(row):\n",
+    "    \"\"\"Map a review row's full hotel address to a short `City, Country` label.\n",
+    "\n",
+    "    The markers are tested against row[\"Hotel_Address\"] in the order listed\n",
+    "    below and the first one found decides the label. None is returned when\n",
+    "    no marker occurs in the address.\n",
+    "    \"\"\"\n",
+    "    address = row[\"Hotel_Address\"]\n",
+    "    labels_by_marker = (\n",
+    "        (\"Netherlands\", \"Amsterdam, Netherlands\"),\n",
+    "        (\"Barcelona\", \"Barcelona, Spain\"),\n",
+    "        (\"United Kingdom\", \"London, United Kingdom\"),\n",
+    "        (\"Milan\", \"Milan, Italy\"),\n",
+    "        (\"France\", \"Paris, France\"),\n",
+    "        (\"Vienna\", \"Vienna, Austria\"),\n",
+    "    )\n",
+    "    for marker, label in labels_by_marker:\n",
+    "        if marker in address:\n",
+    "            return label\n",
+    "    return None\n",
+    "\n",
+    "# Replace all the addresses with a shortened, more useful form\n",
+    "df[\"Hotel_Address\"] = df.apply(replace_address, axis=1)\n",
+    "# replace_address returns None for an unmatched address and value_counts()\n",
+    "# leaves missing values out, so check that the counts really add up to the\n",
+    "# total number of reviews instead of only expecting it\n",
+    "city_counts = df[\"Hotel_Address\"].value_counts()\n",
+    "assert city_counts.sum() == len(df), \"some hotel addresses were not mapped to a city\"\n",
+    "print(city_counts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "e8d5be1d",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Hotel_Name</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Hotel_Address</th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Amsterdam, Netherlands</th>\n",
+       "      <td>105</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Barcelona, Spain</th>\n",
+       "      <td>211</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>London, United Kingdom</th>\n",
+       "      <td>400</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Milan, Italy</th>\n",
+       "      <td>162</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Paris, France</th>\n",
+       "      <td>458</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Vienna, Austria</th>\n",
+       "      <td>158</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                        Hotel_Name\n",
+       "Hotel_Address                     \n",
+       "Amsterdam, Netherlands         105\n",
+       "Barcelona, Spain               211\n",
+       "London, United Kingdom         400\n",
+       "Milan, Italy                   162\n",
+       "Paris, France                  458\n",
+       "Vienna, Austria                158"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "# Count the distinct hotel names within each shortened address\n",
+    "hotels_per_city = df.groupby(\"Hotel_Address\")[\"Hotel_Name\"].nunique().to_frame()\n",
+    "display(hotels_per_city)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "3589cc50",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import time\n",
+    "import pandas as pd\n",
+    "start = time.time()\n",
+    "df = pd.read_csv('../data/Hotel_Reviews.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "00950a13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop the lat and lng columns\n",
+    "df = df.drop(columns=[\"lat\", \"lng\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "9fb154fe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Shorten every address by running replace_address once per row\n",
+    "df[\"Hotel_Address\"] = df.apply(replace_address, axis=\"columns\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "b3119eab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop Additional_Number_of_Scoring, then recompute the per-hotel review count\n",
+    "# and the per-hotel mean reviewer score (rounded to one decimal) from the rows\n",
+    "df = df.drop(columns=[\"Additional_Number_of_Scoring\"])\n",
+    "df[\"Total_Number_of_Reviews\"] = df.groupby(\"Hotel_Name\")[\"Total_Number_of_Reviews\"].transform(\"count\")\n",
+    "mean_reviewer_score = df.groupby(\"Hotel_Name\")[\"Reviewer_Score\"].transform(\"mean\")\n",
+    "df[\"Average_Score\"] = mean_reviewer_score.round(1)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "5316be2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Process the Tags into new indicator columns (1 when the tag text occurs, else 0).\n",
+    "# The file Hotel_Reviews_Tags.py identifies the most important tags:\n",
+    "# Leisure trip, Couple, Solo traveler, Business trip, Group combined with Travelers with friends,\n",
+    "# Family with young children, Family with older children, With a pet\n",
+    "tag_columns = {\n",
+    "    \"Leisure_trip\": (\"Leisure trip\",),\n",
+    "    \"Couple\": (\"Couple\",),\n",
+    "    \"Solo_traveler\": (\"Solo traveler\",),\n",
+    "    \"Business_trip\": (\"Business trip\",),\n",
+    "    \"Group\": (\"Group\", \"Travelers with friends\"),\n",
+    "    \"Family_with_young_children\": (\"Family with young children\",),\n",
+    "    \"Family_with_older_children\": (\"Family with older children\",),\n",
+    "    \"With_a_pet\": (\"With a pet\",),\n",
+    "}\n",
+    "for column_name, tag_texts in tag_columns.items():\n",
+    "    # Bind tag_texts as a default so each lambda keeps its own tuple\n",
+    "    df[column_name] = df.Tags.apply(\n",
+    "        lambda tags, wanted=tag_texts: 1 if any(text in tags for text in wanted) else 0\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "3c52267c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package vader_lexicon to\n",
+      "[nltk_data]     C:\\Users\\user\\AppData\\Roaming\\nltk_data...\n",
+      "[nltk_data]   Package vader_lexicon is already up-to-date!\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saving results to Hotel_Reviews_NLP.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "import time \n",
+    "import pandas as pd\n",
+    "import nltk as nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
+    "nltk.download('vader_lexicon')\n",
+    "df = pd.read_csv('../data/Hotel_Reviews_Filtered.csv')\n",
+    "sia = SentimentIntensityAnalyzer()\n",
+    "stop_words = stopwords.words('english')\n",
+    "def clean_text(text):\n",
+    "    \"\"\"Lower-case `text`, tokenize it and keep only alphabetic non-stop-word tokens.\n",
+    "\n",
+    "    Returns an empty string when `text` is null; otherwise the kept tokens\n",
+    "    joined by single spaces.\n",
+    "    \"\"\"\n",
+    "    if pd.isnull(text):\n",
+    "        return \"\"\n",
+    "    kept_tokens = []\n",
+    "    for token in nltk.word_tokenize(text.lower()):\n",
+    "        if token.isalpha() and token not in stop_words:\n",
+    "            kept_tokens.append(token)\n",
+    "    return \" \".join(kept_tokens)\n",
+    "# Apply sentiment analysis using VADER (on raw text here)\n",
+    "def _compound_score(review):\n",
+    "    \"\"\"Return VADER's compound polarity score for `review`, coerced to str.\"\"\"\n",
+    "    return sia.polarity_scores(str(review))['compound']\n",
+    "\n",
+    "for review_column in ('Positive_Review', 'Negative_Review'):\n",
+    "    df[review_column + '_Sentiment'] = df[review_column].apply(_compound_score)\n",
+    "\n",
+    "# Optional: average the positive and negative scores into one overall score\n",
+    "sentiment_columns = ['Positive_Review_Sentiment', 'Negative_Review_Sentiment']\n",
+    "df['Overall_Sentiment'] = df[sentiment_columns].mean(axis=1)\n",
+    "\n",
+    "# Write the dataframe, including the new sentiment columns, to disk\n",
+    "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
+    "df.to_csv('../data/Hotel_Reviews_NLP.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "736c9d54",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.corpus import stopwords\n",
+    "df = pd.read_csv(\"../data/Hotel_Reviews_Filtered.csv\")\n",
+    "start = time.time()\n",
+    "cache = set(stopwords.words('english'))\n",
+    "def remove_stopwords(review, stop_words=None):\n",
+    "    \"\"\"Return `review` with its stop words removed.\n",
+    "\n",
+    "    Args:\n",
+    "        review: Review text; it is split on whitespace.\n",
+    "        stop_words: Optional collection of words to drop. When omitted, the\n",
+    "            module-level `cache` is used.\n",
+    "\n",
+    "    Returns:\n",
+    "        The remaining words joined by single spaces.\n",
+    "    \"\"\"\n",
+    "    if stop_words is None:\n",
+    "        stop_words = cache\n",
+    "    # Join with a space: joining with \"\" glued the surviving words into one token\n",
+    "    return \" \".join(word for word in review.split() if word not in stop_words)\n",
+    "# Strip stop words from both review text columns, negative first\n",
+    "for review_column in (\"Negative_Review\", \"Positive_Review\"):\n",
+    "    df[review_column] = df[review_column].apply(remove_stopwords)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "aa8329d7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
+    "vader_sentiment = SentimentIntensityAnalyzer()\n",
+    "def calc_sentiment(review):\n",
+    "    \"\"\"Return the VADER compound score for `review`.\n",
+    "\n",
+    "    The placeholder texts \"No Negative\" and \"No Positive\" are not scored\n",
+    "    and yield 0.\n",
+    "    \"\"\"\n",
+    "    placeholders = (\"No Negative\", \"No Positive\")\n",
+    "    if review in placeholders:\n",
+    "        return 0\n",
+    "    scores = vader_sentiment.polarity_scores(review)\n",
+    "    return scores[\"compound\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "944cabbc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Calculating sentiment columns for both positive and negative reviews\n",
+      "Calculationg sentiment took27.21 Seconds\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
+    "start = time.time()\n",
+    "df[\"Negative_Sentiment\"] = df.Negative_Review.apply(calc_sentiment)\n",
+    "df[\"Positive_Sentiment\"] = df.Positive_Review.apply(calc_sentiment)\n",
+    "end = time.time()\n",
+    "# Report the elapsed time in seconds, rounded to two decimals\n",
+    "print(\"Calculating sentiment took \" + str(round(end - start, 2)) + \" Seconds\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "1b530472",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "       Negative_Review  Negative_Sentiment\n",
+      "349307        Cheating             -0.5574\n",
+      "52956         Horrible             -0.5423\n",
+      "114472             bad             -0.5423\n",
+      "478167             Bad             -0.5423\n",
+      "147464             bad             -0.5423\n",
+      "...                ...                 ...\n",
+      "317800           Great              0.6249\n",
+      "397663         Awesome              0.6249\n",
+      "482922           great              0.6249\n",
+      "478966           Great              0.6249\n",
+      "119192           great              0.6249\n",
+      "\n",
+      "[515738 rows x 2 columns]\n",
+      "       Positive_Review  Positive_Sentiment\n",
+      "235836        disaster             -0.6249\n",
+      "501482             Bad             -0.5423\n",
+      "409738             bad             -0.5423\n",
+      "427819             bad             -0.5423\n",
+      "209542             bad             -0.5423\n",
+      "...                ...                 ...\n",
+      "429765            Love              0.6369\n",
+      "36292             BEST              0.6369\n",
+      "326446            best              0.6369\n",
+      "362320            Love              0.6369\n",
+      "232187            Love              0.6369\n",
+      "\n",
+      "[515738 rows x 2 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Sort by each sentiment score in turn (ascending) and print the review text beside it\n",
+    "column_pairs = (\n",
+    "    (\"Negative_Review\", \"Negative_Sentiment\"),\n",
+    "    (\"Positive_Review\", \"Positive_Sentiment\"),\n",
+    ")\n",
+    "for text_column, score_column in column_pairs:\n",
+    "    df = df.sort_values(by=score_column)\n",
+    "    print(df[[text_column, score_column]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "62c07395",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saving results to Hotel_Reviews_NLP.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Select and order the output columns, then write the result to CSV\n",
+    "output_columns = [\n",
+    "    \"Hotel_Name\",\n",
+    "    \"Hotel_Address\",\n",
+    "    \"Total_Number_of_Reviews\",\n",
+    "    \"Average_Score\",\n",
+    "    \"Reviewer_Score\",\n",
+    "    \"Negative_Sentiment\",\n",
+    "    \"Positive_Sentiment\",\n",
+    "    \"Reviewer_Nationality\",\n",
+    "    \"Leisure_trip\",\n",
+    "    \"Couple\",\n",
+    "    \"Solo_traveler\",\n",
+    "    \"Business_trip\",\n",
+    "    \"Group\",\n",
+    "    \"Family_with_young_children\",\n",
+    "    \"Family_with_older_children\",\n",
+    "    \"With_a_pet\",\n",
+    "    \"Negative_Review\",\n",
+    "    \"Positive_Review\",\n",
+    "]\n",
+    "df = df.reindex(columns=output_columns)\n",
+    "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
+    "df.to_csv(r\"../data/Hotel_Reviews_NLP.csv\", index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}