ML-For-Beginners/6-NLP/5-Hotel-Reviews-2/solution/3-notebook.ipynb

{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  },
  "orig_nbformat": 4,
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.7.0 64-bit ('3.7')"
  },
  "interpreter": {
   "hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stderr",
     "text": [
      "[nltk_data] Downloading package vader_lexicon to\n[nltk_data]     /Users/jenlooper/nltk_data...\n"
     ]
    },
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "metadata": {},
     "execution_count": 9
    }
   ],
   "source": [
    "import time\n",
    "import pandas as pd\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "\n",
    "# Fetch every NLTK resource this notebook relies on, so a fresh kernel can run all cells:\n",
    "# - 'stopwords' backs stopwords.words(\"english\"), used for the stop word removal\n",
    "# - 'vader_lexicon' backs SentimentIntensityAnalyzer\n",
    "# The stop words download is silenced and the lexicon download stays last,\n",
    "# so the value displayed by this cell is still the result of the lexicon download.\n",
    "nltk.download('stopwords', quiet=True)\n",
    "nltk.download('vader_lexicon')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One VADER analyzer instance, reused by calc_sentiment for every review\n",
    "vader_sentiment = SentimentIntensityAnalyzer()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_sentiment(review):\n",
    "    \"\"\"Return the sentiment score for a single review string.\n",
    "\n",
    "    A review is one of three things:\n",
    "    - the placeholder \"No Negative\": returns 0\n",
    "    - the placeholder \"No Positive\": returns 0\n",
    "    - actual review text: returns the \"compound\" entry of\n",
    "      vader_sentiment.polarity_scores(review)\n",
    "    \"\"\"\n",
    "    is_placeholder = review in (\"No Negative\", \"No Positive\")\n",
    "    if not is_placeholder:\n",
    "        scores = vader_sentiment.polarity_scores(review)\n",
    "        return scores[\"compound\"]\n",
    "    return 0\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the filtered hotel reviews CSV into a DataFrame\n",
    "df = pd.read_csv(filepath_or_buffer=\"../../data/Hotel_Reviews_Filtered.csv\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop word removal can be slow on a large amount of text.\n",
    "# This uses the approach recommended by Ryan Han (ryanxjhan on Kaggle), whose post\n",
    "# measures the performance of different stop word removal approaches:\n",
    "# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal\n",
    "start = time.time()  # read again in a later cell, once the stop words have been removed\n",
    "cache = set(stopwords.words(\"english\"))\n",
    "\n",
    "def remove_stopwords(review):\n",
    "    \"\"\"Return review without the whitespace-separated words that are in cache.\n",
    "\n",
    "    Membership is an exact string comparison (no case folding is applied).\n",
    "    The words that are kept are joined back together with single spaces.\n",
    "    \"\"\"\n",
    "    kept_words = []\n",
    "    for word in review.split():\n",
    "        if word in cache:\n",
    "            continue\n",
    "        kept_words.append(word)\n",
    "    return \" \".join(kept_words)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Strip the stop words out of both the negative and the positive review text\n",
    "for review_column in (\"Negative_Review\", \"Positive_Review\"):\n",
    "    df[review_column] = df[review_column].apply(remove_stopwords)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Removing stop words took 5.77 seconds\n"
     ]
    }
   ],
   "source": [
    "# Stop the timer that was started before the stop words were removed and report the elapsed time\n",
    "end = time.time()\n",
    "elapsed_seconds = round(end - start, 2)\n",
    "print(f\"Removing stop words took {elapsed_seconds} seconds\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Calculating sentiment columns for both positive and negative reviews\n",
      "Calculating sentiment took 201.07 seconds\n"
     ]
    }
   ],
   "source": [
    "# Add a sentiment score column for each of the two review text columns and time the work\n",
    "print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
    "start = time.time()\n",
    "for review_column, sentiment_column in (\n",
    "    (\"Negative_Review\", \"Negative_Sentiment\"),\n",
    "    (\"Positive_Review\", \"Positive_Sentiment\"),\n",
    "):\n",
    "    df[sentiment_column] = df[review_column].apply(calc_sentiment)\n",
    "end = time.time()\n",
    "print(f\"Calculating sentiment took {round(end - start, 2)} seconds\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "                                          Negative_Review  Negative_Sentiment\n",
      "186584  So bad experience memories I hotel The first n...             -0.9920\n",
      "129503  First charged twice room booked booking second...             -0.9896\n",
      "307286  The staff Had bad experience even booking Janu...             -0.9889\n",
      "452092  No WLAN room Incredibly rude restaurant staff ...             -0.9884\n",
      "201293  We usually traveling Paris 2 3 times year busi...             -0.9873\n",
      "...                                                   ...                 ...\n",
      "26899   I would say however one night expensive even d...              0.9933\n",
      "138365  Wifi terribly slow I speed test network upload...              0.9938\n",
      "79215   I find anything hotel first I walked past hote...              0.9938\n",
      "278506  The property great location There bakery next ...              0.9945\n",
      "339189  Guys I like hotel I wish return next year Howe...              0.9948\n",
      "\n",
      "[515738 rows x 2 columns]\n",
      "                                          Positive_Review  Positive_Sentiment\n",
      "137893  Bathroom Shower We going stay twice hotel 2 ni...             -0.9820\n",
      "5839    I completely disappointed mad since reception ...             -0.9780\n",
      "64158   get everything extra internet parking breakfas...             -0.9751\n",
      "124178  I didnt like anythig Room small Asked upgrade ...             -0.9721\n",
      "489137  Very rude manager abusive staff reception Dirt...             -0.9703\n",
      "...                                                   ...                 ...\n",
      "331570  Everything This recently renovated hotel class...              0.9984\n",
      "322920  From moment stepped doors Guesthouse Hotel sta...              0.9985\n",
      "293710  This place surprise expected good actually gre...              0.9985\n",
      "417442  We celebrated wedding night Langham I commend ...              0.9985\n",
      "132492  We arrived super cute boutique hotel area expl...              0.9987\n",
      "\n",
      "[515738 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "# Show each review column next to its score, ordered from the lowest to the highest score.\n",
    "# df itself is re-sorted both times, so it ends up ordered by Positive_Sentiment.\n",
    "df = df.sort_values(by=\"Negative_Sentiment\")\n",
    "print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
    "df = df.sort_values(by=\"Positive_Sentiment\")\n",
    "print(df[[\"Positive_Review\", \"Positive_Sentiment\"]])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reorder the columns (purely cosmetic; it makes the data easier to explore later)\n",
    "column_order = [\n",
    "    \"Hotel_Name\",\n",
    "    \"Hotel_Address\",\n",
    "    \"Total_Number_of_Reviews\",\n",
    "    \"Average_Score\",\n",
    "    \"Reviewer_Score\",\n",
    "    \"Negative_Sentiment\",\n",
    "    \"Positive_Sentiment\",\n",
    "    \"Reviewer_Nationality\",\n",
    "    \"Leisure_trip\",\n",
    "    \"Couple\",\n",
    "    \"Solo_traveler\",\n",
    "    \"Business_trip\",\n",
    "    \"Group\",\n",
    "    \"Family_with_young_children\",\n",
    "    \"Family_with_older_children\",\n",
    "    \"With_a_pet\",\n",
    "    \"Negative_Review\",\n",
    "    \"Positive_Review\",\n",
    "]\n",
    "df = df.reindex(columns=column_order)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Saving results to Hotel_Reviews_NLP.csv\n"
     ]
    }
   ],
   "source": [
    "# Write the scored reviews to a new CSV file, leaving the DataFrame index out\n",
    "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
    "df.to_csv(\"../../data/Hotel_Reviews_NLP.csv\", index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ]
}
moving files from py to notebook 3 years ago			`{`
			`"metadata": {`
			`"language_info": {`
			`"codemirror_mode": {`
			`"name": "ipython",`
			`"version": 3`
			`},`
			`"file_extension": ".py",`
			`"mimetype": "text/x-python",`
			`"name": "python",`
			`"nbconvert_exporter": "python",`
			`"pygments_lexer": "ipython3",`
notebook edits 3 years ago			`"version": "3.7.0"`
moving files from py to notebook 3 years ago			`},`
notebook edits 3 years ago			`"orig_nbformat": 4,`
			`"kernelspec": {`
			`"name": "python3",`
			`"display_name": "Python 3.7.0 64-bit ('3.7')"`
			`},`
			`"interpreter": {`
			`"hash": "70b38d7a306a849643e446cd70466270a13445e5987dfa1344ef2b127438fa4d"`
			`}`
moving files from py to notebook 3 years ago			`},`
			`"nbformat": 4,`
			`"nbformat_minor": 2,`
			`"cells": [`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 9,`
moving files from py to notebook 3 years ago			`"metadata": {},`
notebook edits 3 years ago			`"outputs": [`
			`{`
			`"output_type": "stream",`
			`"name": "stderr",`
			`"text": [`
			`"[nltk_data] Downloading package vader_lexicon to\n[nltk_data] /Users/jenlooper/nltk_data...\n"`
			`]`
			`},`
			`{`
			`"output_type": "execute_result",`
			`"data": {`
			`"text/plain": [`
			`"True"`
			`]`
			`},`
			`"metadata": {},`
			`"execution_count": 9`
			`}`
			`],`
moving files from py to notebook 3 years ago			`"source": [`
			`"import time\n",`
			`"import pandas as pd\n",`
notebook edits 3 years ago			`"import nltk as nltk\n",`
moving files from py to notebook 3 years ago			`"from nltk.corpus import stopwords\n",`
notebook edits 3 years ago			`"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",`
			`"nltk.download('vader_lexicon')\n"`
moving files from py to notebook 3 years ago			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 10,`
moving files from py to notebook 3 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"vader_sentiment = SentimentIntensityAnalyzer()\n",`
notebook edits 3 years ago			`"\n"`
moving files from py to notebook 3 years ago			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 11,`
moving files from py to notebook 3 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# There are 3 possibilities of input for a review:\n",`
			`"# It could be \"No Negative\", in which case, return 0\n",`
			`"# It could be \"No Positive\", in which case, return 0\n",`
			`"# It could be a review, in which case calculate the sentiment\n",`
			`"def calc_sentiment(review): \n",`
			`" if review == \"No Negative\" or review == \"No Positive\":\n",`
			`" return 0\n",`
			`" return vader_sentiment.polarity_scores(review)[\"compound\"] \n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 12,`
moving files from py to notebook 3 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Load the hotel reviews from CSV\n",`
			`"df = pd.read_csv(\"../../data/Hotel_Reviews_Filtered.csv\")\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 13,`
moving files from py to notebook 3 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Remove stop words - can be slow for a lot of text!\n",`
			`"# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches\n",`
			`"# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends\n",`
			`"start = time.time()\n",`
			`"cache = set(stopwords.words(\"english\"))\n",`
			`"def remove_stopwords(review):\n",`
			`" text = \" \".join([word for word in review.split() if word not in cache])\n",`
			`" return text\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 14,`
moving files from py to notebook 3 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Remove the stop words from both columns\n",`
			`"df.Negative_Review = df.Negative_Review.apply(remove_stopwords) \n",`
			`"df.Positive_Review = df.Positive_Review.apply(remove_stopwords)\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 15,`
moving files from py to notebook 3 years ago			`"metadata": {},`
notebook edits 3 years ago			`"outputs": [`
			`{`
			`"output_type": "stream",`
			`"name": "stdout",`
			`"text": [`
			`"Removing stop words took 5.77 seconds\n"`
			`]`
			`}`
			`],`
moving files from py to notebook 3 years ago			`"source": [`
			`"end = time.time()\n",`
			`"print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 16,`
moving files from py to notebook 3 years ago			`"metadata": {},`
notebook edits 3 years ago			`"outputs": [`
			`{`
			`"output_type": "stream",`
			`"name": "stdout",`
			`"text": [`
			`"Calculating sentiment columns for both positive and negative reviews\n",`
			`"Calculating sentiment took 201.07 seconds\n"`
			`]`
			`}`
			`],`
moving files from py to notebook 3 years ago			`"source": [`
			`"# Add a negative sentiment and positive sentiment column\n",`
			`"print(\"Calculating sentiment columns for both positive and negative reviews\")\n",`
			`"start = time.time()\n",`
			`"df[\"Negative_Sentiment\"] = df.Negative_Review.apply(calc_sentiment)\n",`
			`"df[\"Positive_Sentiment\"] = df.Positive_Review.apply(calc_sentiment)\n",`
			`"end = time.time()\n",`
			`"print(\"Calculating sentiment took \" + str(round(end - start, 2)) + \" seconds\")\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 17,`
moving files from py to notebook 3 years ago			`"metadata": {},`
notebook edits 3 years ago			`"outputs": [`
			`{`
			`"output_type": "stream",`
			`"name": "stdout",`
			`"text": [`
			`" Negative_Review Negative_Sentiment\n",`
			`"186584 So bad experience memories I hotel The first n... -0.9920\n",`
			`"129503 First charged twice room booked booking second... -0.9896\n",`
			`"307286 The staff Had bad experience even booking Janu... -0.9889\n",`
			`"452092 No WLAN room Incredibly rude restaurant staff ... -0.9884\n",`
			`"201293 We usually traveling Paris 2 3 times year busi... -0.9873\n",`
			`"... ... ...\n",`
			`"26899 I would say however one night expensive even d... 0.9933\n",`
			`"138365 Wifi terribly slow I speed test network upload... 0.9938\n",`
			`"79215 I find anything hotel first I walked past hote... 0.9938\n",`
			`"278506 The property great location There bakery next ... 0.9945\n",`
			`"339189 Guys I like hotel I wish return next year Howe... 0.9948\n",`
			`"\n",`
			`"[515738 rows x 2 columns]\n",`
			`" Positive_Review Positive_Sentiment\n",`
			`"137893 Bathroom Shower We going stay twice hotel 2 ni... -0.9820\n",`
			`"5839 I completely disappointed mad since reception ... -0.9780\n",`
			`"64158 get everything extra internet parking breakfas... -0.9751\n",`
			`"124178 I didnt like anythig Room small Asked upgrade ... -0.9721\n",`
			`"489137 Very rude manager abusive staff reception Dirt... -0.9703\n",`
			`"... ... ...\n",`
			`"331570 Everything This recently renovated hotel class... 0.9984\n",`
			`"322920 From moment stepped doors Guesthouse Hotel sta... 0.9985\n",`
			`"293710 This place surprise expected good actually gre... 0.9985\n",`
			`"417442 We celebrated wedding night Langham I commend ... 0.9985\n",`
			`"132492 We arrived super cute boutique hotel area expl... 0.9987\n",`
			`"\n",`
			`"[515738 rows x 2 columns]\n"`
			`]`
			`}`
			`],`
moving files from py to notebook 3 years ago			`"source": [`
			`"df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",`
			`"print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",`
			`"df = df.sort_values(by=[\"Positive_Sentiment\"], ascending=True)\n",`
			`"print(df[[\"Positive_Review\", \"Positive_Sentiment\"]])\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 18,`
moving files from py to notebook 3 years ago			`"metadata": {},`
			`"outputs": [],`
			`"source": [`
			`"# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)\n",`
			`"df = df.reindex([\"Hotel_Name\", \"Hotel_Address\", \"Total_Number_of_Reviews\", \"Average_Score\", \"Reviewer_Score\", \"Negative_Sentiment\", \"Positive_Sentiment\", \"Reviewer_Nationality\", \"Leisure_trip\", \"Couple\", \"Solo_traveler\", \"Business_trip\", \"Group\", \"Family_with_young_children\", \"Family_with_older_children\", \"With_a_pet\", \"Negative_Review\", \"Positive_Review\"], axis=1)\n"`
			`]`
			`},`
			`{`
			`"cell_type": "code",`
notebook edits 3 years ago			`"execution_count": 19,`
moving files from py to notebook 3 years ago			`"metadata": {},`
notebook edits 3 years ago			`"outputs": [`
			`{`
			`"output_type": "stream",`
			`"name": "stdout",`
			`"text": [`
			`"Saving results to Hotel_Reviews_NLP.csv\n"`
			`]`
			`}`
			`],`
moving files from py to notebook 3 years ago			`"source": [`
			`"print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",`
			`"df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"`
			`]`
notebook edits 3 years ago			`},`
			`{`
			`"cell_type": "code",`
			`"execution_count": null,`
			`"metadata": {},`
			`"outputs": [],`
			`"source": []`
moving files from py to notebook 3 years ago			`}`
			`]`
			`}`