{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": 3
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Hotel review sentiment with NLTK VADER\n",
    "\n",
    "This notebook loads the filtered hotel reviews, removes English stop words from the\n",
    "negative and positive review text, scores each review with the VADER sentiment\n",
    "analyser, and saves the result to `Hotel_Reviews_NLP.csv`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "import nltk\n",
    "import pandas as pd\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The stop word list and the VADER lexicon are separate NLTK data packages.\n",
    "# Without them the cells below raise LookupError, so fetch each one only when\n",
    "# it is not already available locally.\n",
    "for resource_path, package_name in [\n",
    "    (\"corpora/stopwords\", \"stopwords\"),\n",
    "    (\"sentiment/vader_lexicon.zip\", \"vader_lexicon\"),\n",
    "]:\n",
    "    try:\n",
    "        nltk.data.find(resource_path)\n",
    "    except LookupError:\n",
    "        nltk.download(package_name)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the vader sentiment analyser (there are others in NLTK you can try too)\n",
    "vader_sentiment = SentimentIntensityAnalyzer()\n",
    "# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.\n",
    "# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def calc_sentiment(review):\n",
    "    \"\"\"Return the VADER compound sentiment score for one review.\n",
    "\n",
    "    There are 3 possibilities of input for a review:\n",
    "\n",
    "    * the placeholder \"No Negative\" -> 0.0\n",
    "    * the placeholder \"No Positive\" -> 0.0\n",
    "    * review text -> the \"compound\" value from ``polarity_scores``\n",
    "\n",
    "    Values that are not strings (for example the NaN that pandas produces for\n",
    "    an empty CSV cell) carry no text to score and also return 0.0.\n",
    "    \"\"\"\n",
    "    if not isinstance(review, str):\n",
    "        return 0.0\n",
    "    if review in (\"No Negative\", \"No Positive\"):\n",
    "        return 0.0\n",
    "    return vader_sentiment.polarity_scores(review)[\"compound\"]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the hotel reviews from CSV\n",
    "df = pd.read_csv(\"../../data/Hotel_Reviews_Filtered.csv\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Remove stop words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove stop words - can be slow for a lot of text!\n",
    "# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches\n",
    "# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends\n",
    "start = time.time()\n",
    "# A set gives constant-time membership checks for every word of every review\n",
    "cache = set(stopwords.words(\"english\"))\n",
    "\n",
    "\n",
    "def remove_stopwords(review):\n",
    "    \"\"\"Return ``review`` with every word found in ``cache`` removed.\n",
    "\n",
    "    Words are split on whitespace and re-joined with single spaces. The match\n",
    "    is case-sensitive on purpose: calc_sentiment compares against the\n",
    "    capitalised placeholders \"No Negative\" / \"No Positive\", so do not\n",
    "    lowercase words here without updating that check as well.\n",
    "\n",
    "    Non-string values (such as NaN from an empty CSV cell) are returned\n",
    "    unchanged instead of raising on ``.split()``.\n",
    "    \"\"\"\n",
    "    if not isinstance(review, str):\n",
    "        return review\n",
    "    return \" \".join(word for word in review.split() if word not in cache)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the stop words from both columns\n",
    "df.Negative_Review = df.Negative_Review.apply(remove_stopwords)\n",
    "df.Positive_Review = df.Positive_Review.apply(remove_stopwords)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "end = time.time()\n",
    "print(\"Removing stop words took \" + str(round(end - start, 2)) + \" seconds\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Calculate sentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add a negative sentiment and positive sentiment column\n",
    "print(\"Calculating sentiment columns for both positive and negative reviews\")\n",
    "start = time.time()\n",
    "df[\"Negative_Sentiment\"] = df.Negative_Review.apply(calc_sentiment)\n",
    "df[\"Positive_Sentiment\"] = df.Positive_Review.apply(calc_sentiment)\n",
    "end = time.time()\n",
    "print(\"Calculating sentiment took \" + str(round(end - start, 2)) + \" seconds\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show each review column next to its score, lowest score first.\n",
    "# Note that df itself is re-sorted, so the saved file ends up ordered by Positive_Sentiment.\n",
    "df = df.sort_values(by=[\"Negative_Sentiment\"], ascending=True)\n",
    "print(df[[\"Negative_Review\", \"Negative_Sentiment\"]])\n",
    "df = df.sort_values(by=[\"Positive_Sentiment\"], ascending=True)\n",
    "print(df[[\"Positive_Review\", \"Positive_Sentiment\"]])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Save the results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)\n",
    "df = df.reindex(\n",
    "    [\n",
    "        \"Hotel_Name\",\n",
    "        \"Hotel_Address\",\n",
    "        \"Total_Number_of_Reviews\",\n",
    "        \"Average_Score\",\n",
    "        \"Reviewer_Score\",\n",
    "        \"Negative_Sentiment\",\n",
    "        \"Positive_Sentiment\",\n",
    "        \"Reviewer_Nationality\",\n",
    "        \"Leisure_trip\",\n",
    "        \"Couple\",\n",
    "        \"Solo_traveler\",\n",
    "        \"Business_trip\",\n",
    "        \"Group\",\n",
    "        \"Family_with_young_children\",\n",
    "        \"Family_with_older_children\",\n",
    "        \"With_a_pet\",\n",
    "        \"Negative_Review\",\n",
    "        \"Positive_Review\",\n",
    "    ],\n",
    "    axis=1,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Saving results to Hotel_Reviews_NLP.csv\")\n",
    "df.to_csv(r\"../../data/Hotel_Reviews_NLP.csv\", index = False)\n"
   ]
  }
 ]
}