# Provenance (recovered from the git patch header this script was extracted from):
#   Commit:  145a42e3302f50a8b67fea55546471f722b72fd6
#   From:    "Stephen Howell (MSFT)" <38020233+stephen-howell@users.noreply.github.com>
#   Date:    Fri, 25 Jun 2021 01:38:35 +0100
#   Subject: [PATCH] Create Hotel_Reviews_Sentiment_Analysis.py
#   File:    6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py
#            (new file, blob c53df2b3, 56 insertions)

import time

import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Create the vader sentiment analyser (there are others in NLTK you can try too)
vader_sentiment = SentimentIntensityAnalyzer()
# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model
# for Sentiment Analysis of Social Media Text.
# Eighth International Conference on Weblogs and Social Media (ICWSM-14).
# Ann Arbor, MI, June 2014.


def calc_sentiment(review):
    """Return the VADER compound sentiment score for one review.

    There are 3 possibilities of input for a review:
      * "No Negative" -> the reviewer left no negative text, return 0
      * "No Positive" -> the reviewer left no positive text, return 0
      * any other text -> the "compound" value of VADER's polarity scores

    Anything that is not a string (for example the NaN pandas yields for a
    blank CSV cell) is also treated as "no review" and scores 0, rather than
    being handed to the analyser, which expects text.

    Args:
        review: The review text taken from one cell of the data frame.

    Returns:
        0 for a placeholder or missing review, otherwise the compound score
        reported by ``vader_sentiment.polarity_scores``.
    """
    # Missing value: there is no text to analyse.
    if not isinstance(review, str):
        return 0
    # Placeholder strings the dataset uses when the reviewer wrote nothing.
    if review in ("No Negative", "No Positive"):
        return 0
    return vader_sentiment.polarity_scores(review)["compound"]


# Load the hotel reviews from CSV
df = pd.read_csv("Hotel_Reviews_Filtered.csv")

# Remove stop words - can be slow for a lot of text!
# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of
# different stop words removal approaches:
# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal
# Using the approach that Ryan recommends.
# perf_counter() is used for the timings because it is meant for measuring
# elapsed intervals, unlike the wall clock, which can be adjusted mid-run.
start = time.perf_counter()
# Build the stop word set once so every membership test below is a set lookup.
cache = set(stopwords.words("english"))


def remove_stopwords(review):
    """Return ``review`` with every word found in ``cache`` removed.

    The text is split on whitespace and the surviving words are re-joined
    with single spaces, so runs of whitespace are collapsed as a side effect.

    NOTE(review): the membership test is case-sensitive. NLTK's English list
    appears to be all lowercase, which would explain why the capitalised
    "No" in the "No Negative" / "No Positive" placeholders survives this
    step - confirm before lowercasing anything here.

    Args:
        review: The review text taken from one cell of the data frame.

    Returns:
        The filtered text, or an empty string when ``review`` is not a
        string (for example the NaN pandas yields for a blank CSV cell).
    """
    # A missing value has no words to filter and no .split() method.
    if not isinstance(review, str):
        return ""
    return " ".join([word for word in review.split() if word not in cache])


# Remove the stop words from both columns
df["Negative_Review"] = df["Negative_Review"].apply(remove_stopwords)
df["Positive_Review"] = df["Positive_Review"].apply(remove_stopwords)

end = time.perf_counter()
print(f"Removing stop words took {round(end - start, 2)} seconds")

# Add a negative sentiment and positive sentiment column
print("Calculating sentiment columns for both positive and negative reviews")
start = time.perf_counter()
df["Negative_Sentiment"] = df["Negative_Review"].apply(calc_sentiment)
df["Positive_Sentiment"] = df["Positive_Review"].apply(calc_sentiment)
end = time.perf_counter()
print(f"Calculating sentiment took {round(end - start, 2)} seconds")

# Show each review column next to its score, lowest score first
df = df.sort_values(by=["Negative_Sentiment"], ascending=True)
print(df[["Negative_Review", "Negative_Sentiment"]])
df = df.sort_values(by=["Positive_Sentiment"], ascending=True)
print(df[["Positive_Review", "Positive_Sentiment"]])

# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)
# NOTE(review): reindex keeps only the labels listed here - confirm that no
# other column of the input file is needed downstream.
column_order = [
    "Hotel_Name",
    "Hotel_Address",
    "Total_Number_of_Reviews",
    "Average_Score",
    "Reviewer_Score",
    "Negative_Sentiment",
    "Positive_Sentiment",
    "Reviewer_Nationality",
    "Leisure_trip",
    "Couple",
    "Solo_traveler",
    "Business_trip",
    "Group",
    "Family_with_young_children",
    "Family_with_older_children",
    "With_a_pet",
    "Negative_Review",
    "Positive_Review",
]
df = df.reindex(column_order, axis=1)

print("Saving results to Hotel_Reviews_NLP.csv")
df.to_csv("Hotel_Reviews_NLP.csv", index=False)