From 145a42e3302f50a8b67fea55546471f722b72fd6 Mon Sep 17 00:00:00 2001
From: "Stephen Howell (MSFT)"
 <38020233+stephen-howell@users.noreply.github.com>
Date: Fri, 25 Jun 2021 01:38:35 +0100
Subject: [PATCH] Create Hotel_Reviews_Sentiment_Analysis.py

---
 .../Hotel_Reviews_Sentiment_Analysis.py       | 56 +++++++++++++++++++
 1 file changed, 56 insertions(+)
 create mode 100644 6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py

diff --git a/6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py b/6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py
new file mode 100644
index 00000000..c53df2b3
--- /dev/null
+++ b/6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py
@@ -0,0 +1,56 @@
+import time
+import pandas as pd
+from nltk.corpus import stopwords
+from nltk.sentiment.vader import SentimentIntensityAnalyzer
+
+# Create the VADER sentiment analyser (NLTK provides other analysers you can try too)
+vader_sentiment = SentimentIntensityAnalyzer()
+# Reference: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.
+# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
+
+# There are 3 possibilities of input for a review:
+# It could be "No Negative", in which case, return 0
+# It could be "No Positive", in which case, return 0
+# It could be a review, in which case calculate the sentiment
+def calc_sentiment(review):
+    """Return 0 for the "No Negative"/"No Positive" placeholders, else VADER's compound score."""
+    is_placeholder = review in ("No Negative", "No Positive")
+    return 0 if is_placeholder else vader_sentiment.polarity_scores(review)["compound"]
+
+# Load the filtered hotel reviews from CSV into a DataFrame
+df = pd.read_csv("Hotel_Reviews_Filtered.csv")
+
+# Remove stop words - can be slow for a lot of text, so this stage is timed
+# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches:
+# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal (this script uses the approach that Ryan recommends)
+start = time.time()  # timestamp taken before the stop-word removal work begins
+cache = set(stopwords.words("english"))  # built once; a set gives fast membership tests in remove_stopwords
+def remove_stopwords(review):
+    # Matching is exact (case-sensitive): a token is dropped only if it equals an entry of `cache` verbatim
+    return " ".join(token for token in review.split() if token not in cache)
+
+# Strip the stop words out of both review text columns
+df["Negative_Review"] = df["Negative_Review"].apply(remove_stopwords)
+df["Positive_Review"] = df["Positive_Review"].apply(remove_stopwords)
+
+end = time.time()
+print(f"Removing stop words took {round(end - start, 2)} seconds")
+
+# Score both review columns with calc_sentiment and store the results in two new columns
+print("Calculating sentiment columns for both positive and negative reviews")
+start = time.time()
+df["Negative_Sentiment"] = df["Negative_Review"].apply(calc_sentiment)
+df["Positive_Sentiment"] = df["Positive_Review"].apply(calc_sentiment)
+end = time.time()
+print(f"Calculating sentiment took {round(end - start, 2)} seconds")
+
+df = df.sort_values("Negative_Sentiment")  # ascending (the default): lowest scores first
+print(df.loc[:, ["Negative_Review", "Negative_Sentiment"]])
+df = df.sort_values("Positive_Sentiment")  # re-sort by the positive score, again ascending
+print(df.loc[:, ["Positive_Review", "Positive_Sentiment"]])
+
+# Reorder the columns (cosmetic, but it makes the data easier to explore later); reindex keeps only the columns listed
+df = df.reindex(columns=["Hotel_Name", "Hotel_Address", "Total_Number_of_Reviews", "Average_Score", "Reviewer_Score", "Negative_Sentiment", "Positive_Sentiment", "Reviewer_Nationality", "Leisure_trip", "Couple", "Solo_traveler", "Business_trip", "Group", "Family_with_young_children", "Family_with_older_children", "With_a_pet", "Negative_Review", "Positive_Review"])
+
+print("Saving results to Hotel_Reviews_NLP.csv")
+df.to_csv(r"Hotel_Reviews_NLP.csv", index=False)