parent
b1db678757
commit
145a42e330
@ -0,0 +1,56 @@
|
||||
import time
|
||||
import pandas as pd
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
||||
|
||||
# Create the vader sentiment analyser (there are others in NLTK you can try too).
# Constructed once at module level so every call to calc_sentiment reuses it.
vader_sentiment = SentimentIntensityAnalyzer()
# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.
# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
|
||||
|
||||
def calc_sentiment(review):
    """Return the VADER compound sentiment score for a single review.

    The dataset marks an absent review with the placeholder strings
    "No Negative" or "No Positive"; those carry no sentiment and score 0.
    Anything else is scored with the module-level VADER analyser, whose
    "compound" value ranges from -1 (most negative) to +1 (most positive).
    """
    # Placeholder reviews: nothing to analyse.
    if review in ("No Negative", "No Positive"):
        return 0
    scores = vader_sentiment.polarity_scores(review)
    return scores["compound"]
|
||||
|
||||
# Load the hotel reviews from CSV
df = pd.read_csv("Hotel_Reviews_Filtered.csv")

# Remove stop words - can be slow for a lot of text!
# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches
# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends
start = time.time()
# Build the stop-word set once, up front: set membership is O(1) per word,
# and the NLTK corpus is not re-read for every review.
cache = set(stopwords.words("english"))
|
||||
def remove_stopwords(review, stop_words=None):
    """Return *review* with stop words removed.

    Args:
        review: The review text to filter.
        stop_words: Optional set of words to drop. Defaults to the
            module-level ``cache`` of English NLTK stop words, so existing
            single-argument callers (the ``DataFrame.apply`` calls below)
            behave exactly as before. Accepting the set as a parameter
            removes the hard dependency on the global and makes the
            function reusable and testable in isolation.

    Returns:
        The review re-joined with single spaces; note that runs of
        whitespace are collapsed by ``str.split``.
    """
    if stop_words is None:
        stop_words = cache
    return " ".join(word for word in review.split() if word not in stop_words)
|
||||
|
||||
# Strip the stop words out of both free-text review columns.
for col in ("Negative_Review", "Positive_Review"):
    df[col] = df[col].apply(remove_stopwords)

end = time.time()
print("Removing stop words took " + str(round(end - start, 2)) + " seconds")

# Score each review column with VADER, timing the pass.
print("Calculating sentiment columns for both positive and negative reviews")
start = time.time()
for review_col, sentiment_col in (
    ("Negative_Review", "Negative_Sentiment"),
    ("Positive_Review", "Positive_Sentiment"),
):
    df[sentiment_col] = df[review_col].apply(calc_sentiment)
end = time.time()
print("Calculating sentiment took " + str(round(end - start, 2)) + " seconds")

# Eyeball the extremes: print each review column sorted by its sentiment
# score, most negative first.
for review_col, sentiment_col in (
    ("Negative_Review", "Negative_Sentiment"),
    ("Positive_Review", "Positive_Sentiment"),
):
    df = df.sort_values(by=[sentiment_col], ascending=True)
    print(df[[review_col, sentiment_col]])

# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)
column_order = [
    "Hotel_Name",
    "Hotel_Address",
    "Total_Number_of_Reviews",
    "Average_Score",
    "Reviewer_Score",
    "Negative_Sentiment",
    "Positive_Sentiment",
    "Reviewer_Nationality",
    "Leisure_trip",
    "Couple",
    "Solo_traveler",
    "Business_trip",
    "Group",
    "Family_with_young_children",
    "Family_with_older_children",
    "With_a_pet",
    "Negative_Review",
    "Positive_Review",
]
df = df.reindex(column_order, axis=1)

print("Saving results to Hotel_Reviews_NLP.csv")
df.to_csv(r"Hotel_Reviews_NLP.csv", index=False)
|
Loading…
Reference in new issue