Create Hotel_Reviews_Sentiment_Analysis.py

4 years ago · 145a42e330
parent b1db678757
commit 145a42e330
1 changed files with 56 additions and 0 deletions
--- a/6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py
+++ b/6-NLP/5-Hotel-Reviews-2/Hotel_Reviews_Sentiment_Analysis.py
@ -0,0 +1,56 @@
 import time
 import pandas as pd
 from nltk.corpus import stopwords
 from nltk.sentiment.vader import SentimentIntensityAnalyzer
 # Create the VADER sentiment analyser (there are others in NLTK you can try too).
 # This single module-level instance is reused by calc_sentiment for every review.
 # NOTE(review): presumably needs the NLTK "vader_lexicon" resource to be available - confirm.
 vader_sentiment = SentimentIntensityAnalyzer()
 # Reference for VADER:
 # Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.
 # Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
 def calc_sentiment(review):
     """Score a single review with the module-level VADER analyser.

     A review can arrive in one of three forms:

     * the string "No Negative" - scored as 0 without consulting VADER
     * the string "No Positive" - scored as 0 without consulting VADER
     * anything else - passed to ``vader_sentiment.polarity_scores`` and
       scored with the "compound" entry of the result
     """
     if review in ("No Negative", "No Positive"):
         return 0
     scores = vader_sentiment.polarity_scores(review)
     return scores["compound"]
 # Load the hotel reviews from CSV (the path is relative to the current working directory)
 df = pd.read_csv("Hotel_Reviews_Filtered.csv")
 # Remove stop words - can be slow for a lot of text!
 # Ryan Han (ryanxjhan on Kaggle) has a great post measuring the performance of
 # different stop word removal approaches; this script uses the approach he recommends:
 # https://www.kaggle.com/ryanxjhan/fast-stop-words-removal
 # The timer starts before the stop word set is built, so the time reported
 # after the removal pass includes this setup as well.
 start = time.time()
 # Build the set once so that remove_stopwords does a set membership test per word
 cache = set(stopwords.words("english"))
 def remove_stopwords(review):
     """Return ``review`` with every word that appears in ``cache`` dropped.

     The text is split on runs of whitespace and the surviving words are
     re-joined with single spaces, so the original spacing is not preserved.
     Words are compared to ``cache`` as exact, case-sensitive strings.
     """
     # NOTE(review): the exact match presumably lets the capitalised "No" in the
     # "No Negative" / "No Positive" placeholders survive (NLTK's English list is
     # assumed to be lowercase), which calc_sentiment relies on - confirm before
     # lowercasing words here.
     kept_words = (token for token in review.split() if token not in cache)
     return " ".join(kept_words)
 # Strip the stop words from both review columns (negative first, then positive).
 # `start` was taken earlier in the script, before the stop word set was built.
 for review_column in ("Negative_Review", "Positive_Review"):
     df[review_column] = df[review_column].apply(remove_stopwords)
 end = time.time()
 print(f"Removing stop words took {round(end - start, 2)} seconds")
 # Add one sentiment column per review column and time the scoring pass
 print("Calculating sentiment columns for both positive and negative reviews")
 start = time.time()
 for review_column, sentiment_column in (
     ("Negative_Review", "Negative_Sentiment"),
     ("Positive_Review", "Positive_Sentiment"),
 ):
     df[sentiment_column] = df[review_column].apply(calc_sentiment)
 end = time.time()
 print(f"Calculating sentiment took {round(end - start, 2)} seconds")
 # Show the reviews ordered from lowest to highest score, first by negative and
 # then by positive sentiment. df is rebound by each sort, so the frame saved
 # below keeps the Positive_Sentiment ordering.
 df = df.sort_values("Negative_Sentiment")
 print(df[["Negative_Review", "Negative_Sentiment"]])
 df = df.sort_values("Positive_Sentiment")
 print(df[["Positive_Review", "Positive_Sentiment"]])
 # Reorder the columns (this is cosmetic, but makes the data easier to explore later)
 column_order = [
     "Hotel_Name",
     "Hotel_Address",
     "Total_Number_of_Reviews",
     "Average_Score",
     "Reviewer_Score",
     "Negative_Sentiment",
     "Positive_Sentiment",
     "Reviewer_Nationality",
     "Leisure_trip",
     "Couple",
     "Solo_traveler",
     "Business_trip",
     "Group",
     "Family_with_young_children",
     "Family_with_older_children",
     "With_a_pet",
     "Negative_Review",
     "Positive_Review",
 ]
 df = df.reindex(columns=column_order)
 # Write the result without the index column
 print("Saving results to Hotel_Reviews_NLP.csv")
 df.to_csv("Hotel_Reviews_NLP.csv", index=False)