import time import pandas as pd from nltk.corpus import stopwords from nltk.sentiment.vader import SentimentIntensityAnalyzer # Create the vader sentiment analyser (there are others in NLTK you can try too) vader_sentiment = SentimentIntensityAnalyzer() # Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. # Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. # There are 3 possibilities of input for a review: # It could be "No Negative", in which case, return 0 # It could be "No Positive", in which case, return 0 # It could be a review, in which case calculate the sentiment def calc_sentiment(review): if review == "No Negative" or review == "No Positive": return 0 return vader_sentiment.polarity_scores(review)["compound"] # Load the hotel reviews from CSV df = pd.read_csv("Hotel_Reviews_Filtered.csv") # Remove stop words - can be slow for a lot of text! # Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches # https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends start = time.time() cache = set(stopwords.words("english")) def remove_stopwords(review): text = " ".join([word for word in review.split() if word not in cache]) return text # Remove the stop words from both columns df.Negative_Review = df.Negative_Review.apply(remove_stopwords) df.Positive_Review = df.Positive_Review.apply(remove_stopwords) end = time.time() print("Removing stop words took " + str(round(end - start, 2)) + " seconds") # Add a negative sentiment and positive sentiment column print("Calculating sentiment columns for both positive and negative reviews") start = time.time() df["Negative_Sentiment"] = df.Negative_Review.apply(calc_sentiment) df["Positive_Sentiment"] = df.Positive_Review.apply(calc_sentiment) end = time.time() print("Calculating sentiment took " + str(round(end - start, 2)) + " seconds") df = df.sort_values(by=["Negative_Sentiment"], ascending=True) print(df[["Negative_Review", "Negative_Sentiment"]]) df = df.sort_values(by=["Positive_Sentiment"], ascending=True) print(df[["Positive_Review", "Positive_Sentiment"]]) # Reorder the columns (This is cosmetic, but to make it easier to explore the data later) df = df.reindex(["Hotel_Name", "Hotel_Address", "Total_Number_of_Reviews", "Average_Score", "Reviewer_Score", "Negative_Sentiment", "Positive_Sentiment", "Reviewer_Nationality", "Leisure_trip", "Couple", "Solo_traveler", "Business_trip", "Group", "Family_with_young_children", "Family_with_older_children", "With_a_pet", "Negative_Review", "Positive_Review"], axis=1) print("Saving results to Hotel_Reviews_NLP.csv") df.to_csv(r"Hotel_Reviews_NLP.csv", index = False)