parent
b1db678757
commit
145a42e330
@ -0,0 +1,56 @@
|
||||
import time
|
||||
import pandas as pd
|
||||
from nltk.corpus import stopwords
|
||||
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
||||
|
||||
# Create the vader sentiment analyser (there are others in NLTK you can try too).
# Constructed once at module level so every call to calc_sentiment reuses it.
vader_sentiment = SentimentIntensityAnalyzer()
# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.
# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
|
||||
|
||||
def calc_sentiment(review):
    """Return the VADER compound sentiment score for a single review.

    The dataset marks an absent review with the placeholder strings
    "No Negative" or "No Positive"; those carry no sentiment and score 0.
    Anything else is scored with the module-level VADER analyser, whose
    "compound" value ranges from -1 (most negative) to +1 (most positive).
    """
    # Placeholder reviews: nothing to analyse.
    if review in ("No Negative", "No Positive"):
        return 0
    scores = vader_sentiment.polarity_scores(review)
    return scores["compound"]
|
||||
|
||||
# Load the hotel reviews from CSV
df = pd.read_csv("Hotel_Reviews_Filtered.csv")

# Remove stop words - can be slow for a lot of text!
# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches
# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends
start = time.time()
# Build the stop-word set once, up front: set membership is O(1) per word,
# and the NLTK corpus is not re-read for every review.
cache = set(stopwords.words("english"))
|
||||
def remove_stopwords(review, stop_words=None):
    """Return *review* with stop words removed.

    Args:
        review: The review text to filter.
        stop_words: Optional set of words to drop. Defaults to the
            module-level ``cache`` of English NLTK stop words, so existing
            single-argument callers (the ``DataFrame.apply`` calls below)
            behave exactly as before. Accepting the set as a parameter
            removes the hard dependency on the global and makes the
            function reusable and testable in isolation.

    Returns:
        The review re-joined with single spaces; note that runs of
        whitespace are collapsed by ``str.split``.
    """
    if stop_words is None:
        stop_words = cache
    return " ".join(word for word in review.split() if word not in stop_words)
|
||||
|
||||
# Strip the stop words out of both free-text review columns.
for col in ("Negative_Review", "Positive_Review"):
    df[col] = df[col].apply(remove_stopwords)

end = time.time()
print("Removing stop words took " + str(round(end - start, 2)) + " seconds")

# Score each review column with VADER, timing the pass.
print("Calculating sentiment columns for both positive and negative reviews")
start = time.time()
for review_col, sentiment_col in (
    ("Negative_Review", "Negative_Sentiment"),
    ("Positive_Review", "Positive_Sentiment"),
):
    df[sentiment_col] = df[review_col].apply(calc_sentiment)
end = time.time()
print("Calculating sentiment took " + str(round(end - start, 2)) + " seconds")

# Eyeball the extremes: print each review column sorted by its sentiment
# score, most negative first.
for review_col, sentiment_col in (
    ("Negative_Review", "Negative_Sentiment"),
    ("Positive_Review", "Positive_Sentiment"),
):
    df = df.sort_values(by=[sentiment_col], ascending=True)
    print(df[[review_col, sentiment_col]])

# Reorder the columns (This is cosmetic, but to make it easier to explore the data later)
column_order = [
    "Hotel_Name",
    "Hotel_Address",
    "Total_Number_of_Reviews",
    "Average_Score",
    "Reviewer_Score",
    "Negative_Sentiment",
    "Positive_Sentiment",
    "Reviewer_Nationality",
    "Leisure_trip",
    "Couple",
    "Solo_traveler",
    "Business_trip",
    "Group",
    "Family_with_young_children",
    "Family_with_older_children",
    "With_a_pet",
    "Negative_Review",
    "Positive_Review",
]
df = df.reindex(column_order, axis=1)

print("Saving results to Hotel_Reviews_NLP.csv")
df.to_csv(r"Hotel_Reviews_NLP.csv", index=False)
|
Loading…
Reference in new issue