parent
b1db678757
commit
145a42e330
@ -0,0 +1,56 @@
|
|||||||
|
import time
|
||||||
|
import pandas as pd
|
||||||
|
from nltk.corpus import stopwords
|
||||||
|
from nltk.sentiment.vader import SentimentIntensityAnalyzer
|
||||||
|
|
||||||
|
# Create the vader sentiment analyser (there are others in NLTK you can try too)
|
||||||
|
# Single shared analyser instance; calc_sentiment calls it for every review it scores.
vader_sentiment = SentimentIntensityAnalyzer()
|
||||||
|
# Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.
|
||||||
|
# Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
|
||||||
|
|
||||||
|
# There are 3 possibilities of input for a review:
|
||||||
|
# It could be "No Negative", in which case, return 0
|
||||||
|
# It could be "No Positive", in which case, return 0
|
||||||
|
# It could be a review, in which case calculate the sentiment
|
||||||
|
def calc_sentiment(review):
    """Return the VADER compound sentiment score for a single review.

    The placeholder texts "No Negative" and "No Positive" are scored 0
    without consulting the analyser. Any other input is passed to the
    module-level ``vader_sentiment`` analyser, and the "compound" entry of
    the resulting polarity scores is returned.
    """
    placeholders = ("No Negative", "No Positive")
    if review in placeholders:
        return 0
    scores = vader_sentiment.polarity_scores(review)
    return scores["compound"]
|
||||||
|
|
||||||
|
# Load the hotel reviews from CSV
|
||||||
|
# Relative path: the file is looked up in the current working directory.
# Later steps read the Negative_Review and Positive_Review columns of this frame.
df = pd.read_csv("Hotel_Reviews_Filtered.csv")
|
||||||
|
|
||||||
|
# Remove stop words - can be slow for a lot of text!
|
||||||
|
# Ryan Han (ryanxjhan on Kaggle) has a great post measuring performance of different stop words removal approaches
|
||||||
|
# https://www.kaggle.com/ryanxjhan/fast-stop-words-removal # using the approach that Ryan recommends
|
||||||
|
# Start timing first so that building the stop-word set is part of the measured cost
start = time.time()
# Keep the English stop words in a set; remove_stopwords tests membership against it
cache = set(stopwords.words("english"))
|
||||||
|
def remove_stopwords(review):
    """Return ``review`` with every stop word removed.

    The text is split on whitespace, tokens present in the module-level
    ``cache`` collection are discarded, and the remaining tokens are joined
    back together with single spaces. Matching is an exact, case-sensitive
    membership test on each token.
    """
    kept_words = []
    for token in review.split():
        if token in cache:
            continue
        kept_words.append(token)
    return " ".join(kept_words)
|
||||||
|
|
||||||
|
# Strip the stop words from the negative and the positive review text
df["Negative_Review"] = df["Negative_Review"].apply(remove_stopwords)
df["Positive_Review"] = df["Positive_Review"].apply(remove_stopwords)

# Report the time elapsed since `start` was recorded
end = time.time()
print(f"Removing stop words took {round(end - start, 2)} seconds")
|
||||||
|
|
||||||
|
# Score both review columns with calc_sentiment and store the results as new columns
print("Calculating sentiment columns for both positive and negative reviews")
start = time.time()
df["Negative_Sentiment"] = df["Negative_Review"].apply(calc_sentiment)
df["Positive_Sentiment"] = df["Positive_Review"].apply(calc_sentiment)
end = time.time()
print(f"Calculating sentiment took {round(end - start, 2)} seconds")
|
||||||
|
|
||||||
|
# Show the negative reviews ordered from the lowest to the highest sentiment score
df = df.sort_values("Negative_Sentiment")
print(df[["Negative_Review", "Negative_Sentiment"]])
# Re-sort by the positive score and show the positive reviews the same way
df = df.sort_values("Positive_Sentiment")
print(df[["Positive_Review", "Positive_Sentiment"]])
|
||||||
|
|
||||||
|
# Reorder the columns (cosmetic only, but it makes the data easier to explore later)
df = df.reindex(
    columns=[
        "Hotel_Name",
        "Hotel_Address",
        "Total_Number_of_Reviews",
        "Average_Score",
        "Reviewer_Score",
        "Negative_Sentiment",
        "Positive_Sentiment",
        "Reviewer_Nationality",
        "Leisure_trip",
        "Couple",
        "Solo_traveler",
        "Business_trip",
        "Group",
        "Family_with_young_children",
        "Family_with_older_children",
        "With_a_pet",
        "Negative_Review",
        "Positive_Review",
    ]
)
|
||||||
|
|
||||||
|
# Write the processed frame to CSV, leaving out the DataFrame index
print("Saving results to Hotel_Reviews_NLP.csv")
df.to_csv("Hotel_Reviews_NLP.csv", index=False)
|
Loading…
Reference in new issue