from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
import tweepy
import json
import pandas as pd
import csv
import numpy as np
import re
from textblob import TextBlob
import string
import preprocessor as pp
import os
import time
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import wikipedia
import spacy
import nltk
import en_core_web_lg

nlp = en_core_web_lg.load()
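# en_core_web_lg is the large English model and ships with static word vectors,
# which Doc.similarity() below relies on; the small models do not include real
# vectors, so similarity scores from them are far less meaningful.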

plt.style.use('fivethirtyeight')

consumer_key = 'vbmjXqywZLhmv9558pByted7H7Q'
consumer_secret = 'tVh6UXfW2atSX7665PylXwxQwhbKUWS7nwo8YvjlgnsdOgmItRx3y'
#access_key = '477407786-2Q5Gx2'
#access_secret = 'A4sesXVpq44zH'

auth = tweepy.AppAuthHandler(consumer_key, consumer_secret)
#auth = OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
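# App-only auth gives a higher search rate limit than user auth but cannot post
# or access user-specific endpoints. wait_on_rate_limit makes the Cursor below
# sleep through rate-limit windows instead of raising, and the notify flag prints
# a message when that happens (tweepy 3.x behaviour).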

'''
By default spaCy calculates cosine similarity. Similarity is determined by comparing
word vectors ("word embeddings"): multi-dimensional meaning representations of a word.
It returns numpy.dot(self.vector, other.vector) / (self_norm * other_norm).
'''
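# Illustrative sketch only (scores are approximate and model-dependent):
#   nlp("economy").similarity(nlp("market"))   # related terms -> relatively high
#   nlp("economy").similarity(nlp("giraffe"))  # unrelated terms -> noticeably lower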

# Removing stopwords using a for loop
def remove_stopwords(text):
    doc = nlp(text.lower())  #1 tokenise the lower-cased text
    result = []  #2 tokens that survive filtering
    for token in doc:  #3 walk through every token
        if token.text in nlp.Defaults.stop_words:  #4 drop spaCy's default stopwords
            continue
        result.append(token.text)  #5 keep everything else
    return " ".join(result)  #6 rebuild a single string


def cleanTxt(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text)   # Removing @mentions (plain '-', not an en dash)
    text = re.sub(r'#', '', text)                # Removing the '#' symbol
    text = re.sub(r'RT[\s]+', '', text)          # Removing RT
    text = re.sub(r'https?:\/\/\S+', '', text)   # Removing hyperlinks
    return text
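# Rough sanity check (hypothetical input):
#   cleanTxt('RT @user check #news https://t.co/x')  ->  'check news ' (modulo leftover whitespace)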

# Remove stopwords, punctuation, and pronouns
def process_text(text):
    doc = nlp(text.lower())
    result = []
    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        if token.is_punct:
            continue
        if token.lemma_ == '-PRON-':
            continue
        result.append(token.lemma_)
    return " ".join(result)

# Removing stopwords using a list comprehension
def remove_stopwords_fast(text):
    doc = nlp(text.lower())
    result = [token.text for token in doc if token.text not in nlp.Defaults.stop_words]
    return " ".join(result)

'''
# Without lemmatization
def remove_pronoun(text):
    doc = nlp(text.lower())
    result = [token.text for token in doc if token.lemma_ != '-PRON-']
    return " ".join(result)

# With lemmatization
def remove_pronoun(text):
    doc = nlp(text.lower())
    result = [token.lemma_ for token in doc if token.lemma_ != '-PRON-']
    return " ".join(result)
'''

def wikiPedia(searchWord):
    result = wikipedia.summary(searchWord)
    result = cleanTxt(result)
    result = remove_stopwords_fast(result)
    result = process_text(result)
    result = nlp(result)
    return result
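# The cleaned Wikipedia summary becomes the reference Doc: each tweet, cleaned the
# same way, is later compared against it with Doc.similarity() to get the 'Semantic' score.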


def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity


def getPolarity(text):
    return TextBlob(text).sentiment.polarity


def getAnalysis(score):
    if score < 0:
        return 'Negative'
    elif score == 0:
        return 'Neutral'
    else:
        return 'Positive'
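# TextBlob reports polarity in [-1.0, 1.0] and subjectivity in [0.0, 1.0];
# getAnalysis buckets the polarity into the three sentiment labels used below.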


def scraptweets(search_words, date_since, numTweets, numRuns, res):
    # Define a pandas dataframe to store the data:
    db_tweets = pd.DataFrame(
        columns=['username', 'acctdesc', 'verified', 'location', 'following', 'followers', 'totaltweets',
                 'usercreatedts', 'tweetcreatedts', 'retweetcount', 'text', 'hashtags', 'Subjectivity', 'Polarity',
                 'Analysis', 'Semantic'])
    program_start = time.time()
    doc1 = wikiPedia(search_words)
    for i in range(0, numRuns):
        # We will time how long it takes to scrape tweets for each run:
        start_run = time.time()

        tweets = tweepy.Cursor(api.search, q=search_words, lang="en", result_type=res, since=date_since,
                               tweet_mode='extended').items(numTweets)
        # Store these tweets in a python list
        tweet_list = [tweet for tweet in tweets]
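        # Materialising the Cursor into a list pays the whole API cost up front;
        # with wait_on_rate_limit=True this line may simply pause when the search
        # quota for the current window is exhausted.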

        noTweets = 0
        for tweet in tweet_list:
            # Pull the values
            username = tweet.user.screen_name
            acctdesc = tweet.user.description
            verified = tweet.user.verified
            location = tweet.user.location
            following = tweet.user.friends_count
            followers = tweet.user.followers_count
            totaltweets = tweet.user.statuses_count
            usercreatedts = tweet.user.created_at
            tweetcreatedts = tweet.created_at
            retweetcount = tweet.retweet_count
            hashtags = tweet.entities['hashtags']
            try:
                text = tweet.retweeted_status.full_text
            except AttributeError:  # Not a Retweet
                text = tweet.full_text

            # Clean the tweet once and reuse it for every score below
            cleaned = cleanTxt(process_text(text))
            Subjectivity = getSubjectivity(cleaned)
            Polarity = getPolarity(cleaned)
            Analysis = getAnalysis(Polarity)
            Semantic = doc1.similarity(nlp(cleaned))
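            # Semantic: cosine similarity between the cleaned tweet and the Wikipedia
            # reference Doc; a tweet left empty after cleaning has no vector, so spaCy
            # may warn and report a similarity of 0.0 for it.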

            ith_tweet = [username, acctdesc, verified, location, following, followers, totaltweets,
                         usercreatedts, tweetcreatedts, retweetcount, text, hashtags, Subjectivity, Polarity,
                         Analysis, Semantic]
            db_tweets.loc[len(db_tweets)] = ith_tweet
            noTweets += 1

        # Run ended:
        end_run = time.time()
        duration_run = round((end_run - start_run) / 60, 2)

        print('no. of tweets scraped for run {} is {}'.format(i + 1, noTweets))
        print('time taken for run {} to complete is {} mins'.format(i + 1, duration_run))

    from datetime import datetime

    # Sort tweets by semantic similarity, most similar first (NaNs placed first)
    db_tweets.sort_values("Semantic", axis=0, ascending=False,
                          inplace=True, na_position='first')

    to_csv_timestamp = datetime.today().strftime('%Y%m%d_%H%M%S')
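    # e.g. '20230115_174502'; used as the CSV filename prefix below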

    # Scatter plot of polarity vs. subjectivity
    plt.figure(figsize=(8, 6))
    for i in range(0, db_tweets.shape[0]):
        plt.scatter(db_tweets["Polarity"][i], db_tweets["Subjectivity"][i], color='Red')

    plt.title('Sentiment Analysis')
    plt.xlabel('Polarity')
    plt.ylabel('Subjectivity')
    plt.show()
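    # Note: a single plt.scatter(db_tweets['Polarity'], db_tweets['Subjectivity'], color='Red')
    # call would draw the same figure without the Python-level loop.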

    # Count v/s Sentiment
    ptweets = db_tweets[db_tweets.Analysis == 'Positive']
    ptweets = ptweets['text']
    print(ptweets)  # positive tweets
    # Share of positive tweets (percent)
    print(round((ptweets.shape[0] / db_tweets.shape[0]) * 100, 1))

    ntweets = db_tweets[db_tweets.Analysis == 'Negative']
    ntweets = ntweets['text']
    print(ntweets)  # negative tweets
    # Share of negative tweets (percent)
    print(round((ntweets.shape[0] / db_tweets.shape[0]) * 100, 1))

    print(db_tweets['Analysis'].value_counts())

    plt.title('Sentiment Analysis')
    plt.xlabel('Sentiment')
    plt.ylabel('Counts')
    db_tweets['Analysis'].value_counts().plot(kind='bar')
    plt.show()

    # Word Cloud
    allWords = ' '.join([cleanTxt(process_text(twts)) for twts in db_tweets['text']])
    wordCloud = WordCloud(width=500, height=300, random_state=21, max_font_size=110).generate(allWords)
    plt.imshow(wordCloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

    filename = to_csv_timestamp + 'tweets.csv'
    db_tweets.to_csv(filename, index=False)
    program_end = time.time()
    print('Scraping has completed!')
    print('Total time taken to scrape is {} minutes.'.format(round((program_end - program_start) / 60, 2)))


# Initialise these variables:
print('Enter your query (include the # if it is a hashtag; join multiple queries with OR)')
search_words = input()
print('Date since, in YYYY-MM-DD format')
date_since = input()
print('Enter the number of tweets to be scraped for analysis')
numTweets = int(input())
print('Enter the number of runs')
numRuns = int(input())
print('Enter result type: mixed or popular')
res = input()

# Call the function scraptweets
scraptweets(search_words, date_since, numTweets, numRuns, res)