Source code for bs_ds.saywhat

# -*- coding: utf-8 -*-

"""A collection of language processing tools."""

# import nltk
# nltk.download('stopwords','punkt')

# from nltk.corpus import stopwords
# import string
# from nltk import word_tokenize, FreqDist
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics import accuracy_score
# from sklearn.datasets import fetch_20newsgroups
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.naive_bayes import MultinomialNB
# import pandas as pd
# import numpy as np
# np.random.seed(0)

def make_stopwords(punctuation=True):
    """Makes and returns a stopwords_list for English, combined with punctuation (default)."""
    import nltk
    from nltk.corpus import stopwords
    try:
        stopwords.words('english')
    except LookupError:
        nltk.download('stopwords', quiet=True)
    import string
    stopwords_list = stopwords.words('english')
    if punctuation:
        stopwords_list += list(string.punctuation)
        stopwords_list += ["''", '""', '...', '``']
    return stopwords_list
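
# A minimal usage sketch (an illustration, not part of the original module):
# inspect the combined stopword list.
if __name__ == '__main__':
    stops = make_stopwords()
    print('the' in stops)   # True: standard english stopword
    print('!' in stops)     # True: punctuation is included by default
    print('``' in stops)    # True: extra tokenizer artifacts are appended
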
def process_article(article, stopwords_list=make_stopwords()):
    """Tokenize an article and remove stopwords/punctuation.
    Source: Learn.Co Text Classification Lab"""
    # note: the default stopwords_list is built once, at import time
    import nltk
    tokens = nltk.word_tokenize(article)
    stopwords_removed = [token.lower() for token in tokens
                         if token.lower() not in stopwords_list]
    return stopwords_removed
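
# A short sketch of process_article on a sample sentence (the sentence is a
# hypothetical example; assumes the nltk 'punkt' tokenizer data is available,
# downloading it quietly if not).
if __name__ == '__main__':
    import nltk
    try:
        nltk.word_tokenize('test')
    except LookupError:
        nltk.download('punkt', quiet=True)
    tokens = process_article("The quick brown fox jumps over the lazy dog!")
    print(tokens)  # e.g. ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
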
class W2vVectorizer(object):
    """From Learn.co Text Classification with Word Embeddings Lab.
    An sklearn-compatible class containing the vectors for the fit Word2Vec."""

    def __init__(self, w2v, glove):
        # takes in a dictionary of words and vectors as input
        self.w2v = w2v
        if len(w2v) == 0:
            self.dimensions = 0
        else:
            # infer the vector dimensionality from any entry in the w2v dict
            self.dimensions = len(w2v[next(iter(w2v))])

    # Note from Mike: Even though it doesn't do anything, it's required that this
    # object implement a fit method or else it can't be used in an sklearn Pipeline.
    def fit(self, X, y):
        return self
    def transform(self, X):
        import numpy as np
        # Average the vectors of each document's in-vocabulary words; fall back
        # to a zero vector when none of the words are in the vocabulary.
        return np.array([
            np.mean([self.w2v[w] for w in words if w in self.w2v]
                    or [np.zeros(self.dimensions)], axis=0)
            for words in X])
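
# A hypothetical sketch of W2vVectorizer inside an sklearn Pipeline. The toy
# 2-dimensional vectors and the RandomForestClassifier pairing are assumptions
# for illustration, not part of this module.
if __name__ == '__main__':
    import numpy as np
    from sklearn.pipeline import Pipeline
    from sklearn.ensemble import RandomForestClassifier

    # toy word-vector dict standing in for a trained Word2Vec/GloVe vocabulary
    w2v = {'good': np.array([0.9, 0.1]), 'bad': np.array([0.1, 0.9])}
    pipe = Pipeline([('vec', W2vVectorizer(w2v, w2v)),
                     ('clf', RandomForestClassifier(n_estimators=10))])
    X = [['good', 'movie'], ['bad', 'film']]   # pre-tokenized documents
    y = [1, 0]
    pipe.fit(X, y)
    print(pipe.predict([['good']]))            # e.g. [1]
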
def connect_twitter_api(api_key, api_secret_key):
    """Use tweepy to connect to the Twitter API and return a tweepy API object."""
    import tweepy, sys
    auth = tweepy.AppAuthHandler(api_key, api_secret_key)
    api = tweepy.API(auth, wait_on_rate_limit=True,
                     wait_on_rate_limit_notify=True)
    if (not api):
        print("Can't authenticate.")
        sys.exit(-1)
    return api
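
# A connection sketch with placeholder credentials (the key strings are
# hypothetical). Note: wait_on_rate_limit_notify above exists in tweepy 3.x
# but was removed in tweepy 4.0, so this module assumes tweepy 3.x.
if __name__ == '__main__':
    API_KEY = 'YOUR-API-KEY'            # placeholder, not a real credential
    API_SECRET_KEY = 'YOUR-API-SECRET'  # placeholder, not a real credential
    api = connect_twitter_api(API_KEY, API_SECRET_KEY)
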
def search_twitter_api(api_object, searchQuery, maxTweets, fName, tweetsPerQry=100, max_id=0, sinceId=None):
    """Take an authenticated tweepy api_object, a search query, the max # of tweets
    to retrieve, and a destination filename. Calls api.search for the searchQuery
    until maxTweets is reached, saving harvested tweets to fName."""
    import sys, jsonpickle, os, tweepy
    api = api_object
    tweetCount = 0
    print(f'Downloading max {maxTweets} tweets for {searchQuery}...')
    with open(fName, 'a+') as f:
        while tweetCount < maxTweets:
            try:
                if (max_id <= 0):
                    if (not sinceId):
                        new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                                tweet_mode='extended')
                    else:
                        new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                                since_id=sinceId, tweet_mode='extended')
                else:
                    if (not sinceId):
                        new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                                max_id=str(max_id - 1), tweet_mode='extended')
                    else:
                        new_tweets = api.search(q=searchQuery, count=tweetsPerQry,
                                                max_id=str(max_id - 1), since_id=sinceId,
                                                tweet_mode='extended')
                if not new_tweets:
                    print('No more tweets found')
                    break
                for tweet in new_tweets:
                    f.write(jsonpickle.encode(tweet._json, unpicklable=False) + '\n')
                tweetCount += len(new_tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = new_tweets[-1].id
            except tweepy.TweepError as e:
                # Just exit if any error
                print("some error : " + str(e))
                break
    print("Downloaded {0} tweets, Saved to {1}\n".format(tweetCount, fName))
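
# A sketch of the full harvest workflow combining both helpers; the query,
# tweet count, and filename are hypothetical.
if __name__ == '__main__':
    API_KEY, API_SECRET_KEY = 'YOUR-API-KEY', 'YOUR-API-SECRET'  # placeholders
    api = connect_twitter_api(API_KEY, API_SECRET_KEY)
    search_twitter_api(api, searchQuery='#datascience', maxTweets=500,
                       fName='tweets.json', tweetsPerQry=100)
    # tweets.json now holds one JSON-encoded tweet per line; load it with
    # pandas.read_json('tweets.json', lines=True) for analysis.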