spaCy/examples/twitter_filter.py

from __future__ import unicode_literals, print_function
import plac
import codecs
import sys
import math

import spacy.en
from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ

from termcolor import colored
from twython import TwythonStreamer

from os import path
from math import sqrt

from numpy import dot
from numpy.linalg import norm


class Meaning(object):
    """Mean of a collection of vectors, comparable by cosine similarity.

    Attributes:
        vector: element-wise average of the input vectors, or None when no
            vectors were supplied.
        norm: result of ``numpy.linalg.norm`` on ``vector``; 0 when there is
            no vector.
    """

    def __init__(self, vectors):
        # Nothing to average: leave a sentinel that `similarity` recognises.
        if not vectors:
            self.vector = None
            self.norm = 0
            return
        self.vector = sum(vectors) / len(vectors)
        self.norm = norm(self.vector)

    @classmethod
    def from_path(cls, nlp, loc):
        """Build a Meaning from the whitespace-separated terms in file `loc`.

        The file is decoded as UTF-8 and the terms are resolved through
        `from_terms`.
        """
        with codecs.open(loc, 'r', 'utf8') as handle:
            contents = handle.read()
        return cls.from_terms(nlp, contents.strip().split())

    @classmethod
    def from_tokens(cls, nlp, tokens):
        """Build a Meaning from the `repvec` of every token in `tokens`.

        `nlp` is accepted for signature parity with the other constructors
        and is not used here.
        """
        return cls([token.repvec for token in tokens])

    @classmethod
    def from_terms(cls, nlp, examples):
        """Build a Meaning from strings, each looked up in `nlp.vocab`."""
        # Resolve every term first, then read the vectors off the results.
        entries = [nlp.vocab[term] for term in examples]
        term_vectors = [entry.repvec for entry in entries]
        return cls(term_vectors)

    def similarity(self, other):
        """Return the cosine similarity between this Meaning and `other`.

        Returns -1 when either side has a zero (falsy) norm, which includes
        a Meaning built from no vectors.
        """
        if self.norm and other.norm:
            scale = self.norm * other.norm
            return dot(self.vector, other.vector) / scale
        return -1


def print_colored(model, stream=sys.stdout):
    """Print the scores and text of a model that is a clean, non-rare match.

    Args:
        model: dict with the keys 'is_match', 'is_reject', 'is_rare',
            'match_score', 'reject_score', 'prob' and 'text'.
        stream: accepted but not used; output always goes through `print`.

    Nothing is printed unless the model is a match, is not a reject and is
    not rare.
    """
    if model['is_match']:
        color = 'green'
    else:
        color = 'red' if model['is_reject'] else 'grey'

    # Same test as "not rare and match and not reject", written as a guard.
    if model['is_rare'] or not model['is_match'] or model['is_reject']:
        return

    score_line = (
        colored('%.3f' % model['match_score'], 'green'),
        colored('%.3f' % model['reject_score'], 'red'),
        '%.5f' % model['prob'],
    )
    print(*score_line)
    # The colour name is emitted as a plain second field after the text repr.
    print(repr(model['text']), color)
    print('')


class TextMatcher(object):
    """Score a text against a target meaning and a reject meaning.

    Args:
        nlp: callable that turns a text into an iterable, sized document
            whose tokens expose `prob` and `repvec`.
        get_target: zero-argument callable returning the target `Meaning`.
        get_reject: zero-argument callable returning the reject `Meaning`.
        min_prob: texts whose `prob` falls below this are flagged as rare.
        min_match: minimum similarity to the target to count as a match.
        max_reject: similarity to the reject meaning at or above which the
            text counts as a reject.
    """

    def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
        self.nlp = nlp
        self.get_target = get_target
        self.get_reject = get_reject
        self.min_prob = min_prob
        self.min_match = min_match
        self.max_reject = max_reject

    def __call__(self, text):
        """Analyse `text` and return a dict describing how it scored.

        Returns:
            dict with keys 'text', 'prob', 'match_score', 'reject_score',
            'is_rare', 'is_match' and 'is_reject'.
        """
        tweet = self.nlp(text)
        n_tokens = len(tweet)

        # `prob` is the mean of exp(w.prob) over the tokens (w.prob is
        # presumably a log-probability -- confirm). An empty document has no
        # tokens to average, so report 0.0 instead of dividing by zero.
        # NOTE(review): a mean of exponentials is never negative, so a
        # negative `min_prob` never flags a text as rare.
        if n_tokens:
            prob = sum(math.exp(w.prob) for w in tweet) / n_tokens
        else:
            prob = 0.0

        meaning = Meaning.from_tokens(self.nlp, tweet)

        # Fetch each reference meaning once per text; the getters may be
        # expensive (e.g. re-reading a file on every call).
        target = self.get_target()
        reject = self.get_reject()
        match_score = meaning.similarity(target)
        reject_score = meaning.similarity(reject)

        is_rare = prob < self.min_prob
        return {
            'text': tweet.string,
            'prob': prob,
            'match_score': match_score,
            'reject_score': reject_score,
            'is_rare': is_rare,
            'is_match': prob >= self.min_prob and match_score >= self.min_match,
            'is_reject': prob >= self.min_prob and reject_score >= self.max_reject,
        }


class Connection(TwythonStreamer):
    """Twitter stream client that analyses each status and displays the result.

    Args:
        keys_dir: directory holding the credential files read by `Secrets`.
        handler: callable taking the status text and returning a model.
        view: callable taking `(model, stream)` that displays the model.
    """

    def __init__(self, keys_dir, handler, view):
        keys = Secrets(keys_dir)
        TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
        self.handler = handler
        self.view = view

    def on_success(self, data):
        """Run the handler on the message's 'text' field and show the result."""
        text = data.get('text', u'')
        # Twython returns either bytes or unicode, depending on tweet.
        # Normalise to unicode up front rather than catching TypeError from
        # the handler, which would also hide unrelated TypeErrors and then
        # try to decode text that is already unicode.
        if isinstance(text, bytes):
            text = text.decode('utf8')
        model = self.handler(text)
        # The view's second argument is its output stream, so pass stdout.
        self.view(model, sys.stdout)

    def on_error(self, status_code, data):
        """Print the status code of a failed stream request."""
        print(status_code)


class Secrets(object):
    """API credentials loaded from text files in a directory.

    Reads `key.txt`, `secret.txt`, `token.txt` and `token_secret.txt` from
    `key_dir` and stores their stripped contents on the attributes `key`,
    `secret`, `token` and `token_secret`.

    Raises:
        IOError / OSError: if any of the four files cannot be opened.
    """

    def __init__(self, key_dir):
        self.key = self._read(key_dir, 'key.txt')
        self.secret = self._read(key_dir, 'secret.txt')
        self.token = self._read(key_dir, 'token.txt')
        self.token_secret = self._read(key_dir, 'token_secret.txt')

    @staticmethod
    def _read(key_dir, filename):
        """Return the whitespace-stripped contents of `filename` in `key_dir`."""
        # Use a context manager so the file handle is closed deterministically.
        with open(path.join(key_dir, filename)) as file_:
            return file_.read().strip()


def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
    # Wire the pieces together: load the English pipeline, build a
    # TextMatcher whose target/reject meanings come from the term files at
    # `target_loc` / `reject_loc`, and stream statuses tracking `term`
    # through it, displaying results with `print_colored`.

    # The parser isn't needed for this demo, so skip it to save loading time.
    nlp = spacy.en.English(Parser=None)

    # Each loader goes back to its file on every call.
    def get_target():
        return Meaning.from_path(nlp, target_loc)

    def get_reject():
        return Meaning.from_path(nlp, reject_loc)

    matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)

    twitter = Connection(keys_dir, matcher, print_colored)
    twitter.statuses.filter(track=term)


# Script entry point: when run directly, hand `main` to plac, which calls it
# with arguments taken from the command line.
if __name__ == '__main__':
    plac.call(main)