diff --git a/examples/twitter_filter.py b/examples/twitter_filter.py
index f842acdd4..b6e4e4e83 100644
--- a/examples/twitter_filter.py
+++ b/examples/twitter_filter.py
@@ -1,140 +1,35 @@
+# encoding: utf8
 from __future__ import unicode_literals, print_function
 import plac
 import codecs
-import sys
-import math
+import pathlib
+import random
 
+import twython
 import spacy.en
-from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
-from termcolor import colored
-from twython import TwythonStreamer
-
-from os import path
-from math import sqrt
-
-from numpy import dot
-from numpy.linalg import norm
+import _handler
 
 
-class Meaning(object):
-    def __init__(self, vectors):
-        if vectors:
-            self.vector = sum(vectors) / len(vectors)
-            self.norm = norm(self.vector)
-        else:
-            self.vector = None
-            self.norm = 0
-
-    @classmethod
-    def from_path(cls, nlp, loc):
-        with codecs.open(loc, 'r', 'utf8') as file_:
-            terms = file_.read().strip().split()
-        return cls.from_terms(nlp, terms)
-
-    @classmethod
-    def from_tokens(cls, nlp, tokens):
-        vectors = [t.repvec for t in tokens]
-        return cls(vectors)
-
-    @classmethod
-    def from_terms(cls, nlp, examples):
-        lexemes = [nlp.vocab[eg] for eg in examples]
-        vectors = [eg.repvec for eg in lexemes]
-        return cls(vectors)
-
-    def similarity(self, other):
-        if not self.norm or not other.norm:
-            return -1
-        return dot(self.vector, other.vector) / (self.norm * other.norm)
-
-
-def print_colored(model, stream=sys.stdout):
-    if model['is_match']:
-        color = 'green'
-    elif model['is_reject']:
-        color = 'red'
-    else:
-        color = 'grey'
-
-    if not model['is_rare'] and model['is_match'] and not model['is_reject']:
-        match_score = colored('%.3f' % model['match_score'], 'green')
-        reject_score = colored('%.3f' % model['reject_score'], 'red')
-        prob = '%.5f' % model['prob']
-
-        print(match_score, reject_score, prob)
-        print(repr(model['text']), color)
-        print('')
-
-
-class TextMatcher(object):
-    def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
+class Connection(twython.TwythonStreamer):
+    def __init__(self, keys_dir, nlp, query):
+        keys_dir = pathlib.Path(keys_dir)
+        read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip()
+        api_key = map(read, ['key', 'secret', 'token', 'token_secret'])
+        twython.TwythonStreamer.__init__(self, *api_key)
         self.nlp = nlp
-        self.get_target = get_target
-        self.get_reject = get_reject
-        self.min_prob = min_prob
-        self.min_match = min_match
-        self.max_reject = max_reject
-
-    def __call__(self, text):
-        tweet = self.nlp(text)
-        target_terms = self.get_target()
-        reject_terms = self.get_reject()
-
-        prob = sum(math.exp(w.prob) for w in tweet) / len(tweet)
-        meaning = Meaning.from_tokens(self, tweet)
-
-        match_score = meaning.similarity(self.get_target())
-        reject_score = meaning.similarity(self.get_reject())
-        return {
-            'text': tweet.string,
-            'prob': prob,
-            'match_score': match_score,
-            'reject_score': reject_score,
-            'is_rare': prob < self.min_prob,
-            'is_match': prob >= self.min_prob and match_score >= self.min_match,
-            'is_reject': prob >= self.min_prob and reject_score >= self.max_reject
-        }
-
-
-class Connection(TwythonStreamer):
-    def __init__(self, keys_dir, handler, view):
-        keys = Secrets(keys_dir)
-        TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
-        self.handler = handler
-        self.view = view
+        self.query = query
 
     def on_success(self, data):
-        text = data.get('text', u'')
-        # Twython returns either bytes or unicode, depending on tweet.
-        # #APIshaming
-        try:
-            model = self.handler(text)
-        except TypeError:
-            model = self.handler(text.decode('utf8'))
-        status = self.view(model, sys.stdin)
-
-    def on_error(self, status_code, data):
-        print(status_code)
+        _handler.handle_tweet(self.nlp, data, self.query)
+        if random.random() >= 0.1:
+            reload(_handler)
 
 
-class Secrets(object):
-    def __init__(self, key_dir):
-        self.key = open(path.join(key_dir, 'key.txt')).read().strip()
-        self.secret = open(path.join(key_dir, 'secret.txt')).read().strip()
-        self.token = open(path.join(key_dir, 'token.txt')).read().strip()
-        self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip()
-
-
-def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
-    # We don't need the parser for this demo, so may as well save the loading time
-    nlp = spacy.en.English(Parser=None)
-    get_target = lambda: Meaning.from_path(nlp, target_loc)
-    get_reject = lambda: Meaning.from_path(nlp, reject_loc)
-    matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
-
-    twitter = Connection(keys_dir, matcher, print_colored)
-    twitter.statuses.filter(track=term)
+def main(keys_dir, term):
+    nlp = spacy.en.English()
+    twitter = Connection(keys_dir, nlp, term)
+    twitter.statuses.filter(track=term, language='en')
 
 
 if __name__ == '__main__':