* Begin rewriting twitter_filter examples

2015-08-22 22:12:26 +02:00 · 2015-08-22 22:12:26 +02:00 · 692a8d3e3c
parent f9a6bea746
commit 692a8d3e3c
1 changed file with 19 additions and 124 deletions
--- a/examples/twitter_filter.py
+++ b/examples/twitter_filter.py
@@ -1,140 +1,35 @@
+# encoding: utf8
 from __future__ import unicode_literals, print_function
 import plac
 import codecs
-import sys
-import math
+import pathlib
+import random

+import twython
 import spacy.en
-from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ

-from termcolor import colored
-from twython import TwythonStreamer
-
-from os import path
-from math import sqrt
-
-from numpy import dot
-from numpy.linalg import norm
+import _handler


-class Meaning(object):
-    def __init__(self, vectors):
-        if vectors:
-            self.vector = sum(vectors) / len(vectors)
-            self.norm = norm(self.vector)
-        else:
-            self.vector = None
-            self.norm = 0
-
-    @classmethod
-    def from_path(cls, nlp, loc):
-        with codecs.open(loc, 'r', 'utf8') as file_:
-            terms = file_.read().strip().split()
-        return cls.from_terms(nlp, terms)
-
-    @classmethod
-    def from_tokens(cls, nlp, tokens):
-        vectors = [t.repvec for t in tokens]
-        return cls(vectors)
-
-    @classmethod
-    def from_terms(cls, nlp, examples):
-        lexemes = [nlp.vocab[eg] for eg in examples]
-        vectors = [eg.repvec for eg in lexemes]
-        return cls(vectors)
-
-    def similarity(self, other):
-        if not self.norm or not other.norm:
-            return -1
-        return dot(self.vector, other.vector) / (self.norm * other.norm)
-
-
-def print_colored(model, stream=sys.stdout):
-    if model['is_match']:
-        color = 'green'
-    elif model['is_reject']:
-        color = 'red'
-    else:
-        color = 'grey'
-    
-    if not model['is_rare'] and model['is_match'] and not model['is_reject']:
-        match_score = colored('%.3f' % model['match_score'], 'green')
-        reject_score = colored('%.3f' % model['reject_score'], 'red')
-        prob = '%.5f' % model['prob']
-
-        print(match_score, reject_score, prob)
-        print(repr(model['text']), color)
-        print('')
-
-
-class TextMatcher(object):
-    def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
+class Connection(twython.TwythonStreamer):
+    def __init__(self, keys_dir, nlp, query):
+        keys_dir = pathlib.Path(keys_dir)
+        read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip()
+        api_key = map(read, ['key', 'secret', 'token', 'token_secret'])
+        twython.TwythonStreamer.__init__(self, *api_key)
        self.nlp = nlp
-        self.get_target = get_target
-        self.get_reject = get_reject
-        self.min_prob = min_prob
-        self.min_match = min_match
-        self.max_reject = max_reject
-
-    def __call__(self, text):
-        tweet = self.nlp(text)
-        target_terms = self.get_target()
-        reject_terms = self.get_reject()
-
-        prob = sum(math.exp(w.prob) for w in tweet) / len(tweet)
-        meaning = Meaning.from_tokens(self, tweet)
-        
-        match_score = meaning.similarity(self.get_target())
-        reject_score = meaning.similarity(self.get_reject())
-        return {
-            'text': tweet.string,
-            'prob': prob,
-            'match_score': match_score,
-            'reject_score': reject_score,
-            'is_rare': prob < self.min_prob,
-            'is_match': prob >= self.min_prob  and match_score  >= self.min_match,
-            'is_reject': prob >= self.min_prob and reject_score >= self.max_reject
-        }
-
-
-class Connection(TwythonStreamer):
-    def __init__(self, keys_dir, handler, view):
-        keys = Secrets(keys_dir)
-        TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret) 
-        self.handler = handler
-        self.view = view
+        self.query = query

    def on_success(self, data):
-        text = data.get('text', u'')
-        # Twython returns either bytes or unicode, depending on tweet.
-        # #APIshaming
-        try:
-            model = self.handler(text)
-        except TypeError:
-            model = self.handler(text.decode('utf8'))
-        status = self.view(model, sys.stdin)
-
-    def on_error(self, status_code, data):
-        print(status_code)
+        _handler.handle_tweet(self.nlp, data, self.query)
+        if random.random() >= 0.1:
+            reload(_handler)


-class Secrets(object):
-    def __init__(self, key_dir):
-        self.key = open(path.join(key_dir, 'key.txt')).read().strip()
-        self.secret = open(path.join(key_dir, 'secret.txt')).read().strip()
-        self.token = open(path.join(key_dir, 'token.txt')).read().strip()
-        self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip()
-
-
-def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
-    # We don't need the parser for this demo, so may as well save the loading time
-    nlp = spacy.en.English(Parser=None)
-    get_target = lambda: Meaning.from_path(nlp, target_loc)
-    get_reject = lambda: Meaning.from_path(nlp, reject_loc)
-    matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
-
-    twitter = Connection(keys_dir, matcher, print_colored)
-    twitter.statuses.filter(track=term)
+def main(keys_dir, term):
+    nlp = spacy.en.English()
+    twitter = Connection(keys_dir, nlp, term)
+    twitter.statuses.filter(track=term, language='en')


 if __name__ == '__main__':