mirror of https://github.com/explosion/spaCy.git
* Begin rewriting twitter_filter examples
parent f9a6bea746
commit 692a8d3e3c
@@ -1,140 +1,35 @@
+# encoding: utf8
 from __future__ import unicode_literals, print_function
 import plac
 import codecs
-import sys
-import math
+import pathlib
+import random
 
+import twython
 import spacy.en
-from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
 
-from termcolor import colored
-from twython import TwythonStreamer
-
-from os import path
-from math import sqrt
-
-from numpy import dot
-from numpy.linalg import norm
+import _handler
 
 
-class Meaning(object):
-    def __init__(self, vectors):
-        if vectors:
-            self.vector = sum(vectors) / len(vectors)
-            self.norm = norm(self.vector)
-        else:
-            self.vector = None
-            self.norm = 0
-
-    @classmethod
-    def from_path(cls, nlp, loc):
-        with codecs.open(loc, 'r', 'utf8') as file_:
-            terms = file_.read().strip().split()
-        return cls.from_terms(nlp, terms)
-
-    @classmethod
-    def from_tokens(cls, nlp, tokens):
-        vectors = [t.repvec for t in tokens]
-        return cls(vectors)
-
-    @classmethod
-    def from_terms(cls, nlp, examples):
-        lexemes = [nlp.vocab[eg] for eg in examples]
-        vectors = [eg.repvec for eg in lexemes]
-        return cls(vectors)
-
-    def similarity(self, other):
-        if not self.norm or not other.norm:
-            return -1
-        return dot(self.vector, other.vector) / (self.norm * other.norm)
-
-
-def print_colored(model, stream=sys.stdout):
-    if model['is_match']:
-        color = 'green'
-    elif model['is_reject']:
-        color = 'red'
-    else:
-        color = 'grey'
-
-    if not model['is_rare'] and model['is_match'] and not model['is_reject']:
-        match_score = colored('%.3f' % model['match_score'], 'green')
-        reject_score = colored('%.3f' % model['reject_score'], 'red')
-        prob = '%.5f' % model['prob']
-
-        print(match_score, reject_score, prob)
-        print(repr(model['text']), color)
-        print('')
-
-
-class TextMatcher(object):
-    def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
+class Connection(twython.TwythonStreamer):
+    def __init__(self, keys_dir, nlp, query):
+        keys_dir = pathlib.Path(keys_dir)
+        read = lambda fn: (keys_dir / (fn + '.txt')).open().read().strip()
+        api_key = map(read, ['key', 'secret', 'token', 'token_secret'])
+        twython.TwythonStreamer.__init__(self, *api_key)
         self.nlp = nlp
-        self.get_target = get_target
-        self.get_reject = get_reject
-        self.min_prob = min_prob
-        self.min_match = min_match
-        self.max_reject = max_reject
-
-    def __call__(self, text):
-        tweet = self.nlp(text)
-        target_terms = self.get_target()
-        reject_terms = self.get_reject()
-
-        prob = sum(math.exp(w.prob) for w in tweet) / len(tweet)
-        meaning = Meaning.from_tokens(self, tweet)
-
-        match_score = meaning.similarity(self.get_target())
-        reject_score = meaning.similarity(self.get_reject())
-        return {
-            'text': tweet.string,
-            'prob': prob,
-            'match_score': match_score,
-            'reject_score': reject_score,
-            'is_rare': prob < self.min_prob,
-            'is_match': prob >= self.min_prob and match_score >= self.min_match,
-            'is_reject': prob >= self.min_prob and reject_score >= self.max_reject
-        }
-
-
-class Connection(TwythonStreamer):
-    def __init__(self, keys_dir, handler, view):
-        keys = Secrets(keys_dir)
-        TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
-        self.handler = handler
-        self.view = view
+        self.query = query
 
     def on_success(self, data):
-        text = data.get('text', u'')
-        # Twython returns either bytes or unicode, depending on tweet.
-        # #APIshaming
-        try:
-            model = self.handler(text)
-        except TypeError:
-            model = self.handler(text.decode('utf8'))
-        status = self.view(model, sys.stdin)
-
-    def on_error(self, status_code, data):
-        print(status_code)
-
-
-class Secrets(object):
-    def __init__(self, key_dir):
-        self.key = open(path.join(key_dir, 'key.txt')).read().strip()
-        self.secret = open(path.join(key_dir, 'secret.txt')).read().strip()
-        self.token = open(path.join(key_dir, 'token.txt')).read().strip()
-        self.token_secret = open(path.join(key_dir, 'token_secret.txt')).read().strip()
-
-
-def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
-    # We don't need the parser for this demo, so may as well save the loading time
-    nlp = spacy.en.English(Parser=None)
-    get_target = lambda: Meaning.from_path(nlp, target_loc)
-    get_reject = lambda: Meaning.from_path(nlp, reject_loc)
-    matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
-
-    twitter = Connection(keys_dir, matcher, print_colored)
-    twitter.statuses.filter(track=term)
+        _handler.handle_tweet(self.nlp, data, self.query)
+        if random.random() >= 0.1:
+            reload(_handler)
+
+
+def main(keys_dir, term):
+    nlp = spacy.en.English()
+    twitter = Connection(keys_dir, nlp, term)
+    twitter.statuses.filter(track=term, language='en')
 
 
 if __name__ == '__main__':
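
The rewritten example moves all per-tweet processing into a separate _handler module and re-imports it for roughly nine out of ten tweets (random.random() >= 0.1), so the handler code can be edited while the stream keeps running. That module is not part of this commit; the sketch below is a hypothetical stand-in that only assumes the call signature visible above, _handler.handle_tweet(nlp, data, query), and the spaCy attributes already used in the old code.

# _handler.py -- hypothetical sketch, not included in this commit.
# Only the handle_tweet(nlp, data, query) signature is taken from the diff above;
# the body is illustrative and reuses the w.prob / len(tokens) idiom from the old TextMatcher.
from __future__ import unicode_literals, print_function

import math


def handle_tweet(nlp, data, query):
    # Streaming messages that aren't tweets (e.g. delete notices) carry no text.
    text = data.get('text')
    if not text:
        return
    tokens = nlp(text)
    # Average unigram probability of the tweet, as in the old TextMatcher.__call__.
    prob = sum(math.exp(w.prob) for w in tokens) / len(tokens)
    print('%.5f %r %s' % (prob, query, text))

Because the reload happens after the call and only most of the time, an edit to _handler.py is picked up on a later tweet rather than the current one; reload here is the Python 2 builtin.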
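
The hunk shown above stops at the if __name__ == '__main__': line, so the body of that block is not visible. Both the old and new versions import plac, so it presumably calls plac.call(main); under that assumption, a minimal driver for the rewritten script might look like the snippet below. The module name twitter_filter and the keys/ directory are assumptions; the credential file names (key.txt, secret.txt, token.txt, token_secret.txt) come from the diff.

# Hypothetical usage: stream English tweets matching "spacy", reading Twitter
# credentials from keys/key.txt, keys/secret.txt, keys/token.txt and keys/token_secret.txt.
from twitter_filter import main

main('keys/', 'spacy')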