spaCy/docs/redesign/tute_twitter.jade

205 lines
7.7 KiB
Plaintext

doctype html
html(lang='en')
head
meta(charset='utf-8')
title spaCy Blog
meta(name='description', content='')
meta(name='author', content='Matthew Honnibal')
link(rel='stylesheet', href='css/style.css')
//if lt IE 9
script(src='http://html5shiv.googlecode.com/svn/trunk/html5.js')
body#blog
header(role='banner')
h1.logo spaCy Blog
.slogan Blog
main#content(role='main')
article.post
header
h2 Finding Relevant Tweets
.subhead
| by
a(href='#', rel='author') Matthew Honnibal
| on
time(datetime='2015-08-14') August 14, 2015
details
summary: h4 Imports
pre.language-python
| from __future__ import unicode_literals, print_function
| import plac
| import codecs
| import sys
| import math
|
| import spacy.en
| from spacy.parts_of_speech import VERB, NOUN, ADV, ADJ
|
| from termcolor import colored
| from twython import TwythonStreamer
|
| from os import path
| from math import sqrt
|
| from numpy import dot
| from numpy.linalg import norm
|
|
details
summary: h4 Simple vector-averaging similarity
pre.language-python: code
| class Meaning(object):
|     """Averaged word-vector representation of a group of tokens or terms.
|
|     ``vector`` is the mean of the supplied vectors and ``norm`` is its
|     length as computed by ``numpy.linalg.norm``.  When no vectors are
|     supplied, ``vector`` is ``None`` and ``norm`` is ``0``.
|     """
|
|     def __init__(self, vectors):
|         if not vectors:
|             self.vector = None
|             self.norm = 0
|             return
|         self.vector = sum(vectors) / len(vectors)
|         self.norm = norm(self.vector)
|
|     @classmethod
|     def from_path(cls, nlp, loc):
|         """Build a Meaning from the whitespace-separated terms in the UTF-8 file at ``loc``."""
|         with codecs.open(loc, 'r', 'utf8') as term_file:
|             contents = term_file.read()
|         return cls.from_terms(nlp, contents.strip().split())
|
|     @classmethod
|     def from_tokens(cls, nlp, tokens):
|         """Build a Meaning from the ``repvec`` of every token; ``nlp`` is not used here."""
|         return cls([token.repvec for token in tokens])
|
|     @classmethod
|     def from_terms(cls, nlp, examples):
|         """Build a Meaning from the ``repvec`` of each term looked up in ``nlp.vocab``."""
|         return cls([nlp.vocab[term].repvec for term in examples])
|
|     def similarity(self, other):
|         """Return the cosine similarity with ``other``, or -1 when either norm is zero."""
|         if self.norm and other.norm:
|             return dot(self.vector, other.vector) / (self.norm * other.norm)
|         return -1
|
details
summary: h4 Print matches
pre.language-python: code
|
| def print_colored(model, stream=sys.stdout):
|     """Print the scores and text of a tweet that matched and was not rejected.
|
|     ``model`` is a dict with the keys 'is_match', 'is_reject', 'is_rare',
|     'match_score', 'reject_score', 'prob' and 'text'.  Nothing is printed
|     for rare, non-matching or rejected tweets.
|
|     NOTE(review): ``stream`` is accepted but never used; everything is
|     written with plain ``print`` calls.
|     """
|     if model['is_match']:
|         label = 'green'
|     else:
|         label = 'red' if model['is_reject'] else 'grey'
|
|     # Only common, matching, non-rejected tweets are shown.
|     if model['is_rare'] or not model['is_match'] or model['is_reject']:
|         return
|
|     scores = (
|         colored('%.3f' % model['match_score'], 'green'),
|         colored('%.3f' % model['reject_score'], 'red'),
|         '%.5f' % model['prob'],
|     )
|     print(*scores)
|     # The colour name is printed as a plain word after the text's repr.
|     print(repr(model['text']), label)
|     print('')
|
|
details
summary: h4 TextMatcher: Process the tweets using spaCy
pre.language-python: code
| class TextMatcher(object):
|     """Score a text against a target meaning and a reject meaning.
|
|     ``nlp`` is called on each text to produce the tokens.  ``get_target``
|     and ``get_reject`` are zero-argument callables returning the
|     ``Meaning`` objects to compare against; they are called once per text.
|     ``min_prob``, ``min_match`` and ``max_reject`` are the thresholds used
|     for the 'is_rare', 'is_match' and 'is_reject' flags.
|     """
|
|     def __init__(self, nlp, get_target, get_reject, min_prob, min_match, max_reject):
|         self.nlp = nlp
|         self.get_target = get_target
|         self.get_reject = get_reject
|         self.min_prob = min_prob
|         self.min_match = min_match
|         self.max_reject = max_reject
|
|     def __call__(self, text):
|         """Return a dict describing ``text``.
|
|         Keys: 'text', 'prob', 'match_score', 'reject_score', 'is_rare',
|         'is_match' and 'is_reject'.
|         """
|         tweet = self.nlp(text)
|         # Fetch each reference meaning once and reuse it below.
|         target = self.get_target()
|         reject = self.get_reject()
|
|         # Mean of exp(w.prob) over the tokens; an empty document has no
|         # tokens to average over, so it scores 0.0 instead of dividing by zero.
|         # NOTE(review): main() defaults min_prob to -20, which is on a
|         # different scale from a mean of exponentials -- confirm the intent.
|         n_tokens = len(tweet)
|         if n_tokens:
|             prob = sum(math.exp(w.prob) for w in tweet) / n_tokens
|         else:
|             prob = 0.0
|         meaning = Meaning.from_tokens(self.nlp, tweet)
|
|         match_score = meaning.similarity(target)
|         reject_score = meaning.similarity(reject)
|         common_enough = prob >= self.min_prob
|         return {
|             'text': tweet.string,
|             'prob': prob,
|             'match_score': match_score,
|             'reject_score': reject_score,
|             'is_rare': prob < self.min_prob,
|             'is_match': common_enough and match_score >= self.min_match,
|             'is_reject': common_enough and reject_score >= self.max_reject
|         }
|
|
details
summary: h4 Connect to Twitter and stream tweets
pre.language-python: code
| class Connection(TwythonStreamer):
|     """Twitter streaming client that passes each tweet to a handler and a view.
|
|     ``keys_dir`` is the directory ``Secrets`` reads the credentials from.
|     ``handler`` is called with the tweet text, and ``view`` is called with
|     the handler's result and an output stream.
|     """
|
|     def __init__(self, keys_dir, handler, view):
|         keys = Secrets(keys_dir)
|         TwythonStreamer.__init__(self, keys.key, keys.secret, keys.token, keys.token_secret)
|         self.handler = handler
|         self.view = view
|
|     def on_success(self, data):
|         """Process one streamed message; a missing 'text' is treated as empty."""
|         text = data.get('text', u'')
|         # Twython returns either bytes or unicode, depending on tweet.
|         # #APIshaming
|         # Decode once up front rather than retrying the handler on TypeError,
|         # so unrelated TypeErrors raised inside the handler are not masked.
|         if isinstance(text, bytes):
|             text = text.decode('utf8')
|         model = self.handler(text)
|         # The view writes output, so give it stdout rather than stdin.
|         self.view(model, sys.stdout)
|
|     def on_error(self, status_code, data):
|         """Print the status code of a failed request."""
|         print(status_code)
|
|
| class Secrets(object):
|     """Twitter API credentials read from text files inside ``key_dir``.
|
|     Reads 'key.txt', 'secret.txt', 'token.txt' and 'token_secret.txt' and
|     stores their contents, stripped of surrounding whitespace, as ``key``,
|     ``secret``, ``token`` and ``token_secret``.
|     """
|
|     def __init__(self, key_dir):
|         self.key = self._read(key_dir, 'key.txt')
|         self.secret = self._read(key_dir, 'secret.txt')
|         self.token = self._read(key_dir, 'token.txt')
|         self.token_secret = self._read(key_dir, 'token_secret.txt')
|
|     @staticmethod
|     def _read(key_dir, filename):
|         """Return the stripped contents of ``filename`` inside ``key_dir``."""
|         # The with-block closes the file handle as soon as it has been read.
|         with open(path.join(key_dir, filename)) as file_:
|             return file_.read().strip()
|
|
details
summary: h4 Command-line interface
pre.language-python: code
| def main(keys_dir, term, target_loc, reject_loc, min_prob=-20, min_match=0.8, max_reject=0.5):
|     """Stream tweets that track ``term`` and print the ones that match.
|
|     ``keys_dir`` holds the Twitter credential files.  ``target_loc`` and
|     ``reject_loc`` are paths to the term files for the target and reject
|     meanings.  ``min_prob``, ``min_match`` and ``max_reject`` are the
|     thresholds handed to ``TextMatcher``.
|     """
|     # Values given on the command line arrive as strings (plac only converts
|     # annotated arguments), so coerce the thresholds before comparing them.
|     min_prob = float(min_prob)
|     min_match = float(min_match)
|     max_reject = float(max_reject)
|     # We don't need the parser for this demo, so may as well save the loading time
|     nlp = spacy.en.English(Parser=None)
|     # The term files are re-read on every call to these getters.
|     get_target = lambda: Meaning.from_path(nlp, target_loc)
|     get_reject = lambda: Meaning.from_path(nlp, reject_loc)
|     matcher = TextMatcher(nlp, get_target, get_reject, min_prob, min_match, max_reject)
|
|     twitter = Connection(keys_dir, matcher, print_colored)
|     twitter.statuses.filter(track=term)
|
|
| if __name__ == '__main__':
|     plac.call(main)
|
footer(role='contentinfo')
script(src='js/prism.js')