Merge remote-tracking branch 'refs/remotes/honnibal/master'

This commit is contained in:
maxirmx 2015-10-15 12:10:23 +03:00
commit 23475360e4
3 changed files with 49 additions and 20 deletions

37
examples/_handler.py Normal file
View File

@ -0,0 +1,37 @@
# encoding: utf8
from __future__ import unicode_literals, print_function
from math import sqrt
from numpy import dot
from numpy.linalg import norm
def handle_tweet(spacy, tweet_data, query):
    """Extract the text of one tweet and hand it to ``match_tweet``.

    Args:
        spacy: Pipeline object, forwarded unchanged to ``match_tweet``.
        tweet_data: Mapping describing one tweet.  Its ``'text'`` entry is
            used; an empty string is used when the key is absent.
        query: Search term, forwarded unchanged to ``match_tweet``.
    """
    text = tweet_data.get('text', u'')
    # Twython returns either bytes or unicode, depending on tweet.
    # ಠ_ಠ #APIshaming
    # Decode bytes up front instead of retrying after a TypeError: guarding
    # the whole match with ``except TypeError`` also caught unrelated type
    # errors and then called ``.decode`` on text that was already unicode.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    match_tweet(spacy, text, query)
def match_tweet(spacy, text, query):
    """Print ``text`` when its words lean towards the accepted topic.

    Every alphabetic token whose ``lower_`` form differs from ``query`` is
    compared, via ``cos`` on the ``repvec`` vectors, with a fixed set of
    "accept" words and a fixed set of "reject" words.  Negative similarities
    are clamped to zero.  The text is printed when at least half of the
    accumulated similarity falls on the accept side.

    Args:
        spacy: Callable pipeline.  ``spacy(text)`` must yield tokens exposing
            ``is_alpha``, ``lower_`` and ``repvec``; ``spacy.vocab[word]``
            must expose ``repvec``.
        text: The tweet text to analyse.
        query: Term excluded from the comparison (matched against each
            token's ``lower_``).
    """
    def get_vector(word):
        return spacy.vocab[word].repvec

    tweet = [w.repvec for w in spacy(text)
             if w.is_alpha and w.lower_ != query]
    if not tweet:
        return

    # Build real lists: on Python 3 ``map`` yields a one-shot iterator, which
    # the nested loops below would exhaust after the first tweet word.
    accept = [get_vector(word) for word in 'child classroom teach'.split()]
    reject = [get_vector(word) for word in 'mouth hands giveaway'.split()]

    y = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in accept)
    n = sum(max(cos(w1, w2), 0) for w1 in tweet for w2 in reject)

    # With no positive similarity on either side the ratio is undefined, so
    # the tweet is treated as a non-match instead of dividing by zero.
    total = y + n
    if total > 0 and y / total >= 0.5:
        print(text)
def cos(v1, v2):
    """Return the cosine similarity of the vectors ``v1`` and ``v2``.

    Args:
        v1: First vector (anything ``numpy.dot`` / ``numpy.linalg.norm``
            accept).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either
        vector has zero norm.
    """
    denominator = norm(v1) * norm(v2)
    # A zero-length vector has no direction; returning 0.0 avoids the 0/0
    # that would otherwise produce nan and propagate through later sums.
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator

View File

@ -248,24 +248,16 @@ cdef class Token:
# `conjuncts` property getter on Token: gathers words conjoined with this token.
# NOTE(review): this span is a merge-diff hunk whose removed and added lines
# appear together, without +/- markers and without indentation. Confirm
# against the real file which of the statements below actually survive.
property conjuncts:
def __get__(self):
"""Get a list of conjoined words"""
"""Get a list of conjoined words."""
cdef Token word
conjs = []
# Presumably the earlier, part-of-speech based variant: scans self.lefts in
# reverse for a CONJ token and collects words whose pos equals this token's.
if self.c.pos != CONJ and self.c.pos != PUNCT:
seen_conj = False
for word in reversed(list(self.lefts)):
if word.c.pos == CONJ:
seen_conj = True
elif seen_conj and word.c.pos == self.c.pos:
conjs.append(word)
conjs.reverse()
conjs.append(self)
if seen_conj:
return conjs
elif self is not self.head and self in self.head.conjuncts:
return self.head.conjuncts
else:
return []
# Presumably the dependency-label based variant: walks self.rights and follows
# words whose dep_ is 'conj', recursing into each such word's own conjuncts.
# NOTE(review): both a generator form (yield) and a list form (append/extend)
# are shown here; no return of `conjuncts` is visible in this hunk.
conjuncts = []
if self.dep_ != 'conj':
for word in self.rights:
if word.dep_ == 'conj':
yield word
yield from word.conjuncts
conjuncts.append(word)
conjuncts.extend(word.conjuncts)
property ent_type:
def __get__(self):

View File

@ -7,6 +7,6 @@ def test_space_attachment(EN):
sentence = 'This is a test.\nTo ensure spaces are attached well.'
doc = EN(sentence)
for word in doc:
if word.is_space:
assert word.head.i == (word.i - 1)
for sent in doc.sents:
if len(sent) == 1:
assert not sent[-1].is_space