mirror of https://github.com/explosion/spaCy.git
Merge remote-tracking branch 'refs/remotes/honnibal/master'
This commit is contained in:
commit
23475360e4
|
@ -0,0 +1,37 @@
|
||||||
|
# encoding: utf8
|
||||||
|
from __future__ import unicode_literals, print_function
|
||||||
|
|
||||||
|
from math import sqrt
|
||||||
|
from numpy import dot
|
||||||
|
from numpy.linalg import norm
|
||||||
|
|
||||||
|
|
||||||
|
def handle_tweet(spacy, tweet_data, query):
    """Extract the text of one tweet and hand it to ``match_tweet``.

    Args:
        spacy: Callable pipeline, forwarded unchanged to ``match_tweet``.
        tweet_data: Mapping describing a single tweet. Only its ``'text'``
            entry is read; a missing entry is treated as an empty string.
        query: Search term, forwarded unchanged to ``match_tweet``.
    """
    text = tweet_data.get('text', u'')
    # Twython returns either bytes or unicode, depending on tweet.
    # ಠ_ಠ #APIshaming
    # Decode up front rather than catching TypeError around the whole call:
    # that handler also caught unrelated TypeErrors raised inside
    # match_tweet and then ran the matcher a second time.
    if isinstance(text, bytes):
        text = text.decode('utf8')
    match_tweet(spacy, text, query)
|
||||||
|
|
||||||
|
|
||||||
|
def match_tweet(spacy, text, query):
    """Print ``text`` if its words lean towards the "accept" seed words.

    Every alphabetic token of the tweet, except the query term itself, is
    compared by cosine similarity against two fixed seed lists. Negative
    similarities are clipped to zero. The tweet is printed when at least
    half of the total positive similarity comes from the accept list.

    Args:
        spacy: Callable that parses ``text`` into tokens exposing
            ``repvec``, ``is_alpha`` and ``lower_``; its ``vocab`` is
            indexed by word to look up the seed vectors.
        text: Tweet text to score.
        query: Lower-cased search term; tokens equal to it are ignored.
    """
    def get_vector(word):
        return spacy.vocab[word].repvec

    vectors = [w.repvec for w in spacy(text)
               if w.is_alpha and w.lower_ != query]
    if not vectors:
        return

    # Build real lists: on Python 3 ``map`` returns a one-shot iterator,
    # which the nested loops below would exhaust after the first tweet word.
    accept = [get_vector(word) for word in 'child classroom teach'.split()]
    reject = [get_vector(word) for word in 'mouth hands giveaway'.split()]

    y = sum(max(cos(w1, w2), 0) for w1 in vectors for w2 in accept)
    n = sum(max(cos(w1, w2), 0) for w1 in vectors for w2 in reject)

    total = y + n
    # With no positive similarity to either seed list there is nothing to
    # compare, and dividing would raise ZeroDivisionError. The former
    # ``or True`` on this test disabled the threshold entirely.
    if total > 0 and (y / total) >= 0.5:
        print(text)
|
||||||
|
|
||||||
|
|
||||||
|
def cos(v1, v2):
    """Return the cosine similarity of vectors ``v1`` and ``v2``.

    Args:
        v1: First vector (any array-like accepted by ``numpy.dot``).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (norm(v1) * norm(v2))``, or ``0.0`` when either
        vector has zero length. The cosine is undefined in that case and
        the plain quotient would be NaN, which then contaminates any sum
        it is added to.
    """
    denominator = norm(v1) * norm(v2)
    if denominator == 0:
        return 0.0
    return dot(v1, v2) / denominator
|
|
@ -248,24 +248,16 @@ cdef class Token:
|
||||||
|
|
||||||
property conjuncts:
|
property conjuncts:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
"""Get a list of conjoined words"""
|
"""Get a list of conjoined words."""
|
||||||
cdef Token word
|
cdef Token word
|
||||||
conjs = []
|
conjuncts = []
|
||||||
if self.c.pos != CONJ and self.c.pos != PUNCT:
|
if self.dep_ != 'conj':
|
||||||
seen_conj = False
|
for word in self.rights:
|
||||||
for word in reversed(list(self.lefts)):
|
if word.dep_ == 'conj':
|
||||||
if word.c.pos == CONJ:
|
yield word
|
||||||
seen_conj = True
|
yield from word.conjuncts
|
||||||
elif seen_conj and word.c.pos == self.c.pos:
|
conjuncts.append(word)
|
||||||
conjs.append(word)
|
conjuncts.extend(word.conjuncts)
|
||||||
conjs.reverse()
|
|
||||||
conjs.append(self)
|
|
||||||
if seen_conj:
|
|
||||||
return conjs
|
|
||||||
elif self is not self.head and self in self.head.conjuncts:
|
|
||||||
return self.head.conjuncts
|
|
||||||
else:
|
|
||||||
return []
|
|
||||||
|
|
||||||
property ent_type:
|
property ent_type:
|
||||||
def __get__(self):
|
def __get__(self):
|
||||||
|
|
|
@ -7,6 +7,6 @@ def test_space_attachment(EN):
|
||||||
sentence = 'This is a test.\nTo ensure spaces are attached well.'
|
sentence = 'This is a test.\nTo ensure spaces are attached well.'
|
||||||
doc = EN(sentence)
|
doc = EN(sentence)
|
||||||
|
|
||||||
for word in doc:
|
for sent in doc.sents:
|
||||||
if word.is_space:
|
if len(sent) == 1:
|
||||||
assert word.head.i == (word.i - 1)
|
assert not sent[-1].is_space
|
||||||
|
|
Loading…
Reference in New Issue