mirror of https://github.com/explosion/spaCy.git

Commit 3ca16ddbd4: Merge branch 'develop' of https://github.com/explosion/spaCy into develop

@@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

* [ ] I am signing on behalf of myself as an individual and no other person
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

@@ -96,11 +96,11 @@ mark both statements:

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | |
| GitHub username | |
| Website (optional) | |
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Abhinav Sharma |
| Company name (if applicable) | Fourtek I.T. Solutions Pvt. Ltd. |
| Title or role (if applicable) | Machine Learning Engineer |
| Date | 3 November 2017 |
| GitHub username | abhi18av |
| Website (optional) | https://abhi18av.github.io/ |

@@ -409,12 +409,14 @@ def build_tagger_model(nr_class, **cfg):
else:
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_dims=pretrained_dims)
softmax = with_flatten(Softmax(nr_class, token_vector_width))
model = (
tok2vec
>> with_flatten(Softmax(nr_class, token_vector_width))
>> softmax
)
model.nI = None
model.tok2vec = tok2vec
model.softmax = softmax
return model

@@ -391,9 +391,10 @@ class Language(object):
for name, proc in pipes:
if not hasattr(proc, 'update'):
continue
grads = {}
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)

def preprocess_gold(self, docs_golds):
"""Can be called before training to pre-process gold data. By default,

@@ -129,8 +129,14 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
# TODO: Currently we've assumed that we know the number of tags --
# RichTagC is an array, and _cache is a PreshMapArray
# This is really bad: it makes the morphology typed to the tagger
# classes, which is all wrong.
self.exc[(tag_str, orth_str)] = dict(attrs)
tag = self.strings.add(tag_str)
if tag not in self.reverse_index:
return
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id]

@@ -11,7 +11,7 @@ import ujson
import msgpack

from thinc.api import chain
from thinc.v2v import Affine, Softmax
from thinc.v2v import Affine, SELU, Softmax
from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural.util import to_categorical, copy_array
from thinc.neural._classes.difference import Siamese, CauchySimilarity

@@ -29,7 +29,7 @@ from .compat import json_dumps
from .attrs import POS
from .parts_of_speech import X
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from ._ml import link_vectors_to_models, zero_init, flatten
from . import util

@@ -91,8 +91,8 @@ class Pipe(object):
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
scores = self.predict([doc])
self.set_annotations([doc], scores)
scores, tensors = self.predict([doc])
self.set_annotations([doc], scores, tensors=tensors)
return doc

def pipe(self, stream, batch_size=128, n_threads=-1):

@@ -103,8 +103,8 @@ class Pipe(object):
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensor=tensors)
yield from docs

def predict(self, docs):

@@ -113,7 +113,7 @@ class Pipe(object):
"""
raise NotImplementedError

def set_annotations(self, docs, scores):
def set_annotations(self, docs, scores, tensors=None):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError

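A minimal sketch of the `Pipe` contract after this change, assuming spaCy's development tree at this commit: `predict()` returns a `(scores, tensors)` pair and `set_annotations()` accepts a `tensors` keyword, which `__call__` and `pipe()` now rely on. The subclass name and its placeholder scoring below are hypothetical; only the signatures follow the code above.

```python
import numpy
from spacy.pipeline import Pipe

class MyPipe(Pipe):
    name = 'my_pipe'   # hypothetical component

    def predict(self, docs):
        # Return per-doc scores plus the tensors that produced them,
        # matching the (scores, tensors) pair the base class now expects.
        tensors = [doc.tensor for doc in docs]
        scores = [numpy.zeros((len(doc),), dtype='f') for doc in docs]  # placeholder scoring
        return scores, tensors

    def set_annotations(self, docs, scores, tensors=None):
        for i, doc in enumerate(docs):
            if tensors is not None:
                doc.extend_tensor(tensors[i])
            # ... write scores[i] back onto the doc here
```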
@@ -216,7 +216,7 @@ class Tensorizer(Pipe):
name = 'tensorizer'

@classmethod
def Model(cls, width=128, embed_size=4000, **cfg):
def Model(cls, output_size=300, input_size=384, **cfg):
"""Create a new statistical model for the class.

width (int): Output size of the model.

@@ -224,9 +224,11 @@ class Tensorizer(Pipe):
**cfg: Config parameters.
RETURNS (Model): A `thinc.neural.Model` or similar instance.
"""
width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, **cfg)
model = chain(
SELU(output_size, input_size),
SELU(output_size, output_size),
zero_init(Affine(output_size, output_size)))
return model

def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on

@@ -244,6 +246,7 @@ class Tensorizer(Pipe):
"""
self.vocab = vocab
self.model = model
self.input_models = []
self.cfg = dict(cfg)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.cfg.setdefault('cnn_maxout_pieces', 3)

@@ -269,8 +272,8 @@ class Tensorizer(Pipe):
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
tensors = self.predict(docs)
self.set_annotations(docs, tensors)
yield from docs

def predict(self, docs):

@@ -279,18 +282,19 @@ class Tensorizer(Pipe):
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the docs.
"""
tokvecs = self.model(docs)
return tokvecs
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
outputs = self.model(inputs)
return self.model.ops.unflatten(outputs, [len(d) for d in docs])

def set_annotations(self, docs, tokvecses):
def set_annotations(self, docs, tensors):
"""Set the tensor attribute for a batch of documents.

docs (iterable): A sequence of `Doc` objects.
tokvecs (object): Vector representation for each token in the docs.
tensors (object): Vector representation for each token in the docs.
"""
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
for doc, tensor in zip(docs, tensors):
assert tensor.shape[0] == len(doc)
doc.tensor = tensor

def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
"""Update the model.

@@ -303,11 +307,34 @@ class Tensorizer(Pipe):
"""
if isinstance(docs, Doc):
docs = [docs]
tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop)
return tokvecs, bp_tokvecs
inputs = []
bp_inputs = []
for tok2vec in self.input_models:
tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
inputs.append(tensor)
bp_inputs.append(bp_tensor)
inputs = self.model.ops.xp.hstack(inputs)
scores, bp_scores = self.model.begin_update(inputs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores)
d_inputs = bp_scores(d_scores, sgd=sgd)
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
for d_input, bp_input in zip(d_inputs, bp_inputs):
bp_input(d_input, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.)
losses[self.name] += loss
return loss

def get_loss(self, docs, golds, scores):
raise NotImplementedError
def get_loss(self, docs, golds, prediction):
target = []
i = 0
for doc in docs:
vectors = self.model.ops.xp.vstack([w.vector for w in doc])
target.append(vectors)
target = self.model.ops.xp.vstack(target)
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores**2).sum()
return loss, d_scores

def begin_training(self, gold_tuples=tuple(), pipeline=None):
"""Allocate models, pre-process training data and acquire a trainer and

@@ -316,8 +343,13 @@ class Tensorizer(Pipe):
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
for name, model in pipeline:
if getattr(model, 'tok2vec', None):
self.input_models.append(model.tok2vec)
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.cfg['input_size'] = 384
self.cfg['output_size'] = 300
#self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)

@@ -337,28 +369,37 @@ class Tagger(Pipe):
def labels(self):
return self.vocab.morphology.tag_names

@property
def tok2vec(self):
if self.model in (None, True, False):
return None
else:
return chain(self.model.tok2vec, flatten)

def __call__(self, doc):
tags = self.predict([doc])
self.set_annotations([doc], tags)
tags, tokvecs = self.predict([doc])
self.set_annotations([doc], tags, tensors=tokvecs)
return doc

def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tag_ids = self.predict(docs)
self.set_annotations(docs, tag_ids)
tag_ids, tokvecs = self.predict(docs)
self.set_annotations(docs, tag_ids, tensors=tokvecs)
yield from docs

def predict(self, docs):
scores = self.model(docs)
scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
return guesses
tokvecs = self.model.tok2vec(docs)
scores = self.model.softmax(tokvecs)
guesses = []
for doc_scores in scores:
doc_guesses = doc_scores.argmax(axis=1)
if not isinstance(doc_guesses, numpy.ndarray):
doc_guesses = doc_guesses.get()
guesses.append(doc_guesses)
return guesses, tokvecs

def set_annotations(self, docs, batch_tag_ids):
def set_annotations(self, docs, batch_tag_ids, tensors=None):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc

@@ -373,6 +414,8 @@ class Tagger(Pipe):
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
idx += 1
if tensors is not None:
doc.extend_tensor(tensors[i])
doc.is_tagged = True

def update(self, docs, golds, drop=0., sgd=None, losses=None):

@@ -573,7 +616,7 @@ class MultitaskObjective(Tagger):
def labels(self, value):
self.cfg['labels'] = value

def set_annotations(self, docs, dep_ids):
def set_annotations(self, docs, dep_ids, tensors=None):
pass

def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):

@@ -720,15 +763,15 @@ class TextCategorizer(Pipe):
self.cfg['labels'] = value

def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
scores, tensors = self.predict([doc])
self.set_annotations([doc], scores, tensors=tensors)
return doc

def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)
yield from docs

def predict(self, docs):

@@ -736,8 +779,10 @@ class TextCategorizer(Pipe):
scores = self.model.ops.asarray(scores)
return scores

def set_annotations(self, docs, scores):
def set_annotations(self, docs, scores, tensors=None):
for i, doc in enumerate(docs):
if tensors is not None:
doc.extend_tensor(tensors[i])
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])

@@ -1,6 +1,7 @@
# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False
# cython: profile=True
# coding: utf-8
from __future__ import unicode_literals, print_function

@@ -322,15 +323,17 @@ cdef class Parser:
beam_density = self.cfg.get('beam_density', 0.0)
cdef Beam beam
if beam_width == 1:
states = self.parse_batch([doc])
self.set_annotations([doc], states)
states, tokvecs = self.parse_batch([doc])
self.set_annotations([doc], states, tensors=tokvecs)
return doc
else:
beam = self.beam_parse([doc],
beam_width=beam_width, beam_density=beam_density)[0]
beams, tokvecs = self.beam_parse([doc],
beam_width=beam_width,
beam_density=beam_density)
beam = beams[0]
output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0)
self.set_annotations([doc], [state])
self.set_annotations([doc], [state], tensors=tokvecs)
_cleanup(beam)
return output

@@ -356,15 +359,16 @@ cdef class Parser:
for subbatch in cytoolz.partition_all(8, by_length):
subbatch = list(subbatch)
if beam_width == 1:
parse_states = self.parse_batch(subbatch)
parse_states, tokvecs = self.parse_batch(subbatch)
beams = []
else:
beams = self.beam_parse(subbatch, beam_width=beam_width,
beam_density=beam_density)
beams, tokvecs = self.beam_parse(subbatch,
beam_width=beam_width,
beam_density=beam_density)
parse_states = []
for beam in beams:
parse_states.append(<StateClass>beam.at(0))
self.set_annotations(subbatch, parse_states)
self.set_annotations(subbatch, parse_states, tensors=tokvecs)
yield from batch

def parse_batch(self, docs):

@@ -411,7 +415,9 @@ cdef class Parser:
feat_weights, bias, hW, hb,
nr_class, nr_hidden, nr_feat, nr_piece)
PyErr_CheckSignals()
return state_objs
tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs])
return state_objs, tokvecs

cdef void _parseC(self, StateC* state,
const float* feat_weights, const float* bias,

@@ -508,7 +514,9 @@ cdef class Parser:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
beams.append(beam)
return beams
tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs])
return beams, tokvecs

def update(self, docs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):

@@ -735,18 +743,29 @@ cdef class Parser:
c_d_scores += d_scores.shape[1]
return d_scores

def set_annotations(self, docs, states):
def set_annotations(self, docs, states, tensors=None):
cdef StateClass state
cdef Doc doc
for state, doc in zip(states, docs):
for i, (state, doc) in enumerate(zip(states, docs)):
self.moves.finalize_state(state.c)
for i in range(doc.length):
doc.c[i] = state.c._sent[i]
for j in range(doc.length):
doc.c[j] = state.c._sent[j]
if tensors is not None:
doc.extend_tensor(tensors[i])
self.moves.finalize_doc(doc)

for hook in self.postprocesses:
for doc in docs:
hook(doc)

@property
def tok2vec(self):
'''Return the embedding and convolutional layer of the model.'''
if self.model in (None, True, False):
return None
else:
return self.model[0]

@property
def postprocesses(self):
# Available for subclasses, e.g. to deprojectivize

@@ -22,35 +22,37 @@ def test_doc_lemmatization(EN):
("ring", ["ring"]),
("axes", ["axis", "axe", "ax"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
assert en_lemmatizer.noun(text) == lemmas


@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
("feed", ["feed"]),
("need", ["need"]),
("ring", ["ring"]),
("axes", ["axis", "axe", "ax"])])
("ring", ["ring"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
# Cases like this are problematic -- not clear what we should do to resolve
# ambiguity?
# ("axes", ["ax", "axes", "axis"])])
assert en_lemmatizer.noun(text) == lemmas


@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_lemmatizer_base_forms(en_lemmatizer):
assert en_lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
assert en_lemmatizer.noun('dive', {'number': 'sing'}) == ['dive']
assert en_lemmatizer.noun('dive', {'number': 'plur'}) == ['diva']


@pytest.mark.models('en')
def test_en_lemmatizer_base_form_verb(en_lemmatizer):
assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == ['see']


@pytest.mark.models('en')
def test_en_lemmatizer_punct(en_lemmatizer):
assert en_lemmatizer.punct('“') == set(['"'])
assert en_lemmatizer.punct('“') == set(['"'])
assert en_lemmatizer.punct('“') == ['"']
assert en_lemmatizer.punct('“') == ['"']


@pytest.mark.models('en')

@@ -75,3 +75,11 @@ def test_en_models_probs(example):
assert not prob0 == prob1
assert not prob0 == prob2
assert not prob1 == prob2


@pytest.mark.models('en')
def test_no_vectors_similarity(EN):
doc1 = EN(u'hallo')
doc2 = EN(u'hi')
assert doc1.similarity(doc2) > 0

@@ -56,7 +56,7 @@ def test_sents_1_2(parser):
doc[1].sent_start = True
doc[2].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 3
assert len(list(doc.sents)) >= 3


def test_sents_1_3(parser):

@@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals

import regex as re
from ...lang.en import English
from ...tokenizer import Tokenizer


def test_issue1488():
prefix_re = re.compile(r'''[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']''')
infix_re = re.compile(r'''[-~\.]''')
simple_url_re = re.compile(r'''^https?://''')

def my_tokenizer(nlp):
return Tokenizer(nlp.vocab, {},
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=simple_url_re.match)

nlp = English()
nlp.tokenizer = my_tokenizer(nlp)
doc = nlp("This is a test.")
for token in doc:
assert token.text

@@ -72,7 +72,17 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
dY = model.ops.allocate((15, nO, nP))
ids = model.ops.allocate((15, nF))
ids[1,2] = -1
dY[1,2] = 1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 1.
model.d_pad.fill(0.)
ids.fill(0.)
dY.fill(0.)
ids[1,2] = -1
ids[1,1] = -1
ids[1,0] = -1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 3.

@@ -10,6 +10,7 @@ import numpy.linalg
import struct
import dill
import msgpack
from thinc.neural.util import get_array_module, copy_array

from libc.string cimport memcpy, memset
from libc.math cimport sqrt

@@ -306,9 +307,9 @@ cdef class Doc:
def __get__(self):
if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self)
elif any(token.has_vector for token in self):
elif self.vocab.vectors.data.size:
return True
elif self.tensor is not None:
elif self.tensor.size:
return True
else:
return False

@@ -329,13 +330,13 @@ cdef class Doc:
self._vector = numpy.zeros((self.vocab.vectors_length,),
dtype='f')
return self._vector
elif self.has_vector:
elif self.vocab.vectors.data.size > 0:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
for token in self.c[:self.length]:
vector += self.vocab.get_vector(token.lex.orth)
self._vector = vector / len(self)
return self._vector
elif self.tensor is not None:
elif self.tensor.size > 0:
self._vector = self.tensor.mean(axis=0)
return self._vector
else:

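With the change above, `doc.vector` falls back to the mean of `doc.tensor` (and `doc.has_vector` reports True) when the vocabulary has no word vectors loaded. A small hedged sketch of that behaviour; the tensor values are illustrative.

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=['hello', 'world'])   # vocab without word vectors
doc.tensor = numpy.asarray([[1., 2.], [3., 4.]], dtype='f')
assert doc.has_vector                          # the tensor counts as a vector source
assert numpy.allclose(doc.vector, doc.tensor.mean(axis=0))
```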
@@ -827,6 +828,23 @@ cdef class Doc:
attrs[:, 2:])
return self

def extend_tensor(self, tensor):
'''Concatenate a new tensor onto the doc.tensor object.

The doc.tensor attribute holds dense feature vectors
computed by the models in the pipeline. Let's say a
document with 30 words has a tensor with 128 dimensions
per word. doc.tensor.shape will be (30, 128). After
calling doc.extend_tensor with an array of shape (30, 64),
doc.tensor == (30, 192).
'''
xp = get_array_module(self.tensor)
if self.tensor.size == 0:
self.tensor.resize(tensor.shape)
copy_array(self.tensor, tensor)
else:
self.tensor = xp.hstack((self.tensor, tensor))

def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If

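A quick hedged sketch of the new `Doc.extend_tensor()` helper documented above: an array with the same number of rows is concatenated column-wise onto the existing tensor (shapes follow the docstring's example, with two tokens instead of 30).

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=['hello', 'world'])
doc.tensor = numpy.zeros((2, 128), dtype='f')    # e.g. set by the tensorizer
doc.extend_tensor(numpy.ones((2, 64), dtype='f'))
assert doc.tensor.shape == (2, 192)              # 128 + 64 columns per token
```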
@@ -283,7 +283,12 @@ cdef class Span:
def __get__(self):
if 'has_vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['has_vector'](self)
return any(token.has_vector for token in self)
elif self.vocab.vectors.data.size > 0:
return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True
else:
return False

property vector:
"""A real-valued meaning representation. Defaults to an average of the

@@ -292,6 +292,8 @@ cdef class Token:
def __get__(self):
if 'has_vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['has_vector'](self)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return True
return self.vocab.has_vector(self.c.lex.orth)

property vector:

@@ -303,7 +305,10 @@ cdef class Token:
def __get__(self):
if 'vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector'](self)
return self.vocab.get_vector(self.c.lex.orth)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return self.doc.tensor[self.i]
else:
return self.vocab.get_vector(self.c.lex.orth)

property vector_norm:
"""The L2 norm of the token's vector representation.

@@ -11,9 +11,8 @@ if environment == "deploy"

script(src="/assets/js/vendor/prism.min.js")

if SECTION == "models"
if compare_models
script(src="/assets/js/vendor/chart.min.js")
script(src="/assets/js/models.js?v#{V_JS}" type="module")

script
if quickstart

@@ -24,15 +23,15 @@ script
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');


if IS_PAGE
script
if IS_PAGE
| ((window.gitter = {}).chat = {}).options = {
| useStyles: false,
| activationElement: '.js-gitter-button',
| targetElement: '.js-gitter',
| room: '!{SOCIAL.gitter}'
| };

if IS_PAGE
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)

@@ -48,39 +47,36 @@ if IS_PAGE
- ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
- ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"

//- Browsers with JS module support.
Will be ignored otherwise.

script(type="module")
| import ProgressBar from '/assets/js/progress.js';
!=ProgressBar
if changelog
| import Changelog from '/assets/js/changelog.js';
!=Changelog
if IS_PAGE
| import NavHighlighter from '/assets/js/nav-highlighter.js';
!=NavHighlighter
| import GitHubEmbed from '/assets/js/github-embed.js';
!=GitHubEmbed
if HAS_MODELS
| import { ModelLoader } from '/assets/js/models.js';
!=ModelLoader
if compare_models
| import { ModelComparer } from '/assets/js/models.js';
!=ModelComparer

//- Browsers with no JS module support.
Won't be fetched or interpreted otherwise.

script(nomodule src="/assets/js/rollup.js")
script(nomodule)
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModeLoader
if compare_models
!=ModelComparer
if environment == "deploy"
//- DEPLOY: use compiled rollup.js and instantiate classes directly
script(src="/assets/js/rollup.js")
script
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModelLoader
if compare_models
!=ModelComparer
else
//- DEVELOPMENT: Use ES6 modules
script(type="module")
| import ProgressBar from '/assets/js/progress.js';
!=ProgressBar
if changelog
| import Changelog from '/assets/js/changelog.js';
!=Changelog
if IS_PAGE
| import NavHighlighter from '/assets/js/nav-highlighter.js';
!=NavHighlighter
| import GitHubEmbed from '/assets/js/github-embed.js';
!=GitHubEmbed
if HAS_MODELS
| import { ModelLoader } from '/assets/js/models.js';
!=ModelLoader
if compare_models
| import { ModelComparer } from '/assets/js/models.js';
!=ModelComparer

@@ -198,6 +198,7 @@ export class ModelComparer {
this.fonts = CHART_FONTS;
this.defaultModels = defaultModels;
this.tpl.get('result').style.display = 'block';
this.tpl.get('error').style.display = 'none';
this.fetchCompat()
.then(compat => this.init(compat))
.catch(this.showError.bind(this))

@@ -40,13 +40,10 @@
},

"MODELS": {
"en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"],
"de": ["de_core_news_sm", "de_core_news_md"],
"es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"],
"pt": ["pt_core_news_sm"],
"fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"],
"en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
"de": ["de_core_news_sm"],
"es": ["es_core_news_sm", "es_core_news_md"],
"it": ["it_core_news_sm"],
"nl": ["nl_core_news_sm"],
"xx": ["xx_ent_wiki_sm"]
},

@@ -218,7 +218,7 @@ p
| If an exception consists of more than one token, the #[code ORTH] values
| combined always need to #[strong match the original string]. The way the
| original string is split up can be pretty arbitrary sometimes – for
| example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to").
| example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
| Because of how the tokenizer works, it's currently not possible to split
| single-letter strings into multiple tokens.

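The rule described above can be illustrated with a minimal, hedged example (standalone Python, not part of the patch): the ORTH values of a special case have to concatenate back to the exact original string, while the lemmas are free.

```python
from spacy.lang.en import English
from spacy.symbols import ORTH, LEMMA

nlp = English()
# "gon" + "na" == "gonna", so the special case is accepted.
nlp.tokenizer.add_special_case('gonna', [
    {ORTH: 'gon', LEMMA: 'go'},
    {ORTH: 'na', LEMMA: 'to'},
])
doc = nlp(u"I'm gonna go")
assert [t.text for t in doc] == ['I', "'m", 'gon', 'na', 'go']
```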
@@ -198,11 +198,11 @@ p
| #[code .finditer()] methods:

+code.
import re
import regex as re
from spacy.tokenizer import Tokenizer

prefix_re = re.compile(r'''[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']''')
prefix_re = re.compile(r'''^[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')

@@ -220,6 +220,17 @@ p
| specialize are #[code find_prefix], #[code find_suffix] and
| #[code find_infix].

+infobox("Important note", "⚠️")
| When customising the prefix, suffix and infix handling, remember that
| you're passing in #[strong functions] for spaCy to execute, e.g.
| #[code prefix_re.search] – not just the regular expressions. This means
| that your functions also need to define how the rules should be applied.
| For example, if you're adding your own prefix rules, you need
| to make sure they're only applied to characters at the
| #[strong beginning of a token], e.g. by adding #[code ^]. Similarly,
| suffix rules should only be applied at the #[strong end of a token],
| so your expression should end with a #[code $].

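A small illustration of the anchoring advice in the note above, using plain `re` and made-up strings: without `^`, a prefix pattern also fires on characters in the middle of a token.

```python
import re

unanchored = re.compile(r'''[\[\("']''')
anchored = re.compile(r'''^[\[\("']''')

# The unanchored pattern "finds" the apostrophe inside the token...
assert unanchored.search("don't") is not None
# ...while the anchored one only matches at the very start of the string.
assert anchored.search("don't") is None
assert anchored.search("'hello") is not None
```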
+h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline

p