Merge branch 'develop' of https://github.com/explosion/spaCy into develop

Matthew Honnibal 2017-11-04 00:25:02 +01:00
commit 3ca16ddbd4
19 changed files with 290 additions and 138 deletions

View File

@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statements below. Please do NOT
mark both statements:
* [ ] I am signing on behalf of myself as an individual and no other person
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.
@ -96,11 +96,11 @@ mark both statements:
## Contributor Details
| Field | Entry |
|------------------------------- | -------------------- |
| Name | |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | |
| GitHub username | |
| Website (optional) | |
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Abhinav Sharma |
| Company name (if applicable) | Fourtek I.T. Solutions Pvt. Ltd. |
| Title or role (if applicable) | Machine Learning Engineer |
| Date | 3 November 2017 |
| GitHub username | abhi18av |
| Website (optional) | https://abhi18av.github.io/ |

View File

@ -409,12 +409,14 @@ def build_tagger_model(nr_class, **cfg):
else:
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_dims=pretrained_dims)
softmax = with_flatten(Softmax(nr_class, token_vector_width))
model = (
tok2vec
>> with_flatten(Softmax(nr_class, token_vector_width))
>> softmax
)
model.nI = None
model.tok2vec = tok2vec
model.softmax = softmax
return model
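
Storing both sub-layers as attributes lets `Tagger.predict` (further down in this commit) run the two stages separately and return the intermediate token vectors. A minimal usage sketch, assuming a model built by `build_tagger_model` and a hypothetical `docs` batch:

    tokvecs = model.tok2vec(docs)      # list of per-doc token-vector arrays
    scores = model.softmax(tokvecs)    # list of per-doc tag probabilities
    # the intermediate tokvecs can then be attached to the docs as well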

View File

@ -391,9 +391,10 @@ class Language(object):
for name, proc in pipes:
if not hasattr(proc, 'update'):
continue
grads = {}
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
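
The `get_grads` callback is defined earlier in `Language.update`: it stashes gradients in the local `grads` dict instead of applying them, and moving the dict plus the `sgd` loop inside the per-component loop applies each component's gradients before the next component runs. A sketch of the pattern, assuming `get_grads` is a closure over `grads`:

    grads = {}
    def get_grads(W, dW, key=None):
        grads[key] = (W, dW)            # stash the gradient instead of applying it
    proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
    for key, (W, dW) in grads.items():
        sgd(W, dW, key=key)             # apply once per parameter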
def preprocess_gold(self, docs_golds):
"""Can be called before training to pre-process gold data. By default,

View File

@ -129,8 +129,14 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
# TODO: Currently we've assumed that we know the number of tags --
# RichTagC is an array, and _cache is a PreshMapArray
# This is really bad: it makes the morphology typed to the tagger
# classes, which is all wrong.
self.exc[(tag_str, orth_str)] = dict(attrs)
tag = self.strings.add(tag_str)
if tag not in self.reverse_index:
return
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id]

View File

@ -11,7 +11,7 @@ import ujson
import msgpack
from thinc.api import chain
from thinc.v2v import Affine, Softmax
from thinc.v2v import Affine, SELU, Softmax
from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural.util import to_categorical, copy_array
from thinc.neural._classes.difference import Siamese, CauchySimilarity
@ -29,7 +29,7 @@ from .compat import json_dumps
from .attrs import POS
from .parts_of_speech import X
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from ._ml import link_vectors_to_models, zero_init, flatten
from . import util
@ -91,8 +91,8 @@ class Pipe(object):
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
scores = self.predict([doc])
self.set_annotations([doc], scores)
scores, tensors = self.predict([doc])
self.set_annotations([doc], scores, tensors=tensors)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
@ -103,8 +103,8 @@ class Pipe(object):
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)
yield from docs
def predict(self, docs):
@ -113,7 +113,7 @@ class Pipe(object):
"""
raise NotImplementedError
def set_annotations(self, docs, scores):
def set_annotations(self, docs, scores, tensors=None):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError
@ -216,7 +216,7 @@ class Tensorizer(Pipe):
name = 'tensorizer'
@classmethod
def Model(cls, width=128, embed_size=4000, **cfg):
def Model(cls, output_size=300, input_size=384, **cfg):
"""Create a new statistical model for the class.
width (int): Output size of the model.
@ -224,9 +224,11 @@ class Tensorizer(Pipe):
**cfg: Config parameters.
RETURNS (Model): A `thinc.neural.Model` or similar instance.
"""
width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, **cfg)
model = chain(
SELU(output_size, input_size),
SELU(output_size, output_size),
zero_init(Affine(output_size, output_size)))
return model
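
The Tensorizer no longer embeds tokens itself: it learns to map the concatenated tensors produced by upstream components onto an output space, through two SELU layers and a zero-initialized affine output. A rough shape sketch, assuming the thinc layers behave as plain callables and using the default sizes above:

    import numpy
    X = numpy.zeros((10, 384), dtype='f')   # hypothetical batch: 10 tokens, input_size=384
    model = Tensorizer.Model(output_size=300, input_size=384)
    Y = model(X)
    assert Y.shape == (10, 300)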
def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on
@ -244,6 +246,7 @@ class Tensorizer(Pipe):
"""
self.vocab = vocab
self.model = model
self.input_models = []
self.cfg = dict(cfg)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.cfg.setdefault('cnn_maxout_pieces', 3)
@ -269,8 +272,8 @@ class Tensorizer(Pipe):
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
tensors = self.predict(docs)
self.set_annotations(docs, tensors)
yield from docs
def predict(self, docs):
@ -279,18 +282,19 @@ class Tensorizer(Pipe):
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the docs.
"""
tokvecs = self.model(docs)
return tokvecs
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
outputs = self.model(inputs)
return self.model.ops.unflatten(outputs, [len(d) for d in docs])
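
`ops.flatten` concatenates the per-doc arrays along the first axis so the whole batch passes through the network in one call, and `ops.unflatten` splits the result back by document length. A toy illustration with hypothetical lengths:

    lengths = [len(doc) for doc in docs]                # e.g. [4, 7]
    inputs = self.model.ops.flatten(
        [doc.tensor for doc in docs])                   # shape (11, width)
    outputs = self.model(inputs)                        # shape (11, output_size)
    per_doc = self.model.ops.unflatten(outputs, lengths)  # shapes (4, ...) and (7, ...)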
def set_annotations(self, docs, tokvecses):
def set_annotations(self, docs, tensors):
"""Set the tensor attribute for a batch of documents.
docs (iterable): A sequence of `Doc` objects.
tokvecs (object): Vector representation for each token in the docs.
tensors (object): Vector representation for each token in the docs.
"""
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
for doc, tensor in zip(docs, tensors):
assert tensor.shape[0] == len(doc)
doc.tensor = tensor
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
"""Update the model.
@ -303,11 +307,34 @@ class Tensorizer(Pipe):
"""
if isinstance(docs, Doc):
docs = [docs]
tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop)
return tokvecs, bp_tokvecs
inputs = []
bp_inputs = []
for tok2vec in self.input_models:
tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
inputs.append(tensor)
bp_inputs.append(bp_tensor)
inputs = self.model.ops.xp.hstack(inputs)
scores, bp_scores = self.model.begin_update(inputs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores)
d_inputs = bp_scores(d_scores, sgd=sgd)
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
for d_input, bp_input in zip(d_inputs, bp_inputs):
bp_input(d_input, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.)
losses[self.name] += loss
return loss
def get_loss(self, docs, golds, scores):
raise NotImplementedError
def get_loss(self, docs, golds, prediction):
target = []
i = 0
for doc in docs:
vectors = self.model.ops.xp.vstack([w.vector for w in doc])
target.append(vectors)
target = self.model.ops.xp.vstack(target)
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores**2).sum()
return loss, d_scores
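
The target for each row is the token's pretrained vector, so the Tensorizer learns to reconstruct the static vectors from the pipeline's tensors. Note the scaling: with `d_scores = (prediction - target) / N`, the reported loss is the sum of squared gradient entries, not a plain mean squared error. A tiny numeric check with hypothetical values:

    import numpy
    prediction = numpy.array([[1., 0.], [0., 1.]])
    target = numpy.zeros((2, 2))
    d_scores = (prediction - target) / prediction.shape[0]   # divide by N=2
    loss = (d_scores ** 2).sum()                             # 0.25 + 0.25
    assert loss == 0.5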
def begin_training(self, gold_tuples=tuple(), pipeline=None):
"""Allocate models, pre-process training data and acquire a trainer and
@ -316,8 +343,13 @@ class Tensorizer(Pipe):
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
for name, model in pipeline:
if getattr(model, 'tok2vec', None):
self.input_models.append(model.tok2vec)
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.cfg['input_size'] = 384
self.cfg['output_size'] = 300
#self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)
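
`begin_training` collects the `tok2vec` property from every pipeline component that exposes one (the Tagger and Parser both gain such a property in this commit), and `update` above hstacks their outputs on the forward pass, then splits the gradient column-wise so each input model receives its own slice on the backward pass. A shape sketch with two hypothetical 192-wide inputs:

    import numpy
    a = numpy.ones((5, 192)); b = numpy.ones((5, 192))   # outputs of two tok2vec models
    inputs = numpy.hstack([a, b])                        # shape (5, 384)
    d_inputs = numpy.zeros_like(inputs)
    d_a, d_b = numpy.split(d_inputs, 2, axis=1)          # one (5, 192) slice per input model
    assert d_a.shape == (5, 192) and d_b.shape == (5, 192)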
@ -337,28 +369,37 @@ class Tagger(Pipe):
def labels(self):
return self.vocab.morphology.tag_names
@property
def tok2vec(self):
if self.model in (None, True, False):
return None
else:
return chain(self.model.tok2vec, flatten)
def __call__(self, doc):
tags = self.predict([doc])
self.set_annotations([doc], tags)
tags, tokvecs = self.predict([doc])
self.set_annotations([doc], tags, tensors=tokvecs)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tag_ids = self.predict(docs)
self.set_annotations(docs, tag_ids)
tag_ids, tokvecs = self.predict(docs)
self.set_annotations(docs, tag_ids, tensors=tokvecs)
yield from docs
def predict(self, docs):
scores = self.model(docs)
scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
return guesses
tokvecs = self.model.tok2vec(docs)
scores = self.model.softmax(tokvecs)
guesses = []
for doc_scores in scores:
doc_guesses = doc_scores.argmax(axis=1)
if not isinstance(doc_guesses, numpy.ndarray):
doc_guesses = doc_guesses.get()
guesses.append(doc_guesses)
return guesses, tokvecs
def set_annotations(self, docs, batch_tag_ids):
def set_annotations(self, docs, batch_tag_ids, tensors=None):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc
@ -373,6 +414,8 @@ class Tagger(Pipe):
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
idx += 1
if tensors is not None:
doc.extend_tensor(tensors[i])
doc.is_tagged = True
def update(self, docs, golds, drop=0., sgd=None, losses=None):
@ -573,7 +616,7 @@ class MultitaskObjective(Tagger):
def labels(self, value):
self.cfg['labels'] = value
def set_annotations(self, docs, dep_ids):
def set_annotations(self, docs, dep_ids, tensors=None):
pass
def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):
@ -720,15 +763,15 @@ class TextCategorizer(Pipe):
self.cfg['labels'] = value
def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
scores, tensors = self.predict([doc])
self.set_annotations([doc], scores, tensors=tensors)
return doc
def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)
yield from docs
def predict(self, docs):
@ -736,8 +779,10 @@ class TextCategorizer(Pipe):
scores = self.model.ops.asarray(scores)
return scores
def set_annotations(self, docs, scores):
def set_annotations(self, docs, scores, tensors=None):
for i, doc in enumerate(docs):
if tensors is not None:
doc.extend_tensor(tensors[i])
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])

View File

@ -1,6 +1,7 @@
# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False
# cython: profile=True
# coding: utf-8
from __future__ import unicode_literals, print_function
@ -322,15 +323,17 @@ cdef class Parser:
beam_density = self.cfg.get('beam_density', 0.0)
cdef Beam beam
if beam_width == 1:
states = self.parse_batch([doc])
self.set_annotations([doc], states)
states, tokvecs = self.parse_batch([doc])
self.set_annotations([doc], states, tensors=tokvecs)
return doc
else:
beam = self.beam_parse([doc],
beam_width=beam_width, beam_density=beam_density)[0]
beams, tokvecs = self.beam_parse([doc],
beam_width=beam_width,
beam_density=beam_density)
beam = beams[0]
output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0)
self.set_annotations([doc], [state])
self.set_annotations([doc], [state], tensors=tokvecs)
_cleanup(beam)
return output
@ -356,15 +359,16 @@ cdef class Parser:
for subbatch in cytoolz.partition_all(8, by_length):
subbatch = list(subbatch)
if beam_width == 1:
parse_states = self.parse_batch(subbatch)
parse_states, tokvecs = self.parse_batch(subbatch)
beams = []
else:
beams = self.beam_parse(subbatch, beam_width=beam_width,
beam_density=beam_density)
beams, tokvecs = self.beam_parse(subbatch,
beam_width=beam_width,
beam_density=beam_density)
parse_states = []
for beam in beams:
parse_states.append(<StateClass>beam.at(0))
self.set_annotations(subbatch, parse_states)
self.set_annotations(subbatch, parse_states, tensors=tokvecs)
yield from batch
def parse_batch(self, docs):
@ -411,7 +415,9 @@ cdef class Parser:
feat_weights, bias, hW, hb,
nr_class, nr_hidden, nr_feat, nr_piece)
PyErr_CheckSignals()
return state_objs
tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs])
return state_objs, tokvecs
cdef void _parseC(self, StateC* state,
const float* feat_weights, const float* bias,
@ -508,7 +514,9 @@ cdef class Parser:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
beams.append(beam)
return beams
tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs])
return beams, tokvecs
def update(self, docs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
@ -735,18 +743,29 @@ cdef class Parser:
c_d_scores += d_scores.shape[1]
return d_scores
def set_annotations(self, docs, states):
def set_annotations(self, docs, states, tensors=None):
cdef StateClass state
cdef Doc doc
for state, doc in zip(states, docs):
for i, (state, doc) in enumerate(zip(states, docs)):
self.moves.finalize_state(state.c)
for i in range(doc.length):
doc.c[i] = state.c._sent[i]
for j in range(doc.length):
doc.c[j] = state.c._sent[j]
if tensors is not None:
doc.extend_tensor(tensors[i])
self.moves.finalize_doc(doc)
for hook in self.postprocesses:
for doc in docs:
hook(doc)
@property
def tok2vec(self):
'''Return the embedding and convolutional layer of the model.'''
if self.model in (None, True, False):
return None
else:
return self.model[0]
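
This mirrors the Tagger's new `tok2vec` property: the parser's `self.model` is a tuple whose first element is the embedding and convolutional layer, and exposing it is what lets `Tensorizer.begin_training` pick it up. A sketch of a hypothetical consumer, following the pattern in `Tensorizer.update`:

    tok2vec = parser.tok2vec
    if tok2vec is not None:
        tokvecs, bp_tokvecs = tok2vec.begin_update(docs, drop=0.)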
@property
def postprocesses(self):
# Available for subclasses, e.g. to deprojectivize

View File

@ -22,35 +22,37 @@ def test_doc_lemmatization(EN):
("ring", ["ring"]),
("axes", ["axis", "axe", "ax"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
assert en_lemmatizer.noun(text) == lemmas
@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
("feed", ["feed"]),
("need", ["need"]),
("ring", ["ring"]),
("axes", ["axis", "axe", "ax"])])
("ring", ["ring"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
# Cases like this are problematic -- not clear what we should do to resolve
# ambiguity?
# ("axes", ["ax", "axes", "axis"])])
assert en_lemmatizer.noun(text) == lemmas
@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_lemmatizer_base_forms(en_lemmatizer):
assert en_lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
assert en_lemmatizer.noun('dive', {'number': 'sing'}) == ['dive']
assert en_lemmatizer.noun('dive', {'number': 'plur'}) == ['diva']
@pytest.mark.models('en')
def test_en_lemmatizer_base_form_verb(en_lemmatizer):
assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == ['see']
@pytest.mark.models('en')
def test_en_lemmatizer_punct(en_lemmatizer):
assert en_lemmatizer.punct('“') == set(['"'])
assert en_lemmatizer.punct('”') == set(['"'])
assert en_lemmatizer.punct('“') == ['"']
assert en_lemmatizer.punct('”') == ['"']
@pytest.mark.models('en')

View File

@ -75,3 +75,11 @@ def test_en_models_probs(example):
assert not prob0 == prob1
assert not prob0 == prob2
assert not prob1 == prob2
@pytest.mark.models('en')
def test_no_vectors_similarity(EN):
doc1 = EN(u'hallo')
doc2 = EN(u'hi')
assert doc1.similarity(doc2) > 0

View File

@ -56,7 +56,7 @@ def test_sents_1_2(parser):
doc[1].sent_start = True
doc[2].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 3
assert len(list(doc.sents)) >= 3
def test_sents_1_3(parser):

View File

@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals
import regex as re
from ...lang.en import English
from ...tokenizer import Tokenizer
def test_issue1488():
prefix_re = re.compile(r'''[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']''')
infix_re = re.compile(r'''[-~\.]''')
simple_url_re = re.compile(r'''^https?://''')
def my_tokenizer(nlp):
return Tokenizer(nlp.vocab, {},
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=simple_url_re.match)
nlp = English()
nlp.tokenizer = my_tokenizer(nlp)
doc = nlp("This is a test.")
for token in doc:
assert token.text

View File

@ -72,7 +72,17 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
dY = model.ops.allocate((15, nO, nP))
ids = model.ops.allocate((15, nF))
ids[1,2] = -1
dY[1,2] = 1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 1.
model.d_pad.fill(0.)
ids.fill(0.)
dY.fill(0.)
ids[1,2] = -1
ids[1,1] = -1
ids[1,0] = -1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 3.

View File

@ -10,6 +10,7 @@ import numpy.linalg
import struct
import dill
import msgpack
from thinc.neural.util import get_array_module, copy_array
from libc.string cimport memcpy, memset
from libc.math cimport sqrt
@ -306,9 +307,9 @@ cdef class Doc:
def __get__(self):
if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self)
elif any(token.has_vector for token in self):
elif self.vocab.vectors.data.size:
return True
elif self.tensor is not None:
elif self.tensor.size:
return True
else:
return False
@ -329,13 +330,13 @@ cdef class Doc:
self._vector = numpy.zeros((self.vocab.vectors_length,),
dtype='f')
return self._vector
elif self.has_vector:
elif self.vocab.vectors.data.size > 0:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
for token in self.c[:self.length]:
vector += self.vocab.get_vector(token.lex.orth)
self._vector = vector / len(self)
return self._vector
elif self.tensor is not None:
elif self.tensor.size > 0:
self._vector = self.tensor.mean(axis=0)
return self._vector
else:
@ -827,6 +828,23 @@ cdef class Doc:
attrs[:, 2:])
return self
def extend_tensor(self, tensor):
'''Concatenate a new tensor onto the doc.tensor object.
The doc.tensor attribute holds dense feature vectors
computed by the models in the pipeline. Let's say a
document with 30 words has a tensor with 128 dimensions
per word. doc.tensor.shape will be (30, 128). After
calling doc.extend_tensor with an array of shape (30, 64),
doc.tensor.shape will be (30, 192).
'''
xp = get_array_module(self.tensor)
if self.tensor.size == 0:
self.tensor.resize(tensor.shape)
copy_array(self.tensor, tensor)
else:
self.tensor = xp.hstack((self.tensor, tensor))
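
A usage sketch for `extend_tensor`, with hypothetical sizes matching the docstring:

    import numpy
    doc.tensor = numpy.zeros((30, 128), dtype='f')     # hypothetical existing tensor
    doc.extend_tensor(numpy.zeros((30, 64), dtype='f'))
    assert doc.tensor.shape == (30, 192)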
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If

View File

@ -283,7 +283,12 @@ cdef class Span:
def __get__(self):
if 'has_vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['has_vector'](self)
return any(token.has_vector for token in self)
elif self.vocab.vectors.data.size > 0:
return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True
else:
return False
property vector:
"""A real-valued meaning representation. Defaults to an average of the

View File

@ -292,6 +292,8 @@ cdef class Token:
def __get__(self):
if 'has_vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['has_vector'](self)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return True
return self.vocab.has_vector(self.c.lex.orth)
property vector:
@ -303,7 +305,10 @@ cdef class Token:
def __get__(self):
if 'vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector'](self)
return self.vocab.get_vector(self.c.lex.orth)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return self.doc.tensor[self.i]
else:
return self.vocab.get_vector(self.c.lex.orth)
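
With no pretrained vectors loaded, a token's vector now falls back to its row of `doc.tensor`, which is what keeps `doc.similarity` meaningful in the new `test_no_vectors_similarity` regression test above. A sketch, assuming a pipeline has filled `doc.tensor` and `vocab.vectors` is empty:

    vec = doc[0].vector                        # returns doc.tensor[0]
    assert vec.shape == (doc.tensor.shape[1],)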
property vector_norm:
"""The L2 norm of the token's vector representation.

View File

@ -11,9 +11,8 @@ if environment == "deploy"
script(src="/assets/js/vendor/prism.min.js")
if SECTION == "models"
if compare_models
script(src="/assets/js/vendor/chart.min.js")
script(src="/assets/js/models.js?v#{V_JS}" type="module")
script
if quickstart
@ -24,15 +23,15 @@ script
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');
if IS_PAGE
script
if IS_PAGE
| ((window.gitter = {}).chat = {}).options = {
| useStyles: false,
| activationElement: '.js-gitter-button',
| targetElement: '.js-gitter',
| room: '!{SOCIAL.gitter}'
| };
if IS_PAGE
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)
@ -48,39 +47,36 @@ if IS_PAGE
- ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
- ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"
//- Browsers with JS module support.
Will be ignored otherwise.
script(type="module")
| import ProgressBar from '/assets/js/progress.js';
!=ProgressBar
if changelog
| import Changelog from '/assets/js/changelog.js';
!=Changelog
if IS_PAGE
| import NavHighlighter from '/assets/js/nav-highlighter.js';
!=NavHighlighter
| import GitHubEmbed from '/assets/js/github-embed.js';
!=GitHubEmbed
if HAS_MODELS
| import { ModelLoader } from '/assets/js/models.js';
!=ModelLoader
if compare_models
| import { ModelComparer } from '/assets/js/models.js';
!=ModelComparer
//- Browsers with no JS module support.
Won't be fetched or interpreted otherwise.
script(nomodule src="/assets/js/rollup.js")
script(nomodule)
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModeLoader
if compare_models
!=ModelComparer
if environment == "deploy"
//- DEPLOY: use compiled rollup.js and instantiate classes directly
script(src="/assets/js/rollup.js")
script
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModelLoader
if compare_models
!=ModelComparer
else
//- DEVELOPMENT: Use ES6 modules
script(type="module")
| import ProgressBar from '/assets/js/progress.js';
!=ProgressBar
if changelog
| import Changelog from '/assets/js/changelog.js';
!=Changelog
if IS_PAGE
| import NavHighlighter from '/assets/js/nav-highlighter.js';
!=NavHighlighter
| import GitHubEmbed from '/assets/js/github-embed.js';
!=GitHubEmbed
if HAS_MODELS
| import { ModelLoader } from '/assets/js/models.js';
!=ModelLoader
if compare_models
| import { ModelComparer } from '/assets/js/models.js';
!=ModelComparer

View File

@ -198,6 +198,7 @@ export class ModelComparer {
this.fonts = CHART_FONTS;
this.defaultModels = defaultModels;
this.tpl.get('result').style.display = 'block';
this.tpl.get('error').style.display = 'none';
this.fetchCompat()
.then(compat => this.init(compat))
.catch(this.showError.bind(this))

View File

@ -40,13 +40,10 @@
},
"MODELS": {
"en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"],
"de": ["de_core_news_sm", "de_core_news_md"],
"es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"],
"pt": ["pt_core_news_sm"],
"fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"],
"en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
"de": ["de_core_news_sm"],
"es": ["es_core_news_sm", "es_core_news_md"],
"it": ["it_core_news_sm"],
"nl": ["nl_core_news_sm"],
"xx": ["xx_ent_wiki_sm"]
},

View File

@ -218,7 +218,7 @@ p
| If an exception consists of more than one token, the #[code ORTH] values
| combined always need to #[strong match the original string]. The way the
| original string is split up can be pretty arbitrary sometimes for
| example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to").
| example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
| Because of how the tokenizer works, it's currently not possible to split
| single-letter strings into multiple tokens.

View File

@ -198,11 +198,11 @@ p
| #[code .finditer()] methods:
+code.
import re
import regex as re
from spacy.tokenizer import Tokenizer
prefix_re = re.compile(r'''[\[\(&quot;&apos;]''')
suffix_re = re.compile(r'''[\]\)&quot;&apos;]''')
prefix_re = re.compile(r'''^[\[\(&quot;&apos;]''')
suffix_re = re.compile(r'''[\]\)&quot;&apos;]$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')
@ -220,6 +220,17 @@ p
| specialize are #[code find_prefix], #[code find_suffix] and
| #[code find_infix].
+infobox("Important note", "⚠️")
| When customising the prefix, suffix and infix handling, remember that
| you're passing in #[strong functions] for spaCy to execute, e.g.
| #[code prefix_re.search] not just the regular expressions. This means
| that your functions also need to define how the rules should be applied.
| For example, if you're adding your own prefix rules, you need
| to make sure they're only applied to characters at the
| #[strong beginning of a token], e.g. by adding #[code ^]. Similarly,
| suffix rules should only be applied at the #[strong end of a token],
| so your expression should end with a #[code $].
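
For instance, with the anchored suffix rule, a closing quote only counts as a suffix when it really is at the end of the string. A quick check, using literal characters in place of the HTML-escaped ones above:

    import regex as re
    suffix_re = re.compile(r'''[\]\)"']$''')
    assert suffix_re.search('(test)"') is not None   # quote at the end: suffix match
    assert suffix_re.search('te"st') is None         # quote mid-token: no match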
+h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline
p