mirror of https://github.com/explosion/spaCy.git

Commit 3ca16ddbd4: Merge branch 'develop' of https://github.com/explosion/spaCy into develop

@@ -87,7 +87,7 @@ U.S. Federal law. Any choice of law rules will not apply.
7. Please place an “x” on one of the applicable statement below. Please do NOT
mark both statements:

* [ ] I am signing on behalf of myself as an individual and no other person
* [x] I am signing on behalf of myself as an individual and no other person
or entity, including my employer, has or will have rights with respect to my
contributions.

@@ -96,11 +96,11 @@ mark both statements:

## Contributor Details

| Field | Entry |
|------------------------------- | -------------------- |
| Name | |
| Company name (if applicable) | |
| Title or role (if applicable) | |
| Date | |
| GitHub username | |
| Website (optional) | |
| Field | Entry |
|------------------------------- | -------------------- |
| Name | Abhinav Sharma |
| Company name (if applicable) | Fourtek I.T. Solutions Pvt. Ltd. |
| Title or role (if applicable) | Machine Learning Engineer |
| Date | 3 November 2017 |
| GitHub username | abhi18av |
| Website (optional) | https://abhi18av.github.io/ |

@@ -409,12 +409,14 @@ def build_tagger_model(nr_class, **cfg):
else:
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_dims=pretrained_dims)
softmax = with_flatten(Softmax(nr_class, token_vector_width))
model = (
tok2vec
>> with_flatten(Softmax(nr_class, token_vector_width))
>> softmax
)
model.nI = None
model.tok2vec = tok2vec
model.softmax = softmax
return model

@@ -391,9 +391,10 @@ class Language(object):
for name, proc in pipes:
if not hasattr(proc, 'update'):
continue
grads = {}
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)
for key, (W, dW) in grads.items():
sgd(W, dW, key=key)

def preprocess_gold(self, docs_golds):
"""Can be called before training to pre-process gold data. By default,

@@ -129,8 +129,14 @@ cdef class Morphology:
tag (unicode): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception.
"""
# TODO: Currently we've assumed that we know the number of tags --
# RichTagC is an array, and _cache is a PreshMapArray
# This is really bad: it makes the morphology typed to the tagger
# classes, which is all wrong.
self.exc[(tag_str, orth_str)] = dict(attrs)
tag = self.strings.add(tag_str)
if tag not in self.reverse_index:
return
tag_id = self.reverse_index[tag]
orth = self.strings[orth_str]
cdef RichTagC rich_tag = self.rich_tags[tag_id]

@@ -11,7 +11,7 @@ import ujson
import msgpack

from thinc.api import chain
from thinc.v2v import Affine, Softmax
from thinc.v2v import Affine, SELU, Softmax
from thinc.t2v import Pooling, max_pool, mean_pool
from thinc.neural.util import to_categorical, copy_array
from thinc.neural._classes.difference import Siamese, CauchySimilarity

@@ -29,7 +29,7 @@ from .compat import json_dumps
from .attrs import POS
from .parts_of_speech import X
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models
from ._ml import link_vectors_to_models, zero_init, flatten
from . import util

@@ -91,8 +91,8 @@ class Pipe(object):
Both __call__ and pipe should delegate to the `predict()`
and `set_annotations()` methods.
"""
scores = self.predict([doc])
self.set_annotations([doc], scores)
scores, tensors = self.predict([doc])
self.set_annotations([doc], scores, tensors=tensors)
return doc

def pipe(self, stream, batch_size=128, n_threads=-1):

@@ -103,8 +103,8 @@ class Pipe(object):
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensor=tensors)
yield from docs

def predict(self, docs):

@@ -113,7 +113,7 @@ class Pipe(object):
"""
raise NotImplementedError

def set_annotations(self, docs, scores):
def set_annotations(self, docs, scores, tensors=None):
"""Modify a batch of documents, using pre-computed scores."""
raise NotImplementedError

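A minimal sketch of the `Pipe` contract after this change, assuming spaCy's development tree at this commit: `predict()` returns a `(scores, tensors)` pair and `set_annotations()` accepts a `tensors` keyword, which `__call__` and `pipe()` now rely on. The subclass name and its placeholder scoring below are hypothetical; only the signatures follow the code above.

```python
import numpy
from spacy.pipeline import Pipe

class MyPipe(Pipe):
    name = 'my_pipe'   # hypothetical component

    def predict(self, docs):
        # Return per-doc scores plus the tensors that produced them,
        # matching the (scores, tensors) pair the base class now expects.
        tensors = [doc.tensor for doc in docs]
        scores = [numpy.zeros((len(doc),), dtype='f') for doc in docs]  # placeholder scoring
        return scores, tensors

    def set_annotations(self, docs, scores, tensors=None):
        for i, doc in enumerate(docs):
            if tensors is not None:
                doc.extend_tensor(tensors[i])
            # ... write scores[i] back onto the doc here
```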
@@ -216,7 +216,7 @@ class Tensorizer(Pipe):
name = 'tensorizer'

@classmethod
def Model(cls, width=128, embed_size=4000, **cfg):
def Model(cls, output_size=300, input_size=384, **cfg):
"""Create a new statistical model for the class.

width (int): Output size of the model.

@@ -224,9 +224,11 @@ class Tensorizer(Pipe):
**cfg: Config parameters.
RETURNS (Model): A `thinc.neural.Model` or similar instance.
"""
width = util.env_opt('token_vector_width', width)
embed_size = util.env_opt('embed_size', embed_size)
return Tok2Vec(width, embed_size, **cfg)
model = chain(
SELU(output_size, input_size),
SELU(output_size, output_size),
zero_init(Affine(output_size, output_size)))
return model

def __init__(self, vocab, model=True, **cfg):
"""Construct a new statistical model. Weights are not allocated on

@@ -244,6 +246,7 @@ class Tensorizer(Pipe):
"""
self.vocab = vocab
self.model = model
self.input_models = []
self.cfg = dict(cfg)
self.cfg['pretrained_dims'] = self.vocab.vectors.data.shape[1]
self.cfg.setdefault('cnn_maxout_pieces', 3)

@@ -269,8 +272,8 @@ class Tensorizer(Pipe):
"""
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tokvecses = self.predict(docs)
self.set_annotations(docs, tokvecses)
tensors = self.predict(docs)
self.set_annotations(docs, tensors)
yield from docs

def predict(self, docs):

@@ -279,18 +282,19 @@ class Tensorizer(Pipe):
docs (iterable): A sequence of `Doc` objects.
RETURNS (object): Vector representations for each token in the docs.
"""
tokvecs = self.model(docs)
return tokvecs
inputs = self.model.ops.flatten([doc.tensor for doc in docs])
outputs = self.model(inputs)
return self.model.ops.unflatten(outputs, [len(d) for d in docs])

def set_annotations(self, docs, tokvecses):
def set_annotations(self, docs, tensors):
"""Set the tensor attribute for a batch of documents.

docs (iterable): A sequence of `Doc` objects.
tokvecs (object): Vector representation for each token in the docs.
tensors (object): Vector representation for each token in the docs.
"""
for doc, tokvecs in zip(docs, tokvecses):
assert tokvecs.shape[0] == len(doc)
doc.tensor = tokvecs
for doc, tensor in zip(docs, tensors):
assert tensor.shape[0] == len(doc)
doc.tensor = tensor

def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
"""Update the model.

@@ -303,11 +307,34 @@ class Tensorizer(Pipe):
"""
if isinstance(docs, Doc):
docs = [docs]
tokvecs, bp_tokvecs = self.model.begin_update(docs, drop=drop)
return tokvecs, bp_tokvecs
inputs = []
bp_inputs = []
for tok2vec in self.input_models:
tensor, bp_tensor = tok2vec.begin_update(docs, drop=drop)
inputs.append(tensor)
bp_inputs.append(bp_tensor)
inputs = self.model.ops.xp.hstack(inputs)
scores, bp_scores = self.model.begin_update(inputs, drop=drop)
loss, d_scores = self.get_loss(docs, golds, scores)
d_inputs = bp_scores(d_scores, sgd=sgd)
d_inputs = self.model.ops.xp.split(d_inputs, len(self.input_models), axis=1)
for d_input, bp_input in zip(d_inputs, bp_inputs):
bp_input(d_input, sgd=sgd)
if losses is not None:
losses.setdefault(self.name, 0.)
losses[self.name] += loss
return loss

def get_loss(self, docs, golds, scores):
raise NotImplementedError
def get_loss(self, docs, golds, prediction):
target = []
i = 0
for doc in docs:
vectors = self.model.ops.xp.vstack([w.vector for w in doc])
target.append(vectors)
target = self.model.ops.xp.vstack(target)
d_scores = (prediction - target) / prediction.shape[0]
loss = (d_scores**2).sum()
return loss, d_scores

def begin_training(self, gold_tuples=tuple(), pipeline=None):
"""Allocate models, pre-process training data and acquire a trainer and

@@ -316,8 +343,13 @@ class Tensorizer(Pipe):
gold_tuples (iterable): Gold-standard training data.
pipeline (list): The pipeline the model is part of.
"""
for name, model in pipeline:
if getattr(model, 'tok2vec', None):
self.input_models.append(model.tok2vec)
if self.model is True:
self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.cfg['input_size'] = 384
self.cfg['output_size'] = 300
#self.cfg['pretrained_dims'] = self.vocab.vectors_length
self.model = self.Model(**self.cfg)
link_vectors_to_models(self.vocab)

@@ -337,28 +369,37 @@ class Tagger(Pipe):
def labels(self):
return self.vocab.morphology.tag_names

@property
def tok2vec(self):
if self.model in (None, True, False):
return None
else:
return chain(self.model.tok2vec, flatten)

def __call__(self, doc):
tags = self.predict([doc])
self.set_annotations([doc], tags)
tags, tokvecs = self.predict([doc])
self.set_annotations([doc], tags, tensors=tokvecs)
return doc

def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
tag_ids = self.predict(docs)
self.set_annotations(docs, tag_ids)
tag_ids, tokvecs = self.predict(docs)
self.set_annotations(docs, tag_ids, tensors=tokvecs)
yield from docs

def predict(self, docs):
scores = self.model(docs)
scores = self.model.ops.flatten(scores)
guesses = scores.argmax(axis=1)
if not isinstance(guesses, numpy.ndarray):
guesses = guesses.get()
guesses = self.model.ops.unflatten(guesses, [len(d) for d in docs])
return guesses
tokvecs = self.model.tok2vec(docs)
scores = self.model.softmax(tokvecs)
guesses = []
for doc_scores in scores:
doc_guesses = doc_scores.argmax(axis=1)
if not isinstance(doc_guesses, numpy.ndarray):
doc_guesses = doc_guesses.get()
guesses.append(doc_guesses)
return guesses, tokvecs

def set_annotations(self, docs, batch_tag_ids):
def set_annotations(self, docs, batch_tag_ids, tensors=None):
if isinstance(docs, Doc):
docs = [docs]
cdef Doc doc

@@ -373,6 +414,8 @@ class Tagger(Pipe):
if doc.c[j].tag == 0 and doc.c[j].pos == 0:
vocab.morphology.assign_tag_id(&doc.c[j], tag_id)
idx += 1
if tensors is not None:
doc.extend_tensor(tensors[i])
doc.is_tagged = True

def update(self, docs, golds, drop=0., sgd=None, losses=None):

@@ -573,7 +616,7 @@ class MultitaskObjective(Tagger):
def labels(self, value):
self.cfg['labels'] = value

def set_annotations(self, docs, dep_ids):
def set_annotations(self, docs, dep_ids, tensors=None):
pass

def begin_training(self, gold_tuples=tuple(), pipeline=None, tok2vec=None):

@@ -720,15 +763,15 @@ class TextCategorizer(Pipe):
self.cfg['labels'] = value

def __call__(self, doc):
scores = self.predict([doc])
self.set_annotations([doc], scores)
scores, tensors = self.predict([doc])
self.set_annotations([doc], scores, tensors=tensors)
return doc

def pipe(self, stream, batch_size=128, n_threads=-1):
for docs in cytoolz.partition_all(batch_size, stream):
docs = list(docs)
scores = self.predict(docs)
self.set_annotations(docs, scores)
scores, tensors = self.predict(docs)
self.set_annotations(docs, scores, tensors=tensors)
yield from docs

def predict(self, docs):

@@ -736,8 +779,10 @@ class TextCategorizer(Pipe):
scores = self.model.ops.asarray(scores)
return scores

def set_annotations(self, docs, scores):
def set_annotations(self, docs, scores, tensors=None):
for i, doc in enumerate(docs):
if tensors is not None:
doc.extend_tensor(tensors[i])
for j, label in enumerate(self.labels):
doc.cats[label] = float(scores[i, j])

@@ -1,6 +1,7 @@
# cython: infer_types=True
# cython: cdivision=True
# cython: boundscheck=False
# cython: profile=True
# coding: utf-8
from __future__ import unicode_literals, print_function

@@ -322,15 +323,17 @@ cdef class Parser:
beam_density = self.cfg.get('beam_density', 0.0)
cdef Beam beam
if beam_width == 1:
states = self.parse_batch([doc])
self.set_annotations([doc], states)
states, tokvecs = self.parse_batch([doc])
self.set_annotations([doc], states, tensors=tokvecs)
return doc
else:
beam = self.beam_parse([doc],
beam_width=beam_width, beam_density=beam_density)[0]
beams, tokvecs = self.beam_parse([doc],
beam_width=beam_width,
beam_density=beam_density)
beam = beams[0]
output = self.moves.get_beam_annot(beam)
state = <StateClass>beam.at(0)
self.set_annotations([doc], [state])
self.set_annotations([doc], [state], tensors=tokvecs)
_cleanup(beam)
return output

@@ -356,15 +359,16 @@ cdef class Parser:
for subbatch in cytoolz.partition_all(8, by_length):
subbatch = list(subbatch)
if beam_width == 1:
parse_states = self.parse_batch(subbatch)
parse_states, tokvecs = self.parse_batch(subbatch)
beams = []
else:
beams = self.beam_parse(subbatch, beam_width=beam_width,
beam_density=beam_density)
beams, tokvecs = self.beam_parse(subbatch,
beam_width=beam_width,
beam_density=beam_density)
parse_states = []
for beam in beams:
parse_states.append(<StateClass>beam.at(0))
self.set_annotations(subbatch, parse_states)
self.set_annotations(subbatch, parse_states, tensors=tokvecs)
yield from batch

def parse_batch(self, docs):

@@ -411,7 +415,9 @@ cdef class Parser:
feat_weights, bias, hW, hb,
nr_class, nr_hidden, nr_feat, nr_piece)
PyErr_CheckSignals()
return state_objs
tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs])
return state_objs, tokvecs

cdef void _parseC(self, StateC* state,
const float* feat_weights, const float* bias,

@@ -508,7 +514,9 @@ cdef class Parser:
beam.advance(_transition_state, _hash_state, <void*>self.moves.c)
beam.check_done(_check_final_state, NULL)
beams.append(beam)
return beams
tokvecs = self.model[0].ops.unflatten(tokvecs,
[len(doc) for doc in docs])
return beams, tokvecs

def update(self, docs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):

@@ -735,18 +743,29 @@ cdef class Parser:
c_d_scores += d_scores.shape[1]
return d_scores

def set_annotations(self, docs, states):
def set_annotations(self, docs, states, tensors=None):
cdef StateClass state
cdef Doc doc
for state, doc in zip(states, docs):
for i, (state, doc) in enumerate(zip(states, docs)):
self.moves.finalize_state(state.c)
for i in range(doc.length):
doc.c[i] = state.c._sent[i]
for j in range(doc.length):
doc.c[j] = state.c._sent[j]
if tensors is not None:
doc.extend_tensor(tensors[i])
self.moves.finalize_doc(doc)

for hook in self.postprocesses:
for doc in docs:
hook(doc)

@property
def tok2vec(self):
'''Return the embedding and convolutional layer of the model.'''
if self.model in (None, True, False):
return None
else:
return self.model[0]

@property
def postprocesses(self):
# Available for subclasses, e.g. to deprojectivize

@@ -22,35 +22,37 @@ def test_doc_lemmatization(EN):
("ring", ["ring"]),
("axes", ["axis", "axe", "ax"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
assert en_lemmatizer.noun(text) == lemmas


@pytest.mark.models('en')
@pytest.mark.parametrize('text,lemmas', [("bleed", ["bleed"]),
("feed", ["feed"]),
("need", ["need"]),
("ring", ["ring"]),
("axes", ["axis", "axe", "ax"])])
("ring", ["ring"])])
def test_en_lemmatizer_noun_lemmas(en_lemmatizer, text, lemmas):
assert en_lemmatizer.noun(text) == set(lemmas)
# Cases like this are problematic -- not clear what we should do to resolve
# ambiguity?
# ("axes", ["ax", "axes", "axis"])])
assert en_lemmatizer.noun(text) == lemmas


@pytest.mark.xfail
@pytest.mark.models('en')
def test_en_lemmatizer_base_forms(en_lemmatizer):
assert en_lemmatizer.noun('dive', {'number': 'sing'}) == set(['dive'])
assert en_lemmatizer.noun('dive', {'number': 'plur'}) == set(['diva'])
assert en_lemmatizer.noun('dive', {'number': 'sing'}) == ['dive']
assert en_lemmatizer.noun('dive', {'number': 'plur'}) == ['diva']


@pytest.mark.models('en')
def test_en_lemmatizer_base_form_verb(en_lemmatizer):
assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == set(['see'])
assert en_lemmatizer.verb('saw', {'verbform': 'past'}) == ['see']


@pytest.mark.models('en')
def test_en_lemmatizer_punct(en_lemmatizer):
assert en_lemmatizer.punct('“') == set(['"'])
assert en_lemmatizer.punct('“') == set(['"'])
assert en_lemmatizer.punct('“') == ['"']
assert en_lemmatizer.punct('“') == ['"']


@pytest.mark.models('en')

@@ -75,3 +75,11 @@ def test_en_models_probs(example):
assert not prob0 == prob1
assert not prob0 == prob2
assert not prob1 == prob2


@pytest.mark.models('en')
def test_no_vectors_similarity(EN):
doc1 = EN(u'hallo')
doc2 = EN(u'hi')
assert doc1.similarity(doc2) > 0

@@ -56,7 +56,7 @@ def test_sents_1_2(parser):
doc[1].sent_start = True
doc[2].sent_start = True
doc = parser(doc)
assert len(list(doc.sents)) == 3
assert len(list(doc.sents)) >= 3


def test_sents_1_3(parser):

@@ -0,0 +1,26 @@
# coding: utf8
from __future__ import unicode_literals

import regex as re
from ...lang.en import English
from ...tokenizer import Tokenizer


def test_issue1488():
prefix_re = re.compile(r'''[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']''')
infix_re = re.compile(r'''[-~\.]''')
simple_url_re = re.compile(r'''^https?://''')

def my_tokenizer(nlp):
return Tokenizer(nlp.vocab, {},
prefix_search=prefix_re.search,
suffix_search=suffix_re.search,
infix_finditer=infix_re.finditer,
token_match=simple_url_re.match)

nlp = English()
nlp.tokenizer = my_tokenizer(nlp)
doc = nlp("This is a test.")
for token in doc:
assert token.text

@@ -72,7 +72,17 @@ def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
dY = model.ops.allocate((15, nO, nP))
ids = model.ops.allocate((15, nF))
ids[1,2] = -1
dY[1,2] = 1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 1.
model.d_pad.fill(0.)
ids.fill(0.)
dY.fill(0.)
ids[1,2] = -1
ids[1,1] = -1
ids[1,0] = -1
dY[1] = 1
assert model.d_pad[0, 2, 0, 0] == 0.
model._backprop_padding(dY, ids)
assert model.d_pad[0, 2, 0, 0] == 3.

@@ -10,6 +10,7 @@ import numpy.linalg
import struct
import dill
import msgpack
from thinc.neural.util import get_array_module, copy_array

from libc.string cimport memcpy, memset
from libc.math cimport sqrt

@@ -306,9 +307,9 @@ cdef class Doc:
def __get__(self):
if 'has_vector' in self.user_hooks:
return self.user_hooks['has_vector'](self)
elif any(token.has_vector for token in self):
elif self.vocab.vectors.data.size:
return True
elif self.tensor is not None:
elif self.tensor.size:
return True
else:
return False

@@ -329,13 +330,13 @@ cdef class Doc:
self._vector = numpy.zeros((self.vocab.vectors_length,),
dtype='f')
return self._vector
elif self.has_vector:
elif self.vocab.vectors.data.size > 0:
vector = numpy.zeros((self.vocab.vectors_length,), dtype='f')
for token in self.c[:self.length]:
vector += self.vocab.get_vector(token.lex.orth)
self._vector = vector / len(self)
return self._vector
elif self.tensor is not None:
elif self.tensor.size > 0:
self._vector = self.tensor.mean(axis=0)
return self._vector
else:

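With the change above, `doc.vector` falls back to the mean of `doc.tensor` (and `doc.has_vector` reports True) when the vocabulary has no word vectors loaded. A small hedged sketch of that behaviour; the tensor values are illustrative.

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=['hello', 'world'])   # vocab without word vectors
doc.tensor = numpy.asarray([[1., 2.], [3., 4.]], dtype='f')
assert doc.has_vector                          # the tensor counts as a vector source
assert numpy.allclose(doc.vector, doc.tensor.mean(axis=0))
```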
@@ -827,6 +828,23 @@ cdef class Doc:
attrs[:, 2:])
return self

def extend_tensor(self, tensor):
'''Concatenate a new tensor onto the doc.tensor object.

The doc.tensor attribute holds dense feature vectors
computed by the models in the pipeline. Let's say a
document with 30 words has a tensor with 128 dimensions
per word. doc.tensor.shape will be (30, 128). After
calling doc.extend_tensor with an array of shape (30, 64),
doc.tensor == (30, 192).
'''
xp = get_array_module(self.tensor)
if self.tensor.size == 0:
self.tensor.resize(tensor.shape)
copy_array(self.tensor, tensor)
else:
self.tensor = xp.hstack((self.tensor, tensor))

def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If

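A quick hedged sketch of the new `Doc.extend_tensor()` helper documented above: an array with the same number of rows is concatenated column-wise onto the existing tensor (shapes follow the docstring's example, with two tokens instead of 30).

```python
import numpy
from spacy.vocab import Vocab
from spacy.tokens import Doc

doc = Doc(Vocab(), words=['hello', 'world'])
doc.tensor = numpy.zeros((2, 128), dtype='f')    # e.g. set by the tensorizer
doc.extend_tensor(numpy.ones((2, 64), dtype='f'))
assert doc.tensor.shape == (2, 192)              # 128 + 64 columns per token
```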
@@ -283,7 +283,12 @@ cdef class Span:
def __get__(self):
if 'has_vector' in self.doc.user_span_hooks:
return self.doc.user_span_hooks['has_vector'](self)
return any(token.has_vector for token in self)
elif self.vocab.vectors.data.size > 0:
return any(token.has_vector for token in self)
elif self.doc.tensor.size > 0:
return True
else:
return False

property vector:
"""A real-valued meaning representation. Defaults to an average of the

@@ -292,6 +292,8 @@ cdef class Token:
def __get__(self):
if 'has_vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['has_vector'](self)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return True
return self.vocab.has_vector(self.c.lex.orth)

property vector:

@@ -303,7 +305,10 @@ cdef class Token:
def __get__(self):
if 'vector' in self.doc.user_token_hooks:
return self.doc.user_token_hooks['vector'](self)
return self.vocab.get_vector(self.c.lex.orth)
if self.vocab.vectors.size == 0 and self.doc.tensor.size != 0:
return self.doc.tensor[self.i]
else:
return self.vocab.get_vector(self.c.lex.orth)

property vector_norm:
"""The L2 norm of the token's vector representation.

@@ -11,9 +11,8 @@ if environment == "deploy"

script(src="/assets/js/vendor/prism.min.js")

if SECTION == "models"
if compare_models
script(src="/assets/js/vendor/chart.min.js")
script(src="/assets/js/models.js?v#{V_JS}" type="module")

script
if quickstart

@@ -24,15 +23,15 @@ script
| (ga.q=ga.q||[]).push(arguments)}; ga.l=+new Date;
| ga('create', '#{ANALYTICS}', 'auto'); ga('send', 'pageview');


if IS_PAGE
script
if IS_PAGE
| ((window.gitter = {}).chat = {}).options = {
| useStyles: false,
| activationElement: '.js-gitter-button',
| targetElement: '.js-gitter',
| room: '!{SOCIAL.gitter}'
| };

if IS_PAGE
script(src="https://sidecar.gitter.im/dist/sidecar.v1.js" async defer)

@@ -48,39 +47,36 @@ if IS_PAGE
- ModelLoader = "new ModelLoader('" + MODELS_REPO + "'," + JSON.stringify(CURRENT_MODELS) + "," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + ");"
- ModelComparer = "new ModelComparer('" + MODELS_REPO + "'," + JSON.stringify(MODEL_LICENSES) + "," + JSON.stringify(MODEL_BENCHMARKS) + "," + JSON.stringify(LANGUAGES) + "," + JSON.stringify(MODEL_META) + "," + JSON.stringify(default_models || false) + ");"

//- Browsers with JS module support.
Will be ignored otherwise.

script(type="module")
| import ProgressBar from '/assets/js/progress.js';
!=ProgressBar
if changelog
| import Changelog from '/assets/js/changelog.js';
!=Changelog
if IS_PAGE
| import NavHighlighter from '/assets/js/nav-highlighter.js';
!=NavHighlighter
| import GitHubEmbed from '/assets/js/github-embed.js';
!=GitHubEmbed
if HAS_MODELS
| import { ModelLoader } from '/assets/js/models.js';
!=ModelLoader
if compare_models
| import { ModelComparer } from '/assets/js/models.js';
!=ModelComparer

//- Browsers with no JS module support.
Won't be fetched or interpreted otherwise.

script(nomodule src="/assets/js/rollup.js")
script(nomodule)
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModeLoader
if compare_models
!=ModelComparer
if environment == "deploy"
//- DEPLOY: use compiled rollup.js and instantiate classes directly
script(src="/assets/js/rollup.js")
script
!=ProgressBar
if changelog
!=Changelog
if IS_PAGE
!=NavHighlighter
!=GitHubEmbed
if HAS_MODELS
!=ModelLoader
if compare_models
!=ModelComparer
else
//- DEVELOPMENT: Use ES6 modules
script(type="module")
| import ProgressBar from '/assets/js/progress.js';
!=ProgressBar
if changelog
| import Changelog from '/assets/js/changelog.js';
!=Changelog
if IS_PAGE
| import NavHighlighter from '/assets/js/nav-highlighter.js';
!=NavHighlighter
| import GitHubEmbed from '/assets/js/github-embed.js';
!=GitHubEmbed
if HAS_MODELS
| import { ModelLoader } from '/assets/js/models.js';
!=ModelLoader
if compare_models
| import { ModelComparer } from '/assets/js/models.js';
!=ModelComparer

@@ -198,6 +198,7 @@ export class ModelComparer {
this.fonts = CHART_FONTS;
this.defaultModels = defaultModels;
this.tpl.get('result').style.display = 'block';
this.tpl.get('error').style.display = 'none';
this.fetchCompat()
.then(compat => this.init(compat))
.catch(this.showError.bind(this))

@@ -40,13 +40,10 @@
},

"MODELS": {
"en": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_vectors_web_lg"],
"de": ["de_core_news_sm", "de_core_news_md"],
"es": ["es_core_news_sm", "es_core_news_md", "es_vectors_web_lg"],
"pt": ["pt_core_news_sm"],
"fr": ["fr_core_news_sm", "fr_core_news_md", "fr_vectors_web_lg"],
"en": ["en_core_web_sm", "en_core_web_lg", "en_vectors_web_lg"],
"de": ["de_core_news_sm"],
"es": ["es_core_news_sm", "es_core_news_md"],
"it": ["it_core_news_sm"],
"nl": ["nl_core_news_sm"],
"xx": ["xx_ent_wiki_sm"]
},

@@ -218,7 +218,7 @@ p
| If an exception consists of more than one token, the #[code ORTH] values
| combined always need to #[strong match the original string]. The way the
| original string is split up can be pretty arbitrary sometimes – for
| example "gonna" is split into "gon" (lemma "go") nad "na" (lemma "to").
| example "gonna" is split into "gon" (lemma "go") and "na" (lemma "to").
| Because of how the tokenizer works, it's currently not possible to split
| single-letter strings into multiple tokens.

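The rule described above can be illustrated with a minimal, hedged example (standalone Python, not part of the patch): the ORTH values of a special case have to concatenate back to the exact original string, while the lemmas are free.

```python
from spacy.lang.en import English
from spacy.symbols import ORTH, LEMMA

nlp = English()
# "gon" + "na" == "gonna", so the special case is accepted.
nlp.tokenizer.add_special_case('gonna', [
    {ORTH: 'gon', LEMMA: 'go'},
    {ORTH: 'na', LEMMA: 'to'},
])
doc = nlp(u"I'm gonna go")
assert [t.text for t in doc] == ['I', "'m", 'gon', 'na', 'go']
```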
@@ -198,11 +198,11 @@ p
| #[code .finditer()] methods:

+code.
import re
import regex as re
from spacy.tokenizer import Tokenizer

prefix_re = re.compile(r'''[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']''')
prefix_re = re.compile(r'''^[\[\("']''')
suffix_re = re.compile(r'''[\]\)"']$''')
infix_re = re.compile(r'''[-~]''')
simple_url_re = re.compile(r'''^https?://''')

@@ -220,6 +220,17 @@ p
| specialize are #[code find_prefix], #[code find_suffix] and
| #[code find_infix].

+infobox("Important note", "⚠️")
| When customising the prefix, suffix and infix handling, remember that
| you're passing in #[strong functions] for spaCy to execute, e.g.
| #[code prefix_re.search] – not just the regular expressions. This means
| that your functions also need to define how the rules should be applied.
| For example, if you're adding your own prefix rules, you need
| to make sure they're only applied to characters at the
| #[strong beginning of a token], e.g. by adding #[code ^]. Similarly,
| suffix rules should only be applied at the #[strong end of a token],
| so your expression should end with a #[code $].

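A small illustration of the anchoring advice in the note above, using plain `re` and made-up strings: without `^`, a prefix pattern also fires on characters in the middle of a token.

```python
import re

unanchored = re.compile(r'''[\[\("']''')
anchored = re.compile(r'''^[\[\("']''')

# The unanchored pattern "finds" the apostrophe inside the token...
assert unanchored.search("don't") is not None
# ...while the anchored one only matches at the very start of the string.
assert anchored.search("don't") is None
assert anchored.search("'hello") is not None
```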
+h(3, "custom-tokenizer") Hooking an arbitrary tokenizer into the pipeline

p