From 212c8f071180c9ce134a74b85603e48c14199595 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 00:25:54 +0200 Subject: [PATCH 01/19] Implement new Language methods and pipeline API --- spacy/language.py | 260 ++++++++++++++++++++++++++-------------------- spacy/util.py | 6 +- 2 files changed, 150 insertions(+), 116 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index c49c64b1d..91644aec0 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -70,59 +70,7 @@ class BaseDefaults(object): prefix_search=prefix_search, suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match) - @classmethod - def create_tagger(cls, nlp=None, **cfg): - if nlp is None: - return NeuralTagger(cls.create_vocab(nlp), **cfg) - else: - return NeuralTagger(nlp.vocab, **cfg) - - @classmethod - def create_parser(cls, nlp=None, **cfg): - if nlp is None: - return NeuralDependencyParser(cls.create_vocab(nlp), **cfg) - else: - return NeuralDependencyParser(nlp.vocab, **cfg) - - @classmethod - def create_entity(cls, nlp=None, **cfg): - if nlp is None: - return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg) - else: - return NeuralEntityRecognizer(nlp.vocab, **cfg) - - @classmethod - def create_pipeline(cls, nlp=None, disable=tuple()): - meta = nlp.meta if nlp is not None else {} - # Resolve strings, like "cnn", "lstm", etc - pipeline = [] - for entry in meta.get('pipeline', []): - if entry in disable or getattr(entry, 'name', entry) in disable: - continue - factory = cls.Defaults.factories[entry] - pipeline.append(factory(nlp, **meta.get(entry, {}))) - return pipeline - - factories = { - 'make_doc': create_tokenizer, - 'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], - 'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], - 'parser': lambda nlp, **cfg: [ - NeuralDependencyParser(nlp.vocab, **cfg), - nonproj.deprojectivize], - 'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], - 'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)], - 'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)], - # Temporary compatibility -- delete after pivot - 'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], - 'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], - 'dependencies': lambda nlp, **cfg: [ - NeuralDependencyParser(nlp.vocab, **cfg), - nonproj.deprojectivize, - ], - 'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], - } - + pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] token_match = TOKEN_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) @@ -152,8 +100,17 @@ class Language(object): Defaults = BaseDefaults lang = None - def __init__(self, vocab=True, make_doc=True, pipeline=None, - meta={}, disable=tuple(), **kwargs): + factories = { + 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), + 'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), + 'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), + 'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), # nonproj.deprojectivize, + 'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), + 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), + 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) + } + + def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): """Initialise a Language object. vocab (Vocab): A `Vocab` object. 
If `True`, a vocab is created via @@ -179,28 +136,7 @@ class Language(object): factory = self.Defaults.create_tokenizer make_doc = factory(self, **meta.get('tokenizer', {})) self.tokenizer = make_doc - if pipeline is True: - self.pipeline = self.Defaults.create_pipeline(self, disable) - elif pipeline: - # Careful not to do getattr(p, 'name', None) here - # If we had disable=[None], we'd disable everything! - self.pipeline = [p for p in pipeline - if p not in disable - and getattr(p, 'name', p) not in disable] - # Resolve strings, like "cnn", "lstm", etc - for i, entry in enumerate(self.pipeline): - if entry in self.Defaults.factories: - factory = self.Defaults.factories[entry] - self.pipeline[i] = factory(self, **meta.get(entry, {})) - else: - self.pipeline = [] - flat_list = [] - for pipe in self.pipeline: - if isinstance(pipe, list): - flat_list.extend(pipe) - else: - flat_list.append(pipe) - self.pipeline = flat_list + self.pipeline = [] self._optimizer = None @property @@ -214,11 +150,7 @@ class Language(object): self._meta.setdefault('email', '') self._meta.setdefault('url', '') self._meta.setdefault('license', '') - pipeline = [] - for component in self.pipeline: - if hasattr(component, 'name'): - pipeline.append(component.name) - self._meta['pipeline'] = pipeline + self._meta['pipeline'] = self.pipe_names return self._meta @meta.setter @@ -228,31 +160,133 @@ class Language(object): # Conveniences to access pipeline components @property def tensorizer(self): - return self.get_component('tensorizer') + return self.get_pipe('tensorizer') @property def tagger(self): - return self.get_component('tagger') + return self.get_pipe('tagger') @property def parser(self): - return self.get_component('parser') + return self.get_pipe('parser') @property def entity(self): - return self.get_component('ner') + return self.get_pipe('ner') @property def matcher(self): - return self.get_component('matcher') + return self.get_pipe('matcher') - def get_component(self, name): - if self.pipeline in (True, None): - return None - for proc in self.pipeline: - if hasattr(proc, 'name') and proc.name.endswith(name): - return proc - return None + @property + def pipe_names(self): + """Get names of available pipeline components. + + RETURNS (list): List of component name strings, in order. + """ + return [pipe_name for pipe_name, _ in self.pipeline] + + def get_pipe(self, name): + """Get a pipeline component for a given component name. + + name (unicode): Name of pipeline component to get. + RETURNS (callable): The pipeline component. + """ + for pipe_name, component in self.pipeline: + if pipe_name == name: + return component + msg = "No component '{}' found in pipeline. Available names: {}" + raise KeyError(msg.format(name, self.pipe_names)) + + def create_pipe(self, name, config=dict()): + """Create a pipeline component from a factory. + + name (unicode): Factory name to look up in `Language.factories`. + RETURNS (callable): Pipeline component. + """ + if name not in self.factories: + raise KeyError("Can't find factory for '{}'.".format(name)) + factory = self.factories[name] + return factory(self, **config) + + def add_pipe(self, component, name=None, before=None, after=None, + first=None, last=None): + """Add a component to the processing pipeline. Valid components are + callables that take a `Doc` object, modify it and return it. Only one of + before, after, first or last can be set. Default behaviour is "last". + + component (callable): The pipeline component. + name (unicode): Name of pipeline component. 
Overwrites existing + component.name attribute if available. If no name is set and + the component exposes no name attribute, component.__name__ is + used. An error is raised if the name already exists in the pipeline. + before (unicode): Component name to insert component directly before. + after (unicode): Component name to insert component directly after. + first (bool): Insert component first / not first in the pipeline. + last (bool): Insert component last / not last in the pipeline. + + EXAMPLE: + >>> nlp.add_pipe(component, before='ner') + >>> nlp.add_pipe(component, name='custom_name', last=True) + """ + if name is None: + name = getattr(component, 'name', component.__name__) + if name in self.pipe_names: + raise ValueError("'{}' already exists in pipeline.".format(name)) + if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: + msg = ("Invalid constraints. You can only set one of the " + "following: before, after, first, last.") + raise ValueError(msg) + pipe = (name, component) + if last or not any([first, before, after]): + self.pipeline.append(pipe) + elif first: + self.pipeline.insert(0, pipe) + elif before and before in self.pipe_names: + self.pipeline.insert(self.pipe_names.index(before), pipe) + elif after and after in self.pipe_names: + self.pipeline.insert(self.pipe_names.index(after), pipe) + else: + msg = "Can't find '{}' in pipeline. Available names: {}" + unfound = before or after + raise ValueError(msg.format(unfound, self.pipe_names)) + + def replace_pipe(self, name, component): + """Replace a component in the pipeline. + + name (unicode): Name of the component to replace. + component (callable): Pipeline component. + """ + if name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(name, self.pipe_names)) + self.pipeline[self.pipe_names.index(name)] = (name, component) + + def rename_pipe(self, old_name, new_name): + """Rename a pipeline component. + + old_name (unicode): Name of the component to rename. + new_name (unicode): New name of the component. + """ + if old_name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(old_name, self.pipe_names)) + if new_name in self.pipe_names: + msg = "'{}' already exists in pipeline. Existing names: {}" + raise ValueError(msg.format(new_name, self.pipe_names)) + i = self.pipe_names.index(old_name) + self.pipeline[i] = (new_name, self.pipeline[i][1]) + + def remove_pipe(self, name): + """Remove a component from the pipeline. + + name (unicode): Name of the component to remove. + RETURNS (tuple): A (name, component) tuple of the removed component. + """ + if name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(name, self.pipe_names)) + return self.pipeline.pop(self.pipe_names.index(name)) def __call__(self, text, disable=[]): """'Apply the pipeline to some text. 
The text can span multiple sentences, @@ -269,8 +303,7 @@ class Language(object): ('An', 'NN') """ doc = self.make_doc(text) - for proc in self.pipeline: - name = getattr(proc, 'name', None) + for name, proc in self.pipeline: if name in disable: continue doc = proc(doc) @@ -308,7 +341,7 @@ class Language(object): grads[key] = (W, dW) pipes = list(self.pipeline) random.shuffle(pipes) - for proc in pipes: + for name, proc in pipes: if not hasattr(proc, 'update'): continue proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) @@ -322,7 +355,7 @@ class Language(object): docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. """ - for proc in self.pipeline: + for name, proc in self.pipeline: if hasattr(proc, 'preprocess_gold'): docs_golds = proc.preprocess_gold(docs_golds) for doc, gold in docs_golds: @@ -371,7 +404,7 @@ class Language(object): else: device = None link_vectors_to_models(self.vocab) - for proc in self.pipeline: + for name, proc in self.pipeline: if hasattr(proc, 'begin_training'): context = proc.begin_training(get_gold_tuples(), pipeline=self.pipeline) @@ -393,7 +426,7 @@ class Language(object): docs, golds = zip(*docs_golds) docs = list(docs) golds = list(golds) - for pipe in self.pipeline: + for name, pipe in self.pipeline: if not hasattr(pipe, 'pipe'): for doc in docs: pipe(doc) @@ -419,7 +452,7 @@ class Language(object): >>> with nlp.use_params(optimizer.averages): >>> nlp.to_disk('/tmp/checkpoint') """ - contexts = [pipe.use_params(params) for pipe + contexts = [pipe.use_params(params) for name, pipe in self.pipeline if hasattr(pipe, 'use_params')] # TODO: Having trouble with contextlib # Workaround: these aren't actually context managers atm. 
@@ -466,8 +499,7 @@ class Language(object): yield (doc, context) return docs = (self.make_doc(text) for text in texts) - for proc in self.pipeline: - name = getattr(proc, 'name', None) + for name, proc in self.pipeline: if name in disable: continue if hasattr(proc, 'pipe'): @@ -495,14 +527,14 @@ class Language(object): ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) )) - for proc in self.pipeline: + for name, proc in self.pipeline: if not hasattr(proc, 'name'): continue - if proc.name in disable: + if name in disable: continue if not hasattr(proc, 'to_disk'): continue - serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) + serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) serializers['vocab'] = lambda p: self.vocab.to_disk(p) util.to_disk(path, serializers, {p: False for p in disable}) @@ -526,14 +558,12 @@ class Language(object): ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) )) - for proc in self.pipeline: - if not hasattr(proc, 'name'): - continue - if proc.name in disable: + for name, proc in self.pipeline: + if name in disable: continue if not hasattr(proc, 'to_disk'): continue - deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) + deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) exclude = {p: False for p in disable} if not (path / 'vocab').exists(): exclude['vocab'] = True @@ -552,8 +582,8 @@ class Language(object): ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), ('meta', lambda: ujson.dumps(self.meta)) )) - for i, proc in enumerate(self.pipeline): - if getattr(proc, 'name', None) in disable: + for i, (name, proc) in enumerate(self.pipeline): + if name in disable: continue if not hasattr(proc, 'to_bytes'): continue @@ -572,8 +602,8 @@ class Language(object): ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), ('meta', lambda b: self.meta.update(ujson.loads(b))) )) - for i, proc in enumerate(self.pipeline): - if getattr(proc, 'name', None) in disable: + for i, (name, proc) in enumerate(self.pipeline): + if name in disable: continue if not hasattr(proc, 'from_bytes'): continue diff --git a/spacy/util.py b/spacy/util.py index e1a721a12..9e9c4fa42 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -135,7 +135,11 @@ def load_model_from_path(model_path, meta=False, **overrides): if not meta: meta = get_model_meta(model_path) cls = get_lang_class(meta['lang']) - nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides) + nlp = cls(meta=meta, **overrides) + for name in meta.get('pipeline', []): + config = meta.get('pipeline_args', {}).get(name, {}) + component = nlp.create_pipe(name, config=config) + nlp.add_pipe(component, name=name) return nlp.from_disk(model_path) From 2586b61b15fa04d91ec4a2919729ab70e9a6b26b Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 00:26:05 +0200 Subject: [PATCH 02/19] Fix formatting, tidy up and remove unused imports --- spacy/language.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 91644aec0..7a409133a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,12 +1,9 @@ # coding: utf8 from __future__ import absolute_import, unicode_literals from contextlib import contextmanager -import dill -import numpy from thinc.neural import Model -from thinc.neural.ops 
import NumpyOps, CupyOps -from thinc.neural.optimizers import Adam, SGD +from thinc.neural.optimizers import Adam import random import ujson from collections import OrderedDict @@ -17,24 +14,20 @@ from .vocab import Vocab from .tagger import Tagger from .lemmatizer import Lemmatizer from .syntax.parser import get_templates -from .syntax import nonproj -from .pipeline import NeuralDependencyParser, EntityRecognizer -from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer -from .pipeline import NeuralLabeller -from .pipeline import SimilarityHook -from .pipeline import TextCategorizer -from . import about +from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger +from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer from .compat import json_dumps, izip +from .scorer import Scorer +from ._ml import link_vectors_to_models from .attrs import IS_STOP from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .lang.lex_attrs import LEX_ATTRS from . import util -from .scorer import Scorer -from ._ml import link_vectors_to_models +from . import about class BaseDefaults(object): @@ -289,7 +282,7 @@ class Language(object): return self.pipeline.pop(self.pipe_names.index(name)) def __call__(self, text, disable=[]): - """'Apply the pipeline to some text. The text can span multiple sentences, + """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. @@ -387,7 +380,7 @@ class Language(object): get_gold_tuples (function): Function returning gold data **cfg: Config parameters. - returns: An optimizer + RETURNS: An optimizer """ # Populate vocab if get_gold_tuples is not None: From b39409173e4143b6053892475c1adf6010176060 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 00:29:08 +0200 Subject: [PATCH 03/19] Add disable option and True/False/None values for pipeline --- spacy/util.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 9e9c4fa42..50ebc036b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -136,10 +136,17 @@ def load_model_from_path(model_path, meta=False, **overrides): meta = get_model_meta(model_path) cls = get_lang_class(meta['lang']) nlp = cls(meta=meta, **overrides) - for name in meta.get('pipeline', []): - config = meta.get('pipeline_args', {}).get(name, {}) - component = nlp.create_pipe(name, config=config) - nlp.add_pipe(component, name=name) + pipeline = meta.get('pipeline', []) + disable = overrides.get('disable', []) + if pipeline is True: + pipeline = nlp.Defaults.pipe_names + elif pipeline in (False, None): + pipeline = [] + for name in pipeline: + if name not in disable: + config = meta.get('pipeline_args', {}).get(name, {}) + component = nlp.create_pipe(name, config=config) + nlp.add_pipe(component, name=name) return nlp.from_disk(model_path) From 61a503a61195c465328fcf0f283ce64f923b5c55 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 00:38:51 +0200 Subject: [PATCH 04/19] Fix parser test --- spacy/tests/conftest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b33a7c008..28b5f4ab9 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -58,8 +58,9 @@ def en_vocab(): @pytest.fixture -def en_parser(): - return 
util.get_lang_class('en').Defaults.create_parser() +def en_parser(en_vocab): + nlp = util.get_lang_class('en')(en_vocab) + return nlp.create_pipe('parser') @pytest.fixture From e43530269c77a39d7b9460d5730db5707c439285 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 01:04:50 +0200 Subject: [PATCH 05/19] Update docstrings --- spacy/language.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 7a409133a..a3152aea3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -195,6 +195,7 @@ class Language(object): """Create a pipeline component from a factory. name (unicode): Factory name to look up in `Language.factories`. + config (dict): Configuration parameters to initialise component. RETURNS (callable): Pipeline component. """ if name not in self.factories: @@ -274,7 +275,7 @@ class Language(object): """Remove a component from the pipeline. name (unicode): Name of the component to remove. - RETURNS (tuple): A (name, component) tuple of the removed component. + RETURNS (tuple): A `(name, component)` tuple of the removed component. """ if name not in self.pipe_names: msg = "Can't find '{}' in pipeline. Available names: {}" From 3a65a0c970ec235d7e5b306924a90e8552c6568c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 7 Oct 2017 01:48:23 +0200 Subject: [PATCH 06/19] Start adding tests for new pipeline management --- spacy/tests/pipeline/__init__.py | 0 spacy/tests/pipeline/test_add_pipe.py | 43 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 spacy/tests/pipeline/__init__.py create mode 100644 spacy/tests/pipeline/test_add_pipe.py diff --git a/spacy/tests/pipeline/__init__.py b/spacy/tests/pipeline/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/pipeline/test_add_pipe.py b/spacy/tests/pipeline/test_add_pipe.py new file mode 100644 index 000000000..13fb4acaf --- /dev/null +++ b/spacy/tests/pipeline/test_add_pipe.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals +import pytest + +from ... 
import language +from ...language import Language + +@pytest.fixture +def nlp(): + return Language() + +@pytest.fixture +def name(): + return 'parser' + +def new_pipe(doc): + return doc + + +def test_add_pipe_no_name(nlp): + nlp.add_pipe(new_pipe) + assert 'new_pipe' in nlp.pipe_names + +def test_add_pipe_duplicate_name(nlp): + nlp.add_pipe(new_pipe, name='duplicate_name') + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, name='duplicate_name') + + +def test_add_pipe_first(nlp, name): + nlp.add_pipe(new_pipe, name=name, first=True) + assert nlp.pipeline[0][0] == name + + +def test_add_pipe_last(nlp, name): + nlp.add_pipe(lambda doc: doc, name='lambda_pipe') + nlp.add_pipe(new_pipe, name=name, last=True) + assert nlp.pipeline[0][0] != name + assert nlp.pipeline[-1][0] == name + + +def test_cant_add_pipe_first_and_last(nlp): + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, first=True, last=True) From 0384f0821817014972b5bf8f062d94cd6ea22c2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 7 Oct 2017 02:00:47 +0200 Subject: [PATCH 07/19] Trigger nonproj.deprojectivize as a postprocess --- spacy/language.py | 2 +- spacy/pipeline.pyx | 14 ++++++++++++++ spacy/syntax/nn_parser.pyx | 8 ++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index a3152aea3..d40aee3ca 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -97,7 +97,7 @@ class Language(object): 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), 'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), 'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), - 'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), # nonproj.deprojectivize, + 'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), 'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 8d935335c..4d9adc609 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity from .tokens.doc cimport Doc from .syntax.parser cimport Parser as LinearParser from .syntax.nn_parser cimport Parser as NeuralParser +from .syntax import nonproj from .syntax.parser import get_templates as get_feature_templates from .syntax.beam_parser cimport BeamParser from .syntax.ner cimport BiluoPushDown @@ -773,11 +774,19 @@ cdef class DependencyParser(LinearParser): if isinstance(label, basestring): label = self.vocab.strings[label] + @property + def postprocesses(self): + return [nonproj.deprojectivize] + cdef class NeuralDependencyParser(NeuralParser): name = 'parser' TransitionSystem = ArcEager + @property + def postprocesses(self): + return [nonproj.deprojectivize] + def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): for target in []: labeller = NeuralLabeller(self.vocab, target=target) @@ -818,6 +827,11 @@ cdef class BeamDependencyParser(BeamParser): if isinstance(label, basestring): label = self.vocab.strings[label] + @property + def postprocesses(self): + return [nonproj.deprojectivize] + + __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', 'BeamEntityRecognizer', 'TokenVectorEnoder'] diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 459c94463..f2c72a639 100644 --- a/spacy/syntax/nn_parser.pyx 
+++ b/spacy/syntax/nn_parser.pyx @@ -739,6 +739,14 @@ cdef class Parser: for i in range(doc.length): doc.c[i] = state.c._sent[i] self.moves.finalize_doc(doc) + for hook in self.postprocesses: + for doc in docs: + hook(doc) + + @property + def postprocesses(self): + # Available for subclasses, e.g. to deprojectivize + return [] def add_label(self, label): for action in self.moves.action_types: From b38a8f4a943306a4a978e9b40fea9f5f2d7193e7 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 02:06:21 +0200 Subject: [PATCH 08/19] Fix and update pipe methods tests --- spacy/tests/pipeline/test_add_pipe.py | 43 ------------ spacy/tests/pipeline/test_pipe_methods.py | 84 +++++++++++++++++++++++ 2 files changed, 84 insertions(+), 43 deletions(-) delete mode 100644 spacy/tests/pipeline/test_add_pipe.py create mode 100644 spacy/tests/pipeline/test_pipe_methods.py diff --git a/spacy/tests/pipeline/test_add_pipe.py b/spacy/tests/pipeline/test_add_pipe.py deleted file mode 100644 index 13fb4acaf..000000000 --- a/spacy/tests/pipeline/test_add_pipe.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals -import pytest - -from ... import language -from ...language import Language - -@pytest.fixture -def nlp(): - return Language() - -@pytest.fixture -def name(): - return 'parser' - -def new_pipe(doc): - return doc - - -def test_add_pipe_no_name(nlp): - nlp.add_pipe(new_pipe) - assert 'new_pipe' in nlp.pipe_names - -def test_add_pipe_duplicate_name(nlp): - nlp.add_pipe(new_pipe, name='duplicate_name') - with pytest.raises(ValueError): - nlp.add_pipe(new_pipe, name='duplicate_name') - - -def test_add_pipe_first(nlp, name): - nlp.add_pipe(new_pipe, name=name, first=True) - assert nlp.pipeline[0][0] == name - - -def test_add_pipe_last(nlp, name): - nlp.add_pipe(lambda doc: doc, name='lambda_pipe') - nlp.add_pipe(new_pipe, name=name, last=True) - assert nlp.pipeline[0][0] != name - assert nlp.pipeline[-1][0] == name - - -def test_cant_add_pipe_first_and_last(nlp): - with pytest.raises(ValueError): - nlp.add_pipe(new_pipe, first=True, last=True) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py new file mode 100644 index 000000000..5ec78aefb --- /dev/null +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -0,0 +1,84 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from ...language import Language + + +@pytest.fixture +def nlp(): + return Language() + + +def new_pipe(doc): + return doc + + +def test_add_pipe_no_name(nlp): + nlp.add_pipe(new_pipe) + assert 'new_pipe' in nlp.pipe_names + + +def test_add_pipe_duplicate_name(nlp): + nlp.add_pipe(new_pipe, name='duplicate_name') + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, name='duplicate_name') + + +@pytest.mark.parametrize('name', ['parser']) +def test_add_pipe_first(nlp, name): + nlp.add_pipe(new_pipe, name=name, first=True) + assert nlp.pipeline[0][0] == name + + +@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')]) +def test_add_pipe_last(nlp, name1, name2): + nlp.add_pipe(lambda doc: doc, name=name2) + nlp.add_pipe(new_pipe, name=name1, last=True) + assert nlp.pipeline[0][0] != name1 + assert nlp.pipeline[-1][0] == name1 + + +def test_cant_add_pipe_first_and_last(nlp): + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, first=True, last=True) + + +@pytest.mark.parametrize('name', ['my_component']) +def test_get_pipe(nlp, name): + with pytest.raises(KeyError): + nlp.get_pipe(name) + nlp.add_pipe(new_pipe, name=name) + 
assert nlp.get_pipe(name) == new_pipe + + +@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)]) +def test_replace_pipe(nlp, name, replacement): + with pytest.raises(ValueError): + nlp.replace_pipe(name, new_pipe) + nlp.add_pipe(new_pipe, name=name) + nlp.replace_pipe(name, replacement) + assert nlp.get_pipe(name) != new_pipe + assert nlp.get_pipe(name) == replacement + + +@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')]) +def test_rename_pipe(nlp, old_name, new_name): + with pytest.raises(ValueError): + nlp.rename_pipe(old_name, new_name) + nlp.add_pipe(new_pipe, name=old_name) + nlp.rename_pipe(old_name, new_name) + assert nlp.pipeline[0][0] == new_name + + +@pytest.mark.parametrize('name', ['my_component']) +def test_remove_pipe(nlp, name): + with pytest.raises(ValueError): + nlp.remove_pipe(name) + nlp.add_pipe(new_pipe, name=name) + assert len(nlp.pipeline) == 1 + removed_name, removed_component = nlp.remove_pipe(name) + assert not len(nlp.pipeline) + assert removed_name == name + assert removed_component == new_pipe From 0adadcb3f04e2ecb98b5ca5de1afba2ba7208d23 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 02:15:15 +0200 Subject: [PATCH 09/19] Fix beam parse model test --- spacy/tests/parser/test_beam_parse.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/tests/parser/test_beam_parse.py b/spacy/tests/parser/test_beam_parse.py index da5f43d5e..dd77c6805 100644 --- a/spacy/tests/parser/test_beam_parse.py +++ b/spacy/tests/parser/test_beam_parse.py @@ -1,10 +1,11 @@ -import spacy +# coding: utf8 +from __future__ import unicode_literals + import pytest -@pytest.mark.models -def test_beam_parse(): - nlp = spacy.load('en_core_web_sm') - doc = nlp(u'Australia is a country', disable=['ner']) - ents = nlp.entity(doc, beam_width=2) - print(ents) +@pytest.mark.models('en') +def test_beam_parse(EN): + doc = EN(u'Australia is a country', disable=['ner']) + ents = EN.entity(doc, beam_width=2) + print(ents) From e370332fb1fe8cb179f0fbbbfd79b7251df8781c Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 03:00:20 +0200 Subject: [PATCH 10/19] Update Language API docs --- website/api/language.jade | 229 +++++++++++++++++++++++++++++++++++--- 1 file changed, 216 insertions(+), 13 deletions(-) diff --git a/website/api/language.jade b/website/api/language.jade index 617c81599..89807fabe 100644 --- a/website/api/language.jade +++ b/website/api/language.jade @@ -4,7 +4,14 @@ include ../_includes/_mixins p | Usually you'll load this once per process as #[code nlp] and pass the - | instance around your application. + | instance around your application. The #[code Language] class is created + | when you call #[+api("spacy#load") #[code spacy.load()]] and contains + | the shared vocabulary and #[+a("/usage/adding-languages") language data], + | optional model data loaded from a #[+a("/models") model package] or + | a path, and a #[+a("/usage/processing-pipelines") processing pipeline] + | containing components like the tagger or parser that are called on a + | document in order. You can also add your own processing pipeline + | components that take a #[code Doc] object, modify it and return it. +h(2, "init") Language.__init__ +tag method @@ -12,9 +19,9 @@ p p Initialise a #[code Language] object. +aside-code("Example"). 
+ from spacy.vocab import Vocab from spacy.language import Language - nlp = Language(pipeline=['token_vectors', 'tags', - 'dependencies']) + nlp = Language(Vocab()) from spacy.lang.en import English nlp = English() @@ -34,14 +41,6 @@ p Initialise a #[code Language] object. | A function that takes text and returns a #[code Doc] object. | Usually a #[code Tokenizer]. - +row - +cell #[code pipeline] - +cell list - +cell - | A list of annotation processes or IDs of annotation, processes, - | e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked - | up in #[code Language.Defaults.factories]. - +row +cell #[code meta] +cell dict @@ -54,6 +53,23 @@ p Initialise a #[code Language] object. +cell #[code Language] +cell The newly constructed object. ++infobox("Deprecation note", "⚠️") + .o-block + | To make the processing pipelines and their components more + | transparent, the #[code pipeline] and #[code disable] arguments on + | initialisation are now deprecated. Instead, pipeline components can + | now be added, removed and rearranged using the new #[code Language] + | methods, for example #[+api("language#add_pipe") #[code add_pipe]] or + | #[+api("language#create_pipe") #[code create_pipe]]. This is also how + | #[+api("spacy#load") #[code spacy.load()]] creates the + | #[code Language] instance it returns. + + +code-new. + nlp = English() + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser) + +code-old nlp = English(pipeline=['parser']) + +h(2, "call") Language.__call__ +tag method @@ -235,7 +251,6 @@ p | Can be called before training to pre-process gold data. By default, it | handles nonprojectivity and adds missing tags to the tag map. - +table(["Name", "Type", "Description"]) +row +cell #[code docs_golds] @@ -247,6 +262,177 @@ p +cell tuple +cell Tuples of #[code Doc] and #[code GoldParse] objects. ++h(2, "create_pipe") Language.create_pipe + +tag method + +tag-new(2) + +p Create a pipeline component from a factory. + ++aside-code("Example"). + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Factory name to look up in + | #[+api("language#class-attributes") #[code Language.factories]]. + + +row + +cell #[code config] + +cell dict + +cell Configuration parameters to initialise component. + + +row("foot") + +cell returns + +cell callable + +cell The pipeline component. + ++h(2, "add_pipe") Language.add_pipe + +tag method + +tag-new(2) + +p + | Add a component to the processing pipeline. Valid components are + | callables that take a #[code Doc] object, modify it and return it. Only + | one of #[code before], #[code after], #[code first] or #[code last] can + | be set. Default behaviour is #[code last=True]. + ++aside-code("Example"). + def component(doc): + # modify Doc and return it + return doc + + nlp.add_pipe(component, before='ner') + nlp.add_pipe(component, name='custom_name', last=True) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code component] + +cell callable + +cell The pipeline component. + + +row + +cell #[code name] + +cell unicode + +cell + | Name of pipeline component. Overwrites existing + | #[code component.name] attribute if available. If no #[code name] + | is set and the component exposes no name attribute, + | #[code component.__name__] is used. An error is raised if the + | name already exists in the pipeline. + + +row + +cell #[code before] + +cell unicode + +cell Component name to insert component directly before. 
+ + +row + +cell #[code after] + +cell unicode + +cell Component name to insert component directly after: + + +row + +cell #[code first] + +cell bool + +cell Insert component first / not first in the pipeline. + + +row + +cell #[code last] + +cell bool + +cell Insert component last / not last in the pipeline. + ++h(2, "get_pipe") Language.get_pipe + +tag method + +tag-new(2) + +p Get a pipeline component for a given component name. + ++aside-code("Example"). + parser = nlp.get_pipe('parser') + custom_component = nlp.get_pipe('custom_component') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the pipeline component to get. + + +row("foot") + +cell returns + +cell callable + +cell The pipeline component. + ++h(2, "replace_pipe") Language.replace_pipe + +tag method + +tag-new(2) + +p Replace a component in the pipeline. + ++aside-code("Example"). + nlp.replace_pipe('parser', my_custom_parser) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the component to replace. + + +row + +cell #[code component] + +cell callable + +cell The pipeline component to inser. + + ++h(2, "rename_pipe") Language.rename_pipe + +tag method + +tag-new(2) + +p + | Rename a component in the pipeline. Useful to create custom names for + | pre-defined and pre-loaded components. To change the default name of + | a component added to the pipeline, you can also use the #[code name] + | argument on #[+api("language#add_pipe") #[code add_pipe]]. + ++aside-code("Example"). + nlp.rename_pipe('parser', 'spacy_parser') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code old_name] + +cell unicode + +cell Name of the component to rename. + + +row + +cell #[code new_name] + +cell unicode + +cell New name of the component. + ++h(2, "remove_pipe") Language.remove_pipe + +tag method + +tag-new(2) + +p + | Remove a component from the pipeline. Returns the removed component name + | and component function. + ++aside-code("Example"). + name, component = nlp.remove_pipe('parser') + assert name == 'parser' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the component to remove. + + +row("foot") + +cell returns + +cell tuple + +cell A #[code (name, component)] tuple of the removed component. + +h(2, "to_disk") Language.to_disk +tag method +tag-new(2) @@ -399,7 +585,15 @@ p Load state from a binary string. +row +cell #[code pipeline] +cell list - +cell Sequence of annotation functions. + +cell + | List of #[code (name, component)] tuples describing the current + | processing pipeline, in order. + + +row + +cell #[code pipe_names] + +tag-new(2) + +cell list + +cell List of pipeline component names, in order. +row +cell #[code meta] @@ -424,3 +618,12 @@ p Load state from a binary string. +cell | Two-letter language ID, i.e. | #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. + + +row + +cell #[code factories] + +tag-new(2) + +cell dict + +cell + | Factories that create pre-defined pipeline components, e.g. the + | tagger, parser or entity recognizer, keyed by their component + | name. 
From ed8e0085b0b6aae9501bb87be365366b88816be4 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 03:06:55 +0200 Subject: [PATCH 11/19] Update docs for spacy.load() --- website/api/_top-level/_spacy.jade | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/website/api/_top-level/_spacy.jade b/website/api/_top-level/_spacy.jade index c14f62f7e..2b523f846 100644 --- a/website/api/_top-level/_spacy.jade +++ b/website/api/_top-level/_spacy.jade @@ -43,6 +43,20 @@ p +cell #[code Language] +cell A #[code Language] object with the loaded model. +p + | Essentially, #[code spacy.load()] is a convenience wrapper that reads + | the language ID and pipeline components from a model's #[code meta.json], + | initialises the #[code Language] class, loads in the model data and + | returns it. + ++code("Abstract example"). + cls = util.get_lang_class(lang) # get Language class for ID, e.g. 'en' + nlp = cls() # initialise the Language class + for name in pipeline: + component = nlp.create_pipe(name) # create each pipeline component + nlp.add_pipe(component) # add component to pipeline + nlp.from_disk(model_data_path) # load in model data + +infobox("Deprecation note", "⚠️") .o-block | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy From 58dfde7c0227958972dc37a71d878d605b87ffa1 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 04:54:57 +0200 Subject: [PATCH 12/19] Remove redundante deprecation note --- website/api/language.jade | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/website/api/language.jade b/website/api/language.jade index 89807fabe..500d6c411 100644 --- a/website/api/language.jade +++ b/website/api/language.jade @@ -53,23 +53,6 @@ p Initialise a #[code Language] object. +cell #[code Language] +cell The newly constructed object. -+infobox("Deprecation note", "⚠️") - .o-block - | To make the processing pipelines and their components more - | transparent, the #[code pipeline] and #[code disable] arguments on - | initialisation are now deprecated. Instead, pipeline components can - | now be added, removed and rearranged using the new #[code Language] - | methods, for example #[+api("language#add_pipe") #[code add_pipe]] or - | #[+api("language#create_pipe") #[code create_pipe]]. This is also how - | #[+api("spacy#load") #[code spacy.load()]] creates the - | #[code Language] instance it returns. - - +code-new. - nlp = English() - parser = nlp.create_pipe('parser') - nlp.add_pipe(parser) - +code-old nlp = English(pipeline=['parser']) - +h(2, "call") Language.__call__ +tag method From feaf353051f1163454a05c78b074c0a37b1329af Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 14:05:59 +0200 Subject: [PATCH 13/19] Update processing pipelines usage docs --- .../_processing-pipelines/_pipelines.jade | 195 +++++------------- 1 file changed, 52 insertions(+), 143 deletions(-) diff --git a/website/usage/_processing-pipelines/_pipelines.jade b/website/usage/_processing-pipelines/_pipelines.jade index d09ed4ead..3c1c28af1 100644 --- a/website/usage/_processing-pipelines/_pipelines.jade +++ b/website/usage/_processing-pipelines/_pipelines.jade @@ -11,7 +11,7 @@ p p | When you load a model, spaCy first consults the model's - | #[+a("/usage/saving-loading#models-generating") meta.json]. The + | #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The | meta typically includes the model details, the ID of a language class, | and an optional list of pipeline components. 
spaCy then does the | following: @@ -21,24 +21,26 @@ p "name": "example_model", "lang": "en" "description": "Example model for spaCy", - "pipeline": ["tensorizer", "tagger"] + "pipeline": ["tagger", "parser"] } +list("numbers") - +item - | Look up #[strong pipeline IDs] in the available - | #[strong pipeline factories]. - +item - | Initialise the #[strong pipeline components] by calling their - | factories with the #[code Vocab] as an argument. This gives each - | factory and component access to the pipeline's shared data, like - | strings, morphology and annotation scheme. +item | Load the #[strong language class and data] for the given ID via - | #[+api("util.get_lang_class") #[code get_lang_class]]. + | #[+api("util.get_lang_class") #[code get_lang_class]] and initialise + | it. The #[code Language] class contains the shared vocabulary, + | tokenization rules and the language-specific annotation scheme. +item - | Pass the path to the #[strong model data] to the #[code Language] - | class and return it. + | Iterate over the #[strong pipeline names] and create each component + | using #[+api("language#create_pipe") #[code create_pipe]], which + | looks them up in #[code Language.factories]. + +item + | Add each pipeline component to the pipeline in order, using + | #[+api("language#add_pipe") #[code add_pipe]]. + +item + | Make the #[strong model data] available to the #[code Language] class + | by calling #[+api("language#from_disk") #[code from_disk]] with the + | path to the model data ditectory. p | So when you call this... @@ -47,12 +49,12 @@ p nlp = spacy.load('en') p - | ... the model tells spaCy to use the pipeline + | ... the model tells spaCy to use the language #[code "en"] and the pipeline | #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will - | then look up each string in its internal factories registry and - | initialise the individual components. It'll then load - | #[code spacy.lang.en.English], pass it the path to the model's data - | directory, and return it for you to use as the #[code nlp] object. + | then initialise #[code spacy.lang.en.English], and create each pipeline + | component and add it to the processing pipeline. It'll then load in the + | model's data from its data ditectory and return the modified + | #[code Language] class for you to use as the #[code nlp] object. p | Fundamentally, a #[+a("/models") spaCy model] consists of three @@ -73,9 +75,12 @@ p pipeline = ['tensorizer', 'tagger', 'parser', 'ner'] data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' - cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English() - nlp = cls(pipeline=pipeline) # 2. initialise it with the pipeline - nlp.from_disk(model_data_path) # 3. load in the binary data + cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English() + nlp = cls() # 2. initialise it + for name in pipeline: + component = nlp.create_pipe(name) # 3. create the pipeline components + nlp.add_pipe(component) # 4. add the component to the pipeline + nlp.from_disk(model_data_path) # 5. load in the binary data p | When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and @@ -87,124 +92,23 @@ p | document, which is then processed by the component next in the pipeline. +code("The pipeline under the hood"). 
- doc = nlp.make_doc(u'This is a sentence') - for proc in nlp.pipeline: - doc = proc(doc) - -+h(3, "creating") Creating pipeline components and factories + doc = nlp.make_doc(u'This is a sentence') # create a Doc from raw text + for name, proc in nlp.pipeline: # iterate over components in order + doc = proc(doc) # apply each component p - | spaCy lets you customise the pipeline with your own components. Components - | are functions that receive a #[code Doc] object, modify and return it. - | If your component is stateful, you'll want to create a new one for each - | pipeline. You can do that by defining and registering a factory which - | receives the shared #[code Vocab] object and returns a component. - -+h(4, "creating-component") Creating a component - -p - | A component receives a #[code Doc] object and - | #[strong performs the actual processing] – for example, using the current - | weights to make a prediction and set some annotation on the document. By - | adding a component to the pipeline, you'll get access to the #[code Doc] - | at any point #[strong during] processing – instead of only being able to - | modify it afterwards. - -+aside-code("Example"). - def my_component(doc): - # do something to the doc here - return doc - -+table(["Argument", "Type", "Description"]) - +row - +cell #[code doc] - +cell #[code Doc] - +cell The #[code Doc] object processed by the previous component. - - +row("foot") - +cell returns - +cell #[code Doc] - +cell The #[code Doc] object processed by this pipeline component. - -p - | When creating a new #[code Language] class, you can pass it a list of - | pipeline component functions to execute in that order. You can also - | add it to an existing pipeline by modifying #[code nlp.pipeline] – just - | be careful not to overwrite a pipeline or its components by accident! + | The current processing pipeline is available as #[code nlp.pipeline], + | which returns a list of #[code (name, component)] tuples, or + | #[code nlp.pipe_names], which only returns a list of human-readable + | component names. +code. - # Create a new Language object with a pipeline - from spacy.language import Language - nlp = Language(pipeline=[my_component]) + nlp.pipeline + # [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)] + nlp.pipe_names + # ['tagger', 'parser', 'ner'] - # Modify an existing pipeline - nlp = spacy.load('en') - nlp.pipeline.append(my_component) - -+h(4, "creating-factory") Creating a factory - -p - | A factory is a #[strong function that returns a pipeline component]. - | It's called with the #[code Vocab] object, to give it access to the - | shared data between components – for example, the strings, morphology, - | vectors or annotation scheme. Factories are useful for creating - | #[strong stateful components], especially ones which - | #[strong depend on shared data]. - -+aside-code("Example"). - def my_factory(vocab): - # load some state - def my_component(doc): - # process the doc - return doc - return my_component - -+table(["Argument", "Type", "Description"]) - +row - +cell #[code vocab] - +cell #[code Vocab] - +cell - | Shared data between components, including strings, morphology, - | vectors etc. - - +row("foot") - +cell returns - +cell callable - +cell The pipeline component. - -p - | By creating a factory, you're essentially telling spaCy how to get the - | pipeline component #[strong once the vocab is available]. 
Factories need to - | be registered via #[+api("spacy#set_factory") #[code set_factory()]] and - | by assigning them a unique ID. This ID can be added to the pipeline as a - | string. When creating a pipeline, you're free to mix strings and - | callable components: - -+code. - spacy.set_factory('my_factory', my_factory) - nlp = Language(pipeline=['my_factory', my_other_component]) - -p - | If spaCy comes across a string in the pipeline, it will try to resolve it - | by looking it up in the available factories. The factory will then be - | initialised with the #[code Vocab]. Providing factory names instead of - | callables also makes it easy to specify them in the model's - | #[+a("/usage/saving-loading#models-generating") meta.json]. If you're - | training your own model and want to use one of spaCy's default components, - | you won't have to worry about finding and implementing it either – to use - | the default tagger, simply add #[code "tagger"] to the pipeline, and - | #[strong spaCy will know what to do]. - -+infobox("Important note") - | Because factories are #[strong resolved on initialisation] of the - | #[code Language] class, it's #[strong not possible] to add them to the - | pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only - | works with individual component functions. To use factories, you need to - | create a new #[code Language] object, or generate a - | #[+a("/usage/training#models-generating") model package] with - | a custom pipeline. - -+h(3, "disabling") Disabling pipeline components ++h(3, "disabling") Disabling and modifying pipeline components p | If you don't need a particular component of the pipeline – for @@ -217,16 +121,19 @@ p +code. nlp = spacy.load('en', disable['parser', 'tagger']) nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) - doc = nlp(u"I don't want parsed", disable=['parser']) p - | Note that you can't write directly to #[code nlp.pipeline], as this list - | holds the #[em actual components], not the IDs. However, if you know the - | order of the components, you can still slice the list: + | You can also use the #[+api("language#remove_pipe") #[code remove_pipe]] + | method to remove pipeline components from an existing pipeline, the + | #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them, + | or the #[+api("language#replace_pipe") #[code replace_pipe]] method + | to replace them with a custom component entirely (more details on this + | in the section on #[+a("#custom-components") custom components]. +code. - nlp = spacy.load('en') - nlp.pipeline = nlp.pipeline[:2] # only use the first two components + nlp.remove_pipe('parser') + nlp.rename_pipe('ner', 'entityrecognizer') + nlp.replace_pipe('tagger', my_custom_tagger) +infobox("Important note: disabling pipeline components") .o-block @@ -234,12 +141,14 @@ p | processing pipeline components, the #[code parser], #[code tagger] | and #[code entity] keyword arguments have been replaced with | #[code disable], which takes a list of pipeline component names. - | This lets you disable both default and custom components when loading + | This lets you disable pre-defined components when loading | a model, or initialising a Language class via | #[+api("language-from_disk") #[code from_disk]]. + +code-new. - nlp = spacy.load('en', disable=['tagger', 'ner']) - doc = nlp(u"I don't want parsed", disable=['parser']) + nlp = spacy.load('en', disable=['ner']) + nlp.remove_pipe('parser') + doc = nlp(u"I don't want parsed") +code-old. 
nlp = spacy.load('en', tagger=False, entity=False) doc = nlp(u"I don't want parsed", parse=False) From 743d1df1fe6474b6342ff7cfe73a988d89e679c6 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 15:27:28 +0200 Subject: [PATCH 14/19] Update pipelines docs and add user hooks to custom components --- website/usage/_data.json | 7 +- .../_custom-components.jade | 151 ++++++++++++++++++ .../_processing-pipelines/_user-hooks.jade | 61 ------- website/usage/processing-pipelines.jade | 10 +- 4 files changed, 157 insertions(+), 72 deletions(-) create mode 100644 website/usage/_processing-pipelines/_custom-components.jade delete mode 100644 website/usage/_processing-pipelines/_user-hooks.jade diff --git a/website/usage/_data.json b/website/usage/_data.json index b34304ed6..f77f7929c 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -103,11 +103,10 @@ "title": "Language Processing Pipelines", "next": "vectors-similarity", "menu": { - "How pipelines work": "pipelines", - "Examples": "examples", + "How Pipelines Work": "pipelines", + "Custom Components": "custom-components", "Multi-threading": "multithreading", - "User Hooks": "user-hooks", - "Serialization": "serialization" + "Serialization": "serialization", } }, diff --git a/website/usage/_processing-pipelines/_custom-components.jade b/website/usage/_processing-pipelines/_custom-components.jade new file mode 100644 index 000000000..13f0cb85c --- /dev/null +++ b/website/usage/_processing-pipelines/_custom-components.jade @@ -0,0 +1,151 @@ +//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS + +p + | A component receives a #[code Doc] object and + | #[strong performs the actual processing] – for example, using the current + | weights to make a prediction and set some annotation on the document. By + | adding a component to the pipeline, you'll get access to the #[code Doc] + | at any point #[strong during] processing – instead of only being able to + | modify it afterwards. + ++aside-code("Example"). + def my_component(doc): + # do something to the doc here + return doc + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell The #[code Doc] object processed by the previous component. + + +row("foot") + +cell returns + +cell #[code Doc] + +cell The #[code Doc] object processed by this pipeline component. + +p + | Custom components can be added to the pipeline using the + | #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you + | can either specify a component to add it before or after, tell spaCy + | to add it first or last in the pipeline, or define a custom name. + | If no name is set and no #[code name] attribute is present on your + | component, the function name, e.g. #[code component.__name__] is used. + ++code("Adding pipeline components"). + def my_component(doc): + print("After tokenization, this doc has %s tokens." % len(doc)) + if len(doc) < 10: + print("This is a pretty short document.") + return doc + + nlp = spacy.load('en') + nlp.pipeline.add_pipe(my_component, name='print_info', first=True) + print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner'] + doc = nlp(u"This is a sentence.") + +p + | Of course, you can also wrap your component as a class to allow + | initialising it with custom settings and hold state within the component. + | This is useful for #[strong stateful components], especially ones which + | #[strong depend on shared data]. + ++code. 
+ class MyComponent(object): + name = 'print_info' + + def __init__(vocab, short_limit=10): + self.vocab = nlp.vocab + self.short_limit = short_limit + + def __call__(doc): + if len(doc) < self.short_limit: + print("This is a pretty short document.") + return doc + + my_component = MyComponent(nlp.vocab, short_limit=25) + nlp.add_pipe(my_component, first=True) + ++h(3, "custom-components-attributes") + | Setting attributes on the #[code Doc], #[code Span] and #[code Token] + ++aside("Why ._?") + | Writing to a #[code ._] attribute instead of to the #[code Doc] directly + | keeps a clearer separation and makes it easier to ensure backwards + | compatibility. For example, if you've implemented your own #[code .coref] + | property and spaCy claims it one day, it'll break your code. Similarly, + | just by looking at the code, you'll immediately know what's built-in and + | what's custom – for example, #[code doc.sentiment] is spaCy, while + | #[code doc._.sent_score] isn't. + ++under-construction + ++h(3, "custom-components-user-hooks") Other user hooks + +p + | While it's generally recommended to use the #[code Doc._], #[code Span._] + | and #[code Token._] proxies to add your own custom attributes, spaCy + | offers a few exceptions to allow #[strong customising the built-in methods] + | like #[+api("doc#similarity") #[code Doc.similarity]] or + | #[+api("doc#vector") #[code Doc.vector]]. with your own hooks, which can + | rely on statistical models you train yourself. For instance, you can + | provide your own on-the-fly sentence segmentation algorithm or document + | similarity method. + +p + | Hooks let you customize some of the behaviours of the #[code Doc], + | #[code Span] or #[code Token] objects by adding a component to the + | pipeline. For instance, to customize the + | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a + | component that sets a custom function to + | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] + | method will check the #[code user_hooks] dict, and delegate to your + | function if you've set one. Similar results can be achieved by setting + | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. + ++aside("Implementation note") + | The hooks live on the #[code Doc] object because the #[code Span] and + | #[code Token] objects are created lazily, and don't own any data. They + | just proxy to their parent #[code Doc]. This turns out to be convenient + | here — we only have to worry about installing hooks in one place. + ++table(["Name", "Customises"]) + +row + +cell #[code user_hooks] + +cell + +api("doc#vector") #[code Doc.vector] + +api("doc#has_vector") #[code Doc.has_vector] + +api("doc#vector_norm") #[code Doc.vector_norm] + +api("doc#sents") #[code Doc.sents] + + +row + +cell #[code user_token_hooks] + +cell + +api("token#similarity") #[code Token.similarity] + +api("token#vector") #[code Token.vector] + +api("token#has_vector") #[code Token.has_vector] + +api("token#vector_norm") #[code Token.vector_norm] + +api("token#conjuncts") #[code Token.conjuncts] + + +row + +cell #[code user_span_hooks] + +cell + +api("span#similarity") #[code Span.similarity] + +api("span#vector") #[code Span.vector] + +api("span#has_vector") #[code Span.has_vector] + +api("span#vector_norm") #[code Span.vector_norm] + +api("span#root") #[code Span.root] + ++code("Add custom similarity hooks"). 
+ class SimilarityModel(object): + def __init__(self, model): + self._model = model + + def __call__(self, doc): + doc.user_hooks['similarity'] = self.similarity + doc.user_span_hooks['similarity'] = self.similarity + doc.user_token_hooks['similarity'] = self.similarity + + def similarity(self, obj1, obj2): + y = self._model([obj1.vector, obj2.vector]) + return float(y[0]) diff --git a/website/usage/_processing-pipelines/_user-hooks.jade b/website/usage/_processing-pipelines/_user-hooks.jade deleted file mode 100644 index e7dce53fe..000000000 --- a/website/usage/_processing-pipelines/_user-hooks.jade +++ /dev/null @@ -1,61 +0,0 @@ -//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS - -p - | Hooks let you customize some of the behaviours of the #[code Doc], - | #[code Span] or #[code Token] objects by adding a component to the - | pipeline. For instance, to customize the - | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a - | component that sets a custom function to - | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] - | method will check the #[code user_hooks] dict, and delegate to your - | function if you've set one. Similar results can be achieved by setting - | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. - -+code("Polymorphic similarity example"). - span.similarity(doc) - token.similarity(span) - doc1.similarity(doc2) - -p - | By default, this just averages the vectors for each document, and - | computes their cosine. Obviously, spaCy should make it easy for you to - | install your own similarity model. This introduces a tricky design - | challenge. The current solution is to add three more dicts to the - | #[code Doc] object: - -+aside("Implementation note") - | The hooks live on the #[code Doc] object because the #[code Span] and - | #[code Token] objects are created lazily, and don't own any data. They - | just proxy to their parent #[code Doc]. This turns out to be convenient - | here — we only have to worry about installing hooks in one place. - -+table(["Name", "Description"]) - +row - +cell #[code user_hooks] - +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents] - - +row - +cell #[code user_token_hooks] - +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts] - - +row - +cell #[code user_span_hooks] - +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root] - -p - | To sum up, here's an example of hooking in custom #[code .similarity()] - | methods: - -+code("Add custom similarity hooks"). 
- class SimilarityModel(object): - def __init__(self, model): - self._model = model - - def __call__(self, doc): - doc.user_hooks['similarity'] = self.similarity - doc.user_span_hooks['similarity'] = self.similarity - doc.user_token_hooks['similarity'] = self.similarity - - def similarity(self, obj1, obj2): - y = self._model([obj1.vector, obj2.vector]) - return float(y[0]) diff --git a/website/usage/processing-pipelines.jade b/website/usage/processing-pipelines.jade index 0bb96780e..0d0579883 100644 --- a/website/usage/processing-pipelines.jade +++ b/website/usage/processing-pipelines.jade @@ -8,18 +8,14 @@ include _spacy-101/_pipelines +h(2, "pipelines") How pipelines work include _processing-pipelines/_pipelines -+section("examples") - +h(2, "examples") Examples - include _processing-pipelines/_examples ++section("custom-components") + +h(2, "custom-components") Creating custom pipeline components + include _processing-pipelines/_custom-components +section("multithreading") +h(2, "multithreading") Multi-threading include _processing-pipelines/_multithreading -+section("user-hooks") - +h(2, "user-hooks") User hooks - include _processing-pipelines/_user-hooks - +section("serialization") +h(2, "serialization") Serialization include _processing-pipelines/_serialization From ca6769fd4855e55365b70c3b6cbd32387aec6548 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 15:28:01 +0200 Subject: [PATCH 15/19] Update spacy functions and remove removed set_factory --- website/api/_top-level/_spacy.jade | 38 ++---------------------------- 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/website/api/_top-level/_spacy.jade b/website/api/_top-level/_spacy.jade index 2b523f846..81ec744ad 100644 --- a/website/api/_top-level/_spacy.jade +++ b/website/api/_top-level/_spacy.jade @@ -50,8 +50,8 @@ p | returns it. +code("Abstract example"). - cls = util.get_lang_class(lang) # get Language class for ID, e.g. 'en' - nlp = cls() # initialise the Language class + cls = util.get_lang_class(lang) # get language for ID, e.g. 'en' + nlp = cls() # initialise the language for name in pipeline: component = nlp.create_pipe(name) # create each pipeline component nlp.add_pipe(component) # add component to pipeline @@ -155,37 +155,3 @@ p +cell returns +cell unicode +cell The explanation, or #[code None] if not found in the glossary. - -+h(3, "spacy.set_factory") spacy.set_factory - +tag function - +tag-new(2) - -p - | Set a factory that returns a custom - | #[+a("/usage/processing-pipelines") processing pipeline] - | component. Factories are useful for creating stateful components, especially ones which depend on shared data. - -+aside-code("Example"). - def my_factory(vocab): - def my_component(doc): - return doc - return my_component - - spacy.set_factory('my_factory', my_factory) - nlp = Language(pipeline=['my_factory']) - -+table(["Name", "Type", "Description"]) - +row - +cell #[code factory_id] - +cell unicode - +cell - | Unique name of factory. If added to a new pipeline, spaCy will - | look up the factory for this ID and use it to create the - | component. - - +row - +cell #[code factory] - +cell callable - +cell - | Callable that takes a #[code Vocab] object and returns a pipeline - | component. 
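
To complement the abstract create_pipe / add_pipe example updated in the patch above, here is a minimal, self-contained sketch of the same flow with a stateful component, the case that spacy.set_factory used to cover. It assumes spaCy 2.x with an installed 'en' model; the LengthLogger class, its threshold setting and the 'length_logger' name are illustrative placeholders, not part of spaCy.

    import spacy

    class LengthLogger(object):
        # Toy stateful component: warns about very short documents.
        name = 'length_logger'

        def __init__(self, threshold=10):
            self.threshold = threshold

        def __call__(self, doc):
            if len(doc) < self.threshold:
                print("Short doc: only %d tokens" % len(doc))
            return doc  # always hand the Doc on to the next component

    nlp = spacy.load('en')
    nlp.add_pipe(LengthLogger(threshold=5), first=True)  # plain class, no set_factory needed
    print(nlp.pipe_names)       # e.g. ['length_logger', 'tagger', 'parser', 'ner']
    doc = nlp(u'Hello world.')  # triggers the warning from LengthLogger

    textcat = nlp.create_pipe('textcat')  # built-in factory, looked up by name
    nlp.add_pipe(textcat, last=True)      # would still need training before use

Built-in, weight-bearing components keep using factories, while custom logic can now be any callable added directly to the pipeline, without a separate registration step.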
From 2ac8b5c6223483af59d279277769a1c7b055ee7e Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 9 Oct 2017 14:36:20 +0200 Subject: [PATCH 16/19] Add wrapper for before/after code examples --- website/_includes/_mixins.jade | 4 ++++ website/assets/css/_base/_utilities.sass | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 4876c6b6b..68db1be57 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap) //- Code blocks to display old/new versions +mixin code-compare() + span.u-inline-block.u-padding-top.u-width-full + block + mixin code-old() +code(false, false, false, false, "reject").o-block-small block diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index e2ba552b7..91a6251e6 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -143,6 +143,9 @@ //- Layout +.u-width-full + width: 100% + .u-float-left float: left margin-right: 1rem @@ -166,6 +169,9 @@ .u-padding-medium padding: 1.8rem +.u-padding-top + padding-top: 2rem + .u-inline-block display: inline-block From 4d248ea920958943979850dc9e605cd172b7ee3a Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 9 Oct 2017 14:36:30 +0200 Subject: [PATCH 17/19] Fix spacing on bulleted lists --- website/assets/css/_components/_lists.sass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/assets/css/_components/_lists.sass b/website/assets/css/_components/_lists.sass index 2a933c95e..553af6578 100644 --- a/website/assets/css/_components/_lists.sass +++ b/website/assets/css/_components/_lists.sass @@ -25,7 +25,7 @@ display: inline-block font-size: 0.6em font-weight: bold - padding-right: 1.25rem + padding-right: 1em margin-left: -3.75rem text-align: right width: 2.5rem From 6550d0547c03002e1d46cf2cf1aa396835bc7cde Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 9 Oct 2017 14:36:36 +0200 Subject: [PATCH 18/19] Fix typo --- website/usage/_processing-pipelines/_serialization.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/usage/_processing-pipelines/_serialization.jade b/website/usage/_processing-pipelines/_serialization.jade index e29cbc558..111a5fbad 100644 --- a/website/usage/_processing-pipelines/_serialization.jade +++ b/website/usage/_processing-pipelines/_serialization.jade @@ -21,7 +21,7 @@ p +code. 
import spacy - from spacy.tokens import Span + from spacy.tokens.span import Span text = u'Netflix is hiring a new VP of global policy' From 6c253db3fe879f229adce49f6b2541b8d5b97913 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 9 Oct 2017 14:36:56 +0200 Subject: [PATCH 19/19] Add section for developing spaCy extensions --- website/usage/_data.json | 1 + website/usage/_processing-pipelines/_extensions.jade | 3 +++ website/usage/processing-pipelines.jade | 4 ++++ 3 files changed, 8 insertions(+) create mode 100644 website/usage/_processing-pipelines/_extensions.jade diff --git a/website/usage/_data.json b/website/usage/_data.json index f77f7929c..25165c3ee 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -107,6 +107,7 @@ "Custom Components": "custom-components", "Multi-threading": "multithreading", "Serialization": "serialization", + "Developing Extensions": "extensions" } }, diff --git a/website/usage/_processing-pipelines/_extensions.jade b/website/usage/_processing-pipelines/_extensions.jade new file mode 100644 index 000000000..d512e0321 --- /dev/null +++ b/website/usage/_processing-pipelines/_extensions.jade @@ -0,0 +1,3 @@ +//- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS + ++under-construction diff --git a/website/usage/processing-pipelines.jade b/website/usage/processing-pipelines.jade index 0d0579883..346e0554d 100644 --- a/website/usage/processing-pipelines.jade +++ b/website/usage/processing-pipelines.jade @@ -19,3 +19,7 @@ include _spacy-101/_pipelines +section("serialization") +h(2, "serialization") Serialization include _processing-pipelines/_serialization + ++section("extensions") + +h(2, "extensions") Developing spaCy extensions + include _processing-pipelines/_extensions
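
The extensions section added above is still a placeholder, so to round off the user-hooks documentation from this series, here is a hedged, self-contained sketch of the hook mechanism in use. It assumes spaCy 2.x with an installed 'en' model; the SimilarityHooks class and its cosine stand-in are illustrative only, not spaCy's built-in similarity implementation.

    import numpy
    import spacy

    class SimilarityHooks(object):
        # Installs the same custom function for Doc, Span and Token similarity.
        name = 'custom_similarity'

        def __call__(self, doc):
            doc.user_hooks['similarity'] = self.similarity
            doc.user_span_hooks['similarity'] = self.similarity
            doc.user_token_hooks['similarity'] = self.similarity
            return doc  # hand the Doc on to the next component

        def similarity(self, obj1, obj2):
            # Cosine over .vector, standing in for a trained similarity model.
            v1, v2 = obj1.vector, obj2.vector
            norm = numpy.linalg.norm(v1) * numpy.linalg.norm(v2)
            return float(v1.dot(v2) / norm) if norm else 0.0

    nlp = spacy.load('en')
    nlp.add_pipe(SimilarityHooks(), last=True)
    doc1 = nlp(u'the fries were gross')
    doc2 = nlp(u'worst fries ever')
    print(doc1.similarity(doc2))          # delegates to the installed hook
    print(doc1[0].similarity(doc2[1:3]))  # Token and Span comparisons use it too

Because the hook lives on the Doc, the override applies polymorphically: doc.similarity(span), token.similarity(doc) and any other combination route through the same function.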