From 212c8f071180c9ce134a74b85603e48c14199595 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 00:25:54 +0200 Subject: [PATCH 01/19] Implement new Language methods and pipeline API --- spacy/language.py | 260 ++++++++++++++++++++++++++-------------------- spacy/util.py | 6 +- 2 files changed, 150 insertions(+), 116 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index c49c64b1d..91644aec0 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -70,59 +70,7 @@ class BaseDefaults(object): prefix_search=prefix_search, suffix_search=suffix_search, infix_finditer=infix_finditer, token_match=token_match) - @classmethod - def create_tagger(cls, nlp=None, **cfg): - if nlp is None: - return NeuralTagger(cls.create_vocab(nlp), **cfg) - else: - return NeuralTagger(nlp.vocab, **cfg) - - @classmethod - def create_parser(cls, nlp=None, **cfg): - if nlp is None: - return NeuralDependencyParser(cls.create_vocab(nlp), **cfg) - else: - return NeuralDependencyParser(nlp.vocab, **cfg) - - @classmethod - def create_entity(cls, nlp=None, **cfg): - if nlp is None: - return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg) - else: - return NeuralEntityRecognizer(nlp.vocab, **cfg) - - @classmethod - def create_pipeline(cls, nlp=None, disable=tuple()): - meta = nlp.meta if nlp is not None else {} - # Resolve strings, like "cnn", "lstm", etc - pipeline = [] - for entry in meta.get('pipeline', []): - if entry in disable or getattr(entry, 'name', entry) in disable: - continue - factory = cls.Defaults.factories[entry] - pipeline.append(factory(nlp, **meta.get(entry, {}))) - return pipeline - - factories = { - 'make_doc': create_tokenizer, - 'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], - 'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], - 'parser': lambda nlp, **cfg: [ - NeuralDependencyParser(nlp.vocab, **cfg), - nonproj.deprojectivize], - 'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], - 'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)], - 'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)], - # Temporary compatibility -- delete after pivot - 'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)], - 'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)], - 'dependencies': lambda nlp, **cfg: [ - NeuralDependencyParser(nlp.vocab, **cfg), - nonproj.deprojectivize, - ], - 'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)], - } - + pipe_names = ['tensorizer', 'tagger', 'parser', 'ner'] token_match = TOKEN_MATCH prefixes = tuple(TOKENIZER_PREFIXES) suffixes = tuple(TOKENIZER_SUFFIXES) @@ -152,8 +100,17 @@ class Language(object): Defaults = BaseDefaults lang = None - def __init__(self, vocab=True, make_doc=True, pipeline=None, - meta={}, disable=tuple(), **kwargs): + factories = { + 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), + 'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), + 'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), + 'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), # nonproj.deprojectivize, + 'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), + 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), + 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) + } + + def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs): """Initialise a Language object. vocab (Vocab): A `Vocab` object. 
If `True`, a vocab is created via @@ -179,28 +136,7 @@ class Language(object): factory = self.Defaults.create_tokenizer make_doc = factory(self, **meta.get('tokenizer', {})) self.tokenizer = make_doc - if pipeline is True: - self.pipeline = self.Defaults.create_pipeline(self, disable) - elif pipeline: - # Careful not to do getattr(p, 'name', None) here - # If we had disable=[None], we'd disable everything! - self.pipeline = [p for p in pipeline - if p not in disable - and getattr(p, 'name', p) not in disable] - # Resolve strings, like "cnn", "lstm", etc - for i, entry in enumerate(self.pipeline): - if entry in self.Defaults.factories: - factory = self.Defaults.factories[entry] - self.pipeline[i] = factory(self, **meta.get(entry, {})) - else: - self.pipeline = [] - flat_list = [] - for pipe in self.pipeline: - if isinstance(pipe, list): - flat_list.extend(pipe) - else: - flat_list.append(pipe) - self.pipeline = flat_list + self.pipeline = [] self._optimizer = None @property @@ -214,11 +150,7 @@ class Language(object): self._meta.setdefault('email', '') self._meta.setdefault('url', '') self._meta.setdefault('license', '') - pipeline = [] - for component in self.pipeline: - if hasattr(component, 'name'): - pipeline.append(component.name) - self._meta['pipeline'] = pipeline + self._meta['pipeline'] = self.pipe_names return self._meta @meta.setter @@ -228,31 +160,133 @@ class Language(object): # Conveniences to access pipeline components @property def tensorizer(self): - return self.get_component('tensorizer') + return self.get_pipe('tensorizer') @property def tagger(self): - return self.get_component('tagger') + return self.get_pipe('tagger') @property def parser(self): - return self.get_component('parser') + return self.get_pipe('parser') @property def entity(self): - return self.get_component('ner') + return self.get_pipe('ner') @property def matcher(self): - return self.get_component('matcher') + return self.get_pipe('matcher') - def get_component(self, name): - if self.pipeline in (True, None): - return None - for proc in self.pipeline: - if hasattr(proc, 'name') and proc.name.endswith(name): - return proc - return None + @property + def pipe_names(self): + """Get names of available pipeline components. + + RETURNS (list): List of component name strings, in order. + """ + return [pipe_name for pipe_name, _ in self.pipeline] + + def get_pipe(self, name): + """Get a pipeline component for a given component name. + + name (unicode): Name of pipeline component to get. + RETURNS (callable): The pipeline component. + """ + for pipe_name, component in self.pipeline: + if pipe_name == name: + return component + msg = "No component '{}' found in pipeline. Available names: {}" + raise KeyError(msg.format(name, self.pipe_names)) + + def create_pipe(self, name, config=dict()): + """Create a pipeline component from a factory. + + name (unicode): Factory name to look up in `Language.factories`. + RETURNS (callable): Pipeline component. + """ + if name not in self.factories: + raise KeyError("Can't find factory for '{}'.".format(name)) + factory = self.factories[name] + return factory(self, **config) + + def add_pipe(self, component, name=None, before=None, after=None, + first=None, last=None): + """Add a component to the processing pipeline. Valid components are + callables that take a `Doc` object, modify it and return it. Only one of + before, after, first or last can be set. Default behaviour is "last". + + component (callable): The pipeline component. + name (unicode): Name of pipeline component. 
Overwrites existing + component.name attribute if available. If no name is set and + the component exposes no name attribute, component.__name__ is + used. An error is raised if the name already exists in the pipeline. + before (unicode): Component name to insert component directly before. + after (unicode): Component name to insert component directly after. + first (bool): Insert component first / not first in the pipeline. + last (bool): Insert component last / not last in the pipeline. + + EXAMPLE: + >>> nlp.add_pipe(component, before='ner') + >>> nlp.add_pipe(component, name='custom_name', last=True) + """ + if name is None: + name = getattr(component, 'name', component.__name__) + if name in self.pipe_names: + raise ValueError("'{}' already exists in pipeline.".format(name)) + if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: + msg = ("Invalid constraints. You can only set one of the " + "following: before, after, first, last.") + raise ValueError(msg) + pipe = (name, component) + if last or not any([first, before, after]): + self.pipeline.append(pipe) + elif first: + self.pipeline.insert(0, pipe) + elif before and before in self.pipe_names: + self.pipeline.insert(self.pipe_names.index(before), pipe) + elif after and after in self.pipe_names: + self.pipeline.insert(self.pipe_names.index(after), pipe) + else: + msg = "Can't find '{}' in pipeline. Available names: {}" + unfound = before or after + raise ValueError(msg.format(unfound, self.pipe_names)) + + def replace_pipe(self, name, component): + """Replace a component in the pipeline. + + name (unicode): Name of the component to replace. + component (callable): Pipeline component. + """ + if name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(name, self.pipe_names)) + self.pipeline[self.pipe_names.index(name)] = (name, component) + + def rename_pipe(self, old_name, new_name): + """Rename a pipeline component. + + old_name (unicode): Name of the component to rename. + new_name (unicode): New name of the component. + """ + if old_name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(old_name, self.pipe_names)) + if new_name in self.pipe_names: + msg = "'{}' already exists in pipeline. Existing names: {}" + raise ValueError(msg.format(new_name, self.pipe_names)) + i = self.pipe_names.index(old_name) + self.pipeline[i] = (new_name, self.pipeline[i][1]) + + def remove_pipe(self, name): + """Remove a component from the pipeline. + + name (unicode): Name of the component to remove. + RETURNS (tuple): A (name, component) tuple of the removed component. + """ + if name not in self.pipe_names: + msg = "Can't find '{}' in pipeline. Available names: {}" + raise ValueError(msg.format(name, self.pipe_names)) + return self.pipeline.pop(self.pipe_names.index(name)) def __call__(self, text, disable=[]): """'Apply the pipeline to some text. 
The text can span multiple sentences, @@ -269,8 +303,7 @@ class Language(object): ('An', 'NN') """ doc = self.make_doc(text) - for proc in self.pipeline: - name = getattr(proc, 'name', None) + for name, proc in self.pipeline: if name in disable: continue doc = proc(doc) @@ -308,7 +341,7 @@ class Language(object): grads[key] = (W, dW) pipes = list(self.pipeline) random.shuffle(pipes) - for proc in pipes: + for name, proc in pipes: if not hasattr(proc, 'update'): continue proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) @@ -322,7 +355,7 @@ class Language(object): docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects. YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects. """ - for proc in self.pipeline: + for name, proc in self.pipeline: if hasattr(proc, 'preprocess_gold'): docs_golds = proc.preprocess_gold(docs_golds) for doc, gold in docs_golds: @@ -371,7 +404,7 @@ class Language(object): else: device = None link_vectors_to_models(self.vocab) - for proc in self.pipeline: + for name, proc in self.pipeline: if hasattr(proc, 'begin_training'): context = proc.begin_training(get_gold_tuples(), pipeline=self.pipeline) @@ -393,7 +426,7 @@ class Language(object): docs, golds = zip(*docs_golds) docs = list(docs) golds = list(golds) - for pipe in self.pipeline: + for name, pipe in self.pipeline: if not hasattr(pipe, 'pipe'): for doc in docs: pipe(doc) @@ -419,7 +452,7 @@ class Language(object): >>> with nlp.use_params(optimizer.averages): >>> nlp.to_disk('/tmp/checkpoint') """ - contexts = [pipe.use_params(params) for pipe + contexts = [pipe.use_params(params) for name, pipe in self.pipeline if hasattr(pipe, 'use_params')] # TODO: Having trouble with contextlib # Workaround: these aren't actually context managers atm. 
@@ -466,8 +499,7 @@ class Language(object): yield (doc, context) return docs = (self.make_doc(text) for text in texts) - for proc in self.pipeline: - name = getattr(proc, 'name', None) + for name, proc in self.pipeline: if name in disable: continue if hasattr(proc, 'pipe'): @@ -495,14 +527,14 @@ class Language(object): ('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)), ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) )) - for proc in self.pipeline: + for name, proc in self.pipeline: if not hasattr(proc, 'name'): continue - if proc.name in disable: + if name in disable: continue if not hasattr(proc, 'to_disk'): continue - serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) + serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False) serializers['vocab'] = lambda p: self.vocab.to_disk(p) util.to_disk(path, serializers, {p: False for p in disable}) @@ -526,14 +558,12 @@ class Language(object): ('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)), ('meta.json', lambda p: p.open('w').write(json_dumps(self.meta))) )) - for proc in self.pipeline: - if not hasattr(proc, 'name'): - continue - if proc.name in disable: + for name, proc in self.pipeline: + if name in disable: continue if not hasattr(proc, 'to_disk'): continue - deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) + deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False) exclude = {p: False for p in disable} if not (path / 'vocab').exists(): exclude['vocab'] = True @@ -552,8 +582,8 @@ class Language(object): ('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)), ('meta', lambda: ujson.dumps(self.meta)) )) - for i, proc in enumerate(self.pipeline): - if getattr(proc, 'name', None) in disable: + for i, (name, proc) in enumerate(self.pipeline): + if name in disable: continue if not hasattr(proc, 'to_bytes'): continue @@ -572,8 +602,8 @@ class Language(object): ('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)), ('meta', lambda b: self.meta.update(ujson.loads(b))) )) - for i, proc in enumerate(self.pipeline): - if getattr(proc, 'name', None) in disable: + for i, (name, proc) in enumerate(self.pipeline): + if name in disable: continue if not hasattr(proc, 'from_bytes'): continue diff --git a/spacy/util.py b/spacy/util.py index e1a721a12..9e9c4fa42 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -135,7 +135,11 @@ def load_model_from_path(model_path, meta=False, **overrides): if not meta: meta = get_model_meta(model_path) cls = get_lang_class(meta['lang']) - nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides) + nlp = cls(meta=meta, **overrides) + for name in meta.get('pipeline', []): + config = meta.get('pipeline_args', {}).get(name, {}) + component = nlp.create_pipe(name, config=config) + nlp.add_pipe(component, name=name) return nlp.from_disk(model_path) From 2586b61b15fa04d91ec4a2919729ab70e9a6b26b Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 00:26:05 +0200 Subject: [PATCH 02/19] Fix formatting, tidy up and remove unused imports --- spacy/language.py | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 91644aec0..7a409133a 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,12 +1,9 @@ # coding: utf8 from __future__ import absolute_import, unicode_literals from contextlib import contextmanager -import dill -import numpy from thinc.neural import Model -from thinc.neural.ops 
import NumpyOps, CupyOps -from thinc.neural.optimizers import Adam, SGD +from thinc.neural.optimizers import Adam import random import ujson from collections import OrderedDict @@ -17,24 +14,20 @@ from .vocab import Vocab from .tagger import Tagger from .lemmatizer import Lemmatizer from .syntax.parser import get_templates -from .syntax import nonproj -from .pipeline import NeuralDependencyParser, EntityRecognizer -from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer -from .pipeline import NeuralLabeller -from .pipeline import SimilarityHook -from .pipeline import TextCategorizer -from . import about +from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger +from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer from .compat import json_dumps, izip +from .scorer import Scorer +from ._ml import link_vectors_to_models from .attrs import IS_STOP from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES from .lang.tokenizer_exceptions import TOKEN_MATCH from .lang.tag_map import TAG_MAP from .lang.lex_attrs import LEX_ATTRS from . import util -from .scorer import Scorer -from ._ml import link_vectors_to_models +from . import about class BaseDefaults(object): @@ -289,7 +282,7 @@ class Language(object): return self.pipeline.pop(self.pipe_names.index(name)) def __call__(self, text, disable=[]): - """'Apply the pipeline to some text. The text can span multiple sentences, + """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. @@ -387,7 +380,7 @@ class Language(object): get_gold_tuples (function): Function returning gold data **cfg: Config parameters. - returns: An optimizer + RETURNS: An optimizer """ # Populate vocab if get_gold_tuples is not None: From b39409173e4143b6053892475c1adf6010176060 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 00:29:08 +0200 Subject: [PATCH 03/19] Add disable option and True/False/None values for pipeline --- spacy/util.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/spacy/util.py b/spacy/util.py index 9e9c4fa42..50ebc036b 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -136,10 +136,17 @@ def load_model_from_path(model_path, meta=False, **overrides): meta = get_model_meta(model_path) cls = get_lang_class(meta['lang']) nlp = cls(meta=meta, **overrides) - for name in meta.get('pipeline', []): - config = meta.get('pipeline_args', {}).get(name, {}) - component = nlp.create_pipe(name, config=config) - nlp.add_pipe(component, name=name) + pipeline = meta.get('pipeline', []) + disable = overrides.get('disable', []) + if pipeline is True: + pipeline = nlp.Defaults.pipe_names + elif pipeline in (False, None): + pipeline = [] + for name in pipeline: + if name not in disable: + config = meta.get('pipeline_args', {}).get(name, {}) + component = nlp.create_pipe(name, config=config) + nlp.add_pipe(component, name=name) return nlp.from_disk(model_path) From 61a503a61195c465328fcf0f283ce64f923b5c55 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 00:38:51 +0200 Subject: [PATCH 04/19] Fix parser test --- spacy/tests/conftest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index b33a7c008..28b5f4ab9 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -58,8 +58,9 @@ def en_vocab(): @pytest.fixture -def en_parser(): - return 
util.get_lang_class('en').Defaults.create_parser() +def en_parser(en_vocab): + nlp = util.get_lang_class('en')(en_vocab) + return nlp.create_pipe('parser') @pytest.fixture From e43530269c77a39d7b9460d5730db5707c439285 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 01:04:50 +0200 Subject: [PATCH 05/19] Update docstrings --- spacy/language.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index 7a409133a..a3152aea3 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -195,6 +195,7 @@ class Language(object): """Create a pipeline component from a factory. name (unicode): Factory name to look up in `Language.factories`. + config (dict): Configuration parameters to initialise component. RETURNS (callable): Pipeline component. """ if name not in self.factories: @@ -274,7 +275,7 @@ class Language(object): """Remove a component from the pipeline. name (unicode): Name of the component to remove. - RETURNS (tuple): A (name, component) tuple of the removed component. + RETURNS (tuple): A `(name, component)` tuple of the removed component. """ if name not in self.pipe_names: msg = "Can't find '{}' in pipeline. Available names: {}" From 3a65a0c970ec235d7e5b306924a90e8552c6568c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 7 Oct 2017 01:48:23 +0200 Subject: [PATCH 06/19] Start adding tests for new pipeline management --- spacy/tests/pipeline/__init__.py | 0 spacy/tests/pipeline/test_add_pipe.py | 43 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 spacy/tests/pipeline/__init__.py create mode 100644 spacy/tests/pipeline/test_add_pipe.py diff --git a/spacy/tests/pipeline/__init__.py b/spacy/tests/pipeline/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/pipeline/test_add_pipe.py b/spacy/tests/pipeline/test_add_pipe.py new file mode 100644 index 000000000..13fb4acaf --- /dev/null +++ b/spacy/tests/pipeline/test_add_pipe.py @@ -0,0 +1,43 @@ +from __future__ import unicode_literals +import pytest + +from ... 
import language +from ...language import Language + +@pytest.fixture +def nlp(): + return Language() + +@pytest.fixture +def name(): + return 'parser' + +def new_pipe(doc): + return doc + + +def test_add_pipe_no_name(nlp): + nlp.add_pipe(new_pipe) + assert 'new_pipe' in nlp.pipe_names + +def test_add_pipe_duplicate_name(nlp): + nlp.add_pipe(new_pipe, name='duplicate_name') + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, name='duplicate_name') + + +def test_add_pipe_first(nlp, name): + nlp.add_pipe(new_pipe, name=name, first=True) + assert nlp.pipeline[0][0] == name + + +def test_add_pipe_last(nlp, name): + nlp.add_pipe(lambda doc: doc, name='lambda_pipe') + nlp.add_pipe(new_pipe, name=name, last=True) + assert nlp.pipeline[0][0] != name + assert nlp.pipeline[-1][0] == name + + +def test_cant_add_pipe_first_and_last(nlp): + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, first=True, last=True) From 0384f0821817014972b5bf8f062d94cd6ea22c2b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 7 Oct 2017 02:00:47 +0200 Subject: [PATCH 07/19] Trigger nonproj.deprojectivize as a postprocess --- spacy/language.py | 2 +- spacy/pipeline.pyx | 14 ++++++++++++++ spacy/syntax/nn_parser.pyx | 8 ++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) diff --git a/spacy/language.py b/spacy/language.py index a3152aea3..d40aee3ca 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -97,7 +97,7 @@ class Language(object): 'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp), 'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg), 'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg), - 'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), # nonproj.deprojectivize, + 'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg), 'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg), 'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), 'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg) diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx index 8d935335c..4d9adc609 100644 --- a/spacy/pipeline.pyx +++ b/spacy/pipeline.pyx @@ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity from .tokens.doc cimport Doc from .syntax.parser cimport Parser as LinearParser from .syntax.nn_parser cimport Parser as NeuralParser +from .syntax import nonproj from .syntax.parser import get_templates as get_feature_templates from .syntax.beam_parser cimport BeamParser from .syntax.ner cimport BiluoPushDown @@ -773,11 +774,19 @@ cdef class DependencyParser(LinearParser): if isinstance(label, basestring): label = self.vocab.strings[label] + @property + def postprocesses(self): + return [nonproj.deprojectivize] + cdef class NeuralDependencyParser(NeuralParser): name = 'parser' TransitionSystem = ArcEager + @property + def postprocesses(self): + return [nonproj.deprojectivize] + def init_multitask_objectives(self, gold_tuples, pipeline, **cfg): for target in []: labeller = NeuralLabeller(self.vocab, target=target) @@ -818,6 +827,11 @@ cdef class BeamDependencyParser(BeamParser): if isinstance(label, basestring): label = self.vocab.strings[label] + @property + def postprocesses(self): + return [nonproj.deprojectivize] + + __all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser', 'BeamEntityRecognizer', 'TokenVectorEnoder'] diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 459c94463..f2c72a639 100644 --- a/spacy/syntax/nn_parser.pyx 
+++ b/spacy/syntax/nn_parser.pyx @@ -739,6 +739,14 @@ cdef class Parser: for i in range(doc.length): doc.c[i] = state.c._sent[i] self.moves.finalize_doc(doc) + for hook in self.postprocesses: + for doc in docs: + hook(doc) + + @property + def postprocesses(self): + # Available for subclasses, e.g. to deprojectivize + return [] def add_label(self, label): for action in self.moves.action_types: From b38a8f4a943306a4a978e9b40fea9f5f2d7193e7 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 02:06:21 +0200 Subject: [PATCH 08/19] Fix and update pipe methods tests --- spacy/tests/pipeline/test_add_pipe.py | 43 ------------ spacy/tests/pipeline/test_pipe_methods.py | 84 +++++++++++++++++++++++ 2 files changed, 84 insertions(+), 43 deletions(-) delete mode 100644 spacy/tests/pipeline/test_add_pipe.py create mode 100644 spacy/tests/pipeline/test_pipe_methods.py diff --git a/spacy/tests/pipeline/test_add_pipe.py b/spacy/tests/pipeline/test_add_pipe.py deleted file mode 100644 index 13fb4acaf..000000000 --- a/spacy/tests/pipeline/test_add_pipe.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals -import pytest - -from ... import language -from ...language import Language - -@pytest.fixture -def nlp(): - return Language() - -@pytest.fixture -def name(): - return 'parser' - -def new_pipe(doc): - return doc - - -def test_add_pipe_no_name(nlp): - nlp.add_pipe(new_pipe) - assert 'new_pipe' in nlp.pipe_names - -def test_add_pipe_duplicate_name(nlp): - nlp.add_pipe(new_pipe, name='duplicate_name') - with pytest.raises(ValueError): - nlp.add_pipe(new_pipe, name='duplicate_name') - - -def test_add_pipe_first(nlp, name): - nlp.add_pipe(new_pipe, name=name, first=True) - assert nlp.pipeline[0][0] == name - - -def test_add_pipe_last(nlp, name): - nlp.add_pipe(lambda doc: doc, name='lambda_pipe') - nlp.add_pipe(new_pipe, name=name, last=True) - assert nlp.pipeline[0][0] != name - assert nlp.pipeline[-1][0] == name - - -def test_cant_add_pipe_first_and_last(nlp): - with pytest.raises(ValueError): - nlp.add_pipe(new_pipe, first=True, last=True) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py new file mode 100644 index 000000000..5ec78aefb --- /dev/null +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -0,0 +1,84 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest + +from ...language import Language + + +@pytest.fixture +def nlp(): + return Language() + + +def new_pipe(doc): + return doc + + +def test_add_pipe_no_name(nlp): + nlp.add_pipe(new_pipe) + assert 'new_pipe' in nlp.pipe_names + + +def test_add_pipe_duplicate_name(nlp): + nlp.add_pipe(new_pipe, name='duplicate_name') + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, name='duplicate_name') + + +@pytest.mark.parametrize('name', ['parser']) +def test_add_pipe_first(nlp, name): + nlp.add_pipe(new_pipe, name=name, first=True) + assert nlp.pipeline[0][0] == name + + +@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')]) +def test_add_pipe_last(nlp, name1, name2): + nlp.add_pipe(lambda doc: doc, name=name2) + nlp.add_pipe(new_pipe, name=name1, last=True) + assert nlp.pipeline[0][0] != name1 + assert nlp.pipeline[-1][0] == name1 + + +def test_cant_add_pipe_first_and_last(nlp): + with pytest.raises(ValueError): + nlp.add_pipe(new_pipe, first=True, last=True) + + +@pytest.mark.parametrize('name', ['my_component']) +def test_get_pipe(nlp, name): + with pytest.raises(KeyError): + nlp.get_pipe(name) + nlp.add_pipe(new_pipe, name=name) + 
assert nlp.get_pipe(name) == new_pipe + + +@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)]) +def test_replace_pipe(nlp, name, replacement): + with pytest.raises(ValueError): + nlp.replace_pipe(name, new_pipe) + nlp.add_pipe(new_pipe, name=name) + nlp.replace_pipe(name, replacement) + assert nlp.get_pipe(name) != new_pipe + assert nlp.get_pipe(name) == replacement + + +@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')]) +def test_rename_pipe(nlp, old_name, new_name): + with pytest.raises(ValueError): + nlp.rename_pipe(old_name, new_name) + nlp.add_pipe(new_pipe, name=old_name) + nlp.rename_pipe(old_name, new_name) + assert nlp.pipeline[0][0] == new_name + + +@pytest.mark.parametrize('name', ['my_component']) +def test_remove_pipe(nlp, name): + with pytest.raises(ValueError): + nlp.remove_pipe(name) + nlp.add_pipe(new_pipe, name=name) + assert len(nlp.pipeline) == 1 + removed_name, removed_component = nlp.remove_pipe(name) + assert not len(nlp.pipeline) + assert removed_name == name + assert removed_component == new_pipe From 0adadcb3f04e2ecb98b5ca5de1afba2ba7208d23 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 02:15:15 +0200 Subject: [PATCH 09/19] Fix beam parse model test --- spacy/tests/parser/test_beam_parse.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/spacy/tests/parser/test_beam_parse.py b/spacy/tests/parser/test_beam_parse.py index da5f43d5e..dd77c6805 100644 --- a/spacy/tests/parser/test_beam_parse.py +++ b/spacy/tests/parser/test_beam_parse.py @@ -1,10 +1,11 @@ -import spacy +# coding: utf8 +from __future__ import unicode_literals + import pytest -@pytest.mark.models -def test_beam_parse(): - nlp = spacy.load('en_core_web_sm') - doc = nlp(u'Australia is a country', disable=['ner']) - ents = nlp.entity(doc, beam_width=2) - print(ents) +@pytest.mark.models('en') +def test_beam_parse(EN): + doc = EN(u'Australia is a country', disable=['ner']) + ents = EN.entity(doc, beam_width=2) + print(ents) From e370332fb1fe8cb179f0fbbbfd79b7251df8781c Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 03:00:20 +0200 Subject: [PATCH 10/19] Update Language API docs --- website/api/language.jade | 229 +++++++++++++++++++++++++++++++++++--- 1 file changed, 216 insertions(+), 13 deletions(-) diff --git a/website/api/language.jade b/website/api/language.jade index 617c81599..89807fabe 100644 --- a/website/api/language.jade +++ b/website/api/language.jade @@ -4,7 +4,14 @@ include ../_includes/_mixins p | Usually you'll load this once per process as #[code nlp] and pass the - | instance around your application. + | instance around your application. The #[code Language] class is created + | when you call #[+api("spacy#load") #[code spacy.load()]] and contains + | the shared vocabulary and #[+a("/usage/adding-languages") language data], + | optional model data loaded from a #[+a("/models") model package] or + | a path, and a #[+a("/usage/processing-pipelines") processing pipeline] + | containing components like the tagger or parser that are called on a + | document in order. You can also add your own processing pipeline + | components that take a #[code Doc] object, modify it and return it. +h(2, "init") Language.__init__ +tag method @@ -12,9 +19,9 @@ p p Initialise a #[code Language] object. +aside-code("Example"). 
+ from spacy.vocab import Vocab from spacy.language import Language - nlp = Language(pipeline=['token_vectors', 'tags', - 'dependencies']) + nlp = Language(Vocab()) from spacy.lang.en import English nlp = English() @@ -34,14 +41,6 @@ p Initialise a #[code Language] object. | A function that takes text and returns a #[code Doc] object. | Usually a #[code Tokenizer]. - +row - +cell #[code pipeline] - +cell list - +cell - | A list of annotation processes or IDs of annotation, processes, - | e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked - | up in #[code Language.Defaults.factories]. - +row +cell #[code meta] +cell dict @@ -54,6 +53,23 @@ p Initialise a #[code Language] object. +cell #[code Language] +cell The newly constructed object. ++infobox("Deprecation note", "⚠️") + .o-block + | To make the processing pipelines and their components more + | transparent, the #[code pipeline] and #[code disable] arguments on + | initialisation are now deprecated. Instead, pipeline components can + | now be added, removed and rearranged using the new #[code Language] + | methods, for example #[+api("language#add_pipe") #[code add_pipe]] or + | #[+api("language#create_pipe") #[code create_pipe]]. This is also how + | #[+api("spacy#load") #[code spacy.load()]] creates the + | #[code Language] instance it returns. + + +code-new. + nlp = English() + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser) + +code-old nlp = English(pipeline=['parser']) + +h(2, "call") Language.__call__ +tag method @@ -235,7 +251,6 @@ p | Can be called before training to pre-process gold data. By default, it | handles nonprojectivity and adds missing tags to the tag map. - +table(["Name", "Type", "Description"]) +row +cell #[code docs_golds] @@ -247,6 +262,177 @@ p +cell tuple +cell Tuples of #[code Doc] and #[code GoldParse] objects. ++h(2, "create_pipe") Language.create_pipe + +tag method + +tag-new(2) + +p Create a pipeline component from a factory. + ++aside-code("Example"). + parser = nlp.create_pipe('parser') + nlp.add_pipe(parser) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell + | Factory name to look up in + | #[+api("language#class-attributes") #[code Language.factories]]. + + +row + +cell #[code config] + +cell dict + +cell Configuration parameters to initialise component. + + +row("foot") + +cell returns + +cell callable + +cell The pipeline component. + ++h(2, "add_pipe") Language.add_pipe + +tag method + +tag-new(2) + +p + | Add a component to the processing pipeline. Valid components are + | callables that take a #[code Doc] object, modify it and return it. Only + | one of #[code before], #[code after], #[code first] or #[code last] can + | be set. Default behaviour is #[code last=True]. + ++aside-code("Example"). + def component(doc): + # modify Doc and return it + return doc + + nlp.add_pipe(component, before='ner') + nlp.add_pipe(component, name='custom_name', last=True) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code component] + +cell callable + +cell The pipeline component. + + +row + +cell #[code name] + +cell unicode + +cell + | Name of pipeline component. Overwrites existing + | #[code component.name] attribute if available. If no #[code name] + | is set and the component exposes no name attribute, + | #[code component.__name__] is used. An error is raised if the + | name already exists in the pipeline. + + +row + +cell #[code before] + +cell unicode + +cell Component name to insert component directly before. 
+ + +row + +cell #[code after] + +cell unicode + +cell Component name to insert component directly after: + + +row + +cell #[code first] + +cell bool + +cell Insert component first / not first in the pipeline. + + +row + +cell #[code last] + +cell bool + +cell Insert component last / not last in the pipeline. + ++h(2, "get_pipe") Language.get_pipe + +tag method + +tag-new(2) + +p Get a pipeline component for a given component name. + ++aside-code("Example"). + parser = nlp.get_pipe('parser') + custom_component = nlp.get_pipe('custom_component') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the pipeline component to get. + + +row("foot") + +cell returns + +cell callable + +cell The pipeline component. + ++h(2, "replace_pipe") Language.replace_pipe + +tag method + +tag-new(2) + +p Replace a component in the pipeline. + ++aside-code("Example"). + nlp.replace_pipe('parser', my_custom_parser) + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the component to replace. + + +row + +cell #[code component] + +cell callable + +cell The pipeline component to inser. + + ++h(2, "rename_pipe") Language.rename_pipe + +tag method + +tag-new(2) + +p + | Rename a component in the pipeline. Useful to create custom names for + | pre-defined and pre-loaded components. To change the default name of + | a component added to the pipeline, you can also use the #[code name] + | argument on #[+api("language#add_pipe") #[code add_pipe]]. + ++aside-code("Example"). + nlp.rename_pipe('parser', 'spacy_parser') + ++table(["Name", "Type", "Description"]) + +row + +cell #[code old_name] + +cell unicode + +cell Name of the component to rename. + + +row + +cell #[code new_name] + +cell unicode + +cell New name of the component. + ++h(2, "remove_pipe") Language.remove_pipe + +tag method + +tag-new(2) + +p + | Remove a component from the pipeline. Returns the removed component name + | and component function. + ++aside-code("Example"). + name, component = nlp.remove_pipe('parser') + assert name == 'parser' + ++table(["Name", "Type", "Description"]) + +row + +cell #[code name] + +cell unicode + +cell Name of the component to remove. + + +row("foot") + +cell returns + +cell tuple + +cell A #[code (name, component)] tuple of the removed component. + +h(2, "to_disk") Language.to_disk +tag method +tag-new(2) @@ -399,7 +585,15 @@ p Load state from a binary string. +row +cell #[code pipeline] +cell list - +cell Sequence of annotation functions. + +cell + | List of #[code (name, component)] tuples describing the current + | processing pipeline, in order. + + +row + +cell #[code pipe_names] + +tag-new(2) + +cell list + +cell List of pipeline component names, in order. +row +cell #[code meta] @@ -424,3 +618,12 @@ p Load state from a binary string. +cell | Two-letter language ID, i.e. | #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code]. + + +row + +cell #[code factories] + +tag-new(2) + +cell dict + +cell + | Factories that create pre-defined pipeline components, e.g. the + | tagger, parser or entity recognizer, keyed by their component + | name. 
From ed8e0085b0b6aae9501bb87be365366b88816be4 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 03:06:55 +0200 Subject: [PATCH 11/19] Update docs for spacy.load() --- website/api/_top-level/_spacy.jade | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/website/api/_top-level/_spacy.jade b/website/api/_top-level/_spacy.jade index c14f62f7e..2b523f846 100644 --- a/website/api/_top-level/_spacy.jade +++ b/website/api/_top-level/_spacy.jade @@ -43,6 +43,20 @@ p +cell #[code Language] +cell A #[code Language] object with the loaded model. +p + | Essentially, #[code spacy.load()] is a convenience wrapper that reads + | the language ID and pipeline components from a model's #[code meta.json], + | initialises the #[code Language] class, loads in the model data and + | returns it. + ++code("Abstract example"). + cls = util.get_lang_class(lang) # get Language class for ID, e.g. 'en' + nlp = cls() # initialise the Language class + for name in pipeline: + component = nlp.create_pipe(name) # create each pipeline component + nlp.add_pipe(component) # add component to pipeline + nlp.from_disk(model_data_path) # load in model data + +infobox("Deprecation note", "⚠️") .o-block | As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy From 58dfde7c0227958972dc37a71d878d605b87ffa1 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 04:54:57 +0200 Subject: [PATCH 12/19] Remove redundante deprecation note --- website/api/language.jade | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/website/api/language.jade b/website/api/language.jade index 89807fabe..500d6c411 100644 --- a/website/api/language.jade +++ b/website/api/language.jade @@ -53,23 +53,6 @@ p Initialise a #[code Language] object. +cell #[code Language] +cell The newly constructed object. -+infobox("Deprecation note", "⚠️") - .o-block - | To make the processing pipelines and their components more - | transparent, the #[code pipeline] and #[code disable] arguments on - | initialisation are now deprecated. Instead, pipeline components can - | now be added, removed and rearranged using the new #[code Language] - | methods, for example #[+api("language#add_pipe") #[code add_pipe]] or - | #[+api("language#create_pipe") #[code create_pipe]]. This is also how - | #[+api("spacy#load") #[code spacy.load()]] creates the - | #[code Language] instance it returns. - - +code-new. - nlp = English() - parser = nlp.create_pipe('parser') - nlp.add_pipe(parser) - +code-old nlp = English(pipeline=['parser']) - +h(2, "call") Language.__call__ +tag method From feaf353051f1163454a05c78b074c0a37b1329af Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 14:05:59 +0200 Subject: [PATCH 13/19] Update processing pipelines usage docs --- .../_processing-pipelines/_pipelines.jade | 195 +++++------------- 1 file changed, 52 insertions(+), 143 deletions(-) diff --git a/website/usage/_processing-pipelines/_pipelines.jade b/website/usage/_processing-pipelines/_pipelines.jade index d09ed4ead..3c1c28af1 100644 --- a/website/usage/_processing-pipelines/_pipelines.jade +++ b/website/usage/_processing-pipelines/_pipelines.jade @@ -11,7 +11,7 @@ p p | When you load a model, spaCy first consults the model's - | #[+a("/usage/saving-loading#models-generating") meta.json]. The + | #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The | meta typically includes the model details, the ID of a language class, | and an optional list of pipeline components. 
spaCy then does the | following: @@ -21,24 +21,26 @@ p "name": "example_model", "lang": "en" "description": "Example model for spaCy", - "pipeline": ["tensorizer", "tagger"] + "pipeline": ["tagger", "parser"] } +list("numbers") - +item - | Look up #[strong pipeline IDs] in the available - | #[strong pipeline factories]. - +item - | Initialise the #[strong pipeline components] by calling their - | factories with the #[code Vocab] as an argument. This gives each - | factory and component access to the pipeline's shared data, like - | strings, morphology and annotation scheme. +item | Load the #[strong language class and data] for the given ID via - | #[+api("util.get_lang_class") #[code get_lang_class]]. + | #[+api("util.get_lang_class") #[code get_lang_class]] and initialise + | it. The #[code Language] class contains the shared vocabulary, + | tokenization rules and the language-specific annotation scheme. +item - | Pass the path to the #[strong model data] to the #[code Language] - | class and return it. + | Iterate over the #[strong pipeline names] and create each component + | using #[+api("language#create_pipe") #[code create_pipe]], which + | looks them up in #[code Language.factories]. + +item + | Add each pipeline component to the pipeline in order, using + | #[+api("language#add_pipe") #[code add_pipe]]. + +item + | Make the #[strong model data] available to the #[code Language] class + | by calling #[+api("language#from_disk") #[code from_disk]] with the + | path to the model data ditectory. p | So when you call this... @@ -47,12 +49,12 @@ p nlp = spacy.load('en') p - | ... the model tells spaCy to use the pipeline + | ... the model tells spaCy to use the language #[code "en"] and the pipeline | #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will - | then look up each string in its internal factories registry and - | initialise the individual components. It'll then load - | #[code spacy.lang.en.English], pass it the path to the model's data - | directory, and return it for you to use as the #[code nlp] object. + | then initialise #[code spacy.lang.en.English], and create each pipeline + | component and add it to the processing pipeline. It'll then load in the + | model's data from its data ditectory and return the modified + | #[code Language] class for you to use as the #[code nlp] object. p | Fundamentally, a #[+a("/models") spaCy model] consists of three @@ -73,9 +75,12 @@ p pipeline = ['tensorizer', 'tagger', 'parser', 'ner'] data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0' - cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English() - nlp = cls(pipeline=pipeline) # 2. initialise it with the pipeline - nlp.from_disk(model_data_path) # 3. load in the binary data + cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English() + nlp = cls() # 2. initialise it + for name in pipeline: + component = nlp.create_pipe(name) # 3. create the pipeline components + nlp.add_pipe(component) # 4. add the component to the pipeline + nlp.from_disk(model_data_path) # 5. load in the binary data p | When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and @@ -87,124 +92,23 @@ p | document, which is then processed by the component next in the pipeline. +code("The pipeline under the hood"). 
- doc = nlp.make_doc(u'This is a sentence') - for proc in nlp.pipeline: - doc = proc(doc) - -+h(3, "creating") Creating pipeline components and factories + doc = nlp.make_doc(u'This is a sentence') # create a Doc from raw text + for name, proc in nlp.pipeline: # iterate over components in order + doc = proc(doc) # apply each component p - | spaCy lets you customise the pipeline with your own components. Components - | are functions that receive a #[code Doc] object, modify and return it. - | If your component is stateful, you'll want to create a new one for each - | pipeline. You can do that by defining and registering a factory which - | receives the shared #[code Vocab] object and returns a component. - -+h(4, "creating-component") Creating a component - -p - | A component receives a #[code Doc] object and - | #[strong performs the actual processing] – for example, using the current - | weights to make a prediction and set some annotation on the document. By - | adding a component to the pipeline, you'll get access to the #[code Doc] - | at any point #[strong during] processing – instead of only being able to - | modify it afterwards. - -+aside-code("Example"). - def my_component(doc): - # do something to the doc here - return doc - -+table(["Argument", "Type", "Description"]) - +row - +cell #[code doc] - +cell #[code Doc] - +cell The #[code Doc] object processed by the previous component. - - +row("foot") - +cell returns - +cell #[code Doc] - +cell The #[code Doc] object processed by this pipeline component. - -p - | When creating a new #[code Language] class, you can pass it a list of - | pipeline component functions to execute in that order. You can also - | add it to an existing pipeline by modifying #[code nlp.pipeline] – just - | be careful not to overwrite a pipeline or its components by accident! + | The current processing pipeline is available as #[code nlp.pipeline], + | which returns a list of #[code (name, component)] tuples, or + | #[code nlp.pipe_names], which only returns a list of human-readable + | component names. +code. - # Create a new Language object with a pipeline - from spacy.language import Language - nlp = Language(pipeline=[my_component]) + nlp.pipeline + # [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)] + nlp.pipe_names + # ['tagger', 'parser', 'ner'] - # Modify an existing pipeline - nlp = spacy.load('en') - nlp.pipeline.append(my_component) - -+h(4, "creating-factory") Creating a factory - -p - | A factory is a #[strong function that returns a pipeline component]. - | It's called with the #[code Vocab] object, to give it access to the - | shared data between components – for example, the strings, morphology, - | vectors or annotation scheme. Factories are useful for creating - | #[strong stateful components], especially ones which - | #[strong depend on shared data]. - -+aside-code("Example"). - def my_factory(vocab): - # load some state - def my_component(doc): - # process the doc - return doc - return my_component - -+table(["Argument", "Type", "Description"]) - +row - +cell #[code vocab] - +cell #[code Vocab] - +cell - | Shared data between components, including strings, morphology, - | vectors etc. - - +row("foot") - +cell returns - +cell callable - +cell The pipeline component. - -p - | By creating a factory, you're essentially telling spaCy how to get the - | pipeline component #[strong once the vocab is available]. 
Factories need to - | be registered via #[+api("spacy#set_factory") #[code set_factory()]] and - | by assigning them a unique ID. This ID can be added to the pipeline as a - | string. When creating a pipeline, you're free to mix strings and - | callable components: - -+code. - spacy.set_factory('my_factory', my_factory) - nlp = Language(pipeline=['my_factory', my_other_component]) - -p - | If spaCy comes across a string in the pipeline, it will try to resolve it - | by looking it up in the available factories. The factory will then be - | initialised with the #[code Vocab]. Providing factory names instead of - | callables also makes it easy to specify them in the model's - | #[+a("/usage/saving-loading#models-generating") meta.json]. If you're - | training your own model and want to use one of spaCy's default components, - | you won't have to worry about finding and implementing it either – to use - | the default tagger, simply add #[code "tagger"] to the pipeline, and - | #[strong spaCy will know what to do]. - -+infobox("Important note") - | Because factories are #[strong resolved on initialisation] of the - | #[code Language] class, it's #[strong not possible] to add them to the - | pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only - | works with individual component functions. To use factories, you need to - | create a new #[code Language] object, or generate a - | #[+a("/usage/training#models-generating") model package] with - | a custom pipeline. - -+h(3, "disabling") Disabling pipeline components ++h(3, "disabling") Disabling and modifying pipeline components p | If you don't need a particular component of the pipeline – for @@ -217,16 +121,19 @@ p +code. nlp = spacy.load('en', disable['parser', 'tagger']) nlp = English().from_disk('/model', disable=['tensorizer', 'ner']) - doc = nlp(u"I don't want parsed", disable=['parser']) p - | Note that you can't write directly to #[code nlp.pipeline], as this list - | holds the #[em actual components], not the IDs. However, if you know the - | order of the components, you can still slice the list: + | You can also use the #[+api("language#remove_pipe") #[code remove_pipe]] + | method to remove pipeline components from an existing pipeline, the + | #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them, + | or the #[+api("language#replace_pipe") #[code replace_pipe]] method + | to replace them with a custom component entirely (more details on this + | in the section on #[+a("#custom-components") custom components]. +code. - nlp = spacy.load('en') - nlp.pipeline = nlp.pipeline[:2] # only use the first two components + nlp.remove_pipe('parser') + nlp.rename_pipe('ner', 'entityrecognizer') + nlp.replace_pipe('tagger', my_custom_tagger) +infobox("Important note: disabling pipeline components") .o-block @@ -234,12 +141,14 @@ p | processing pipeline components, the #[code parser], #[code tagger] | and #[code entity] keyword arguments have been replaced with | #[code disable], which takes a list of pipeline component names. - | This lets you disable both default and custom components when loading + | This lets you disable pre-defined components when loading | a model, or initialising a Language class via | #[+api("language-from_disk") #[code from_disk]]. + +code-new. - nlp = spacy.load('en', disable=['tagger', 'ner']) - doc = nlp(u"I don't want parsed", disable=['parser']) + nlp = spacy.load('en', disable=['ner']) + nlp.remove_pipe('parser') + doc = nlp(u"I don't want parsed") +code-old. 
nlp = spacy.load('en', tagger=False, entity=False) doc = nlp(u"I don't want parsed", parse=False) From 743d1df1fe6474b6342ff7cfe73a988d89e679c6 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 15:27:28 +0200 Subject: [PATCH 14/19] Update pipelines docs and add user hooks to custom components --- website/usage/_data.json | 7 +- .../_custom-components.jade | 151 ++++++++++++++++++ .../_processing-pipelines/_user-hooks.jade | 61 ------- website/usage/processing-pipelines.jade | 10 +- 4 files changed, 157 insertions(+), 72 deletions(-) create mode 100644 website/usage/_processing-pipelines/_custom-components.jade delete mode 100644 website/usage/_processing-pipelines/_user-hooks.jade diff --git a/website/usage/_data.json b/website/usage/_data.json index b34304ed6..f77f7929c 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -103,11 +103,10 @@ "title": "Language Processing Pipelines", "next": "vectors-similarity", "menu": { - "How pipelines work": "pipelines", - "Examples": "examples", + "How Pipelines Work": "pipelines", + "Custom Components": "custom-components", "Multi-threading": "multithreading", - "User Hooks": "user-hooks", - "Serialization": "serialization" + "Serialization": "serialization", } }, diff --git a/website/usage/_processing-pipelines/_custom-components.jade b/website/usage/_processing-pipelines/_custom-components.jade new file mode 100644 index 000000000..13f0cb85c --- /dev/null +++ b/website/usage/_processing-pipelines/_custom-components.jade @@ -0,0 +1,151 @@ +//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS + +p + | A component receives a #[code Doc] object and + | #[strong performs the actual processing] – for example, using the current + | weights to make a prediction and set some annotation on the document. By + | adding a component to the pipeline, you'll get access to the #[code Doc] + | at any point #[strong during] processing – instead of only being able to + | modify it afterwards. + ++aside-code("Example"). + def my_component(doc): + # do something to the doc here + return doc + ++table(["Argument", "Type", "Description"]) + +row + +cell #[code doc] + +cell #[code Doc] + +cell The #[code Doc] object processed by the previous component. + + +row("foot") + +cell returns + +cell #[code Doc] + +cell The #[code Doc] object processed by this pipeline component. + +p + | Custom components can be added to the pipeline using the + | #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you + | can either specify a component to add it before or after, tell spaCy + | to add it first or last in the pipeline, or define a custom name. + | If no name is set and no #[code name] attribute is present on your + | component, the function name, e.g. #[code component.__name__] is used. + ++code("Adding pipeline components"). + def my_component(doc): + print("After tokenization, this doc has %s tokens." % len(doc)) + if len(doc) < 10: + print("This is a pretty short document.") + return doc + + nlp = spacy.load('en') + nlp.pipeline.add_pipe(my_component, name='print_info', first=True) + print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner'] + doc = nlp(u"This is a sentence.") + +p + | Of course, you can also wrap your component as a class to allow + | initialising it with custom settings and hold state within the component. + | This is useful for #[strong stateful components], especially ones which + | #[strong depend on shared data]. + ++code. 
+ class MyComponent(object): + name = 'print_info' + + def __init__(vocab, short_limit=10): + self.vocab = nlp.vocab + self.short_limit = short_limit + + def __call__(doc): + if len(doc) < self.short_limit: + print("This is a pretty short document.") + return doc + + my_component = MyComponent(nlp.vocab, short_limit=25) + nlp.add_pipe(my_component, first=True) + ++h(3, "custom-components-attributes") + | Setting attributes on the #[code Doc], #[code Span] and #[code Token] + ++aside("Why ._?") + | Writing to a #[code ._] attribute instead of to the #[code Doc] directly + | keeps a clearer separation and makes it easier to ensure backwards + | compatibility. For example, if you've implemented your own #[code .coref] + | property and spaCy claims it one day, it'll break your code. Similarly, + | just by looking at the code, you'll immediately know what's built-in and + | what's custom – for example, #[code doc.sentiment] is spaCy, while + | #[code doc._.sent_score] isn't. + ++under-construction + ++h(3, "custom-components-user-hooks") Other user hooks + +p + | While it's generally recommended to use the #[code Doc._], #[code Span._] + | and #[code Token._] proxies to add your own custom attributes, spaCy + | offers a few exceptions to allow #[strong customising the built-in methods] + | like #[+api("doc#similarity") #[code Doc.similarity]] or + | #[+api("doc#vector") #[code Doc.vector]]. with your own hooks, which can + | rely on statistical models you train yourself. For instance, you can + | provide your own on-the-fly sentence segmentation algorithm or document + | similarity method. + +p + | Hooks let you customize some of the behaviours of the #[code Doc], + | #[code Span] or #[code Token] objects by adding a component to the + | pipeline. For instance, to customize the + | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a + | component that sets a custom function to + | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] + | method will check the #[code user_hooks] dict, and delegate to your + | function if you've set one. Similar results can be achieved by setting + | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. + ++aside("Implementation note") + | The hooks live on the #[code Doc] object because the #[code Span] and + | #[code Token] objects are created lazily, and don't own any data. They + | just proxy to their parent #[code Doc]. This turns out to be convenient + | here — we only have to worry about installing hooks in one place. + ++table(["Name", "Customises"]) + +row + +cell #[code user_hooks] + +cell + +api("doc#vector") #[code Doc.vector] + +api("doc#has_vector") #[code Doc.has_vector] + +api("doc#vector_norm") #[code Doc.vector_norm] + +api("doc#sents") #[code Doc.sents] + + +row + +cell #[code user_token_hooks] + +cell + +api("token#similarity") #[code Token.similarity] + +api("token#vector") #[code Token.vector] + +api("token#has_vector") #[code Token.has_vector] + +api("token#vector_norm") #[code Token.vector_norm] + +api("token#conjuncts") #[code Token.conjuncts] + + +row + +cell #[code user_span_hooks] + +cell + +api("span#similarity") #[code Span.similarity] + +api("span#vector") #[code Span.vector] + +api("span#has_vector") #[code Span.has_vector] + +api("span#vector_norm") #[code Span.vector_norm] + +api("span#root") #[code Span.root] + ++code("Add custom similarity hooks"). 
+ class SimilarityModel(object): + def __init__(self, model): + self._model = model + + def __call__(self, doc): + doc.user_hooks['similarity'] = self.similarity + doc.user_span_hooks['similarity'] = self.similarity + doc.user_token_hooks['similarity'] = self.similarity + + def similarity(self, obj1, obj2): + y = self._model([obj1.vector, obj2.vector]) + return float(y[0]) diff --git a/website/usage/_processing-pipelines/_user-hooks.jade b/website/usage/_processing-pipelines/_user-hooks.jade deleted file mode 100644 index e7dce53fe..000000000 --- a/website/usage/_processing-pipelines/_user-hooks.jade +++ /dev/null @@ -1,61 +0,0 @@ -//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS - -p - | Hooks let you customize some of the behaviours of the #[code Doc], - | #[code Span] or #[code Token] objects by adding a component to the - | pipeline. For instance, to customize the - | #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a - | component that sets a custom function to - | #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity] - | method will check the #[code user_hooks] dict, and delegate to your - | function if you've set one. Similar results can be achieved by setting - | functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks]. - -+code("Polymorphic similarity example"). - span.similarity(doc) - token.similarity(span) - doc1.similarity(doc2) - -p - | By default, this just averages the vectors for each document, and - | computes their cosine. Obviously, spaCy should make it easy for you to - | install your own similarity model. This introduces a tricky design - | challenge. The current solution is to add three more dicts to the - | #[code Doc] object: - -+aside("Implementation note") - | The hooks live on the #[code Doc] object because the #[code Span] and - | #[code Token] objects are created lazily, and don't own any data. They - | just proxy to their parent #[code Doc]. This turns out to be convenient - | here — we only have to worry about installing hooks in one place. - -+table(["Name", "Description"]) - +row - +cell #[code user_hooks] - +cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents] - - +row - +cell #[code user_token_hooks] - +cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts] - - +row - +cell #[code user_span_hooks] - +cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root] - -p - | To sum up, here's an example of hooking in custom #[code .similarity()] - | methods: - -+code("Add custom similarity hooks"). 
- class SimilarityModel(object): - def __init__(self, model): - self._model = model - - def __call__(self, doc): - doc.user_hooks['similarity'] = self.similarity - doc.user_span_hooks['similarity'] = self.similarity - doc.user_token_hooks['similarity'] = self.similarity - - def similarity(self, obj1, obj2): - y = self._model([obj1.vector, obj2.vector]) - return float(y[0]) diff --git a/website/usage/processing-pipelines.jade b/website/usage/processing-pipelines.jade index 0bb96780e..0d0579883 100644 --- a/website/usage/processing-pipelines.jade +++ b/website/usage/processing-pipelines.jade @@ -8,18 +8,14 @@ include _spacy-101/_pipelines +h(2, "pipelines") How pipelines work include _processing-pipelines/_pipelines -+section("examples") - +h(2, "examples") Examples - include _processing-pipelines/_examples ++section("custom-components") + +h(2, "custom-components") Creating custom pipeline components + include _processing-pipelines/_custom-components +section("multithreading") +h(2, "multithreading") Multi-threading include _processing-pipelines/_multithreading -+section("user-hooks") - +h(2, "user-hooks") User hooks - include _processing-pipelines/_user-hooks - +section("serialization") +h(2, "serialization") Serialization include _processing-pipelines/_serialization From ca6769fd4855e55365b70c3b6cbd32387aec6548 Mon Sep 17 00:00:00 2001 From: ines Date: Sat, 7 Oct 2017 15:28:01 +0200 Subject: [PATCH 15/19] Update spacy functions and remove removed set_factory --- website/api/_top-level/_spacy.jade | 38 ++---------------------------- 1 file changed, 2 insertions(+), 36 deletions(-) diff --git a/website/api/_top-level/_spacy.jade b/website/api/_top-level/_spacy.jade index 2b523f846..81ec744ad 100644 --- a/website/api/_top-level/_spacy.jade +++ b/website/api/_top-level/_spacy.jade @@ -50,8 +50,8 @@ p | returns it. +code("Abstract example"). - cls = util.get_lang_class(lang) # get Language class for ID, e.g. 'en' - nlp = cls() # initialise the Language class + cls = util.get_lang_class(lang) # get language for ID, e.g. 'en' + nlp = cls() # initialise the language for name in pipeline: component = nlp.create_pipe(name) # create each pipeline component nlp.add_pipe(component) # add component to pipeline @@ -155,37 +155,3 @@ p +cell returns +cell unicode +cell The explanation, or #[code None] if not found in the glossary. - -+h(3, "spacy.set_factory") spacy.set_factory - +tag function - +tag-new(2) - -p - | Set a factory that returns a custom - | #[+a("/usage/processing-pipelines") processing pipeline] - | component. Factories are useful for creating stateful components, especially ones which depend on shared data. - -+aside-code("Example"). - def my_factory(vocab): - def my_component(doc): - return doc - return my_component - - spacy.set_factory('my_factory', my_factory) - nlp = Language(pipeline=['my_factory']) - -+table(["Name", "Type", "Description"]) - +row - +cell #[code factory_id] - +cell unicode - +cell - | Unique name of factory. If added to a new pipeline, spaCy will - | look up the factory for this ID and use it to create the - | component. - - +row - +cell #[code factory] - +cell callable - +cell - | Callable that takes a #[code Vocab] object and returns a pipeline - | component. 
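
To complement the abstract create_pipe / add_pipe example updated in the patch above, here is a minimal, self-contained sketch of the same flow with a stateful component, the case that spacy.set_factory used to cover. It assumes spaCy 2.x with an installed 'en' model; the LengthLogger class, its threshold setting and the 'length_logger' name are illustrative placeholders, not part of spaCy.

    import spacy

    class LengthLogger(object):
        # Toy stateful component: warns about very short documents.
        name = 'length_logger'

        def __init__(self, threshold=10):
            self.threshold = threshold

        def __call__(self, doc):
            if len(doc) < self.threshold:
                print("Short doc: only %d tokens" % len(doc))
            return doc  # always hand the Doc on to the next component

    nlp = spacy.load('en')
    nlp.add_pipe(LengthLogger(threshold=5), first=True)  # plain class, no set_factory needed
    print(nlp.pipe_names)       # e.g. ['length_logger', 'tagger', 'parser', 'ner']
    doc = nlp(u'Hello world.')  # triggers the warning from LengthLogger

    textcat = nlp.create_pipe('textcat')  # built-in factory, looked up by name
    nlp.add_pipe(textcat, last=True)      # would still need training before use

Built-in, weight-bearing components keep using factories, while custom logic can now be any callable added directly to the pipeline, without a separate registration step.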
From 2ac8b5c6223483af59d279277769a1c7b055ee7e Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 9 Oct 2017 14:36:20 +0200 Subject: [PATCH 16/19] Add wrapper for before/after code examples --- website/_includes/_mixins.jade | 4 ++++ website/assets/css/_base/_utilities.sass | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/website/_includes/_mixins.jade b/website/_includes/_mixins.jade index 4876c6b6b..68db1be57 100644 --- a/website/_includes/_mixins.jade +++ b/website/_includes/_mixins.jade @@ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap) //- Code blocks to display old/new versions +mixin code-compare() + span.u-inline-block.u-padding-top.u-width-full + block + mixin code-old() +code(false, false, false, false, "reject").o-block-small block diff --git a/website/assets/css/_base/_utilities.sass b/website/assets/css/_base/_utilities.sass index e2ba552b7..91a6251e6 100644 --- a/website/assets/css/_base/_utilities.sass +++ b/website/assets/css/_base/_utilities.sass @@ -143,6 +143,9 @@ //- Layout +.u-width-full + width: 100% + .u-float-left float: left margin-right: 1rem @@ -166,6 +169,9 @@ .u-padding-medium padding: 1.8rem +.u-padding-top + padding-top: 2rem + .u-inline-block display: inline-block From 4d248ea920958943979850dc9e605cd172b7ee3a Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 9 Oct 2017 14:36:30 +0200 Subject: [PATCH 17/19] Fix spacing on bulleted lists --- website/assets/css/_components/_lists.sass | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/assets/css/_components/_lists.sass b/website/assets/css/_components/_lists.sass index 2a933c95e..553af6578 100644 --- a/website/assets/css/_components/_lists.sass +++ b/website/assets/css/_components/_lists.sass @@ -25,7 +25,7 @@ display: inline-block font-size: 0.6em font-weight: bold - padding-right: 1.25rem + padding-right: 1em margin-left: -3.75rem text-align: right width: 2.5rem From 6550d0547c03002e1d46cf2cf1aa396835bc7cde Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 9 Oct 2017 14:36:36 +0200 Subject: [PATCH 18/19] Fix typo --- website/usage/_processing-pipelines/_serialization.jade | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/usage/_processing-pipelines/_serialization.jade b/website/usage/_processing-pipelines/_serialization.jade index e29cbc558..111a5fbad 100644 --- a/website/usage/_processing-pipelines/_serialization.jade +++ b/website/usage/_processing-pipelines/_serialization.jade @@ -21,7 +21,7 @@ p +code. 
import spacy - from spacy.tokens import Span + from spacy.tokens.span import Span text = u'Netflix is hiring a new VP of global policy' From 6c253db3fe879f229adce49f6b2541b8d5b97913 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 9 Oct 2017 14:36:56 +0200 Subject: [PATCH 19/19] Add section for developing spaCy extensions --- website/usage/_data.json | 1 + website/usage/_processing-pipelines/_extensions.jade | 3 +++ website/usage/processing-pipelines.jade | 4 ++++ 3 files changed, 8 insertions(+) create mode 100644 website/usage/_processing-pipelines/_extensions.jade diff --git a/website/usage/_data.json b/website/usage/_data.json index f77f7929c..25165c3ee 100644 --- a/website/usage/_data.json +++ b/website/usage/_data.json @@ -107,6 +107,7 @@ "Custom Components": "custom-components", "Multi-threading": "multithreading", "Serialization": "serialization", + "Developing Extensions": "extensions" } }, diff --git a/website/usage/_processing-pipelines/_extensions.jade b/website/usage/_processing-pipelines/_extensions.jade new file mode 100644 index 000000000..d512e0321 --- /dev/null +++ b/website/usage/_processing-pipelines/_extensions.jade @@ -0,0 +1,3 @@ +//- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS + ++under-construction diff --git a/website/usage/processing-pipelines.jade b/website/usage/processing-pipelines.jade index 0d0579883..346e0554d 100644 --- a/website/usage/processing-pipelines.jade +++ b/website/usage/processing-pipelines.jade @@ -19,3 +19,7 @@ include _spacy-101/_pipelines +section("serialization") +h(2, "serialization") Serialization include _processing-pipelines/_serialization + ++section("extensions") + +h(2, "extensions") Developing spaCy extensions + include _processing-pipelines/_extensions
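
The extensions section added above is still a placeholder, so to round off the user-hooks documentation from this series, here is a hedged, self-contained sketch of the hook mechanism in use. It assumes spaCy 2.x with an installed 'en' model; the SimilarityHooks class and its cosine stand-in are illustrative only, not spaCy's built-in similarity implementation.

    import numpy
    import spacy

    class SimilarityHooks(object):
        # Installs the same custom function for Doc, Span and Token similarity.
        name = 'custom_similarity'

        def __call__(self, doc):
            doc.user_hooks['similarity'] = self.similarity
            doc.user_span_hooks['similarity'] = self.similarity
            doc.user_token_hooks['similarity'] = self.similarity
            return doc  # hand the Doc on to the next component

        def similarity(self, obj1, obj2):
            # Cosine over .vector, standing in for a trained similarity model.
            v1, v2 = obj1.vector, obj2.vector
            norm = numpy.linalg.norm(v1) * numpy.linalg.norm(v2)
            return float(v1.dot(v2) / norm) if norm else 0.0

    nlp = spacy.load('en')
    nlp.add_pipe(SimilarityHooks(), last=True)
    doc1 = nlp(u'the fries were gross')
    doc2 = nlp(u'worst fries ever')
    print(doc1.similarity(doc2))          # delegates to the installed hook
    print(doc1[0].similarity(doc2[1:3]))  # Token and Span comparisons use it too

Because the hook lives on the Doc, the override applies polymorphically: doc.similarity(span), token.similarity(doc) and any other combination route through the same function.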