Merge branch 'feature/pipeline-management' into feature/dot-underscore

This commit is contained in:
ines 2017-10-09 14:37:51 +02:00
commit de374dc72a
20 changed files with 725 additions and 404 deletions

View File

@ -1,12 +1,9 @@
# coding: utf8
from __future__ import absolute_import, unicode_literals
from contextlib import contextmanager
import dill
import numpy
from thinc.neural import Model
from thinc.neural.ops import NumpyOps, CupyOps
from thinc.neural.optimizers import Adam, SGD
from thinc.neural.optimizers import Adam
import random
import ujson
from collections import OrderedDict
@ -17,24 +14,20 @@ from .vocab import Vocab
from .tagger import Tagger
from .lemmatizer import Lemmatizer
from .syntax.parser import get_templates
from .syntax import nonproj
from .pipeline import NeuralDependencyParser, EntityRecognizer
from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
from .pipeline import NeuralLabeller
from .pipeline import SimilarityHook
from .pipeline import TextCategorizer
from . import about
from .pipeline import NeuralDependencyParser, TokenVectorEncoder, NeuralTagger
from .pipeline import NeuralEntityRecognizer, SimilarityHook, TextCategorizer
from .compat import json_dumps, izip
from .scorer import Scorer
from ._ml import link_vectors_to_models
from .attrs import IS_STOP
from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS
from . import util
from .scorer import Scorer
from ._ml import link_vectors_to_models
from . import about
class BaseDefaults(object):
@ -70,59 +63,7 @@ class BaseDefaults(object):
prefix_search=prefix_search, suffix_search=suffix_search,
infix_finditer=infix_finditer, token_match=token_match)
@classmethod
def create_tagger(cls, nlp=None, **cfg):
if nlp is None:
return NeuralTagger(cls.create_vocab(nlp), **cfg)
else:
return NeuralTagger(nlp.vocab, **cfg)
@classmethod
def create_parser(cls, nlp=None, **cfg):
if nlp is None:
return NeuralDependencyParser(cls.create_vocab(nlp), **cfg)
else:
return NeuralDependencyParser(nlp.vocab, **cfg)
@classmethod
def create_entity(cls, nlp=None, **cfg):
if nlp is None:
return NeuralEntityRecognizer(cls.create_vocab(nlp), **cfg)
else:
return NeuralEntityRecognizer(nlp.vocab, **cfg)
@classmethod
def create_pipeline(cls, nlp=None, disable=tuple()):
meta = nlp.meta if nlp is not None else {}
# Resolve strings, like "cnn", "lstm", etc
pipeline = []
for entry in meta.get('pipeline', []):
if entry in disable or getattr(entry, 'name', entry) in disable:
continue
factory = cls.Defaults.factories[entry]
pipeline.append(factory(nlp, **meta.get(entry, {})))
return pipeline
factories = {
'make_doc': create_tokenizer,
'tensorizer': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tagger': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'parser': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize],
'ner': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
'similarity': lambda nlp, **cfg: [SimilarityHook(nlp.vocab, **cfg)],
'textcat': lambda nlp, **cfg: [TextCategorizer(nlp.vocab, **cfg)],
# Temporary compatibility -- delete after pivot
'token_vectors': lambda nlp, **cfg: [TokenVectorEncoder(nlp.vocab, **cfg)],
'tags': lambda nlp, **cfg: [NeuralTagger(nlp.vocab, **cfg)],
'dependencies': lambda nlp, **cfg: [
NeuralDependencyParser(nlp.vocab, **cfg),
nonproj.deprojectivize,
],
'entities': lambda nlp, **cfg: [NeuralEntityRecognizer(nlp.vocab, **cfg)],
}
pipe_names = ['tensorizer', 'tagger', 'parser', 'ner']
token_match = TOKEN_MATCH
prefixes = tuple(TOKENIZER_PREFIXES)
suffixes = tuple(TOKENIZER_SUFFIXES)
@ -152,8 +93,17 @@ class Language(object):
Defaults = BaseDefaults
lang = None
def __init__(self, vocab=True, make_doc=True, pipeline=None,
meta={}, disable=tuple(), **kwargs):
factories = {
'tokenizer': lambda nlp: nlp.Defaults.create_tokenizer(nlp),
'tensorizer': lambda nlp, **cfg: TokenVectorEncoder(nlp.vocab, **cfg),
'tagger': lambda nlp, **cfg: NeuralTagger(nlp.vocab, **cfg),
'parser': lambda nlp, **cfg: NeuralDependencyParser(nlp.vocab, **cfg),
'ner': lambda nlp, **cfg: NeuralEntityRecognizer(nlp.vocab, **cfg),
'similarity': lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
'textcat': lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg)
}
def __init__(self, vocab=True, make_doc=True, meta={}, **kwargs):
"""Initialise a Language object.
vocab (Vocab): A `Vocab` object. If `True`, a vocab is created via
@ -179,28 +129,7 @@ class Language(object):
factory = self.Defaults.create_tokenizer
make_doc = factory(self, **meta.get('tokenizer', {}))
self.tokenizer = make_doc
if pipeline is True:
self.pipeline = self.Defaults.create_pipeline(self, disable)
elif pipeline:
# Careful not to do getattr(p, 'name', None) here
# If we had disable=[None], we'd disable everything!
self.pipeline = [p for p in pipeline
if p not in disable
and getattr(p, 'name', p) not in disable]
# Resolve strings, like "cnn", "lstm", etc
for i, entry in enumerate(self.pipeline):
if entry in self.Defaults.factories:
factory = self.Defaults.factories[entry]
self.pipeline[i] = factory(self, **meta.get(entry, {}))
else:
self.pipeline = []
flat_list = []
for pipe in self.pipeline:
if isinstance(pipe, list):
flat_list.extend(pipe)
else:
flat_list.append(pipe)
self.pipeline = flat_list
self.pipeline = []
self._optimizer = None
@property
@ -214,11 +143,7 @@ class Language(object):
self._meta.setdefault('email', '')
self._meta.setdefault('url', '')
self._meta.setdefault('license', '')
pipeline = []
for component in self.pipeline:
if hasattr(component, 'name'):
pipeline.append(component.name)
self._meta['pipeline'] = pipeline
self._meta['pipeline'] = self.pipe_names
return self._meta
@meta.setter
@ -228,34 +153,137 @@ class Language(object):
# Conveniences to access pipeline components
@property
def tensorizer(self):
return self.get_component('tensorizer')
return self.get_pipe('tensorizer')
@property
def tagger(self):
return self.get_component('tagger')
return self.get_pipe('tagger')
@property
def parser(self):
return self.get_component('parser')
return self.get_pipe('parser')
@property
def entity(self):
return self.get_component('ner')
return self.get_pipe('ner')
@property
def matcher(self):
return self.get_component('matcher')
return self.get_pipe('matcher')
def get_component(self, name):
if self.pipeline in (True, None):
return None
for proc in self.pipeline:
if hasattr(proc, 'name') and proc.name.endswith(name):
return proc
return None
@property
def pipe_names(self):
    """Names of the components currently registered in the pipeline.

    RETURNS (list): Component name strings, in pipeline order.
    """
    names = []
    # Each pipeline entry is a (name, component) pair; keep only the name.
    for label, _ in self.pipeline:
        names.append(label)
    return names
def get_pipe(self, name):
    """Look up a pipeline component by its name.

    name (unicode): Name of the pipeline component to get.
    RETURNS (callable): The first component registered under that name.
    """
    for label, proc in self.pipeline:
        if label != name:
            continue
        return proc
    # Nothing matched: report the names that are actually available.
    template = "No component '{}' found in pipeline. Available names: {}"
    raise KeyError(template.format(name, self.pipe_names))
def create_pipe(self, name, config=None):
    """Create a pipeline component from a factory.

    name (unicode): Factory name to look up in `Language.factories`.
    config (dict): Configuration parameters to initialise component.
        Defaults to no parameters when omitted or `None`.
    RETURNS (callable): Pipeline component.
    """
    # Use a None sentinel rather than a shared mutable default dict.
    if config is None:
        config = {}
    try:
        factory = self.factories[name]
    except KeyError:
        raise KeyError("Can't find factory for '{}'.".format(name))
    # The factory receives this Language object plus the config as kwargs.
    return factory(self, **config)
def add_pipe(self, component, name=None, before=None, after=None,
             first=None, last=None):
    """Add a component to the processing pipeline. Valid components are
    callables that take a `Doc` object, modify it and return it. Only one of
    before, after, first or last can be set. Default behaviour is "last".

    component (callable): The pipeline component.
    name (unicode): Name of pipeline component. Overwrites existing
        component.name attribute if available. If no name is set and
        the component exposes no name attribute, component.__name__ is
        used, falling back to the name of the component's class. An error
        is raised if the name already exists in the pipeline.
    before (unicode): Component name to insert component directly before.
    after (unicode): Component name to insert component directly after.
    first (bool): Insert component first / not first in the pipeline.
    last (bool): Insert component last / not last in the pipeline.
    RAISES (ValueError): If the name is already taken, if more than one
        position constraint is set, or if the `before` / `after` anchor
        is not in the pipeline.

    EXAMPLE:
        >>> nlp.add_pipe(component, before='ner')
        >>> nlp.add_pipe(component, name='custom_name', last=True)
    """
    if name is None:
        # Resolve the fallbacks lazily: callable instances usually expose
        # `name` but have no `__name__`, so `__name__` must not be read
        # unless it is actually needed.
        if hasattr(component, 'name'):
            name = component.name
        elif hasattr(component, '__name__'):
            name = component.__name__
        else:
            name = component.__class__.__name__
    names = self.pipe_names
    if name in names:
        raise ValueError("'{}' already exists in pipeline.".format(name))
    if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
        msg = ("Invalid constraints. You can only set one of the "
               "following: before, after, first, last.")
        raise ValueError(msg)
    pipe = (name, component)
    if last or not any([first, before, after]):
        self.pipeline.append(pipe)
    elif first:
        self.pipeline.insert(0, pipe)
    elif before and before in names:
        self.pipeline.insert(names.index(before), pipe)
    elif after and after in names:
        # Insert one past the anchor so the component lands directly
        # after it rather than in front of it.
        self.pipeline.insert(names.index(after) + 1, pipe)
    else:
        msg = "Can't find '{}' in pipeline. Available names: {}"
        unfound = before or after
        raise ValueError(msg.format(unfound, names))
def replace_pipe(self, name, component):
    """Swap the component registered under an existing name.

    name (unicode): Name of the component to replace.
    component (callable): Pipeline component that takes its place.
    """
    names = self.pipe_names
    if name in names:
        # Keep the slot and the name, exchange only the callable.
        self.pipeline[names.index(name)] = (name, component)
        return
    template = "Can't find '{}' in pipeline. Available names: {}"
    raise ValueError(template.format(name, names))
def rename_pipe(self, old_name, new_name):
    """Give an existing pipeline component a new name.

    old_name (unicode): Name of the component to rename.
    new_name (unicode): New name of the component.
    """
    names = self.pipe_names
    if old_name not in names:
        template = "Can't find '{}' in pipeline. Available names: {}"
        raise ValueError(template.format(old_name, names))
    if new_name in names:
        template = "'{}' already exists in pipeline. Existing names: {}"
        raise ValueError(template.format(new_name, names))
    position = names.index(old_name)
    component = self.pipeline[position][1]
    # Rebuild the entry in place so the component keeps its position.
    self.pipeline[position] = (new_name, component)
def remove_pipe(self, name):
    """Take a component out of the pipeline.

    name (unicode): Name of the component to remove.
    RETURNS (tuple): The removed `(name, component)` entry.
    """
    names = self.pipe_names
    if name in names:
        return self.pipeline.pop(names.index(name))
    template = "Can't find '{}' in pipeline. Available names: {}"
    raise ValueError(template.format(name, names))
def __call__(self, text, disable=[]):
"""'Apply the pipeline to some text. The text can span multiple sentences,
"""Apply the pipeline to some text. The text can span multiple sentences,
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
@ -269,8 +297,7 @@ class Language(object):
('An', 'NN')
"""
doc = self.make_doc(text)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
for name, proc in self.pipeline:
if name in disable:
continue
doc = proc(doc)
@ -308,7 +335,7 @@ class Language(object):
grads[key] = (W, dW)
pipes = list(self.pipeline)
random.shuffle(pipes)
for proc in pipes:
for name, proc in pipes:
if not hasattr(proc, 'update'):
continue
proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses)
@ -322,7 +349,7 @@ class Language(object):
docs_golds (iterable): Tuples of `Doc` and `GoldParse` objects.
YIELDS (tuple): Tuples of preprocessed `Doc` and `GoldParse` objects.
"""
for proc in self.pipeline:
for name, proc in self.pipeline:
if hasattr(proc, 'preprocess_gold'):
docs_golds = proc.preprocess_gold(docs_golds)
for doc, gold in docs_golds:
@ -354,7 +381,7 @@ class Language(object):
get_gold_tuples (function): Function returning gold data
**cfg: Config parameters.
returns: An optimizer
RETURNS: An optimizer
"""
# Populate vocab
if get_gold_tuples is not None:
@ -371,7 +398,7 @@ class Language(object):
else:
device = None
link_vectors_to_models(self.vocab)
for proc in self.pipeline:
for name, proc in self.pipeline:
if hasattr(proc, 'begin_training'):
context = proc.begin_training(get_gold_tuples(),
pipeline=self.pipeline)
@ -393,7 +420,7 @@ class Language(object):
docs, golds = zip(*docs_golds)
docs = list(docs)
golds = list(golds)
for pipe in self.pipeline:
for name, pipe in self.pipeline:
if not hasattr(pipe, 'pipe'):
for doc in docs:
pipe(doc)
@ -419,7 +446,7 @@ class Language(object):
>>> with nlp.use_params(optimizer.averages):
>>> nlp.to_disk('/tmp/checkpoint')
"""
contexts = [pipe.use_params(params) for pipe
contexts = [pipe.use_params(params) for name, pipe
in self.pipeline if hasattr(pipe, 'use_params')]
# TODO: Having trouble with contextlib
# Workaround: these aren't actually context managers atm.
@ -466,8 +493,7 @@ class Language(object):
yield (doc, context)
return
docs = (self.make_doc(text) for text in texts)
for proc in self.pipeline:
name = getattr(proc, 'name', None)
for name, proc in self.pipeline:
if name in disable:
continue
if hasattr(proc, 'pipe'):
@ -495,14 +521,14 @@ class Language(object):
('tokenizer', lambda p: self.tokenizer.to_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
))
for proc in self.pipeline:
for name, proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
if name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
serializers[proc.name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
serializers[name] = lambda p, proc=proc: proc.to_disk(p, vocab=False)
serializers['vocab'] = lambda p: self.vocab.to_disk(p)
util.to_disk(path, serializers, {p: False for p in disable})
@ -526,14 +552,12 @@ class Language(object):
('tokenizer', lambda p: self.tokenizer.from_disk(p, vocab=False)),
('meta.json', lambda p: p.open('w').write(json_dumps(self.meta)))
))
for proc in self.pipeline:
if not hasattr(proc, 'name'):
continue
if proc.name in disable:
for name, proc in self.pipeline:
if name in disable:
continue
if not hasattr(proc, 'to_disk'):
continue
deserializers[proc.name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
deserializers[name] = lambda p, proc=proc: proc.from_disk(p, vocab=False)
exclude = {p: False for p in disable}
if not (path / 'vocab').exists():
exclude['vocab'] = True
@ -552,8 +576,8 @@ class Language(object):
('tokenizer', lambda: self.tokenizer.to_bytes(vocab=False)),
('meta', lambda: ujson.dumps(self.meta))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
continue
if not hasattr(proc, 'to_bytes'):
continue
@ -572,8 +596,8 @@ class Language(object):
('tokenizer', lambda b: self.tokenizer.from_bytes(b, vocab=False)),
('meta', lambda b: self.meta.update(ujson.loads(b)))
))
for i, proc in enumerate(self.pipeline):
if getattr(proc, 'name', None) in disable:
for i, (name, proc) in enumerate(self.pipeline):
if name in disable:
continue
if not hasattr(proc, 'from_bytes'):
continue

View File

@ -28,6 +28,7 @@ from thinc.neural._classes.difference import Siamese, CauchySimilarity
from .tokens.doc cimport Doc
from .syntax.parser cimport Parser as LinearParser
from .syntax.nn_parser cimport Parser as NeuralParser
from .syntax import nonproj
from .syntax.parser import get_templates as get_feature_templates
from .syntax.beam_parser cimport BeamParser
from .syntax.ner cimport BiluoPushDown
@ -773,11 +774,19 @@ cdef class DependencyParser(LinearParser):
if isinstance(label, basestring):
label = self.vocab.strings[label]
@property
def postprocesses(self):
return [nonproj.deprojectivize]
cdef class NeuralDependencyParser(NeuralParser):
name = 'parser'
TransitionSystem = ArcEager
@property
def postprocesses(self):
return [nonproj.deprojectivize]
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
for target in []:
labeller = NeuralLabeller(self.vocab, target=target)
@ -818,6 +827,11 @@ cdef class BeamDependencyParser(BeamParser):
if isinstance(label, basestring):
label = self.vocab.strings[label]
@property
def postprocesses(self):
return [nonproj.deprojectivize]
__all__ = ['Tagger', 'DependencyParser', 'EntityRecognizer', 'BeamDependencyParser',
'BeamEntityRecognizer', 'TokenVectorEnoder']

View File

@ -779,6 +779,14 @@ cdef class Parser:
for i in range(doc.length):
doc.c[i] = state.c._sent[i]
self.moves.finalize_doc(doc)
for hook in self.postprocesses:
for doc in docs:
hook(doc)
@property
def postprocesses(self):
# Available for subclasses, e.g. to deprojectivize
return []
def add_label(self, label):
for action in self.moves.action_types:

View File

@ -58,8 +58,9 @@ def en_vocab():
@pytest.fixture
def en_parser():
return util.get_lang_class('en').Defaults.create_parser()
def en_parser(en_vocab):
nlp = util.get_lang_class('en')(en_vocab)
return nlp.create_pipe('parser')
@pytest.fixture

View File

@ -1,10 +1,11 @@
import spacy
# coding: utf8
from __future__ import unicode_literals
import pytest
@pytest.mark.models
def test_beam_parse():
nlp = spacy.load('en_core_web_sm')
doc = nlp(u'Australia is a country', disable=['ner'])
ents = nlp.entity(doc, beam_width=2)
print(ents)
@pytest.mark.models('en')
def test_beam_parse(EN):
doc = EN(u'Australia is a country', disable=['ner'])
ents = EN.entity(doc, beam_width=2)
print(ents)

View File

View File

@ -0,0 +1,84 @@
# coding: utf8
from __future__ import unicode_literals
import pytest
from ...language import Language
@pytest.fixture
def nlp():
return Language()
def new_pipe(doc):
return doc
def test_add_pipe_no_name(nlp):
nlp.add_pipe(new_pipe)
assert 'new_pipe' in nlp.pipe_names
def test_add_pipe_duplicate_name(nlp):
nlp.add_pipe(new_pipe, name='duplicate_name')
with pytest.raises(ValueError):
nlp.add_pipe(new_pipe, name='duplicate_name')
@pytest.mark.parametrize('name', ['parser'])
def test_add_pipe_first(nlp, name):
nlp.add_pipe(new_pipe, name=name, first=True)
assert nlp.pipeline[0][0] == name
@pytest.mark.parametrize('name1,name2', [('parser', 'lambda_pipe')])
def test_add_pipe_last(nlp, name1, name2):
nlp.add_pipe(lambda doc: doc, name=name2)
nlp.add_pipe(new_pipe, name=name1, last=True)
assert nlp.pipeline[0][0] != name1
assert nlp.pipeline[-1][0] == name1
def test_cant_add_pipe_first_and_last(nlp):
with pytest.raises(ValueError):
nlp.add_pipe(new_pipe, first=True, last=True)
@pytest.mark.parametrize('name', ['my_component'])
def test_get_pipe(nlp, name):
with pytest.raises(KeyError):
nlp.get_pipe(name)
nlp.add_pipe(new_pipe, name=name)
assert nlp.get_pipe(name) == new_pipe
@pytest.mark.parametrize('name,replacement', [('my_component', lambda doc: doc)])
def test_replace_pipe(nlp, name, replacement):
with pytest.raises(ValueError):
nlp.replace_pipe(name, new_pipe)
nlp.add_pipe(new_pipe, name=name)
nlp.replace_pipe(name, replacement)
assert nlp.get_pipe(name) != new_pipe
assert nlp.get_pipe(name) == replacement
@pytest.mark.parametrize('old_name,new_name', [('old_pipe', 'new_pipe')])
def test_rename_pipe(nlp, old_name, new_name):
with pytest.raises(ValueError):
nlp.rename_pipe(old_name, new_name)
nlp.add_pipe(new_pipe, name=old_name)
nlp.rename_pipe(old_name, new_name)
assert nlp.pipeline[0][0] == new_name
@pytest.mark.parametrize('name', ['my_component'])
def test_remove_pipe(nlp, name):
with pytest.raises(ValueError):
nlp.remove_pipe(name)
nlp.add_pipe(new_pipe, name=name)
assert len(nlp.pipeline) == 1
removed_name, removed_component = nlp.remove_pipe(name)
assert not len(nlp.pipeline)
assert removed_name == name
assert removed_component == new_pipe

View File

@ -135,7 +135,18 @@ def load_model_from_path(model_path, meta=False, **overrides):
if not meta:
meta = get_model_meta(model_path)
cls = get_lang_class(meta['lang'])
nlp = cls(pipeline=meta.get('pipeline', True), meta=meta, **overrides)
nlp = cls(meta=meta, **overrides)
pipeline = meta.get('pipeline', [])
disable = overrides.get('disable', [])
if pipeline is True:
pipeline = nlp.Defaults.pipe_names
elif pipeline in (False, None):
pipeline = []
for name in pipeline:
if name not in disable:
config = meta.get('pipeline_args', {}).get(name, {})
component = nlp.create_pipe(name, config=config)
nlp.add_pipe(component, name=name)
return nlp.from_disk(model_path)

View File

@ -149,6 +149,10 @@ mixin code(label, language, prompt, height, icon, wrap)
//- Code blocks to display old/new versions
mixin code-compare()
span.u-inline-block.u-padding-top.u-width-full
block
mixin code-old()
+code(false, false, false, false, "reject").o-block-small
block

View File

@ -43,6 +43,20 @@ p
+cell #[code Language]
+cell A #[code Language] object with the loaded model.
p
| Essentially, #[code spacy.load()] is a convenience wrapper that reads
| the language ID and pipeline components from a model's #[code meta.json],
| initialises the #[code Language] class, loads in the model data and
| returns it.
+code("Abstract example").
cls = util.get_lang_class(lang) # get language for ID, e.g. 'en'
nlp = cls() # initialise the language
for name in pipeline:
component = nlp.create_pipe(name) # create each pipeline component
nlp.add_pipe(component) # add component to pipeline
nlp.from_disk(model_data_path) # load in model data
+infobox("Deprecation note", "⚠️")
.o-block
| As of spaCy 2.0, the #[code path] keyword argument is deprecated. spaCy
@ -141,37 +155,3 @@ p
+cell returns
+cell unicode
+cell The explanation, or #[code None] if not found in the glossary.
+h(3, "spacy.set_factory") spacy.set_factory
+tag function
+tag-new(2)
p
| Set a factory that returns a custom
| #[+a("/usage/processing-pipelines") processing pipeline]
| component. Factories are useful for creating stateful components, especially ones which depend on shared data.
+aside-code("Example").
def my_factory(vocab):
def my_component(doc):
return doc
return my_component
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory'])
+table(["Name", "Type", "Description"])
+row
+cell #[code factory_id]
+cell unicode
+cell
| Unique name of factory. If added to a new pipeline, spaCy will
| look up the factory for this ID and use it to create the
| component.
+row
+cell #[code factory]
+cell callable
+cell
| Callable that takes a #[code Vocab] object and returns a pipeline
| component.

View File

@ -4,7 +4,14 @@ include ../_includes/_mixins
p
| Usually you'll load this once per process as #[code nlp] and pass the
| instance around your application.
| instance around your application. The #[code Language] class is created
| when you call #[+api("spacy#load") #[code spacy.load()]] and contains
| the shared vocabulary and #[+a("/usage/adding-languages") language data],
| optional model data loaded from a #[+a("/models") model package] or
| a path, and a #[+a("/usage/processing-pipelines") processing pipeline]
| containing components like the tagger or parser that are called on a
| document in order. You can also add your own processing pipeline
| components that take a #[code Doc] object, modify it and return it.
+h(2, "init") Language.__init__
+tag method
@ -12,9 +19,9 @@ p
p Initialise a #[code Language] object.
+aside-code("Example").
from spacy.vocab import Vocab
from spacy.language import Language
nlp = Language(pipeline=['token_vectors', 'tags',
'dependencies'])
nlp = Language(Vocab())
from spacy.lang.en import English
nlp = English()
@ -34,14 +41,6 @@ p Initialise a #[code Language] object.
| A function that takes text and returns a #[code Doc] object.
| Usually a #[code Tokenizer].
+row
+cell #[code pipeline]
+cell list
+cell
| A list of annotation processes or IDs of annotation, processes,
| e.g. a #[code Tagger] object, or #[code 'tagger']. IDs are looked
| up in #[code Language.Defaults.factories].
+row
+cell #[code meta]
+cell dict
@ -235,7 +234,6 @@ p
| Can be called before training to pre-process gold data. By default, it
| handles nonprojectivity and adds missing tags to the tag map.
+table(["Name", "Type", "Description"])
+row
+cell #[code docs_golds]
@ -247,6 +245,177 @@ p
+cell tuple
+cell Tuples of #[code Doc] and #[code GoldParse] objects.
+h(2, "create_pipe") Language.create_pipe
+tag method
+tag-new(2)
p Create a pipeline component from a factory.
+aside-code("Example").
parser = nlp.create_pipe('parser')
nlp.add_pipe(parser)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell
| Factory name to look up in
| #[+api("language#class-attributes") #[code Language.factories]].
+row
+cell #[code config]
+cell dict
+cell Configuration parameters to initialise component.
+row("foot")
+cell returns
+cell callable
+cell The pipeline component.
+h(2, "add_pipe") Language.add_pipe
+tag method
+tag-new(2)
p
| Add a component to the processing pipeline. Valid components are
| callables that take a #[code Doc] object, modify it and return it. Only
| one of #[code before], #[code after], #[code first] or #[code last] can
| be set. Default behaviour is #[code last=True].
+aside-code("Example").
def component(doc):
# modify Doc and return it
return doc
nlp.add_pipe(component, before='ner')
nlp.add_pipe(component, name='custom_name', last=True)
+table(["Name", "Type", "Description"])
+row
+cell #[code component]
+cell callable
+cell The pipeline component.
+row
+cell #[code name]
+cell unicode
+cell
| Name of pipeline component. Overwrites existing
| #[code component.name] attribute if available. If no #[code name]
| is set and the component exposes no name attribute,
| #[code component.__name__] is used. An error is raised if the
| name already exists in the pipeline.
+row
+cell #[code before]
+cell unicode
+cell Component name to insert component directly before.
+row
+cell #[code after]
+cell unicode
+cell Component name to insert component directly after.
+row
+cell #[code first]
+cell bool
+cell Insert component first / not first in the pipeline.
+row
+cell #[code last]
+cell bool
+cell Insert component last / not last in the pipeline.
+h(2, "get_pipe") Language.get_pipe
+tag method
+tag-new(2)
p Get a pipeline component for a given component name.
+aside-code("Example").
parser = nlp.get_pipe('parser')
custom_component = nlp.get_pipe('custom_component')
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the pipeline component to get.
+row("foot")
+cell returns
+cell callable
+cell The pipeline component.
+h(2, "replace_pipe") Language.replace_pipe
+tag method
+tag-new(2)
p Replace a component in the pipeline.
+aside-code("Example").
nlp.replace_pipe('parser', my_custom_parser)
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the component to replace.
+row
+cell #[code component]
+cell callable
+cell The pipeline component to insert.
+h(2, "rename_pipe") Language.rename_pipe
+tag method
+tag-new(2)
p
| Rename a component in the pipeline. Useful to create custom names for
| pre-defined and pre-loaded components. To change the default name of
| a component added to the pipeline, you can also use the #[code name]
| argument on #[+api("language#add_pipe") #[code add_pipe]].
+aside-code("Example").
nlp.rename_pipe('parser', 'spacy_parser')
+table(["Name", "Type", "Description"])
+row
+cell #[code old_name]
+cell unicode
+cell Name of the component to rename.
+row
+cell #[code new_name]
+cell unicode
+cell New name of the component.
+h(2, "remove_pipe") Language.remove_pipe
+tag method
+tag-new(2)
p
| Remove a component from the pipeline. Returns the removed component name
| and component function.
+aside-code("Example").
name, component = nlp.remove_pipe('parser')
assert name == 'parser'
+table(["Name", "Type", "Description"])
+row
+cell #[code name]
+cell unicode
+cell Name of the component to remove.
+row("foot")
+cell returns
+cell tuple
+cell A #[code (name, component)] tuple of the removed component.
+h(2, "to_disk") Language.to_disk
+tag method
+tag-new(2)
@ -399,7 +568,15 @@ p Load state from a binary string.
+row
+cell #[code pipeline]
+cell list
+cell Sequence of annotation functions.
+cell
| List of #[code (name, component)] tuples describing the current
| processing pipeline, in order.
+row
+cell #[code pipe_names]
+tag-new(2)
+cell list
+cell List of pipeline component names, in order.
+row
+cell #[code meta]
@ -424,3 +601,12 @@ p Load state from a binary string.
+cell
| Two-letter language ID, i.e.
| #[+a("https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes") ISO code].
+row
+cell #[code factories]
+tag-new(2)
+cell dict
+cell
| Factories that create pre-defined pipeline components, e.g. the
| tagger, parser or entity recognizer, keyed by their component
| name.

View File

@ -143,6 +143,9 @@
//- Layout
.u-width-full
width: 100%
.u-float-left
float: left
margin-right: 1rem
@ -166,6 +169,9 @@
.u-padding-medium
padding: 1.8rem
.u-padding-top
padding-top: 2rem
.u-inline-block
display: inline-block

View File

@ -25,7 +25,7 @@
display: inline-block
font-size: 0.6em
font-weight: bold
padding-right: 1.25rem
padding-right: 1em
margin-left: -3.75rem
text-align: right
width: 2.5rem

View File

@ -103,11 +103,11 @@
"title": "Language Processing Pipelines",
"next": "vectors-similarity",
"menu": {
"How pipelines work": "pipelines",
"Examples": "examples",
"How Pipelines Work": "pipelines",
"Custom Components": "custom-components",
"Multi-threading": "multithreading",
"User Hooks": "user-hooks",
"Serialization": "serialization"
"Serialization": "serialization",
"Developing Extensions": "extensions"
}
},

View File

@ -0,0 +1,151 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > CUSTOM COMPONENTS
p
| A component receives a #[code Doc] object and
| #[strong performs the actual processing] – for example, using the current
| weights to make a prediction and set some annotation on the document. By
| adding a component to the pipeline, you'll get access to the #[code Doc]
| at any point #[strong during] processing instead of only being able to
| modify it afterwards.
+aside-code("Example").
def my_component(doc):
# do something to the doc here
return doc
+table(["Argument", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The #[code Doc] object processed by the previous component.
+row("foot")
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object processed by this pipeline component.
p
| Custom components can be added to the pipeline using the
| #[+api("language#add_pipe") #[code add_pipe]] method. Optionally, you
| can either specify a component to add it before or after, tell spaCy
| to add it first or last in the pipeline, or define a custom name.
| If no name is set and no #[code name] attribute is present on your
| component, the function name, e.g. #[code component.__name__] is used.
+code("Adding pipeline components").
def my_component(doc):
print("After tokenization, this doc has %s tokens." % len(doc))
if len(doc) < 10:
print("This is a pretty short document.")
return doc
nlp = spacy.load('en')
nlp.add_pipe(my_component, name='print_info', first=True)
print(nlp.pipe_names) # ['print_info', 'tagger', 'parser', 'ner']
doc = nlp(u"This is a sentence.")
p
| Of course, you can also wrap your component as a class to allow
| initialising it with custom settings and hold state within the component.
| This is useful for #[strong stateful components], especially ones which
| #[strong depend on shared data].
+code.
class MyComponent(object):
name = 'print_info'
def __init__(self, vocab, short_limit=10):
self.vocab = vocab
self.short_limit = short_limit
def __call__(self, doc):
if len(doc) < self.short_limit:
print("This is a pretty short document.")
return doc
my_component = MyComponent(nlp.vocab, short_limit=25)
nlp.add_pipe(my_component, first=True)
+h(3, "custom-components-attributes")
| Setting attributes on the #[code Doc], #[code Span] and #[code Token]
+aside("Why ._?")
| Writing to a #[code ._] attribute instead of to the #[code Doc] directly
| keeps a clearer separation and makes it easier to ensure backwards
| compatibility. For example, if you've implemented your own #[code .coref]
| property and spaCy claims it one day, it'll break your code. Similarly,
| just by looking at the code, you'll immediately know what's built-in and
| what's custom — for example, #[code doc.sentiment] is spaCy, while
| #[code doc._.sent_score] isn't.
+under-construction
+h(3, "custom-components-user-hooks") Other user hooks
p
| While it's generally recommended to use the #[code Doc._], #[code Span._]
| and #[code Token._] proxies to add your own custom attributes, spaCy
| offers a few exceptions to allow #[strong customising the built-in methods]
| like #[+api("doc#similarity") #[code Doc.similarity]] or
| #[+api("doc#vector") #[code Doc.vector]] with your own hooks, which can
| rely on statistical models you train yourself. For instance, you can
| provide your own on-the-fly sentence segmentation algorithm or document
| similarity method.
p
| Hooks let you customize some of the behaviours of the #[code Doc],
| #[code Span] or #[code Token] objects by adding a component to the
| pipeline. For instance, to customize the
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
| component that sets a custom function to
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
| method will check the #[code user_hooks] dict, and delegate to your
| function if you've set one. Similar results can be achieved by setting
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
+aside("Implementation note")
| The hooks live on the #[code Doc] object because the #[code Span] and
| #[code Token] objects are created lazily, and don't own any data. They
| just proxy to their parent #[code Doc]. This turns out to be convenient
| here — we only have to worry about installing hooks in one place.
+table(["Name", "Customises"])
+row
+cell #[code user_hooks]
+cell
+api("doc#vector") #[code Doc.vector]
+api("doc#has_vector") #[code Doc.has_vector]
+api("doc#vector_norm") #[code Doc.vector_norm]
+api("doc#sents") #[code Doc.sents]
+row
+cell #[code user_token_hooks]
+cell
+api("token#similarity") #[code Token.similarity]
+api("token#vector") #[code Token.vector]
+api("token#has_vector") #[code Token.has_vector]
+api("token#vector_norm") #[code Token.vector_norm]
+api("token#conjuncts") #[code Token.conjuncts]
+row
+cell #[code user_span_hooks]
+cell
+api("span#similarity") #[code Span.similarity]
+api("span#vector") #[code Span.vector]
+api("span#has_vector") #[code Span.has_vector]
+api("span#vector_norm") #[code Span.vector_norm]
+api("span#root") #[code Span.root]
+code("Add custom similarity hooks").
class SimilarityModel(object):
def __init__(self, model):
self._model = model
def __call__(self, doc):
doc.user_hooks['similarity'] = self.similarity
doc.user_span_hooks['similarity'] = self.similarity
doc.user_token_hooks['similarity'] = self.similarity
return doc
def similarity(self, obj1, obj2):
y = self._model([obj1.vector, obj2.vector])
return float(y[0])

View File

@ -0,0 +1,3 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > DEVELOPING EXTENSIONS
+under-construction

View File

@ -11,7 +11,7 @@ p
p
| When you load a model, spaCy first consults the model's
| #[+a("/usage/saving-loading#models-generating") meta.json]. The
| #[+a("/usage/saving-loading#models-generating") #[code meta.json]]. The
| meta typically includes the model details, the ID of a language class,
| and an optional list of pipeline components. spaCy then does the
| following:
@ -21,24 +21,26 @@ p
"name": "example_model",
"lang": "en",
"description": "Example model for spaCy",
"pipeline": ["tensorizer", "tagger"]
"pipeline": ["tagger", "parser"]
}
+list("numbers")
+item
| Look up #[strong pipeline IDs] in the available
| #[strong pipeline factories].
+item
| Initialise the #[strong pipeline components] by calling their
| factories with the #[code Vocab] as an argument. This gives each
| factory and component access to the pipeline's shared data, like
| strings, morphology and annotation scheme.
+item
| Load the #[strong language class and data] for the given ID via
| #[+api("util.get_lang_class") #[code get_lang_class]].
| #[+api("util.get_lang_class") #[code get_lang_class]] and initialise
| it. The #[code Language] class contains the shared vocabulary,
| tokenization rules and the language-specific annotation scheme.
+item
| Pass the path to the #[strong model data] to the #[code Language]
| class and return it.
| Iterate over the #[strong pipeline names] and create each component
| using #[+api("language#create_pipe") #[code create_pipe]], which
| looks them up in #[code Language.factories].
+item
| Add each pipeline component to the pipeline in order, using
| #[+api("language#add_pipe") #[code add_pipe]].
+item
| Make the #[strong model data] available to the #[code Language] class
| by calling #[+api("language#from_disk") #[code from_disk]] with the
| path to the model data directory.
p
| So when you call this...
@ -47,12 +49,12 @@ p
nlp = spacy.load('en')
p
| ... the model tells spaCy to use the pipeline
| ... the model tells spaCy to use the language #[code "en"] and the pipeline
| #[code.u-break ["tensorizer", "tagger", "parser", "ner"]]. spaCy will
| then look up each string in its internal factories registry and
| initialise the individual components. It'll then load
| #[code spacy.lang.en.English], pass it the path to the model's data
| directory, and return it for you to use as the #[code nlp] object.
| then initialise #[code spacy.lang.en.English], and create each pipeline
| component and add it to the processing pipeline. It'll then load in the
| model's data from its data directory and return the modified
| #[code Language] class for you to use as the #[code nlp] object.
p
| Fundamentally, a #[+a("/models") spaCy model] consists of three
@ -73,9 +75,12 @@ p
pipeline = ['tensorizer', 'tagger', 'parser', 'ner']
data_path = 'path/to/en_core_web_sm/en_core_web_sm-2.0.0'
cls = spacy.util.get_lang_class(lang) # 1. get Language instance, e.g. English()
nlp = cls(pipeline=pipeline) # 2. initialise it with the pipeline
nlp.from_disk(model_data_path) # 3. load in the binary data
cls = spacy.util.get_lang_class(lang) # 1. get Language class, e.g. English
nlp = cls() # 2. initialise it
for name in pipeline:
component = nlp.create_pipe(name) # 3. create the pipeline components
nlp.add_pipe(component) # 4. add the component to the pipeline
nlp.from_disk(data_path) # 5. load in the binary data
p
| When you call #[code nlp] on a text, spaCy will #[strong tokenize] it and
@ -87,124 +92,23 @@ p
| document, which is then processed by the component next in the pipeline.
+code("The pipeline under the hood").
doc = nlp.make_doc(u'This is a sentence')
for proc in nlp.pipeline:
doc = proc(doc)
+h(3, "creating") Creating pipeline components and factories
doc = nlp.make_doc(u'This is a sentence') # create a Doc from raw text
for name, proc in nlp.pipeline: # iterate over components in order
doc = proc(doc) # apply each component
p
| spaCy lets you customise the pipeline with your own components. Components
| are functions that receive a #[code Doc] object, modify and return it.
| If your component is stateful, you'll want to create a new one for each
| pipeline. You can do that by defining and registering a factory which
| receives the shared #[code Vocab] object and returns a component.
+h(4, "creating-component") Creating a component
p
| A component receives a #[code Doc] object and
| #[strong performs the actual processing] for example, using the current
| weights to make a prediction and set some annotation on the document. By
| adding a component to the pipeline, you'll get access to the #[code Doc]
| at any point #[strong during] processing instead of only being able to
| modify it afterwards.
+aside-code("Example").
def my_component(doc):
# do something to the doc here
return doc
+table(["Argument", "Type", "Description"])
+row
+cell #[code doc]
+cell #[code Doc]
+cell The #[code Doc] object processed by the previous component.
+row("foot")
+cell returns
+cell #[code Doc]
+cell The #[code Doc] object processed by this pipeline component.
p
| When creating a new #[code Language] class, you can pass it a list of
| pipeline component functions to execute in that order. You can also
| add it to an existing pipeline by modifying #[code nlp.pipeline] just
| be careful not to overwrite a pipeline or its components by accident!
| The current processing pipeline is available as #[code nlp.pipeline],
| which returns a list of #[code (name, component)] tuples, or
| #[code nlp.pipe_names], which only returns a list of human-readable
| component names.
+code.
# Create a new Language object with a pipeline
from spacy.language import Language
nlp = Language(pipeline=[my_component])
nlp.pipeline
# [('tagger', <spacy.pipeline.Tagger>), ('parser', <spacy.pipeline.DependencyParser>), ('ner', <spacy.pipeline.EntityRecognizer>)]
nlp.pipe_names
# ['tagger', 'parser', 'ner']
# Modify an existing pipeline
nlp = spacy.load('en')
nlp.pipeline.append(my_component)
+h(4, "creating-factory") Creating a factory
p
| A factory is a #[strong function that returns a pipeline component].
| It's called with the #[code Vocab] object, to give it access to the
| shared data between components for example, the strings, morphology,
| vectors or annotation scheme. Factories are useful for creating
| #[strong stateful components], especially ones which
| #[strong depend on shared data].
+aside-code("Example").
def my_factory(vocab):
# load some state
def my_component(doc):
# process the doc
return doc
return my_component
+table(["Argument", "Type", "Description"])
+row
+cell #[code vocab]
+cell #[code Vocab]
+cell
| Shared data between components, including strings, morphology,
| vectors etc.
+row("foot")
+cell returns
+cell callable
+cell The pipeline component.
p
| By creating a factory, you're essentially telling spaCy how to get the
| pipeline component #[strong once the vocab is available]. Factories need to
| be registered via #[+api("spacy#set_factory") #[code set_factory()]] and
| by assigning them a unique ID. This ID can be added to the pipeline as a
| string. When creating a pipeline, you're free to mix strings and
| callable components:
+code.
spacy.set_factory('my_factory', my_factory)
nlp = Language(pipeline=['my_factory', my_other_component])
p
| If spaCy comes across a string in the pipeline, it will try to resolve it
| by looking it up in the available factories. The factory will then be
| initialised with the #[code Vocab]. Providing factory names instead of
| callables also makes it easy to specify them in the model's
| #[+a("/usage/saving-loading#models-generating") meta.json]. If you're
| training your own model and want to use one of spaCy's default components,
| you won't have to worry about finding and implementing it either to use
| the default tagger, simply add #[code "tagger"] to the pipeline, and
| #[strong spaCy will know what to do].
+infobox("Important note")
| Because factories are #[strong resolved on initialisation] of the
| #[code Language] class, it's #[strong not possible] to add them to the
| pipeline afterwards, e.g. by modifying #[code nlp.pipeline]. This only
| works with individual component functions. To use factories, you need to
| create a new #[code Language] object, or generate a
| #[+a("/usage/training#models-generating") model package] with
| a custom pipeline.
+h(3, "disabling") Disabling pipeline components
+h(3, "disabling") Disabling and modifying pipeline components
p
| If you don't need a particular component of the pipeline for
@ -217,16 +121,19 @@ p
+code.
nlp = spacy.load('en', disable=['parser', 'tagger'])
nlp = English().from_disk('/model', disable=['tensorizer', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
p
| Note that you can't write directly to #[code nlp.pipeline], as this list
| holds the #[em actual components], not the IDs. However, if you know the
| order of the components, you can still slice the list:
| You can also use the #[+api("language#remove_pipe") #[code remove_pipe]]
| method to remove pipeline components from an existing pipeline, the
| #[+api("language#rename_pipe") #[code rename_pipe]] method to rename them,
| or the #[+api("language#replace_pipe") #[code replace_pipe]] method
| to replace them with a custom component entirely (more details on this
| in the section on #[+a("#custom-components") custom components]).
+code.
nlp = spacy.load('en')
nlp.pipeline = nlp.pipeline[:2] # only use the first two components
nlp.remove_pipe('parser')
nlp.rename_pipe('ner', 'entityrecognizer')
nlp.replace_pipe('tagger', my_custom_tagger)
+infobox("Important note: disabling pipeline components")
.o-block
@ -234,12 +141,14 @@ p
| processing pipeline components, the #[code parser], #[code tagger]
| and #[code entity] keyword arguments have been replaced with
| #[code disable], which takes a list of pipeline component names.
| This lets you disable both default and custom components when loading
| This lets you disable pre-defined components when loading
| a model, or initialising a Language class via
| #[+api("language#from_disk") #[code from_disk]].
+code-new.
nlp = spacy.load('en', disable=['tagger', 'ner'])
doc = nlp(u"I don't want parsed", disable=['parser'])
nlp = spacy.load('en', disable=['ner'])
nlp.remove_pipe('parser')
doc = nlp(u"I don't want parsed")
+code-old.
nlp = spacy.load('en', tagger=False, entity=False)
doc = nlp(u"I don't want parsed", parse=False)

View File

@ -21,7 +21,7 @@ p
+code.
import spacy
from spacy.tokens import Span
from spacy.tokens.span import Span
text = u'Netflix is hiring a new VP of global policy'

View File

@ -1,61 +0,0 @@
//- 💫 DOCS > USAGE > PROCESSING PIPELINES > ATTRIBUTE HOOKS
p
| Hooks let you customize some of the behaviours of the #[code Doc],
| #[code Span] or #[code Token] objects by adding a component to the
| pipeline. For instance, to customize the
| #[+api("doc#similarity") #[code Doc.similarity]] method, you can add a
| component that sets a custom function to
| #[code doc.user_hooks['similarity']]. The built-in #[code Doc.similarity]
| method will check the #[code user_hooks] dict, and delegate to your
| function if you've set one. Similar results can be achieved by setting
| functions to #[code Doc.user_span_hooks] and #[code Doc.user_token_hooks].
+code("Polymorphic similarity example").
span.similarity(doc)
token.similarity(span)
doc1.similarity(doc2)
p
| By default, this just averages the vectors for each document, and
| computes their cosine. Obviously, spaCy should make it easy for you to
| install your own similarity model. This introduces a tricky design
| challenge. The current solution is to add three more dicts to the
| #[code Doc] object:
+aside("Implementation note")
| The hooks live on the #[code Doc] object because the #[code Span] and
| #[code Token] objects are created lazily, and don't own any data. They
| just proxy to their parent #[code Doc]. This turns out to be convenient
| here — we only have to worry about installing hooks in one place.
+table(["Name", "Description"])
+row
+cell #[code user_hooks]
+cell Customise behaviour of #[code doc.vector], #[code doc.has_vector], #[code doc.vector_norm] or #[code doc.sents]
+row
+cell #[code user_token_hooks]
+cell Customise behaviour of #[code token.similarity], #[code token.vector], #[code token.has_vector], #[code token.vector_norm] or #[code token.conjuncts]
+row
+cell #[code user_span_hooks]
+cell Customise behaviour of #[code span.similarity], #[code span.vector], #[code span.has_vector], #[code span.vector_norm] or #[code span.root]
p
| To sum up, here's an example of hooking in custom #[code .similarity()]
| methods:
+code("Add custom similarity hooks").
class SimilarityModel(object):
def __init__(self, model):
self._model = model
def __call__(self, doc):
doc.user_hooks['similarity'] = self.similarity
doc.user_span_hooks['similarity'] = self.similarity
doc.user_token_hooks['similarity'] = self.similarity
def similarity(self, obj1, obj2):
y = self._model([obj1.vector, obj2.vector])
return float(y[0])

View File

@ -8,18 +8,18 @@ include _spacy-101/_pipelines
+h(2, "pipelines") How pipelines work
include _processing-pipelines/_pipelines
+section("examples")
+h(2, "examples") Examples
include _processing-pipelines/_examples
+section("custom-components")
+h(2, "custom-components") Creating custom pipeline components
include _processing-pipelines/_custom-components
+section("multithreading")
+h(2, "multithreading") Multi-threading
include _processing-pipelines/_multithreading
+section("user-hooks")
+h(2, "user-hooks") User hooks
include _processing-pipelines/_user-hooks
+section("serialization")
+h(2, "serialization") Serialization
include _processing-pipelines/_serialization
+section("extensions")
+h(2, "extensions") Developing spaCy extensions
include _processing-pipelines/_extensions