From a9c610404798b9a3d45a84918212cab9d053c40d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 27 Oct 2019 13:35:49 +0100 Subject: [PATCH] Component decorator and component analysis (#4517) * Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/__init__.py | 2 + spacy/analysis.py | 176 ++++++++++++++++++++++++++ spacy/compat.py | 3 + spacy/errors.py | 16 +++ spacy/language.py | 104 ++++++++++----- spacy/pipeline/entityruler.py | 8 +- spacy/pipeline/functions.py | 12 ++ spacy/pipeline/hooks.py | 7 +- spacy/pipeline/morphologizer.pyx | 5 +- spacy/pipeline/pipes.pyx | 44 +++++-- spacy/syntax/nn_parser.pyx | 4 + spacy/tests/pipeline/test_analysis.py | 146 +++++++++++++++++++++ spacy/util.py | 14 +- 15 files changed, 492 insertions(+), 53 deletions(-) create mode 100644 spacy/analysis.py create mode 100644 spacy/tests/pipeline/test_analysis.py diff --git a/requirements.txt b/requirements.txt index 68e29f6ab..6d76c7233 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0 thinc>=7.2.0,<7.3.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.2.0,<1.1.0 +wasabi>=0.3.0,<1.1.0 srsly>=0.1.0,<1.1.0 # Third party dependencies numpy>=1.15.0 diff --git a/setup.cfg b/setup.cfg index 796f4176a..2d4a06c2b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,7 @@ install_requires = blis>=0.4.0,<0.5.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 - wasabi>=0.2.0,<1.1.0 + wasabi>=0.3.0,<1.1.0 srsly>=0.1.0,<1.1.0 pathlib==1.0.1; python_version < "3.4" importlib_metadata>=0.20; python_version < "3.8" diff --git a/spacy/__init__.py b/spacy/__init__.py index 8930b1d4e..57701179f 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -9,12 +9,14 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # These are imported as part of the API from thinc.neural.util import prefer_gpu, require_gpu +from . import pipeline from .cli.info import info as cli_info from .glossary import explain from .about import __version__ from .errors import Errors, Warnings, deprecation_warning from . 
import util from .util import register_architecture, get_architecture +from .language import component if sys.maxunicode == 65535: diff --git a/spacy/analysis.py b/spacy/analysis.py new file mode 100644 index 000000000..49f67fd83 --- /dev/null +++ b/spacy/analysis.py @@ -0,0 +1,176 @@ +# coding: utf8 +from __future__ import unicode_literals + +from collections import OrderedDict +from wasabi import Printer + +from .tokens import Doc, Token +from .errors import Errors, Warnings, user_warning + + +def analyze_pipes(pipeline, name, pipe, index, warn=True): + """Analyze a pipeline component with respect to its position in the current + pipeline and the other components. Will check whether requirements are + fulfilled (e.g. if previous components assign the attributes). + + pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. + name (unicode): The name of the pipeline component to analyze. + pipe (callable): The pipeline component function to analyze. + index (int): The index of the component in the pipeline. + warn (bool): Show user warning if problem is found. + RETURNS (list): The problems found for the given pipeline component. + """ + assert pipeline[index][0] == name + prev_pipes = pipeline[:index] + pipe_requires = getattr(pipe, "requires", []) + requires = OrderedDict([(annot, False) for annot in pipe_requires]) + if requires: + for prev_name, prev_pipe in prev_pipes: + prev_assigns = getattr(prev_pipe, "assigns", []) + for annot in prev_assigns: + requires[annot] = True + problems = [] + for annot, fulfilled in requires.items(): + if not fulfilled: + problems.append(annot) + if warn: + user_warning(Warnings.W025.format(name=name, attr=annot)) + return problems + + +def analyze_all_pipes(pipeline, warn=True): + """Analyze all pipes in the pipeline in order. + + pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. + warn (bool): Show user warning if problem is found. + RETURNS (dict): The problems found, keyed by component name. + """ + problems = {} + for i, (name, pipe) in enumerate(pipeline): + problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn) + return problems + + +def dot_to_dict(values): + """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"] + become {"token": {"pos": True, "_": {"xyz": True }}}. + + values (iterable): The values to convert. + RETURNS (dict): The converted values. + """ + result = {} + for value in values: + path = result + parts = value.lower().split(".") + for i, item in enumerate(parts): + is_last = i == len(parts) - 1 + path = path.setdefault(item, True if is_last else {}) + return result + + +def validate_attrs(values): + """Validate component attributes provided to "assigns", "requires" etc. + Raises error for invalid attributes and formatting. Doesn't check if + custom extension attributes are registered, since this is something the + user might want to do themselves later in the component. + + values (iterable): The string attributes to check, e.g. `["token.pos"]`. + RETURNS (iterable): The checked attributes. 
+ """ + data = dot_to_dict(values) + objs = {"doc": Doc, "token": Token} + for obj_key, attrs in data.items(): + if obj_key not in objs: # first element is not doc/token + if obj_key == "span": + span_attrs = [attr for attr in values if attr.startswith("span.")] + raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs))) + invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key)) + raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs)) + if not isinstance(attrs, dict): # attr is something like "doc" + raise ValueError(Errors.E182.format(attr=obj_key)) + for attr, value in attrs.items(): + if attr == "_": + if value is True: # attr is something like "doc._" + raise ValueError(Errors.E182.format(attr="{}._".format(obj_key))) + for ext_attr, ext_value in value.items(): + # We don't check whether the attribute actually exists + if ext_value is not True: # attr is something like doc._.x.y + good = "{}._.{}".format(obj_key, ext_attr) + bad = "{}.{}".format(good, ".".join(ext_value)) + raise ValueError(Errors.E183.format(attr=bad, solution=good)) + continue # we can't validate those further + if attr.endswith("_"): # attr is something like "token.pos_" + raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1])) + if value is not True: # attr is something like doc.x.y + good = "{}.{}".format(obj_key, attr) + bad = "{}.{}".format(good, ".".join(value)) + raise ValueError(Errors.E183.format(attr=bad, solution=good)) + obj = objs[obj_key] + if not hasattr(obj, attr): + raise ValueError(Errors.E185.format(obj=obj_key, attr=attr)) + return values + + +def _get_feature_for_attr(pipeline, attr, feature): + assert feature in ["assigns", "requires"] + result = [] + for pipe_name, pipe in pipeline: + pipe_assigns = getattr(pipe, feature, []) + if attr in pipe_assigns: + result.append((pipe_name, pipe)) + return result + + +def get_assigns_for_attr(pipeline, attr): + """Get all pipeline components that assign an attr, e.g. "doc.tensor". + + pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. + attr (unicode): The attribute to check. + RETURNS (list): (name, pipeline) tuples of components that assign the attr. + """ + return _get_feature_for_attr(pipeline, attr, "assigns") + + +def get_requires_for_attr(pipeline, attr): + """Get all pipeline components that require an attr, e.g. "doc.tensor". + + pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. + attr (unicode): The attribute to check. + RETURNS (list): (name, pipeline) tuples of components that require the attr. + """ + return _get_feature_for_attr(pipeline, attr, "requires") + + +def print_summary(nlp, pretty=True, no_print=False): + """Print a formatted summary for the current nlp object's pipeline. Shows + a table with the pipeline components and why they assign and require, as + well as any problems if available. + + nlp (Language): The nlp object. + pretty (bool): Pretty-print the results (color etc). + no_print (bool): Don't print anything, just return the data. + RETURNS (dict): A dict with "overview" and "problems". 
+ """ + msg = Printer(pretty=pretty, no_print=no_print) + overview = [] + problems = {} + for i, (name, pipe) in enumerate(nlp.pipeline): + requires = getattr(pipe, "requires", []) + assigns = getattr(pipe, "assigns", []) + retok = getattr(pipe, "retokenizes", False) + overview.append((i, name, requires, assigns, retok)) + problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False) + msg.divider("Pipeline Overview") + header = ("#", "Component", "Requires", "Assigns", "Retokenizes") + msg.table(overview, header=header, divider=True, multiline=True) + n_problems = sum(len(p) for p in problems.values()) + if any(p for p in problems.values()): + msg.divider("Problems ({})".format(n_problems)) + for name, problem in problems.items(): + if problem: + problem = ", ".join(problem) + msg.warn("'{}' requirements not met: {}".format(name, problem)) + else: + msg.good("No problems found.") + if no_print: + return {"overview": overview, "problems": problems} diff --git a/spacy/compat.py b/spacy/compat.py index 3a19e9423..5bff28815 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -12,6 +12,7 @@ import os import sys import itertools import ast +import types from thinc.neural.util import copy_array @@ -67,6 +68,7 @@ if is_python2: basestring_ = basestring # noqa: F821 input_ = raw_input # noqa: F821 path2str = lambda path: str(path).decode("utf8") + class_types = (type, types.ClassType) elif is_python3: bytes_ = bytes @@ -74,6 +76,7 @@ elif is_python3: basestring_ = str input_ = input path2str = lambda path: str(path) + class_types = (type, types.ClassType) if is_python_pre_3_5 else type def b_to_str(b_str): diff --git a/spacy/errors.py b/spacy/errors.py index ddf14585b..f342b2271 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -99,6 +99,8 @@ class Warnings(object): "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") + W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " + "previous components in the pipeline declare that they assign it.") @add_codes @@ -511,6 +513,20 @@ class Errors(object): E179 = ("Invalid pattern. Expected a list of Doc objects but got a single " "Doc. If you only want to add one pattern, make sure to wrap it " "in a list. For example: matcher.add('{key}', [doc])") + E180 = ("Span attributes can't be declared as required or assigned by " + "components, since spans are only views of the Doc. Use Doc and " + "Token attributes only and remove the following: {attrs}") + E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. " + "Only Doc and Token attributes are supported.") + E182 = ("Received invalid attribute declaration: {attr}\nDid you forget " + "to define the attribute? 
For example: {attr}.???") + E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level " + "attributes are supported, for example: {solution}") + E184 = ("Only attributes without underscores are supported in component " + "attribute declarations (because underscore and non-underscore " + "attributes are connected anyways): {attr} -> {solution}") + E185 = ("Received invalid attribute in component attribute declaration: " + "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.") @add_codes diff --git a/spacy/language.py b/spacy/language.py index 5f0e632ae..a7d1f3a70 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -18,13 +18,8 @@ from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups -from .pipeline import DependencyParser, Tagger -from .pipeline import Tensorizer, EntityRecognizer, EntityLinker -from .pipeline import SimilarityHook, TextCategorizer, Sentencizer -from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens -from .pipeline import EntityRuler -from .pipeline import Morphologizer -from .compat import izip, basestring_, is_python2 +from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs +from .compat import izip, basestring_, is_python2, class_types from .gold import GoldParse from .scorer import Scorer from ._ml import link_vectors_to_models, create_default_optimizer @@ -40,6 +35,9 @@ from . import util from . import about +ENABLE_PIPELINE_ANALYSIS = False + + class BaseDefaults(object): @classmethod def create_lemmatizer(cls, nlp=None, lookups=None): @@ -135,19 +133,6 @@ class Language(object): factories = { "tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp), - "tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg), - "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), - "morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg), - "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), - "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), - "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg), - "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), - "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), - "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg), - "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks, - "merge_entities": lambda nlp, **cfg: merge_entities, - "merge_subtokens": lambda nlp, **cfg: merge_subtokens, - "entity_ruler": lambda nlp, **cfg: EntityRuler(nlp, **cfg), } def __init__( @@ -218,6 +203,7 @@ class Language(object): "name": self.vocab.vectors.name, } self._meta["pipeline"] = self.pipe_names + self._meta["factories"] = self.pipe_factories self._meta["labels"] = self.pipe_labels return self._meta @@ -259,6 +245,17 @@ class Language(object): """ return [pipe_name for pipe_name, _ in self.pipeline] + @property + def pipe_factories(self): + """Get the component factories for the available pipeline components. + + RETURNS (dict): Factory names, keyed by component names. 
+ """ + factories = {} + for pipe_name, pipe in self.pipeline: + factories[pipe_name] = getattr(pipe, "factory", pipe_name) + return factories + @property def pipe_labels(self): """Get the labels set by the pipeline components, if available (if @@ -327,33 +324,30 @@ class Language(object): msg += Errors.E004.format(component=component) raise ValueError(msg) if name is None: - if hasattr(component, "name"): - name = component.name - elif hasattr(component, "__name__"): - name = component.__name__ - elif hasattr(component, "__class__") and hasattr( - component.__class__, "__name__" - ): - name = component.__class__.__name__ - else: - name = repr(component) + name = util.get_component_name(component) if name in self.pipe_names: raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names)) if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: raise ValueError(Errors.E006) + pipe_index = 0 pipe = (name, component) if last or not any([first, before, after]): + pipe_index = len(self.pipeline) self.pipeline.append(pipe) elif first: self.pipeline.insert(0, pipe) elif before and before in self.pipe_names: + pipe_index = self.pipe_names.index(before) self.pipeline.insert(self.pipe_names.index(before), pipe) elif after and after in self.pipe_names: + pipe_index = self.pipe_names.index(after) + 1 self.pipeline.insert(self.pipe_names.index(after) + 1, pipe) else: raise ValueError( Errors.E001.format(name=before or after, opts=self.pipe_names) ) + if ENABLE_PIPELINE_ANALYSIS: + analyze_pipes(self.pipeline, name, component, pipe_index) def has_pipe(self, name): """Check if a component name is present in the pipeline. Equivalent to @@ -382,6 +376,8 @@ class Language(object): msg += Errors.E135.format(name=name) raise ValueError(msg) self.pipeline[self.pipe_names.index(name)] = (name, component) + if ENABLE_PIPELINE_ANALYSIS: + analyze_all_pipes(self.pipeline) def rename_pipe(self, old_name, new_name): """Rename a pipeline component. @@ -408,6 +404,8 @@ class Language(object): """ if name not in self.pipe_names: raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) + if ENABLE_PIPELINE_ANALYSIS: + analyze_all_pipes(self.pipeline) return self.pipeline.pop(self.pipe_names.index(name)) def __call__(self, text, disable=[], component_cfg=None): @@ -1001,6 +999,52 @@ class Language(object): return self +class component(object): + """Decorator for pipeline components. Can decorate both function components + and class components and will automatically register components in the + Language.factories. If the component is a class and needs access to the + nlp object or config parameters, it can expose a from_nlp classmethod + that takes the nlp object and **cfg arguments and returns the initialized + component. + """ + + # NB: This decorator needs to live here, because it needs to write to + # Language.factories. All other solutions would cause circular import. + + def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False): + """Decorate a pipeline component. + + name (unicode): Default component and factory name. + assigns (list): Attributes assigned by component, e.g. `["token.pos"]`. + requires (list): Attributes required by component, e.g. `["token.dep"]`. + retokenizes (bool): Whether the component changes the tokenization. 
+ """ + self.name = name + self.assigns = validate_attrs(assigns) + self.requires = validate_attrs(requires) + self.retokenizes = retokenizes + + def __call__(self, *args, **kwargs): + obj = args[0] + args = args[1:] + factory_name = self.name or util.get_component_name(obj) + obj.name = factory_name + obj.factory = factory_name + obj.assigns = self.assigns + obj.requires = self.requires + obj.retokenizes = self.retokenizes + + def factory(nlp, **cfg): + if hasattr(obj, "from_nlp"): + return obj.from_nlp(nlp, **cfg) + elif isinstance(obj, class_types): + return obj() + return obj + + Language.factories[obj.factory] = factory + return obj + + def _fix_pretrained_vectors_name(nlp): # TODO: Replace this once we handle vectors consistently as static # data diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 6bd6c4ea9..d926b987b 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from collections import defaultdict, OrderedDict import srsly +from ..language import component from ..errors import Errors from ..compat import basestring_ from ..util import ensure_path, to_disk, from_disk @@ -13,6 +14,7 @@ from ..matcher import Matcher, PhraseMatcher DEFAULT_ENT_ID_SEP = "||" +@component("entity_ruler", assigns=["doc.ents", "token.ent_type", "token.ent_iob"]) class EntityRuler(object): """The EntityRuler lets you add spans to the `Doc.ents` using token-based rules or exact phrase matches. It can be combined with the statistical @@ -24,8 +26,6 @@ class EntityRuler(object): USAGE: https://spacy.io/usage/rule-based-matching#entityruler """ - name = "entity_ruler" - def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg): """Initialize the entitiy ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` @@ -69,6 +69,10 @@ class EntityRuler(object): if patterns is not None: self.add_patterns(patterns) + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(nlp, **cfg) + def __len__(self): """The number of all patterns added to the entity ruler.""" n_token_patterns = sum(len(p) for p in self.token_patterns.values()) diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 0f7d94df2..9562dcbdb 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -1,9 +1,15 @@ # coding: utf8 from __future__ import unicode_literals +from ..language import component from ..matcher import Matcher +@component( + "merge_noun_chunks", + requires=["token.dep", "token.tag", "token.pos"], + retokenizes=True, +) def merge_noun_chunks(doc): """Merge noun chunks into a single token. @@ -21,6 +27,11 @@ def merge_noun_chunks(doc): return doc +@component( + "merge_entities", + requires=["doc.ents", "token.ent_iob", "token.ent_type"], + retokenizes=True, +) def merge_entities(doc): """Merge entities into a single token. @@ -36,6 +47,7 @@ def merge_entities(doc): return doc +@component("merge_subtokens", requires=["token.dep"], retokenizes=True) def merge_subtokens(doc, label="subtok"): """Merge subtokens into a single token. 
diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index 38672cde0..b61a34c0e 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -5,9 +5,11 @@ from thinc.t2v import Pooling, max_pool, mean_pool from thinc.neural._classes.difference import Siamese, CauchySimilarity from .pipes import Pipe +from ..language import component from .._ml import link_vectors_to_models +@component("sentencizer_hook", assigns=["doc.user_hooks"]) class SentenceSegmenter(object): """A simple spaCy hook, to allow custom sentence boundary detection logic (that doesn't require the dependency parse). To change the sentence @@ -17,8 +19,6 @@ class SentenceSegmenter(object): and yield `Span` objects for each sentence. """ - name = "sentencizer" - def __init__(self, vocab, strategy=None): self.vocab = vocab if strategy is None or strategy == "on_punct": @@ -44,6 +44,7 @@ class SentenceSegmenter(object): yield doc[start : len(doc)] +@component("similarity", assigns=["doc.user_hooks"]) class SimilarityHook(Pipe): """ Experimental: A pipeline component to install a hook for supervised @@ -58,8 +59,6 @@ class SimilarityHook(Pipe): Where W is a vector of dimension weights, initialized to 1. """ - name = "similarity" - def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index b14e2bec7..72e31f120 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -8,6 +8,7 @@ from thinc.api import chain from thinc.neural.util import to_categorical, copy_array, get_array_module from .. import util from .pipes import Pipe +from ..language import component from .._ml import Tok2Vec, build_morphologizer_model from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import create_default_optimizer @@ -18,9 +19,9 @@ from ..vocab cimport Vocab from ..morphology cimport Morphology +@component("morphologizer", assigns=["token.morph", "token.pos"]) class Morphologizer(Pipe): - name = 'morphologizer' - + @classmethod def Model(cls, **cfg): if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 1f6a517c6..e33c6259b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -13,7 +13,6 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module -from .functions import merge_subtokens from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -21,6 +20,8 @@ from ..syntax.arc_eager cimport ArcEager from ..morphology cimport Morphology from ..vocab cimport Vocab +from .functions import merge_subtokens +from ..language import Language, component from ..syntax import nonproj from ..attrs import POS, ID from ..parts_of_speech import X @@ -54,6 +55,10 @@ class Pipe(object): """Initialize a model for the pipe.""" raise NotImplementedError + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(nlp.vocab, **cfg) + def __init__(self, vocab, model=True, **cfg): """Create a new pipe instance.""" raise NotImplementedError @@ -223,11 +228,10 @@ class Pipe(object): return self +@component("tensorizer", assigns=["doc.tensor"]) class Tensorizer(Pipe): """Pre-train position-sensitive vectors for tokens.""" - name = "tensorizer" - @classmethod def Model(cls, output_size=300, **cfg): """Create a new statistical model for the class. 
@@ -362,14 +366,13 @@ class Tensorizer(Pipe): return sgd +@component("tagger", assigns=["token.tag", "token.pos"]) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. DOCS: https://spacy.io/api/tagger """ - name = "tagger" - def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model @@ -657,13 +660,12 @@ class Tagger(Pipe): return self +@component("nn_labeller") class MultitaskObjective(Tagger): """Experimental: Assist training of a parser or tagger, by training a side-objective. """ - name = "nn_labeller" - def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): self.vocab = vocab self.model = model @@ -898,12 +900,12 @@ class ClozeMultitask(Pipe): losses[self.name] += loss +@component("textcat", assigns=["doc.cats"]) class TextCategorizer(Pipe): """Pipeline component for text classification. DOCS: https://spacy.io/api/textcategorizer """ - name = 'textcat' @classmethod def Model(cls, nr_class=1, **cfg): @@ -1051,8 +1053,11 @@ cdef class DependencyParser(Parser): DOCS: https://spacy.io/api/dependencyparser """ - + # cdef classes can't have decorators, so we're defining this here name = "parser" + factory = "parser" + assigns = ["token.dep", "token.is_sent_start", "doc.sents"] + requires = [] TransitionSystem = ArcEager @property @@ -1097,8 +1102,10 @@ cdef class EntityRecognizer(Parser): DOCS: https://spacy.io/api/entityrecognizer """ - name = "ner" + factory = "ner" + assigns = ["doc.ents", "token.ent_iob", "token.ent_type"] + requires = [] TransitionSystem = BiluoPushDown nr_feature = 6 @@ -1129,12 +1136,16 @@ cdef class EntityRecognizer(Parser): return tuple(sorted(labels)) +@component( + "entity_linker", + requires=["doc.ents", "token.ent_iob", "token.ent_type"], + assigns=["token.ent_kb_id"] +) class EntityLinker(Pipe): """Pipeline component for named entity linking. DOCS: https://spacy.io/api/entitylinker """ - name = 'entity_linker' NIL = "NIL" # string used to refer to a non-existing link @classmethod @@ -1405,13 +1416,13 @@ class EntityLinker(Pipe): raise NotImplementedError +@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"]) class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. DOCS: https://spacy.io/api/sentencizer """ - name = "sentencizer" default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', @@ -1437,6 +1448,10 @@ class Sentencizer(object): else: self.punct_chars = set(self.default_punct_chars) + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(**cfg) + def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. 
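Sketch of a class component using the new from_nlp hook (illustrative; the class name, config key and extension attribute are invented). The generated factory calls from_nlp(nlp, **cfg) when it exists, mirroring the from_nlp classmethods added to Pipe, Parser, EntityRuler and Sentencizer in this patch.

from spacy.language import component

@component("custom_component", assigns=["doc._.custom"])
class CustomComponent(object):
    def __init__(self, nlp, label="CUSTOM"):
        self.label = label

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        # Gives the class access to the nlp object and the factory config.
        return cls(nlp, **cfg)

    def __call__(self, doc):
        return doc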
@@ -1503,4 +1518,9 @@ class Sentencizer(object): return self +# Cython classes can't be decorated, so we need to add the factories here +Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg) +Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg) + + __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"] diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 2c1a5dba2..92168631c 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -128,6 +128,10 @@ cdef class Parser: self._multitasks = [] self._rehearsal_model = None + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(nlp.vocab, **cfg) + def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py new file mode 100644 index 000000000..6e0354b18 --- /dev/null +++ b/spacy/tests/pipeline/test_analysis.py @@ -0,0 +1,146 @@ +# coding: utf8 +from __future__ import unicode_literals + +import spacy.language +from spacy.language import Language, component +from spacy.analysis import print_summary, validate_attrs +from spacy.analysis import get_assigns_for_attr, get_requires_for_attr +from spacy.compat import is_python2 +from mock import Mock, ANY +import pytest + + +def test_component_decorator_function(): + @component(name="test") + def test_component(doc): + """docstring""" + return doc + + assert test_component.name == "test" + if not is_python2: + assert test_component.__doc__ == "docstring" + assert test_component("foo") == "foo" + + +def test_component_decorator_class(): + @component(name="test") + class TestComponent(object): + """docstring1""" + + foo = "bar" + + def __call__(self, doc): + """docstring2""" + return doc + + def custom(self, x): + """docstring3""" + return x + + assert TestComponent.name == "test" + assert TestComponent.foo == "bar" + assert hasattr(TestComponent, "custom") + test_component = TestComponent() + assert test_component.foo == "bar" + assert test_component("foo") == "foo" + assert hasattr(test_component, "custom") + assert test_component.custom("bar") == "bar" + if not is_python2: + assert TestComponent.__doc__ == "docstring1" + assert TestComponent.__call__.__doc__ == "docstring2" + assert TestComponent.custom.__doc__ == "docstring3" + assert test_component.__doc__ == "docstring1" + assert test_component.__call__.__doc__ == "docstring2" + assert test_component.custom.__doc__ == "docstring3" + + +def test_component_decorator_assigns(): + spacy.language.ENABLE_PIPELINE_ANALYSIS = True + + @component("c1", assigns=["token.tag", "doc.tensor"]) + def test_component1(doc): + return doc + + @component( + "c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"] + ) + def test_component2(doc): + return doc + + @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"]) + def test_component3(doc): + return doc + + assert "c1" in Language.factories + assert "c2" in Language.factories + assert "c3" in Language.factories + + nlp = Language() + nlp.add_pipe(test_component1) + with pytest.warns(UserWarning): + nlp.add_pipe(test_component2) + nlp.add_pipe(test_component3) + assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor") + assert [name for name, _ in assigns_tensor] == ["c1", "c2"] + test_component4 = nlp.create_pipe("c1") + assert test_component4.name == "c1" + assert 
test_component4.factory == "c1" + nlp.add_pipe(test_component4, name="c4") + assert nlp.pipe_names == ["c1", "c2", "c3", "c4"] + assert "c4" not in Language.factories + assert nlp.pipe_factories["c1"] == "c1" + assert nlp.pipe_factories["c4"] == "c1" + assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor") + assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"] + requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos") + assert [name for name, _ in requires_pos] == ["c2"] + assert print_summary(nlp, no_print=True) + assert nlp("hello world") + + +def test_component_factories_from_nlp(): + """Test that class components can implement a from_nlp classmethod that + gives them access to the nlp object and config via the factory.""" + + class TestComponent5(object): + def __call__(self, doc): + return doc + + mock = Mock() + mock.return_value = TestComponent5() + TestComponent5.from_nlp = classmethod(mock) + TestComponent5 = component("c5")(TestComponent5) + + assert "c5" in Language.factories + nlp = Language() + pipe = nlp.create_pipe("c5", config={"foo": "bar"}) + nlp.add_pipe(pipe) + assert nlp("hello world") + # The first argument here is the class itself, so we're accepting any here + mock.assert_called_once_with(ANY, nlp, foo="bar") + + +def test_analysis_validate_attrs_valid(): + attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz"] + assert validate_attrs(attrs) + for attr in attrs: + assert validate_attrs([attr]) + with pytest.raises(ValueError): + validate_attrs(["doc.sents", "doc.xyz"]) + + +@pytest.mark.parametrize( + "attr", + [ + "doc", + "doc_ents", + "doc.xyz", + "token.xyz", + "token.tag_", + "token.tag.xyz", + "token._.xyz.abc", + ], +) +def test_analysis_validate_attrs_invalid(attr): + with pytest.raises(ValueError): + validate_attrs([attr]) diff --git a/spacy/util.py b/spacy/util.py index b59ad6fef..ffc25fb9d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -247,6 +247,7 @@ def load_model_from_path(model_path, meta=False, **overrides): cls = get_lang_class(lang) nlp = cls(meta=meta, **overrides) pipeline = meta.get("pipeline", []) + factories = meta.get("factories", {}) disable = overrides.get("disable", []) if pipeline is True: pipeline = nlp.Defaults.pipe_names @@ -255,7 +256,8 @@ def load_model_from_path(model_path, meta=False, **overrides): for name in pipeline: if name not in disable: config = meta.get("pipeline_args", {}).get(name, {}) - component = nlp.create_pipe(name, config=config) + factory = factories.get(name, name) + component = nlp.create_pipe(factory, config=config) nlp.add_pipe(component, name=name) return nlp.from_disk(model_path) @@ -368,6 +370,16 @@ def is_in_jupyter(): return False +def get_component_name(component): + if hasattr(component, "name"): + return component.name + if hasattr(component, "__name__"): + return component.__name__ + if hasattr(component, "__class__") and hasattr(component.__class__, "__name__"): + return component.__class__.__name__ + return repr(component) + + def get_cuda_stream(require=False): if CudaStream is None: return None
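A rough end-to-end sketch of the analysis flow, based on the tests above (the component names "c1" and "c2" mirror the test fixtures; ENABLE_PIPELINE_ANALYSIS is the opt-in flag added in spacy/language.py):

import spacy.language
from spacy.language import Language, component
from spacy.analysis import print_summary

spacy.language.ENABLE_PIPELINE_ANALYSIS = True

@component("c1", assigns=["token.tag"])
def c1(doc):
    return doc

@component("c2", requires=["token.pos"])
def c2(doc):
    return doc

nlp = Language()
nlp.add_pipe(c1)
nlp.add_pipe(c2)    # warns (W025): nothing earlier in the pipeline assigns token.pos
print_summary(nlp)  # prints the "Pipeline Overview" table and any problems found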
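The declaration format that validate_attrs enforces, with examples of the new error codes (illustrative; "my_attr" stands in for any custom extension name):

from spacy.analysis import validate_attrs

validate_attrs(["token.pos", "doc.ents", "token._.my_attr"])  # accepted
# Rejected with the errors added in this patch:
# validate_attrs(["span.label"])   # E180: Span attributes aren't supported
# validate_attrs(["token.pos_"])   # E184: use "token.pos" instead of "token.pos_"
# validate_attrs(["doc.xyz"])      # E185: "xyz" doesn't exist on Doc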
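Why the meta now records factories (sketch; "my_ruler" is an invented pipe name): a component added under a custom name keeps a reference to the factory that created it, so a saved model can be reloaded even when the pipe name no longer matches a factory name.

from spacy.language import Language

nlp = Language()
ruler = nlp.create_pipe("entity_ruler")
nlp.add_pipe(ruler, name="my_ruler")
assert nlp.pipe_factories["my_ruler"] == "entity_ruler"
# meta["factories"] stores this mapping, and load_model_from_path() now calls
# nlp.create_pipe() with the factory name ("entity_ruler") before adding the
# component back to the pipeline under its original name ("my_ruler").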