From a9c610404798b9a3d45a84918212cab9d053c40d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 27 Oct 2019 13:35:49 +0100 Subject: [PATCH] Component decorator and component analysis (#4517) * Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors --- requirements.txt | 2 +- setup.cfg | 2 +- spacy/__init__.py | 2 + spacy/analysis.py | 176 ++++++++++++++++++++++++++ spacy/compat.py | 3 + spacy/errors.py | 16 +++ spacy/language.py | 104 ++++++++++----- spacy/pipeline/entityruler.py | 8 +- spacy/pipeline/functions.py | 12 ++ spacy/pipeline/hooks.py | 7 +- spacy/pipeline/morphologizer.pyx | 5 +- spacy/pipeline/pipes.pyx | 44 +++++-- spacy/syntax/nn_parser.pyx | 4 + spacy/tests/pipeline/test_analysis.py | 146 +++++++++++++++++++++ spacy/util.py | 14 +- 15 files changed, 492 insertions(+), 53 deletions(-) create mode 100644 spacy/analysis.py create mode 100644 spacy/tests/pipeline/test_analysis.py diff --git a/requirements.txt b/requirements.txt index 68e29f6ab..6d76c7233 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0 thinc>=7.2.0,<7.3.0 blis>=0.4.0,<0.5.0 murmurhash>=0.28.0,<1.1.0 -wasabi>=0.2.0,<1.1.0 +wasabi>=0.3.0,<1.1.0 srsly>=0.1.0,<1.1.0 # Third party dependencies numpy>=1.15.0 diff --git a/setup.cfg b/setup.cfg index 796f4176a..2d4a06c2b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -49,7 +49,7 @@ install_requires = blis>=0.4.0,<0.5.0 plac>=0.9.6,<1.2.0 requests>=2.13.0,<3.0.0 - wasabi>=0.2.0,<1.1.0 + wasabi>=0.3.0,<1.1.0 srsly>=0.1.0,<1.1.0 pathlib==1.0.1; python_version < "3.4" importlib_metadata>=0.20; python_version < "3.8" diff --git a/spacy/__init__.py b/spacy/__init__.py index 8930b1d4e..57701179f 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -9,12 +9,14 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # These are imported as part of the API from thinc.neural.util import prefer_gpu, require_gpu +from . import pipeline from .cli.info import info as cli_info from .glossary import explain from .about import __version__ from .errors import Errors, Warnings, deprecation_warning from . 
import util from .util import register_architecture, get_architecture +from .language import component if sys.maxunicode == 65535: diff --git a/spacy/analysis.py b/spacy/analysis.py new file mode 100644 index 000000000..49f67fd83 --- /dev/null +++ b/spacy/analysis.py @@ -0,0 +1,176 @@ +# coding: utf8 +from __future__ import unicode_literals + +from collections import OrderedDict +from wasabi import Printer + +from .tokens import Doc, Token +from .errors import Errors, Warnings, user_warning + + +def analyze_pipes(pipeline, name, pipe, index, warn=True): + """Analyze a pipeline component with respect to its position in the current + pipeline and the other components. Will check whether requirements are + fulfilled (e.g. if previous components assign the attributes). + + pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. + name (unicode): The name of the pipeline component to analyze. + pipe (callable): The pipeline component function to analyze. + index (int): The index of the component in the pipeline. + warn (bool): Show user warning if problem is found. + RETURNS (list): The problems found for the given pipeline component. + """ + assert pipeline[index][0] == name + prev_pipes = pipeline[:index] + pipe_requires = getattr(pipe, "requires", []) + requires = OrderedDict([(annot, False) for annot in pipe_requires]) + if requires: + for prev_name, prev_pipe in prev_pipes: + prev_assigns = getattr(prev_pipe, "assigns", []) + for annot in prev_assigns: + requires[annot] = True + problems = [] + for annot, fulfilled in requires.items(): + if not fulfilled: + problems.append(annot) + if warn: + user_warning(Warnings.W025.format(name=name, attr=annot)) + return problems + + +def analyze_all_pipes(pipeline, warn=True): + """Analyze all pipes in the pipeline in order. + + pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. + warn (bool): Show user warning if problem is found. + RETURNS (dict): The problems found, keyed by component name. + """ + problems = {} + for i, (name, pipe) in enumerate(pipeline): + problems[name] = analyze_pipes(pipeline, name, pipe, i, warn=warn) + return problems + + +def dot_to_dict(values): + """Convert dot notation to a dict. For example: ["token.pos", "token._.xyz"] + become {"token": {"pos": True, "_": {"xyz": True }}}. + + values (iterable): The values to convert. + RETURNS (dict): The converted values. + """ + result = {} + for value in values: + path = result + parts = value.lower().split(".") + for i, item in enumerate(parts): + is_last = i == len(parts) - 1 + path = path.setdefault(item, True if is_last else {}) + return result + + +def validate_attrs(values): + """Validate component attributes provided to "assigns", "requires" etc. + Raises error for invalid attributes and formatting. Doesn't check if + custom extension attributes are registered, since this is something the + user might want to do themselves later in the component. + + values (iterable): The string attributes to check, e.g. `["token.pos"]`. + RETURNS (iterable): The checked attributes. 
+ """ + data = dot_to_dict(values) + objs = {"doc": Doc, "token": Token} + for obj_key, attrs in data.items(): + if obj_key not in objs: # first element is not doc/token + if obj_key == "span": + span_attrs = [attr for attr in values if attr.startswith("span.")] + raise ValueError(Errors.E180.format(attrs=", ".join(span_attrs))) + invalid_attrs = ", ".join(a for a in values if a.startswith(obj_key)) + raise ValueError(Errors.E181.format(obj=obj_key, attrs=invalid_attrs)) + if not isinstance(attrs, dict): # attr is something like "doc" + raise ValueError(Errors.E182.format(attr=obj_key)) + for attr, value in attrs.items(): + if attr == "_": + if value is True: # attr is something like "doc._" + raise ValueError(Errors.E182.format(attr="{}._".format(obj_key))) + for ext_attr, ext_value in value.items(): + # We don't check whether the attribute actually exists + if ext_value is not True: # attr is something like doc._.x.y + good = "{}._.{}".format(obj_key, ext_attr) + bad = "{}.{}".format(good, ".".join(ext_value)) + raise ValueError(Errors.E183.format(attr=bad, solution=good)) + continue # we can't validate those further + if attr.endswith("_"): # attr is something like "token.pos_" + raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1])) + if value is not True: # attr is something like doc.x.y + good = "{}.{}".format(obj_key, attr) + bad = "{}.{}".format(good, ".".join(value)) + raise ValueError(Errors.E183.format(attr=bad, solution=good)) + obj = objs[obj_key] + if not hasattr(obj, attr): + raise ValueError(Errors.E185.format(obj=obj_key, attr=attr)) + return values + + +def _get_feature_for_attr(pipeline, attr, feature): + assert feature in ["assigns", "requires"] + result = [] + for pipe_name, pipe in pipeline: + pipe_assigns = getattr(pipe, feature, []) + if attr in pipe_assigns: + result.append((pipe_name, pipe)) + return result + + +def get_assigns_for_attr(pipeline, attr): + """Get all pipeline components that assign an attr, e.g. "doc.tensor". + + pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. + attr (unicode): The attribute to check. + RETURNS (list): (name, pipeline) tuples of components that assign the attr. + """ + return _get_feature_for_attr(pipeline, attr, "assigns") + + +def get_requires_for_attr(pipeline, attr): + """Get all pipeline components that require an attr, e.g. "doc.tensor". + + pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. + attr (unicode): The attribute to check. + RETURNS (list): (name, pipeline) tuples of components that require the attr. + """ + return _get_feature_for_attr(pipeline, attr, "requires") + + +def print_summary(nlp, pretty=True, no_print=False): + """Print a formatted summary for the current nlp object's pipeline. Shows + a table with the pipeline components and why they assign and require, as + well as any problems if available. + + nlp (Language): The nlp object. + pretty (bool): Pretty-print the results (color etc). + no_print (bool): Don't print anything, just return the data. + RETURNS (dict): A dict with "overview" and "problems". 
+ """ + msg = Printer(pretty=pretty, no_print=no_print) + overview = [] + problems = {} + for i, (name, pipe) in enumerate(nlp.pipeline): + requires = getattr(pipe, "requires", []) + assigns = getattr(pipe, "assigns", []) + retok = getattr(pipe, "retokenizes", False) + overview.append((i, name, requires, assigns, retok)) + problems[name] = analyze_pipes(nlp.pipeline, name, pipe, i, warn=False) + msg.divider("Pipeline Overview") + header = ("#", "Component", "Requires", "Assigns", "Retokenizes") + msg.table(overview, header=header, divider=True, multiline=True) + n_problems = sum(len(p) for p in problems.values()) + if any(p for p in problems.values()): + msg.divider("Problems ({})".format(n_problems)) + for name, problem in problems.items(): + if problem: + problem = ", ".join(problem) + msg.warn("'{}' requirements not met: {}".format(name, problem)) + else: + msg.good("No problems found.") + if no_print: + return {"overview": overview, "problems": problems} diff --git a/spacy/compat.py b/spacy/compat.py index 3a19e9423..5bff28815 100644 --- a/spacy/compat.py +++ b/spacy/compat.py @@ -12,6 +12,7 @@ import os import sys import itertools import ast +import types from thinc.neural.util import copy_array @@ -67,6 +68,7 @@ if is_python2: basestring_ = basestring # noqa: F821 input_ = raw_input # noqa: F821 path2str = lambda path: str(path).decode("utf8") + class_types = (type, types.ClassType) elif is_python3: bytes_ = bytes @@ -74,6 +76,7 @@ elif is_python3: basestring_ = str input_ = input path2str = lambda path: str(path) + class_types = (type, types.ClassType) if is_python_pre_3_5 else type def b_to_str(b_str): diff --git a/spacy/errors.py b/spacy/errors.py index ddf14585b..f342b2271 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -99,6 +99,8 @@ class Warnings(object): "'n_process' will be set to 1.") W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in " "the Knowledge Base.") + W025 = ("'{name}' requires '{attr}' to be assigned, but none of the " + "previous components in the pipeline declare that they assign it.") @add_codes @@ -511,6 +513,20 @@ class Errors(object): E179 = ("Invalid pattern. Expected a list of Doc objects but got a single " "Doc. If you only want to add one pattern, make sure to wrap it " "in a list. For example: matcher.add('{key}', [doc])") + E180 = ("Span attributes can't be declared as required or assigned by " + "components, since spans are only views of the Doc. Use Doc and " + "Token attributes only and remove the following: {attrs}") + E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. " + "Only Doc and Token attributes are supported.") + E182 = ("Received invalid attribute declaration: {attr}\nDid you forget " + "to define the attribute? 
For example: {attr}.???") + E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level " + "attributes are supported, for example: {solution}") + E184 = ("Only attributes without underscores are supported in component " + "attribute declarations (because underscore and non-underscore " + "attributes are connected anyways): {attr} -> {solution}") + E185 = ("Received invalid attribute in component attribute declaration: " + "{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.") @add_codes diff --git a/spacy/language.py b/spacy/language.py index 5f0e632ae..a7d1f3a70 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -18,13 +18,8 @@ from .tokenizer import Tokenizer from .vocab import Vocab from .lemmatizer import Lemmatizer from .lookups import Lookups -from .pipeline import DependencyParser, Tagger -from .pipeline import Tensorizer, EntityRecognizer, EntityLinker -from .pipeline import SimilarityHook, TextCategorizer, Sentencizer -from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens -from .pipeline import EntityRuler -from .pipeline import Morphologizer -from .compat import izip, basestring_, is_python2 +from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs +from .compat import izip, basestring_, is_python2, class_types from .gold import GoldParse from .scorer import Scorer from ._ml import link_vectors_to_models, create_default_optimizer @@ -40,6 +35,9 @@ from . import util from . import about +ENABLE_PIPELINE_ANALYSIS = False + + class BaseDefaults(object): @classmethod def create_lemmatizer(cls, nlp=None, lookups=None): @@ -135,19 +133,6 @@ class Language(object): factories = { "tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp), - "tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg), - "tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg), - "morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg), - "parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg), - "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg), - "entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg), - "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg), - "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg), - "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg), - "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks, - "merge_entities": lambda nlp, **cfg: merge_entities, - "merge_subtokens": lambda nlp, **cfg: merge_subtokens, - "entity_ruler": lambda nlp, **cfg: EntityRuler(nlp, **cfg), } def __init__( @@ -218,6 +203,7 @@ class Language(object): "name": self.vocab.vectors.name, } self._meta["pipeline"] = self.pipe_names + self._meta["factories"] = self.pipe_factories self._meta["labels"] = self.pipe_labels return self._meta @@ -259,6 +245,17 @@ class Language(object): """ return [pipe_name for pipe_name, _ in self.pipeline] + @property + def pipe_factories(self): + """Get the component factories for the available pipeline components. + + RETURNS (dict): Factory names, keyed by component names. 
+ """ + factories = {} + for pipe_name, pipe in self.pipeline: + factories[pipe_name] = getattr(pipe, "factory", pipe_name) + return factories + @property def pipe_labels(self): """Get the labels set by the pipeline components, if available (if @@ -327,33 +324,30 @@ class Language(object): msg += Errors.E004.format(component=component) raise ValueError(msg) if name is None: - if hasattr(component, "name"): - name = component.name - elif hasattr(component, "__name__"): - name = component.__name__ - elif hasattr(component, "__class__") and hasattr( - component.__class__, "__name__" - ): - name = component.__class__.__name__ - else: - name = repr(component) + name = util.get_component_name(component) if name in self.pipe_names: raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names)) if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: raise ValueError(Errors.E006) + pipe_index = 0 pipe = (name, component) if last or not any([first, before, after]): + pipe_index = len(self.pipeline) self.pipeline.append(pipe) elif first: self.pipeline.insert(0, pipe) elif before and before in self.pipe_names: + pipe_index = self.pipe_names.index(before) self.pipeline.insert(self.pipe_names.index(before), pipe) elif after and after in self.pipe_names: + pipe_index = self.pipe_names.index(after) + 1 self.pipeline.insert(self.pipe_names.index(after) + 1, pipe) else: raise ValueError( Errors.E001.format(name=before or after, opts=self.pipe_names) ) + if ENABLE_PIPELINE_ANALYSIS: + analyze_pipes(self.pipeline, name, component, pipe_index) def has_pipe(self, name): """Check if a component name is present in the pipeline. Equivalent to @@ -382,6 +376,8 @@ class Language(object): msg += Errors.E135.format(name=name) raise ValueError(msg) self.pipeline[self.pipe_names.index(name)] = (name, component) + if ENABLE_PIPELINE_ANALYSIS: + analyze_all_pipes(self.pipeline) def rename_pipe(self, old_name, new_name): """Rename a pipeline component. @@ -408,6 +404,8 @@ class Language(object): """ if name not in self.pipe_names: raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) + if ENABLE_PIPELINE_ANALYSIS: + analyze_all_pipes(self.pipeline) return self.pipeline.pop(self.pipe_names.index(name)) def __call__(self, text, disable=[], component_cfg=None): @@ -1001,6 +999,52 @@ class Language(object): return self +class component(object): + """Decorator for pipeline components. Can decorate both function components + and class components and will automatically register components in the + Language.factories. If the component is a class and needs access to the + nlp object or config parameters, it can expose a from_nlp classmethod + that takes the nlp object and **cfg arguments and returns the initialized + component. + """ + + # NB: This decorator needs to live here, because it needs to write to + # Language.factories. All other solutions would cause circular import. + + def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False): + """Decorate a pipeline component. + + name (unicode): Default component and factory name. + assigns (list): Attributes assigned by component, e.g. `["token.pos"]`. + requires (list): Attributes required by component, e.g. `["token.dep"]`. + retokenizes (bool): Whether the component changes the tokenization. 
+ """ + self.name = name + self.assigns = validate_attrs(assigns) + self.requires = validate_attrs(requires) + self.retokenizes = retokenizes + + def __call__(self, *args, **kwargs): + obj = args[0] + args = args[1:] + factory_name = self.name or util.get_component_name(obj) + obj.name = factory_name + obj.factory = factory_name + obj.assigns = self.assigns + obj.requires = self.requires + obj.retokenizes = self.retokenizes + + def factory(nlp, **cfg): + if hasattr(obj, "from_nlp"): + return obj.from_nlp(nlp, **cfg) + elif isinstance(obj, class_types): + return obj() + return obj + + Language.factories[obj.factory] = factory + return obj + + def _fix_pretrained_vectors_name(nlp): # TODO: Replace this once we handle vectors consistently as static # data diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 6bd6c4ea9..d926b987b 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals from collections import defaultdict, OrderedDict import srsly +from ..language import component from ..errors import Errors from ..compat import basestring_ from ..util import ensure_path, to_disk, from_disk @@ -13,6 +14,7 @@ from ..matcher import Matcher, PhraseMatcher DEFAULT_ENT_ID_SEP = "||" +@component("entity_ruler", assigns=["doc.ents", "token.ent_type", "token.ent_iob"]) class EntityRuler(object): """The EntityRuler lets you add spans to the `Doc.ents` using token-based rules or exact phrase matches. It can be combined with the statistical @@ -24,8 +26,6 @@ class EntityRuler(object): USAGE: https://spacy.io/usage/rule-based-matching#entityruler """ - name = "entity_ruler" - def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg): """Initialize the entitiy ruler. If patterns are supplied here, they need to be a list of dictionaries with a `"label"` and `"pattern"` @@ -69,6 +69,10 @@ class EntityRuler(object): if patterns is not None: self.add_patterns(patterns) + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(nlp, **cfg) + def __len__(self): """The number of all patterns added to the entity ruler.""" n_token_patterns = sum(len(p) for p in self.token_patterns.values()) diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py index 0f7d94df2..9562dcbdb 100644 --- a/spacy/pipeline/functions.py +++ b/spacy/pipeline/functions.py @@ -1,9 +1,15 @@ # coding: utf8 from __future__ import unicode_literals +from ..language import component from ..matcher import Matcher +@component( + "merge_noun_chunks", + requires=["token.dep", "token.tag", "token.pos"], + retokenizes=True, +) def merge_noun_chunks(doc): """Merge noun chunks into a single token. @@ -21,6 +27,11 @@ def merge_noun_chunks(doc): return doc +@component( + "merge_entities", + requires=["doc.ents", "token.ent_iob", "token.ent_type"], + retokenizes=True, +) def merge_entities(doc): """Merge entities into a single token. @@ -36,6 +47,7 @@ def merge_entities(doc): return doc +@component("merge_subtokens", requires=["token.dep"], retokenizes=True) def merge_subtokens(doc, label="subtok"): """Merge subtokens into a single token. 
diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py index 38672cde0..b61a34c0e 100644 --- a/spacy/pipeline/hooks.py +++ b/spacy/pipeline/hooks.py @@ -5,9 +5,11 @@ from thinc.t2v import Pooling, max_pool, mean_pool from thinc.neural._classes.difference import Siamese, CauchySimilarity from .pipes import Pipe +from ..language import component from .._ml import link_vectors_to_models +@component("sentencizer_hook", assigns=["doc.user_hooks"]) class SentenceSegmenter(object): """A simple spaCy hook, to allow custom sentence boundary detection logic (that doesn't require the dependency parse). To change the sentence @@ -17,8 +19,6 @@ class SentenceSegmenter(object): and yield `Span` objects for each sentence. """ - name = "sentencizer" - def __init__(self, vocab, strategy=None): self.vocab = vocab if strategy is None or strategy == "on_punct": @@ -44,6 +44,7 @@ class SentenceSegmenter(object): yield doc[start : len(doc)] +@component("similarity", assigns=["doc.user_hooks"]) class SimilarityHook(Pipe): """ Experimental: A pipeline component to install a hook for supervised @@ -58,8 +59,6 @@ class SimilarityHook(Pipe): Where W is a vector of dimension weights, initialized to 1. """ - name = "similarity" - def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index b14e2bec7..72e31f120 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -8,6 +8,7 @@ from thinc.api import chain from thinc.neural.util import to_categorical, copy_array, get_array_module from .. import util from .pipes import Pipe +from ..language import component from .._ml import Tok2Vec, build_morphologizer_model from .._ml import link_vectors_to_models, zero_init, flatten from .._ml import create_default_optimizer @@ -18,9 +19,9 @@ from ..vocab cimport Vocab from ..morphology cimport Morphology +@component("morphologizer", assigns=["token.morph", "token.pos"]) class Morphologizer(Pipe): - name = 'morphologizer' - + @classmethod def Model(cls, **cfg): if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'): diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx index 1f6a517c6..e33c6259b 100644 --- a/spacy/pipeline/pipes.pyx +++ b/spacy/pipeline/pipes.pyx @@ -13,7 +13,6 @@ from thinc.misc import LayerNorm from thinc.neural.util import to_categorical from thinc.neural.util import get_array_module -from .functions import merge_subtokens from ..tokens.doc cimport Doc from ..syntax.nn_parser cimport Parser from ..syntax.ner cimport BiluoPushDown @@ -21,6 +20,8 @@ from ..syntax.arc_eager cimport ArcEager from ..morphology cimport Morphology from ..vocab cimport Vocab +from .functions import merge_subtokens +from ..language import Language, component from ..syntax import nonproj from ..attrs import POS, ID from ..parts_of_speech import X @@ -54,6 +55,10 @@ class Pipe(object): """Initialize a model for the pipe.""" raise NotImplementedError + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(nlp.vocab, **cfg) + def __init__(self, vocab, model=True, **cfg): """Create a new pipe instance.""" raise NotImplementedError @@ -223,11 +228,10 @@ class Pipe(object): return self +@component("tensorizer", assigns=["doc.tensor"]) class Tensorizer(Pipe): """Pre-train position-sensitive vectors for tokens.""" - name = "tensorizer" - @classmethod def Model(cls, output_size=300, **cfg): """Create a new statistical model for the class. 
@@ -362,14 +366,13 @@ class Tensorizer(Pipe): return sgd +@component("tagger", assigns=["token.tag", "token.pos"]) class Tagger(Pipe): """Pipeline component for part-of-speech tagging. DOCS: https://spacy.io/api/tagger """ - name = "tagger" - def __init__(self, vocab, model=True, **cfg): self.vocab = vocab self.model = model @@ -657,13 +660,12 @@ class Tagger(Pipe): return self +@component("nn_labeller") class MultitaskObjective(Tagger): """Experimental: Assist training of a parser or tagger, by training a side-objective. """ - name = "nn_labeller" - def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg): self.vocab = vocab self.model = model @@ -898,12 +900,12 @@ class ClozeMultitask(Pipe): losses[self.name] += loss +@component("textcat", assigns=["doc.cats"]) class TextCategorizer(Pipe): """Pipeline component for text classification. DOCS: https://spacy.io/api/textcategorizer """ - name = 'textcat' @classmethod def Model(cls, nr_class=1, **cfg): @@ -1051,8 +1053,11 @@ cdef class DependencyParser(Parser): DOCS: https://spacy.io/api/dependencyparser """ - + # cdef classes can't have decorators, so we're defining this here name = "parser" + factory = "parser" + assigns = ["token.dep", "token.is_sent_start", "doc.sents"] + requires = [] TransitionSystem = ArcEager @property @@ -1097,8 +1102,10 @@ cdef class EntityRecognizer(Parser): DOCS: https://spacy.io/api/entityrecognizer """ - name = "ner" + factory = "ner" + assigns = ["doc.ents", "token.ent_iob", "token.ent_type"] + requires = [] TransitionSystem = BiluoPushDown nr_feature = 6 @@ -1129,12 +1136,16 @@ cdef class EntityRecognizer(Parser): return tuple(sorted(labels)) +@component( + "entity_linker", + requires=["doc.ents", "token.ent_iob", "token.ent_type"], + assigns=["token.ent_kb_id"] +) class EntityLinker(Pipe): """Pipeline component for named entity linking. DOCS: https://spacy.io/api/entitylinker """ - name = 'entity_linker' NIL = "NIL" # string used to refer to a non-existing link @classmethod @@ -1405,13 +1416,13 @@ class EntityLinker(Pipe): raise NotImplementedError +@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"]) class Sentencizer(object): """Segment the Doc into sentences using a rule-based strategy. DOCS: https://spacy.io/api/sentencizer """ - name = "sentencizer" default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', @@ -1437,6 +1448,10 @@ class Sentencizer(object): else: self.punct_chars = set(self.default_punct_chars) + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(**cfg) + def __call__(self, doc): """Apply the sentencizer to a Doc and set Token.is_sent_start. 
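Sketch of a class component using the new from_nlp hook (illustrative; the class name, config key and extension attribute are invented). The generated factory calls from_nlp(nlp, **cfg) when it exists, mirroring the from_nlp classmethods added to Pipe, Parser, EntityRuler and Sentencizer in this patch.

from spacy.language import component

@component("custom_component", assigns=["doc._.custom"])
class CustomComponent(object):
    def __init__(self, nlp, label="CUSTOM"):
        self.label = label

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        # Gives the class access to the nlp object and the factory config.
        return cls(nlp, **cfg)

    def __call__(self, doc):
        return doc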
@@ -1503,4 +1518,9 @@ class Sentencizer(object): return self +# Cython classes can't be decorated, so we need to add the factories here +Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg) +Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg) + + __all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"] diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index 2c1a5dba2..92168631c 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -128,6 +128,10 @@ cdef class Parser: self._multitasks = [] self._rehearsal_model = None + @classmethod + def from_nlp(cls, nlp, **cfg): + return cls(nlp.vocab, **cfg) + def __reduce__(self): return (Parser, (self.vocab, self.moves, self.model), None, None) diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py new file mode 100644 index 000000000..6e0354b18 --- /dev/null +++ b/spacy/tests/pipeline/test_analysis.py @@ -0,0 +1,146 @@ +# coding: utf8 +from __future__ import unicode_literals + +import spacy.language +from spacy.language import Language, component +from spacy.analysis import print_summary, validate_attrs +from spacy.analysis import get_assigns_for_attr, get_requires_for_attr +from spacy.compat import is_python2 +from mock import Mock, ANY +import pytest + + +def test_component_decorator_function(): + @component(name="test") + def test_component(doc): + """docstring""" + return doc + + assert test_component.name == "test" + if not is_python2: + assert test_component.__doc__ == "docstring" + assert test_component("foo") == "foo" + + +def test_component_decorator_class(): + @component(name="test") + class TestComponent(object): + """docstring1""" + + foo = "bar" + + def __call__(self, doc): + """docstring2""" + return doc + + def custom(self, x): + """docstring3""" + return x + + assert TestComponent.name == "test" + assert TestComponent.foo == "bar" + assert hasattr(TestComponent, "custom") + test_component = TestComponent() + assert test_component.foo == "bar" + assert test_component("foo") == "foo" + assert hasattr(test_component, "custom") + assert test_component.custom("bar") == "bar" + if not is_python2: + assert TestComponent.__doc__ == "docstring1" + assert TestComponent.__call__.__doc__ == "docstring2" + assert TestComponent.custom.__doc__ == "docstring3" + assert test_component.__doc__ == "docstring1" + assert test_component.__call__.__doc__ == "docstring2" + assert test_component.custom.__doc__ == "docstring3" + + +def test_component_decorator_assigns(): + spacy.language.ENABLE_PIPELINE_ANALYSIS = True + + @component("c1", assigns=["token.tag", "doc.tensor"]) + def test_component1(doc): + return doc + + @component( + "c2", requires=["token.tag", "token.pos"], assigns=["token.lemma", "doc.tensor"] + ) + def test_component2(doc): + return doc + + @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"]) + def test_component3(doc): + return doc + + assert "c1" in Language.factories + assert "c2" in Language.factories + assert "c3" in Language.factories + + nlp = Language() + nlp.add_pipe(test_component1) + with pytest.warns(UserWarning): + nlp.add_pipe(test_component2) + nlp.add_pipe(test_component3) + assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor") + assert [name for name, _ in assigns_tensor] == ["c1", "c2"] + test_component4 = nlp.create_pipe("c1") + assert test_component4.name == "c1" + assert 
test_component4.factory == "c1" + nlp.add_pipe(test_component4, name="c4") + assert nlp.pipe_names == ["c1", "c2", "c3", "c4"] + assert "c4" not in Language.factories + assert nlp.pipe_factories["c1"] == "c1" + assert nlp.pipe_factories["c4"] == "c1" + assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor") + assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"] + requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos") + assert [name for name, _ in requires_pos] == ["c2"] + assert print_summary(nlp, no_print=True) + assert nlp("hello world") + + +def test_component_factories_from_nlp(): + """Test that class components can implement a from_nlp classmethod that + gives them access to the nlp object and config via the factory.""" + + class TestComponent5(object): + def __call__(self, doc): + return doc + + mock = Mock() + mock.return_value = TestComponent5() + TestComponent5.from_nlp = classmethod(mock) + TestComponent5 = component("c5")(TestComponent5) + + assert "c5" in Language.factories + nlp = Language() + pipe = nlp.create_pipe("c5", config={"foo": "bar"}) + nlp.add_pipe(pipe) + assert nlp("hello world") + # The first argument here is the class itself, so we're accepting any here + mock.assert_called_once_with(ANY, nlp, foo="bar") + + +def test_analysis_validate_attrs_valid(): + attrs = ["doc.sents", "doc.ents", "token.tag", "token._.xyz"] + assert validate_attrs(attrs) + for attr in attrs: + assert validate_attrs([attr]) + with pytest.raises(ValueError): + validate_attrs(["doc.sents", "doc.xyz"]) + + +@pytest.mark.parametrize( + "attr", + [ + "doc", + "doc_ents", + "doc.xyz", + "token.xyz", + "token.tag_", + "token.tag.xyz", + "token._.xyz.abc", + ], +) +def test_analysis_validate_attrs_invalid(attr): + with pytest.raises(ValueError): + validate_attrs([attr]) diff --git a/spacy/util.py b/spacy/util.py index b59ad6fef..ffc25fb9d 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -247,6 +247,7 @@ def load_model_from_path(model_path, meta=False, **overrides): cls = get_lang_class(lang) nlp = cls(meta=meta, **overrides) pipeline = meta.get("pipeline", []) + factories = meta.get("factories", {}) disable = overrides.get("disable", []) if pipeline is True: pipeline = nlp.Defaults.pipe_names @@ -255,7 +256,8 @@ def load_model_from_path(model_path, meta=False, **overrides): for name in pipeline: if name not in disable: config = meta.get("pipeline_args", {}).get(name, {}) - component = nlp.create_pipe(name, config=config) + factory = factories.get(name, name) + component = nlp.create_pipe(factory, config=config) nlp.add_pipe(component, name=name) return nlp.from_disk(model_path) @@ -368,6 +370,16 @@ def is_in_jupyter(): return False +def get_component_name(component): + if hasattr(component, "name"): + return component.name + if hasattr(component, "__name__"): + return component.__name__ + if hasattr(component, "__class__") and hasattr(component.__class__, "__name__"): + return component.__class__.__name__ + return repr(component) + + def get_cuda_stream(require=False): if CudaStream is None: return None
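A rough end-to-end sketch of the analysis flow, based on the tests above (the component names "c1" and "c2" mirror the test fixtures; ENABLE_PIPELINE_ANALYSIS is the opt-in flag added in spacy/language.py):

import spacy.language
from spacy.language import Language, component
from spacy.analysis import print_summary

spacy.language.ENABLE_PIPELINE_ANALYSIS = True

@component("c1", assigns=["token.tag"])
def c1(doc):
    return doc

@component("c2", requires=["token.pos"])
def c2(doc):
    return doc

nlp = Language()
nlp.add_pipe(c1)
nlp.add_pipe(c2)    # warns (W025): nothing earlier in the pipeline assigns token.pos
print_summary(nlp)  # prints the "Pipeline Overview" table and any problems found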
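The declaration format that validate_attrs enforces, with examples of the new error codes (illustrative; "my_attr" stands in for any custom extension name):

from spacy.analysis import validate_attrs

validate_attrs(["token.pos", "doc.ents", "token._.my_attr"])  # accepted
# Rejected with the errors added in this patch:
# validate_attrs(["span.label"])   # E180: Span attributes aren't supported
# validate_attrs(["token.pos_"])   # E184: use "token.pos" instead of "token.pos_"
# validate_attrs(["doc.xyz"])      # E185: "xyz" doesn't exist on Doc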
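Why the meta now records factories (sketch; "my_ruler" is an invented pipe name): a component added under a custom name keeps a reference to the factory that created it, so a saved model can be reloaded even when the pipe name no longer matches a factory name.

from spacy.language import Language

nlp = Language()
ruler = nlp.create_pipe("entity_ruler")
nlp.add_pipe(ruler, name="my_ruler")
assert nlp.pipe_factories["my_ruler"] == "entity_ruler"
# meta["factories"] stores this mapping, and load_model_from_path() now calls
# nlp.create_pipe() with the factory name ("entity_ruler") before adding the
# component back to the pipeline under its original name ("my_ruler").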