mirror of https://github.com/explosion/spaCy.git
Component decorator and component analysis (#4517)
* Add work in progress * Update analysis helpers and component decorator * Fix porting of docstrings for Python 2 * Fix docstring stuff on Python 2 * Support meta factories when loading model * Put auto pipeline analysis behind flag for now * Analyse pipes on remove_pipe and replace_pipe * Move analysis to root for now Try to find a better place for it, but it needs to go for now to avoid circular imports * Simplify decorator Don't return a wrapped class and instead just write to the object * Update existing components and factories * Add condition in factory for classes vs. functions * Add missing from_nlp classmethods * Add "retokenizes" to printed overview * Update assigns/requires declarations of builtins * Only return data if no_print is enabled * Use multiline table for overview * Don't support Span * Rewrite errors/warnings and move them to spacy.errors
This commit is contained in:
parent
1180304449
commit
a9c6104047
|
@ -4,7 +4,7 @@ preshed>=3.0.2,<3.1.0
|
|||
thinc>=7.2.0,<7.3.0
|
||||
blis>=0.4.0,<0.5.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
wasabi>=0.2.0,<1.1.0
|
||||
wasabi>=0.3.0,<1.1.0
|
||||
srsly>=0.1.0,<1.1.0
|
||||
# Third party dependencies
|
||||
numpy>=1.15.0
|
||||
|
|
|
@ -49,7 +49,7 @@ install_requires =
|
|||
blis>=0.4.0,<0.5.0
|
||||
plac>=0.9.6,<1.2.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
wasabi>=0.2.0,<1.1.0
|
||||
wasabi>=0.3.0,<1.1.0
|
||||
srsly>=0.1.0,<1.1.0
|
||||
pathlib==1.0.1; python_version < "3.4"
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
|
|
|
@ -9,12 +9,14 @@ warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
|
|||
# These are imported as part of the API
|
||||
from thinc.neural.util import prefer_gpu, require_gpu
|
||||
|
||||
from . import pipeline
|
||||
from .cli.info import info as cli_info
|
||||
from .glossary import explain
|
||||
from .about import __version__
|
||||
from .errors import Errors, Warnings, deprecation_warning
|
||||
from . import util
|
||||
from .util import register_architecture, get_architecture
|
||||
from .language import component
|
||||
|
||||
|
||||
if sys.maxunicode == 65535:
|
||||
|
|
|
@ -0,0 +1,176 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from collections import OrderedDict
|
||||
from wasabi import Printer
|
||||
|
||||
from .tokens import Doc, Token
|
||||
from .errors import Errors, Warnings, user_warning
|
||||
|
||||
|
||||
def analyze_pipes(pipeline, name, pipe, index, warn=True):
    """Check whether the requirements of one pipeline component are met by
    the components that run before it. A requirement counts as met if at
    least one earlier component lists the same attribute in its "assigns".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    name (unicode): The name of the pipeline component to analyze.
    pipe (callable): The pipeline component function to analyze.
    index (int): The index of the component in the pipeline.
    warn (bool): Show user warning if problem is found.
    RETURNS (list): The required attributes that no earlier component
        declares, in the order they are listed in the component's "requires".
    """
    assert pipeline[index][0] == name
    # De-duplicate the declared requirements, keeping first-seen order
    needed = list(OrderedDict.fromkeys(getattr(pipe, "requires", [])))
    declared = set()
    if needed:
        # Only components located before this one can fulfil a requirement
        for _, earlier_pipe in pipeline[:index]:
            declared.update(getattr(earlier_pipe, "assigns", []))
    problems = []
    for annot in needed:
        if annot in declared:
            continue
        problems.append(annot)
        if warn:
            user_warning(Warnings.W025.format(name=name, attr=annot))
    return problems
|
||||
|
||||
|
||||
def analyze_all_pipes(pipeline, warn=True):
    """Run analyze_pipes on every component of the pipeline, in pipeline
    order, and collect the results.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    warn (bool): Show user warning if problem is found.
    RETURNS (dict): The problems found, keyed by component name.
    """
    return {
        name: analyze_pipes(pipeline, name, pipe, position, warn=warn)
        for position, (name, pipe) in enumerate(pipeline)
    }
|
||||
|
||||
|
||||
def dot_to_dict(values):
    """Turn dotted attribute names into a nested dict. Names are lowercased
    and the final segment of each name maps to True. For example,
    ["token.pos", "token._.xyz"] becomes
    {"token": {"pos": True, "_": {"xyz": True}}}.

    values (iterable): The values to convert.
    RETURNS (dict): The converted values.
    """
    tree = {}
    for dotted in values:
        keys = dotted.lower().split(".")
        node = tree
        # Walk (and create where missing) the intermediate levels ...
        for key in keys[:-1]:
            node = node.setdefault(key, {})
        # ... then mark the leaf, keeping whatever is already stored there
        node.setdefault(keys[-1], True)
    return tree
|
||||
|
||||
|
||||
def validate_attrs(values):
    """Validate component attributes provided to "assigns", "requires" etc.
    Raises error for invalid attributes and formatting. Doesn't check if
    custom extension attributes are registered, since this is something the
    user might want to do themselves later in the component.

    values (iterable): The string attributes to check, e.g. `["token.pos"]`.
        Iterated more than once, so pass a list or tuple, not a generator.
    RETURNS (iterable): The checked attributes (the object passed in).
    RAISES (ValueError): If a declaration doesn't start with "doc" or
        "token", is incomplete (e.g. "doc" or "doc._"), is nested deeper than
        one attribute, ends with an underscore, or names an attribute that
        doesn't exist on Doc / Token.
    """
    data = dot_to_dict(values)
    objs = {"doc": Doc, "token": Token}

    def declared_for(obj_name):
        # dot_to_dict lowercases its keys, so compare against the lowercased
        # first segment. An exact segment match (rather than a raw prefix
        # test) keeps e.g. "token.pos" out of the report for a bad "tok.x".
        return [a for a in values if a.lower().split(".")[0] == obj_name]

    for obj_key, attrs in data.items():
        if obj_key not in objs:  # first element is not doc/token
            offending = ", ".join(declared_for(obj_key))
            if obj_key == "span":
                raise ValueError(Errors.E180.format(attrs=offending))
            raise ValueError(Errors.E181.format(obj=obj_key, attrs=offending))
        if not isinstance(attrs, dict):  # attr is something like "doc"
            raise ValueError(Errors.E182.format(attr=obj_key))
        for attr, value in attrs.items():
            if attr == "_":
                if value is True:  # attr is something like "doc._"
                    raise ValueError(Errors.E182.format(attr="{}._".format(obj_key)))
                for ext_attr, ext_value in value.items():
                    # We don't check whether the attribute actually exists
                    if ext_value is not True:  # attr is something like doc._.x.y
                        good = "{}._.{}".format(obj_key, ext_attr)
                        bad = "{}.{}".format(good, ".".join(ext_value))
                        raise ValueError(Errors.E183.format(attr=bad, solution=good))
                continue  # we can't validate those further
            if attr.endswith("_"):  # attr is something like "token.pos_"
                raise ValueError(Errors.E184.format(attr=attr, solution=attr[:-1]))
            if value is not True:  # attr is something like doc.x.y
                good = "{}.{}".format(obj_key, attr)
                bad = "{}.{}".format(good, ".".join(value))
                raise ValueError(Errors.E183.format(attr=bad, solution=good))
            obj = objs[obj_key]
            if not hasattr(obj, attr):
                raise ValueError(Errors.E185.format(obj=obj_key, attr=attr))
    return values
|
||||
|
||||
|
||||
def _get_feature_for_attr(pipeline, attr, feature):
    """Collect the components whose declared feature list contains an attr.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to look for, e.g. "doc.tensor".
    feature (unicode): The declaration to inspect: "assigns" or "requires".
    RETURNS (list): (name, pipe) tuples of the matching components, in
        pipeline order.
    """
    assert feature in ["assigns", "requires"]
    # Components that don't declare the feature are treated as declaring nothing
    return [
        (pipe_name, pipe)
        for pipe_name, pipe in pipeline
        if attr in getattr(pipe, feature, [])
    ]
|
||||
|
||||
|
||||
def get_assigns_for_attr(pipeline, attr):
    """Find the pipeline components that declare an attr, e.g. "doc.tensor",
    in their "assigns".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    RETURNS (list): (name, pipe) tuples of components that assign the attr.
    """
    assigners = _get_feature_for_attr(pipeline, attr, "assigns")
    return assigners
|
||||
|
||||
|
||||
def get_requires_for_attr(pipeline, attr):
    """Find the pipeline components that declare an attr, e.g. "doc.tensor",
    in their "requires".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    RETURNS (list): (name, pipe) tuples of components that require the attr.
    """
    dependents = _get_feature_for_attr(pipeline, attr, "requires")
    return dependents
|
||||
|
||||
|
||||
def print_summary(nlp, pretty=True, no_print=False):
    """Print a formatted summary for the current nlp object's pipeline. Shows
    a table with the pipeline components and what they assign and require, as
    well as any unmet requirements.

    nlp (Language): The nlp object.
    pretty (bool): Pretty-print the results (color etc).
    no_print (bool): Don't print anything, just return the data.
    RETURNS (dict): A dict with "overview" and "problems". Only returned if
        no_print is enabled, otherwise the function returns None.
    """
    printer = Printer(pretty=pretty, no_print=no_print)
    rows = []
    issues = {}
    for position, (name, pipe) in enumerate(nlp.pipeline):
        row = (
            position,
            name,
            getattr(pipe, "requires", []),
            getattr(pipe, "assigns", []),
            getattr(pipe, "retokenizes", False),
        )
        rows.append(row)
        # Warnings are disabled here because problems are reported below
        issues[name] = analyze_pipes(nlp.pipeline, name, pipe, position, warn=False)
    printer.divider("Pipeline Overview")
    columns = ("#", "Component", "Requires", "Assigns", "Retokenizes")
    printer.table(rows, header=columns, divider=True, multiline=True)
    n_problems = sum(len(unmet) for unmet in issues.values())
    if not n_problems:
        printer.good("No problems found.")
    else:
        printer.divider("Problems ({})".format(n_problems))
        for name, unmet in issues.items():
            if not unmet:
                continue
            printer.warn(
                "'{}' requirements not met: {}".format(name, ", ".join(unmet))
            )
    if no_print:
        return {"overview": rows, "problems": issues}
|
|
@ -12,6 +12,7 @@ import os
|
|||
import sys
|
||||
import itertools
|
||||
import ast
|
||||
import types
|
||||
|
||||
from thinc.neural.util import copy_array
|
||||
|
||||
|
@ -67,6 +68,7 @@ if is_python2:
|
|||
basestring_ = basestring # noqa: F821
|
||||
input_ = raw_input # noqa: F821
|
||||
path2str = lambda path: str(path).decode("utf8")
|
||||
class_types = (type, types.ClassType)
|
||||
|
||||
elif is_python3:
|
||||
bytes_ = bytes
|
||||
|
@ -74,6 +76,7 @@ elif is_python3:
|
|||
basestring_ = str
|
||||
input_ = input
|
||||
path2str = lambda path: str(path)
|
||||
class_types = (type, types.ClassType) if is_python_pre_3_5 else type
|
||||
|
||||
|
||||
def b_to_str(b_str):
|
||||
|
|
|
@ -99,6 +99,8 @@ class Warnings(object):
|
|||
"'n_process' will be set to 1.")
|
||||
W024 = ("Entity '{entity}' - Alias '{alias}' combination already exists in "
|
||||
"the Knowledge Base.")
|
||||
W025 = ("'{name}' requires '{attr}' to be assigned, but none of the "
|
||||
"previous components in the pipeline declare that they assign it.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -511,6 +513,20 @@ class Errors(object):
|
|||
E179 = ("Invalid pattern. Expected a list of Doc objects but got a single "
|
||||
"Doc. If you only want to add one pattern, make sure to wrap it "
|
||||
"in a list. For example: matcher.add('{key}', [doc])")
|
||||
E180 = ("Span attributes can't be declared as required or assigned by "
|
||||
"components, since spans are only views of the Doc. Use Doc and "
|
||||
"Token attributes only and remove the following: {attrs}")
|
||||
E181 = ("Received invalid attributes for unkown object {obj}: {attrs}. "
|
||||
"Only Doc and Token attributes are supported.")
|
||||
E182 = ("Received invalid attribute declaration: {attr}\nDid you forget "
|
||||
"to define the attribute? For example: {attr}.???")
|
||||
E183 = ("Received invalid attribute declaration: {attr}\nOnly top-level "
|
||||
"attributes are supported, for example: {solution}")
|
||||
E184 = ("Only attributes without underscores are supported in component "
|
||||
"attribute declarations (because underscore and non-underscore "
|
||||
"attributes are connected anyways): {attr} -> {solution}")
|
||||
E185 = ("Received invalid attribute in component attribute declaration: "
|
||||
"{obj}.{attr}\nAttribute '{attr}' does not exist on {obj}.")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
|
|
@ -18,13 +18,8 @@ from .tokenizer import Tokenizer
|
|||
from .vocab import Vocab
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .lookups import Lookups
|
||||
from .pipeline import DependencyParser, Tagger
|
||||
from .pipeline import Tensorizer, EntityRecognizer, EntityLinker
|
||||
from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
|
||||
from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
|
||||
from .pipeline import EntityRuler
|
||||
from .pipeline import Morphologizer
|
||||
from .compat import izip, basestring_, is_python2
|
||||
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
|
||||
from .compat import izip, basestring_, is_python2, class_types
|
||||
from .gold import GoldParse
|
||||
from .scorer import Scorer
|
||||
from ._ml import link_vectors_to_models, create_default_optimizer
|
||||
|
@ -40,6 +35,9 @@ from . import util
|
|||
from . import about
|
||||
|
||||
|
||||
ENABLE_PIPELINE_ANALYSIS = False
|
||||
|
||||
|
||||
class BaseDefaults(object):
|
||||
@classmethod
|
||||
def create_lemmatizer(cls, nlp=None, lookups=None):
|
||||
|
@ -135,19 +133,6 @@ class Language(object):
|
|||
|
||||
factories = {
|
||||
"tokenizer": lambda nlp: nlp.Defaults.create_tokenizer(nlp),
|
||||
"tensorizer": lambda nlp, **cfg: Tensorizer(nlp.vocab, **cfg),
|
||||
"tagger": lambda nlp, **cfg: Tagger(nlp.vocab, **cfg),
|
||||
"morphologizer": lambda nlp, **cfg: Morphologizer(nlp.vocab, **cfg),
|
||||
"parser": lambda nlp, **cfg: DependencyParser(nlp.vocab, **cfg),
|
||||
"ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
|
||||
"entity_linker": lambda nlp, **cfg: EntityLinker(nlp.vocab, **cfg),
|
||||
"similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
|
||||
"textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
|
||||
"sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
|
||||
"merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
|
||||
"merge_entities": lambda nlp, **cfg: merge_entities,
|
||||
"merge_subtokens": lambda nlp, **cfg: merge_subtokens,
|
||||
"entity_ruler": lambda nlp, **cfg: EntityRuler(nlp, **cfg),
|
||||
}
|
||||
|
||||
def __init__(
|
||||
|
@ -218,6 +203,7 @@ class Language(object):
|
|||
"name": self.vocab.vectors.name,
|
||||
}
|
||||
self._meta["pipeline"] = self.pipe_names
|
||||
self._meta["factories"] = self.pipe_factories
|
||||
self._meta["labels"] = self.pipe_labels
|
||||
return self._meta
|
||||
|
||||
|
@ -259,6 +245,17 @@ class Language(object):
|
|||
"""
|
||||
return [pipe_name for pipe_name, _ in self.pipeline]
|
||||
|
||||
@property
def pipe_factories(self):
    """Map each component in the pipeline to the name of its factory. A
    component without a "factory" attribute falls back to its own name.

    RETURNS (dict): Factory names, keyed by component names.
    """
    return {
        pipe_name: getattr(pipe, "factory", pipe_name)
        for pipe_name, pipe in self.pipeline
    }
|
||||
|
||||
@property
|
||||
def pipe_labels(self):
|
||||
"""Get the labels set by the pipeline components, if available (if
|
||||
|
@ -327,33 +324,30 @@ class Language(object):
|
|||
msg += Errors.E004.format(component=component)
|
||||
raise ValueError(msg)
|
||||
if name is None:
|
||||
if hasattr(component, "name"):
|
||||
name = component.name
|
||||
elif hasattr(component, "__name__"):
|
||||
name = component.__name__
|
||||
elif hasattr(component, "__class__") and hasattr(
|
||||
component.__class__, "__name__"
|
||||
):
|
||||
name = component.__class__.__name__
|
||||
else:
|
||||
name = repr(component)
|
||||
name = util.get_component_name(component)
|
||||
if name in self.pipe_names:
|
||||
raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
|
||||
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
|
||||
raise ValueError(Errors.E006)
|
||||
pipe_index = 0
|
||||
pipe = (name, component)
|
||||
if last or not any([first, before, after]):
|
||||
pipe_index = len(self.pipeline)
|
||||
self.pipeline.append(pipe)
|
||||
elif first:
|
||||
self.pipeline.insert(0, pipe)
|
||||
elif before and before in self.pipe_names:
|
||||
pipe_index = self.pipe_names.index(before)
|
||||
self.pipeline.insert(self.pipe_names.index(before), pipe)
|
||||
elif after and after in self.pipe_names:
|
||||
pipe_index = self.pipe_names.index(after) + 1
|
||||
self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
|
||||
else:
|
||||
raise ValueError(
|
||||
Errors.E001.format(name=before or after, opts=self.pipe_names)
|
||||
)
|
||||
if ENABLE_PIPELINE_ANALYSIS:
|
||||
analyze_pipes(self.pipeline, name, component, pipe_index)
|
||||
|
||||
def has_pipe(self, name):
|
||||
"""Check if a component name is present in the pipeline. Equivalent to
|
||||
|
@ -382,6 +376,8 @@ class Language(object):
|
|||
msg += Errors.E135.format(name=name)
|
||||
raise ValueError(msg)
|
||||
self.pipeline[self.pipe_names.index(name)] = (name, component)
|
||||
if ENABLE_PIPELINE_ANALYSIS:
|
||||
analyze_all_pipes(self.pipeline)
|
||||
|
||||
def rename_pipe(self, old_name, new_name):
|
||||
"""Rename a pipeline component.
|
||||
|
@ -408,6 +404,8 @@ class Language(object):
|
|||
"""
|
||||
if name not in self.pipe_names:
|
||||
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||||
if ENABLE_PIPELINE_ANALYSIS:
|
||||
analyze_all_pipes(self.pipeline)
|
||||
return self.pipeline.pop(self.pipe_names.index(name))
|
||||
|
||||
def __call__(self, text, disable=[], component_cfg=None):
|
||||
|
@ -1001,6 +999,52 @@ class Language(object):
|
|||
return self
|
||||
|
||||
|
||||
class component(object):
    """Decorator for pipeline components. Can decorate both function components
    and class components and will automatically register components in the
    Language.factories. If the component is a class and needs access to the
    nlp object or config parameters, it can expose a from_nlp classmethod
    that takes the nlp object and **cfg arguments and returns the initialized
    component.
    """

    # NB: This decorator needs to live here, because it needs to write to
    # Language.factories. All other solutions would cause circular import.

    def __init__(self, name=None, assigns=tuple(), requires=tuple(), retokenizes=False):
        """Decorate a pipeline component.

        name (unicode): Default component and factory name.
        assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
        requires (list): Attributes required by component, e.g. `["token.dep"]`.
        retokenizes (bool): Whether the component changes the tokenization.
        RAISES (ValueError): If validate_attrs rejects a declaration in
            assigns or requires.
        """
        self.name = name
        self.assigns = validate_attrs(assigns)
        self.requires = validate_attrs(requires)
        self.retokenizes = retokenizes

    def __call__(self, *args, **kwargs):
        """Annotate the decorated object and register a factory for it.

        The first positional argument is the decorated function or class.
        Its name, factory, assigns, requires and retokenizes attributes are
        written to the object itself, so the same object (not a wrapper) is
        returned. Any further arguments are accepted but ignored.

        RETURNS: The decorated function or class.
        """
        obj = args[0]
        factory_name = self.name or util.get_component_name(obj)
        obj.name = factory_name
        obj.factory = factory_name
        obj.assigns = self.assigns
        obj.requires = self.requires
        obj.retokenizes = self.retokenizes

        def factory(nlp, **cfg):
            # Prefer the component's own constructor hook if it has one
            if hasattr(obj, "from_nlp"):
                return obj.from_nlp(nlp, **cfg)
            # Plain classes are instantiated without arguments
            elif isinstance(obj, class_types):
                return obj()
            # Function components are used as they are
            return obj

        Language.factories[obj.factory] = factory
        return obj
|
||||
|
||||
|
||||
def _fix_pretrained_vectors_name(nlp):
|
||||
# TODO: Replace this once we handle vectors consistently as static
|
||||
# data
|
||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
|||
from collections import defaultdict, OrderedDict
|
||||
import srsly
|
||||
|
||||
from ..language import component
|
||||
from ..errors import Errors
|
||||
from ..compat import basestring_
|
||||
from ..util import ensure_path, to_disk, from_disk
|
||||
|
@ -13,6 +14,7 @@ from ..matcher import Matcher, PhraseMatcher
|
|||
DEFAULT_ENT_ID_SEP = "||"
|
||||
|
||||
|
||||
@component("entity_ruler", assigns=["doc.ents", "token.ent_type", "token.ent_iob"])
|
||||
class EntityRuler(object):
|
||||
"""The EntityRuler lets you add spans to the `Doc.ents` using token-based
|
||||
rules or exact phrase matches. It can be combined with the statistical
|
||||
|
@ -24,8 +26,6 @@ class EntityRuler(object):
|
|||
USAGE: https://spacy.io/usage/rule-based-matching#entityruler
|
||||
"""
|
||||
|
||||
name = "entity_ruler"
|
||||
|
||||
def __init__(self, nlp, phrase_matcher_attr=None, validate=False, **cfg):
|
||||
"""Initialize the entitiy ruler. If patterns are supplied here, they
|
||||
need to be a list of dictionaries with a `"label"` and `"pattern"`
|
||||
|
@ -69,6 +69,10 @@ class EntityRuler(object):
|
|||
if patterns is not None:
|
||||
self.add_patterns(patterns)
|
||||
|
||||
@classmethod
def from_nlp(cls, nlp, **cfg):
    """Create the component from the nlp object and config settings.

    nlp (Language): The nlp object, passed on as the first argument.
    **cfg: Settings passed on to the constructor.
    RETURNS: The initialized component.
    """
    ruler = cls(nlp, **cfg)
    return ruler
|
||||
|
||||
def __len__(self):
|
||||
"""The number of all patterns added to the entity ruler."""
|
||||
n_token_patterns = sum(len(p) for p in self.token_patterns.values())
|
||||
|
|
|
@ -1,9 +1,15 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from ..language import component
|
||||
from ..matcher import Matcher
|
||||
|
||||
|
||||
@component(
|
||||
"merge_noun_chunks",
|
||||
requires=["token.dep", "token.tag", "token.pos"],
|
||||
retokenizes=True,
|
||||
)
|
||||
def merge_noun_chunks(doc):
|
||||
"""Merge noun chunks into a single token.
|
||||
|
||||
|
@ -21,6 +27,11 @@ def merge_noun_chunks(doc):
|
|||
return doc
|
||||
|
||||
|
||||
@component(
|
||||
"merge_entities",
|
||||
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||
retokenizes=True,
|
||||
)
|
||||
def merge_entities(doc):
|
||||
"""Merge entities into a single token.
|
||||
|
||||
|
@ -36,6 +47,7 @@ def merge_entities(doc):
|
|||
return doc
|
||||
|
||||
|
||||
@component("merge_subtokens", requires=["token.dep"], retokenizes=True)
|
||||
def merge_subtokens(doc, label="subtok"):
|
||||
"""Merge subtokens into a single token.
|
||||
|
||||
|
|
|
@ -5,9 +5,11 @@ from thinc.t2v import Pooling, max_pool, mean_pool
|
|||
from thinc.neural._classes.difference import Siamese, CauchySimilarity
|
||||
|
||||
from .pipes import Pipe
|
||||
from ..language import component
|
||||
from .._ml import link_vectors_to_models
|
||||
|
||||
|
||||
@component("sentencizer_hook", assigns=["doc.user_hooks"])
|
||||
class SentenceSegmenter(object):
|
||||
"""A simple spaCy hook, to allow custom sentence boundary detection logic
|
||||
(that doesn't require the dependency parse). To change the sentence
|
||||
|
@ -17,8 +19,6 @@ class SentenceSegmenter(object):
|
|||
and yield `Span` objects for each sentence.
|
||||
"""
|
||||
|
||||
name = "sentencizer"
|
||||
|
||||
def __init__(self, vocab, strategy=None):
|
||||
self.vocab = vocab
|
||||
if strategy is None or strategy == "on_punct":
|
||||
|
@ -44,6 +44,7 @@ class SentenceSegmenter(object):
|
|||
yield doc[start : len(doc)]
|
||||
|
||||
|
||||
@component("similarity", assigns=["doc.user_hooks"])
|
||||
class SimilarityHook(Pipe):
|
||||
"""
|
||||
Experimental: A pipeline component to install a hook for supervised
|
||||
|
@ -58,8 +59,6 @@ class SimilarityHook(Pipe):
|
|||
Where W is a vector of dimension weights, initialized to 1.
|
||||
"""
|
||||
|
||||
name = "similarity"
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
|
|
@ -8,6 +8,7 @@ from thinc.api import chain
|
|||
from thinc.neural.util import to_categorical, copy_array, get_array_module
|
||||
from .. import util
|
||||
from .pipes import Pipe
|
||||
from ..language import component
|
||||
from .._ml import Tok2Vec, build_morphologizer_model
|
||||
from .._ml import link_vectors_to_models, zero_init, flatten
|
||||
from .._ml import create_default_optimizer
|
||||
|
@ -18,9 +19,9 @@ from ..vocab cimport Vocab
|
|||
from ..morphology cimport Morphology
|
||||
|
||||
|
||||
@component("morphologizer", assigns=["token.morph", "token.pos"])
|
||||
class Morphologizer(Pipe):
|
||||
name = 'morphologizer'
|
||||
|
||||
|
||||
@classmethod
|
||||
def Model(cls, **cfg):
|
||||
if cfg.get('pretrained_dims') and not cfg.get('pretrained_vectors'):
|
||||
|
|
|
@ -13,7 +13,6 @@ from thinc.misc import LayerNorm
|
|||
from thinc.neural.util import to_categorical
|
||||
from thinc.neural.util import get_array_module
|
||||
|
||||
from .functions import merge_subtokens
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..syntax.nn_parser cimport Parser
|
||||
from ..syntax.ner cimport BiluoPushDown
|
||||
|
@ -21,6 +20,8 @@ from ..syntax.arc_eager cimport ArcEager
|
|||
from ..morphology cimport Morphology
|
||||
from ..vocab cimport Vocab
|
||||
|
||||
from .functions import merge_subtokens
|
||||
from ..language import Language, component
|
||||
from ..syntax import nonproj
|
||||
from ..attrs import POS, ID
|
||||
from ..parts_of_speech import X
|
||||
|
@ -54,6 +55,10 @@ class Pipe(object):
|
|||
"""Initialize a model for the pipe."""
|
||||
raise NotImplementedError
|
||||
|
||||
@classmethod
def from_nlp(cls, nlp, **cfg):
    """Create the pipe from the nlp object and config settings.

    nlp (Language): The nlp object. Its vocab is passed to the constructor.
    **cfg: Settings passed on to the constructor.
    RETURNS: The initialized pipe.
    """
    vocab = nlp.vocab
    return cls(vocab, **cfg)
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
"""Create a new pipe instance."""
|
||||
raise NotImplementedError
|
||||
|
@ -223,11 +228,10 @@ class Pipe(object):
|
|||
return self
|
||||
|
||||
|
||||
@component("tensorizer", assigns=["doc.tensor"])
|
||||
class Tensorizer(Pipe):
|
||||
"""Pre-train position-sensitive vectors for tokens."""
|
||||
|
||||
name = "tensorizer"
|
||||
|
||||
@classmethod
|
||||
def Model(cls, output_size=300, **cfg):
|
||||
"""Create a new statistical model for the class.
|
||||
|
@ -362,14 +366,13 @@ class Tensorizer(Pipe):
|
|||
return sgd
|
||||
|
||||
|
||||
@component("tagger", assigns=["token.tag", "token.pos"])
|
||||
class Tagger(Pipe):
|
||||
"""Pipeline component for part-of-speech tagging.
|
||||
|
||||
DOCS: https://spacy.io/api/tagger
|
||||
"""
|
||||
|
||||
name = "tagger"
|
||||
|
||||
def __init__(self, vocab, model=True, **cfg):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -657,13 +660,12 @@ class Tagger(Pipe):
|
|||
return self
|
||||
|
||||
|
||||
@component("nn_labeller")
|
||||
class MultitaskObjective(Tagger):
|
||||
"""Experimental: Assist training of a parser or tagger, by training a
|
||||
side-objective.
|
||||
"""
|
||||
|
||||
name = "nn_labeller"
|
||||
|
||||
def __init__(self, vocab, model=True, target='dep_tag_offset', **cfg):
|
||||
self.vocab = vocab
|
||||
self.model = model
|
||||
|
@ -898,12 +900,12 @@ class ClozeMultitask(Pipe):
|
|||
losses[self.name] += loss
|
||||
|
||||
|
||||
@component("textcat", assigns=["doc.cats"])
|
||||
class TextCategorizer(Pipe):
|
||||
"""Pipeline component for text classification.
|
||||
|
||||
DOCS: https://spacy.io/api/textcategorizer
|
||||
"""
|
||||
name = 'textcat'
|
||||
|
||||
@classmethod
|
||||
def Model(cls, nr_class=1, **cfg):
|
||||
|
@ -1051,8 +1053,11 @@ cdef class DependencyParser(Parser):
|
|||
|
||||
DOCS: https://spacy.io/api/dependencyparser
|
||||
"""
|
||||
|
||||
# cdef classes can't have decorators, so we're defining this here
|
||||
name = "parser"
|
||||
factory = "parser"
|
||||
assigns = ["token.dep", "token.is_sent_start", "doc.sents"]
|
||||
requires = []
|
||||
TransitionSystem = ArcEager
|
||||
|
||||
@property
|
||||
|
@ -1097,8 +1102,10 @@ cdef class EntityRecognizer(Parser):
|
|||
|
||||
DOCS: https://spacy.io/api/entityrecognizer
|
||||
"""
|
||||
|
||||
name = "ner"
|
||||
factory = "ner"
|
||||
assigns = ["doc.ents", "token.ent_iob", "token.ent_type"]
|
||||
requires = []
|
||||
TransitionSystem = BiluoPushDown
|
||||
nr_feature = 6
|
||||
|
||||
|
@ -1129,12 +1136,16 @@ cdef class EntityRecognizer(Parser):
|
|||
return tuple(sorted(labels))
|
||||
|
||||
|
||||
@component(
|
||||
"entity_linker",
|
||||
requires=["doc.ents", "token.ent_iob", "token.ent_type"],
|
||||
assigns=["token.ent_kb_id"]
|
||||
)
|
||||
class EntityLinker(Pipe):
|
||||
"""Pipeline component for named entity linking.
|
||||
|
||||
DOCS: https://spacy.io/api/entitylinker
|
||||
"""
|
||||
name = 'entity_linker'
|
||||
NIL = "NIL" # string used to refer to a non-existing link
|
||||
|
||||
@classmethod
|
||||
|
@ -1405,13 +1416,13 @@ class EntityLinker(Pipe):
|
|||
raise NotImplementedError
|
||||
|
||||
|
||||
@component("sentencizer", assigns=["token.is_sent_start", "doc.sents"])
|
||||
class Sentencizer(object):
|
||||
"""Segment the Doc into sentences using a rule-based strategy.
|
||||
|
||||
DOCS: https://spacy.io/api/sentencizer
|
||||
"""
|
||||
|
||||
name = "sentencizer"
|
||||
default_punct_chars = ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹',
|
||||
'।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄',
|
||||
'᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿',
|
||||
|
@ -1437,6 +1448,10 @@ class Sentencizer(object):
|
|||
else:
|
||||
self.punct_chars = set(self.default_punct_chars)
|
||||
|
||||
@classmethod
def from_nlp(cls, nlp, **cfg):
    """Create the component from config settings. The nlp object is accepted
    so the signature matches other components, but it isn't used.

    nlp (Language): The nlp object (unused).
    **cfg: Settings passed on to the constructor.
    RETURNS: The initialized component.
    """
    instance = cls(**cfg)
    return instance
|
||||
|
||||
def __call__(self, doc):
|
||||
"""Apply the sentencizer to a Doc and set Token.is_sent_start.
|
||||
|
||||
|
@ -1503,4 +1518,9 @@ class Sentencizer(object):
|
|||
return self
|
||||
|
||||
|
||||
# Cython classes can't be decorated, so we need to add the factories here
|
||||
Language.factories["parser"] = lambda nlp, **cfg: DependencyParser.from_nlp(nlp, **cfg)
|
||||
Language.factories["ner"] = lambda nlp, **cfg: EntityRecognizer.from_nlp(nlp, **cfg)
|
||||
|
||||
|
||||
__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "EntityLinker", "Sentencizer"]
|
||||
|
|
|
@ -128,6 +128,10 @@ cdef class Parser:
|
|||
self._multitasks = []
|
||||
self._rehearsal_model = None
|
||||
|
||||
@classmethod
def from_nlp(cls, nlp, **cfg):
    """Create the parser from the nlp object and config settings.

    nlp (Language): The nlp object. Its vocab is passed to the constructor.
    **cfg: Settings passed on to the constructor.
    RETURNS: The initialized parser.
    """
    vocab = nlp.vocab
    return cls(vocab, **cfg)
|
||||
|
||||
def __reduce__(self):
|
||||
return (Parser, (self.vocab, self.moves, self.model), None, None)
|
||||
|
||||
|
|
|
@ -0,0 +1,146 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import spacy.language
|
||||
from spacy.language import Language, component
|
||||
from spacy.analysis import print_summary, validate_attrs
|
||||
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
|
||||
from spacy.compat import is_python2
|
||||
from mock import Mock, ANY
|
||||
import pytest
|
||||
|
||||
|
||||
def test_component_decorator_function():
    """A decorated function keeps working as a plain function and gets the
    component name written to it."""

    @component(name="test")
    def passthrough(doc):
        """docstring"""
        return doc

    assert passthrough.name == "test"
    # Docstrings are only checked on Python 3
    if not is_python2:
        assert passthrough.__doc__ == "docstring"
    assert passthrough("foo") == "foo"
|
||||
|
||||
|
||||
def test_component_decorator_class():
    """A decorated class stays a regular class: its attributes, methods and
    docstrings are available on both the class and its instances."""

    @component(name="test")
    class TestComponent(object):
        """docstring1"""

        foo = "bar"

        def __call__(self, doc):
            """docstring2"""
            return doc

        def custom(self, x):
            """docstring3"""
            return x

    assert TestComponent.name == "test"
    assert TestComponent.foo == "bar"
    assert hasattr(TestComponent, "custom")
    instance = TestComponent()
    assert instance.foo == "bar"
    assert instance("foo") == "foo"
    assert hasattr(instance, "custom")
    assert instance.custom("bar") == "bar"
    # Docstrings are only checked on Python 3
    if not is_python2:
        for target in (TestComponent, instance):
            assert target.__doc__ == "docstring1"
            assert target.__call__.__doc__ == "docstring2"
            assert target.custom.__doc__ == "docstring3"
|
||||
|
||||
|
||||
def test_component_decorator_assigns():
    """Check that `assigns`/`requires` declared via @component are registered
    as factories and are reported by the pipeline analysis helpers."""
    # ENABLE_PIPELINE_ANALYSIS is module-level state. Remember the previous
    # value and restore it afterwards so that switching analysis on here does
    # not leak into tests that run later in the same session.
    previous_flag = getattr(spacy.language, "ENABLE_PIPELINE_ANALYSIS", False)
    spacy.language.ENABLE_PIPELINE_ANALYSIS = True
    try:

        @component("c1", assigns=["token.tag", "doc.tensor"])
        def test_component1(doc):
            return doc

        @component(
            "c2",
            requires=["token.tag", "token.pos"],
            assigns=["token.lemma", "doc.tensor"],
        )
        def test_component2(doc):
            return doc

        @component("c3", requires=["token.lemma"], assigns=["token._.custom_lemma"])
        def test_component3(doc):
            return doc

        # Decorating a component registers a factory under its name.
        assert "c1" in Language.factories
        assert "c2" in Language.factories
        assert "c3" in Language.factories

        nlp = Language()
        nlp.add_pipe(test_component1)
        # c2 requires "token.pos", which no earlier component assigns --
        # presumably this is what triggers the warning.
        with pytest.warns(UserWarning):
            nlp.add_pipe(test_component2)
        nlp.add_pipe(test_component3)
        assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
        assert [name for name, _ in assigns_tensor] == ["c1", "c2"]

        # A component created from the "c1" factory can be added under a
        # different pipe name while still reporting "c1" as its factory.
        test_component4 = nlp.create_pipe("c1")
        assert test_component4.name == "c1"
        assert test_component4.factory == "c1"
        nlp.add_pipe(test_component4, name="c4")
        assert nlp.pipe_names == ["c1", "c2", "c3", "c4"]
        assert "c4" not in Language.factories
        assert nlp.pipe_factories["c1"] == "c1"
        assert nlp.pipe_factories["c4"] == "c1"

        assigns_tensor = get_assigns_for_attr(nlp.pipeline, "doc.tensor")
        assert [name for name, _ in assigns_tensor] == ["c1", "c2", "c4"]
        requires_pos = get_requires_for_attr(nlp.pipeline, "token.pos")
        assert [name for name, _ in requires_pos] == ["c2"]
        assert print_summary(nlp, no_print=True)
        assert nlp("hello world")
    finally:
        spacy.language.ENABLE_PIPELINE_ANALYSIS = previous_flag
|
||||
|
||||
|
||||
def test_component_factories_from_nlp():
    """Test that the factory registered for a class component calls the
    class's from_nlp classmethod with the nlp object and the config."""

    class TestComponent5(object):
        def __call__(self, doc):
            return doc

    # Stand-in for from_nlp that records its call and returns a real instance.
    from_nlp_mock = Mock(return_value=TestComponent5())
    TestComponent5.from_nlp = classmethod(from_nlp_mock)
    component("c5")(TestComponent5)
    assert "c5" in Language.factories

    nlp = Language()
    config = {"foo": "bar"}
    nlp.add_pipe(nlp.create_pipe("c5", config=config))
    assert nlp("hello world")
    # The class itself is passed as the first positional argument, so any
    # value is accepted in that slot.
    from_nlp_mock.assert_called_once_with(ANY, nlp, foo="bar")
|
||||
|
||||
|
||||
def test_analysis_validate_attrs_valid():
    """Valid attribute names are accepted together and one at a time; a list
    that contains an unknown attribute raises a ValueError."""
    valid = ["doc.sents", "doc.ents", "token.tag", "token._.xyz"]
    assert validate_attrs(valid)
    for name in valid:
        single = [name]
        assert validate_attrs(single)
    # A single bad entry makes the whole list fail.
    mixed = ["doc.sents", "doc.xyz"]
    with pytest.raises(ValueError):
        validate_attrs(mixed)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "attr",
    ["doc", "doc_ents", "doc.xyz", "token.xyz", "token.tag_", "token.tag.xyz", "token._.xyz.abc"],
)
def test_analysis_validate_attrs_invalid(attr):
    """Each of these attribute strings must be rejected with a ValueError."""
    candidate = [attr]
    with pytest.raises(ValueError):
        validate_attrs(candidate)
|
|
@ -247,6 +247,7 @@ def load_model_from_path(model_path, meta=False, **overrides):
|
|||
cls = get_lang_class(lang)
|
||||
nlp = cls(meta=meta, **overrides)
|
||||
pipeline = meta.get("pipeline", [])
|
||||
factories = meta.get("factories", {})
|
||||
disable = overrides.get("disable", [])
|
||||
if pipeline is True:
|
||||
pipeline = nlp.Defaults.pipe_names
|
||||
|
@ -255,7 +256,8 @@ def load_model_from_path(model_path, meta=False, **overrides):
|
|||
for name in pipeline:
|
||||
if name not in disable:
|
||||
config = meta.get("pipeline_args", {}).get(name, {})
|
||||
component = nlp.create_pipe(name, config=config)
|
||||
factory = factories.get(name, name)
|
||||
component = nlp.create_pipe(factory, config=config)
|
||||
nlp.add_pipe(component, name=name)
|
||||
return nlp.from_disk(model_path)
|
||||
|
||||
|
@ -368,6 +370,16 @@ def is_in_jupyter():
|
|||
return False
|
||||
|
||||
|
||||
def get_component_name(component):
    """Get a display name for a pipeline component.

    The first available of these is used: the component's `name` attribute,
    its `__name__`, the `__name__` of its class, and finally its repr.

    component (object): The component to name.
    RETURNS: The value found by the lookup above.
    """
    for attr in ("name", "__name__"):
        if hasattr(component, attr):
            return getattr(component, attr)
    if not hasattr(component, "__class__"):
        return repr(component)
    if not hasattr(component.__class__, "__name__"):
        return repr(component)
    return component.__class__.__name__
|
||||
|
||||
|
||||
def get_cuda_stream(require=False):
|
||||
if CudaStream is None:
|
||||
return None
|
||||
|
|
Loading…
Reference in New Issue