Guess set_annotations=True in nlp.update

During `nlp.update`, components can be passed a boolean set_annotations
flag to indicate whether they should assign annotations to the `Doc`.
This flag needs to be set to True if downstream components expect to use
the annotations during training, e.g. if we wanted to use tagger
features in the parser.

Components can specify their assignments and requirements, so we can
figure out which components have these inter-dependencies. After
figuring this out, we can guess whether to pass set_annotations=True.

We could also pass set_annotations=True always, or even make that the
only behaviour. The downside of this is that it would require the
`Doc` objects to be created afresh to avoid problematic modifications.
One approach would be to make a fresh copy of the `Doc` objects within
`nlp.update()`, so that we can write to the objects without any
problems. If we do that, we can drop this logic and also drop the
`set_annotations` mechanism. I would be fine with that approach,
although it runs the risk of introducing some performance overhead, and
we'll have to take care to copy all extension attributes etc.
This commit is contained in:
Matthew Honnibal 2020-05-22 15:55:45 +02:00
parent 25b51f4fc8
commit 78d79d94ce
2 changed files with 39 additions and 3 deletions

View File

@ -545,13 +545,14 @@ class Language(object):
if component_cfg is None:
component_cfg = {}
component_deps = _count_pipeline_inter_dependencies(self.pipeline)
# Determine whether component should set annotations. In theory I guess
# we should do this by inspecting the meta? Or we could just always
# say "yes"
for name, proc in self.pipeline:
for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {})
component_cfg[name].setdefault("drop", drop)
component_cfg[name].setdefault("set_annotations", False)
component_cfg[name]["set_annotations"] = bool(component_deps[i])
for name, proc in self.pipeline:
if not hasattr(proc, "update"):
continue
@ -1159,6 +1160,25 @@ class DisabledPipes(list):
self[:] = []
def _count_pipeline_inter_dependencies(pipeline):
"""Count how many subsequent components require an annotation set by each
component in the pipeline.
"""
pipe_assigns = []
pipe_requires = []
for name, pipe in pipeline:
pipe_assigns.append(set(getattr(pipe, "assigns", [])))
pipe_requires.append(set(getattr(pipe, "requires", [])))
counts = []
for i, assigns in enumerate(pipe_assigns):
count = 0
for requires in pipe_requires[i+1:]:
if assigns.intersection(requires):
count += 1
counts.append(count)
return counts
def _pipe(examples, proc, kwargs):
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)

View File

@ -1,5 +1,5 @@
import pytest
from spacy.language import Language
from spacy.language import Language, _count_pipeline_inter_dependencies
@pytest.fixture
@ -198,3 +198,19 @@ def test_pipe_labels(nlp):
assert len(nlp.pipe_labels) == len(input_labels)
for name, labels in nlp.pipe_labels.items():
assert sorted(input_labels[name]) == sorted(labels)
def test_pipe_inter_dependencies():
    """One downstream component (the needer) consumes the annotation the
    fancifier assigns, so the dependency counts should be [1, 0]."""

    class Fancifier:
        name = "fancifier"
        assigns = ("doc._.fancy",)
        requires = ()

    class FancyNeeder:
        name = "needer"
        assigns = ()
        requires = ("doc._.fancy",)

    pipe = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
    assert _count_pipeline_inter_dependencies(pipe) == [1, 0]