Guess set_annotations=True in nlp.update

During `nlp.update`, components can be passed a boolean set_annotations
flag to indicate whether they should assign annotations to the `Doc`.
This flag needs to be set to True if downstream components expect to use
the annotations during training, e.g. if we wanted to use tagger
features in the parser.

Components can specify their assignments and requirements, so we can
figure out which components have these inter-dependencies. After
figuring this out, we can guess whether to pass set_annotations=True.

We could also pass set_annotations=True always, or even make that the
only behaviour. The downside of this is that it would require the
`Doc` objects to be created afresh to avoid problematic modifications.
One approach would be to make a fresh copy of the `Doc` objects within
`nlp.update()`, so that we can write to the objects without any
problems. If we do that, we can drop this logic and also drop the
`set_annotations` mechanism. I would be fine with that approach,
although it runs the risk of introducing some performance overhead, and
we'll have to take care to copy all extension attributes etc.
This commit is contained in:
Matthew Honnibal 2020-05-22 15:55:45 +02:00
parent 25b51f4fc8
commit 78d79d94ce
2 changed files with 39 additions and 3 deletions

View File

@ -545,13 +545,14 @@ class Language(object):
if component_cfg is None:
component_cfg = {}
component_deps = _count_pipeline_inter_dependencies(self.pipeline)
# Determine whether component should set annotations. In theory I guess
# we should do this by inspecting the meta? Or we could just always
# say "yes"
for name, proc in self.pipeline:
for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {})
component_cfg[name].setdefault("drop", drop)
component_cfg[name].setdefault("set_annotations", False)
component_cfg[name]["set_annotations"] = bool(component_deps[i])
for name, proc in self.pipeline:
if not hasattr(proc, "update"):
continue
@ -1159,6 +1160,25 @@ class DisabledPipes(list):
self[:] = []
def _count_pipeline_inter_dependencies(pipeline):
"""Count how many subsequent components require an annotation set by each
component in the pipeline.
"""
pipe_assigns = []
pipe_requires = []
for name, pipe in pipeline:
pipe_assigns.append(set(getattr(pipe, "assigns", [])))
pipe_requires.append(set(getattr(pipe, "requires", [])))
counts = []
for i, assigns in enumerate(pipe_assigns):
count = 0
for requires in pipe_requires[i+1:]:
if assigns.intersection(requires):
count += 1
counts.append(count)
return counts
def _pipe(examples, proc, kwargs):
# We added some args for pipe that __call__ doesn't expect.
kwargs = dict(kwargs)

View File

@ -1,5 +1,5 @@
import pytest
from spacy.language import Language
from spacy.language import Language, _count_pipeline_inter_dependencies
@pytest.fixture
@ -198,3 +198,19 @@ def test_pipe_labels(nlp):
assert len(nlp.pipe_labels) == len(input_labels)
for name, labels in nlp.pipe_labels.items():
assert sorted(input_labels[name]) == sorted(labels)
def test_pipe_inter_dependencies():
    """One downstream component (the needer) consumes the annotation the
    fancifier assigns, so the dependency counts should be [1, 0]."""

    class Fancifier:
        name = "fancifier"
        assigns = ("doc._.fancy",)
        requires = ()

    class FancyNeeder:
        name = "needer"
        assigns = ()
        requires = ("doc._.fancy",)

    pipe = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
    assert _count_pipeline_inter_dependencies(pipe) == [1, 0]