Ensure the NER remains consistent after resizing (#4330)

* test and fix for second bug of issue 4042
* fix for first bug in 4042
* crashing test for Issue 4313
* forgot one instance of resize
* remove prints
* undo uncomment
* delete test for 4313 (uses third party lib)
* add fix for Issue 4313
* unit test for 4313
2019-09-27 20:57:13 +02:00 · 2019-09-27 20:57:13 +02:00 · 22b9e12159
parent 3906785b49
commit 22b9e12159
5 changed files with 165 additions and 22 deletions
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -180,21 +180,28 @@ class EntityRuler(object):

        DOCS: https://spacy.io/api/entityruler#add_patterns
        """
-        for entry in patterns:
-            label = entry["label"]
-            if "id" in entry:
-                label = self._create_label(label, entry["id"])
-            pattern = entry["pattern"]
-            if isinstance(pattern, basestring_):
-                self.phrase_patterns[label].append(self.nlp(pattern))
-            elif isinstance(pattern, list):
-                self.token_patterns[label].append(pattern)
-            else:
-                raise ValueError(Errors.E097.format(pattern=pattern))
-        for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, None, *patterns)
-        for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, None, *patterns)
+        # Disable the pipeline components after this one, in case they haven't been initialized / deserialised yet.
+        try:
+            current_index = self.nlp.pipe_names.index(self.name)
+            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
+        except ValueError:
+            subsequent_pipes = []
+        with self.nlp.disable_pipes(*subsequent_pipes):
+            for entry in patterns:
+                label = entry["label"]
+                if "id" in entry:
+                    label = self._create_label(label, entry["id"])
+                pattern = entry["pattern"]
+                if isinstance(pattern, basestring_):
+                    self.phrase_patterns[label].append(self.nlp(pattern))
+                elif isinstance(pattern, list):
+                    self.token_patterns[label].append(pattern)
+                else:
+                    raise ValueError(Errors.E097.format(pattern=pattern))
+            for label, patterns in self.token_patterns.items():
+                self.matcher.add(label, None, *patterns)
+            for label, patterns in self.phrase_patterns.items():
+                self.phrase_matcher.add(label, None, *patterns)

    def _split_label(self, label):
        """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -163,10 +163,16 @@ cdef class Parser:
            added = self.moves.add_action(action, label)
            if added:
                resized = True
-        if resized and "nr_class" in self.cfg:
+        if resized:
+            self._resize()
+
+    def _resize(self):
+        if "nr_class" in self.cfg:
            self.cfg["nr_class"] = self.moves.n_moves
-        if self.model not in (True, False, None) and resized:
+        if self.model not in (True, False, None):
            self.model.resize_output(self.moves.n_moves)
+        if self._rehearsal_model not in (True, False, None):
+            self._rehearsal_model.resize_output(self.moves.n_moves)

    def add_multitask_objective(self, target):
        # Defined in subclasses, to avoid circular import
@@ -237,7 +243,9 @@ cdef class Parser:
        if isinstance(docs, Doc):
            docs = [docs]
        if not any(len(doc) for doc in docs):
-            return self.moves.init_batch(docs)
+            result = self.moves.init_batch(docs)
+            self._resize()
+            return result
        if beam_width < 2:
            return self.greedy_parse(docs, drop=drop)
        else:
@@ -251,7 +259,7 @@ cdef class Parser:
        # This is pretty dirty, but the NER can resize itself in init_batch,
        # if labels are missing. We therefore have to check whether we need to
        # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
        model = self.model(docs)
        weights = get_c_weights(model)
        for state in batch:
@@ -271,7 +279,7 @@ cdef class Parser:
        # This is pretty dirty, but the NER can resize itself in init_batch,
        # if labels are missing. We therefore have to check whether we need to
        # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
        model = self.model(docs)
        token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
                                 dtype='i', order='C')
@@ -445,8 +453,7 @@ cdef class Parser:
        # This is pretty dirty, but the NER can resize itself in init_batch,
        # if labels are missing. We therefore have to check whether we need to
        # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
-        self._rehearsal_model.resize_output(self.moves.n_moves)
+        self._resize()
        # Prepare the stepwise model, and get the callback for finishing the batch
        tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
        model, finish_update = self.model.begin_update(docs, drop=0.0)
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -63,6 +63,13 @@ cdef class TransitionSystem:
        cdef Doc doc
        beams = []
        cdef int offset = 0
+
+        # Doc objects might contain labels that we need to register actions for. We need to check for that
+        # *before* we create any Beam objects, because the Beam object needs the correct number of
+        # actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions,
+        # and it doesn't matter that we create and discard the state objects.
+        self.init_batch(docs)
+
        for doc in docs:
            beam = Beam(self.n_moves, beam_width, min_density=beam_density)
            beam.initialize(self.init_beam_state, doc.length, doc.c)
--- a/spacy/tests/regression/test_issue4042.py
+++ b/spacy/tests/regression/test_issue4042.py
@@ -0,0 +1,83 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy
+
+from spacy.pipeline import EntityRecognizer, EntityRuler
+
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+from spacy.tokens import Span
+from spacy.util import ensure_path
+
+
+def test_issue4042():
+    """Test that serialization of an EntityRuler before NER works fine."""
+    nlp = English()
+
+    # add ner pipe
+    ner = nlp.create_pipe("ner")
+    ner.add_label("SOME_LABEL")
+    nlp.add_pipe(ner)
+    nlp.begin_training()
+
+    # Add entity ruler
+    ruler = EntityRuler(nlp)
+    patterns = [
+        {"label": "MY_ORG", "pattern": "Apple"},
+        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
+    ]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
+    doc1 = nlp("What do you think about Apple ?")
+    assert doc1.ents[0].label_ == "MY_ORG"
+
+    with make_tempdir() as d:
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+
+        nlp2 = spacy.load(output_dir)
+        doc2 = nlp2("What do you think about Apple ?")
+        assert doc2.ents[0].label_ == "MY_ORG"
+
+
+def test_issue4042_bug2():
+    """
+    Test that serialization of an NER works fine when new labels were added.
+    This is the second of the two bugs underlying issue 4042.
+    """
+    nlp1 = English()
+    vocab = nlp1.vocab
+
+    # add ner pipe
+    ner1 = nlp1.create_pipe("ner")
+    ner1.add_label("SOME_LABEL")
+    nlp1.add_pipe(ner1)
+    nlp1.begin_training()
+
+    # add a new label to the doc
+    doc1 = nlp1("What do you think about Apple ?")
+    assert len(ner1.labels) == 1
+    assert "SOME_LABEL" in ner1.labels
+    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
+    doc1.ents = list(doc1.ents) + [apple_ent]
+
+    # reapply the NER - at this point it should resize itself
+    ner1(doc1)
+    assert len(ner1.labels) == 2
+    assert "SOME_LABEL" in ner1.labels
+    assert "MY_ORG" in ner1.labels
+
+    with make_tempdir() as d:
+        # assert IO goes fine
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        ner1.to_disk(output_dir)
+
+        nlp2 = English(vocab)
+        ner2 = EntityRecognizer(vocab)
+        ner2.from_disk(output_dir)
+        assert len(ner2.labels) == 2
--- a/spacy/tests/regression/test_issue4313.py
+++ b/spacy/tests/regression/test_issue4313.py
@@ -0,0 +1,39 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import defaultdict
+
+from spacy.pipeline import EntityRecognizer
+
+from spacy.lang.en import English
+from spacy.tokens import Span
+
+
+def test_issue4313():
+    """Test that beam_parse does not crash on a doc with a newly added entity label."""
+    beam_width = 16
+    beam_density = 0.0001
+    nlp = English()
+    ner = EntityRecognizer(nlp.vocab)
+    ner.add_label("SOME_LABEL")
+    ner.begin_training([])
+    nlp.add_pipe(ner)
+
+    # add a new label to the doc
+    doc = nlp("What do you think about Apple ?")
+    assert len(ner.labels) == 1
+    assert "SOME_LABEL" in ner.labels
+    apple_ent = Span(doc, 5, 6, label="MY_ORG")
+    doc.ents = list(doc.ents) + [apple_ent]
+
+    # ensure the beam_parse still works with the new label
+    docs = [doc]
+    beams = nlp.entity.beam_parse(
+        docs, beam_width=beam_width, beam_density=beam_density
+    )
+
+    for doc, beam in zip(docs, beams):
+        entity_scores = defaultdict(float)
+        for score, ents in nlp.entity.moves.get_beam_parses(beam):
+            for start, end, label in ents:
+                entity_scores[(start, end, label)] += score