From 22b9e1215932f9111c49d4ec5e1ccd25bab1fc3c Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Fri, 27 Sep 2019 20:57:13 +0200
Subject: [PATCH] Ensure the NER remains consistent after resizing (#4330)

* test and fix for second bug of issue 4042

* fix for first bug in 4042

* crashing test for Issue 4313

* forgot one instance of resize

* remove prints

* undo uncomment

* delete test for 4313 (uses third party lib)

* add fix for Issue 4313

* unit test for 4313
---
 spacy/pipeline/entityruler.py            | 37 ++++++-----
 spacy/syntax/nn_parser.pyx               | 21 ++++--
 spacy/syntax/transition_system.pyx       |  7 ++
 spacy/tests/regression/test_issue4042.py | 83 ++++++++++++++++++++++++
 spacy/tests/regression/test_issue4313.py | 39 +++++++++++
 5 files changed, 165 insertions(+), 22 deletions(-)
 create mode 100644 spacy/tests/regression/test_issue4042.py
 create mode 100644 spacy/tests/regression/test_issue4313.py

diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index a1d3f922e..956d67291 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -180,21 +180,28 @@ class EntityRuler(object):
 
         DOCS: https://spacy.io/api/entityruler#add_patterns
         """
-        for entry in patterns:
-            label = entry["label"]
-            if "id" in entry:
-                label = self._create_label(label, entry["id"])
-            pattern = entry["pattern"]
-            if isinstance(pattern, basestring_):
-                self.phrase_patterns[label].append(self.nlp(pattern))
-            elif isinstance(pattern, list):
-                self.token_patterns[label].append(pattern)
-            else:
-                raise ValueError(Errors.E097.format(pattern=pattern))
-        for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, None, *patterns)
-        for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, None, *patterns)
+        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
+        try:
+            current_index = self.nlp.pipe_names.index(self.name)
+            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
+        except ValueError:
+            subsequent_pipes = []
+        with self.nlp.disable_pipes(*subsequent_pipes):
+            for entry in patterns:
+                label = entry["label"]
+                if "id" in entry:
+                    label = self._create_label(label, entry["id"])
+                pattern = entry["pattern"]
+                if isinstance(pattern, basestring_):
+                    self.phrase_patterns[label].append(self.nlp(pattern))
+                elif isinstance(pattern, list):
+                    self.token_patterns[label].append(pattern)
+                else:
+                    raise ValueError(Errors.E097.format(pattern=pattern))
+            for label, patterns in self.token_patterns.items():
+                self.matcher.add(label, None, *patterns)
+            for label, patterns in self.phrase_patterns.items():
+                self.phrase_matcher.add(label, None, *patterns)
 
     def _split_label(self, label):
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index bddc29527..aeb4a5306 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -163,10 +163,16 @@ cdef class Parser:
             added = self.moves.add_action(action, label)
             if added:
                 resized = True
-        if resized and "nr_class" in self.cfg:
+        if resized:
+            self._resize()
+
+    def _resize(self):
+        if "nr_class" in self.cfg:
             self.cfg["nr_class"] = self.moves.n_moves
-        if self.model not in (True, False, None) and resized:
+        if self.model not in (True, False, None):
             self.model.resize_output(self.moves.n_moves)
+        if self._rehearsal_model not in (True, False, None):
+            self._rehearsal_model.resize_output(self.moves.n_moves)
 
     def add_multitask_objective(self, target):
         # Defined in subclasses, to avoid circular import
@@ -237,7 +243,9 @@ cdef class Parser:
         if isinstance(docs, Doc):
             docs = [docs]
         if not any(len(doc) for doc in docs):
-            return self.moves.init_batch(docs)
+            result = self.moves.init_batch(docs)
+            self._resize()
+            return result
         if beam_width < 2:
             return self.greedy_parse(docs, drop=drop)
         else:
@@ -251,7 +259,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
         model = self.model(docs)
         weights = get_c_weights(model)
         for state in batch:
@@ -271,7 +279,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
         model = self.model(docs)
         token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
                                  dtype='i', order='C')
@@ -445,8 +453,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
-        self._rehearsal_model.resize_output(self.moves.n_moves)
+        self._resize()
         # Prepare the stepwise model, and get the callback for finishing the batch
         tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
         model, finish_update = self.model.begin_update(docs, drop=0.0)
diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx
index fede704b5..58b3a6993 100644
--- a/spacy/syntax/transition_system.pyx
+++ b/spacy/syntax/transition_system.pyx
@@ -63,6 +63,13 @@ cdef class TransitionSystem:
         cdef Doc doc
         beams = []
         cdef int offset = 0
+
+        # Doc objects might contain labels that we need to register actions for. We need to check for that
+        # *before* we create any Beam objects, because the Beam object needs the correct number of
+        # actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions,
+        # and it doesn't matter that we create and discard the state objects.
+        self.init_batch(docs)
+
         for doc in docs:
             beam = Beam(self.n_moves, beam_width, min_density=beam_density)
             beam.initialize(self.init_beam_state, doc.length, doc.c)
diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py
new file mode 100644
index 000000000..73872df0b
--- /dev/null
+++ b/spacy/tests/regression/test_issue4042.py
@@ -0,0 +1,83 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy
+
+from spacy.pipeline import EntityRecognizer, EntityRuler
+
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+from spacy.tokens import Span
+from spacy.util import ensure_path
+
+
+def test_issue4042():
+    """Test that serialization of an EntityRuler before NER works fine."""
+    nlp = English()
+
+    # add ner pipe
+    ner = nlp.create_pipe("ner")
+    ner.add_label("SOME_LABEL")
+    nlp.add_pipe(ner)
+    nlp.begin_training()
+
+    # Add entity ruler
+    ruler = EntityRuler(nlp)
+    patterns = [
+        {"label": "MY_ORG", "pattern": "Apple"},
+        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
+    ]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
+    doc1 = nlp("What do you think about Apple ?")
+    assert doc1.ents[0].label_ == "MY_ORG"
+
+    with make_tempdir() as d:
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+
+        nlp2 = spacy.load(output_dir)
+        doc2 = nlp2("What do you think about Apple ?")
+        assert doc2.ents[0].label_ == "MY_ORG"
+
+
+def test_issue4042_bug2():
+    """
+    Test that serialization of an NER works fine when new labels were added.
+    This is the second bug of two bugs underlying the issue 4042.
+    """
+    nlp1 = English()
+    vocab = nlp1.vocab
+
+    # add ner pipe
+    ner1 = nlp1.create_pipe("ner")
+    ner1.add_label("SOME_LABEL")
+    nlp1.add_pipe(ner1)
+    nlp1.begin_training()
+
+    # add a new label to the doc
+    doc1 = nlp1("What do you think about Apple ?")
+    assert len(ner1.labels) == 1
+    assert "SOME_LABEL" in ner1.labels
+    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
+    doc1.ents = list(doc1.ents) + [apple_ent]
+
+    # reapply the NER - at this point it should resize itself
+    ner1(doc1)
+    assert len(ner1.labels) == 2
+    assert "SOME_LABEL" in ner1.labels
+    assert "MY_ORG" in ner1.labels
+
+    with make_tempdir() as d:
+        # assert IO goes fine
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        ner1.to_disk(output_dir)
+
+        nlp2 = English(vocab)
+        ner2 = EntityRecognizer(vocab)
+        ner2.from_disk(output_dir)
+        assert len(ner2.labels) == 2
diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py
new file mode 100644
index 000000000..c68f745a7
--- /dev/null
+++ b/spacy/tests/regression/test_issue4313.py
@@ -0,0 +1,39 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import defaultdict
+
+from spacy.pipeline import EntityRecognizer
+
+from spacy.lang.en import English
+from spacy.tokens import Span
+
+
+def test_issue4313():
+    """ This should not crash or exit with some strange error code """
+    beam_width = 16
+    beam_density = 0.0001
+    nlp = English()
+    ner = EntityRecognizer(nlp.vocab)
+    ner.add_label("SOME_LABEL")
+    ner.begin_training([])
+    nlp.add_pipe(ner)
+
+    # add a new label to the doc
+    doc = nlp("What do you think about Apple ?")
+    assert len(ner.labels) == 1
+    assert "SOME_LABEL" in ner.labels
+    apple_ent = Span(doc, 5, 6, label="MY_ORG")
+    doc.ents = list(doc.ents) + [apple_ent]
+
+    # ensure the beam_parse still works with the new label
+    docs = [doc]
+    beams = nlp.entity.beam_parse(
+        docs, beam_width=beam_width, beam_density=beam_density
+    )
+
+    for doc, beam in zip(docs, beams):
+        entity_scores = defaultdict(float)
+        for score, ents in nlp.entity.moves.get_beam_parses(beam):
+            for start, end, label in ents:
+                entity_scores[(start, end, label)] += score