From 22b9e1215932f9111c49d4ec5e1ccd25bab1fc3c Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 27 Sep 2019 20:57:13 +0200 Subject: [PATCH] Ensure the NER remains consistent after resizing (#4330) * test and fix for second bug of issue 4042 * fix for first bug in 4042 * crashing test for Issue 4313 * forgot one instance of resize * remove prints * undo uncomment * delete test for 4313 (uses third party lib) * add fix for Issue 4313 * unit test for 4313 --- spacy/pipeline/entityruler.py | 37 ++++++----- spacy/syntax/nn_parser.pyx | 21 ++++-- spacy/syntax/transition_system.pyx | 7 ++ spacy/tests/regression/test_issue4042.py | 83 ++++++++++++++++++++++++ spacy/tests/regression/test_issue4313.py | 39 +++++++++++ 5 files changed, 165 insertions(+), 22 deletions(-) create mode 100644 spacy/tests/regression/test_issue4042.py create mode 100644 spacy/tests/regression/test_issue4313.py diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index a1d3f922e..956d67291 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -180,21 +180,28 @@ class EntityRuler(object): DOCS: https://spacy.io/api/entityruler#add_patterns """ - for entry in patterns: - label = entry["label"] - if "id" in entry: - label = self._create_label(label, entry["id"]) - pattern = entry["pattern"] - if isinstance(pattern, basestring_): - self.phrase_patterns[label].append(self.nlp(pattern)) - elif isinstance(pattern, list): - self.token_patterns[label].append(pattern) - else: - raise ValueError(Errors.E097.format(pattern=pattern)) - for label, patterns in self.token_patterns.items(): - self.matcher.add(label, None, *patterns) - for label, patterns in self.phrase_patterns.items(): - self.phrase_matcher.add(label, None, *patterns) + # disable the nlp components after this one in case they hadn't been initialized / deserialised yet + try: + current_index = self.nlp.pipe_names.index(self.name) + subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]] + except ValueError: + subsequent_pipes = [] + with self.nlp.disable_pipes(*subsequent_pipes): + for entry in patterns: + label = entry["label"] + if "id" in entry: + label = self._create_label(label, entry["id"]) + pattern = entry["pattern"] + if isinstance(pattern, basestring_): + self.phrase_patterns[label].append(self.nlp(pattern)) + elif isinstance(pattern, list): + self.token_patterns[label].append(pattern) + else: + raise ValueError(Errors.E097.format(pattern=pattern)) + for label, patterns in self.token_patterns.items(): + self.matcher.add(label, None, *patterns) + for label, patterns in self.phrase_patterns.items(): + self.phrase_matcher.add(label, None, *patterns) def _split_label(self, label): """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx index bddc29527..aeb4a5306 100644 --- a/spacy/syntax/nn_parser.pyx +++ b/spacy/syntax/nn_parser.pyx @@ -163,10 +163,16 @@ cdef class Parser: added = self.moves.add_action(action, label) if added: resized = True - if resized and "nr_class" in self.cfg: + if resized: + self._resize() + + def _resize(self): + if "nr_class" in self.cfg: self.cfg["nr_class"] = self.moves.n_moves - if self.model not in (True, False, None) and resized: + if self.model not in (True, False, None): self.model.resize_output(self.moves.n_moves) + if self._rehearsal_model not in (True, False, None): + self._rehearsal_model.resize_output(self.moves.n_moves) def add_multitask_objective(self, target): # Defined in subclasses, to avoid circular import @@ -237,7 +243,9 @@ cdef class Parser: if isinstance(docs, Doc): docs = [docs] if not any(len(doc) for doc in docs): - return self.moves.init_batch(docs) + result = self.moves.init_batch(docs) + self._resize() + return result if beam_width < 2: return self.greedy_parse(docs, drop=drop) else: @@ -251,7 +259,7 @@ cdef class Parser: # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. - self.model.resize_output(self.moves.n_moves) + self._resize() model = self.model(docs) weights = get_c_weights(model) for state in batch: @@ -271,7 +279,7 @@ cdef class Parser: # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. - self.model.resize_output(self.moves.n_moves) + self._resize() model = self.model(docs) token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature), dtype='i', order='C') @@ -445,8 +453,7 @@ cdef class Parser: # This is pretty dirty, but the NER can resize itself in init_batch, # if labels are missing. We therefore have to check whether we need to # expand our model output. - self.model.resize_output(self.moves.n_moves) - self._rehearsal_model.resize_output(self.moves.n_moves) + self._resize() # Prepare the stepwise model, and get the callback for finishing the batch tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0) model, finish_update = self.model.begin_update(docs, drop=0.0) diff --git a/spacy/syntax/transition_system.pyx b/spacy/syntax/transition_system.pyx index fede704b5..58b3a6993 100644 --- a/spacy/syntax/transition_system.pyx +++ b/spacy/syntax/transition_system.pyx @@ -63,6 +63,13 @@ cdef class TransitionSystem: cdef Doc doc beams = [] cdef int offset = 0 + + # Doc objects might contain labels that we need to register actions for. We need to check for that + # *before* we create any Beam objects, because the Beam object needs the correct number of + # actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions, + # and it doesn't matter that we create and discard the state objects. + self.init_batch(docs) + for doc in docs: beam = Beam(self.n_moves, beam_width, min_density=beam_density) beam.initialize(self.init_beam_state, doc.length, doc.c) diff --git a/spacy/tests/regression/test_issue4042.py b/spacy/tests/regression/test_issue4042.py new file mode 100644 index 000000000..73872df0b --- /dev/null +++ b/spacy/tests/regression/test_issue4042.py @@ -0,0 +1,83 @@ +# coding: utf8 +from __future__ import unicode_literals + +import spacy + +from spacy.pipeline import EntityRecognizer, EntityRuler + +from spacy.lang.en import English +from spacy.tests.util import make_tempdir +from spacy.tokens import Span +from spacy.util import ensure_path + + +def test_issue4042(): + """Test that serialization of an EntityRuler before NER works fine.""" + nlp = English() + + # add ner pipe + ner = nlp.create_pipe("ner") + ner.add_label("SOME_LABEL") + nlp.add_pipe(ner) + nlp.begin_training() + + # Add entity ruler + ruler = EntityRuler(nlp) + patterns = [ + {"label": "MY_ORG", "pattern": "Apple"}, + {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]}, + ] + ruler.add_patterns(patterns) + nlp.add_pipe(ruler, before="ner") # works fine with "after" + doc1 = nlp("What do you think about Apple ?") + assert doc1.ents[0].label_ == "MY_ORG" + + with make_tempdir() as d: + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + + nlp2 = spacy.load(output_dir) + doc2 = nlp2("What do you think about Apple ?") + assert doc2.ents[0].label_ == "MY_ORG" + + +def test_issue4042_bug2(): + """ + Test that serialization of an NER works fine when new labels were added. + This is the second bug of two bugs underlying the issue 4042. + """ + nlp1 = English() + vocab = nlp1.vocab + + # add ner pipe + ner1 = nlp1.create_pipe("ner") + ner1.add_label("SOME_LABEL") + nlp1.add_pipe(ner1) + nlp1.begin_training() + + # add a new label to the doc + doc1 = nlp1("What do you think about Apple ?") + assert len(ner1.labels) == 1 + assert "SOME_LABEL" in ner1.labels + apple_ent = Span(doc1, 5, 6, label="MY_ORG") + doc1.ents = list(doc1.ents) + [apple_ent] + + # reapply the NER - at this point it should resize itself + ner1(doc1) + assert len(ner1.labels) == 2 + assert "SOME_LABEL" in ner1.labels + assert "MY_ORG" in ner1.labels + + with make_tempdir() as d: + # assert IO goes fine + output_dir = ensure_path(d) + if not output_dir.exists(): + output_dir.mkdir() + ner1.to_disk(output_dir) + + nlp2 = English(vocab) + ner2 = EntityRecognizer(vocab) + ner2.from_disk(output_dir) + assert len(ner2.labels) == 2 diff --git a/spacy/tests/regression/test_issue4313.py b/spacy/tests/regression/test_issue4313.py new file mode 100644 index 000000000..c68f745a7 --- /dev/null +++ b/spacy/tests/regression/test_issue4313.py @@ -0,0 +1,39 @@ +# coding: utf8 +from __future__ import unicode_literals + +from collections import defaultdict + +from spacy.pipeline import EntityRecognizer + +from spacy.lang.en import English +from spacy.tokens import Span + + +def test_issue4313(): + """ This should not crash or exit with some strange error code """ + beam_width = 16 + beam_density = 0.0001 + nlp = English() + ner = EntityRecognizer(nlp.vocab) + ner.add_label("SOME_LABEL") + ner.begin_training([]) + nlp.add_pipe(ner) + + # add a new label to the doc + doc = nlp("What do you think about Apple ?") + assert len(ner.labels) == 1 + assert "SOME_LABEL" in ner.labels + apple_ent = Span(doc, 5, 6, label="MY_ORG") + doc.ents = list(doc.ents) + [apple_ent] + + # ensure the beam_parse still works with the new label + docs = [doc] + beams = nlp.entity.beam_parse( + docs, beam_width=beam_width, beam_density=beam_density + ) + + for doc, beam in zip(docs, beams): + entity_scores = defaultdict(float) + for score, ents in nlp.entity.moves.get_beam_parses(beam): + for start, end, label in ents: + entity_scores[(start, end, label)] += score