Ensure the NER remains consistent after resizing (#4330)

* test and fix for second bug of issue 4042

* fix for first bug in 4042

* crashing test for Issue 4313

* forgot one instance of resize

* remove prints

* undo uncomment

* delete test for 4313 (uses third party lib)

* add fix for Issue 4313

* unit test for 4313
This commit is contained in:
Sofie Van Landeghem 2019-09-27 20:57:13 +02:00 committed by Matthew Honnibal
parent 3906785b49
commit 22b9e12159
5 changed files with 165 additions and 22 deletions

View File

@ -180,21 +180,28 @@ class EntityRuler(object):
DOCS: https://spacy.io/api/entityruler#add_patterns
"""
for entry in patterns:
label = entry["label"]
if "id" in entry:
label = self._create_label(label, entry["id"])
pattern = entry["pattern"]
if isinstance(pattern, basestring_):
self.phrase_patterns[label].append(self.nlp(pattern))
elif isinstance(pattern, list):
self.token_patterns[label].append(pattern)
else:
raise ValueError(Errors.E097.format(pattern=pattern))
for label, patterns in self.token_patterns.items():
self.matcher.add(label, None, *patterns)
for label, patterns in self.phrase_patterns.items():
self.phrase_matcher.add(label, None, *patterns)
# disable the nlp components after this one in case they hadn't been initialized / deserialised yet
try:
current_index = self.nlp.pipe_names.index(self.name)
subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
except ValueError:
subsequent_pipes = []
with self.nlp.disable_pipes(*subsequent_pipes):
for entry in patterns:
label = entry["label"]
if "id" in entry:
label = self._create_label(label, entry["id"])
pattern = entry["pattern"]
if isinstance(pattern, basestring_):
self.phrase_patterns[label].append(self.nlp(pattern))
elif isinstance(pattern, list):
self.token_patterns[label].append(pattern)
else:
raise ValueError(Errors.E097.format(pattern=pattern))
for label, patterns in self.token_patterns.items():
self.matcher.add(label, None, *patterns)
for label, patterns in self.phrase_patterns.items():
self.phrase_matcher.add(label, None, *patterns)
def _split_label(self, label):
"""Split Entity label into ent_label and ent_id if it contains self.ent_id_sep

View File

@ -163,10 +163,16 @@ cdef class Parser:
added = self.moves.add_action(action, label)
if added:
resized = True
if resized and "nr_class" in self.cfg:
if resized:
self._resize()
def _resize(self):
    """Bring the stored class count and the model output sizes in line with
    the current number of transition actions (``self.moves.n_moves``).

    Updates ``self.cfg["nr_class"]`` when that key is present, and calls
    ``resize_output`` on ``self.model`` and ``self._rehearsal_model`` unless
    the attribute is ``True``, ``False`` or ``None``.
    """
    if "nr_class" in self.cfg:
        self.cfg["nr_class"] = self.moves.n_moves
    # NOTE(review): True / False / None are presumably placeholders for a
    # model that has not been created yet -- confirm against Parser.__init__.
    if self.model not in (True, False, None):
        self.model.resize_output(self.moves.n_moves)
    if self._rehearsal_model not in (True, False, None):
        self._rehearsal_model.resize_output(self.moves.n_moves)
def add_multitask_objective(self, target):
# Defined in subclasses, to avoid circular import
@ -237,7 +243,9 @@ cdef class Parser:
if isinstance(docs, Doc):
docs = [docs]
if not any(len(doc) for doc in docs):
return self.moves.init_batch(docs)
result = self.moves.init_batch(docs)
self._resize()
return result
if beam_width < 2:
return self.greedy_parse(docs, drop=drop)
else:
@ -251,7 +259,7 @@ cdef class Parser:
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self.model.resize_output(self.moves.n_moves)
self._resize()
model = self.model(docs)
weights = get_c_weights(model)
for state in batch:
@ -271,7 +279,7 @@ cdef class Parser:
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self.model.resize_output(self.moves.n_moves)
self._resize()
model = self.model(docs)
token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
dtype='i', order='C')
@ -445,8 +453,7 @@ cdef class Parser:
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
self.model.resize_output(self.moves.n_moves)
self._rehearsal_model.resize_output(self.moves.n_moves)
self._resize()
# Prepare the stepwise model, and get the callback for finishing the batch
tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
model, finish_update = self.model.begin_update(docs, drop=0.0)

View File

@ -63,6 +63,13 @@ cdef class TransitionSystem:
cdef Doc doc
beams = []
cdef int offset = 0
# Doc objects might contain labels that we need to register actions for. We need to check for that
# *before* we create any Beam objects, because the Beam object needs the correct number of
# actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions,
# and it doesn't matter that we create and discard the state objects.
self.init_batch(docs)
for doc in docs:
beam = Beam(self.n_moves, beam_width, min_density=beam_density)
beam.initialize(self.init_beam_state, doc.length, doc.c)

View File

@ -0,0 +1,83 @@
# coding: utf8
from __future__ import unicode_literals
import spacy
from spacy.pipeline import EntityRecognizer, EntityRuler
from spacy.lang.en import English
from spacy.tests.util import make_tempdir
from spacy.tokens import Span
from spacy.util import ensure_path
def test_issue4042():
    """Check that a pipeline with an EntityRuler placed before the NER still
    labels entities correctly after a to_disk / spacy.load round trip."""
    text = "What do you think about Apple ?"
    nlp = English()

    # NER component with a single label, initialised via begin_training
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Entity ruler with one string pattern and one token-list pattern,
    # inserted ahead of the NER in the pipeline
    ruler = EntityRuler(nlp)
    ruler.add_patterns(
        [
            {"label": "MY_ORG", "pattern": "Apple"},
            {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
        ]
    )
    nlp.add_pipe(ruler, before="ner")  # works fine with "after"

    original_doc = nlp(text)
    assert original_doc.ents[0].label_ == "MY_ORG"

    with make_tempdir() as d:
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)

        # The reloaded pipeline must produce the same ruler-assigned label
        reloaded_nlp = spacy.load(output_dir)
        reloaded_doc = reloaded_nlp(text)
        assert reloaded_doc.ents[0].label_ == "MY_ORG"
def test_issue4042_bug2():
    """
    Test that serialization of an NER works fine when new labels were added.
    This is the second bug of two bugs underlying the issue 4042.
    """
    nlp1 = English()
    vocab = nlp1.vocab

    # add ner pipe
    ner1 = nlp1.create_pipe("ner")
    ner1.add_label("SOME_LABEL")
    nlp1.add_pipe(ner1)
    nlp1.begin_training()

    # add a new label to the doc
    doc1 = nlp1("What do you think about Apple ?")
    assert len(ner1.labels) == 1
    assert "SOME_LABEL" in ner1.labels
    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
    doc1.ents = list(doc1.ents) + [apple_ent]

    # reapply the NER - at this point it should resize itself
    ner1(doc1)
    assert len(ner1.labels) == 2
    assert "SOME_LABEL" in ner1.labels
    assert "MY_ORG" in ner1.labels

    with make_tempdir() as d:
        # assert IO goes fine
        output_dir = ensure_path(d)
        if not output_dir.exists():
            output_dir.mkdir()
        ner1.to_disk(output_dir)

        # A fresh component sharing the vocab must come back with the same
        # labels that were present when the NER was saved.
        ner2 = EntityRecognizer(vocab)
        ner2.from_disk(output_dir)
        assert len(ner2.labels) == 2
        assert "SOME_LABEL" in ner2.labels
        assert "MY_ORG" in ner2.labels

View File

@ -0,0 +1,39 @@
# coding: utf8
from __future__ import unicode_literals
from collections import defaultdict
from spacy.pipeline import EntityRecognizer
from spacy.lang.en import English
from spacy.tokens import Span
def test_issue4313():
    """Beam-parsing a doc that carries an entity label the NER does not know
    yet should run through without crashing."""
    nlp = English()
    ner = EntityRecognizer(nlp.vocab)
    ner.add_label("SOME_LABEL")
    ner.begin_training([])
    nlp.add_pipe(ner)

    # Attach an entity whose label ("MY_ORG") is not among the NER's labels
    doc = nlp("What do you think about Apple ?")
    assert len(ner.labels) == 1
    assert "SOME_LABEL" in ner.labels
    new_ent = Span(doc, 5, 6, label="MY_ORG")
    doc.ents = list(doc.ents) + [new_ent]

    # Run the beam parse over the doc with the extra label
    docs = [doc]
    beams = nlp.entity.beam_parse(docs, beam_width=16, beam_density=0.0001)

    # Walk every parse in every beam and accumulate the score per
    # (start, end, label) entity; nothing is asserted on the totals.
    for _, beam in zip(docs, beams):
        entity_scores = defaultdict(float)
        for score, ents in nlp.entity.moves.get_beam_parses(beam):
            for start, end, label in ents:
                entity_scores[(start, end, label)] += score