mirror of https://github.com/explosion/spaCy.git
Ensure the NER remains consistent after resizing (#4330)
* test and fix for second bug of issue 4042
* fix for first bug in 4042
* crashing test for Issue 4313
* forgot one instance of resize
* remove prints
* undo uncomment
* delete test for 4313 (uses third party lib)
* add fix for Issue 4313
* unit test for 4313
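For orientation (not part of the commit), here is a minimal sketch of the scenario the title describes, using the same spaCy v2.x calls as the regression tests added below: train an NER, attach an entity with a label the model has never seen, and reapply the NER, which must now resize itself consistently instead of crashing.

    # Minimal repro sketch (spaCy v2.x API, mirroring the tests in this commit).
    from spacy.lang.en import English
    from spacy.tokens import Span

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("SOME_LABEL")
    nlp.add_pipe(ner)
    nlp.begin_training()

    # Attach an entity with a label the NER was never trained on ...
    doc = nlp("What do you think about Apple ?")
    doc.ents = list(doc.ents) + [Span(doc, 5, 6, label="MY_ORG")]

    # ... then reapply the NER: it should resize itself to cover the new label.
    ner(doc)
    assert "MY_ORG" in ner.labels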
This commit is contained in:
parent 3906785b49
commit 22b9e12159
spacy/pipeline/entityruler.py
@@ -180,21 +180,28 @@ class EntityRuler(object):
         DOCS: https://spacy.io/api/entityruler#add_patterns
         """
-        for entry in patterns:
-            label = entry["label"]
-            if "id" in entry:
-                label = self._create_label(label, entry["id"])
-            pattern = entry["pattern"]
-            if isinstance(pattern, basestring_):
-                self.phrase_patterns[label].append(self.nlp(pattern))
-            elif isinstance(pattern, list):
-                self.token_patterns[label].append(pattern)
-            else:
-                raise ValueError(Errors.E097.format(pattern=pattern))
-        for label, patterns in self.token_patterns.items():
-            self.matcher.add(label, None, *patterns)
-        for label, patterns in self.phrase_patterns.items():
-            self.phrase_matcher.add(label, None, *patterns)
+        # disable the nlp components after this one in case they hadn't been initialized / deserialised yet
+        try:
+            current_index = self.nlp.pipe_names.index(self.name)
+            subsequent_pipes = [pipe for pipe in self.nlp.pipe_names[current_index + 1:]]
+        except ValueError:
+            subsequent_pipes = []
+        with self.nlp.disable_pipes(*subsequent_pipes):
+            for entry in patterns:
+                label = entry["label"]
+                if "id" in entry:
+                    label = self._create_label(label, entry["id"])
+                pattern = entry["pattern"]
+                if isinstance(pattern, basestring_):
+                    self.phrase_patterns[label].append(self.nlp(pattern))
+                elif isinstance(pattern, list):
+                    self.token_patterns[label].append(pattern)
+                else:
+                    raise ValueError(Errors.E097.format(pattern=pattern))
+            for label, patterns in self.token_patterns.items():
+                self.matcher.add(label, None, *patterns)
+            for label, patterns in self.phrase_patterns.items():
+                self.phrase_matcher.add(label, None, *patterns)

     def _split_label(self, label):
         """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep
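Why the wrapper: phrase patterns are compiled by running the text through self.nlp(pattern), which would also execute every component after the ruler, including an NER that has not been initialized or deserialized yet. A minimal sketch of the same disable-downstream-pipes idiom, assuming the spaCy v2.x disable_pipes context manager (the component name "entity_ruler" is illustrative):

    from spacy.lang.en import English

    nlp = English()
    current = "entity_ruler"  # hypothetical name of the current component
    try:
        idx = nlp.pipe_names.index(current)
        later = nlp.pipe_names[idx + 1:]
    except ValueError:
        later = []  # not in the pipeline yet, e.g. during deserialization
    with nlp.disable_pipes(*later):
        doc = nlp("San Francisco")  # only components up to `current` run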
spacy/syntax/nn_parser.pyx
@@ -163,10 +163,16 @@ cdef class Parser:
             added = self.moves.add_action(action, label)
             if added:
                 resized = True
-        if resized and "nr_class" in self.cfg:
+        if resized:
+            self._resize()
+
+    def _resize(self):
+        if "nr_class" in self.cfg:
             self.cfg["nr_class"] = self.moves.n_moves
-        if self.model not in (True, False, None) and resized:
+        if self.model not in (True, False, None):
             self.model.resize_output(self.moves.n_moves)
+        if self._rehearsal_model not in (True, False, None):
+            self._rehearsal_model.resize_output(self.moves.n_moves)

     def add_multitask_objective(self, target):
         # Defined in subclasses, to avoid circular import
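The refactor above funnels every output resize through a single _resize() helper and, new in this commit, also resizes the rehearsal model. A rough illustration of the code path it serves, assuming the spaCy v2.x API (the label names are arbitrary):

    from spacy.lang.en import English

    nlp = English()
    ner = nlp.create_pipe("ner")
    ner.add_label("ORG")
    nlp.add_pipe(ner)
    nlp.begin_training()

    assert len(ner.labels) == 1
    # Adding a label after training goes through add_action -> resized,
    # which now calls _resize() so the model output grows to match.
    ner.add_label("PRODUCT")
    assert len(ner.labels) == 2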
@@ -237,7 +243,9 @@ cdef class Parser:
         if isinstance(docs, Doc):
             docs = [docs]
         if not any(len(doc) for doc in docs):
-            return self.moves.init_batch(docs)
+            result = self.moves.init_batch(docs)
+            self._resize()
+            return result
         if beam_width < 2:
             return self.greedy_parse(docs, drop=drop)
         else:
@@ -251,7 +259,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
         model = self.model(docs)
         weights = get_c_weights(model)
         for state in batch:
@@ -271,7 +279,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
+        self._resize()
         model = self.model(docs)
         token_ids = numpy.zeros((len(docs) * beam_width, self.nr_feature),
                                 dtype='i', order='C')
@@ -445,8 +453,7 @@ cdef class Parser:
         # This is pretty dirty, but the NER can resize itself in init_batch,
         # if labels are missing. We therefore have to check whether we need to
         # expand our model output.
-        self.model.resize_output(self.moves.n_moves)
-        self._rehearsal_model.resize_output(self.moves.n_moves)
+        self._resize()
         # Prepare the stepwise model, and get the callback for finishing the batch
         tutor, _ = self._rehearsal_model.begin_update(docs, drop=0.0)
         model, finish_update = self.model.begin_update(docs, drop=0.0)
spacy/syntax/transition_system.pyx
@@ -63,6 +63,13 @@ cdef class TransitionSystem:
         cdef Doc doc
         beams = []
         cdef int offset = 0
+
+        # Doc objects might contain labels that we need to register actions for. We need to check for that
+        # *before* we create any Beam objects, because the Beam object needs the correct number of
+        # actions. It's sort of dumb, but the best way is to just call init_batch() -- that triggers the additions,
+        # and it doesn't matter that we create and discard the state objects.
+        self.init_batch(docs)
+
         for doc in docs:
             beam = Beam(self.n_moves, beam_width, min_density=beam_density)
             beam.initialize(self.init_beam_state, doc.length, doc.c)
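The comment added above is the crux: a Beam is sized by self.n_moves at construction, so any actions that init_batch() would register must exist beforehand. A contrived, spaCy-free sketch of that ordering bug:

    # Contrived sketch: an object that freezes a size at construction time.
    class BeamLike(object):
        def __init__(self, n_actions):
            self.scores = [0.0] * n_actions  # sized once, here

    moves = ["B-ORG", "I-ORG", "O"]
    beam = BeamLike(len(moves))  # constructed too early ...
    moves.append("B-MY_ORG")     # ... so this addition is invisible to it
    assert len(beam.scores) != len(moves)  # the mismatch the fix prevents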
spacy/tests/regression/test_issue4042.py (new file)
@@ -0,0 +1,83 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import spacy
+
+from spacy.pipeline import EntityRecognizer, EntityRuler
+
+from spacy.lang.en import English
+from spacy.tests.util import make_tempdir
+from spacy.tokens import Span
+from spacy.util import ensure_path
+
+
+def test_issue4042():
+    """Test that serialization of an EntityRuler before NER works fine."""
+    nlp = English()
+
+    # add ner pipe
+    ner = nlp.create_pipe("ner")
+    ner.add_label("SOME_LABEL")
+    nlp.add_pipe(ner)
+    nlp.begin_training()
+
+    # Add entity ruler
+    ruler = EntityRuler(nlp)
+    patterns = [
+        {"label": "MY_ORG", "pattern": "Apple"},
+        {"label": "MY_GPE", "pattern": [{"lower": "san"}, {"lower": "francisco"}]},
+    ]
+    ruler.add_patterns(patterns)
+    nlp.add_pipe(ruler, before="ner")  # works fine with "after"
+    doc1 = nlp("What do you think about Apple ?")
+    assert doc1.ents[0].label_ == "MY_ORG"
+
+    with make_tempdir() as d:
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+
+        nlp2 = spacy.load(output_dir)
+        doc2 = nlp2("What do you think about Apple ?")
+        assert doc2.ents[0].label_ == "MY_ORG"
+
+
+def test_issue4042_bug2():
+    """
+    Test that serialization of an NER works fine when new labels were added.
+    This is the second bug of two bugs underlying the issue 4042.
+    """
+    nlp1 = English()
+    vocab = nlp1.vocab
+
+    # add ner pipe
+    ner1 = nlp1.create_pipe("ner")
+    ner1.add_label("SOME_LABEL")
+    nlp1.add_pipe(ner1)
+    nlp1.begin_training()
+
+    # add a new label to the doc
+    doc1 = nlp1("What do you think about Apple ?")
+    assert len(ner1.labels) == 1
+    assert "SOME_LABEL" in ner1.labels
+    apple_ent = Span(doc1, 5, 6, label="MY_ORG")
+    doc1.ents = list(doc1.ents) + [apple_ent]
+
+    # reapply the NER - at this point it should resize itself
+    ner1(doc1)
+    assert len(ner1.labels) == 2
+    assert "SOME_LABEL" in ner1.labels
+    assert "MY_ORG" in ner1.labels
+
+    with make_tempdir() as d:
+        # assert IO goes fine
+        output_dir = ensure_path(d)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        ner1.to_disk(output_dir)
+
+        nlp2 = English(vocab)
+        ner2 = EntityRecognizer(vocab)
+        ner2.from_disk(output_dir)
+        assert len(ner2.labels) == 2
spacy/tests/regression/test_issue4313.py (new file)
@@ -0,0 +1,39 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from collections import defaultdict
+
+from spacy.pipeline import EntityRecognizer
+
+from spacy.lang.en import English
+from spacy.tokens import Span
+
+
+def test_issue4313():
+    """ This should not crash or exit with some strange error code """
+    beam_width = 16
+    beam_density = 0.0001
+    nlp = English()
+    ner = EntityRecognizer(nlp.vocab)
+    ner.add_label("SOME_LABEL")
+    ner.begin_training([])
+    nlp.add_pipe(ner)
+
+    # add a new label to the doc
+    doc = nlp("What do you think about Apple ?")
+    assert len(ner.labels) == 1
+    assert "SOME_LABEL" in ner.labels
+    apple_ent = Span(doc, 5, 6, label="MY_ORG")
+    doc.ents = list(doc.ents) + [apple_ent]
+
+    # ensure the beam_parse still works with the new label
+    docs = [doc]
+    beams = nlp.entity.beam_parse(
+        docs, beam_width=beam_width, beam_density=beam_density
+    )
+
+    for doc, beam in zip(docs, beams):
+        entity_scores = defaultdict(float)
+        for score, ents in nlp.entity.moves.get_beam_parses(beam):
+            for start, end, label in ents:
+                entity_scores[(start, end, label)] += score
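A usage note, not part of the commit: entity_scores accumulates beam probability mass per (start, end, label) candidate, so a consumer can rank or threshold spans. A self-contained sketch with hypothetical scores:

    from collections import defaultdict

    entity_scores = defaultdict(float)
    entity_scores[(5, 6, "MY_ORG")] += 0.75      # hypothetical beam mass
    entity_scores[(5, 6, "SOME_LABEL")] += 0.25  # competing analysis
    ranked = sorted(entity_scores.items(), key=lambda kv: -kv[1])
    assert ranked[0][0] == (5, 6, "MY_ORG")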