Fix debug data check for ents that cross sents (#10188)

* Fix debug data check for ents that cross sents

* Use aligned sent starts so that the NER and sentence-start annotations
share the same token indices
* Add a temporary, insufficient hack for the case where a
sentence-initial reference token is split into multiple tokens in the
predicted doc, since `Example.get_aligned("SENT_START")` currently
aligns `True` to all the split tokens.

* Improve test example

* Use Example.get_aligned_sent_starts

* Add test for crossing entity
This commit is contained in:
Adriane Boyd 2022-02-07 08:53:30 +01:00 committed by GitHub
parent 91ccacea12
commit 63e1e4e8f6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 19 additions and 2 deletions

View File

@ -603,6 +603,7 @@ def _compile_gold(
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
data["words_missing_vectors"].update([word])
if "ner" in factory_names:
sent_starts = eg.get_aligned_sent_starts()
for i, label in enumerate(eg.get_aligned_ner()):
if label is None:
continue
@ -612,7 +613,7 @@ def _compile_gold(
if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1]
data["ner"][combined_label] += 1
if gold[i].is_sent_start and label.startswith(("I-", "L-")):
if sent_starts[i] == True and label.startswith(("I-", "L-")):
data["boundary_cross_ents"] += 1
elif label == "-":
data["ner"]["-"] += 1

View File

@ -12,7 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
from spacy.cli.debug_data import _get_labels_from_model
from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
@ -22,6 +22,7 @@ from spacy.lang.en import English
from spacy.lang.nl import Dutch
from spacy.language import Language
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.tokens import Doc
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
from spacy.training.converters import iob_to_docs
@ -692,3 +693,18 @@ def test_get_labels_from_model(factory_name, pipe_name):
assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
else:
assert _get_labels_from_model(nlp, factory_name) == set(labels)
def test_debug_data_compile_gold():
    """Check the "boundary_cross_ents" count reported by `_compile_gold`.

    In both cases the reference doc keeps "New York City" as a single,
    sentence-initial token, while the predicted doc splits it into three
    tokens. Only the second case labels that token "I-ENT", continuing an
    entity that begins on the preceding "." token, so only that case is
    expected to be counted as an entity crossing a sentence boundary.
    """
    nlp = English()
    # (NER tags for the reference doc, expected "boundary_cross_ents" value)
    scenarios = (
        (["O", "O", "B-ENT"], 0),
        (["O", "B-ENT", "I-ENT"], 1),
    )
    for reference_tags, expected_crossings in scenarios:
        predicted = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
        reference = Doc(
            nlp.vocab,
            words=["Token", ".", "New York City"],
            sent_starts=[True, False, True],
            ents=reference_tags,
        )
        example = Example(predicted, reference)
        compiled = _compile_gold([example], ["ner"], nlp, True)
        assert compiled["boundary_cross_ents"] == expected_crossings