Fix debug data check for ents that cross sents (#10188)

* Fix debug data check for ents that cross sents

* Use aligned sent starts to have the same indices for the NER and sent
start annotation
* Add a temporary, insufficient hack for the case where a
sentence-initial reference token is split into multiple tokens in the
predicted doc, since `Example.get_aligned("SENT_START")` currently
aligns `True` to all the split tokens.

* Improve test example

* Use Example.get_aligned_sent_starts

* Add test for crossing entity
This commit is contained in:
Adriane Boyd 2022-02-07 08:53:30 +01:00 committed by GitHub
parent 91ccacea12
commit 63e1e4e8f6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 19 additions and 2 deletions

View File

@ -603,6 +603,7 @@ def _compile_gold(
if nlp.vocab.strings[word] not in nlp.vocab.vectors: if nlp.vocab.strings[word] not in nlp.vocab.vectors:
data["words_missing_vectors"].update([word]) data["words_missing_vectors"].update([word])
if "ner" in factory_names: if "ner" in factory_names:
sent_starts = eg.get_aligned_sent_starts()
for i, label in enumerate(eg.get_aligned_ner()): for i, label in enumerate(eg.get_aligned_ner()):
if label is None: if label is None:
continue continue
@ -612,7 +613,7 @@ def _compile_gold(
if label.startswith(("B-", "U-")): if label.startswith(("B-", "U-")):
combined_label = label.split("-")[1] combined_label = label.split("-")[1]
data["ner"][combined_label] += 1 data["ner"][combined_label] += 1
if gold[i].is_sent_start and label.startswith(("I-", "L-")): if sent_starts[i] == True and label.startswith(("I-", "L-")):
data["boundary_cross_ents"] += 1 data["boundary_cross_ents"] += 1
elif label == "-": elif label == "-":
data["ner"]["-"] += 1 data["ner"]["-"] += 1

View File

@ -12,7 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config
from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands from spacy.cli._util import validate_project_commands
from spacy.cli.debug_data import _get_labels_from_model from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.debug_data import _get_labels_from_spancat
from spacy.cli.download import get_compatibility, get_version from spacy.cli.download import get_compatibility, get_version
from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
@ -22,6 +22,7 @@ from spacy.lang.en import English
from spacy.lang.nl import Dutch from spacy.lang.nl import Dutch
from spacy.language import Language from spacy.language import Language
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.tokens import Doc
from spacy.training import Example, docs_to_json, offsets_to_biluo_tags from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
from spacy.training.converters import iob_to_docs from spacy.training.converters import iob_to_docs
@ -692,3 +693,18 @@ def test_get_labels_from_model(factory_name, pipe_name):
assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels) assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
else: else:
assert _get_labels_from_model(nlp, factory_name) == set(labels) assert _get_labels_from_model(nlp, factory_name) == set(labels)
def test_debug_data_compile_gold():
nlp = English()
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"])
eg = Example(pred, ref)
data = _compile_gold([eg], ["ner"], nlp, True)
assert data["boundary_cross_ents"] == 0
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
eg = Example(pred, ref)
data = _compile_gold([eg], ["ner"], nlp, True)
assert data["boundary_cross_ents"] == 1