From 63e1e4e8f637085b6dfa42d2918cf30e149d7474 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 7 Feb 2022 08:53:30 +0100 Subject: [PATCH] Fix debug data check for ents that cross sents (#10188) * Fix debug data check for ents that cross sents * Use aligned sent starts to have the same indices for the NER and sent start annotation * Add a temporary, insufficient hack for the case where a sentence-initial reference token is split into multiple tokens in the predicted doc, since `Example.get_aligned("SENT_START")` currently aligns `True` to all the split tokens. * Improve test example * Use Example.get_aligned_sent_starts * Add test for crossing entity --- spacy/cli/debug_data.py | 3 ++- spacy/tests/test_cli.py | 18 +++++++++++++++++- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index ab7c20d48..4be749204 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -603,6 +603,7 @@ def _compile_gold( if nlp.vocab.strings[word] not in nlp.vocab.vectors: data["words_missing_vectors"].update([word]) if "ner" in factory_names: + sent_starts = eg.get_aligned_sent_starts() for i, label in enumerate(eg.get_aligned_ner()): if label is None: continue @@ -612,7 +613,7 @@ def _compile_gold( if label.startswith(("B-", "U-")): combined_label = label.split("-")[1] data["ner"][combined_label] += 1 - if gold[i].is_sent_start and label.startswith(("I-", "L-")): + if sent_starts[i] == True and label.startswith(("I-", "L-")): data["boundary_cross_ents"] += 1 elif label == "-": data["ner"]["-"] += 1 diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 253469909..9d5bdfab2 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -12,7 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config from spacy.cli._util import parse_config_overrides, string_to_list from spacy.cli._util import substitute_project_variables from spacy.cli._util import validate_project_commands -from spacy.cli.debug_data import _get_labels_from_model +from spacy.cli.debug_data import _compile_gold, _get_labels_from_model from spacy.cli.debug_data import _get_labels_from_spancat from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config @@ -22,6 +22,7 @@ from spacy.lang.en import English from spacy.lang.nl import Dutch from spacy.language import Language from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate +from spacy.tokens import Doc from spacy.training import Example, docs_to_json, offsets_to_biluo_tags from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs @@ -692,3 +693,18 @@ def test_get_labels_from_model(factory_name, pipe_name): assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels) else: assert _get_labels_from_model(nlp, factory_name) == set(labels) + + +def test_debug_data_compile_gold(): + nlp = English() + pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) + ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"]) + eg = Example(pred, ref) + data = _compile_gold([eg], ["ner"], nlp, True) + assert data["boundary_cross_ents"] == 0 + + pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"]) + ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"]) + eg = Example(pred, ref) + data = _compile_gold([eg], ["ner"], nlp, True) + assert data["boundary_cross_ents"] == 1