Fix debug data check for ents that cross sents (#10188)

* Fix debug data check for ents that cross sents
* Use aligned sent starts to have the same indices for the NER and sent start annotation
* Add a temporary, insufficient hack for the case where a sentence-initial reference token is split into multiple tokens in the predicted doc, since `Example.get_aligned("SENT_START")` currently aligns `True` to all the split tokens.
* Improve test example
* Use Example.get_aligned_sent_starts
* Add test for crossing entity
2022-02-07 08:53:30 +01:00 · 2022-02-07 08:53:30 +01:00 · 63e1e4e8f6
parent 91ccacea12
commit 63e1e4e8f6
2 changed files with 19 additions and 2 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -603,6 +603,7 @@ def _compile_gold(
                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                    data["words_missing_vectors"].update([word])
        if "ner" in factory_names:
+            sent_starts = eg.get_aligned_sent_starts()
            for i, label in enumerate(eg.get_aligned_ner()):
                if label is None:
                    continue
@@ -612,7 +613,7 @@ def _compile_gold(
                if label.startswith(("B-", "U-")):
                    combined_label = label.split("-")[1]
                    data["ner"][combined_label] += 1
-                if gold[i].is_sent_start and label.startswith(("I-", "L-")):
+                if sent_starts[i] == True and label.startswith(("I-", "L-")):
                    data["boundary_cross_ents"] += 1
                elif label == "-":
                    data["ner"]["-"] += 1
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -12,7 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
-from spacy.cli.debug_data import _get_labels_from_model
+from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
 from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
@@ -22,6 +22,7 @@ from spacy.lang.en import English
 from spacy.lang.nl import Dutch
 from spacy.language import Language
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.tokens import Doc
 from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
 from spacy.training.converters import iob_to_docs
@@ -692,3 +693,18 @@ def test_get_labels_from_model(factory_name, pipe_name):
        assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
    else:
        assert _get_labels_from_model(nlp, factory_name) == set(labels)
+
+
+def test_debug_data_compile_gold():
+    nlp = English()
+    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
+    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"])
+    eg = Example(pred, ref)
+    data = _compile_gold([eg], ["ner"], nlp, True)
+    assert data["boundary_cross_ents"] == 0
+
+    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
+    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
+    eg = Example(pred, ref)
+    data = _compile_gold([eg], ["ner"], nlp, True)
+    assert data["boundary_cross_ents"] == 1