From 63e1e4e8f637085b6dfa42d2918cf30e149d7474 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Mon, 7 Feb 2022 08:53:30 +0100
Subject: [PATCH] Fix debug data check for ents that cross sents (#10188)

* Fix debug data check for ents that cross sents

* Use aligned sent starts so that the NER and sentence-start annotations
  share the same token indices
* Add a temporary, insufficient hack for the case where a
  sentence-initial reference token is split into multiple tokens in the
  predicted doc, since `Example.get_aligned("SENT_START")` currently
  aligns `True` to all the split tokens.

* Improve test example

* Use Example.get_aligned_sent_starts

* Add test for crossing entity
---
 spacy/cli/debug_data.py |  3 ++-
 spacy/tests/test_cli.py | 18 +++++++++++++++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index ab7c20d48..4be749204 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -603,6 +603,7 @@ def _compile_gold(
                 if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                     data["words_missing_vectors"].update([word])
         if "ner" in factory_names:
+            sent_starts = eg.get_aligned_sent_starts()
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue
@@ -612,7 +613,7 @@ def _compile_gold(
                 if label.startswith(("B-", "U-")):
                     combined_label = label.split("-")[1]
                     data["ner"][combined_label] += 1
-                if gold[i].is_sent_start and label.startswith(("I-", "L-")):
+                if sent_starts[i] == True and label.startswith(("I-", "L-")):
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 253469909..9d5bdfab2 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -12,7 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
-from spacy.cli.debug_data import _get_labels_from_model
+from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
 from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
@@ -22,6 +22,7 @@ from spacy.lang.en import English
 from spacy.lang.nl import Dutch
 from spacy.language import Language
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.tokens import Doc
 from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
 from spacy.training.converters import iob_to_docs
@@ -692,3 +693,18 @@ def test_get_labels_from_model(factory_name, pipe_name):
         assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
     else:
         assert _get_labels_from_model(nlp, factory_name) == set(labels)
+
+
+def test_debug_data_compile_gold():
+    """Count an entity as boundary-crossing only if it continues into a new sentence."""
+    nlp = English()
+    ref_words, ref_starts = ["Token", ".", "New York City"], [True, False, True]
+    pred_words = ["Token", ".", "New", "York", "City"]
+    # Entity begins on the sentence-initial token: nothing crosses a boundary.
+    inside = Doc(nlp.vocab, words=ref_words, sent_starts=ref_starts, ents=["O", "O", "B-ENT"])
+    eg = Example(Doc(nlp.vocab, words=pred_words), inside)
+    assert _compile_gold([eg], ["ner"], nlp, True)["boundary_cross_ents"] == 0
+    # Entity opened on "." continues onto the token that starts the next sentence.
+    crossing = Doc(nlp.vocab, words=ref_words, sent_starts=ref_starts, ents=["O", "B-ENT", "I-ENT"])
+    eg = Example(Doc(nlp.vocab, words=pred_words), crossing)
+    assert _compile_gold([eg], ["ner"], nlp, True)["boundary_cross_ents"] == 1