mirror of https://github.com/explosion/spaCy.git
Fix debug data check for ents that cross sents (#10188)
* Fix debug data check for ents that cross sents
* Use aligned sent starts to have the same indices for the NER and sent start annotation
* Add a temporary, insufficient hack for the case where a sentence-initial reference token is split into multiple tokens in the predicted doc, since `Example.get_aligned("SENT_START")` currently aligns `True` to all the split tokens.
* Improve test example
* Use Example.get_aligned_sent_starts
* Add test for crossing entity
This commit is contained in:
parent 91ccacea12
commit 63e1e4e8f6
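In short, the fix reads sentence starts through `Example.get_aligned_sent_starts()`, so they are indexed the same way as the aligned NER labels, rather than through the reference doc. Below is a minimal sketch of the new check, reusing the crossing-entity example from the PR's test; the loop mirrors the changed lines in the diff, while the print statement is only illustrative and not part of the change.

```python
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.training import Example

nlp = English()
# The predicted doc splits "New York City" into three tokens; the reference
# keeps it as one token that starts a new sentence and sits inside an entity.
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(
    nlp.vocab,
    words=["Token", ".", "New York City"],
    sent_starts=[True, False, True],
    ents=["O", "B-ENT", "I-ENT"],
)
eg = Example(pred, ref)

sent_starts = eg.get_aligned_sent_starts()        # one flag per predicted token
for i, label in enumerate(eg.get_aligned_ner()):  # one BILUO label per predicted token
    if label is None:
        continue
    # An I-/L- label on a sentence-initial token means the entity crosses a
    # sentence boundary; debug data counts these as "boundary_cross_ents".
    if sent_starts[i] == True and label.startswith(("I-", "L-")):
        print(f"entity crosses a sentence boundary at token {i}: {pred[i].text!r}")
```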
@@ -603,6 +603,7 @@ def _compile_gold(
                 if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                     data["words_missing_vectors"].update([word])
         if "ner" in factory_names:
+            sent_starts = eg.get_aligned_sent_starts()
             for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue
@@ -612,7 +613,7 @@ def _compile_gold(
                 if label.startswith(("B-", "U-")):
                     combined_label = label.split("-")[1]
                     data["ner"][combined_label] += 1
-                if gold[i].is_sent_start and label.startswith(("I-", "L-")):
+                if sent_starts[i] == True and label.startswith(("I-", "L-")):
                     data["boundary_cross_ents"] += 1
                 elif label == "-":
                     data["ner"]["-"] += 1
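For context on the removed line above: `gold` is the reference doc, while the loop index `i` runs over labels aligned to the predicted tokenization, so `gold[i].is_sent_start` looks up the wrong token (or an out-of-range one) whenever the two tokenizations differ. A small illustration of that mismatch, reusing the same example data as above; this snippet is not part of the commit.

```python
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.training import Example

nlp = English()
pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
ref = Doc(
    nlp.vocab,
    words=["Token", ".", "New York City"],
    sent_starts=[True, False, True],
    ents=["O", "B-ENT", "I-ENT"],
)
eg = Example(pred, ref)

print(len(eg.get_aligned_ner()))  # 5: one label per *predicted* token
print(len(eg.reference))          # 3: the reference doc has fewer tokens
# Indexing the reference doc with a position taken from the aligned-label loop
# can therefore hit the wrong token, which is what the fix avoids.
```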
@@ -12,7 +12,7 @@ from spacy.cli._util import is_subpath_of, load_project_config
 from spacy.cli._util import parse_config_overrides, string_to_list
 from spacy.cli._util import substitute_project_variables
 from spacy.cli._util import validate_project_commands
-from spacy.cli.debug_data import _get_labels_from_model
+from spacy.cli.debug_data import _compile_gold, _get_labels_from_model
 from spacy.cli.debug_data import _get_labels_from_spancat
 from spacy.cli.download import get_compatibility, get_version
 from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config
@@ -22,6 +22,7 @@ from spacy.lang.en import English
 from spacy.lang.nl import Dutch
 from spacy.language import Language
 from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
+from spacy.tokens import Doc
 from spacy.training import Example, docs_to_json, offsets_to_biluo_tags
 from spacy.training.converters import conll_ner_to_docs, conllu_to_docs
 from spacy.training.converters import iob_to_docs
@@ -692,3 +693,18 @@ def test_get_labels_from_model(factory_name, pipe_name):
         assert _get_labels_from_spancat(nlp)[pipe.key] == set(labels)
     else:
         assert _get_labels_from_model(nlp, factory_name) == set(labels)
+
+
+def test_debug_data_compile_gold():
+    nlp = English()
+    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
+    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "O", "B-ENT"])
+    eg = Example(pred, ref)
+    data = _compile_gold([eg], ["ner"], nlp, True)
+    assert data["boundary_cross_ents"] == 0
+
+    pred = Doc(nlp.vocab, words=["Token", ".", "New", "York", "City"])
+    ref = Doc(nlp.vocab, words=["Token", ".", "New York City"], sent_starts=[True, False, True], ents=["O", "B-ENT", "I-ENT"])
+    eg = Example(pred, ref)
+    data = _compile_gold([eg], ["ner"], nlp, True)
+    assert data["boundary_cross_ents"] == 1