From 20e18cc2461c09af2d4364c127e97f4121d5a1e4 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Tue, 15 Dec 2020 09:43:14 +0100
Subject: [PATCH] Fix alignment and vector checks in debug data

* Update token alignment check to use Example alignment
* Update missing vector check further related to changes in v3
---
 spacy/cli/debug_data.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index f161d0254..d23cd3717 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -504,13 +504,18 @@ def _compile_gold(
     for eg in examples:
         gold = eg.reference
         doc = eg.predicted
-        valid_words = [x for x in gold if x is not None]
+        valid_words = [x.text for x in gold]
         data["words"].update(valid_words)
         data["n_words"] += len(valid_words)
-        data["n_misaligned_words"] += len(gold) - len(valid_words)
+        align = eg.alignment
+        for token in doc:
+            if token.orth_.isspace():
+                continue
+            if align.x2y.lengths[token.i] != 1:
+                data["n_misaligned_words"] += 1
         data["texts"].add(doc.text)
         if len(nlp.vocab.vectors):
-            for word in valid_words:
+            for word in [t.text for t in doc]:
                 if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                     data["words_missing_vectors"].update([word])
         if "ner" in factory_names: