Merge pull request #6571 from adrianeboyd/bugfix/debug-data-missing-vectors

Fix alignment and vector checks in debug data
2020-12-17 10:10:47 +11:00 · 2020-12-17 10:10:47 +11:00 · 3f90bffa27
parent 546af3966a 20e18cc246
commit 3f90bffa27
1 changed file with 8 additions and 3 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -504,13 +504,18 @@ def _compile_gold(
    for eg in examples:
        gold = eg.reference
        doc = eg.predicted
-        valid_words = [x for x in gold if x is not None]
+        valid_words = [x.text for x in gold]
        data["words"].update(valid_words)
        data["n_words"] += len(valid_words)
-        data["n_misaligned_words"] += len(gold) - len(valid_words)
+        align = eg.alignment
+        for token in doc:
+            if token.orth_.isspace():
+                continue
+            if align.x2y.lengths[token.i] != 1:
+                data["n_misaligned_words"] += 1
        data["texts"].add(doc.text)
        if len(nlp.vocab.vectors):
-            for word in valid_words:
+            for word in [t.text for t in doc]:
                if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                    data["words_missing_vectors"].update([word])
        if "ner" in factory_names: