Fix minor issues in debug-data (#4636)

* Add error in debug-data if no dev docs are available (see #4575)
* Update debug-data for GoldCorpus / Example
* Ignore None label in misaligned NER data
2019-11-13 15:25:03 +01:00 · 2019-11-13 15:25:03 +01:00 · 3ac4e8eb7a
parent e48a09df4e
commit 3ac4e8eb7a
1 changed file with 4 additions and 2 deletions
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -121,6 +121,8 @@ def debug_data(
    msg.text("{} training docs".format(len(train_dataset)))
    msg.text("{} evaluation docs".format(len(gold_dev_data)))

+    if not len(gold_dev_data):
+        msg.fail("No evaluation docs")
    overlap = len(train_texts.intersection(dev_texts))
    if overlap:
        msg.warn("{} training examples also in evaluation data".format(overlap))
@@ -181,7 +183,7 @@ def debug_data(
    if "ner" in pipeline:
        # Get all unique NER labels present in the data
        labels = set(
-            label for label in gold_train_data["ner"] if label not in ("O", "-")
+            label for label in gold_train_data["ner"] if label not in ("O", "-", None)
        )
        label_counts = gold_train_data["ner"]
        model_labels = _get_labels_from_model(nlp, "ner")
@@ -601,7 +603,7 @@ def _format_labels(labels, counts=False):
 def _get_examples_without_label(data, label):
    count = 0
    for ex in data:
-        labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")]
+        labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)]
        if label not in labels:
            count += 1
    return count