From 3ac4e8eb7a6c688ddc7abd205e2ed7060cbf0798 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Wed, 13 Nov 2019 15:25:03 +0100 Subject: [PATCH] Fix minor issues in debug-data (#4636) * Add error in debug-data if no dev docs are available (see #4575) * Update debug-data for GoldCorpus / Example * Ignore None label in misaligned NER data --- spacy/cli/debug_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py index 76276ee56..ed19703ac 100644 --- a/spacy/cli/debug_data.py +++ b/spacy/cli/debug_data.py @@ -121,6 +121,8 @@ def debug_data( msg.text("{} training docs".format(len(train_dataset))) msg.text("{} evaluation docs".format(len(gold_dev_data))) + if not len(gold_dev_data): + msg.fail("No evaluation docs") overlap = len(train_texts.intersection(dev_texts)) if overlap: msg.warn("{} training examples also in evaluation data".format(overlap)) @@ -181,7 +183,7 @@ def debug_data( if "ner" in pipeline: # Get all unique NER labels present in the data labels = set( - label for label in gold_train_data["ner"] if label not in ("O", "-") + label for label in gold_train_data["ner"] if label not in ("O", "-", None) ) label_counts = gold_train_data["ner"] model_labels = _get_labels_from_model(nlp, "ner") @@ -601,7 +603,7 @@ def _format_labels(labels, counts=False): def _get_examples_without_label(data, label): count = 0 for ex in data: - labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")] + labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)] if label not in labels: count += 1 return count