Fix minor issues in debug-data (#4636)

* Add error in debug-data if no dev docs are available (see #4575)

* Update debug-data for GoldCorpus / Example

* Ignore None label in misaligned NER data
This commit is contained in:
adrianeboyd 2019-11-13 15:25:03 +01:00 committed by Ines Montani
parent e48a09df4e
commit 3ac4e8eb7a
1 changed file with 4 additions and 2 deletions

View File

@@ -121,6 +121,8 @@ def debug_data(
msg.text("{} training docs".format(len(train_dataset))) msg.text("{} training docs".format(len(train_dataset)))
msg.text("{} evaluation docs".format(len(gold_dev_data))) msg.text("{} evaluation docs".format(len(gold_dev_data)))
if not len(gold_dev_data):
msg.fail("No evaluation docs")
overlap = len(train_texts.intersection(dev_texts)) overlap = len(train_texts.intersection(dev_texts))
if overlap: if overlap:
msg.warn("{} training examples also in evaluation data".format(overlap)) msg.warn("{} training examples also in evaluation data".format(overlap))
@@ -181,7 +183,7 @@ def debug_data(
if "ner" in pipeline: if "ner" in pipeline:
# Get all unique NER labels present in the data # Get all unique NER labels present in the data
labels = set( labels = set(
label for label in gold_train_data["ner"] if label not in ("O", "-") label for label in gold_train_data["ner"] if label not in ("O", "-", None)
) )
label_counts = gold_train_data["ner"] label_counts = gold_train_data["ner"]
model_labels = _get_labels_from_model(nlp, "ner") model_labels = _get_labels_from_model(nlp, "ner")
@@ -601,7 +603,7 @@ def _format_labels(labels, counts=False):
def _get_examples_without_label(data, label): def _get_examples_without_label(data, label):
count = 0 count = 0
for ex in data: for ex in data:
labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")] labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)]
if label not in labels: if label not in labels:
count += 1 count += 1
return count return count