mirror of https://github.com/explosion/spaCy.git
Fix minor issues in debug-data (#4636)
* Add error in debug-data if no dev docs are available (see #4575)
* Update debug-data for GoldCorpus / Example
* Ignore None label in misaligned NER data
This commit is contained in:
parent
e48a09df4e
commit
3ac4e8eb7a
|
@ -121,6 +121,8 @@ def debug_data(
|
|||
msg.text("{} training docs".format(len(train_dataset)))
|
||||
msg.text("{} evaluation docs".format(len(gold_dev_data)))
|
||||
|
||||
if not len(gold_dev_data):
|
||||
msg.fail("No evaluation docs")
|
||||
overlap = len(train_texts.intersection(dev_texts))
|
||||
if overlap:
|
||||
msg.warn("{} training examples also in evaluation data".format(overlap))
|
||||
|
@ -181,7 +183,7 @@ def debug_data(
|
|||
if "ner" in pipeline:
|
||||
# Get all unique NER labels present in the data
|
||||
labels = set(
|
||||
label for label in gold_train_data["ner"] if label not in ("O", "-")
|
||||
label for label in gold_train_data["ner"] if label not in ("O", "-", None)
|
||||
)
|
||||
label_counts = gold_train_data["ner"]
|
||||
model_labels = _get_labels_from_model(nlp, "ner")
|
||||
|
@ -601,7 +603,7 @@ def _format_labels(labels, counts=False):
|
|||
def _get_examples_without_label(data, label):
|
||||
count = 0
|
||||
for ex in data:
|
||||
labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")]
|
||||
labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)]
|
||||
if label not in labels:
|
||||
count += 1
|
||||
return count
|
||||
|
|
Loading…
Reference in New Issue