mirror of https://github.com/explosion/spaCy.git
Fix minor issues in debug-data (#4636)
* Add error in debug-data if no dev docs are available (see #4575)
* Update debug-data for GoldCorpus / Example
* Ignore None label in misaligned NER data
This commit is contained in:
parent
e48a09df4e
commit
3ac4e8eb7a
|
@@ -121,6 +121,8 @@ def debug_data(
|
||||||
msg.text("{} training docs".format(len(train_dataset)))
|
msg.text("{} training docs".format(len(train_dataset)))
|
||||||
msg.text("{} evaluation docs".format(len(gold_dev_data)))
|
msg.text("{} evaluation docs".format(len(gold_dev_data)))
|
||||||
|
|
||||||
|
if not len(gold_dev_data):
|
||||||
|
msg.fail("No evaluation docs")
|
||||||
overlap = len(train_texts.intersection(dev_texts))
|
overlap = len(train_texts.intersection(dev_texts))
|
||||||
if overlap:
|
if overlap:
|
||||||
msg.warn("{} training examples also in evaluation data".format(overlap))
|
msg.warn("{} training examples also in evaluation data".format(overlap))
|
||||||
|
@@ -181,7 +183,7 @@ def debug_data(
|
||||||
if "ner" in pipeline:
|
if "ner" in pipeline:
|
||||||
# Get all unique NER labels present in the data
|
# Get all unique NER labels present in the data
|
||||||
labels = set(
|
labels = set(
|
||||||
label for label in gold_train_data["ner"] if label not in ("O", "-")
|
label for label in gold_train_data["ner"] if label not in ("O", "-", None)
|
||||||
)
|
)
|
||||||
label_counts = gold_train_data["ner"]
|
label_counts = gold_train_data["ner"]
|
||||||
model_labels = _get_labels_from_model(nlp, "ner")
|
model_labels = _get_labels_from_model(nlp, "ner")
|
||||||
|
@@ -601,7 +603,7 @@ def _format_labels(labels, counts=False):
|
||||||
def _get_examples_without_label(data, label):
|
def _get_examples_without_label(data, label):
|
||||||
count = 0
|
count = 0
|
||||||
for ex in data:
|
for ex in data:
|
||||||
labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-")]
|
labels = [label.split("-")[1] for label in ex.gold.ner if label not in ("O", "-", None)]
|
||||||
if label not in labels:
|
if label not in labels:
|
||||||
count += 1
|
count += 1
|
||||||
return count
|
return count
|
||||||
|
|
Loading…
Reference in New Issue