mirror of https://github.com/explosion/spaCy.git
Update cats scoring to provide overall score
* Provide the top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential score keys, setting unused keys to `None`
* Update the CLI `evaluate` command accordingly
commit baf19fd652
parent f8cf378be9
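Editor's sketch (not part of the commit): once a scores dict has been produced for the default "cats" attribute, the new keys could be consumed roughly as follows. How `scores` is obtained (e.g. by evaluating a pipeline that has a text classifier) is assumed here, and the example values are made up.

def report_textcat(scores: dict) -> str:
    # "cats_score" holds the overall score and "cats_score_desc" a short
    # description of which score it is; keys that do not apply to the
    # current configuration are set to None by this commit.
    overall = scores.get("cats_score")
    desc = scores.get("cats_score_desc", "unk")
    if overall is None:
        return "Textcat: n/a"
    return f"Textcat ({desc}): {overall * 100:.2f}"

# Hypothetical values, for illustration only:
print(report_textcat({"cats_score": 0.91, "cats_score_desc": "macro AUC"}))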
@@ -82,8 +82,7 @@ def evaluate(
         "NER P": "ents_p",
         "NER R": "ents_r",
         "NER F": "ents_f",
-        "Textcat AUC": "textcat_macro_auc",
-        "Textcat F": "textcat_macro_f",
+        "Textcat": "cats_score",
         "Sent P": "sents_p",
         "Sent R": "sents_r",
         "Sent F": "sents_f",
@@ -91,6 +90,8 @@ def evaluate(
     results = {}
     for metric, key in metrics.items():
         if key in scores:
+            if key == "cats_score":
+                metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
             results[metric] = f"{scores[key]*100:.2f}"
     data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()}

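For illustration, a minimal sketch of the label construction added above, with made-up score values; the resulting CLI row would read roughly "Textcat (macro F): 87.00".

scores = {"cats_score": 0.87, "cats_score_desc": "macro F"}  # hypothetical values
metric, key = "Textcat", "cats_score"
if key == "cats_score":
    # Append the score description so the table says which score is shown.
    metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")"
print(f"{metric}: {scores[key]*100:.2f}")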
@@ -99,12 +100,12 @@ def evaluate(
     if "ents_per_type" in scores:
         if scores["ents_per_type"]:
             print_ents_per_type(msg, scores["ents_per_type"])
-    if "textcat_f_per_cat" in scores:
-        if scores["textcat_f_per_cat"]:
-            print_textcats_f_per_cat(msg, scores["textcat_f_per_cat"])
-    if "textcat_auc_per_cat" in scores:
-        if scores["textcat_auc_per_cat"]:
-            print_textcats_auc_per_cat(msg, scores["textcat_auc_per_cat"])
+    if "cats_f_per_type" in scores:
+        if scores["cats_f_per_type"]:
+            print_textcats_f_per_cat(msg, scores["cats_f_per_type"])
+    if "cats_auc_per_type" in scores:
+        if scores["cats_auc_per_type"]:
+            print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"])

     if displacy_path:
         factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names]
@@ -170,7 +171,7 @@ def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]])
         data,
         header=("", "P", "R", "F"),
         aligns=("l", "r", "r", "r"),
-        title="Textcat F (per type)",
+        title="Textcat F (per label)",
     )


@@ -298,7 +298,8 @@ class Scorer:
         **cfg
     ):
         """Returns PRF and ROC AUC scores for a doc-level attribute with a
-        dict with scores for each label like Doc.cats.
+        dict with scores for each label like Doc.cats. The reported overall
+        score depends on the scorer settings.

         examples (Iterable[Example]): Examples to score
         attr (str): The attribute to score.
@@ -309,11 +310,16 @@ class Scorer:
             Defaults to True.
         positive_label (str): The positive label for a binary task with
             exclusive classes. Defaults to None.
-        RETURNS (dict): A dictionary containing the scores:
-            for binary exclusive with positive label: attr_p/r/f,
-            for 3+ exclusive classes, macro-averaged fscore: attr_macro_f,
-            for multilabel, macro-averaged AUC: attr_macro_auc,
-            for all: attr_f_per_type, attr_auc_per_type
+        RETURNS (dict): A dictionary containing the scores, with inapplicable
+            scores as None:
+            for all:
+                attr_score (one of attr_f / attr_macro_f / attr_macro_auc),
+                attr_score_desc (text description of the overall score),
+                attr_f_per_type,
+                attr_auc_per_type
+            for binary exclusive with positive label: attr_p/r/f
+            for 3+ exclusive classes, macro-averaged fscore: attr_macro_f
+            for multilabel, macro-averaged AUC: attr_macro_auc
         """
         score = PRFScore()
         f_per_type = dict()
@@ -362,6 +368,13 @@ class Scorer:
                     )
                 )
         results = {
+            attr + "_score": None,
+            attr + "_score_desc": None,
+            attr + "_p": None,
+            attr + "_r": None,
+            attr + "_f": None,
+            attr + "_macro_f": None,
+            attr + "_macro_auc": None,
             attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
             attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
         }
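As a sketch of the resulting shape, a returned dict for attr="cats" in the multilabel case might look roughly like this; the per-type entries and values are illustrative only, and only the AUC-related keys get filled in by the branches below, the rest stay None.

results_example = {
    "cats_score": 0.93,
    "cats_score_desc": "macro AUC",
    "cats_p": None,
    "cats_r": None,
    "cats_f": None,
    "cats_macro_f": None,
    "cats_macro_auc": 0.93,
    # Per-type entries shown with hypothetical values.
    "cats_f_per_type": {"POSITIVE": {"p": 0.9, "r": 0.8, "f": 0.85}},
    "cats_auc_per_type": {"POSITIVE": 0.93},
}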
@@ -369,16 +382,22 @@ class Scorer:
             results[attr + "_p"] = score.precision
             results[attr + "_r"] = score.recall
             results[attr + "_f"] = score.fscore
+            results[attr + "_score"] = results[attr + "_f"]
+            results[attr + "_score_desc"] = "F (" + positive_label + ")"
         elif not multi_label:
             results[attr + "_macro_f"] = sum(
                 [score.fscore for label, score in f_per_type.items()]
             ) / (len(f_per_type) + 1e-100)
+            results[attr + "_score"] = results[attr + "_macro_f"]
+            results[attr + "_score_desc"] = "macro F"
         else:
             results[attr + "_macro_auc"] = max(
                 sum([score.score for label, score in auc_per_type.items()])
                 / (len(auc_per_type) + 1e-100),
                 -1,
             )
+            results[attr + "_score"] = results[attr + "_macro_auc"]
+            results[attr + "_score_desc"] = "macro AUC"
         return results

     @staticmethod
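As a toy illustration of the multilabel branch above (per-type values are made up, and plain floats stand in for the per-type score objects), the overall score is just the macro average of the per-type AUC values, floored at -1 as in the code:

auc_per_type = {"POSITIVE": 0.95, "NEGATIVE": 0.85}  # hypothetical per-type AUC values
macro_auc = max(sum(auc_per_type.values()) / (len(auc_per_type) + 1e-100), -1)
cats_score = macro_auc          # reported as the overall score
cats_score_desc = "macro AUC"   # reported as the score description
print(cats_score, cats_score_desc)  # -> 0.9 macro AUC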
@@ -121,6 +121,8 @@ def test_overfitting_IO():
         train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}}
     )
     assert scores["cats_f"] == 1.0
+    assert scores["cats_score"] == 1.0
+    assert "cats_score_desc" in scores

     # fmt: off