From baf19fd652ebbe08ec2d1a6cc494a6f1753388de Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 27 Jul 2020 11:17:52 +0200 Subject: [PATCH] Update cats scoring to provide overall score * Provide top-level score as `attr_score` * Provide a description of the score as `attr_score_desc` * Provide all potential scores keys, setting unused keys to `None` * Update CLI evaluate accordingly --- spacy/cli/evaluate.py | 19 +++++++++-------- spacy/scorer.py | 31 ++++++++++++++++++++++------ spacy/tests/pipeline/test_textcat.py | 2 ++ 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py index 5cdbee065..83281543a 100644 --- a/spacy/cli/evaluate.py +++ b/spacy/cli/evaluate.py @@ -82,8 +82,7 @@ def evaluate( "NER P": "ents_p", "NER R": "ents_r", "NER F": "ents_f", - "Textcat AUC": "textcat_macro_auc", - "Textcat F": "textcat_macro_f", + "Textcat": "cats_score", "Sent P": "sents_p", "Sent R": "sents_r", "Sent F": "sents_f", @@ -91,6 +90,8 @@ def evaluate( results = {} for metric, key in metrics.items(): if key in scores: + if key == "cats_score": + metric = metric + " (" + scores.get("cats_score_desc", "unk") + ")" results[metric] = f"{scores[key]*100:.2f}" data = {re.sub(r"[\s/]", "_", k.lower()): v for k, v in results.items()} @@ -99,12 +100,12 @@ def evaluate( if "ents_per_type" in scores: if scores["ents_per_type"]: print_ents_per_type(msg, scores["ents_per_type"]) - if "textcat_f_per_cat" in scores: - if scores["textcat_f_per_cat"]: - print_textcats_f_per_cat(msg, scores["textcat_f_per_cat"]) - if "textcat_auc_per_cat" in scores: - if scores["textcat_auc_per_cat"]: - print_textcats_auc_per_cat(msg, scores["textcat_auc_per_cat"]) + if "cats_f_per_type" in scores: + if scores["cats_f_per_type"]: + print_textcats_f_per_cat(msg, scores["cats_f_per_type"]) + if "cats_auc_per_type" in scores: + if scores["cats_auc_per_type"]: + print_textcats_auc_per_cat(msg, scores["cats_auc_per_type"]) if displacy_path: factory_names = [nlp.get_pipe_meta(pipe).factory for pipe in nlp.pipe_names] @@ -170,7 +171,7 @@ def print_textcats_f_per_cat(msg: Printer, scores: Dict[str, Dict[str, float]]) data, header=("", "P", "R", "F"), aligns=("l", "r", "r", "r"), - title="Textcat F (per type)", + title="Textcat F (per label)", ) diff --git a/spacy/scorer.py b/spacy/scorer.py index a95fe70cf..2bbf453e7 100644 --- a/spacy/scorer.py +++ b/spacy/scorer.py @@ -298,7 +298,8 @@ class Scorer: **cfg ): """Returns PRF and ROC AUC scores for a doc-level attribute with a - dict with scores for each label like Doc.cats. + dict with scores for each label like Doc.cats. The reported overall + score depends on the scorer settings. examples (Iterable[Example]): Examples to score attr (str): The attribute to score. @@ -309,11 +310,16 @@ class Scorer: Defaults to True. positive_label (str): The positive label for a binary task with exclusive classes. Defaults to None. 
- RETURNS (dict): A dictionary containing the scores: - for binary exclusive with positive label: attr_p/r/f, - for 3+ exclusive classes, macro-averaged fscore: attr_macro_f, - for multilabel, macro-averaged AUC: attr_macro_auc, - for all: attr_f_per_type, attr_auc_per_type + RETURNS (dict): A dictionary containing the scores, with inapplicable + scores as None: + for all: + attr_score (one of attr_f / attr_macro_f / attr_macro_auc), + attr_score_desc (text description of the overall score), + attr_f_per_type, + attr_auc_per_type + for binary exclusive with positive label: attr_p/r/f + for 3+ exclusive classes, macro-averaged fscore: attr_macro_f + for multilabel, macro-averaged AUC: attr_macro_auc """ score = PRFScore() f_per_type = dict() @@ -362,6 +368,13 @@ class Scorer: ) ) results = { + attr + "_score": None, + attr + "_score_desc": None, + attr + "_p": None, + attr + "_r": None, + attr + "_f": None, + attr + "_macro_f": None, + attr + "_macro_auc": None, attr + "_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()}, attr + "_auc_per_type": {k: v.score for k, v in auc_per_type.items()}, } @@ -369,16 +382,22 @@ class Scorer: results[attr + "_p"] = score.precision results[attr + "_r"] = score.recall results[attr + "_f"] = score.fscore + results[attr + "_score"] = results[attr + "_f"] + results[attr + "_score_desc"] = "F (" + positive_label + ")" elif not multi_label: results[attr + "_macro_f"] = sum( [score.fscore for label, score in f_per_type.items()] ) / (len(f_per_type) + 1e-100) + results[attr + "_score"] = results[attr + "_macro_f"] + results[attr + "_score_desc"] = "macro F" else: results[attr + "_macro_auc"] = max( sum([score.score for label, score in auc_per_type.items()]) / (len(auc_per_type) + 1e-100), -1, ) + results[attr + "_score"] = results[attr + "_macro_auc"] + results[attr + "_score_desc"] = "macro AUC" return results @staticmethod diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 5e8dab0bd..15832d4bd 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -121,6 +121,8 @@ def test_overfitting_IO(): train_examples, component_cfg={"scorer": {"positive_label": "POSITIVE"}} ) assert scores["cats_f"] == 1.0 + assert scores["cats_score"] == 1.0 + assert "cats_score_desc" in scores # fmt: off
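
For reference, a minimal usage sketch of the new keys (not part of the patch). It assumes the v3 development API this branch targets (spacy.blank, spacy.gold.Example, which later moved to spacy.training, and Scorer.score_cats callable as a static method); the texts, labels, and "predictions" below are invented for illustration:

    import spacy
    from spacy.gold import Example  # spacy.training.Example in later v3 builds
    from spacy.scorer import Scorer

    nlp = spacy.blank("en")
    data = [
        ("This is great", {"POSITIVE": 1.0, "NEGATIVE": 0.0}),
        ("This is awful", {"POSITIVE": 0.0, "NEGATIVE": 1.0}),
    ]
    examples = []
    for text, cats in data:
        pred = nlp.make_doc(text)
        pred.cats = cats  # pretend the model predicted the gold labels exactly
        examples.append(Example.from_dict(pred, {"cats": cats}))

    scores = Scorer.score_cats(
        examples,
        "cats",
        labels=["POSITIVE", "NEGATIVE"],
        multi_label=False,
        positive_label="POSITIVE",
    )
    # The overall score plus a description of what it measures; keys that do not
    # apply to this configuration (e.g. cats_macro_auc) are now present but None.
    print(scores["cats_score"], scores["cats_score_desc"])  # expected: 1.0 F (POSITIVE)
    print(scores["cats_macro_auc"])                         # expected: None

With these settings the CLI results table would label the overall score as "Textcat (F (POSITIVE))", following the cats_score_desc handling added in evaluate.py above.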