spaCy/spacy/scorer.py

from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING
import numpy as np
from collections import defaultdict

from .training import Example
from .tokens import Token, Doc, Span, MorphAnalysis
from .errors import Errors
from .util import get_lang_class, SimpleFrozenList
from .morphology import Morphology

if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
    from .language import Language  # noqa: F401


DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
MISSING_VALUES = frozenset([None, 0, ""])


class PRFScore:
    """A precision / recall / F score."""

    def __init__(self) -> None:
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def __len__(self) -> int:
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):
        self.tp += other.tp
        self.fp += other.fp
        self.fn += other.fn
        return self

    def __add__(self, other):
        return PRFScore(
            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
        )

    def score_set(self, cand: set, gold: set) -> None:
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def fscore(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    def to_dict(self) -> Dict[str, float]:
        return {"p": self.precision, "r": self.recall, "f": self.fscore}


class ROCAUCScore:
    """An AUC ROC score. This is only defined for binary classification.
    Use the method is_binary before calculating the score, otherwise it
    may throw an error."""

    def __init__(self) -> None:
        self.golds = []
        self.cands = []
        self.saved_score = 0.0
        self.saved_score_at_len = 0

    def score_set(self, cand, gold) -> None:
        self.cands.append(cand)
        self.golds.append(gold)

    def is_binary(self):
        return len(np.unique(self.golds)) == 2

    @property
    def score(self):
        if not self.is_binary():
            raise ValueError(Errors.E165.format(label=set(self.golds)))
        if len(self.golds) == self.saved_score_at_len:
            return self.saved_score
        self.saved_score = _roc_auc_score(self.golds, self.cands)
        self.saved_score_at_len = len(self.golds)
        return self.saved_score


class Scorer:
    """Compute evaluation scores."""

    def __init__(
        self,
        nlp: Optional["Language"] = None,
        default_lang: str = "xx",
        default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
        **cfg,
    ) -> None:
        """Initialize the Scorer.

        DOCS: https://nightly.spacy.io/api/scorer#init
        """
        self.nlp = nlp
        self.cfg = cfg
        if not nlp:
            nlp = get_lang_class(default_lang)()
            for pipe in default_pipeline:
                nlp.add_pipe(pipe)
            self.nlp = nlp

    def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
        """Evaluate a list of Examples.

        examples (Iterable[Example]): The predicted annotations + correct annotations.
        RETURNS (Dict): A dictionary of scores.

        DOCS: https://nightly.spacy.io/api/scorer#score
        """
        scores = {}
        if hasattr(self.nlp.tokenizer, "score"):
            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
        for name, component in self.nlp.pipeline:
            if hasattr(component, "score"):
                scores.update(component.score(examples, **self.cfg))
        return scores

    @staticmethod
    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
        """Returns accuracy and PRF scores for tokenization.
        * token_acc: # correct tokens / # gold tokens
        * token_p/r/f: PRF for token character spans

        examples (Iterable[Example]): Examples to score
        RETURNS (Dict[str, Any]): A dictionary containing the scores
            token_acc/p/r/f.

        DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
        """
        acc_score = PRFScore()
        prf_score = PRFScore()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            if gold_doc.has_unknown_spaces:
                continue
            align = example.alignment
            gold_spans = set()
            pred_spans = set()
            for token in gold_doc:
                if token.orth_.isspace():
                    continue
                gold_spans.add((token.idx, token.idx + len(token)))
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                pred_spans.add((token.idx, token.idx + len(token)))
                if align.x2y.lengths[token.i] != 1:
                    acc_score.fp += 1
                else:
                    acc_score.tp += 1
            prf_score.score_set(pred_spans, gold_spans)
        if len(acc_score) > 0:
            return {
                "token_acc": acc_score.fscore,
                "token_p": prf_score.precision,
                "token_r": prf_score.recall,
                "token_f": prf_score.fscore,
            }
        else:
            return {
                "token_acc": None,
                "token_p": None,
                "token_r": None,
                "token_f": None
            }

    @staticmethod
    def score_token_attr(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Token, str], Any] = getattr,
        missing_values: Set[Any] = MISSING_VALUES,
        **cfg,
    ) -> Dict[str, Any]:
        """Returns an accuracy score for a token-level attribute.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
            under the key attr_acc.

        DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
        """
        tag_score = PRFScore()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment
            gold_tags = set()
            missing_indices = set()
            for gold_i, token in enumerate(gold_doc):
                value = getter(token, attr)
                if value not in missing_values:
                    gold_tags.add((gold_i, getter(token, attr)))
                else:
                    missing_indices.add(gold_i)
            pred_tags = set()
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                if align.x2y.lengths[token.i] == 1:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                    if gold_i not in missing_indices:
                        pred_tags.add((gold_i, getter(token, attr)))
            tag_score.score_set(pred_tags, gold_tags)
        score_key = f"{attr}_acc"
        if len(tag_score) == 0:
            return {score_key: None}
        else:
            return {score_key: tag_score.fscore}

    @staticmethod
    def score_token_attr_per_feat(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Token, str], Any] = getattr,
        missing_values: Set[Any] = MISSING_VALUES,
        **cfg,
    ) -> Dict[str, Any]:
        """Return PRF scores per feat for a token attribute in UFEATS format.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        RETURNS (dict): A dictionary containing the per-feat PRF scores under
            the key attr_per_feat.
        """
        per_feat = {}
        for example in examples:
            pred_doc = example.predicted
            gold_doc = example.reference
            align = example.alignment
            gold_per_feat = {}
            missing_indices = set()
            for gold_i, token in enumerate(gold_doc):
                value = getter(token, attr)
                morph = gold_doc.vocab.strings[value]
                if value not in missing_values and morph != Morphology.EMPTY_MORPH:
                    for feat in morph.split(Morphology.FEATURE_SEP):
                        field, values = feat.split(Morphology.FIELD_SEP)
                        if field not in per_feat:
                            per_feat[field] = PRFScore()
                        if field not in gold_per_feat:
                            gold_per_feat[field] = set()
                        gold_per_feat[field].add((gold_i, feat))
                else:
                    missing_indices.add(gold_i)
            pred_per_feat = {}
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                if align.x2y.lengths[token.i] == 1:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                    if gold_i not in missing_indices:
                        value = getter(token, attr)
                        morph = gold_doc.vocab.strings[value]
                        if value not in missing_values and morph != Morphology.EMPTY_MORPH:
                            for feat in morph.split(Morphology.FEATURE_SEP):
                                field, values = feat.split(Morphology.FIELD_SEP)
                                if field not in per_feat:
                                    per_feat[field] = PRFScore()
                                if field not in pred_per_feat:
                                    pred_per_feat[field] = set()
                                pred_per_feat[field].add((gold_i, feat))
            for field in per_feat:
                per_feat[field].score_set(
                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
                )
        score_key = f"{attr}_per_feat"
        if any([len(v) for v in per_feat.values()]):
            result = {k: v.to_dict() for k, v in per_feat.items()}
            return {score_key: result}
        else:
            return {score_key: None}

    @staticmethod
    def score_spans(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Doc, str], Iterable[Span]] = getattr,
        has_annotation: Optional[Callable[[Doc], bool]] = None,
        **cfg,
    ) -> Dict[str, Any]:
        """Returns PRF scores for labeled spans.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If
            provided, getter(doc, attr) should return the spans for the
            individual doc.
        RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
            the keys attr_p/r/f and the per-type PRF scores under attr_per_type.

        DOCS: https://nightly.spacy.io/api/scorer#score_spans
        """
        score = PRFScore()
        score_per_type = dict()
        for example in examples:
            pred_doc = example.predicted
            gold_doc = example.reference
            # Option to handle docs without sents
            if has_annotation is not None:
                if not has_annotation(gold_doc):
                    continue
            # Find all labels in gold and doc
            labels = set(
                [k.label_ for k in getter(gold_doc, attr)]
                + [k.label_ for k in getter(pred_doc, attr)]
            )
            # Set up all labels for per type scoring and prepare gold per type
            gold_per_type = {label: set() for label in labels}
            for label in labels:
                if label not in score_per_type:
                    score_per_type[label] = PRFScore()
            # Find all predidate labels, for all and per type
            gold_spans = set()
            pred_spans = set()
            for span in getter(gold_doc, attr):
                gold_span = (span.label_, span.start, span.end - 1)
                gold_spans.add(gold_span)
                gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
            pred_per_type = {label: set() for label in labels}
            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
                pred_spans.add((span.label_, span.start, span.end - 1))
                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
            # Scores per label
            for k, v in score_per_type.items():
                if k in pred_per_type:
                    v.score_set(pred_per_type[k], gold_per_type[k])
            # Score for all labels
            score.score_set(pred_spans, gold_spans)
        if len(score) > 0:
            return {
                f"{attr}_p": score.precision,
                f"{attr}_r": score.recall,
                f"{attr}_f": score.fscore,
                f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
            }
        else:
            return {
                f"{attr}_p": None,
                f"{attr}_r": None,
                f"{attr}_f": None,
                f"{attr}_per_type": None,
            }


    @staticmethod
    def score_cats(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Doc, str], Any] = getattr,
        labels: Iterable[str] = SimpleFrozenList(),
        multi_label: bool = True,
        positive_label: Optional[str] = None,
        threshold: Optional[float] = None,
        **cfg,
    ) -> Dict[str, Any]:
        """Returns PRF and ROC AUC scores for a doc-level attribute with a
        dict with scores for each label like Doc.cats. The reported overall
        score depends on the scorer settings.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
            getter(doc, attr) should return the values for the individual doc.
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
            Defaults to True.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
        threshold (float): Cutoff to consider a prediction "positive". Defaults
            to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
            otherwise.
        RETURNS (Dict[str, Any]): A dictionary containing the scores, with
            inapplicable scores as None:
            for all:
                attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
                attr_score_desc (text description of the overall score),
                attr_micro_p,
                attr_micro_r,
                attr_micro_f,
                attr_macro_p,
                attr_macro_r,
                attr_macro_f,
                attr_macro_auc,
                attr_f_per_type,
                attr_auc_per_type

        DOCS: https://nightly.spacy.io/api/scorer#score_cats
        """
        if threshold is None:
            threshold = 0.5 if multi_label else 0.0
        f_per_type = {label: PRFScore() for label in labels}
        auc_per_type = {label: ROCAUCScore() for label in labels}
        labels = set(labels)
        if labels:
            for eg in examples:
                labels.update(eg.predicted.cats.keys())
                labels.update(eg.reference.cats.keys())
        for example in examples:
            # Through this loop, None in the gold_cats indicates missing label.
            pred_cats = getter(example.predicted, attr)
            gold_cats = getter(example.reference, attr)

            for label in labels:
                pred_score = pred_cats.get(label, 0.0)
                gold_score = gold_cats.get(label, 0.0)
                if gold_score is not None:
                    auc_per_type[label].score_set(pred_score, gold_score)
            if multi_label:
                for label in labels:
                    pred_score = pred_cats.get(label, 0.0)
                    gold_score = gold_cats.get(label, 0.0)
                    if gold_score is not None:
                        if pred_score >= threshold and gold_score > 0:
                            f_per_type[label].tp += 1
                        elif pred_score >= threshold and gold_score == 0:
                            f_per_type[label].fp += 1
                        elif pred_score < threshold and gold_score > 0:
                            f_per_type[label].fn += 1
            elif pred_cats and gold_cats:
                # Get the highest-scoring for each.
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
                if gold_score is not None:
                    if pred_label == gold_label and pred_score >= threshold:
                        f_per_type[pred_label].tp += 1
                    else:
                        f_per_type[gold_label].fn += 1
                        if pred_score >= threshold:
                            f_per_type[pred_label].fp += 1
            elif gold_cats:
                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
                if gold_score is not None and gold_score > 0:
                    f_per_type[gold_label].fn += 1
            else:
                pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
                if pred_score >= threshold:
                    f_per_type[pred_label].fp += 1
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp += label_prf.tp
            micro_prf.fn += label_prf.fn
            micro_prf.fp += label_prf.fp
        n_cats = len(f_per_type) + 1e-100
        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
        # Limit macro_auc to those labels with gold annotations,
        # but still divide by all cats to avoid artificial boosting of datasets with missing labels
        macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
        results = {
            f"{attr}_score": None,
            f"{attr}_score_desc": None,
            f"{attr}_micro_p": micro_prf.precision,
            f"{attr}_micro_r": micro_prf.recall,
            f"{attr}_micro_f": micro_prf.fscore,
            f"{attr}_macro_p": macro_p,
            f"{attr}_macro_r": macro_r,
            f"{attr}_macro_f": macro_f,
            f"{attr}_macro_auc": macro_auc,
            f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
            f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
        }
        if len(labels) == 2 and not multi_label and positive_label:
            positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
            results[f"{attr}_score"] = positive_label_f
            results[f"{attr}_score_desc"] = f"F ({positive_label})"
        elif not multi_label:
            results[f"{attr}_score"] = results[f"{attr}_macro_f"]
            results[f"{attr}_score_desc"] = "macro F"
        else:
            results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
            results[f"{attr}_score_desc"] = "macro AUC"
        return results

    @staticmethod
    def score_links(
        examples: Iterable[Example], *, negative_labels: Iterable[str]
    ) -> Dict[str, Any]:
        """Returns PRF for predicted links on the entity level.
        To disentangle the performance of the NEL from the NER,
        this method only evaluates NEL links for entities that overlap
        between the gold reference and the predictions.

        examples (Iterable[Example]): Examples to score
        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
        RETURNS (Dict[str, Any]): A dictionary containing the scores.

        DOCS: https://nightly.spacy.io/api/scorer#score_links
        """
        f_per_type = {}
        for example in examples:
            gold_ent_by_offset = {}
            for gold_ent in example.reference.ents:
                gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent

            for pred_ent in example.predicted.ents:
                gold_span = gold_ent_by_offset.get(
                    (pred_ent.start_char, pred_ent.end_char), None
                )
                label = gold_span.label_
                if label not in f_per_type:
                    f_per_type[label] = PRFScore()
                gold = gold_span.kb_id_
                # only evaluating entities that overlap between gold and pred,
                # to disentangle the performance of the NEL from the NER
                if gold is not None:
                    pred = pred_ent.kb_id_
                    if gold in negative_labels and pred in negative_labels:
                        # ignore true negatives
                        pass
                    elif gold == pred:
                        f_per_type[label].tp += 1
                    elif gold in negative_labels:
                        f_per_type[label].fp += 1
                    elif pred in negative_labels:
                        f_per_type[label].fn += 1
                    else:
                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                        f_per_type[label].fp += 1
                        f_per_type[label].fn += 1
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp += label_prf.tp
            micro_prf.fn += label_prf.fn
            micro_prf.fp += label_prf.fp
        n_labels = len(f_per_type) + 1e-100
        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
        results = {
            f"nel_score": micro_prf.fscore,
            f"nel_score_desc": "micro F",
            f"nel_micro_p": micro_prf.precision,
            f"nel_micro_r": micro_prf.recall,
            f"nel_micro_f": micro_prf.fscore,
            f"nel_macro_p": macro_p,
            f"nel_macro_r": macro_r,
            f"nel_macro_f": macro_f,
            f"nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
        }
        return results

    @staticmethod
    def score_deps(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Token, str], Any] = getattr,
        head_attr: str = "head",
        head_getter: Callable[[Token, str], Token] = getattr,
        ignore_labels: Iterable[str] = SimpleFrozenList(),
        missing_values: Set[Any] = MISSING_VALUES,
        **cfg,
    ) -> Dict[str, Any]:
        """Returns the UAS, LAS, and LAS per type scores for dependency
        parses.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute containing the dependency label.
        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        head_attr (str): The attribute containing the head token. Defaults to
            'head'.
        head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided,
            head_getter(token, attr) should return the value of the head for an
            individual token.
        ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
        RETURNS (Dict[str, Any]): A dictionary containing the scores:
            attr_uas, attr_las, and attr_las_per_type.

        DOCS: https://nightly.spacy.io/api/scorer#score_deps
        """
        unlabelled = PRFScore()
        labelled = PRFScore()
        labelled_per_dep = dict()
        missing_indices = set()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment
            gold_deps = set()
            gold_deps_per_dep = {}
            for gold_i, token in enumerate(gold_doc):
                dep = getter(token, attr)
                head = head_getter(token, head_attr)
                if dep not in missing_values:
                    if dep not in ignore_labels:
                        gold_deps.add((gold_i, head.i, dep))
                        if dep not in labelled_per_dep:
                            labelled_per_dep[dep] = PRFScore()
                        if dep not in gold_deps_per_dep:
                            gold_deps_per_dep[dep] = set()
                        gold_deps_per_dep[dep].add((gold_i, head.i, dep))
                else:
                    missing_indices.add(gold_i)
            pred_deps = set()
            pred_deps_per_dep = {}
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                if align.x2y.lengths[token.i] != 1:
                    gold_i = None
                else:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                if gold_i not in missing_indices:
                    dep = getter(token, attr)
                    head = head_getter(token, head_attr)
                    if dep not in ignore_labels and token.orth_.strip():
                        if align.x2y.lengths[head.i] == 1:
                            gold_head = align.x2y[head.i].dataXd[0, 0]
                        else:
                            gold_head = None
                        # None is indistinct, so we can't just add it to the set
                        # Multiple (None, None) deps are possible
                        if gold_i is None or gold_head is None:
                            unlabelled.fp += 1
                            labelled.fp += 1
                        else:
                            pred_deps.add((gold_i, gold_head, dep))
                            if dep not in labelled_per_dep:
                                labelled_per_dep[dep] = PRFScore()
                            if dep not in pred_deps_per_dep:
                                pred_deps_per_dep[dep] = set()
                            pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
            labelled.score_set(pred_deps, gold_deps)
            for dep in labelled_per_dep:
                labelled_per_dep[dep].score_set(
                    pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
                )
            unlabelled.score_set(
                set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
            )
        if len(unlabelled) > 0:
            return {
                f"{attr}_uas": unlabelled.fscore,
                f"{attr}_las": labelled.fscore,
                f"{attr}_las_per_type": {
                    k: v.to_dict() for k, v in labelled_per_dep.items()
                },
            }
        else:
            return {
                f"{attr}_uas": None,
                f"{attr}_las": None,
                f"{attr}_las_per_type": None,
            }


def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
    """Compute micro-PRF and per-entity PRF scores for a sequence of examples.
    """
    score_per_type = defaultdict(PRFScore)
    for eg in examples:
        if not eg.y.has_annotation("ENT_IOB"):
            continue
        golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
        align_x2y = eg.alignment.x2y
        for pred_ent in eg.x.ents:
            if pred_ent.label_ not in score_per_type:
                score_per_type[pred_ent.label_] = PRFScore()
            indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
            if len(indices):
                g_span = eg.y[indices[0] : indices[-1] + 1]
                # Check we aren't missing annotation on this span. If so,
                # our prediction is neither right nor wrong, we just
                # ignore it.
                if all(token.ent_iob != 0 for token in g_span):
                    key = (pred_ent.label_, indices[0], indices[-1] + 1)
                    if key in golds:
                        score_per_type[pred_ent.label_].tp += 1
                        golds.remove(key)
                    else:
                        score_per_type[pred_ent.label_].fp += 1
        for label, start, end in golds:
            score_per_type[label].fn += 1
    totals = PRFScore()
    for prf in score_per_type.values():
        totals += prf
    if len(totals) > 0:
        return {
            "ents_p": totals.precision,
            "ents_r": totals.recall,
            "ents_f": totals.fscore,
            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
        }
    else:
        return {
            "ents_p": None,
            "ents_r": None,
            "ents_f": None,
            "ents_per_type": None,
        }


# The following implementation of roc_auc_score() is adapted from
# scikit-learn, which is distributed under the New BSD License.
# Copyright (c) 2007–2019 The scikit-learn developers.
# See licenses/3rd_party_licenses.txt
def _roc_auc_score(y_true, y_score):
    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.

    Note: this implementation is restricted to the binary classification task

    Parameters
    ----------
    y_true : array, shape = [n_samples] or [n_samples, n_classes]
        True binary labels or binary label indicators.
        The multiclass case expects shape = [n_samples] and labels
        with values in ``range(n_classes)``.

    y_score : array, shape = [n_samples] or [n_samples, n_classes]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers). For binary
        y_true, y_score is supposed to be the score of the class with greater
        label. The multiclass case expects shape = [n_samples, n_classes]
        where the scores correspond to probability estimates.

    Returns
    -------
    auc : float

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.

    .. [3] `Analyzing a portion of the ROC curve. McClish, 1989
            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
    """
    if len(np.unique(y_true)) != 2:
        raise ValueError(Errors.E165.format(label=np.unique(y_true)))
    fpr, tpr, _ = _roc_curve(y_true, y_score)
    return _auc(fpr, tpr)


def _roc_curve(y_true, y_score):
    """Compute Receiver operating characteristic (ROC)

    Note: this implementation is restricted to the binary classification task.

    Parameters
    ----------

    y_true : array, shape = [n_samples]
        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
        pos_label should be explicitly given.

    y_score : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    Returns
    -------
    fpr : array, shape = [>2]
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= thresholds[i].

    tpr : array, shape = [>2]
        Increasing true positive rates such that element i is the true
        positive rate of predictions with score >= thresholds[i].

    thresholds : array, shape = [n_thresholds]
        Decreasing thresholds on the decision function used to compute
        fpr and tpr. `thresholds[0]` represents no instances being predicted
        and is arbitrarily set to `max(y_score) + 1`.

    Notes
    -----
    Since the thresholds are sorted from low to high values, they
    are reversed upon returning them to ensure they correspond to both ``fpr``
    and ``tpr``, which are sorted in reversed order during their calculation.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.
    """
    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)

    # Add an extra threshold position
    # to make sure that the curve starts at (0, 0)
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]
    thresholds = np.r_[thresholds[0] + 1, thresholds]

    if fps[-1] <= 0:
        fpr = np.repeat(np.nan, fps.shape)
    else:
        fpr = fps / fps[-1]

    if tps[-1] <= 0:
        tpr = np.repeat(np.nan, tps.shape)
    else:
        tpr = tps / tps[-1]

    return fpr, tpr, thresholds


def _binary_clf_curve(y_true, y_score):
    """Calculate true and false positives per binary classification threshold.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification

    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).

    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : array, shape = [n_thresholds]
        Decreasing score values.
    """
    pos_label = 1.0

    y_true = np.ravel(y_true)
    y_score = np.ravel(y_score)

    # make y_true a boolean vector
    y_true = y_true == pos_label

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    weight = 1.0

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = _stable_cumsum(y_true * weight)[threshold_idxs]
    fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]


def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    """Use high precision for cumsum and check that final value matches sum

    Parameters
    ----------
    arr : array-like
        To be cumulatively summed as flat
    axis : int, optional
        Axis along which the cumulative sum is computed.
        The default (None) is to compute the cumsum over the flattened array.
    rtol : float
        Relative tolerance, see ``np.allclose``
    atol : float
        Absolute tolerance, see ``np.allclose``
    """
    out = np.cumsum(arr, axis=axis, dtype=np.float64)
    expected = np.sum(arr, axis=axis, dtype=np.float64)
    if not np.all(
        np.isclose(
            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
        )
    ):
        raise ValueError(Errors.E163)
    return out


def _auc(x, y):
    """Compute Area Under the Curve (AUC) using the trapezoidal rule

    This is a general function, given points on a curve.  For computing the
    area under the ROC-curve, see :func:`roc_auc_score`.

    Parameters
    ----------
    x : array, shape = [n]
        x coordinates. These must be either monotonic increasing or monotonic
        decreasing.
    y : array, shape = [n]
        y coordinates.

    Returns
    -------
    auc : float
    """
    x = np.ravel(x)
    y = np.ravel(y)

    direction = 1
    dx = np.diff(x)
    if np.any(dx < 0):
        if np.all(dx <= 0):
            direction = -1
        else:
            raise ValueError(Errors.E164.format(x=x))

    area = direction * np.trapz(y, x)
    if isinstance(area, np.memmap):
        # Reductions such as .sum used internally in np.trapz do not return a
        # scalar by default for numpy.memmap instances contrary to
        # regular numpy.ndarray instances.
        area = area.dtype.type(area)
    return area
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								from typing import Optional, Iterable, Dict, Set, Any, Callable, TYPE_CHECKING
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								import numpy as np
-												Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-09-24 18:38:57 +00:00
+								from collections import defaultdict
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
-												Renaming gold & annotation_setter (#6042)

* version bump to 3.0.0a16

* rename "gold" folder to "training"

* rename 'annotation_setter' to 'set_extra_annotations'

* formatting
											
										
										
											2020-09-09 08:31:03 +00:00
+								from .training import Example
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								from .tokens import Token, Doc, Span, MorphAnalysis
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								from .errors import Errors
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 13:20:11 +00:00
+								from .util import get_lang_class, SimpleFrozenList
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								from .morphology import Morphology
-												* Fix evaluation of NER in scorer.py

											
										
										
											2015-05-27 01:18:16 +00:00
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								if TYPE_CHECKING:
 								    # This lets us add type hints for mypy etc. without causing circular imports
 								    from .language import Language  # noqa: F401
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								DEFAULT_PIPELINE = ("senter", "tagger", "morphologizer", "parser", "ner", "textcat")
 								MISSING_VALUES = frozenset([None, 0, ""])
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
-												* Print parse if verbose in scorer

											
										
										
											2015-04-05 20:29:30 +00:00
-												Remove object subclassing

											
										
										
											2020-07-12 12:03:23 +00:00
+								class PRFScore:
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    """A precision / recall / F score."""
-												💫 Tidy up and auto-format .py files (#2983)

<!--- Provide a general summary of your changes in the title. -->

## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)

Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.

At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.

### Types of change
enhancement, code style

## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.

											
										
										
											2018-11-30 16:03:03 +00:00
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def __init__(self) -> None:
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 18:07:18 +00:00
+								        self.tp = 0
 								        self.fp = 0
 								        self.fn = 0
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								    def __len__(self) -> int:
 								        return self.tp + self.fp + self.fn
-												Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-09-24 18:38:57 +00:00
+								    def __iadd__(self, other):
 								        self.tp += other.tp
 								        self.fp += other.fp
 								        self.fn += other.fn
 								        return self
 								    def __add__(self, other):
 								        return PRFScore(
-												Tidy up and auto-format

											
										
										
											2020-09-29 19:39:28 +00:00
+								            tp=self.tp + other.tp, fp=self.fp + other.fp, fn=self.fn + other.fn
-												Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-09-24 18:38:57 +00:00
+								        )
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def score_set(self, cand: set, gold: set) -> None:
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 18:07:18 +00:00
+								        self.tp += len(cand.intersection(gold))
 								        self.fp += len(cand - gold)
 								        self.fn += len(gold - cand)
 								    @property
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def precision(self) -> float:
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 02:00:14 +00:00
+								        return self.tp / (self.tp + self.fp + 1e-100)
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 18:07:18 +00:00
 								    @property
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def recall(self) -> float:
-												Fix the git "sparse checkout" functionality (#5973)

* Fix the git sparse checkout functionality

* Format
											
										
										
											2020-08-26 02:00:14 +00:00
+								        return self.tp / (self.tp + self.fn + 1e-100)
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 18:07:18 +00:00
 								    @property
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def fscore(self) -> float:
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 18:07:18 +00:00
+								        p = self.precision
 								        r = self.recall
-												Various small tweaks to project CLI (#5965)

* Fix up/download of http and local paths

* Support git_sparse_checkout for assets

* Fix scorer

* Handle already-present directories for git assets

* Improve convert command

* Fix support for existant files in git assets

* Support branches in git sparse checkout

* Format

* Fix git assets

* Document git block in assets

* Fix test

* Fix test

* Revert "Fix test"

This reverts commit cf3097260fd2205604981dcee1030f835a548a3f.

* Revert "Fix test"

This reverts commit 964d636e2782ed4660942dffc905cb7e739659d6.

* Dont multiply p/r/f by 100

* Display scores * 100 during training
											
										
										
											2020-08-24 22:30:52 +00:00
+								        return 2 * ((p * r) / (p + r + 1e-100))
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 18:07:18 +00:00
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def to_dict(self) -> Dict[str, float]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        return {"p": self.precision, "r": self.recall, "f": self.fscore}
-												* Update spacy.scorer, to use P/R/F to support tokenization errors

											
										
										
											2015-05-24 18:07:18 +00:00
-												Remove object subclassing

											
										
										
											2020-07-12 12:03:23 +00:00
+								class ROCAUCScore:
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								    """An AUC ROC score. This is only defined for binary classification.
 								    Use the method is_binary before calculating the score, otherwise it
 								    may throw an error."""
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
-												Fix types in Scorer

											
										
										
											2020-07-29 08:39:33 +00:00
+								    def __init__(self) -> None:
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								        self.golds = []
 								        self.cands = []
 								        self.saved_score = 0.0
 								        self.saved_score_at_len = 0
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def score_set(self, cand, gold) -> None:
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								        self.cands.append(cand)
 								        self.golds.append(gold)
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								    def is_binary(self):
 								        return len(np.unique(self.golds)) == 2
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								    @property
 								    def score(self):
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								        if not self.is_binary():
 								            raise ValueError(Errors.E165.format(label=set(self.golds)))
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								        if len(self.golds) == self.saved_score_at_len:
 								            return self.saved_score
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								        self.saved_score = _roc_auc_score(self.golds, self.cands)
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								        self.saved_score_at_len = len(self.golds)
 								        return self.saved_score
-												Remove object subclassing

											
										
										
											2020-07-12 12:03:23 +00:00
+								class Scorer:
-												Update Scorer and add API docs

											
										
										
											2019-05-24 12:06:04 +00:00
+								    """Compute evaluation scores."""
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def __init__(
 								        self,
 								        nlp: Optional["Language"] = None,
 								        default_lang: str = "xx",
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        **cfg,
 								    ) -> None:
-												Update Scorer and add API docs

											
										
										
											2019-05-24 12:06:04 +00:00
+								        """Initialize the Scorer.
-												Update docs links in codebase

											
										
										
											2020-09-04 10:58:50 +00:00
+								        DOCS: https://nightly.spacy.io/api/scorer#init
-												Update Scorer and add API docs

											
										
										
											2019-05-24 12:06:04 +00:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        self.nlp = nlp
 								        self.cfg = cfg
 								        if not nlp:
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								            nlp = get_lang_class(default_lang)()
 								            for pipe in default_pipeline:
 								                nlp.add_pipe(pipe)
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            self.nlp = nlp
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        """Evaluate a list of Examples.
 								        examples (Iterable[Example]): The predicted annotations + correct annotations.
 								        RETURNS (Dict): A dictionary of scores.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
-												Update docs links in codebase

											
										
										
											2020-09-04 10:58:50 +00:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score
-												Add a tagger-based SentenceRecognizer (#4713)

* Add sent_starts to GoldParse

* Add SentTagger pipeline component

Add `SentTagger` pipeline component as a subclass of `Tagger`.

* Model reduces default parameters from `Tagger` to be small and fast
* Hard-coded set of two labels:
  * S (1): token at beginning of sentence
  * I (0): all other sentence positions
* Sets `token.sent_start` values

* Add sentence segmentation to Scorer

Report `sent_p/r/f` for sentence boundaries, which may be provided by
various pipeline components.

* Add sentence segmentation to CLI evaluate

* Add senttagger metrics/scoring to train CLI

* Rename SentTagger to SentenceRecognizer

* Add SentenceRecognizer to spacy.pipes imports

* Add SentenceRecognizer serialization test

* Shorten component name to sentrec

* Remove duplicates from train CLI output metrics

											
										
										
											2019-11-28 10:10:07 +00:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        scores = {}
 								        if hasattr(self.nlp.tokenizer, "score"):
 								            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
 								        for name, component in self.nlp.pipeline:
 								            if hasattr(component, "score"):
 								                scores.update(component.score(examples, **self.cfg))
 								        return scores
-												* Add scorer script

											
										
										
											2015-03-11 01:07:03 +00:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								    @staticmethod
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        """Returns accuracy and PRF scores for tokenization.
 								        * token_acc: # correct tokens / # gold tokens
 								        * token_p/r/f: PRF for token character spans
-												* Add scorer script

											
										
										
											2015-03-11 01:07:03 +00:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        examples (Iterable[Example]): Examples to score
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        RETURNS (Dict[str, Any]): A dictionary containing the scores
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								            token_acc/p/r/f.
-												Update docs links in codebase

											
										
										
											2020-09-04 10:58:50 +00:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
-												Add LAS per dependency to Scorer (#4560)


											
										
										
											2019-10-31 20:18:16 +00:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        acc_score = PRFScore()
 								        prf_score = PRFScore()
 								        for example in examples:
 								            gold_doc = example.reference
 								            pred_doc = example.predicted
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								            if gold_doc.has_unknown_spaces:
 								                continue
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            align = example.alignment
 								            gold_spans = set()
 								            pred_spans = set()
 								            for token in gold_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                gold_spans.add((token.idx, token.idx + len(token)))
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                pred_spans.add((token.idx, token.idx + len(token)))
 								                if align.x2y.lengths[token.i] != 1:
 								                    acc_score.fp += 1
 								                else:
 								                    acc_score.tp += 1
 								            prf_score.score_set(pred_spans, gold_spans)
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        if len(acc_score) > 0:
 								            return {
 								                "token_acc": acc_score.fscore,
 								                "token_p": prf_score.precision,
 								                "token_r": prf_score.recall,
 								                "token_f": prf_score.fscore,
 								            }
 								        else:
 								            return {
 								                "token_acc": None,
 								                "token_p": None,
 								                "token_r": None,
 								                "token_f": None
 								            }
-												Add LAS per dependency to Scorer (#4560)


											
										
										
											2019-10-31 20:18:16 +00:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								    @staticmethod
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def score_token_attr(
 								        examples: Iterable[Example],
 								        attr: str,
 								        *,
 								        getter: Callable[[Token, str], Any] = getattr,
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        missing_values: Set[Any] = MISSING_VALUES,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        **cfg,
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								    ) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        """Returns an accuracy score for a token-level attribute.
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            getter(token, attr) should return the value of the attribute for an
 								            individual token.
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        RETURNS (Dict[str, Any]): A dictionary containing the accuracy score
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								            under the key attr_acc.
-												Update docs links in codebase

											
										
										
											2020-09-04 10:58:50 +00:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
-												Update Scorer.ents_per_type

											
										
										
											2019-07-10 09:19:28 +00:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        tag_score = PRFScore()
 								        for example in examples:
 								            gold_doc = example.reference
 								            pred_doc = example.predicted
 								            align = example.alignment
 								            gold_tags = set()
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								            missing_indices = set()
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            for gold_i, token in enumerate(gold_doc):
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								                value = getter(token, attr)
 								                if value not in missing_values:
 								                    gold_tags.add((gold_i, getter(token, attr)))
 								                else:
 								                    missing_indices.add(gold_i)
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            pred_tags = set()
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                if align.x2y.lengths[token.i] == 1:
 								                    gold_i = align.x2y[token.i].dataXd[0, 0]
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								                    if gold_i not in missing_indices:
 								                        pred_tags.add((gold_i, getter(token, attr)))
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            tag_score.score_set(pred_tags, gold_tags)
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        score_key = f"{attr}_acc"
 								        if len(tag_score) == 0:
 								            return {score_key: None}
 								        else:
 								            return {score_key: tag_score.fscore}
-												Update Scorer.ents_per_type

											
										
										
											2019-07-10 09:19:28 +00:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								    @staticmethod
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def score_token_attr_per_feat(
 								        examples: Iterable[Example],
 								        attr: str,
 								        *,
 								        getter: Callable[[Token, str], Any] = getattr,
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        missing_values: Set[Any] = MISSING_VALUES,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        **cfg,
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								    ) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        """Return PRF scores per feat for a token attribute in UFEATS format.
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            getter(token, attr) should return the value of the attribute for an
 								            individual token.
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        RETURNS (dict): A dictionary containing the per-feat PRF scores under
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            the key attr_per_feat.
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        per_feat = {}
 								        for example in examples:
 								            pred_doc = example.predicted
 								            gold_doc = example.reference
 								            align = example.alignment
 								            gold_per_feat = {}
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								            missing_indices = set()
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            for gold_i, token in enumerate(gold_doc):
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								                value = getter(token, attr)
 								                morph = gold_doc.vocab.strings[value]
 								                if value not in missing_values and morph != Morphology.EMPTY_MORPH:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								                    for feat in morph.split(Morphology.FEATURE_SEP):
 								                        field, values = feat.split(Morphology.FIELD_SEP)
 								                        if field not in per_feat:
 								                            per_feat[field] = PRFScore()
 								                        if field not in gold_per_feat:
 								                            gold_per_feat[field] = set()
 								                        gold_per_feat[field].add((gold_i, feat))
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								                else:
 								                    missing_indices.add(gold_i)
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            pred_per_feat = {}
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                if align.x2y.lengths[token.i] == 1:
 								                    gold_i = align.x2y[token.i].dataXd[0, 0]
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								                    if gold_i not in missing_indices:
 								                        value = getter(token, attr)
 								                        morph = gold_doc.vocab.strings[value]
 								                        if value not in missing_values and morph != Morphology.EMPTY_MORPH:
 								                            for feat in morph.split(Morphology.FEATURE_SEP):
 								                                field, values = feat.split(Morphology.FIELD_SEP)
 								                                if field not in per_feat:
 								                                    per_feat[field] = PRFScore()
 								                                if field not in pred_per_feat:
 								                                    pred_per_feat[field] = set()
 								                                pred_per_feat[field].add((gold_i, feat))
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            for field in per_feat:
 								                per_feat[field].score_set(
-												fix micro PRF for textcat (#6130)

* fix micro PRF for textcat

* small fix
											
										
										
											2020-09-24 08:31:17 +00:00
+								                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								                )
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        score_key = f"{attr}_per_feat"
 								        if any([len(v) for v in per_feat.values()]):
 								            result = {k: v.to_dict() for k, v in per_feat.items()}
 								            return {score_key: result}
 								        else:
 								            return {score_key: None}
-												train is from-config by default (#5575)

* verbose and tag_map options

* adding init_tok2vec option and only changing the tok2vec that is specified

* adding omit_extra_lookups and verifying textcat config

* wip

* pretrain bugfix

* add replace and resume options

* train_textcat fix

* raw text functionality

* improve UX when KeyError or when input data can't be parsed

* avoid unnecessary access to goldparse in TextCat pipe

* save performance information in nlp.meta

* add noise_level to config

* move nn_parser's defaults to config file

* multitask in config - doesn't work yet

* scorer offering both F and AUC options, need to be specified in config

* add textcat verification code from old train script

* small fixes to config files

* clean up

* set default config for ner/parser to allow create_pipe to work as before

* two more test fixes

* small fixes

* cleanup

* fix NER pickling + additional unit test

* create_pipe as before
											
										
										
											2020-06-12 00:02:07 +00:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								    @staticmethod
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								    def score_spans(
 								        examples: Iterable[Example],
 								        attr: str,
 								        *,
-												Update docs, types and API consistency

											
										
										
											2020-08-17 14:45:24 +00:00
+								        getter: Callable[[Doc, str], Iterable[Span]] = getattr,
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        has_annotation: Optional[Callable[[Doc], bool]] = None,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        **cfg,
 								    ) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        """Returns PRF scores for labeled spans.
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
-												Update docs, types and API consistency

											
										
										
											2020-08-17 14:45:24 +00:00
+								        getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If
 								            provided, getter(doc, attr) should return the spans for the
 								            individual doc.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
 								            the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
-												Update docs links in codebase

											
										
										
											2020-09-04 10:58:50 +00:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_spans
-												Update Scorer and add API docs

											
										
										
											2019-05-24 12:06:04 +00:00
+								        """
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        score = PRFScore()
 								        score_per_type = dict()
 								        for example in examples:
 								            pred_doc = example.predicted
 								            gold_doc = example.reference
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								            # Option to handle docs without sents
 								            if has_annotation is not None:
 								                if not has_annotation(gold_doc):
 								                    continue
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            # Find all labels in gold and doc
 								            labels = set(
 								                [k.label_ for k in getter(gold_doc, attr)]
 								                + [k.label_ for k in getter(pred_doc, attr)]
 								            )
 								            # Set up all labels for per type scoring and prepare gold per type
 								            gold_per_type = {label: set() for label in labels}
 								            for label in labels:
 								                if label not in score_per_type:
 								                    score_per_type[label] = PRFScore()
 								            # Find all predidate labels, for all and per type
 								            gold_spans = set()
 								            pred_spans = set()
 								            for span in getter(gold_doc, attr):
 								                gold_span = (span.label_, span.start, span.end - 1)
 								                gold_spans.add(gold_span)
 								                gold_per_type[span.label_].add((span.label_, span.start, span.end - 1))
 								            pred_per_type = {label: set() for label in labels}
-												Revert changes to Scorer.score_spans

											
										
										
											2020-09-25 06:21:30 +00:00
+								            for span in example.get_aligned_spans_x2y(getter(pred_doc, attr)):
 								                pred_spans.add((span.label_, span.start, span.end - 1))
 								                pred_per_type[span.label_].add((span.label_, span.start, span.end - 1))
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            # Scores per label
 								            for k, v in score_per_type.items():
 								                if k in pred_per_type:
 								                    v.score_set(pred_per_type[k], gold_per_type[k])
 								            # Score for all labels
 								            score.score_set(pred_spans, gold_spans)
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        if len(score) > 0:
 								            return {
 								                f"{attr}_p": score.precision,
 								                f"{attr}_r": score.recall,
 								                f"{attr}_f": score.fscore,
 								                f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
 								            }
 								        else:
 								            return {
 								                f"{attr}_p": None,
 								                f"{attr}_r": None,
 								                f"{attr}_f": None,
 								                f"{attr}_per_type": None,
 								            }
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
 								    @staticmethod
 								    def score_cats(
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        examples: Iterable[Example],
 								        attr: str,
 								        *,
-												Fix types in Scorer

											
										
										
											2020-07-29 08:39:33 +00:00
+								        getter: Callable[[Doc, str], Any] = getattr,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 13:20:11 +00:00
+								        labels: Iterable[str] = SimpleFrozenList(),
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        multi_label: bool = True,
 								        positive_label: Optional[str] = None,
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								        threshold: Optional[float] = None,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        **cfg,
 								    ) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        """Returns PRF and ROC AUC scores for a doc-level attribute with a
-												Update cats scoring to provide overall score

* Provide top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential scores keys, setting unused keys to `None`
* Update CLI evaluate accordingly

											
										
										
											2020-07-27 09:17:52 +00:00
+								        dict with scores for each label like Doc.cats. The reported overall
 								        score depends on the scorer settings.
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute to score.
-												Fix types in Scorer

											
										
										
											2020-07-29 08:39:33 +00:00
+								        getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            getter(doc, attr) should return the values for the individual doc.
 								        labels (Iterable[str]): The set of possible labels. Defaults to [].
 								        multi_label (bool): Whether the attribute allows multiple labels.
 								            Defaults to True.
 								        positive_label (str): The positive label for a binary task with
 								            exclusive classes. Defaults to None.
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								        threshold (float): Cutoff to consider a prediction "positive". Defaults
 								            to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
 								            otherwise.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        RETURNS (Dict[str, Any]): A dictionary containing the scores, with
 								            inapplicable scores as None:
-												Update cats scoring to provide overall score

* Provide top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential scores keys, setting unused keys to `None`
* Update CLI evaluate accordingly

											
										
										
											2020-07-27 09:17:52 +00:00
+								            for all:
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								                attr_score (one of attr_micro_f / attr_macro_f / attr_macro_auc),
-												Update cats scoring to provide overall score

* Provide top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential scores keys, setting unused keys to `None`
* Update CLI evaluate accordingly

											
										
										
											2020-07-27 09:17:52 +00:00
+								                attr_score_desc (text description of the overall score),
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								                attr_micro_p,
 								                attr_micro_r,
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								                attr_micro_f,
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								                attr_macro_p,
 								                attr_macro_r,
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								                attr_macro_f,
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								                attr_macro_auc,
-												Update cats scoring to provide overall score

* Provide top-level score as `attr_score`
* Provide a description of the score as `attr_score_desc`
* Provide all potential scores keys, setting unused keys to `None`
* Update CLI evaluate accordingly

											
										
										
											2020-07-27 09:17:52 +00:00
+								                attr_f_per_type,
 								                attr_auc_per_type
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
-												Update docs links in codebase

											
										
										
											2020-09-04 10:58:50 +00:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_cats
-												Update Scorer and add API docs

											
										
										
											2019-05-24 12:06:04 +00:00
+								        """
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								        if threshold is None:
 								            threshold = 0.5 if multi_label else 0.0
 								        f_per_type = {label: PRFScore() for label in labels}
 								        auc_per_type = {label: ROCAUCScore() for label in labels}
 								        labels = set(labels)
 								        if labels:
 								            for eg in examples:
 								                labels.update(eg.predicted.cats.keys())
 								                labels.update(eg.reference.cats.keys())
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        for example in examples:
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								            # Through this loop, None in the gold_cats indicates missing label.
 								            pred_cats = getter(example.predicted, attr)
 								            gold_cats = getter(example.reference, attr)
 								            for label in labels:
 								                pred_score = pred_cats.get(label, 0.0)
 								                gold_score = gold_cats.get(label, 0.0)
 								                if gold_score is not None:
 								                    auc_per_type[label].score_set(pred_score, gold_score)
 								            if multi_label:
 								                for label in labels:
 								                    pred_score = pred_cats.get(label, 0.0)
 								                    gold_score = gold_cats.get(label, 0.0)
 								                    if gold_score is not None:
 								                        if pred_score >= threshold and gold_score > 0:
 								                            f_per_type[label].tp += 1
 								                        elif pred_score >= threshold and gold_score == 0:
 								                            f_per_type[label].fp += 1
 								                        elif pred_score < threshold and gold_score > 0:
 								                            f_per_type[label].fn += 1
 								            elif pred_cats and gold_cats:
 								                # Get the highest-scoring for each.
 								                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
 								                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
 								                if gold_score is not None:
 								                    if pred_label == gold_label and pred_score >= threshold:
 								                        f_per_type[pred_label].tp += 1
 								                    else:
 								                        f_per_type[gold_label].fn += 1
 								                        if pred_score >= threshold:
 								                            f_per_type[pred_label].fp += 1
 								            elif gold_cats:
 								                gold_label, gold_score = max(gold_cats, key=lambda it: it[1])
 								                if gold_score is not None and gold_score > 0:
 								                    f_per_type[gold_label].fn += 1
 								            else:
 								                pred_label, pred_score = max(pred_cats, key=lambda it: it[1])
 								                if pred_score >= threshold:
 								                    f_per_type[pred_label].fp += 1
 								        micro_prf = PRFScore()
 								        for label_prf in f_per_type.values():
-												fix micro PRF for textcat (#6130)

* fix micro PRF for textcat

* small fix
											
										
										
											2020-09-24 08:31:17 +00:00
+								            micro_prf.tp += label_prf.tp
 								            micro_prf.fn += label_prf.fn
 								            micro_prf.fp += label_prf.fp
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								        n_cats = len(f_per_type) + 1e-100
 								        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
 								        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
 								        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								        # Limit macro_auc to those labels with gold annotations,
 								        # but still divide by all cats to avoid artificial boosting of datasets with missing labels
 								        macro_auc = sum(auc.score if auc.is_binary() else 0.0 for auc in auc_per_type.values()) / n_cats
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        results = {
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								            f"{attr}_score": None,
 								            f"{attr}_score_desc": None,
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								            f"{attr}_micro_p": micro_prf.precision,
 								            f"{attr}_micro_r": micro_prf.recall,
 								            f"{attr}_micro_f": micro_prf.fscore,
 								            f"{attr}_macro_p": macro_p,
 								            f"{attr}_macro_r": macro_r,
 								            f"{attr}_macro_f": macro_f,
-												Set macro AUC score in Scorer.score_cats

											
										
										
											2020-08-26 08:49:30 +00:00
+								            f"{attr}_macro_auc": macro_auc,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								            f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								            f"{attr}_auc_per_type": {k: v.score if v.is_binary() else None for k, v in auc_per_type.items()},
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        }
 								        if len(labels) == 2 and not multi_label and positive_label:
-												Tidy up and auto-format

											
										
										
											2020-08-09 20:36:23 +00:00
+								            positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
-												Be less choosy about reporting textcat scores (#5879)

* Set textcat scores more consistently

* Refactor textcat scores

* Fixes to scorer

* Add comments

* Add threshold

* Rename just 'f' to micro_f in textcat scorer

* Fix textcat score for two-class

* Fix syntax

* Fix textcat score

* Fix docstring
											
										
										
											2020-08-06 14:24:13 +00:00
+								            results[f"{attr}_score"] = positive_label_f
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								            results[f"{attr}_score_desc"] = f"F ({positive_label})"
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        elif not multi_label:
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								            results[f"{attr}_score"] = results[f"{attr}_macro_f"]
 								            results[f"{attr}_score_desc"] = "macro F"
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        else:
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								            results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
 								            results[f"{attr}_score_desc"] = "macro AUC"
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        return results
-												updates to NEL functionality (#6132)

* NEL: read sentences and ents from reference

* fiddling with sent_start annotations

* add KB serialization test

* KB write additional file with strings.json

* score_links function to calculate NEL P/R/F

* formatting

* documentation
											
										
										
											2020-09-24 14:53:59 +00:00
+								    @staticmethod
 								    def score_links(
 								        examples: Iterable[Example], *, negative_labels: Iterable[str]
 								    ) -> Dict[str, Any]:
 								        """Returns PRF for predicted links on the entity level.
 								        To disentangle the performance of the NEL from the NER,
 								        this method only evaluates NEL links for entities that overlap
 								        between the gold reference and the predictions.
 								        examples (Iterable[Example]): Examples to score
 								        negative_labels (Iterable[str]): The string values that refer to no annotation (e.g. "NIL")
 								        RETURNS (Dict[str, Any]): A dictionary containing the scores.
-												remove TODO note

											
										
										
											2020-10-26 09:37:25 +00:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_links
-												updates to NEL functionality (#6132)

* NEL: read sentences and ents from reference

* fiddling with sent_start annotations

* add KB serialization test

* KB write additional file with strings.json

* score_links function to calculate NEL P/R/F

* formatting

* documentation
											
										
										
											2020-09-24 14:53:59 +00:00
+								        """
 								        f_per_type = {}
 								        for example in examples:
 								            gold_ent_by_offset = {}
 								            for gold_ent in example.reference.ents:
 								                gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent
 								            for pred_ent in example.predicted.ents:
 								                gold_span = gold_ent_by_offset.get(
 								                    (pred_ent.start_char, pred_ent.end_char), None
 								                )
 								                label = gold_span.label_
-												Tidy up and auto-format

											
										
										
											2020-09-29 19:39:28 +00:00
+								                if label not in f_per_type:
-												updates to NEL functionality (#6132)

* NEL: read sentences and ents from reference

* fiddling with sent_start annotations

* add KB serialization test

* KB write additional file with strings.json

* score_links function to calculate NEL P/R/F

* formatting

* documentation
											
										
										
											2020-09-24 14:53:59 +00:00
+								                    f_per_type[label] = PRFScore()
 								                gold = gold_span.kb_id_
 								                # only evaluating entities that overlap between gold and pred,
 								                # to disentangle the performance of the NEL from the NER
 								                if gold is not None:
 								                    pred = pred_ent.kb_id_
 								                    if gold in negative_labels and pred in negative_labels:
 								                        # ignore true negatives
 								                        pass
 								                    elif gold == pred:
 								                        f_per_type[label].tp += 1
 								                    elif gold in negative_labels:
 								                        f_per_type[label].fp += 1
 								                    elif pred in negative_labels:
 								                        f_per_type[label].fn += 1
 								                    else:
 								                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
 								                        f_per_type[label].fp += 1
 								                        f_per_type[label].fn += 1
 								        micro_prf = PRFScore()
 								        for label_prf in f_per_type.values():
 								            micro_prf.tp += label_prf.tp
 								            micro_prf.fn += label_prf.fn
 								            micro_prf.fp += label_prf.fp
 								        n_labels = len(f_per_type) + 1e-100
 								        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
 								        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
 								        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
 								        results = {
 								            f"nel_score": micro_prf.fscore,
 								            f"nel_score_desc": "micro F",
 								            f"nel_micro_p": micro_prf.precision,
 								            f"nel_micro_r": micro_prf.recall,
 								            f"nel_micro_f": micro_prf.fscore,
 								            f"nel_macro_p": macro_p,
 								            f"nel_macro_r": macro_r,
 								            f"nel_macro_f": macro_f,
 								            f"nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
 								        }
 								        return results
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								    @staticmethod
 								    def score_deps(
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        examples: Iterable[Example],
 								        attr: str,
 								        *,
 								        getter: Callable[[Token, str], Any] = getattr,
 								        head_attr: str = "head",
-												Update docs, types and API consistency

											
										
										
											2020-08-17 14:45:24 +00:00
+								        head_getter: Callable[[Token, str], Token] = getattr,
-												Use frozen list with custom errors

We don't want to break backwards compatibility too much but we also want to provide the best possible UX

											
										
										
											2020-08-29 13:20:11 +00:00
+								        ignore_labels: Iterable[str] = SimpleFrozenList(),
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        missing_values: Set[Any] = MISSING_VALUES,
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        **cfg,
 								    ) -> Dict[str, Any]:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        """Returns the UAS, LAS, and LAS per type scores for dependency
 								        parses.
 								        examples (Iterable[Example]): Examples to score
 								        attr (str): The attribute containing the dependency label.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            getter(token, attr) should return the value of the attribute for an
 								            individual token.
 								        head_attr (str): The attribute containing the head token. Defaults to
 								            'head'.
-												Update docs, types and API consistency

											
										
										
											2020-08-17 14:45:24 +00:00
+								        head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided,
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            head_getter(token, attr) should return the value of the head for an
 								            individual token.
 								        ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
+								        RETURNS (Dict[str, Any]): A dictionary containing the scores:
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            attr_uas, attr_las, and attr_las_per_type.
-												Small adjustments to Scorer and docs

											
										
										
											2020-07-28 19:39:42 +00:00
-												Update docs links in codebase

											
										
										
											2020-09-04 10:58:50 +00:00
+								        DOCS: https://nightly.spacy.io/api/scorer#score_deps
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        """
 								        unlabelled = PRFScore()
 								        labelled = PRFScore()
 								        labelled_per_dep = dict()
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        missing_indices = set()
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								        for example in examples:
 								            gold_doc = example.reference
 								            pred_doc = example.predicted
 								            align = example.alignment
 								            gold_deps = set()
 								            gold_deps_per_dep = {}
 								            for gold_i, token in enumerate(gold_doc):
 								                dep = getter(token, attr)
 								                head = head_getter(token, head_attr)
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								                if dep not in missing_values:
 								                    if dep not in ignore_labels:
 								                        gold_deps.add((gold_i, head.i, dep))
 								                        if dep not in labelled_per_dep:
 								                            labelled_per_dep[dep] = PRFScore()
 								                        if dep not in gold_deps_per_dep:
 								                            gold_deps_per_dep[dep] = set()
 								                        gold_deps_per_dep[dep].add((gold_i, head.i, dep))
 								                else:
 								                    missing_indices.add(gold_i)
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            pred_deps = set()
 								            pred_deps_per_dep = {}
 								            for token in pred_doc:
 								                if token.orth_.isspace():
 								                    continue
 								                if align.x2y.lengths[token.i] != 1:
 								                    gold_i = None
 								                else:
 								                    gold_i = align.x2y[token.i].dataXd[0, 0]
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								                if gold_i not in missing_indices:
 								                    dep = getter(token, attr)
 								                    head = head_getter(token, head_attr)
 								                    if dep not in ignore_labels and token.orth_.strip():
 								                        if align.x2y.lengths[head.i] == 1:
 								                            gold_head = align.x2y[head.i].dataXd[0, 0]
 								                        else:
 								                            gold_head = None
 								                        # None is indistinct, so we can't just add it to the set
 								                        # Multiple (None, None) deps are possible
 								                        if gold_i is None or gold_head is None:
 								                            unlabelled.fp += 1
 								                            labelled.fp += 1
 								                        else:
 								                            pred_deps.add((gold_i, gold_head, dep))
 								                            if dep not in labelled_per_dep:
 								                                labelled_per_dep[dep] = PRFScore()
 								                            if dep not in pred_deps_per_dep:
 								                                pred_deps_per_dep[dep] = set()
 								                            pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
-												Refactor the Scorer to improve flexibility (#5731)

* Refactor the Scorer to improve flexibility

Refactor the `Scorer` to improve flexibility for arbitrary pipeline
components.

* Individual pipeline components provide their own `evaluate` methods
that score a list of `Example`s and return a dictionary of scores
* `Scorer` is initialized either:
  * with a provided pipeline containing components to be scored
  * with a default pipeline containing the built-in statistical
    components (senter, tagger, morphologizer, parser, ner)
* `Scorer.score` evaluates a list of `Example`s and returns a dictionary
of scores referring to the scores provided by the components in the
pipeline

Significant differences:

* `tags_acc` is renamed to `tag_acc` to be consistent with `token_acc`
and the new `morph_acc`, `pos_acc`, and `lemma_acc`
* Scoring is no longer cumulative: `Scorer.score` scores a list of
examples rather than a single example and does not retain any state
about previously scored examples
* PRF values in the returned scores are no longer multiplied by 100

* Add kwargs to Morphologizer.evaluate

* Create generalized scoring methods in Scorer

* Generalized static scoring methods are added to `Scorer`
  * Methods require an attribute (either on Token or Doc) that is
used to key the returned scores

Naming differences:

* `uas`, `las`, and `las_per_type` in the scores dict are renamed to
`dep_uas`, `dep_las`, and `dep_las_per_type`

Scoring differences:

* `Doc.sents` is now scored as spans rather than on sentence-initial
token positions so that `Doc.sents` and `Doc.ents` can be scored with
the same method (this lowers scores since a single incorrect sentence
start results in two incorrect spans)

* Simplify / extend hasattr check for eval method

* Add hasattr check to tokenizer scoring
* Simplify to hasattr check for component scoring

* Reset Example alignment if docs are set

Reset the Example alignment if either doc is set in case the
tokenization has changed.

* Add PRF tokenization scoring for tokens as spans

Add PRF scores for tokens as character spans. The scores are:

* token_acc: # correct tokens / # gold tokens
* token_p/r/f: PRF for (token.idx, token.idx + len(token))

* Add docstring to Scorer.score_tokenization

* Rename component.evaluate() to component.score()

* Update Scorer API docs

* Update scoring for positive_label in textcat

* Fix TextCategorizer.score kwargs

* Update Language.evaluate docs

* Update score names in default config
											
										
										
											2020-07-25 10:53:02 +00:00
+								            labelled.score_set(pred_deps, gold_deps)
 								            for dep in labelled_per_dep:
 								                labelled_per_dep[dep].score_set(
 								                    pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
 								                )
 								            unlabelled.score_set(
 								                set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								            )
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								        if len(unlabelled) > 0:
 								            return {
 								                f"{attr}_uas": unlabelled.fscore,
 								                f"{attr}_las": labelled.fscore,
 								                f"{attr}_las_per_type": {
 								                    k: v.to_dict() for k, v in labelled_per_dep.items()
 								                },
 								            }
 								        else:
 								            return {
 								                f"{attr}_uas": None,
 								                f"{attr}_las": None,
 								                f"{attr}_las_per_type": None,
 								            }
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								def get_ner_prf(examples: Iterable[Example]) -> Dict[str, Any]:
 								    """Compute micro-PRF and per-entity PRF scores for a sequence of examples.
-												Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-09-24 18:38:57 +00:00
+								    """
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								    score_per_type = defaultdict(PRFScore)
-												Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-09-24 18:38:57 +00:00
+								    for eg in examples:
 								        if not eg.y.has_annotation("ENT_IOB"):
 								            continue
 								        golds = {(e.label_, e.start, e.end) for e in eg.y.ents}
 								        align_x2y = eg.alignment.x2y
 								        for pred_ent in eg.x.ents:
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								            if pred_ent.label_ not in score_per_type:
 								                score_per_type[pred_ent.label_] = PRFScore()
-												Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-09-24 18:38:57 +00:00
+								            indices = align_x2y[pred_ent.start : pred_ent.end].dataXd.ravel()
 								            if len(indices):
 								                g_span = eg.y[indices[0] : indices[-1] + 1]
 								                # Check we aren't missing annotation on this span. If so,
 								                # our prediction is neither right nor wrong, we just
 								                # ignore it.
 								                if all(token.ent_iob != 0 for token in g_span):
 								                    key = (pred_ent.label_, indices[0], indices[-1] + 1)
 								                    if key in golds:
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								                        score_per_type[pred_ent.label_].tp += 1
-												Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-09-24 18:38:57 +00:00
+								                        golds.remove(key)
 								                    else:
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								                        score_per_type[pred_ent.label_].fp += 1
-												Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-09-24 18:38:57 +00:00
+								        for label, start, end in golds:
-												Handle missing reference values in scorer (#6286)

* Handle missing reference values in scorer

Handle missing values in reference doc during scoring where it is
possible to detect an unset state for the attribute. If no reference
docs contain annotation, `None` is returned instead of a score. `spacy
evaluate` displays `-` for missing scores and the missing scores are
saved as `None`/`null` in the metrics.

Attributes without unset states:

* `token.head`: relies on `token.dep` to recognize unset values
* `doc.cats`: unable to handle missing annotation

Additional changes:

* add optional `has_annotation` check to `score_scans` to replace
`doc.sents` hack
* update `score_token_attr_per_feat` to handle missing and empty morph
representations
* fix bug in `Doc.has_annotation` for normalization of `IS_SENT_START`
vs. `SENT_START`

* Fix import

* Update return types
											
										
										
											2020-11-03 14:47:18 +00:00
+								            score_per_type[label].fn += 1
 								    totals = PRFScore()
 								    for prf in score_per_type.values():
 								        totals += prf
 								    if len(totals) > 0:
 								        return {
 								            "ents_p": totals.precision,
 								            "ents_r": totals.recall,
 								            "ents_f": totals.fscore,
 								            "ents_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
 								        }
 								    else:
 								        return {
 								            "ents_p": None,
 								            "ents_r": None,
 								            "ents_f": None,
 								            "ents_per_type": None,
 								        }
-												Fix skipped documents in entity scorer (#6137)

* Fix skipped documents in entity scorer

* Add back the skipping of unannotated entities

* Update spacy/scorer.py

* Use more specific NER scorer

* Fix import

* Fix get_ner_prf

* Add scorer

* Fix scorer

Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2020-09-24 18:38:57 +00:00
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								# The following implementation of roc_auc_score() is adapted from
-												Clean up 3rd party license info (#6478)

Move scikit-learn license from `Scorer` to
`licenses/3rd_party_licenses.txt`.
											
										
										
											2020-12-02 09:15:23 +00:00
+								# scikit-learn, which is distributed under the New BSD License.
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								# Copyright (c) 2007–2019 The scikit-learn developers.
-												Clean up 3rd party license info (#6478)

Move scikit-learn license from `Scorer` to
`licenses/3rd_party_licenses.txt`.
											
										
										
											2020-12-02 09:15:23 +00:00
+								# See licenses/3rd_party_licenses.txt
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								def _roc_auc_score(y_true, y_score):
 								    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
 								    from prediction scores.
 								    Note: this implementation is restricted to the binary classification task
 								    Parameters
 								    ----------
 								    y_true : array, shape = [n_samples] or [n_samples, n_classes]
 								        True binary labels or binary label indicators.
 								        The multiclass case expects shape = [n_samples] and labels
 								        with values in ``range(n_classes)``.
 								    y_score : array, shape = [n_samples] or [n_samples, n_classes]
 								        Target scores, can either be probability estimates of the positive
 								        class, confidence values, or non-thresholded measure of decisions
 								        (as returned by "decision_function" on some classifiers). For binary
 								        y_true, y_score is supposed to be the score of the class with greater
 								        label. The multiclass case expects shape = [n_samples, n_classes]
 								        where the scores correspond to probability estimates.
 								    Returns
 								    -------
 								    auc : float
 								    References
 								    ----------
 								    .. [1] `Wikipedia entry for the Receiver operating characteristic
 								            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
 								    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
 								           Letters, 2006, 27(8):861-874.
 								    .. [3] `Analyzing a portion of the ROC curve. McClish, 1989
 								            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_
 								    """
 								    if len(np.unique(y_true)) != 2:
-												TextCat updates and fixes (#6263)

* small fix in example imports

* throw error when train_corpus or dev_corpus is not a string

* small fix in custom logger example

* limit macro_auc to labels with 2 annotations

* fix typo

* also create parents of output_dir if need be

* update documentation of textcat scores

* refactor TextCatEnsemble

* fix tests for new AUC definition

* bump to 3.0.0a42

* update docs

* rename to spacy.TextCatEnsemble.v2

* spacy.TextCatEnsemble.v1 in legacy

* cleanup

* small fix

* update to 3.0.0rc2

* fix import that got lost in merge

* cursed IDE

* fix two typos
											
										
										
											2020-10-18 12:50:41 +00:00
+								        raise ValueError(Errors.E165.format(label=np.unique(y_true)))
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								    fpr, tpr, _ = _roc_curve(y_true, y_score)
 								    return _auc(fpr, tpr)
 								def _roc_curve(y_true, y_score):
 								    """Compute Receiver operating characteristic (ROC)
 								    Note: this implementation is restricted to the binary classification task.
 								    Parameters
 								    ----------
 								    y_true : array, shape = [n_samples]
 								        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
 								        pos_label should be explicitly given.
 								    y_score : array, shape = [n_samples]
 								        Target scores, can either be probability estimates of the positive
 								        class, confidence values, or non-thresholded measure of decisions
 								        (as returned by "decision_function" on some classifiers).
 								    Returns
 								    -------
 								    fpr : array, shape = [>2]
 								        Increasing false positive rates such that element i is the false
 								        positive rate of predictions with score >= thresholds[i].
 								    tpr : array, shape = [>2]
 								        Increasing true positive rates such that element i is the true
 								        positive rate of predictions with score >= thresholds[i].
 								    thresholds : array, shape = [n_thresholds]
 								        Decreasing thresholds on the decision function used to compute
 								        fpr and tpr. `thresholds[0]` represents no instances being predicted
 								        and is arbitrarily set to `max(y_score) + 1`.
 								    Notes
 								    -----
 								    Since the thresholds are sorted from low to high values, they
 								    are reversed upon returning them to ensure they correspond to both ``fpr``
 								    and ``tpr``, which are sorted in reversed order during their calculation.
 								    References
 								    ----------
 								    .. [1] `Wikipedia entry for the Receiver operating characteristic
 								            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
 								    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
 								           Letters, 2006, 27(8):861-874.
 								    """
 								    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)
 								    # Add an extra threshold position
 								    # to make sure that the curve starts at (0, 0)
 								    tps = np.r_[0, tps]
 								    fps = np.r_[0, fps]
 								    thresholds = np.r_[thresholds[0] + 1, thresholds]
 								    if fps[-1] <= 0:
 								        fpr = np.repeat(np.nan, fps.shape)
 								    else:
 								        fpr = fps / fps[-1]
 								    if tps[-1] <= 0:
 								        tpr = np.repeat(np.nan, tps.shape)
 								    else:
 								        tpr = tps / tps[-1]
 								    return fpr, tpr, thresholds
 								def _binary_clf_curve(y_true, y_score):
 								    """Calculate true and false positives per binary classification threshold.
 								    Parameters
 								    ----------
 								    y_true : array, shape = [n_samples]
 								        True targets of binary classification
 								    y_score : array, shape = [n_samples]
 								        Estimated probabilities or decision function
 								    Returns
 								    -------
 								    fps : array, shape = [n_thresholds]
 								        A count of false positives, at index i being the number of negative
 								        samples assigned a score >= thresholds[i]. The total number of
 								        negative samples is equal to fps[-1] (thus true negatives are given by
 								        fps[-1] - fps).
 								    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
 								        An increasing count of true positives, at index i being the number
 								        of positive samples assigned a score >= thresholds[i]. The total
 								        number of positive samples is equal to tps[-1] (thus false negatives
 								        are given by tps[-1] - tps).
 								    thresholds : array, shape = [n_thresholds]
 								        Decreasing score values.
 								    """
-												Auto-format

											
										
										
											2019-09-18 17:56:55 +00:00
+								    pos_label = 1.0
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
 								    y_true = np.ravel(y_true)
 								    y_score = np.ravel(y_score)
 								    # make y_true a boolean vector
-												Auto-format

											
										
										
											2019-09-18 17:56:55 +00:00
+								    y_true = y_true == pos_label
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
 								    # sort scores and corresponding truth values
 								    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
 								    y_score = y_score[desc_score_indices]
 								    y_true = y_true[desc_score_indices]
-												Auto-format

											
										
										
											2019-09-18 17:56:55 +00:00
+								    weight = 1.0
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
 								    # y_score typically has many tied values. Here we extract
 								    # the indices associated with the distinct values. We also
 								    # concatenate a value for the end of the curve.
 								    distinct_value_indices = np.where(np.diff(y_score))[0]
 								    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
 								    # accumulate the true positives with decreasing threshold
 								    tps = _stable_cumsum(y_true * weight)[threshold_idxs]
 								    fps = 1 + threshold_idxs - tps
 								    return fps, tps, y_score[threshold_idxs]
 								def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
 								    """Use high precision for cumsum and check that final value matches sum
 								    Parameters
 								    ----------
 								    arr : array-like
 								        To be cumulatively summed as flat
 								    axis : int, optional
 								        Axis along which the cumulative sum is computed.
 								        The default (None) is to compute the cumsum over the flattened array.
 								    rtol : float
 								        Relative tolerance, see ``np.allclose``
 								    atol : float
 								        Absolute tolerance, see ``np.allclose``
 								    """
 								    out = np.cumsum(arr, axis=axis, dtype=np.float64)
 								    expected = np.sum(arr, axis=axis, dtype=np.float64)
-												Auto-format

											
										
										
											2019-09-18 17:56:55 +00:00
+								    if not np.all(
 								        np.isclose(
 								            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
 								        )
 								    ):
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
+								        raise ValueError(Errors.E163)
 								    return out
 								def _auc(x, y):
 								    """Compute Area Under the Curve (AUC) using the trapezoidal rule
 								    This is a general function, given points on a curve.  For computing the
 								    area under the ROC-curve, see :func:`roc_auc_score`.
 								    Parameters
 								    ----------
 								    x : array, shape = [n]
 								        x coordinates. These must be either monotonic increasing or monotonic
 								        decreasing.
 								    y : array, shape = [n]
 								        y coordinates.
 								    Returns
 								    -------
 								    auc : float
 								    """
 								    x = np.ravel(x)
 								    y = np.ravel(y)
 								    direction = 1
 								    dx = np.diff(x)
 								    if np.any(dx < 0):
 								        if np.all(dx <= 0):
 								            direction = -1
 								        else:
-												Tidy up errors and warnings

											
										
										
											2020-10-04 09:16:31 +00:00
+								            raise ValueError(Errors.E164.format(x=x))
-												Add textcat to train CLI (#4226)

* Add doc.cats to spacy.gold at the paragraph level

Support `doc.cats` as `"cats": [{"label": string, "value": number}]` in
the spacy JSON training format at the paragraph level.

* `spacy.gold.docs_to_json()` writes `docs.cats`

* `GoldCorpus` reads in cats in each `GoldParse`

* Update instances of gold_tuples to handle cats

Update iteration over gold_tuples / gold_parses to handle addition of
cats at the paragraph level.

* Add textcat to train CLI

* Add textcat options to train CLI
* Add textcat labels in `TextCategorizer.begin_training()`
* Add textcat evaluation to `Scorer`:
  * For binary exclusive classes with provided label: F1 for label
  * For 2+ exclusive classes: F1 macro average
  * For multilabel (not exclusive): ROC AUC macro average (currently
relying on sklearn)
* Provide user info on textcat evaluation settings, potential
incompatibilities
* Provide pipeline to Scorer in `Language.evaluate` for textcat config
* Customize train CLI output to include only metrics relevant to current
pipeline
* Add textcat evaluation to evaluate CLI

* Fix handling of unset arguments and config params

Fix handling of unset arguments and model confiug parameters in Scorer
initialization.

* Temporarily add sklearn requirement

* Remove sklearn version number

* Improve Scorer handling of models without textcats

* Fixing Scorer handling of models without textcats

* Update Scorer output for python 2.7

* Modify inf in Scorer for python 2.7

* Auto-format

Also make small adjustments to make auto-formatting with black easier and produce nicer results

* Move error message to Errors

* Update documentation

* Add cats to annotation JSON format [ci skip]

* Fix tpl flag and docs [ci skip]

* Switch to internal roc_auc_score

Switch to internal `roc_auc_score()` adapted from scikit-learn.

* Add AUCROCScore tests and improve errors/warnings

* Add tests for AUCROCScore and roc_auc_score
* Add missing error for only positive/negative values
* Remove unnecessary warnings and errors

* Make reduced roc_auc_score functions private

Because most of the checks and warnings have been stripped for the
internal functions and access is only intended through `ROCAUCScore`,
make the functions for roc_auc_score adapted from scikit-learn private.

* Check that data corresponds with multilabel flag

Check that the training instances correspond with the multilabel flag,
adding the multilabel flag if required.

* Add textcat score to early stopping check

* Add more checks to debug-data for textcat

* Add example training data for textcat

* Add more checks to textcat train CLI

* Check configuration when extending base model
* Fix typos

* Update textcat example data

* Provide licensing details and licenses for data
* Remove two labels with no positive instances from jigsaw-toxic-comment
data.


Co-authored-by: Ines Montani <ines@ines.io>
											
										
										
											2019-09-15 20:31:31 +00:00
 								    area = direction * np.trapz(y, x)
 								    if isinstance(area, np.memmap):
 								        # Reductions such as .sum used internally in np.trapz do not return a
 								        # scalar by default for numpy.memmap instances contrary to
 								        # regular numpy.ndarray instances.
 								        area = area.dtype.type(area)
 								    return area