from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
import numpy as np

from .training import Example
from .tokens import Token, Doc, Span
from .errors import Errors
from .util import get_lang_class, SimpleFrozenList
from .morphology import Morphology

if TYPE_CHECKING:
    # This lets us add type hints for mypy etc. without causing circular imports
    from .language import Language  # noqa: F401


# Components added to a blank pipeline when the Scorer is created without one.
DEFAULT_PIPELINE = ["senter", "tagger", "morphologizer", "parser", "ner", "textcat"]


class PRFScore:
    """A precision / recall / F score.

    Accumulates true-positive (tp), false-positive (fp) and false-negative
    (fn) counts. The tiny epsilon added to every denominator makes each score
    evaluate to 0.0 instead of raising ZeroDivisionError while the relevant
    counts are still zero.
    """

    def __init__(self) -> None:
        self.tp = 0
        self.fp = 0
        self.fn = 0

    def score_set(self, cand: set, gold: set) -> None:
        """Update the counts from a set of candidates and a set of gold items.

        cand (set): The predicted items.
        gold (set): The correct items.
        """
        self.tp += len(cand.intersection(gold))
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    @property
    def precision(self) -> float:
        return self.tp / (self.tp + self.fp + 1e-100)

    @property
    def recall(self) -> float:
        return self.tp / (self.tp + self.fn + 1e-100)

    @property
    def fscore(self) -> float:
        p = self.precision
        r = self.recall
        return 2 * ((p * r) / (p + r + 1e-100))

    def to_dict(self) -> Dict[str, float]:
        """RETURNS (Dict[str, float]): The scores under the keys p, r and f."""
        return {"p": self.precision, "r": self.recall, "f": self.fscore}


class ROCAUCScore:
    """An AUC ROC score.

    Collects (candidate score, gold value) pairs and computes the ROC AUC
    lazily. The computed value is cached and only recomputed once more pairs
    have been added.
    """

    def __init__(self) -> None:
        self.golds = []
        self.cands = []
        self.saved_score = 0.0
        # Number of collected pairs at the time saved_score was computed
        self.saved_score_at_len = 0

    def score_set(self, cand, gold) -> None:
        """Record one predicted score together with its gold value."""
        self.cands.append(cand)
        self.golds.append(gold)

    @property
    def score(self):
        """RETURNS (float): The ROC AUC over all recorded pairs, or
        -float("inf") if it is undefined for the recorded gold values.
        """
        if len(self.golds) == self.saved_score_at_len:
            return self.saved_score
        try:
            self.saved_score = _roc_auc_score(self.golds, self.cands)
        # catch ValueError: Only one class present in y_true.
        # ROC AUC score is not defined in that case.
        except ValueError:
            self.saved_score = -float("inf")
        self.saved_score_at_len = len(self.golds)
        return self.saved_score


class Scorer:
    """Compute evaluation scores."""

    def __init__(
        self,
        nlp: Optional["Language"] = None,
        default_lang: str = "xx",
        default_pipeline=DEFAULT_PIPELINE,
        **cfg,
    ) -> None:
        """Initialize the Scorer.

        nlp (Language): The pipeline whose tokenizer and components provide
            the scoring methods. If not provided, a blank pipeline for
            default_lang is created and the components in default_pipeline
            are added to it.
        default_lang (str): The language code used for the blank pipeline.
        default_pipeline (Iterable[str]): The component names added to the
            blank pipeline.
        **cfg: Settings passed on to every score method called by score().

        DOCS: https://nightly.spacy.io/api/scorer#init
        """
        self.nlp = nlp
        self.cfg = cfg
        if not nlp:
            nlp = get_lang_class(default_lang)()
            for pipe in default_pipeline:
                nlp.add_pipe(pipe)
            self.nlp = nlp

    def score(self, examples: Iterable[Example]) -> Dict[str, Any]:
        """Evaluate a list of Examples.

        The scores of the tokenizer and of every pipeline component that has
        a score method are merged into one dictionary, in pipeline order.

        examples (Iterable[Example]): The predicted annotations + correct annotations.
        RETURNS (Dict): A dictionary of scores.

        DOCS: https://nightly.spacy.io/api/scorer#score
        """
        scores = {}
        if hasattr(self.nlp.tokenizer, "score"):
            scores.update(self.nlp.tokenizer.score(examples, **self.cfg))
        for name, component in self.nlp.pipeline:
            if hasattr(component, "score"):
                scores.update(component.score(examples, **self.cfg))
        return scores

    @staticmethod
    def score_tokenization(examples: Iterable[Example], **cfg) -> Dict[str, float]:
        """Returns accuracy and PRF scores for tokenization.
        * token_acc: F-score over counts in which every non-whitespace
          predicted token aligned to exactly one gold token is a true
          positive and every other non-whitespace predicted token is a
          false positive
        * token_p/r/f: PRF for token character spans

        examples (Iterable[Example]): Examples to score
        RETURNS (Dict[str, float]): A dictionary containing the scores
            token_acc/p/r/f.

        DOCS: https://nightly.spacy.io/api/scorer#score_tokenization
        """
        acc_score = PRFScore()
        prf_score = PRFScore()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment
            gold_spans = set()
            pred_spans = set()
            for token in gold_doc:
                if token.orth_.isspace():
                    continue
                gold_spans.add((token.idx, token.idx + len(token)))
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                pred_spans.add((token.idx, token.idx + len(token)))
                if align.x2y.lengths[token.i] != 1:
                    acc_score.fp += 1
                else:
                    acc_score.tp += 1
            prf_score.score_set(pred_spans, gold_spans)
        return {
            "token_acc": acc_score.fscore,
            "token_p": prf_score.precision,
            "token_r": prf_score.recall,
            "token_f": prf_score.fscore,
        }

    @staticmethod
    def score_token_attr(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Token, str], Any] = getattr,
        **cfg,
    ) -> Dict[str, float]:
        """Returns an accuracy score for a token-level attribute.

        Only non-whitespace predicted tokens that align to exactly one gold
        token are compared; the reported value is the F-score of the
        resulting (gold index, value) sets.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        RETURNS (Dict[str, float]): A dictionary containing the accuracy score
            under the key attr_acc.

        DOCS: https://nightly.spacy.io/api/scorer#score_token_attr
        """
        tag_score = PRFScore()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment
            gold_tags = set()
            for gold_i, token in enumerate(gold_doc):
                gold_tags.add((gold_i, getter(token, attr)))
            pred_tags = set()
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                if align.x2y.lengths[token.i] == 1:
                    # Key the prediction by the gold token it is aligned to
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                    pred_tags.add((gold_i, getter(token, attr)))
            tag_score.score_set(pred_tags, gold_tags)
        return {f"{attr}_acc": tag_score.fscore}

    @staticmethod
    def score_token_attr_per_feat(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Token, str], Any] = getattr,
        **cfg,
    ):
        """Return PRF scores per feat for a token attribute in UFEATS format.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        RETURNS (dict): A dictionary containing the per-feat PRF scores under
            the key attr_per_feat.
        """
        per_feat = {}
        # Gold and predicted values are split with the same separators.
        feature_sep = Morphology.FEATURE_SEP
        field_sep = Morphology.FIELD_SEP

        def add_feats(token, gold_i, feats_by_field):
            # Register every field=value feat of the token under its field.
            morph = str(getter(token, attr))
            if not morph:
                return
            for feat in morph.split(feature_sep):
                field, _values = feat.split(field_sep)
                if field not in per_feat:
                    per_feat[field] = PRFScore()
                feats_by_field.setdefault(field, set()).add((gold_i, feat))

        for example in examples:
            pred_doc = example.predicted
            gold_doc = example.reference
            align = example.alignment
            gold_per_feat = {}
            for gold_i, token in enumerate(gold_doc):
                add_feats(token, gold_i, gold_per_feat)
            pred_per_feat = {}
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                if align.x2y.lengths[token.i] == 1:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                    add_feats(token, gold_i, pred_per_feat)
            for field in per_feat:
                per_feat[field].score_set(
                    pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
                )
        result = {k: v.to_dict() for k, v in per_feat.items()}
        return {f"{attr}_per_feat": result}

    @staticmethod
    def score_spans(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Doc, str], Iterable[Span]] = getattr,
        **cfg,
    ) -> Dict[str, Any]:
        """Returns PRF scores for labeled spans.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If
            provided, getter(doc, attr) should return the spans for the
            individual doc.
        RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
            the keys attr_p/r/f and the per-type PRF scores under attr_per_type.

        DOCS: https://nightly.spacy.io/api/scorer#score_spans
        """
        score = PRFScore()
        score_per_type = dict()
        for example in examples:
            pred_doc = example.predicted
            gold_doc = example.reference
            # TODO
            # This is a temporary hack to work around the problem that the scorer
            # fails if you have examples that are not fully annotated for all
            # the tasks in your pipeline. For instance, you might have a corpus
            # of NER annotations that does not set sentence boundaries, but the
            # pipeline includes a parser or senter, and then the score_weights
            # are used to evaluate that component. When the scorer attempts
            # to read the sentences from the gold document, it fails.
            try:
                gold_ents = list(getter(gold_doc, attr))
            except ValueError:
                continue
            pred_ents = list(getter(pred_doc, attr))
            # Find all labels in gold and doc
            labels = {span.label_ for span in gold_ents}
            labels.update(span.label_ for span in pred_ents)
            # Set up all labels for per type scoring
            for label in labels:
                if label not in score_per_type:
                    score_per_type[label] = PRFScore()
            # Special case for ents:
            # If we have missing values in the gold, we can't easily tell
            # whether our NER predictions are true.
            # It seems bad but it's what we've always done.
            if attr == "ents" and not all(token.ent_iob != 0 for token in gold_doc):
                continue
            # Collect the gold and predicted spans, for all and per type
            gold_spans = set()
            gold_per_type = {label: set() for label in labels}
            for span in gold_ents:
                gold_span = (span.label_, span.start, span.end - 1)
                gold_spans.add(gold_span)
                gold_per_type[span.label_].add(gold_span)
            pred_spans = set()
            pred_per_type = {label: set() for label in labels}
            for span in example.get_aligned_spans_x2y(pred_ents):
                pred_span = (span.label_, span.start, span.end - 1)
                pred_spans.add(pred_span)
                pred_per_type[span.label_].add(pred_span)
            # Scores per label
            for label in labels:
                score_per_type[label].score_set(
                    pred_per_type[label], gold_per_type[label]
                )
            # Score for all labels
            score.score_set(pred_spans, gold_spans)
        results = {
            f"{attr}_p": score.precision,
            f"{attr}_r": score.recall,
            f"{attr}_f": score.fscore,
            f"{attr}_per_type": {k: v.to_dict() for k, v in score_per_type.items()},
        }
        return results

    @staticmethod
    def score_cats(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Doc, str], Any] = getattr,
        labels: Iterable[str] = SimpleFrozenList(),
        multi_label: bool = True,
        positive_label: Optional[str] = None,
        threshold: Optional[float] = None,
        **cfg,
    ) -> Dict[str, Any]:
        """Returns PRF and ROC AUC scores for a doc-level attribute with a
        dict with scores for each label like Doc.cats. The reported overall
        score depends on the scorer settings.

        If labels are provided, the label set is extended with every label
        found in the predicted and gold values of the examples. If no labels
        are provided, no labels are discovered from the data.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute to score.
        getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
            getter(doc, attr) should return the values for the individual doc.
        labels (Iterable[str]): The set of possible labels. Defaults to [].
        multi_label (bool): Whether the attribute allows multiple labels.
            Defaults to True.
        positive_label (str): The positive label for a binary task with
            exclusive classes. Defaults to None.
        threshold (float): Cutoff to consider a prediction "positive". Defaults
            to 0.5 for multi-label, and 0.0 (i.e. whatever's highest scoring)
            otherwise.
        RETURNS (Dict[str, Any]): A dictionary containing the scores:
            attr_score (one of the F-score of positive_label / attr_macro_f /
            attr_macro_auc), attr_score_desc (text description of the overall
            score), attr_micro_p/r/f, attr_macro_p/r/f, attr_macro_auc,
            attr_f_per_type, attr_auc_per_type

        DOCS: https://nightly.spacy.io/api/scorer#score_cats
        """
        if threshold is None:
            threshold = 0.5 if multi_label else 0.0
        # The examples may be traversed twice below, so make sure a one-shot
        # iterator isn't exhausted by the first pass.
        examples = list(examples)
        given_labels = list(labels)
        f_per_type = {label: PRFScore() for label in given_labels}
        auc_per_type = {label: ROCAUCScore() for label in given_labels}
        labels = set(given_labels)
        if labels:
            # Register a scorer for every additional label seen in the data,
            # so that the lookups in the scoring loop can't fail on them.
            for eg in examples:
                for doc in (eg.predicted, eg.reference):
                    for label in getter(doc, attr).keys():
                        if label not in labels:
                            labels.add(label)
                            f_per_type[label] = PRFScore()
                            auc_per_type[label] = ROCAUCScore()
        for example in examples:
            # Through this loop, None in the gold_cats indicates missing label.
            pred_cats = getter(example.predicted, attr)
            gold_cats = getter(example.reference, attr)

            # I think the AUC metric is applicable regardless of whether we're
            # doing multi-label classification? Unsure. If not, move this into
            # the elif pred_cats and gold_cats block below.
            for label in labels:
                pred_score = pred_cats.get(label, 0.0)
                gold_score = gold_cats.get(label, 0.0)
                if gold_score is not None:
                    auc_per_type[label].score_set(pred_score, gold_score)
            if multi_label:
                for label in labels:
                    pred_score = pred_cats.get(label, 0.0)
                    gold_score = gold_cats.get(label, 0.0)
                    if gold_score is not None:
                        if pred_score >= threshold and gold_score > 0:
                            f_per_type[label].tp += 1
                        elif pred_score >= threshold and gold_score == 0:
                            f_per_type[label].fp += 1
                        elif pred_score < threshold and gold_score > 0:
                            f_per_type[label].fn += 1
            elif pred_cats and gold_cats:
                # Get the highest-scoring for each.
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
                if gold_score is not None:
                    if pred_label == gold_label and pred_score >= threshold:
                        f_per_type[pred_label].tp += 1
                    else:
                        f_per_type[gold_label].fn += 1
                        if pred_score >= threshold:
                            f_per_type[pred_label].fp += 1
            elif gold_cats:
                # Only gold values are available: the best gold label was missed
                gold_label, gold_score = max(gold_cats.items(), key=lambda it: it[1])
                if gold_score is not None and gold_score > 0:
                    f_per_type[gold_label].fn += 1
            elif pred_cats:
                # Only predictions are available: a confident one is spurious
                pred_label, pred_score = max(pred_cats.items(), key=lambda it: it[1])
                if pred_score >= threshold:
                    f_per_type[pred_label].fp += 1
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp += label_prf.tp
            micro_prf.fn += label_prf.fn
            micro_prf.fp += label_prf.fp
        # The epsilon keeps the macro averages at 0.0 if there are no labels
        n_cats = len(f_per_type) + 1e-100
        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats
        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_cats
        macro_auc = sum(auc.score for auc in auc_per_type.values()) / n_cats
        results = {
            f"{attr}_score": None,
            f"{attr}_score_desc": None,
            f"{attr}_micro_p": micro_prf.precision,
            f"{attr}_micro_r": micro_prf.recall,
            f"{attr}_micro_f": micro_prf.fscore,
            f"{attr}_macro_p": macro_p,
            f"{attr}_macro_r": macro_r,
            f"{attr}_macro_f": macro_f,
            f"{attr}_macro_auc": macro_auc,
            f"{attr}_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
            f"{attr}_auc_per_type": {k: v.score for k, v in auc_per_type.items()},
        }
        if len(labels) == 2 and not multi_label and positive_label:
            positive_label_f = results[f"{attr}_f_per_type"][positive_label]["f"]
            results[f"{attr}_score"] = positive_label_f
            results[f"{attr}_score_desc"] = f"F ({positive_label})"
        elif not multi_label:
            results[f"{attr}_score"] = results[f"{attr}_macro_f"]
            results[f"{attr}_score_desc"] = "macro F"
        else:
            results[f"{attr}_score"] = results[f"{attr}_macro_auc"]
            results[f"{attr}_score_desc"] = "macro AUC"
        return results

    @staticmethod
    def score_links(
        examples: Iterable[Example], *, negative_labels: Iterable[str]
    ) -> Dict[str, Any]:
        """Returns PRF for predicted links on the entity level.
        To disentangle the performance of the NEL from the NER,
        this method only evaluates NEL links for entities that overlap
        between the gold reference and the predictions. Predicted entities
        without a gold entity at the same character offsets are skipped.

        examples (Iterable[Example]): Examples to score
        negative_labels (Iterable[str]): The string values that refer to no
            annotation (e.g. "NIL")
        RETURNS (Dict[str, Any]): A dictionary containing the scores.

        DOCS (TODO): https://nightly.spacy.io/api/scorer#score_links
        """
        f_per_type = {}
        for example in examples:
            gold_ent_by_offset = {}
            for gold_ent in example.reference.ents:
                gold_ent_by_offset[(gold_ent.start_char, gold_ent.end_char)] = gold_ent

            for pred_ent in example.predicted.ents:
                gold_span = gold_ent_by_offset.get(
                    (pred_ent.start_char, pred_ent.end_char), None
                )
                # only evaluating entities that overlap between gold and pred,
                # to disentangle the performance of the NEL from the NER
                if gold_span is None:
                    continue
                label = gold_span.label_
                if label not in f_per_type:
                    f_per_type[label] = PRFScore()
                gold = gold_span.kb_id_
                if gold is not None:
                    pred = pred_ent.kb_id_
                    if gold in negative_labels and pred in negative_labels:
                        # ignore true negatives
                        pass
                    elif gold == pred:
                        f_per_type[label].tp += 1
                    elif gold in negative_labels:
                        f_per_type[label].fp += 1
                    elif pred in negative_labels:
                        f_per_type[label].fn += 1
                    else:
                        # a wrong prediction (e.g. Q42 != Q3) counts as both a FP as well as a FN
                        f_per_type[label].fp += 1
                        f_per_type[label].fn += 1
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp += label_prf.tp
            micro_prf.fn += label_prf.fn
            micro_prf.fp += label_prf.fp
        # The epsilon keeps the macro averages at 0.0 if there are no labels
        n_labels = len(f_per_type) + 1e-100
        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_labels
        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_labels
        macro_f = sum(prf.fscore for prf in f_per_type.values()) / n_labels
        results = {
            "nel_score": micro_prf.fscore,
            "nel_score_desc": "micro F",
            "nel_micro_p": micro_prf.precision,
            "nel_micro_r": micro_prf.recall,
            "nel_micro_f": micro_prf.fscore,
            "nel_macro_p": macro_p,
            "nel_macro_r": macro_r,
            "nel_macro_f": macro_f,
            "nel_f_per_type": {k: v.to_dict() for k, v in f_per_type.items()},
        }
        return results

    @staticmethod
    def score_deps(
        examples: Iterable[Example],
        attr: str,
        *,
        getter: Callable[[Token, str], Any] = getattr,
        head_attr: str = "head",
        head_getter: Callable[[Token, str], Token] = getattr,
        ignore_labels: Iterable[str] = SimpleFrozenList(),
        **cfg,
    ) -> Dict[str, Any]:
        """Returns the UAS, LAS, and LAS per type scores for dependency
        parses.

        examples (Iterable[Example]): Examples to score
        attr (str): The attribute containing the dependency label.
        getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
            getter(token, attr) should return the value of the attribute for an
            individual token.
        head_attr (str): The attribute containing the head token. Defaults to
            'head'.
        head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided,
            head_getter(token, attr) should return the value of the head for an
            individual token.
        ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
        RETURNS (Dict[str, Any]): A dictionary containing the scores:
            attr_uas, attr_las, and attr_las_per_type.

        DOCS: https://nightly.spacy.io/api/scorer#score_deps
        """
        unlabelled = PRFScore()
        labelled = PRFScore()
        labelled_per_dep = dict()
        for example in examples:
            gold_doc = example.reference
            pred_doc = example.predicted
            align = example.alignment
            gold_deps = set()
            gold_deps_per_dep = {}
            for gold_i, token in enumerate(gold_doc):
                dep = getter(token, attr)
                head = head_getter(token, head_attr)
                if dep not in ignore_labels:
                    gold_deps.add((gold_i, head.i, dep))
                    if dep not in labelled_per_dep:
                        labelled_per_dep[dep] = PRFScore()
                    if dep not in gold_deps_per_dep:
                        gold_deps_per_dep[dep] = set()
                    gold_deps_per_dep[dep].add((gold_i, head.i, dep))
            pred_deps = set()
            pred_deps_per_dep = {}
            for token in pred_doc:
                if token.orth_.isspace():
                    continue
                # Map the predicted token to its gold index, if it aligns to
                # exactly one gold token
                if align.x2y.lengths[token.i] != 1:
                    gold_i = None
                else:
                    gold_i = align.x2y[token.i].dataXd[0, 0]
                dep = getter(token, attr)
                head = head_getter(token, head_attr)
                if dep not in ignore_labels and token.orth_.strip():
                    if align.x2y.lengths[head.i] == 1:
                        gold_head = align.x2y[head.i].dataXd[0, 0]
                    else:
                        gold_head = None
                    # None is indistinct, so we can't just add it to the set
                    # Multiple (None, None) deps are possible
                    if gold_i is None or gold_head is None:
                        unlabelled.fp += 1
                        labelled.fp += 1
                    else:
                        pred_deps.add((gold_i, gold_head, dep))
                        if dep not in labelled_per_dep:
                            labelled_per_dep[dep] = PRFScore()
                        if dep not in pred_deps_per_dep:
                            pred_deps_per_dep[dep] = set()
                        pred_deps_per_dep[dep].add((gold_i, gold_head, dep))
            labelled.score_set(pred_deps, gold_deps)
            for dep in labelled_per_dep:
                labelled_per_dep[dep].score_set(
                    pred_deps_per_dep.get(dep, set()), gold_deps_per_dep.get(dep, set())
                )
            # Drop the label to compare (token, head) pairs only
            unlabelled.score_set(
                set(item[:2] for item in pred_deps), set(item[:2] for item in gold_deps)
            )
        return {
            f"{attr}_uas": unlabelled.fscore,
            f"{attr}_las": labelled.fscore,
            f"{attr}_las_per_type": {
                k: v.to_dict() for k, v in labelled_per_dep.items()
            },
        }


#############################################################################
#
# The following implementation of roc_auc_score() is adapted from
# scikit-learn, which is distributed under the following license:
#
# New BSD License
#
# Copyright (c) 2007–2019 The scikit-learn developers.
# All rights reserved.
#
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#   a. Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#   b. Redistributions in binary form must reproduce the above copyright
#      notice, this list of conditions and the following disclaimer in the
#      documentation and/or other materials provided with the distribution.
#   c. Neither the name of the Scikit-learn Developers  nor the names of
#      its contributors may be used to endorse or promote products
#      derived from this software without specific prior written
#      permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.


def _roc_auc_score(y_true, y_score):
    """Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.

    Note: this implementation is restricted to the binary classification task

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True binary labels. Exactly two distinct values must be present;
        samples equal to 1.0 are treated as the positive class.

    y_score : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    Returns
    -------
    auc : float

    Raises
    ------
    ValueError
        If y_true does not contain exactly two distinct values.

    References
    ----------
    .. [1] Wikipedia entry for the Receiver operating characteristic:
           https://en.wikipedia.org/wiki/Receiver_operating_characteristic

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.

    .. [3] Analyzing a portion of the ROC curve. McClish, 1989
    """
    if len(np.unique(y_true)) != 2:
        raise ValueError(Errors.E165)
    fpr, tpr, _ = _roc_curve(y_true, y_score)
    return _auc(fpr, tpr)


def _roc_curve(y_true, y_score):
    """Compute Receiver operating characteristic (ROC)

    Note: this implementation is restricted to the binary classification task.

    Parameters
    ----------

    y_true : array, shape = [n_samples]
        True binary labels. Samples equal to 1.0 are treated as the positive
        class.

    y_score : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    Returns
    -------
    fpr : array, shape = [>2]
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= thresholds[i]. All NaN if
        there are no negative samples.

    tpr : array, shape = [>2]
        Increasing true positive rates such that element i is the true
        positive rate of predictions with score >= thresholds[i]. All NaN if
        there are no positive samples.

    thresholds : array, shape = [n_thresholds]
        Decreasing thresholds on the decision function used to compute
        fpr and tpr. `thresholds[0]` represents no instances being predicted
        and is arbitrarily set to `max(y_score) + 1`.

    References
    ----------
    .. [1] Wikipedia entry for the Receiver operating characteristic:
           https://en.wikipedia.org/wiki/Receiver_operating_characteristic

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.
    """
    fps, tps, thresholds = _binary_clf_curve(y_true, y_score)

    # Add an extra threshold position
    # to make sure that the curve starts at (0, 0)
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]
    thresholds = np.r_[thresholds[0] + 1, thresholds]

    if fps[-1] <= 0:
        fpr = np.repeat(np.nan, fps.shape)
    else:
        fpr = fps / fps[-1]

    if tps[-1] <= 0:
        tpr = np.repeat(np.nan, tps.shape)
    else:
        tpr = tps / tps[-1]

    return fpr, tpr, thresholds


def _binary_clf_curve(y_true, y_score):
    """Calculate true and false positives per binary classification threshold.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True targets of binary classification. Samples equal to 1.0 are
        treated as the positive class.

    y_score : array, shape = [n_samples]
        Estimated probabilities or decision function

    Returns
    -------
    fps : array, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).

    tps : array, shape = [n_thresholds <= len(np.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : array, shape = [n_thresholds]
        Decreasing score values.
    """
    pos_label = 1.0

    y_true = np.ravel(y_true)
    y_score = np.ravel(y_score)

    # make y_true a boolean vector
    y_true = y_true == pos_label

    # sort scores and corresponding truth values
    desc_score_indices = np.argsort(y_score, kind="mergesort")[::-1]
    y_score = y_score[desc_score_indices]
    y_true = y_true[desc_score_indices]
    weight = 1.0

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    tps = _stable_cumsum(y_true * weight)[threshold_idxs]
    fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]


def _stable_cumsum(arr, axis=None, rtol=1e-05, atol=1e-08):
    """Use high precision for cumsum and check that final value matches sum

    Parameters
    ----------
    arr : array-like
        To be cumulatively summed as flat
    axis : int, optional
        Axis along which the cumulative sum is computed.
        The default (None) is to compute the cumsum over the flattened array.
    rtol : float
        Relative tolerance, see ``np.allclose``
    atol : float
        Absolute tolerance, see ``np.allclose``

    Raises
    ------
    ValueError
        If the last value of the cumulative sum is not close to the sum.
    """
    out = np.cumsum(arr, axis=axis, dtype=np.float64)
    expected = np.sum(arr, axis=axis, dtype=np.float64)
    if not np.all(
        np.isclose(
            out.take(-1, axis=axis), expected, rtol=rtol, atol=atol, equal_nan=True
        )
    ):
        raise ValueError(Errors.E163)
    return out


def _auc(x, y):
    """Compute Area Under the Curve (AUC) using the trapezoidal rule

    This is a general function, given points on a curve.  For computing the
    area under the ROC-curve, see :func:`roc_auc_score`.

    Parameters
    ----------
    x : array, shape = [n]
        x coordinates. These must be either monotonic increasing or monotonic
        decreasing.
    y : array, shape = [n]
        y coordinates.

    Returns
    -------
    auc : float

    Raises
    ------
    ValueError
        If x is neither monotonic increasing nor monotonic decreasing.
    """
    x = np.ravel(x)
    y = np.ravel(y)

    direction = 1
    dx = np.diff(x)
    if np.any(dx < 0):
        if np.all(dx <= 0):
            direction = -1
        else:
            raise ValueError(Errors.E164.format(x))

    # np.trapz was renamed to np.trapezoid in NumPy 2.0 and the old name is
    # deprecated, so prefer the new name and only fall back on older versions.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    area = direction * trapezoid(y, x)
    if isinstance(area, np.memmap):
        # Reductions such as .sum used internally in the trapezoidal rule do
        # not return a scalar by default for numpy.memmap instances contrary
        # to regular numpy.ndarray instances.
        area = area.dtype.type(area)
    return area