lightning/pytorch_lightning/metrics/sklearns.py


from typing import Any, Optional, Union, Sequence
import numpy as np
import torch
from pytorch_lightning import _logger as lightning_logger
from pytorch_lightning.metrics.metric import NumpyMetric
from pytorch_lightning.utilities import rank_zero_warn
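# torch.distributed (and with it ``ReduceOp`` and ``group``) is not available in
# every build; fall back to no-op placeholders so this module stays importable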
try:
from torch.distributed import ReduceOp, group
except ImportError:
class ReduceOp:
SUM = None
class group:
WORLD = None
rank_zero_warn('Unsupported `ReduceOp` for distributed computing.')
class SklearnMetric(NumpyMetric):
"""
Bridge between PyTorch Lightning and scikit-learn metrics
Warning:
Every metric call will cause a GPU synchronization, which may slow down your code
Note:
The order of targets and predictions may be different from the order typically used in PyTorch
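Example:
A minimal sketch of wrapping an arbitrary function from ``sklearn.metrics`` by name.
Positional arguments are forwarded to the sklearn call unchanged, so sklearn's own
argument order (targets first) applies; the printed value is illustrative only:
>>> metric = SklearnMetric(metric_name='accuracy_score')
>>> metric(torch.tensor([0, 1, 2, 2]), torch.tensor([0, 1, 2, 3]))  # doctest: +SKIP
tensor([0.7500])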
"""
def __init__(
self,
metric_name: str,
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
**kwargs,
):
"""
Args:
metric_name: the metric name to import and compute from scikit-learn.metrics
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
**kwargs: additional keyword arguments (will be forwarded to the metric call)
"""
super().__init__(name=metric_name,
reduce_group=reduce_group,
reduce_op=reduce_op)
self.metric_kwargs = kwargs
lightning_logger.debug(
f'Metric {self.__class__.__name__} is using Sklearn as backend, meaning that'
' every metric call will cause a GPU synchronization, which may slow down your code'
)
@property
def metric_fn(self):
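# resolve the sklearn function lazily, at first use, so that constructing the
# metric does not import sklearn up front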
import sklearn.metrics
return getattr(sklearn.metrics, self.name)
def forward(self, *args, **kwargs) -> Union[np.ndarray, int, float]:
"""
Carries the actual metric computation
Args:
*args: Positional arguments forwarded to metric call (should be already converted to numpy)
**kwargs: keyword arguments forwarded to metric call (should be already converted to numpy)
Return:
the metric value (will be converted to tensor by baseclass)
"""
return self.metric_fn(*args, **kwargs, **self.metric_kwargs)
class Accuracy(SklearnMetric):
"""
Calculates the Accuracy Score
Warning:
Every metric call will cause a GPU synchronization, which may slow down your code
Example:
>>> y_pred = torch.tensor([0, 1, 2, 3])
>>> y_true = torch.tensor([0, 1, 2, 2])
>>> metric = Accuracy()
>>> metric(y_pred, y_true)
tensor([0.7500])
"""
def __init__(
self,
normalize: bool = True,
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
normalize: If ``False``, return the number of correctly classified samples.
Otherwise, return the fraction of correctly classified samples.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__(metric_name='accuracy_score',
reduce_group=reduce_group,
reduce_op=reduce_op,
normalize=normalize)
def forward(
self,
y_pred: np.ndarray,
y_true: np.ndarray,
sample_weight: Optional[np.ndarray] = None
) -> float:
"""
Computes the accuracy
Args:
y_pred: the array containing the predictions (already in categorical form)
y_true: the array containing the targets (in categorical form)
sample_weight: Sample weights.
Return:
Accuracy Score
"""
return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight)
class AUC(SklearnMetric):
"""
Calculates the Area Under the Curve using the trapezoidal rule
Warning:
Every metric call will cause a GPU synchronization, which may slow down your code
Example:
>>> y_pred = torch.tensor([0, 1, 2, 3])
>>> y_true = torch.tensor([0, 1, 2, 2])
>>> metric = AUC()
>>> metric(y_pred, y_true)
tensor([4.])
"""
def __init__(
self,
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__(metric_name='auc',
reduce_group=reduce_group,
reduce_op=reduce_op)
def forward(self, x: np.ndarray, y: np.ndarray) -> float:
"""
Computes the AUC
Args:
x: x coordinates.
y: y coordinates.
Return:
AUC calculated with trapezoidal rule
"""
return super().forward(x=x, y=y)
class AveragePrecision(SklearnMetric):
"""
Calculates the average precision (AP) score.
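Example:
A minimal sketch assuming binary targets and probability scores; the printed
value is illustrative, not verified doctest output:
>>> y_score = torch.tensor([0.1, 0.4, 0.35, 0.8])
>>> y_true = torch.tensor([0, 0, 1, 1])
>>> metric = AveragePrecision()
>>> metric(y_score, y_true)  # doctest: +SKIP
tensor([0.8333])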
"""
def __init__(
self,
average: Optional[str] = 'macro',
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
average: If None, the scores for each class are returned. Otherwise, this determines the type of
averaging performed on the data:
* If 'micro': Calculate metrics globally by considering each element of the label indicator
matrix as a label.
* If 'macro': Calculate metrics for each label, and find their unweighted mean.
This does not take label imbalance into account.
* If 'weighted': Calculate metrics for each label, and find their average, weighted by
support (the number of true instances for each label).
* If 'samples': Calculate metrics for each instance, and find their average.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__('average_precision_score',
reduce_group=reduce_group,
reduce_op=reduce_op,
average=average)
def forward(
self,
y_score: np.ndarray,
y_true: np.ndarray,
sample_weight: Optional[np.ndarray] = None
) -> float:
"""
Args:
y_score: Target scores, can either be probability estimates of the positive class,
confidence values, or binary decisions.
y_true: True binary labels in binary label indicators.
sample_weight: Sample weights.
Return:
average precision score
"""
return super().forward(y_score=y_score, y_true=y_true,
sample_weight=sample_weight)
class ConfusionMatrix(SklearnMetric):
"""
Compute confusion matrix to evaluate the accuracy of a classification
By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
is equal to the number of observations known to be in group :math:`i` but
predicted to be in group :math:`j`.
Example:
>>> y_pred = torch.tensor([0, 1, 2, 1])
>>> y_true = torch.tensor([0, 1, 2, 2])
>>> metric = ConfusionMatrix()
>>> metric(y_pred, y_true)
tensor([[1., 0., 0.],
[0., 1., 0.],
[0., 1., 1.]])
"""
def __init__(
self, labels: Optional[Sequence] = None,
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
labels: List of labels to index the matrix. This may be used to reorder
or select a subset of labels.
If none is given, those that appear at least once
in ``y_true`` or ``y_pred`` are used in sorted order.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__('confusion_matrix',
reduce_group=reduce_group,
reduce_op=reduce_op,
labels=labels)
def forward(self, y_pred: np.ndarray, y_true: np.ndarray) -> np.ndarray:
"""
Args:
y_pred: Estimated targets as returned by a classifier.
y_true: Ground truth (correct) target values.
Return:
Confusion matrix (array of shape [num_classes, num_classes])
"""
return super().forward(y_pred=y_pred, y_true=y_true)
class F1(SklearnMetric):
r"""
Compute the F1 score, also known as balanced F-score or F-measure
The F1 score can be interpreted as a weighted average of the precision and
recall, where an F1 score reaches its best value at 1 and worst score at 0.
The relative contribution of precision and recall to the F1 score are
equal. The formula for the F1 score is:
.. math::
F_1 = 2 \cdot \frac{precision \cdot recall}{precision + recall}
In the multi-class and multi-label case, this is the weighted average of
the F1 score of each class.
Example:
>>> y_pred = torch.tensor([0, 1, 2, 3])
>>> y_true = torch.tensor([0, 1, 2, 2])
>>> metric = F1()
>>> metric(y_pred, y_true)
tensor([0.6667])
References:
- [1] `Wikipedia entry for the F1-score
<http://en.wikipedia.org/wiki/F1_score>`_
"""
def __init__(
self, labels: Optional[Sequence] = None,
pos_label: Union[str, int] = 1,
average: Optional[str] = 'macro',
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
labels: Integer array of labels.
pos_label: The class to report if ``average='binary'``.
average: This parameter is required for multiclass/multilabel targets.
If ``None``, the scores for each class are returned. Otherwise, this
determines the type of averaging performed on the data:
* ``'binary'``:
Only report results for the class specified by ``pos_label``.
This is applicable only if targets (``y_{true,pred}``) are binary.
* ``'micro'``:
Calculate metrics globally by counting the total true positives,
false negatives and false positives.
* ``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account.
* ``'weighted'``:
Calculate metrics for each label, and find their average, weighted
by support (the number of true instances for each label). This
alters 'macro' to account for label imbalance; it can result in an
F-score that is not between precision and recall.
* ``'samples'``:
Calculate metrics for each instance, and find their average (only
meaningful for multilabel classification where this differs from
:func:`accuracy_score`).
Note that if ``pos_label`` is given in binary classification with
`average != 'binary'`, only that positive class is reported. This
behavior is deprecated and will change in version 0.18.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__('f1_score',
reduce_group=reduce_group,
reduce_op=reduce_op,
labels=labels,
pos_label=pos_label,
average=average)
def forward(
self,
y_pred: np.ndarray,
y_true: np.ndarray,
sample_weight: Optional[np.ndarray] = None
) -> Union[np.ndarray, float]:
"""
Args:
y_pred : Estimated targets as returned by a classifier.
y_true: Ground truth (correct) target values.
sample_weight: Sample weights.
Return:
F1 score of the positive class in binary classification or weighted
average of the F1 scores of each class for the multiclass task.
"""
return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight)
class FBeta(SklearnMetric):
"""
Compute the F-beta score. The `beta` parameter determines the weight of precision in the combined
score. ``beta < 1`` lends more weight to precision, while ``beta > 1``
favors recall (``beta -> 0`` considers only precision, ``beta -> inf``
only recall).
Example:
>>> y_pred = torch.tensor([0, 1, 2, 3])
>>> y_true = torch.tensor([0, 1, 2, 2])
>>> metric = FBeta(beta=0.25)
>>> metric(y_pred, y_true)
tensor([0.7361])
References:
- [1] R. Baeza-Yates and B. Ribeiro-Neto (2011).
Modern Information Retrieval. Addison Wesley, pp. 327-328.
- [2] `Wikipedia entry for the F1-score
<http://en.wikipedia.org/wiki/F1_score>`_
"""
def __init__(
self,
beta: float,
labels: Optional[Sequence] = None,
pos_label: Union[str, int] = 1,
average: Optional[str] = 'macro',
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
beta: Weight of precision in harmonic mean.
labels: Integer array of labels.
pos_label: The class to report if ``average='binary'``.
average: This parameter is required for multiclass/multilabel targets.
If ``None``, the scores for each class are returned. Otherwise, this
determines the type of averaging performed on the data:
* ``'binary'``:
Only report results for the class specified by ``pos_label``.
This is applicable only if targets (``y_{true,pred}``) are binary.
* ``'micro'``:
Calculate metrics globally by counting the total true positives,
false negatives and false positives.
* ``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account.
* ``'weighted'``:
Calculate metrics for each label, and find their average, weighted
by support (the number of true instances for each label). This
alters 'macro' to account for label imbalance; it can result in an
F-score that is not between precision and recall.
* ``'samples'``:
Calculate metrics for each instance, and find their average (only
meaningful for multilabel classification where this differs from
:func:`accuracy_score`).
Note that if ``pos_label`` is given in binary classification with
`average != 'binary'`, only that positive class is reported. This
behavior is deprecated and will change in version 0.18.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__('fbeta_score',
reduce_group=reduce_group,
reduce_op=reduce_op,
beta=beta,
labels=labels,
pos_label=pos_label,
average=average)
def forward(
self,
y_pred: np.ndarray,
y_true: np.ndarray,
sample_weight: Optional[np.ndarray] = None
) -> Union[np.ndarray, float]:
"""
Args:
y_pred : Estimated targets as returned by a classifier.
y_true: Ground truth (correct) target values.
sample_weight: Sample weights.
Return:
FBeta score of the positive class in binary classification or weighted
average of the FBeta scores of each class for the multiclass task.
"""
return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight)
class Precision(SklearnMetric):
"""
Compute the precision
The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
true positives and ``fp`` the number of false positives. The precision is
intuitively the ability of the classifier not to label as positive a sample
that is negative.
The best value is 1 and the worst value is 0.
Example:
>>> y_pred = torch.tensor([0, 1, 2, 3])
>>> y_true = torch.tensor([0, 1, 2, 2])
>>> metric = Precision()
>>> metric(y_pred, y_true)
tensor([0.7500])
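The ``average`` argument controls how the per-class scores are combined; for
instance, a micro average counts true and false positives globally (sketch,
output not verified):
>>> metric_micro = Precision(average='micro')
>>> metric_micro(y_pred, y_true)  # doctest: +SKIP
tensor([0.7500])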
"""
def __init__(
self,
labels: Optional[Sequence] = None,
pos_label: Union[str, int] = 1,
average: Optional[str] = 'macro',
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
labels: Integer array of labels.
pos_label: The class to report if ``average='binary'``.
average: This parameter is required for multiclass/multilabel targets.
If ``None``, the scores for each class are returned. Otherwise, this
determines the type of averaging performed on the data:
* ``'binary'``:
Only report results for the class specified by ``pos_label``.
This is applicable only if targets (``y_{true,pred}``) are binary.
* ``'micro'``:
Calculate metrics globally by counting the total true positives,
false negatives and false positives.
* ``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account.
* ``'weighted'``:
Calculate metrics for each label, and find their average, weighted
by support (the number of true instances for each label). This
alters 'macro' to account for label imbalance; it can result in an
F-score that is not between precision and recall.
* ``'samples'``:
Calculate metrics for each instance, and find their average (only
meaningful for multilabel classification where this differs from
:func:`accuracy_score`).
Note that if ``pos_label`` is given in binary classification with
`average != 'binary'`, only that positive class is reported. This
behavior is deprecated and will change in version 0.18.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__('precision_score',
reduce_group=reduce_group,
reduce_op=reduce_op,
labels=labels,
pos_label=pos_label,
average=average)
def forward(
self,
y_pred: np.ndarray,
y_true: np.ndarray,
sample_weight: Optional[np.ndarray] = None,
) -> Union[np.ndarray, float]:
"""
Args:
y_pred : Estimated targets as returned by a classifier.
y_true: Ground truth (correct) target values.
sample_weight: Sample weights.
Return:
Precision of the positive class in binary classification or weighted
average of the precision of each class for the multiclass task.
"""
return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight)
class Recall(SklearnMetric):
"""
Compute the recall
The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
true positives and ``fn`` the number of false negatives. The recall is
intuitively the ability of the classifier to find all the positive samples.
The best value is 1 and the worst value is 0.
Example:
>>> y_pred = torch.tensor([0, 1, 2, 3])
>>> y_true = torch.tensor([0, 1, 2, 2])
>>> metric = Recall()
>>> metric(y_pred, y_true)
tensor([0.6250])
"""
def __init__(
self,
labels: Optional[Sequence] = None,
pos_label: Union[str, int] = 1,
average: Optional[str] = 'macro',
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
labels: Integer array of labels.
pos_label: The class to report if ``average='binary'``.
average: This parameter is required for multiclass/multilabel targets.
If ``None``, the scores for each class are returned. Otherwise, this
determines the type of averaging performed on the data:
* ``'binary'``:
Only report results for the class specified by ``pos_label``.
This is applicable only if targets (``y_{true,pred}``) are binary.
* ``'micro'``:
Calculate metrics globally by counting the total true positives,
false negatives and false positives.
* ``'macro'``:
Calculate metrics for each label, and find their unweighted
mean. This does not take label imbalance into account.
* ``'weighted'``:
Calculate metrics for each label, and find their average, weighted
by support (the number of true instances for each label). This
alters 'macro' to account for label imbalance; it can result in an
F-score that is not between precision and recall.
* ``'samples'``:
Calculate metrics for each instance, and find their average (only
meaningful for multilabel classification where this differs from
:func:`accuracy_score`).
Note that if ``pos_label`` is given in binary classification with
`average != 'binary'`, only that positive class is reported. This
behavior is deprecated and will change in version 0.18.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__('recall_score',
reduce_group=reduce_group,
reduce_op=reduce_op,
labels=labels,
pos_label=pos_label,
average=average)
def forward(
self,
y_pred: np.ndarray,
y_true: np.ndarray,
sample_weight: Optional[np.ndarray] = None,
) -> Union[np.ndarray, float]:
"""
Args:
y_pred : Estimated targets as returned by a classifier.
y_true: Ground truth (correct) target values.
sample_weight: Sample weights.
Return:
Recall of the positive class in binary classification or weighted
average of the recall of each class for the multiclass task.
"""
return super().forward(y_pred=y_pred, y_true=y_true, sample_weight=sample_weight)
class PrecisionRecallCurve(SklearnMetric):
"""
Compute precision-recall pairs for different probability thresholds
Note:
This implementation is restricted to the binary classification task.
The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
true positives and ``fp`` the number of false positives. The precision is
intuitively the ability of the classifier not to label as positive a sample
that is negative.
The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
true positives and ``fn`` the number of false negatives. The recall is
intuitively the ability of the classifier to find all the positive samples.
The last precision and recall values are 1. and 0. respectively and do not
have a corresponding threshold. This ensures that the graph starts on the
y axis.
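Example:
A minimal sketch assuming binary targets and probability scores; ``forward``
returns only the precision and recall arrays (see the note there), so the call
unpacks into two tensors:
>>> probas_pred = torch.tensor([0.1, 0.4, 0.35, 0.8])
>>> y_true = torch.tensor([0, 0, 1, 1])
>>> metric = PrecisionRecallCurve()
>>> precision, recall = metric(probas_pred, y_true)  # doctest: +SKIP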
"""
def __init__(
self,
pos_label: Union[str, int] = 1,
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
pos_label: The class to report if ``average='binary'``.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__('precision_recall_curve',
reduce_group=reduce_group,
reduce_op=reduce_op,
pos_label=pos_label)
def forward(
self,
probas_pred: np.ndarray,
y_true: np.ndarray,
sample_weight: Optional[np.ndarray] = None
) -> Union[np.ndarray, float]:
"""
Args:
probas_pred : Estimated probabilities or decision function.
y_true: Ground truth (correct) target values.
sample_weight: Sample weights.
Returns:
precision:
Precision values such that element i is the precision of
predictions with score >= thresholds[i] and the last element is 1.
recall:
Decreasing recall values such that element i is the recall of
predictions with score >= thresholds[i] and the last element is 0.
thresholds:
Increasing thresholds on the decision function used to compute
precision and recall.
"""
# only return precision and recall here, since elements of differing length
# cannot yet be auto-converted. Will be fixed in the native implementation.
return np.array(super().forward(probas_pred=probas_pred,
y_true=y_true,
sample_weight=sample_weight)[:2])
class ROC(SklearnMetric):
"""
Compute Receiver operating characteristic (ROC)
Note:
this implementation is restricted to the binary classification task.
Example:
>>> y_pred = torch.tensor([0, 1, 2, 3])
>>> y_true = torch.tensor([0, 1, 2, 2])
>>> metric = ROC()
>>> fps, tps = metric(y_pred, y_true)
>>> fps
tensor([0.0000, 0.3333, 0.6667, 0.6667, 1.0000])
>>> tps
tensor([0., 0., 0., 1., 1.])
References:
- [1] `Wikipedia entry for the Receiver operating characteristic
<http://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_
"""
def __init__(
self,
pos_label: Union[str, int] = 1,
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
pos_label: The class to report if ``average='binary'``.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__('roc_curve',
reduce_group=reduce_group,
reduce_op=reduce_op,
pos_label=pos_label)
def forward(
self,
y_score: np.ndarray,
y_true: np.ndarray,
sample_weight: Optional[np.ndarray] = None
) -> Union[np.ndarray, float]:
"""
Args:
y_score : Target scores, can either be probability estimates of the positive
class or confidence values.
y_true: Ground truth (correct) target values.
sample_weight: Sample weights.
Returns:
fpr:
Increasing false positive rates such that element i is the false
positive rate of predictions with score >= thresholds[i].
tpr:
Increasing true positive rates such that element i is the true
positive rate of predictions with score >= thresholds[i].
thresholds:
Decreasing thresholds on the decision function used to compute
fpr and tpr. `thresholds[0]` represents no instances being predicted
and is arbitrarily set to `max(y_score) + 1`.
"""
return np.array(super().forward(y_score=y_score, y_true=y_true, sample_weight=sample_weight)[:2])
class AUROC(SklearnMetric):
"""
Compute Area Under the Curve (AUC) from prediction scores
Note:
this implementation is restricted to the binary classification task
or multilabel classification task in label indicator format.
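Example:
A minimal sketch assuming binary targets and probability scores; the printed
value is illustrative, not verified doctest output:
>>> y_score = torch.tensor([0.1, 0.4, 0.35, 0.8])
>>> y_true = torch.tensor([0, 0, 1, 1])
>>> metric = AUROC()
>>> metric(y_score, y_true)  # doctest: +SKIP
tensor([0.7500])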
"""
def __init__(
self,
average: Optional[str] = 'macro',
reduce_group: Any = group.WORLD,
reduce_op: Any = ReduceOp.SUM,
):
"""
Args:
average: If None, the scores for each class are returned. Otherwise, this determines the type of
averaging performed on the data:
* If 'micro': Calculate metrics globally by considering each element of the label indicator
matrix as a label.
* If 'macro': Calculate metrics for each label, and find their unweighted mean.
This does not take label imbalance into account.
* If 'weighted': Calculate metrics for each label, and find their average, weighted by
support (the number of true instances for each label).
* If 'samples': Calculate metrics for each instance, and find their average.
reduce_group: the process group for DDP reduces (only needed for DDP training).
Defaults to all processes (world)
reduce_op: the operation to perform during reduction within DDP (only needed for DDP training).
Defaults to sum.
"""
super().__init__('roc_auc_score',
reduce_group=reduce_group,
reduce_op=reduce_op,
average=average)
def forward(
self,
y_score: np.ndarray,
y_true: np.ndarray,
sample_weight: Optional[np.ndarray] = None,
) -> float:
"""
Args:
y_score: Target scores, can either be probability estimates of the positive class,
confidence values, or binary decisions.
y_true: True binary labels in binary label indicators.
sample_weight: Sample weights.
Return:
Area Under Receiver Operating Characteristic Curve
"""
return super().forward(y_score=y_score, y_true=y_true,
sample_weight=sample_weight)