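"""Integration tests for pytorch_lightning.metrics.

Each lightning metric is checked against a reference implementation from
sklearn (or skimage for the image metrics) on random input: for a single
batch, for aggregation across several batches, and in DDP mode where the
samples are divided between processes.
"""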
import math
import os
import sys
from collections import namedtuple
from functools import partial

import numpy as np
import pytest
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_recall_curve,
    precision_score,
    recall_score,
    average_precision_score,
    roc_auc_score,
    fbeta_score,
    f1_score,
    roc_curve,
    jaccard_score,
    mean_squared_error,
    mean_absolute_error,
    mean_squared_log_error,
)

import tests.base.develop_utils as tutils
from pytorch_lightning import Trainer
from pytorch_lightning.metrics import (
    Accuracy,
    ConfusionMatrix,
    PrecisionRecallCurve,
    Precision,
    Recall,
    AveragePrecision,
    AUROC,
    FBeta,
    F1,
    ROC,
    MulticlassROC,
    MulticlassPrecisionRecallCurve,
    DiceCoefficient,
    IoU,
    MAE,
    MSE,
    RMSE,
    RMSLE,
    PSNR,
    SSIM,
)
from tests.base import EvalModelTemplate

# structure of a single test case
TestCase = namedtuple('TestCase', ['name', 'lightning_metric', 'comparing_metric', 'test_input'])
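# e.g. TestCase('accuracy', Accuracy, accuracy_score, multiclass_and_binary_example)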

# set up some standard test cases
NB_SAMPLES = 200
multiclass_example = [(torch.randint(10, (NB_SAMPLES,)), torch.randint(10, (NB_SAMPLES,)))]
binary_example = [(torch.randint(2, (NB_SAMPLES,)), torch.randint(2, (NB_SAMPLES,)))]
multiclass_and_binary_example = [*multiclass_example, *binary_example]
binary_example_logits = (torch.randint(2, (NB_SAMPLES,)), torch.randint(5, (NB_SAMPLES,)))
multiclass_example_probs = (torch.randint(10, (NB_SAMPLES,)), torch.randn((NB_SAMPLES, 10)).softmax(-1))
regression_example = [(torch.rand((NB_SAMPLES,)), torch.rand((NB_SAMPLES,)))]


# construct reference functions that sklearn does not provide directly
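# (note: newer sklearn releases can also compute RMSE via
# mean_squared_error(..., squared=False); the explicit wrappers below keep the
# comparison independent of the installed sklearn version)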
def root_mean_squared_error(x, y):
    return math.sqrt(mean_squared_error(x, y))


def root_mean_squared_log_error(x, y):
    return math.sqrt(mean_squared_log_error(x, y))


# define the test cases
# TODO: update the remaining metrics and uncomment the corresponding test cases
TESTS = [
    TestCase('accuracy',
             Accuracy,
             accuracy_score,
             multiclass_and_binary_example),
    TestCase('confusion matrix without normalize',
             ConfusionMatrix,
             confusion_matrix,
             multiclass_and_binary_example),
    TestCase('confusion matrix with normalize',
             partial(ConfusionMatrix, normalize=True),
             partial(confusion_matrix, normalize='true'),
             multiclass_and_binary_example),
    # TestCase('precision recall curve',
    #          PrecisionRecallCurve,
    #          precision_recall_curve,
    #          binary_example),
    TestCase('precision',
             Precision,
             partial(precision_score, average='micro'),
             multiclass_and_binary_example),
    TestCase('recall',
             Recall,
             partial(recall_score, average='micro'),
             multiclass_and_binary_example),
    # TestCase('average_precision',
    #          AveragePrecision,
    #          average_precision_score,
    #          binary_example),
    # TestCase('auroc',
    #          AUROC,
    #          roc_auc_score,
    #          binary_example),
    TestCase('f beta',
             partial(FBeta, beta=2),
             partial(fbeta_score, average='micro', beta=2),
             multiclass_and_binary_example),
    TestCase('f1',
             F1,
             partial(f1_score, average='micro'),
             multiclass_and_binary_example),
    # TestCase('roc',
    #          ROC,
    #          roc_curve,
    #          binary_example),
    # TestCase('multiclass roc',
    #          MulticlassROC,
    #          multiclass_roc,
    #          binary_example),
    # TestCase('multiclass precision recall curve',
    #          MulticlassPrecisionRecallCurve,
    #          multiclass_precision_recall_curve,
    #          binary_example),
    # TestCase('dice coefficient',
    #          DiceCoefficient,
    #          partial(f1_score, average='micro'),
    #          multiclass_and_binary_example),
    # TestCase('intersection over union',
    #          IoU,
    #          partial(jaccard_score, average='macro'),
    #          binary_example),
    TestCase('mean squared error',
             MSE,
             mean_squared_error,
             regression_example),
    TestCase('root mean squared error',
             RMSE,
             root_mean_squared_error,
             regression_example),
    TestCase('mean absolute error',
             MAE,
             mean_absolute_error,
             regression_example),
    TestCase('root mean squared log error',
             RMSLE,
             root_mean_squared_log_error,
             regression_example),
    TestCase('peak signal-to-noise ratio',
             partial(PSNR, data_range=10),
             partial(peak_signal_noise_ratio, data_range=10),
             regression_example),
    # TestCase('structural similarity index measure',
    #          SSIM,
    #          structural_similarity,
    #          regression_example),
]
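# a new metric would be covered by appending another TestCase, e.g. (hypothetical):
#   TESTS.append(TestCase('my metric', MyMetric, my_reference_fn, regression_example))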


# Utility test functions
def _idsfn(test):
    """Return the id of the test case currently being run."""
    return test.name
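# (pytest uses _idsfn to label each parametrized run, e.g. test_ddp[accuracy])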


def _setup_ddp(rank, worldsize):
    """Set up the ddp environment for testing."""
    os.environ['MASTER_ADDR'] = 'localhost'
    # MASTER_PORT is set by the caller via tutils.set_random_master_port()
    # before the worker processes are spawned

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=worldsize)


def comparing_fn(lightning_val, comparing_val, rtol=1e-03, atol=1e-08):
    """Compare a lightning output against a reference output, for metrics
    returning either a single value or several (e.g. curve-style metrics
    that return multiple tensors per call)."""
    if isinstance(comparing_val, tuple):
        # multiple outputs: compare element-wise
        for l_score, c_score in zip(lightning_val, comparing_val):
            assert np.allclose(l_score.numpy(), c_score, rtol, atol)
    else:
        # single output
        assert np.allclose(lightning_val.numpy(), comparing_val, rtol, atol)


# ===== Tests start here =====
def _test_ddp_single_batch(rank, worldsize, lightning_metric, comparing_metric, test_inputs):
    """DDP worker function: divides each test input equally between the processes."""
    _setup_ddp(rank, worldsize)

    # set up the metric for ddp
    lightning_metric = lightning_metric()
    for test_input in test_inputs:
        # rank 0 receives samples 0, 2, 4, ...
        # rank 1 receives samples 1, 3, 5, ...
        lightning_val = lightning_metric(*[ti[rank::2] for ti in test_input])

        # the reference metric gets the full input; its arguments are reversed
        # because sklearn expects (y_true, y_pred)
        comparing_val = comparing_metric(*[ti.numpy() for ti in reversed(test_input)])

        comparing_fn(lightning_val, comparing_val)


@pytest.mark.skipif(sys.platform == "win32", reason="DDP not available on windows")
@pytest.mark.parametrize("test", TESTS, ids=_idsfn)
def test_ddp(test):
    """Make sure that metrics are correctly synced and reduced in DDP mode."""
    tutils.reset_seed()
    tutils.set_random_master_port()

    worldsize = 2
    mp.spawn(_test_ddp_single_batch,
             args=(worldsize,
                   test.lightning_metric,
                   test.comparing_metric,
                   test.test_input),
             nprocs=worldsize)


@pytest.mark.parametrize("test", TESTS, ids=_idsfn)
def test_multi_batch(test):
    """Test that aggregation works across multiple batches."""
    lightning_metric = test.lightning_metric()
    comparing_metric = test.comparing_metric

    for test_input in test.test_input:
        for i in range(2):  # feed the lightning metric two artificial batches
            # the first batch consists of samples 0, 2, 4, ...
            # the second batch consists of samples 1, 3, 5, ...
            _ = lightning_metric(*[ti[i::2] for ti in test_input])
        lightning_val = lightning_metric.aggregated
        comparing_val = comparing_metric(*[ti.numpy() for ti in reversed(test_input)])

        comparing_fn(lightning_val, comparing_val)


@pytest.mark.parametrize("test", TESTS, ids=_idsfn)
def test_multi_batch_unequal_sizes(test):
    """Test that aggregation works across multiple batches of uneven size."""
    lightning_metric = test.lightning_metric()
    comparing_metric = test.comparing_metric

    for test_input in test.test_input:
        for i in range(2):  # feed the lightning metric two artificial batches
            if i == 0:  # allocate 3/4 of the data to the first batch
                _ = lightning_metric(*[ti[:int(3 / 4 * len(ti))] for ti in test_input])
            else:  # and the remaining 1/4 to the second
                _ = lightning_metric(*[ti[int(3 / 4 * len(ti)):] for ti in test_input])
        lightning_val = lightning_metric.aggregated
        comparing_val = comparing_metric(*[ti.numpy() for ti in reversed(test_input)])

        comparing_fn(lightning_val, comparing_val)


def _test_ddp_multi_batch(rank, worldsize, lightning_metric, comparing_metric, test_inputs):
    """DDP worker function: test that the metric aggregates correctly over
    multiple devices and multiple batches."""
    _setup_ddp(rank, worldsize)

    # set up the metric for ddp
    lightning_metric = lightning_metric()
    for test_input in test_inputs:
        for i in range(2):  # artificially divide the samples between batches and processes
            # rank 0, batch 0 consists of samples 0, 4, 8, ...
            # rank 0, batch 1 consists of samples 1, 5, 9, ...
            # rank 1, batch 0 consists of samples 2, 6, 10, ...
            # rank 1, batch 1 consists of samples 3, 7, 11, ...
            _ = lightning_metric(*[ti[i + worldsize * rank::4] for ti in test_input])
        lightning_val = lightning_metric.aggregated
        comparing_val = comparing_metric(*[ti.numpy() for ti in reversed(test_input)])

        comparing_fn(lightning_val, comparing_val)


@pytest.mark.skipif(sys.platform == "win32", reason="DDP not available on windows")
@pytest.mark.parametrize("test", TESTS, ids=_idsfn)
def test_ddp_multi_batch(test):
    """Test that aggregation works in DDP mode with multiple batches."""
    tutils.reset_seed()
    tutils.set_random_master_port()

    worldsize = 2
    mp.spawn(_test_ddp_multi_batch,
             args=(worldsize,
                   test.lightning_metric,
                   test.comparing_metric,
                   test.test_input),
             nprocs=worldsize)