lightning/tests/models/test_horovod.py

176 lines
6.1 KiB
Python
Raw Normal View History

import json
import os
import platform
import shlex
import subprocess
import sys
import pytest
import torch
import tests.base.utils as tutils
from pytorch_lightning import Trainer
from tests.base import EvalModelTemplate
from tests.base.models import TestGAN
try:
from horovod.common.util import nccl_built
except ImportError:
HOROVOD_AVAILABLE = False
else:
HOROVOD_AVAILABLE = True
# This script will run the actual test model training in parallel
TEST_SCRIPT = os.path.join(os.path.dirname(__file__), 'data', 'horovod', 'train_default_model.py')
def _nccl_available():
if not HOROVOD_AVAILABLE:
return False
try:
return nccl_built()
except AttributeError:
# Horovod 0.19.1 nccl_built() does not yet work with Python 3.8:
# See: https://github.com/horovod/horovod/issues/1891
return False
def _run_horovod(trainer_options, on_gpu=False):
"""Execute the training script across multiple workers in parallel."""
Option to provide seed to random generators to ensure reproducibility (#1572) * Option to provide seed to random generators to ensure reproducibility I added small function in utilities which imports torch, numpy, python random and sets seed for all of the libraries to ensure reproducibility of results. * Apply recommendations from core contributors on seeding 1. Moved the seeding code to another file 2. Make deterministic as a parameter for trainer class 3. Add assertions for seeding numpy 4. Added warnings 5. torch.manual_seed should be enough for seeding torch * Revert "Apply recommendations from core contributors on seeding" This reverts commit a213c8e6882eec8a9e7408b9418926d2db7c5461. * Revert "Revert "Apply recommendations from core contributors on seeding"" This reverts commit 59b2da53c62878de7aab0aa3feb3115e105eea06. * Change in test, for correct seeding * Allow seed equal to 0 * Allow seed to be uint32.max * Added deterministic to benchmarks * Cuda manual seed as in benchmark seeding * Seeding should be done before model initialization * cuda manual_seed is not necessary * Fixing seed test_cpu_lbfgs On some seeds seems like lbfgs doesn't converge. So I fixed the seed during testing. * rebasing issue with old reproducibility.py * Improved documentation and ability to seed before initializing Train class * Change in docs * Removed seed from trainer, update for documentation * Typo in the docs * Added seed_everything to _all_ * Fixing old changes * Model initialization should be earlier then Trainer * Update pytorch_lightning/trainer/__init__.py From Example to testcode Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Fixing according to the contributors suggestions * Moving horovod deterministic to Trainer class * deterministic flag affects horovod docs update * Improved static typing * Added deterministic to test runners of horovod It is failing on some versions, not very predictable * static seeds for horovod tests * Change for reset_seed function in tests * Seeding horovod using reset_seed from tutils * Update pytorch_lightning/trainer/__init__.py * chlog * Update trainer.py * change "testcode" to "Example" in trainer init documentation * Update pytorch_lightning/trainer/seed.py, first line in comment Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka <jirka.borovec@seznam.cz> Co-authored-by: William Falcon <waf2107@columbia.edu>
2020-05-12 11:53:20 +00:00
tutils.reset_seed()
cmdline = [
'horovodrun',
'-np', '2',
sys.executable, TEST_SCRIPT,
'--trainer-options', shlex.quote(json.dumps(trainer_options))
]
if on_gpu:
cmdline += ['--on-gpu']
exit_code = subprocess.call(' '.join(cmdline), shell=True, env=os.environ.copy())
assert exit_code == 0
@pytest.mark.skipif(sys.version_info >= (3, 8), reason="Horovod not yet supported in Python 3.8")
@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
def test_horovod_cpu(tmpdir):
"""Test Horovod running multi-process on CPU."""
trainer_options = dict(
default_root_dir=str(tmpdir),
gradient_clip_val=1.0,
progress_bar_refresh_rate=0,
max_epochs=1,
train_percent_check=0.4,
val_percent_check=0.2,
Option to provide seed to random generators to ensure reproducibility (#1572) * Option to provide seed to random generators to ensure reproducibility I added small function in utilities which imports torch, numpy, python random and sets seed for all of the libraries to ensure reproducibility of results. * Apply recommendations from core contributors on seeding 1. Moved the seeding code to another file 2. Make deterministic as a parameter for trainer class 3. Add assertions for seeding numpy 4. Added warnings 5. torch.manual_seed should be enough for seeding torch * Revert "Apply recommendations from core contributors on seeding" This reverts commit a213c8e6882eec8a9e7408b9418926d2db7c5461. * Revert "Revert "Apply recommendations from core contributors on seeding"" This reverts commit 59b2da53c62878de7aab0aa3feb3115e105eea06. * Change in test, for correct seeding * Allow seed equal to 0 * Allow seed to be uint32.max * Added deterministic to benchmarks * Cuda manual seed as in benchmark seeding * Seeding should be done before model initialization * cuda manual_seed is not necessary * Fixing seed test_cpu_lbfgs On some seeds seems like lbfgs doesn't converge. So I fixed the seed during testing. * rebasing issue with old reproducibility.py * Improved documentation and ability to seed before initializing Train class * Change in docs * Removed seed from trainer, update for documentation * Typo in the docs * Added seed_everything to _all_ * Fixing old changes * Model initialization should be earlier then Trainer * Update pytorch_lightning/trainer/__init__.py From Example to testcode Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Fixing according to the contributors suggestions * Moving horovod deterministic to Trainer class * deterministic flag affects horovod docs update * Improved static typing * Added deterministic to test runners of horovod It is failing on some versions, not very predictable * static seeds for horovod tests * Change for reset_seed function in tests * Seeding horovod using reset_seed from tutils * Update pytorch_lightning/trainer/__init__.py * chlog * Update trainer.py * change "testcode" to "Example" in trainer init documentation * Update pytorch_lightning/trainer/seed.py, first line in comment Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka <jirka.borovec@seznam.cz> Co-authored-by: William Falcon <waf2107@columbia.edu>
2020-05-12 11:53:20 +00:00
distributed_backend='horovod',
deterministic=True,
)
_run_horovod(trainer_options)
@pytest.mark.skipif(sys.version_info >= (3, 8), reason="Horovod not yet supported in Python 3.8")
@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
def test_horovod_cpu_implicit(tmpdir):
"""Test Horovod without specifying a backend, inferring from env set by `horovodrun`."""
trainer_options = dict(
default_root_dir=str(tmpdir),
gradient_clip_val=1.0,
progress_bar_refresh_rate=0,
max_epochs=1,
train_percent_check=0.4,
val_percent_check=0.2,
Option to provide seed to random generators to ensure reproducibility (#1572) * Option to provide seed to random generators to ensure reproducibility I added small function in utilities which imports torch, numpy, python random and sets seed for all of the libraries to ensure reproducibility of results. * Apply recommendations from core contributors on seeding 1. Moved the seeding code to another file 2. Make deterministic as a parameter for trainer class 3. Add assertions for seeding numpy 4. Added warnings 5. torch.manual_seed should be enough for seeding torch * Revert "Apply recommendations from core contributors on seeding" This reverts commit a213c8e6882eec8a9e7408b9418926d2db7c5461. * Revert "Revert "Apply recommendations from core contributors on seeding"" This reverts commit 59b2da53c62878de7aab0aa3feb3115e105eea06. * Change in test, for correct seeding * Allow seed equal to 0 * Allow seed to be uint32.max * Added deterministic to benchmarks * Cuda manual seed as in benchmark seeding * Seeding should be done before model initialization * cuda manual_seed is not necessary * Fixing seed test_cpu_lbfgs On some seeds seems like lbfgs doesn't converge. So I fixed the seed during testing. * rebasing issue with old reproducibility.py * Improved documentation and ability to seed before initializing Train class * Change in docs * Removed seed from trainer, update for documentation * Typo in the docs * Added seed_everything to _all_ * Fixing old changes * Model initialization should be earlier then Trainer * Update pytorch_lightning/trainer/__init__.py From Example to testcode Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Fixing according to the contributors suggestions * Moving horovod deterministic to Trainer class * deterministic flag affects horovod docs update * Improved static typing * Added deterministic to test runners of horovod It is failing on some versions, not very predictable * static seeds for horovod tests * Change for reset_seed function in tests * Seeding horovod using reset_seed from tutils * Update pytorch_lightning/trainer/__init__.py * chlog * Update trainer.py * change "testcode" to "Example" in trainer init documentation * Update pytorch_lightning/trainer/seed.py, first line in comment Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka <jirka.borovec@seznam.cz> Co-authored-by: William Falcon <waf2107@columbia.edu>
2020-05-12 11:53:20 +00:00
deterministic=True,
)
_run_horovod(trainer_options)
@pytest.mark.skipif(sys.version_info >= (3, 8), reason="Horovod not yet supported in Python 3.8")
@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="test requires multi-GPU machine")
def test_horovod_multi_gpu(tmpdir):
"""Test Horovod with multi-GPU support."""
trainer_options = dict(
default_root_dir=str(tmpdir),
gradient_clip_val=1.0,
progress_bar_refresh_rate=0,
max_epochs=1,
train_percent_check=0.4,
val_percent_check=0.2,
gpus=1,
Option to provide seed to random generators to ensure reproducibility (#1572) * Option to provide seed to random generators to ensure reproducibility I added small function in utilities which imports torch, numpy, python random and sets seed for all of the libraries to ensure reproducibility of results. * Apply recommendations from core contributors on seeding 1. Moved the seeding code to another file 2. Make deterministic as a parameter for trainer class 3. Add assertions for seeding numpy 4. Added warnings 5. torch.manual_seed should be enough for seeding torch * Revert "Apply recommendations from core contributors on seeding" This reverts commit a213c8e6882eec8a9e7408b9418926d2db7c5461. * Revert "Revert "Apply recommendations from core contributors on seeding"" This reverts commit 59b2da53c62878de7aab0aa3feb3115e105eea06. * Change in test, for correct seeding * Allow seed equal to 0 * Allow seed to be uint32.max * Added deterministic to benchmarks * Cuda manual seed as in benchmark seeding * Seeding should be done before model initialization * cuda manual_seed is not necessary * Fixing seed test_cpu_lbfgs On some seeds seems like lbfgs doesn't converge. So I fixed the seed during testing. * rebasing issue with old reproducibility.py * Improved documentation and ability to seed before initializing Train class * Change in docs * Removed seed from trainer, update for documentation * Typo in the docs * Added seed_everything to _all_ * Fixing old changes * Model initialization should be earlier then Trainer * Update pytorch_lightning/trainer/__init__.py From Example to testcode Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Fixing according to the contributors suggestions * Moving horovod deterministic to Trainer class * deterministic flag affects horovod docs update * Improved static typing * Added deterministic to test runners of horovod It is failing on some versions, not very predictable * static seeds for horovod tests * Change for reset_seed function in tests * Seeding horovod using reset_seed from tutils * Update pytorch_lightning/trainer/__init__.py * chlog * Update trainer.py * change "testcode" to "Example" in trainer init documentation * Update pytorch_lightning/trainer/seed.py, first line in comment Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka <jirka.borovec@seznam.cz> Co-authored-by: William Falcon <waf2107@columbia.edu>
2020-05-12 11:53:20 +00:00
deterministic=True,
distributed_backend='horovod'
)
_run_horovod(trainer_options, on_gpu=True)
@pytest.mark.skipif(sys.version_info >= (3, 8), reason="Horovod not yet supported in Python 3.8")
@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
@pytest.mark.skipif(not _nccl_available(), reason="test requires Horovod with NCCL support")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="test requires GPU machine")
def test_horovod_transfer_batch_to_gpu(tmpdir):
class TestTrainingStepModel(EvalModelTemplate):
def training_step(self, batch, *args, **kwargs):
x, y = batch
assert str(x.device) != 'cpu'
assert str(y.device) != 'cpu'
return super(TestTrainingStepModel, self).training_step(batch, *args, **kwargs)
def validation_step(self, batch, *args, **kwargs):
x, y = batch
assert str(x.device) != 'cpu'
assert str(y.device) != 'cpu'
return super(TestTrainingStepModel, self).validation_step(batch, *args, **kwargs)
hparams = EvalModelTemplate.get_default_hparams()
model = TestTrainingStepModel(hparams)
trainer_options = dict(
default_root_dir=str(tmpdir),
progress_bar_refresh_rate=0,
max_epochs=1,
train_percent_check=0.4,
val_percent_check=0.2,
gpus=1,
Option to provide seed to random generators to ensure reproducibility (#1572) * Option to provide seed to random generators to ensure reproducibility I added small function in utilities which imports torch, numpy, python random and sets seed for all of the libraries to ensure reproducibility of results. * Apply recommendations from core contributors on seeding 1. Moved the seeding code to another file 2. Make deterministic as a parameter for trainer class 3. Add assertions for seeding numpy 4. Added warnings 5. torch.manual_seed should be enough for seeding torch * Revert "Apply recommendations from core contributors on seeding" This reverts commit a213c8e6882eec8a9e7408b9418926d2db7c5461. * Revert "Revert "Apply recommendations from core contributors on seeding"" This reverts commit 59b2da53c62878de7aab0aa3feb3115e105eea06. * Change in test, for correct seeding * Allow seed equal to 0 * Allow seed to be uint32.max * Added deterministic to benchmarks * Cuda manual seed as in benchmark seeding * Seeding should be done before model initialization * cuda manual_seed is not necessary * Fixing seed test_cpu_lbfgs On some seeds seems like lbfgs doesn't converge. So I fixed the seed during testing. * rebasing issue with old reproducibility.py * Improved documentation and ability to seed before initializing Train class * Change in docs * Removed seed from trainer, update for documentation * Typo in the docs * Added seed_everything to _all_ * Fixing old changes * Model initialization should be earlier then Trainer * Update pytorch_lightning/trainer/__init__.py From Example to testcode Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Fixing according to the contributors suggestions * Moving horovod deterministic to Trainer class * deterministic flag affects horovod docs update * Improved static typing * Added deterministic to test runners of horovod It is failing on some versions, not very predictable * static seeds for horovod tests * Change for reset_seed function in tests * Seeding horovod using reset_seed from tutils * Update pytorch_lightning/trainer/__init__.py * chlog * Update trainer.py * change "testcode" to "Example" in trainer init documentation * Update pytorch_lightning/trainer/seed.py, first line in comment Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka <jirka.borovec@seznam.cz> Co-authored-by: William Falcon <waf2107@columbia.edu>
2020-05-12 11:53:20 +00:00
deterministic=True,
distributed_backend='horovod'
)
tutils.run_model_test_without_loggers(trainer_options, model)
@pytest.mark.skipif(sys.version_info >= (3, 8), reason="Horovod not yet supported in Python 3.8")
@pytest.mark.skipif(platform.system() == "Windows", reason="Horovod is not supported on Windows")
def test_horovod_multi_optimizer(tmpdir):
replace Hparams by init args (#1896) * remove the need for hparams * remove the need for hparams * remove the need for hparams * remove the need for hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * replace self.hparams * fixed * fixed * fixed * fixed * fixed * fixed * fixed * fixed * fixed * fixed * fixed * fixed * fixed * fixed * finished moco * basic * testing * todo * recurse * hparams * persist * hparams * chlog * tests * tests * tests * tests * tests * tests * review * saving * tests * tests * tests * docs * finished moco * hparams * review * Apply suggestions from code review Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * hparams * overwrite * transform * transform * transform * transform * cleaning * cleaning * tests * examples * examples * examples * Apply suggestions from code review Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com> * chp key * tests * Apply suggestions from code review * class * updated docs * updated docs * updated docs * updated docs * save * wip * fix * flake8 Co-authored-by: Jirka <jirka@pytorchlightning.ai> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Adrian Wälchli <aedu.waelchli@gmail.com>
2020-05-24 22:59:08 +00:00
model = TestGAN(**EvalModelTemplate.get_default_hparams())
trainer_options = dict(
default_root_dir=str(tmpdir),
progress_bar_refresh_rate=0,
max_epochs=1,
train_percent_check=0.4,
val_percent_check=0.2,
Option to provide seed to random generators to ensure reproducibility (#1572) * Option to provide seed to random generators to ensure reproducibility I added small function in utilities which imports torch, numpy, python random and sets seed for all of the libraries to ensure reproducibility of results. * Apply recommendations from core contributors on seeding 1. Moved the seeding code to another file 2. Make deterministic as a parameter for trainer class 3. Add assertions for seeding numpy 4. Added warnings 5. torch.manual_seed should be enough for seeding torch * Revert "Apply recommendations from core contributors on seeding" This reverts commit a213c8e6882eec8a9e7408b9418926d2db7c5461. * Revert "Revert "Apply recommendations from core contributors on seeding"" This reverts commit 59b2da53c62878de7aab0aa3feb3115e105eea06. * Change in test, for correct seeding * Allow seed equal to 0 * Allow seed to be uint32.max * Added deterministic to benchmarks * Cuda manual seed as in benchmark seeding * Seeding should be done before model initialization * cuda manual_seed is not necessary * Fixing seed test_cpu_lbfgs On some seeds seems like lbfgs doesn't converge. So I fixed the seed during testing. * rebasing issue with old reproducibility.py * Improved documentation and ability to seed before initializing Train class * Change in docs * Removed seed from trainer, update for documentation * Typo in the docs * Added seed_everything to _all_ * Fixing old changes * Model initialization should be earlier then Trainer * Update pytorch_lightning/trainer/__init__.py From Example to testcode Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Fixing according to the contributors suggestions * Moving horovod deterministic to Trainer class * deterministic flag affects horovod docs update * Improved static typing * Added deterministic to test runners of horovod It is failing on some versions, not very predictable * static seeds for horovod tests * Change for reset_seed function in tests * Seeding horovod using reset_seed from tutils * Update pytorch_lightning/trainer/__init__.py * chlog * Update trainer.py * change "testcode" to "Example" in trainer init documentation * Update pytorch_lightning/trainer/seed.py, first line in comment Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka <jirka.borovec@seznam.cz> Co-authored-by: William Falcon <waf2107@columbia.edu>
2020-05-12 11:53:20 +00:00
deterministic=True,
distributed_backend='horovod'
)
# fit model
trainer = Trainer(**trainer_options)
result = trainer.fit(model)
assert result == 1, 'model failed to complete'
assert len(trainer.optimizers) == 2
for i, optimizer in enumerate(trainer.optimizers):
assert hasattr(optimizer, 'synchronize'), 'optimizer has not been wrapped into DistributedOptimizer'
def get_model_params(model):
return set([p for p in model.parameters()])
def get_optimizer_params(optimizer):
return set([p for group in optimizer.param_groups for p in group.get('params', [])])
assert get_model_params(model.generator) != get_model_params(model.discriminator)
assert get_model_params(model.generator) == get_optimizer_params(trainer.optimizers[0])
assert get_model_params(model.discriminator) == get_optimizer_params(trainer.optimizers[1])