[fix] Ensure we check deepspeed/sharded in multinode DDP (#6297)

* Ensure we check deepspeed/sharded in multinode

* Add CHANGELOG.md

* Drop mock, use actual multi-gpu node
Sean Naren 2021-03-02 13:36:18 +00:00 committed by GitHub
parent b46d22197d
commit 80019874e5
3 changed files with 34 additions and 5 deletions

CHANGELOG.md

@@ -86,6 +86,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509), [#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275))
 
+- Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297))
+
+
 ## [1.2.1] - 2021-02-23
 
 ### Fixed

pytorch_lightning/trainer/connectors/accelerator_connector.py

@@ -536,12 +536,12 @@ class AcceleratorConnector(object):
         if self.distributed_backend == "horovod":
             self._set_horovod_backend()
 
-        # throw error to force user ddp or ddp2 choice
-        _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
-        if (self.num_nodes > 1 and self._distrib_type not in _ddp):
+        using_valid_distributed = self.use_ddp or self.use_ddp2
+        if self.num_nodes > 1 and not using_valid_distributed:
+            # throw error to force user to choose a supported distributed type such as ddp or ddp2
             raise MisconfigurationException(
-                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
-                'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
+                'Your chosen distributed type does not support num_nodes > 1. '
+                'Please set accelerator=ddp or accelerator=ddp2.'
             )
 
         rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
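
Note on why the new guard accepts sharded and DeepSpeed backends: the check now delegates to the connector's `use_ddp`/`use_ddp2` properties instead of a hard-coded tuple of plain-DDP types. Below is a minimal, self-contained sketch of that idea; the property bodies are assumptions written for illustration, not the library's exact implementation, and `ValueError` stands in for `MisconfigurationException`.

# Illustrative sketch only -- assumed property bodies, not PyTorch Lightning's
# exact code. It shows why `use_ddp or use_ddp2` also lets sharded/DeepSpeed
# backends through a `num_nodes > 1` check.
from enum import Enum


class DistributedType(str, Enum):
    DDP = "ddp"
    DDP2 = "ddp2"
    DDP_SPAWN = "ddp_spawn"
    DDP_SHARDED = "ddp_sharded"
    DDP_SHARDED_SPAWN = "ddp_sharded_spawn"
    DEEPSPEED = "deepspeed"
    DP = "dp"


class ConnectorSketch:
    def __init__(self, distrib_type, num_nodes):
        self._distrib_type = distrib_type
        self.num_nodes = num_nodes

    @property
    def use_ddp(self):
        # Assumption: all spawn/sharded/DeepSpeed variants count as DDP-style.
        return self._distrib_type in (
            DistributedType.DDP,
            DistributedType.DDP_SPAWN,
            DistributedType.DDP_SHARDED,
            DistributedType.DDP_SHARDED_SPAWN,
            DistributedType.DEEPSPEED,
        )

    @property
    def use_ddp2(self):
        return self._distrib_type == DistributedType.DDP2

    def check_multi_node(self):
        # The fixed guard from the diff above; ValueError stands in for
        # MisconfigurationException to keep the sketch dependency-free.
        if self.num_nodes > 1 and not (self.use_ddp or self.use_ddp2):
            raise ValueError(
                'Your chosen distributed type does not support num_nodes > 1. '
                'Please set accelerator=ddp or accelerator=ddp2.'
            )


ConnectorSketch(DistributedType.DDP_SHARDED, num_nodes=2).check_multi_node()  # passes
ConnectorSketch(DistributedType.DDP2, num_nodes=2).check_multi_node()         # passes
# ConnectorSketch(DistributedType.DP, num_nodes=2).check_multi_node()         # would raise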

tests/accelerators/test_accelerator_connector.py

@@ -28,10 +28,13 @@ from pytorch_lightning.plugins import (
     DDPPlugin,
     DDPShardedPlugin,
     DDPSpawnPlugin,
+    DDPSpawnShardedPlugin,
+    DeepSpeedPlugin,
     PrecisionPlugin,
     SingleDevicePlugin,
 )
 from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
@@ -415,3 +418,26 @@ def test_plugin_accelerator_choice(accelerator, plugin):
 
     trainer = Trainer(plugins=plugin, num_processes=2)
     assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
+
+
+@pytest.mark.parametrize(["accelerator", "plugin"], [
+    ('ddp', DDPPlugin),
+    ('ddp_spawn', DDPSpawnPlugin),
+    ('ddp_sharded', DDPShardedPlugin),
+    ('ddp_sharded_spawn', DDPSpawnShardedPlugin),
+    pytest.param(
+        'deepspeed',
+        DeepSpeedPlugin,
+        marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+    ),
+])
+@mock.patch('torch.cuda.is_available', return_value=True)
+@mock.patch('torch.cuda.device_count', return_value=2)
+def test_accelerator_choice_multi_node_gpu(mock_is_available, mock_device_count, accelerator, plugin, tmpdir):
+    trainer = Trainer(
+        accelerator=accelerator,
+        default_root_dir=tmpdir,
+        num_nodes=2,
+        gpus=2,
+    )
+    assert isinstance(trainer.training_type_plugin, plugin)
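
For context, here is a user-side sketch of the behaviour the new test pins down. It is not part of the test suite above; it assumes pytorch-lightning ~1.2.x with this fix applied and a machine with two visible GPUs, and the exact exception message may differ.

# User-side sketch (assumptions: pytorch-lightning ~1.2.x with this fix, two GPUs available).
import pytest
from pytorch_lightning import Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException

# Sharded DDP (and similarly DeepSpeed, if installed) is now accepted across
# nodes instead of raising during Trainer construction.
trainer = Trainer(accelerator="ddp_sharded", num_nodes=2, gpus=2)

# A distributed type without multi-node support (plain DataParallel) still raises.
with pytest.raises(MisconfigurationException):
    Trainer(accelerator="dp", num_nodes=2, gpus=2)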