[fix] Ensure we check deepspeed/sharded in multinode DDP (#6297)
* Ensure we check deepspeed/sharded in multinode
* Add CHANGELOG.md
* Drop mock, use actual multi-gpu node

parent b46d22197d
commit 80019874e5
CHANGELOG.md
@@ -86,6 +86,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509), [#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275))
 
+- Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297))
+
+
 ## [1.2.1] - 2021-02-23
 
 ### Fixed
pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -536,12 +536,12 @@ class AcceleratorConnector(object):
         if self.distributed_backend == "horovod":
             self._set_horovod_backend()
 
-        # throw error to force user ddp or ddp2 choice
-        _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
-        if (self.num_nodes > 1 and self._distrib_type not in _ddp):
+        using_valid_distributed = self.use_ddp or self.use_ddp2
+        if self.num_nodes > 1 and not using_valid_distributed:
+            # throw error to force user to choose a supported distributed type such as ddp or ddp2
             raise MisconfigurationException(
-                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
-                'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
+                'Your chosen distributed type does not support num_nodes > 1. '
+                'Please set accelerator=ddp or accelerator=ddp2.'
             )
 
         rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
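The dropped guard compared `self._distrib_type` against a hard-coded tuple that knows nothing about the sharded or DeepSpeed backends, so valid multi-node configurations were rejected. The new guard delegates to the connector's `use_ddp`/`use_ddp2` properties, which already cover those variants. A minimal standalone sketch of the difference, where DDP_LIKE is an assumed stand-in for what `use_ddp` accepts, not the connector's real property:

    from enum import Enum

    class DistributedType(str, Enum):
        DDP = 'ddp'
        DDP_SPAWN = 'ddp_spawn'
        DDP2 = 'ddp2'
        DDP_SHARDED = 'ddp_sharded'
        DEEPSPEED = 'deepspeed'

    # Old guard: hard-coded tuple omitting the sharded/deepspeed backends.
    _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
    # Assumed stand-in for `use_ddp`: every DDP-style backend, new ones included.
    DDP_LIKE = {*_ddp, DistributedType.DDP_SHARDED, DistributedType.DEEPSPEED}

    chosen, num_nodes = DistributedType.DDP_SHARDED, 2
    assert num_nodes > 1 and chosen not in _ddp             # old guard fires: spurious error
    assert not (num_nodes > 1 and chosen not in DDP_LIKE)   # new guard passes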
tests/accelerators/test_accelerator_connector.py
@@ -28,10 +28,13 @@ from pytorch_lightning.plugins import (
     DDPPlugin,
     DDPShardedPlugin,
     DDPSpawnPlugin,
+    DDPSpawnShardedPlugin,
+    DeepSpeedPlugin,
     PrecisionPlugin,
     SingleDevicePlugin,
 )
 from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
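The `_DEEPSPEED_AVAILABLE` import lets the new test below skip the DeepSpeed case on machines without the package. Its definition lives in `pytorch_lightning.utilities` and is not part of this diff; a minimal sketch of the usual availability-flag pattern, not the library's exact code:

    import importlib.util

    # True only when the `deepspeed` package is importable in this environment.
    _DEEPSPEED_AVAILABLE = importlib.util.find_spec("deepspeed") is not None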
|
||||
|
@@ -415,3 +418,26 @@ def test_plugin_accelerator_choice(accelerator, plugin):
 
     trainer = Trainer(plugins=plugin, num_processes=2)
     assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
+
+
+@pytest.mark.parametrize(["accelerator", "plugin"], [
+    ('ddp', DDPPlugin),
+    ('ddp_spawn', DDPSpawnPlugin),
+    ('ddp_sharded', DDPShardedPlugin),
+    ('ddp_sharded_spawn', DDPSpawnShardedPlugin),
+    pytest.param(
+        'deepspeed',
+        DeepSpeedPlugin,
+        marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+    ),
+])
+@mock.patch('torch.cuda.is_available', return_value=True)
+@mock.patch('torch.cuda.device_count', return_value=2)
+def test_accelerator_choice_multi_node_gpu(mock_device_count, mock_is_available, accelerator, plugin, tmpdir):
+    trainer = Trainer(
+        accelerator=accelerator,
+        default_root_dir=tmpdir,
+        num_nodes=2,
+        gpus=2,
+    )
+    assert isinstance(trainer.training_type_plugin, plugin)
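The stacked `mock.patch` decorators make `torch.cuda` report a two-GPU machine, so the multi-node accelerator choice can be exercised on CPU-only CI. Note that `mock.patch` decorators apply bottom-up, so the lowest one (`device_count`) supplies the first mock argument. A minimal standalone illustration of the same trick, assuming `torch` is installed:

    from unittest import mock

    import torch

    # Inside the patches, any caller querying CUDA sees two available GPUs.
    with mock.patch('torch.cuda.is_available', return_value=True), \
            mock.patch('torch.cuda.device_count', return_value=2):
        assert torch.cuda.is_available()
        assert torch.cuda.device_count() == 2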