diff --git a/CHANGELOG.md b/CHANGELOG.md index 899e79ffae..6ff1ded491 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -122,6 +122,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/PyTorchLightning/pytorch-lightning/pull/6380)) +- Fixed DeepSpeed additional memory use on rank 0 when default device not set early enough ([#6460](https://github.com/PyTorchLightning/pytorch-lightning/pull/6460)) + + - Fixed LightningModule `all_gather` on cpu tensors ([#6416](https://github.com/PyTorchLightning/pytorch-lightning/pull/6416)) diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py index a481c0c2e2..b54155d60e 100644 --- a/pytorch_lightning/plugins/training_type/deepspeed.py +++ b/pytorch_lightning/plugins/training_type/deepspeed.py @@ -231,6 +231,8 @@ class DeepSpeedPlugin(DDPPlugin): return optimizer, scheduler, optimizer_frequencies def _initialize_deepspeed_train(self, model): + if self.on_gpu: + torch.cuda.set_device(self.root_device) optimizer, lightning_scheduler, optimizer_frequencies = None, None, None if "optimizer" not in self.config: rank_zero_info(