From 1c013b43e049d423184323313014774738827c90 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Wed, 10 Mar 2021 16:29:37 +0000
Subject: [PATCH] [Fix] Ensure we set the default device before initializing
 deepspeed (#6460)

* Ensure we set the default device before initializing deepspeed

* Add CHANGELOG.md

* Update pytorch_lightning/plugins/training_type/deepspeed.py

Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com>
---
 CHANGELOG.md                                          | 3 +++
 pytorch_lightning/plugins/training_type/deepspeed.py  | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 899e79ffae..6ff1ded491 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -122,6 +122,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/PyTorchLightning/pytorch-lightning/pull/6380))
 
 
+- Fixed DeepSpeed additional memory use on rank 0 when default device not set early enough ([#6460](https://github.com/PyTorchLightning/pytorch-lightning/pull/6460))
+
+
 - Fixed LightningModule `all_gather` on cpu tensors ([#6416](https://github.com/PyTorchLightning/pytorch-lightning/pull/6416))
 
 
diff --git a/pytorch_lightning/plugins/training_type/deepspeed.py b/pytorch_lightning/plugins/training_type/deepspeed.py
index a481c0c2e2..b54155d60e 100644
--- a/pytorch_lightning/plugins/training_type/deepspeed.py
+++ b/pytorch_lightning/plugins/training_type/deepspeed.py
@@ -231,6 +231,8 @@ class DeepSpeedPlugin(DDPPlugin):
         return optimizer, scheduler, optimizer_frequencies
 
     def _initialize_deepspeed_train(self, model):
+        if self.on_gpu:
+            torch.cuda.set_device(self.root_device)
         optimizer, lightning_scheduler, optimizer_frequencies = None, None, None
         if "optimizer" not in self.config:
             rank_zero_info(
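
For context, the patch pins each process to its own GPU before DeepSpeed allocates anything. Without this, CUDA work issued by other ranks can land on `cuda:0` by default, so rank 0 accumulates extra memory. Below is a minimal standalone sketch of the same pattern; it is not part of the patch, and `setup_rank` / the single-process demo are hypothetical names for illustration only:

```python
# Illustrative sketch (not from the patch): pin this process to its own GPU
# before any library (e.g. DeepSpeed) issues CUDA calls. If set_device is
# skipped, default CUDA allocations from every rank fall on device 0.
import torch


def setup_rank(local_rank: int) -> torch.device:
    """Set the default CUDA device for this process, mirroring the fix
    added at the top of _initialize_deepspeed_train."""
    device = torch.device("cuda", local_rank)
    torch.cuda.set_device(device)
    return device


if __name__ == "__main__":
    if torch.cuda.is_available():
        device = setup_rank(local_rank=0)  # hypothetical single-process demo
        # With the default device set, this lands on `device`, not cuda:0.
        x = torch.ones(1, device="cuda")
        print(x.device)
```

The placement matters: the call happens at the start of `_initialize_deepspeed_train`, before the optimizer/scheduler setup and `deepspeed.initialize`, so all subsequent allocations on each rank target that rank's `root_device`.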