From f5b513df92966c934f539c6d9f3f6978ffcb7d04 Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Wed, 13 Nov 2024 16:40:19 +0100 Subject: [PATCH 1/3] temporarily disable tpu from required checks (#20417) --- .github/checkgroup.yml | 17 +++++++++-------- .../{tpu-tests.yml => tpu-tests.yml.disabled} | 0 2 files changed, 9 insertions(+), 8 deletions(-) rename .github/workflows/{tpu-tests.yml => tpu-tests.yml.disabled} (100%) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 8e92842b48..6c7c6aa5f7 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -89,14 +89,15 @@ subprojects: checks: - "lightning.Benchmarks" - - id: "pytorch-lightning: TPU workflow" - paths: - # tpu CI availability is very limited, so we only require tpu tests - # to pass when their configurations are modified - - ".github/workflows/tpu-tests.yml" - - "tests/tests_pytorch/run_tpu_tests.sh" - checks: - - "test-on-tpus (pytorch, pjrt, v4-8)" + # Temporarily disabled + # - id: "pytorch-lightning: TPU workflow" + # paths: + # # tpu CI availability is very limited, so we only require tpu tests + # # to pass when their configurations are modified + # - ".github/workflows/tpu-tests.yml" + # - "tests/tests_pytorch/run_tpu_tests.sh" + # checks: + # - "test-on-tpus (pytorch, pjrt, v4-8)" - id: "fabric: Docs" paths: diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml.disabled similarity index 100% rename from .github/workflows/tpu-tests.yml rename to .github/workflows/tpu-tests.yml.disabled From e1b172c62e4360d487c1728058a3b83a30a04042 Mon Sep 17 00:00:00 2001 From: Eric Cousineau Date: Wed, 13 Nov 2024 14:31:11 -0500 Subject: [PATCH 2/3] Profile `Trainer.save_checkpoint` (#20405) --- src/lightning/pytorch/trainer/trainer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/lightning/pytorch/trainer/trainer.py b/src/lightning/pytorch/trainer/trainer.py index 406f686efe..23db90fd45 100644 --- a/src/lightning/pytorch/trainer/trainer.py +++ b/src/lightning/pytorch/trainer/trainer.py @@ -1361,9 +1361,10 @@ class Trainer: "Saving a checkpoint is only possible if a model is attached to the Trainer. Did you call" " `Trainer.save_checkpoint()` before calling `Trainer.{fit,validate,test,predict}`?" ) - checkpoint = self._checkpoint_connector.dump_checkpoint(weights_only) - self.strategy.save_checkpoint(checkpoint, filepath, storage_options=storage_options) - self.strategy.barrier("Trainer.save_checkpoint") + with self.profiler.profile("save_checkpoint"): + checkpoint = self._checkpoint_connector.dump_checkpoint(weights_only) + self.strategy.save_checkpoint(checkpoint, filepath, storage_options=storage_options) + self.strategy.barrier("Trainer.save_checkpoint") """ State properties From 20d19d2f5728f7049272f2db77a9748ff4cf5ccd Mon Sep 17 00:00:00 2001 From: Alan Chu <30797645+chualanagit@users.noreply.github.com> Date: Wed, 13 Nov 2024 11:42:14 -0800 Subject: [PATCH 3/3] Remove `List[int]` as input type for Trainer when `accelerator="cpu"` (#20399) Co-authored-by: Alan Chu Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --- src/lightning/fabric/accelerators/cpu.py | 8 ++++---- src/lightning/pytorch/accelerators/cpu.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/lightning/fabric/accelerators/cpu.py b/src/lightning/fabric/accelerators/cpu.py index 1bcec1b2ac..0334210ecd 100644 --- a/src/lightning/fabric/accelerators/cpu.py +++ b/src/lightning/fabric/accelerators/cpu.py @@ -39,13 +39,13 @@ class CPUAccelerator(Accelerator): @staticmethod @override - def parse_devices(devices: Union[int, str, List[int]]) -> int: + def parse_devices(devices: Union[int, str]) -> int: """Accelerator device parsing logic.""" return _parse_cpu_cores(devices) @staticmethod @override - def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]: + def get_parallel_devices(devices: Union[int, str]) -> List[torch.device]: """Gets parallel devices for the Accelerator.""" devices = _parse_cpu_cores(devices) return [torch.device("cpu")] * devices @@ -72,12 +72,12 @@ class CPUAccelerator(Accelerator): ) -def _parse_cpu_cores(cpu_cores: Union[int, str, List[int]]) -> int: +def _parse_cpu_cores(cpu_cores: Union[int, str]) -> int: """Parses the cpu_cores given in the format as accepted by the ``devices`` argument in the :class:`~lightning.pytorch.trainer.trainer.Trainer`. Args: - cpu_cores: An int > 0. + cpu_cores: An int > 0 or a string that can be converted to an int > 0. Returns: An int representing the number of processes diff --git a/src/lightning/pytorch/accelerators/cpu.py b/src/lightning/pytorch/accelerators/cpu.py index 735312b363..a85a959ab6 100644 --- a/src/lightning/pytorch/accelerators/cpu.py +++ b/src/lightning/pytorch/accelerators/cpu.py @@ -48,13 +48,13 @@ class CPUAccelerator(Accelerator): @staticmethod @override - def parse_devices(devices: Union[int, str, List[int]]) -> int: + def parse_devices(devices: Union[int, str]) -> int: """Accelerator device parsing logic.""" return _parse_cpu_cores(devices) @staticmethod @override - def get_parallel_devices(devices: Union[int, str, List[int]]) -> List[torch.device]: + def get_parallel_devices(devices: Union[int, str]) -> List[torch.device]: """Gets parallel devices for the Accelerator.""" devices = _parse_cpu_cores(devices) return [torch.device("cpu")] * devices