diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
index 8e92842b48..6c7c6aa5f7 100644
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -89,14 +89,15 @@ subprojects:
     checks:
       - "lightning.Benchmarks"
 
-  - id: "pytorch-lightning: TPU workflow"
-    paths:
-      # tpu CI availability is very limited, so we only require tpu tests
-      # to pass when their configurations are modified
-      - ".github/workflows/tpu-tests.yml"
-      - "tests/tests_pytorch/run_tpu_tests.sh"
-    checks:
-      - "test-on-tpus (pytorch, pjrt, v4-8)"
+  # Temporarily disabled
+  # - id: "pytorch-lightning: TPU workflow"
+  #   paths:
+  #     # tpu CI availability is very limited, so we only require tpu tests
+  #     # to pass when their configurations are modified
+  #     - ".github/workflows/tpu-tests.yml"
+  #     - "tests/tests_pytorch/run_tpu_tests.sh"
+  #   checks:
+  #     - "test-on-tpus (pytorch, pjrt, v4-8)"
 
   - id: "fabric: Docs"
     paths:
diff --git a/.github/workflows/tpu-tests.yml b/.github/workflows/tpu-tests.yml.disabled
similarity index 100%
rename from .github/workflows/tpu-tests.yml
rename to .github/workflows/tpu-tests.yml.disabled
diff --git a/src/lightning/fabric/accelerators/cpu.py b/src/lightning/fabric/accelerators/cpu.py
index a2be8a44b1..0334210ecd 100644
--- a/src/lightning/fabric/accelerators/cpu.py
+++ b/src/lightning/fabric/accelerators/cpu.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Union
+from typing import List, Union
 
 import torch
 from typing_extensions import override
@@ -39,13 +39,13 @@ class CPUAccelerator(Accelerator):
 
     @staticmethod
     @override
-    def parse_devices(devices: Union[int, str, list[int]]) -> int:
+    def parse_devices(devices: Union[int, str]) -> int:
         """Accelerator device parsing logic."""
         return _parse_cpu_cores(devices)
 
     @staticmethod
     @override
-    def get_parallel_devices(devices: Union[int, str, list[int]]) -> list[torch.device]:
+    def get_parallel_devices(devices: Union[int, str]) -> List[torch.device]:
         """Gets parallel devices for the Accelerator."""
         devices = _parse_cpu_cores(devices)
         return [torch.device("cpu")] * devices
@@ -72,12 +72,12 @@ class CPUAccelerator(Accelerator):
     )
 
 
-def _parse_cpu_cores(cpu_cores: Union[int, str, list[int]]) -> int:
+def _parse_cpu_cores(cpu_cores: Union[int, str]) -> int:
     """Parses the cpu_cores given in the format as accepted by the ``devices`` argument in the
     :class:`~lightning.pytorch.trainer.trainer.Trainer`.
 
     Args:
-        cpu_cores: An int > 0.
+        cpu_cores: An int > 0 or a string that can be converted to an int > 0.
 
     Returns:
         An int representing the number of processes
diff --git a/src/lightning/pytorch/accelerators/cpu.py b/src/lightning/pytorch/accelerators/cpu.py
index 177514232f..a85a959ab6 100644
--- a/src/lightning/pytorch/accelerators/cpu.py
+++ b/src/lightning/pytorch/accelerators/cpu.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Any, Union
+from typing import Any, Dict, List, Union
 
 import torch
 from lightning_utilities.core.imports import RequirementCache
@@ -38,7 +38,7 @@ class CPUAccelerator(Accelerator):
             raise MisconfigurationException(f"Device should be CPU, got {device} instead.")
 
     @override
-    def get_device_stats(self, device: _DEVICE) -> dict[str, Any]:
+    def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]:
         """Get CPU stats from ``psutil`` package."""
         return get_cpu_stats()
 
@@ -48,13 +48,13 @@ class CPUAccelerator(Accelerator):
 
     @staticmethod
     @override
-    def parse_devices(devices: Union[int, str, list[int]]) -> int:
+    def parse_devices(devices: Union[int, str]) -> int:
         """Accelerator device parsing logic."""
         return _parse_cpu_cores(devices)
 
     @staticmethod
     @override
-    def get_parallel_devices(devices: Union[int, str, list[int]]) -> list[torch.device]:
+    def get_parallel_devices(devices: Union[int, str]) -> List[torch.device]:
         """Gets parallel devices for the Accelerator."""
         devices = _parse_cpu_cores(devices)
         return [torch.device("cpu")] * devices
@@ -89,7 +89,7 @@ _CPU_SWAP_PERCENT = "cpu_swap_percent"
 _PSUTIL_AVAILABLE = RequirementCache("psutil")
 
 
-def get_cpu_stats() -> dict[str, float]:
+def get_cpu_stats() -> Dict[str, float]:
     if not _PSUTIL_AVAILABLE:
         raise ModuleNotFoundError(
             f"Fetching CPU device stats requires `psutil` to be installed. {str(_PSUTIL_AVAILABLE)}"
diff --git a/src/lightning/pytorch/trainer/trainer.py b/src/lightning/pytorch/trainer/trainer.py
index 4531600360..0509f28acb 100644
--- a/src/lightning/pytorch/trainer/trainer.py
+++ b/src/lightning/pytorch/trainer/trainer.py
@@ -1362,9 +1362,10 @@ class Trainer:
                 "Saving a checkpoint is only possible if a model is attached to the Trainer. Did you call"
                 " `Trainer.save_checkpoint()` before calling `Trainer.{fit,validate,test,predict}`?"
             )
-        checkpoint = self._checkpoint_connector.dump_checkpoint(weights_only)
-        self.strategy.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
-        self.strategy.barrier("Trainer.save_checkpoint")
+        with self.profiler.profile("save_checkpoint"):
+            checkpoint = self._checkpoint_connector.dump_checkpoint(weights_only)
+            self.strategy.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
+            self.strategy.barrier("Trainer.save_checkpoint")
 
     """
     State properties
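
Note on the CPU accelerator hunks: the `devices` annotation is narrowed from `Union[int, str, list[int]]` to `Union[int, str]`, matching the updated docstring ("an int > 0 or a string that can be converted to an int > 0"). A minimal sketch of the resulting contract, assuming the `lightning` package from this branch is importable; `_parse_cpu_cores` and `CPUAccelerator.get_parallel_devices` are the names shown in the diff above.

# Sketch of the narrowed `devices` contract for the CPU accelerator.
# Assumes the patched `lightning` package is installed.
from lightning.fabric.accelerators.cpu import CPUAccelerator, _parse_cpu_cores

assert _parse_cpu_cores(4) == 4    # an int > 0 is returned as-is
assert _parse_cpu_cores("4") == 4  # a numeric string is converted to int

# get_parallel_devices replicates torch.device("cpu") once per parsed core
devices = CPUAccelerator.get_parallel_devices("2")
assert [d.type for d in devices] == ["cpu", "cpu"]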
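
Note on the trainer.py hunk: `Trainer.save_checkpoint` is now wrapped in `self.profiler.profile("save_checkpoint")`, so a configured profiler should record checkpoint saving as its own action. A sketch of the expected effect; `BoringModel` is Lightning's demo model, and the checkpoint path here is an illustrative placeholder, not part of the diff.

# Sketch: with the profiler context added above, checkpoint saving should
# appear in the profiler report as a "save_checkpoint" action.
from lightning.pytorch import Trainer
from lightning.pytorch.demos.boring_classes import BoringModel

trainer = Trainer(
    max_epochs=1,
    limit_train_batches=2,
    limit_val_batches=0,
    profiler="simple",  # SimpleProfiler aggregates durations per action
    logger=False,
    enable_checkpointing=False,
)
trainer.fit(BoringModel())
trainer.save_checkpoint("demo.ckpt")  # now timed via profiler.profile(...)
print(trainer.profiler.summary())     # expect a "save_checkpoint" row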