Merge branch 'master' into bump/python_3.9+

Jirka B 2024-11-13 21:09:58 +01:00
commit 4f221638d3
5 changed files with 23 additions and 21 deletions


@@ -89,14 +89,15 @@ subprojects:
checks:
- "lightning.Benchmarks"
- id: "pytorch-lightning: TPU workflow"
paths:
# tpu CI availability is very limited, so we only require tpu tests
# to pass when their configurations are modified
- ".github/workflows/tpu-tests.yml"
- "tests/tests_pytorch/run_tpu_tests.sh"
checks:
- "test-on-tpus (pytorch, pjrt, v4-8)"
# Temporarily disabled
# - id: "pytorch-lightning: TPU workflow"
# paths:
# # tpu CI availability is very limited, so we only require tpu tests
# # to pass when their configurations are modified
# - ".github/workflows/tpu-tests.yml"
# - "tests/tests_pytorch/run_tpu_tests.sh"
# checks:
# - "test-on-tpus (pytorch, pjrt, v4-8)"
- id: "fabric: Docs"
paths:
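For context on the hunk above: each `subprojects` entry maps file-path patterns to the CI checks that become required when a PR touches those paths, which is why the inline comment says TPU tests are only required when their configuration changes. Below is a minimal, hypothetical sketch of that path-to-checks matching; the `required_checks` helper is an illustration, not the actual checkgroup tooling.

```python
# Hypothetical sketch of path-conditional required checks; the real
# checkgroup tooling that consumes this YAML may behave differently.
from fnmatch import fnmatch

SUBPROJECTS = [
    {
        "id": "pytorch-lightning: TPU workflow",
        "paths": [".github/workflows/tpu-tests.yml", "tests/tests_pytorch/run_tpu_tests.sh"],
        "checks": ["test-on-tpus (pytorch, pjrt, v4-8)"],
    },
]


def required_checks(changed_files: list[str]) -> set[str]:
    """Return the checks that must pass for a PR touching ``changed_files``."""
    required: set[str] = set()
    for subproject in SUBPROJECTS:
        if any(fnmatch(path, pattern) for path in changed_files for pattern in subproject["paths"]):
            required.update(subproject["checks"])
    return required


print(required_checks([".github/workflows/tpu-tests.yml"]))  # {'test-on-tpus (pytorch, pjrt, v4-8)'}
print(required_checks(["README.md"]))                         # set() -> TPU check not required
```

With the TPU entry commented out in the config, no changed path can make the TPU check required, which is the effect of "Temporarily disabled".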


@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Union
from typing import List, Union
import torch
from typing_extensions import override
@@ -39,13 +39,13 @@ class CPUAccelerator(Accelerator):
@staticmethod
@override
def parse_devices(devices: Union[int, str, list[int]]) -> int:
def parse_devices(devices: Union[int, str]) -> int:
"""Accelerator device parsing logic."""
return _parse_cpu_cores(devices)
@staticmethod
@override
def get_parallel_devices(devices: Union[int, str, list[int]]) -> list[torch.device]:
def get_parallel_devices(devices: Union[int, str]) -> List[torch.device]:
"""Gets parallel devices for the Accelerator."""
devices = _parse_cpu_cores(devices)
return [torch.device("cpu")] * devices
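Context for the signature churn in this hunk: on Python 3.9+ (PEP 585), built-in containers such as `list` and `dict` can be parameterized directly in annotations, so the `List`/`Dict` imports from `typing` are only needed on older interpreters. A small illustration with hypothetical function names:

```python
from typing import List, Union  # ``List`` is only needed for the pre-3.9 spelling


# Pre-3.9 spelling: container generics imported from ``typing``.
def cpu_devices_old(devices: Union[int, str]) -> List[str]:
    return ["cpu"] * int(devices)


# Python 3.9+ spelling (PEP 585): builtin ``list`` is subscriptable directly.
def cpu_devices_new(devices: Union[int, str, list[int]]) -> list[str]:
    count = len(devices) if isinstance(devices, list) else int(devices)
    return ["cpu"] * count


assert cpu_devices_old("2") == cpu_devices_new(2) == ["cpu", "cpu"]
```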
@@ -72,12 +72,12 @@ class CPUAccelerator(Accelerator):
)
def _parse_cpu_cores(cpu_cores: Union[int, str, list[int]]) -> int:
def _parse_cpu_cores(cpu_cores: Union[int, str]) -> int:
"""Parses the cpu_cores given in the format as accepted by the ``devices`` argument in the
:class:`~lightning.pytorch.trainer.trainer.Trainer`.
Args:
cpu_cores: An int > 0.
cpu_cores: An int > 0 or a string that can be converted to an int > 0.
Returns:
An int representing the number of processes
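The docstring update above reflects that the value may be an int > 0 or a string that converts to one. A minimal sketch of such parsing, assuming simplified error handling (not Lightning's actual `_parse_cpu_cores`):

```python
from typing import Union


def parse_cpu_cores(cpu_cores: Union[int, str]) -> int:
    # Sketch only: accept an int > 0, or a string that converts to an int > 0;
    # Lightning's real ``_parse_cpu_cores`` may use different error types and messages.
    if isinstance(cpu_cores, str):
        if not cpu_cores.strip().isdigit():
            raise TypeError(f"CPU cores must be a positive integer, got {cpu_cores!r}")
        cpu_cores = int(cpu_cores)
    if not isinstance(cpu_cores, int) or cpu_cores <= 0:
        raise TypeError(f"CPU cores must be a positive integer, got {cpu_cores!r}")
    return cpu_cores


assert parse_cpu_cores(4) == 4
assert parse_cpu_cores("8") == 8
```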


@@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Union
from typing import Any, Dict, List, Union
import torch
from lightning_utilities.core.imports import RequirementCache
@@ -38,7 +38,7 @@ class CPUAccelerator(Accelerator):
raise MisconfigurationException(f"Device should be CPU, got {device} instead.")
@override
def get_device_stats(self, device: _DEVICE) -> dict[str, Any]:
def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]:
"""Get CPU stats from ``psutil`` package."""
return get_cpu_stats()
@@ -48,13 +48,13 @@ class CPUAccelerator(Accelerator):
@staticmethod
@override
def parse_devices(devices: Union[int, str, list[int]]) -> int:
def parse_devices(devices: Union[int, str]) -> int:
"""Accelerator device parsing logic."""
return _parse_cpu_cores(devices)
@staticmethod
@override
def get_parallel_devices(devices: Union[int, str, list[int]]) -> list[torch.device]:
def get_parallel_devices(devices: Union[int, str]) -> List[torch.device]:
"""Gets parallel devices for the Accelerator."""
devices = _parse_cpu_cores(devices)
return [torch.device("cpu")] * devices
@@ -89,7 +89,7 @@ _CPU_SWAP_PERCENT = "cpu_swap_percent"
_PSUTIL_AVAILABLE = RequirementCache("psutil")
def get_cpu_stats() -> dict[str, float]:
def get_cpu_stats() -> Dict[str, float]:
if not _PSUTIL_AVAILABLE:
raise ModuleNotFoundError(
f"Fetching CPU device stats requires `psutil` to be installed. {str(_PSUTIL_AVAILABLE)}"


@@ -1362,9 +1362,10 @@ class Trainer:
"Saving a checkpoint is only possible if a model is attached to the Trainer. Did you call"
" `Trainer.save_checkpoint()` before calling `Trainer.{fit,validate,test,predict}`?"
)
checkpoint = self._checkpoint_connector.dump_checkpoint(weights_only)
self.strategy.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
self.strategy.barrier("Trainer.save_checkpoint")
with self.profiler.profile("save_checkpoint"):
checkpoint = self._checkpoint_connector.dump_checkpoint(weights_only)
self.strategy.save_checkpoint(checkpoint, filepath, storage_options=storage_options)
self.strategy.barrier("Trainer.save_checkpoint")
"""
State properties
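The Trainer hunk above wraps checkpoint dumping, the strategy save, and the barrier in a single `self.profiler.profile("save_checkpoint")` context, so the whole save path is attributed to one profiler action. A toy sketch of that context-manager profiling pattern (not Lightning's `Profiler` API):

```python
# Toy illustration of the context-manager profiling pattern used above;
# Lightning's profilers offer richer reporting than this sketch.
import time
from collections import defaultdict
from contextlib import contextmanager


class TinyProfiler:
    def __init__(self) -> None:
        self.durations: dict[str, float] = defaultdict(float)

    @contextmanager
    def profile(self, action_name: str):
        start = time.perf_counter()
        try:
            yield
        finally:
            # Everything inside the ``with`` block is attributed to ``action_name``.
            self.durations[action_name] += time.perf_counter() - start


profiler = TinyProfiler()
with profiler.profile("save_checkpoint"):
    time.sleep(0.05)  # stand-in for dump_checkpoint + strategy.save_checkpoint + barrier
print(f"save_checkpoint took {profiler.durations['save_checkpoint']:.3f}s")
```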