diff --git a/.azure/gpu-benchmarks.yml b/.azure/gpu-benchmarks.yml index 111589945e..24b78542a7 100644 --- a/.azure/gpu-benchmarks.yml +++ b/.azure/gpu-benchmarks.yml @@ -46,7 +46,7 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.4-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0" options: "--gpus=all --shm-size=32g" strategy: matrix: diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index e63641b8ec..ee7fe2e281 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -60,7 +60,7 @@ jobs: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "fabric" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.4-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0" PACKAGE_NAME: "lightning" workspace: clean: all diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 4605e82442..1ece70f75e 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -53,7 +53,7 @@ jobs: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "pytorch" "Lightning | latest": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.4-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.5-cuda12.1.0" PACKAGE_NAME: "lightning" pool: lit-rtx-3090 variables: diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index fa455da015..b9fcde984b 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -21,19 +21,22 @@ subprojects: checks: - "pl-cpu (macOS-13, lightning, 3.9, 2.1, oldest)" - "pl-cpu (macOS-14, lightning, 3.10, 2.1)" - - "pl-cpu (macOS-14, lightning, 3.11, 2.2)" + - "pl-cpu (macOS-14, lightning, 3.11, 2.2.2)" - "pl-cpu (macOS-14, lightning, 3.11, 2.3)" - - "pl-cpu (macOS-14, lightning, 3.12, 2.4)" + - "pl-cpu (macOS-14, lightning, 3.12, 2.4.1)" + - "pl-cpu (macOS-14, lightning, 3.12, 2.5.1)" - "pl-cpu (ubuntu-20.04, lightning, 3.9, 2.1, oldest)" - "pl-cpu (ubuntu-20.04, lightning, 3.10, 2.1)" - - "pl-cpu (ubuntu-20.04, lightning, 3.11, 2.2)" + - "pl-cpu (ubuntu-20.04, lightning, 3.11, 2.2.2)" - "pl-cpu (ubuntu-20.04, lightning, 3.11, 2.3)" - - "pl-cpu (ubuntu-20.04, lightning, 3.12, 2.4)" + - "pl-cpu (ubuntu-22.04, lightning, 3.12, 2.4.1)" + - "pl-cpu (ubuntu-22.04, lightning, 3.12, 2.5.1)" - "pl-cpu (windows-2022, lightning, 3.9, 2.1, oldest)" - "pl-cpu (windows-2022, lightning, 3.10, 2.1)" - - "pl-cpu (windows-2022, lightning, 3.11, 2.2)" + - "pl-cpu (windows-2022, lightning, 3.11, 2.2.2)" - "pl-cpu (windows-2022, lightning, 3.11, 2.3)" - - "pl-cpu (windows-2022, lightning, 3.12, 2.4)" + - "pl-cpu (windows-2022, lightning, 3.12, 2.4.1)" + - "pl-cpu (windows-2022, lightning, 3.12, 2.5.1)" - "pl-cpu (macOS-14, pytorch, 3.9, 2.1)" - "pl-cpu (ubuntu-20.04, pytorch, 3.9, 2.1)" - "pl-cpu (windows-2022, pytorch, 3.9, 2.1)" @@ -141,15 +144,17 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "build-cuda (3.11, 2.1, 12.1.0)" - - "build-cuda (3.11, 2.2, 12.1.0)" - - "build-cuda (3.11, 2.3, 12.1.0)" - - "build-cuda (3.12, 2.4, 12.1.0)" + - "build-cuda (3.10, 2.1.2, 12.1.0)" + - "build-cuda (3.11, 2.2.2, 12.1.0)" + - "build-cuda (3.11, 2.3.1, 12.1.0)" + - "build-cuda (3.11, 2.4.1, 12.1.0)" + - "build-cuda (3.12, 2.5.1, 12.1.0)" #- "build-NGC" - - "build-pl (3.11, 2.1, 12.1.0)" + - "build-pl (3.10, 2.1, 12.1.0)" - "build-pl (3.11, 2.2, 12.1.0)" - "build-pl (3.11, 2.3, 12.1.0)" - - "build-pl (3.12, 2.4, 12.1.0)" + - "build-pl (3.11, 2.4, 12.1.0)" + - "build-pl (3.12, 2.5, 12.1.0)" # SECTION: lightning_fabric @@ -168,19 +173,22 @@ subprojects: checks: - "fabric-cpu (macOS-13, lightning, 3.9, 2.1, oldest)" - "fabric-cpu (macOS-14, lightning, 3.10, 2.1)" - - "fabric-cpu (macOS-14, lightning, 3.11, 2.2)" + - "fabric-cpu (macOS-14, lightning, 3.11, 2.2.2)" - "fabric-cpu (macOS-14, lightning, 3.11, 2.3)" - - "fabric-cpu (macOS-14, lightning, 3.12, 2.4)" + - "fabric-cpu (macOS-14, lightning, 3.12, 2.4.1)" + - "fabric-cpu (macOS-14, lightning, 3.12, 2.5.1)" - "fabric-cpu (ubuntu-20.04, lightning, 3.9, 2.1, oldest)" - "fabric-cpu (ubuntu-20.04, lightning, 3.10, 2.1)" - - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.2)" + - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.2.2)" - "fabric-cpu (ubuntu-20.04, lightning, 3.11, 2.3)" - - "fabric-cpu (ubuntu-20.04, lightning, 3.12, 2.4)" + - "fabric-cpu (ubuntu-22.04, lightning, 3.12, 2.4.1)" + - "fabric-cpu (ubuntu-22.04, lightning, 3.12, 2.5.1)" - "fabric-cpu (windows-2022, lightning, 3.9, 2.1, oldest)" - "fabric-cpu (windows-2022, lightning, 3.10, 2.1)" - - "fabric-cpu (windows-2022, lightning, 3.11, 2.2)" + - "fabric-cpu (windows-2022, lightning, 3.11, 2.2.2)" - "fabric-cpu (windows-2022, lightning, 3.11, 2.3)" - - "fabric-cpu (windows-2022, lightning, 3.12, 2.4)" + - "fabric-cpu (windows-2022, lightning, 3.12, 2.4.1)" + - "fabric-cpu (windows-2022, lightning, 3.12, 2.5.1)" - "fabric-cpu (macOS-14, fabric, 3.9, 2.1)" - "fabric-cpu (ubuntu-20.04, fabric, 3.9, 2.1)" - "fabric-cpu (windows-2022, fabric, 3.9, 2.1)" diff --git a/.github/workflows/ci-tests-fabric.yml b/.github/workflows/ci-tests-fabric.yml index 7d854bbf7e..ca4dd0b845 100644 --- a/.github/workflows/ci-tests-fabric.yml +++ b/.github/workflows/ci-tests-fabric.yml @@ -43,15 +43,18 @@ jobs: - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" } + - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" } + - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" } # only run PyTorch latest with Python latest, use Fabric scope to limit dependency issues - { os: "macOS-13", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.1" } - { os: "ubuntu-22.04", pkg-name: "fabric", python-version: "3.10", pytorch-version: "2.1" } diff --git a/.github/workflows/ci-tests-pytorch.yml b/.github/workflows/ci-tests-pytorch.yml index a9d7dfdf55..0c7deddbe5 100644 --- a/.github/workflows/ci-tests-pytorch.yml +++ b/.github/workflows/ci-tests-pytorch.yml @@ -47,15 +47,18 @@ jobs: - { os: "macOS-14", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.10", pytorch-version: "2.1" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } + - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.2.2" } - { os: "macOS-14", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - { os: "windows-2022", pkg-name: "lightning", python-version: "3.11", pytorch-version: "2.3" } - - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4" } - - { os: "ubuntu-20.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4" } - - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" } + - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.4.1" } + - { os: "macOS-14", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" } + - { os: "ubuntu-22.04", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" } + - { os: "windows-2022", pkg-name: "lightning", python-version: "3.12", pytorch-version: "2.5.1" } # only run PyTorch latest with Python latest, use PyTorch scope to limit dependency issues - { os: "macOS-13", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.1" } - { os: "ubuntu-22.04", pkg-name: "pytorch", python-version: "3.10", pytorch-version: "2.1" } diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 6df2b8cbb7..09ae3adc45 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -43,10 +43,11 @@ jobs: include: # We only release one docker image per PyTorch version. # Make sure the matrix here matches the one below. - - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } + - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" } - - { python_version: "3.12", pytorch_version: "2.4", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.4", cuda_version: "12.1.0" } + - { python_version: "3.12", pytorch_version: "2.5", cuda_version: "12.1.0" } steps: - uses: actions/checkout@v4 with: @@ -103,10 +104,11 @@ jobs: include: # These are the base images for PL release docker images. # Make sure the matrix here matches the one above. - - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } - - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" } - - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" } - - { python_version: "3.12", pytorch_version: "2.4", cuda_version: "12.1.0" } + - { python_version: "3.10", pytorch_version: "2.1.2", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.2.2", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.3.1", cuda_version: "12.1.0" } + - { python_version: "3.11", pytorch_version: "2.4.1", cuda_version: "12.1.0" } + - { python_version: "3.12", pytorch_version: "2.5.1", cuda_version: "12.1.0" } steps: - uses: actions/checkout@v4 - uses: docker/setup-buildx-action@v3 @@ -115,6 +117,12 @@ jobs: with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} + + - name: shorten Torch version + run: | + # convert 1.10.2 to 1.10 + pt_version=$(echo ${{ matrix.pytorch_version }} | cut -d. -f1,2) + echo "PT_VERSION=$pt_version" >> $GITHUB_ENV - uses: docker/build-push-action@v6 with: build-args: | @@ -123,7 +131,7 @@ jobs: CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-cuda/Dockerfile push: ${{ env.PUSH_NIGHTLY }} - tags: "pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}" + tags: "pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ env.PT_VERSION }}-cuda${{ matrix.cuda_version }}" timeout-minutes: 95 - uses: ravsamhq/notify-slack-action@v2 if: failure() && env.PUSH_NIGHTLY == 'true' diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 0a99614a46..42c055e85c 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -1,7 +1,7 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -torch >=2.1.0, <2.5.0 +torch >=2.1.0, <2.6.0 fsspec[http] >=2022.5.0, <2024.4.0 packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 diff --git a/requirements/fabric/examples.txt b/requirements/fabric/examples.txt index cb4135da24..3352db77d8 100644 --- a/requirements/fabric/examples.txt +++ b/requirements/fabric/examples.txt @@ -1,6 +1,6 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -torchvision >=0.16.0, <0.20.0 -torchmetrics >=0.10.0, <1.3.0 +torchvision >=0.16.0, <0.21.0 +torchmetrics >=0.10.0, <1.5.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/requirements/fabric/test.txt b/requirements/fabric/test.txt index 8fb9122051..2da6ae8854 100644 --- a/requirements/fabric/test.txt +++ b/requirements/fabric/test.txt @@ -7,4 +7,4 @@ pytest-rerunfailures ==12.0 pytest-random-order ==1.1.0 click ==8.1.7 tensorboardX >=2.2, <2.7.0 # min version is set by torch.onnx missing attribute -torchmetrics >=0.7.0, <1.3.0 # needed for using fixed compare_version +torchmetrics >=0.7.0, <1.5.0 # needed for using fixed compare_version diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 6ff628d7ed..94aca759c3 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -1,11 +1,11 @@ # NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment -torch >=2.1.0, <2.5.0 +torch >=2.1.0, <2.6.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 fsspec[http] >=2022.5.0, <2024.4.0 -torchmetrics >=0.7.0, <1.3.0 # needed for using fixed compare_version +torchmetrics >=0.7.0, <1.5.0 # needed for using fixed compare_version packaging >=20.0, <=23.1 typing-extensions >=4.4.0, <4.10.0 lightning-utilities >=0.10.0, <0.12.0 diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 9a6ae7e47d..2e793e0045 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment requests <2.32.0 -torchvision >=0.16.0, <0.20.0 +torchvision >=0.16.0, <0.21.0 ipython[all] <8.15.0 -torchmetrics >=0.10.0, <1.3.0 +torchmetrics >=0.10.0, <1.5.0 lightning-utilities >=0.8.0, <0.12.0 diff --git a/requirements/typing.txt b/requirements/typing.txt index 0323edfd60..71414998dd 100644 --- a/requirements/typing.txt +++ b/requirements/typing.txt @@ -1,5 +1,5 @@ mypy==1.11.0 -torch==2.4.1 +torch==2.5.1 types-Markdown types-PyYAML diff --git a/src/lightning/fabric/__init__.py b/src/lightning/fabric/__init__.py index 921d3d61e6..d675b21e5d 100644 --- a/src/lightning/fabric/__init__.py +++ b/src/lightning/fabric/__init__.py @@ -2,6 +2,7 @@ import logging import os +import sys from lightning_utilities.core.imports import package_available @@ -26,6 +27,10 @@ if not _root_logger.hasHandlers(): # https://github.com/pytorch/pytorch/issues/83973 os.environ["PYTORCH_NVML_BASED_CUDA_CHECK"] = "1" +# see https://github.com/pytorch/pytorch/issues/139990 +if sys.platform == "win32": + os.environ["USE_LIBUV"] = "0" + from lightning.fabric.fabric import Fabric # noqa: E402 from lightning.fabric.utilities.seed import seed_everything # noqa: E402 diff --git a/src/lightning/pytorch/core/module.py b/src/lightning/pytorch/core/module.py index 782fc40d92..d8374ef7ea 100644 --- a/src/lightning/pytorch/core/module.py +++ b/src/lightning/pytorch/core/module.py @@ -531,7 +531,7 @@ class LightningModule( logger=logger, on_step=on_step, on_epoch=on_epoch, - reduce_fx=reduce_fx, # type: ignore[arg-type] + reduce_fx=reduce_fx, enable_graph=enable_graph, add_dataloader_idx=add_dataloader_idx, batch_size=batch_size, @@ -1405,7 +1405,9 @@ class LightningModule( input_sample = self._apply_batch_transfer_handler(input_sample) file_path = str(file_path) if isinstance(file_path, Path) else file_path - torch.onnx.export(self, input_sample, file_path, **kwargs) + # PyTorch (2.5) declares file_path to be str | PathLike[Any] | None, but + # BytesIO does work, too. + torch.onnx.export(self, input_sample, file_path, **kwargs) # type: ignore self.train(mode) @torch.no_grad() diff --git a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py index 583105c366..62cc7844d3 100644 --- a/src/lightning/pytorch/trainer/connectors/logger_connector/result.py +++ b/src/lightning/pytorch/trainer/connectors/logger_connector/result.py @@ -351,6 +351,7 @@ class _ResultCollection(dict): return batch_size + @torch.compiler.disable def log( self, fx: str, @@ -413,6 +414,7 @@ class _ResultCollection(dict): batch_size = self._extract_batch_size(self[key], batch_size, meta) self.update_metrics(key, value, batch_size) + @torch.compiler.disable def update_metrics(self, key: str, value: _VALUE, batch_size: int) -> None: result_metric = self[key] # performance: avoid calling `__call__` to avoid the checks in `torch.nn.Module._call_impl` diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh index 0aa0bacff1..8a4d8e180d 100755 --- a/tests/run_standalone_tests.sh +++ b/tests/run_standalone_tests.sh @@ -48,6 +48,7 @@ function show_batched_output { # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail if perl -nle 'print if /error|(?