From 6165c7719ab6450c3d5e9d434705f8c75ed24ee4 Mon Sep 17 00:00:00 2001 From: awaelchli Date: Sat, 20 Jul 2024 01:47:56 +0200 Subject: [PATCH] upgrade gpu workflow --- .azure/gpu-tests-fabric.yml | 2 +- .azure/gpu-tests-pytorch.yml | 2 +- .github/checkgroup.yml | 9 ++------- .github/workflows/docker-build.yml | 15 +++++---------- .../plugins/collectives/torch_collective.py | 2 +- 5 files changed, 10 insertions(+), 20 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index 576b9c3eb3..09b68bf84c 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -63,7 +63,7 @@ jobs: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "lightning" "Lightning | future": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.4-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.4-cuda12.4.0" PACKAGE_NAME: "lightning" workspace: clean: all diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 15ce2f6ace..a45496da5e 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -56,7 +56,7 @@ jobs: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0" PACKAGE_NAME: "lightning" "Lightning | future": - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.4-cuda12.1.0" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.4-cuda12.4.0" PACKAGE_NAME: "lightning" pool: lit-rtx-3090 variables: diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index a87ea4ca5a..a44067d12d 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -138,19 +138,14 @@ subprojects: - "!*.md" - "!**/*.md" checks: - - "build-cuda (3.10, 2.1, 12.1.0)" - - "build-cuda (3.10, 2.2, 12.1.0)" - "build-cuda (3.11, 2.1, 12.1.0)" - "build-cuda (3.11, 2.2, 12.1.0)" - "build-cuda (3.11, 2.3, 12.1.0)" - - "build-cuda (3.11, 2.4, 12.1.0)" - #- "build-NGC" - - "build-pl (3.10, 2.1, 12.1.0)" - - "build-pl (3.10, 2.2, 12.1.0)" + - "build-cuda (3.11, 2.4, 12.4.0)" - "build-pl (3.11, 2.1, 12.1.0)" - "build-pl (3.11, 2.2, 12.1.0)" - "build-pl (3.11, 2.3, 12.1.0)" - - "build-pl (3.11, 2.4, 12.1.0)" + - "build-pl (3.11, 2.4, 12.4.0)" # SECTION: lightning_fabric diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 6fa9d0d64d..f20584cdb3 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -28,14 +28,14 @@ concurrency: cancel-in-progress: ${{ github.event_name == 'pull_request' }} env: - PUSH_NIGHTLY: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} + PUSH_NIGHTLY: true # ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} PUSH_RELEASE: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' }} jobs: build-pl: # the images generated by this job are not used anywhere in this repository. they are just meant to be available # for users - if: github.event.pull_request.draft == false + # if: github.event.pull_request.draft == false runs-on: ubuntu-latest strategy: fail-fast: false @@ -43,12 +43,10 @@ jobs: include: # We only release one docker image per PyTorch version. # Make sure the matrix here matches the one below. - - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" } - - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" } - - { python_version: "3.11", pytorch_version: "2.4", cuda_version: "12.1.0" } + - { python_version: "3.12", pytorch_version: "2.4", cuda_version: "12.4.0" } steps: - uses: actions/checkout@v4 with: @@ -97,7 +95,7 @@ jobs: timeout-minutes: 35 build-cuda: - if: github.event.pull_request.draft == false + # if: github.event.pull_request.draft == false runs-on: ubuntu-latest strategy: fail-fast: false @@ -105,13 +103,10 @@ jobs: include: # These are the base images for PL release docker images. # Make sure the matrix here matches the one above. - - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" } - - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" } - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" } - - { python_version: "3.11", pytorch_version: "2.4", cuda_version: "12.1.0" } - # - { python_version: "3.12", pytorch_version: "2.2", cuda_version: "12.1.0" } # todo: pending on `onnxruntime` + - { python_version: "3.11", pytorch_version: "2.4", cuda_version: "12.4.0" } steps: - uses: actions/checkout@v4 - uses: docker/setup-buildx-action@v3 diff --git a/src/lightning/fabric/plugins/collectives/torch_collective.py b/src/lightning/fabric/plugins/collectives/torch_collective.py index abba459895..1073011ac5 100644 --- a/src/lightning/fabric/plugins/collectives/torch_collective.py +++ b/src/lightning/fabric/plugins/collectives/torch_collective.py @@ -208,7 +208,7 @@ class TorchCollective(Collective): @override def _convert_to_native_op(cls, op: Union[str, ReduceOp, RedOpType]) -> Union[ReduceOp, RedOpType]: # `ReduceOp` is an empty shell for `RedOpType`, the latter being the actually returned class. - # For example, `ReduceOp.SUM` returns a `RedOpType.SUM`. the only exception is `RedOpType.PREMUL_SUM` where + # For example, `ReduceOp.SUM` returns a `RedOpType.SUM`. The only exception is `RedOpType.PREMUL_SUM` where # `ReduceOp` is still the desired class, but it's created via a special `_make_nccl_premul_sum` function if isinstance(op, (ReduceOp, RedOpType)): return op