upgrade gpu workflow

2024-07-20 01:47:56 +02:00 · 2024-07-20 01:47:56 +02:00 · 6165c7719a
parent 1cd774197d
commit 6165c7719a
5 changed files with 10 additions and 20 deletions
--- a/.azure/gpu-tests-fabric.yml
+++ b/.azure/gpu-tests-fabric.yml
@@ -63,7 +63,7 @@ jobs:
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
          PACKAGE_NAME: "lightning"
        "Lightning | future":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.4-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.4-cuda12.4.0"
          PACKAGE_NAME: "lightning"
    workspace:
      clean: all
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -56,7 +56,7 @@ jobs:
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.3-cuda12.1.0"
          PACKAGE_NAME: "lightning"
        "Lightning | future":
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.11-torch2.4-cuda12.1.0"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.12-torch2.4-cuda12.4.0"
          PACKAGE_NAME: "lightning"
    pool: lit-rtx-3090
    variables:
--- a/.github/checkgroup.yml
+++ b/.github/checkgroup.yml
@@ -138,19 +138,14 @@ subprojects:
      - "!*.md"
      - "!**/*.md"
    checks:
-      - "build-cuda (3.10, 2.1, 12.1.0)"
-      - "build-cuda (3.10, 2.2, 12.1.0)"
      - "build-cuda (3.11, 2.1, 12.1.0)"
      - "build-cuda (3.11, 2.2, 12.1.0)"
      - "build-cuda (3.11, 2.3, 12.1.0)"
-      - "build-cuda (3.11, 2.4, 12.1.0)"
-      #- "build-NGC"
-      - "build-pl (3.10, 2.1, 12.1.0)"
-      - "build-pl (3.10, 2.2, 12.1.0)"
+      - "build-cuda (3.11, 2.4, 12.4.0)"
      - "build-pl (3.11, 2.1, 12.1.0)"
      - "build-pl (3.11, 2.2, 12.1.0)"
      - "build-pl (3.11, 2.3, 12.1.0)"
-      - "build-pl (3.11, 2.4, 12.1.0)"
+      - "build-pl (3.11, 2.4, 12.4.0)"

  # SECTION: lightning_fabric

--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -28,14 +28,14 @@ concurrency:
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

 env:
-  PUSH_NIGHTLY: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
+  PUSH_NIGHTLY: true  # ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
  PUSH_RELEASE: ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' }}

 jobs:
  build-pl:
    # the images generated by this job are not used anywhere in this repository. they are just meant to be available
    # for users
-    if: github.event.pull_request.draft == false
+    #    if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
@@ -43,12 +43,10 @@ jobs:
        include:
          # We only release one docker image per PyTorch version.
          # Make sure the matrix here matches the one below.
-          - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" }
-          - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" }
-          - { python_version: "3.11", pytorch_version: "2.4", cuda_version: "12.1.0" }
+          - { python_version: "3.12", pytorch_version: "2.4", cuda_version: "12.4.0" }
    steps:
      - uses: actions/checkout@v4
        with:
@@ -97,7 +95,7 @@ jobs:
        timeout-minutes: 35

  build-cuda:
-    if: github.event.pull_request.draft == false
+    # if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
@@ -105,13 +103,10 @@ jobs:
        include:
          # These are the base images for PL release docker images.
          # Make sure the matrix here matches the one above.
-          - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" }
-          - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" }
-          - { python_version: "3.11", pytorch_version: "2.4", cuda_version: "12.1.0" }
-          # - { python_version: "3.12", pytorch_version: "2.2", cuda_version: "12.1.0" }  # todo: pending on `onnxruntime`
+          - { python_version: "3.11", pytorch_version: "2.4", cuda_version: "12.4.0" }
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
--- a/src/lightning/fabric/plugins/collectives/torch_collective.py
+++ b/src/lightning/fabric/plugins/collectives/torch_collective.py
@@ -208,7 +208,7 @@ class TorchCollective(Collective):
    @override
    def _convert_to_native_op(cls, op: Union[str, ReduceOp, RedOpType]) -> Union[ReduceOp, RedOpType]:
        # `ReduceOp` is an empty shell for `RedOpType`, the latter being the actually returned class.
-        # For example, `ReduceOp.SUM` returns a `RedOpType.SUM`. the only exception is `RedOpType.PREMUL_SUM` where
+        # For example, `ReduceOp.SUM` returns a `RedOpType.SUM`. The only exception is `RedOpType.PREMUL_SUM` where
        # `ReduceOp` is still the desired class, but it's created via a special `_make_nccl_premul_sum` function
        if isinstance(op, (ReduceOp, RedOpType)):
            return op