CI/CD: Add CUDA version to docker image tags (#13831)

* append cuda version to tags * revertme: push to hub * Update docker readme * Build base-conda-py3.9-torch1.12-cuda11.3.1 * Use new images in conda tests * revertme: push to hub * Revert "revertme: push to hub" This reverts commit 0f7d534b2a. * Revert "revertme: push to hub" This reverts commit 46a05fccbb. * Run conda if workflow edited * Run gpu testing if workflow edited * Use new tags in release/Dockerfile * Build base-cuda and PL release images with all combinations * Update release docker * Update conda from py3.9-torch1.12 to py3.10-torch.1.12 * Fix ubuntu version * Revert conda * revertme: push to hub * Don't build Python 3.10 for now... * Fix pl release builder * updating version contribute to the error? https://github.com/docker/buildx/issues/456 * Update actions' versions * Update slack user to notify * Don't use 11.6.0 to avoid bagua incompatibility * Don't use 11.1, and use 11.1.1 * Update .github/workflows/ci-pytorch_test-conda.yml Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> * Update trigger * Ignore artfacts from tutorials * Trim docker images to distribute * Add an image for tutorials * Update conda image 3.8x1.10 * Try different conda variants * No need to set cuda for conda jobs * Update who to notify ipu failure * Don't push * update filenaem Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com>
2022-08-10 19:37:50 +09:00 · 2022-08-10 19:37:50 +09:00 · d5f35ece72
parent ddb476d334
commit d5f35ece72
8 changed files with 87 additions and 88 deletions
--- a/.azure/gpu-benchmark.yml
+++ b/.azure/gpu-benchmark.yml
@ -28,7 +28,7 @@ jobs:
    cancelTimeoutInMinutes: "2"
    pool: azure-jirka-spot
    container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
      options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
    workspace:
      clean: all
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@ -26,7 +26,7 @@ jobs:
    strategy:
      matrix:
        'PyTorch - stable':
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
    # how long to run the job before automatically cancelling
    timeoutInMinutes: "80"
    # how much time to give 'run always even if cancelled tasks' before stopping them
@ -44,7 +44,7 @@ jobs:

    - bash: |
        CHANGED_FILES=$(git diff --name-status origin/master -- . | awk  '{print $2}')
-        FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*'
+        FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml'
        echo $CHANGED_FILES > changed_files.txt
        MATCHES=$(cat changed_files.txt | grep -E $FILTER)
        echo $MATCHES
--- a/.github/workflows/ci-pytorch-test-conda.yml
+++ b/.github/workflows/ci-pytorch-test-conda.yml
@ -22,13 +22,11 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        # nightly: add when there's a release candidate
        include:
          - {python-version: "3.8", pytorch-version: "1.9"}
          - {python-version: "3.8", pytorch-version: "1.10"}
          - {python-version: "3.9", pytorch-version: "1.11"}
          - {python-version: "3.9", pytorch-version: "1.12"}
-
    timeout-minutes: 30

    steps:
@ -45,7 +43,7 @@ jobs:
      id: skip
      shell: bash -l {0}
      run: |
-        FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
+        FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml'
        echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt
        MATCHES=$(cat changed_files.txt | grep -E $FILTER)
        echo $MATCHES
--- a/.github/workflows/cicd-pytorch-dockers.yml
+++ b/.github/workflows/cicd-pytorch-dockers.yml
@ -29,17 +29,22 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image
-        python_version: ["3.9"]
-        pytorch_version: ["1.12"]
+        include:
+          # We only release one docker image per PyTorch version.
+          # The matrix here is the same as the one in release-docker.yml.
+          - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
+            CUDA_VERSION=${{ matrix.cuda_version }}
          file: dockers/release/Dockerfile
          push: false  # pushed in release-docker.yml only when PL is released
        timeout-minutes: 50
@ -53,14 +58,14 @@ jobs:
        python_version: ["3.7"]
        xla_version: ["1.12"]
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
        if: env.PUSH_TO_HUB == 'true'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
@ -85,30 +90,31 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          # the config used in '.azure-pipelines/gpu-tests.yml'
-          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"}
-          # latest (used in Tutorials)
-          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", ubuntu_version: "20.04"}
-          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"}
-          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
+          # These are the base images for PL release docker images,
+          # so include at least all of the combinations in release-docker.yml.
+          - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
+          # Used in Lightning-AI/tutorials
+          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
        if: env.PUSH_TO_HUB == 'true'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
            CUDA_VERSION=${{ matrix.cuda_version }}
-            UBUNTU_VERSION=${{ matrix.ubuntu_version }}
          file: dockers/base-cuda/Dockerfile
          push: ${{ env.PUSH_TO_HUB }}
-          tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
+          tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
        timeout-minutes: 95
      - uses: ravsamhq/notify-slack-action@v1
        if: failure() && env.PUSH_TO_HUB == 'true'
@ -126,25 +132,23 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
-          - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"}
-          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
-          #  nightly: add when there's a release candidate
-          #  - {python_version: "3.9", pytorch_version: "1.12"}
+          - {python_version: "3.8", pytorch_version: "1.9"}
+          - {python_version: "3.8", pytorch_version: "1.10"}
+          - {python_version: "3.9", pytorch_version: "1.11"}
+          - {python_version: "3.9", pytorch_version: "1.12"}
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
        if: env.PUSH_TO_HUB == 'true'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
-            CUDA_VERSION=${{ matrix.cuda_version }}
          file: dockers/base-conda/Dockerfile
          push: ${{ env.PUSH_TO_HUB }}
          tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
@ -168,14 +172,14 @@ jobs:
          # the config used in 'dockers/ci-runner-ipu/Dockerfile'
          - {python_version: "3.9", pytorch_version: "1.9"}
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
        if: env.PUSH_TO_HUB == 'true'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
@ -184,7 +188,7 @@ jobs:
          push: ${{ env.PUSH_TO_HUB }}
          tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
        timeout-minutes: 100
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
@ -199,7 +203,7 @@ jobs:
          status: ${{ job.status }}
          token: ${{ secrets.GITHUB_TOKEN }}
          notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }}
-          message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>'  # SeanNaren
+          message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>'  # kaushikb11
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

@ -212,14 +216,14 @@ jobs:
          # the config used in 'dockers/ci-runner-hpu/Dockerfile'
          - {gaudi_version: "1.5.0", pytorch_version: "1.11.0"}
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
        if: env.PUSH_TO_HUB == 'true'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
        with:
          build-args: |
            DIST=latest
@ -243,10 +247,10 @@ jobs:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
      - name: Build Conda Docker
        # publish master/release
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v3
        with:
          file: dockers/nvidia/Dockerfile
          push: false
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@ -1,6 +1,5 @@
 name: Docker
-# https://www.docker.com/blog/first-docker-github-action-is-here
-# https://github.com/docker/build-push-action
+
 on:
  push:
    branches: [master, "release/*"]
@ -15,8 +14,12 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.7", "3.8", "3.9"]
-        pytorch_version: ["1.9", "1.10"]
+        include:
+          # We only release one docker image per PyTorch version.
+          - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
    steps:
      - name: Checkout
        uses: actions/checkout@v2
@ -32,19 +35,29 @@ jobs:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
          dockerfile: dockers/release/Dockerfile
-          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
-          tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
+          # NOTE(review): this step uses the v1-style inputs of docker/build-push-action
+          # (`dockerfile`, `build_args`), which take comma-delimited lists rather than
+          # newline-separated block scalars - confirm against the step's `uses:` version.
+          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},CUDA_VERSION=${{ matrix.cuda_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
+          # Two tags per image: the released version and a rolling "latest-*" alias, both CUDA-suffixed.
+          tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}"
        timeout-minutes: 55

      - name: Publish Latest to Docker
        uses: docker/build-push-action@v1.1.0
-        # only on releases and latest Python and PyTorch
-        if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.10'
+        # Only latest Python and PyTorch
+        if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12'
        with:
          repository: pytorchlightning/pytorch_lightning
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
          dockerfile: dockers/release/Dockerfile
-          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
+          # docker/build-push-action@v1 takes `build_args` as a comma-delimited list,
+          # so keep every build argument on a single line.
+          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},CUDA_VERSION=${{ matrix.cuda_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
          tags: "latest"
        timeout-minutes: 55
--- a/.gitignore
+++ b/.gitignore
@ -165,3 +165,9 @@ hars*
 artifacts/*
 *docs/examples*
 *docs/source-app/api*
+
+# tutorials
+our_model.tar
+test.png
+saved_models
+data/
--- a/dockers/README.md
+++ b/dockers/README.md
@ -1,36 +1,17 @@
 # Docker images

-## Builds images form attached Dockerfiles
+## Build images from Dockerfiles

 You can build the images on your own. Note that this takes a long time, so be prepared.

 ```bash
-git clone <git-repository>
-docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile .
-```
+git clone https://github.com/Lightning-AI/lightning.git

-or with specific arguments
+# build with the default arguments
+docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile .

-```bash
-git clone <git-repository>
-docker image build \
-    -t pytorch-lightning:base-cuda-py3.9-pt1.10 \
-    -f dockers/base-cuda/Dockerfile \
-    --build-arg PYTHON_VERSION=3.9 \
-    --build-arg PYTORCH_VERSION=1.10 \
-    .
-```
-
-or nightly version from Conda
-
-```bash
-git clone <git-repository>
-docker image build \
-    -t pytorch-lightning:base-conda-py3.9-pt1.11 \
-    -f dockers/base-conda/Dockerfile \
-    --build-arg PYTHON_VERSION=3.9 \
-    --build-arg PYTORCH_VERSION=1.11 \
-    .
+# build with specific arguments
+docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 .
 ```

 To run your docker use
@ -49,7 +30,7 @@ docker image rm pytorch-lightning:latest

 ## Run docker image with GPUs

-To run docker image with access to you GPUs you need to install
+To run a docker image with access to your GPUs, you need to install the NVIDIA Container Toolkit:

 ```bash
 # Add the package repositories
@ -61,10 +42,10 @@ sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
 sudo systemctl restart docker
 ```

-and later run the docker image with `--gpus all` so for example
+and later run the docker image with `--gpus all`. For example,

 ```
-docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.10
+docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1
 ```

 ## Run Jupyter server
@ -73,15 +54,11 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in-

 1. Build the docker image:
   ```bash
-   docker image build \
-       -t pytorch-lightning:v1.3.1 \
-       -f dockers/nvidia/Dockerfile \
-       --build-arg LIGHTNING_VERSION=1.3.1 \
-       .
+   docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 .
   ```
 1. Start the server and map ports:
   ```bash
-   docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1
+   docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.6.5
   ```
 1. Connect in local browser:
   - copy the generated path e.g. `http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6`
--- a/dockers/release/Dockerfile
+++ b/dockers/release/Dockerfile
@ -14,8 +14,9 @@

 ARG PYTHON_VERSION=3.9
 ARG PYTORCH_VERSION=1.11
+ARG CUDA_VERSION=11.3.1

-FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}
+FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}

 LABEL maintainer="Lightning-AI <https://github.com/Lightning-AI>"