From d5f35ece72fd253adeb8e9947fd9be4a5992f8f8 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta <nitta@akihironitta.com>
Date: Wed, 10 Aug 2022 19:37:50 +0900
Subject: [PATCH] CI/CD: Add CUDA version to docker image tags (#13831)

* append cuda version to tags

* revertme: push to hub

* Update docker readme

* Build base-conda-py3.9-torch1.12-cuda11.3.1

* Use new images in conda tests

* revertme: push to hub

* Revert "revertme: push to hub"

This reverts commit 0f7d534b2ae41e4bd227961a929c333c88e35f59.

* Revert "revertme: push to hub"

This reverts commit 46a05fccbb9b596aa98d5d68424917b5811c5b4f.

* Run conda if workflow edited

* Run gpu testing if workflow edited

* Use new tags in release/Dockerfile

* Build base-cuda and PL release images with all combinations

* Update release docker

* Update conda from py3.9-torch1.12 to py3.10-torch1.12

* Fix ubuntu version

* Revert conda

* revertme: push to hub

* Don't build Python 3.10 for now...

* Fix pl release builder

* Does updating the version contribute to the error? https://github.com/docker/buildx/issues/456

* Update actions' versions

* Update slack user to notify

* Don't use 11.6.0 to avoid bagua incompatibility

* Don't use 11.1, and use 11.1.1

* Update .github/workflows/ci-pytorch_test-conda.yml

Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com>

* Update trigger

* Ignore artifacts from tutorials

* Trim docker images to distribute

* Add an image for tutorials

* Update conda image 3.8x1.10

* Try different conda variants

* No need to set cuda for conda jobs

* Update who to notify ipu failure

* Don't push

* Update filename

Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com>
---
 .azure/gpu-benchmark.yml                    |  2 +-
 .azure/gpu-tests.yml                        |  4 +-
 .github/workflows/ci-pytorch-test-conda.yml |  4 +-
 .github/workflows/cicd-pytorch-dockers.yml  | 80 +++++++++++----------
 .github/workflows/release-docker.yml        | 31 +++++---
 .gitignore                                  |  6 ++
 dockers/README.md                           | 45 +++---------
 dockers/release/Dockerfile                  |  3 +-
 8 files changed, 87 insertions(+), 88 deletions(-)

diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml
index ac5ca6f60a..0de590f2c5 100644
--- a/.azure/gpu-benchmark.yml
+++ b/.azure/gpu-benchmark.yml
@@ -28,7 +28,7 @@ jobs:
     cancelTimeoutInMinutes: "2"
     pool: azure-jirka-spot
     container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
     workspace:
       clean: all
diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index f37c17613a..68ba6974a3 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         'PyTorch - stable':
-          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12"
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
     # how long to run the job before automatically cancelling
     timeoutInMinutes: "80"
     # how much time to give 'run always even if cancelled tasks' before stopping them
@@ -44,7 +44,7 @@ jobs:
 
     - bash: |
         CHANGED_FILES=$(git diff --name-status origin/master -- . | awk  '{print $2}')
-        FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*'
+        FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml'
         echo $CHANGED_FILES > changed_files.txt
         MATCHES=$(cat changed_files.txt | grep -E $FILTER)
         echo $MATCHES
diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml
index 777ec2af75..2bbdb699c2 100644
--- a/.github/workflows/ci-pytorch-test-conda.yml
+++ b/.github/workflows/ci-pytorch-test-conda.yml
@@ -22,13 +22,11 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # nightly: add when there's a release candidate
         include:
           - {python-version: "3.8", pytorch-version: "1.9"}
           - {python-version: "3.8", pytorch-version: "1.10"}
           - {python-version: "3.9", pytorch-version: "1.11"}
           - {python-version: "3.9", pytorch-version: "1.12"}
-
     timeout-minutes: 30
 
     steps:
@@ -45,7 +43,7 @@ jobs:
       id: skip
       shell: bash -l {0}
       run: |
-        FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
+        FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml'
         echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt
         MATCHES=$(cat changed_files.txt | grep -E $FILTER)
         echo $MATCHES
diff --git a/.github/workflows/cicd-pytorch-dockers.yml b/.github/workflows/cicd-pytorch-dockers.yml
index a6ba2ac4aa..84051cafd8 100644
--- a/.github/workflows/cicd-pytorch-dockers.yml
+++ b/.github/workflows/cicd-pytorch-dockers.yml
@@ -29,17 +29,22 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image
-        python_version: ["3.9"]
-        pytorch_version: ["1.12"]
+        include:
+          # We only release one docker image per PyTorch version.
+          # The matrix here is the same as the one in release-docker.yml.
+          - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: docker/setup-buildx-action@v2
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
         with:
           build-args: |
             PYTHON_VERSION=${{ matrix.python_version }}
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
+            CUDA_VERSION=${{ matrix.cuda_version }}
           file: dockers/release/Dockerfile
           push: false  # pushed in release-docker.yml only when PL is released
         timeout-minutes: 50
@@ -53,14 +58,14 @@ jobs:
         python_version: ["3.7"]
         xla_version: ["1.12"]
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
         if: env.PUSH_TO_HUB == 'true'
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
         with:
           build-args: |
             PYTHON_VERSION=${{ matrix.python_version }}
@@ -85,30 +90,31 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          # the config used in '.azure-pipelines/gpu-tests.yml'
-          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"}
-          # latest (used in Tutorials)
-          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", ubuntu_version: "20.04"}
-          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"}
-          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
+          # These are the base images for PL release docker images,
+          # so include at least all of the combinations in release-docker.yml.
+          - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
+          # Used in Lightning-AI/tutorials
+          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
         if: env.PUSH_TO_HUB == 'true'
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
         with:
           build-args: |
             PYTHON_VERSION=${{ matrix.python_version }}
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
             CUDA_VERSION=${{ matrix.cuda_version }}
-            UBUNTU_VERSION=${{ matrix.ubuntu_version }}
           file: dockers/base-cuda/Dockerfile
           push: ${{ env.PUSH_TO_HUB }}
-          tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
+          tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
         timeout-minutes: 95
       - uses: ravsamhq/notify-slack-action@v1
         if: failure() && env.PUSH_TO_HUB == 'true'
@@ -126,25 +132,23 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
-          - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"}
-          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
-          #  nightly: add when there's a release candidate
-          #  - {python_version: "3.9", pytorch_version: "1.12"}
+          - {python_version: "3.8", pytorch_version: "1.9"}
+          - {python_version: "3.8", pytorch_version: "1.10"}
+          - {python_version: "3.9", pytorch_version: "1.11"}
+          - {python_version: "3.9", pytorch_version: "1.12"}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
         if: env.PUSH_TO_HUB == 'true'
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
         with:
           build-args: |
             PYTHON_VERSION=${{ matrix.python_version }}
             PYTORCH_VERSION=${{ matrix.pytorch_version }}
-            CUDA_VERSION=${{ matrix.cuda_version }}
           file: dockers/base-conda/Dockerfile
           push: ${{ env.PUSH_TO_HUB }}
           tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
@@ -168,14 +172,14 @@ jobs:
           # the config used in 'dockers/ci-runner-ipu/Dockerfile'
           - {python_version: "3.9", pytorch_version: "1.9"}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
         if: env.PUSH_TO_HUB == 'true'
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
         with:
           build-args: |
             PYTHON_VERSION=${{ matrix.python_version }}
@@ -184,7 +188,7 @@ jobs:
           push: ${{ env.PUSH_TO_HUB }}
           tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
         timeout-minutes: 100
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
         with:
           build-args: |
             PYTHON_VERSION=${{ matrix.python_version }}
@@ -199,7 +203,7 @@ jobs:
           status: ${{ job.status }}
           token: ${{ secrets.GITHUB_TOKEN }}
           notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }}
-          message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>'  # SeanNaren
+          message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>'  # kaushikb11
         env:
           SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
 
@@ -212,14 +216,14 @@ jobs:
           # the config used in 'dockers/ci-runner-hpu/Dockerfile'
           - {gaudi_version: "1.5.0", pytorch_version: "1.11.0"}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
       - uses: docker/setup-buildx-action@v2
-      - uses: docker/login-action@v1
+      - uses: docker/login-action@v2
         if: env.PUSH_TO_HUB == 'true'
         with:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
-      - uses: docker/build-push-action@v2
+      - uses: docker/build-push-action@v3
         with:
           build-args: |
             DIST=latest
@@ -243,10 +247,10 @@ jobs:
     runs-on: ubuntu-20.04
     steps:
       - name: Checkout
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
       - name: Build Conda Docker
         # publish master/release
-        uses: docker/build-push-action@v2
+        uses: docker/build-push-action@v3
         with:
           file: dockers/nvidia/Dockerfile
           push: false
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index 9d87f1a582..6901a24204 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -1,6 +1,5 @@
 name: Docker
-# https://www.docker.com/blog/first-docker-github-action-is-here
-# https://github.com/docker/build-push-action
+
 on:
   push:
     branches: [master, "release/*"]
@@ -15,8 +14,12 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python_version: ["3.7", "3.8", "3.9"]
-        pytorch_version: ["1.9", "1.10"]
+        include:
+          # We only release one docker image per PyTorch version.
+          - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
+          - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+          - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
     steps:
       - name: Checkout
         uses: actions/checkout@v2
@@ -32,19 +35,21 @@ jobs:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
           dockerfile: dockers/release/Dockerfile
-          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
-          tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
+          # build-push-action v1 takes comma-delimited lists for build_args and tags (not newline-separated).
+          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},CUDA_VERSION=${{ matrix.cuda_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
+          tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}"
         timeout-minutes: 55
 
       - name: Publish Latest to Docker
         uses: docker/build-push-action@v1.1.0
-        # only on releases and latest Python and PyTorch
-        if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.10'
+        # Only latest Python and PyTorch
+        if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12'
         with:
           repository: pytorchlightning/pytorch_lightning
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
           dockerfile: dockers/release/Dockerfile
-          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
+          # build-push-action v1 takes a comma-delimited list for build_args (not newline-separated).
+          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},CUDA_VERSION=${{ matrix.cuda_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
           tags: "latest"
         timeout-minutes: 55
diff --git a/.gitignore b/.gitignore
index 719f291a49..259d9f2711 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,3 +165,9 @@ hars*
 artifacts/*
 *docs/examples*
 *docs/source-app/api*
+
+# tutorials
+our_model.tar
+test.png
+saved_models
+data/
diff --git a/dockers/README.md b/dockers/README.md
index 533c85739f..b1ff9826b6 100644
--- a/dockers/README.md
+++ b/dockers/README.md
@@ -1,36 +1,17 @@
 # Docker images
 
-## Builds images form attached Dockerfiles
+## Build images from Dockerfiles
 
 You can build it on your own, note it takes lots of time, be prepared.
 
 ```bash
-git clone <git-repository>
-docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile .
-```
+git clone https://github.com/Lightning-AI/lightning.git
 
-or with specific arguments
+# build with the default arguments
+docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile .
 
-```bash
-git clone <git-repository>
-docker image build \
-    -t pytorch-lightning:base-cuda-py3.9-pt1.10 \
-    -f dockers/base-cuda/Dockerfile \
-    --build-arg PYTHON_VERSION=3.9 \
-    --build-arg PYTORCH_VERSION=1.10 \
-    .
-```
-
-or nightly version from Conda
-
-```bash
-git clone <git-repository>
-docker image build \
-    -t pytorch-lightning:base-conda-py3.9-pt1.11 \
-    -f dockers/base-conda/Dockerfile \
-    --build-arg PYTHON_VERSION=3.9 \
-    --build-arg PYTORCH_VERSION=1.11 \
-    .
+# build with specific arguments
+docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 .
 ```
 
 To run your docker use
@@ -49,7 +30,7 @@ docker image rm pytorch-lightning:latest
 
 ## Run docker image with GPUs
 
-To run docker image with access to you GPUs you need to install
+To run docker image with access to your GPUs, you need to install
 
 ```bash
 # Add the package repositories
@@ -61,10 +42,10 @@ sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
 sudo systemctl restart docker
 ```
 
-and later run the docker image with `--gpus all` so for example
+and later run the docker image with `--gpus all`. For example,
 
 ```
-docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.10
+docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1
 ```
 
 ## Run Jupyter server
@@ -73,15 +54,11 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in-
 
 1. Build the docker image:
    ```bash
-   docker image build \
-       -t pytorch-lightning:v1.3.1 \
-       -f dockers/nvidia/Dockerfile \
-       --build-arg LIGHTNING_VERSION=1.3.1 \
-       .
+   docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 .
    ```
 1. start the server and map ports:
    ```bash
-   docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1
+   docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.6.5
    ```
 1. Connect in local browser:
    - copy the generated path e.g. `http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6`
diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile
index cb393c91df..c39e665091 100644
--- a/dockers/release/Dockerfile
+++ b/dockers/release/Dockerfile
@@ -14,8 +14,9 @@
 
 ARG PYTHON_VERSION=3.9
 ARG PYTORCH_VERSION=1.11
+ARG CUDA_VERSION=11.3.1
 
-FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}
+FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}
 
 LABEL maintainer="Lightning-AI <https://github.com/Lightning-AI>"