From d5f35ece72fd253adeb8e9947fd9be4a5992f8f8 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 19:37:50 +0900 Subject: [PATCH] CI/CD: Add CUDA version to docker image tags (#13831) * append cuda version to tags * revertme: push to hub * Update docker readme * Build base-conda-py3.9-torch1.12-cuda11.3.1 * Use new images in conda tests * revertme: push to hub * Revert "revertme: push to hub" This reverts commit 0f7d534b2ae41e4bd227961a929c333c88e35f59. * Revert "revertme: push to hub" This reverts commit 46a05fccbb9b596aa98d5d68424917b5811c5b4f. * Run conda if workflow edited * Run gpu testing if workflow edited * Use new tags in release/Dockerfile * Build base-cuda and PL release images with all combinations * Update release docker * Update conda from py3.9-torch1.12 to py3.10-torch1.12 * Fix ubuntu version * Revert conda * revertme: push to hub * Don't build Python 3.10 for now... * Fix pl release builder * updating version contribute to the error? https://github.com/docker/buildx/issues/456 * Update actions' versions * Update slack user to notify * Don't use 11.6.0 to avoid bagua incompatibility * Don't use 11.1, and use 11.1.1 * Update .github/workflows/ci-pytorch_test-conda.yml Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> * Update trigger * Ignore artifacts from tutorials * Trim docker images to distribute * Add an image for tutorials * Update conda image 3.8x1.10 * Try different conda variants * No need to set cuda for conda jobs * Update who to notify ipu failure * Don't push * update filename Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> --- .azure/gpu-benchmark.yml | 2 +- .azure/gpu-tests.yml | 4 +- .github/workflows/ci-pytorch-test-conda.yml | 4 +- .github/workflows/cicd-pytorch-dockers.yml | 80 +++++++++++---------- .github/workflows/release-docker.yml | 31 +++++--- .gitignore | 6 ++ dockers/README.md | 45 +++--------- dockers/release/Dockerfile | 3 +- 8 files changed, 
87 insertions(+), 88 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index ac5ca6f60a..0de590f2c5 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index f37c17613a..68ba6974a3 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -44,7 +44,7 @@ jobs: - bash: | CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 777ec2af75..2bbdb699c2 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -22,13 +22,11 @@ jobs: strategy: fail-fast: false matrix: - # nightly: add when there's a release candidate include: - {python-version: "3.8", pytorch-version: "1.9"} - {python-version: "3.8", pytorch-version: "1.10"} - {python-version: "3.9", pytorch-version: "1.11"} - {python-version: "3.9", pytorch-version: "1.12"} - timeout-minutes: 30 steps: @@ -45,7 +43,7 @@ jobs: id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/cicd-pytorch-dockers.yml b/.github/workflows/cicd-pytorch-dockers.yml index a6ba2ac4aa..84051cafd8 100644 --- a/.github/workflows/cicd-pytorch-dockers.yml +++ b/.github/workflows/cicd-pytorch-dockers.yml @@ -29,17 +29,22 @@ jobs: strategy: fail-fast: false matrix: - # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image - python_version: ["3.9"] - pytorch_version: ["1.12"] + include: + # We only release one docker image per PyTorch version. + # The matrix here is the same as the one in release-docker.yml. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/release/Dockerfile push: false # pushed in release-docker.yml only when PL is released timeout-minutes: 50 @@ -53,14 +58,14 @@ jobs: python_version: ["3.7"] xla_version: ["1.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -85,30 +90,31 @@ jobs: fail-fast: false matrix: include: - # the config used in '.azure-pipelines/gpu-tests.yml' - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"} - # latest (used in Tutorials) - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} + # These are the base images for PL release docker images, + # so include at least all of the combinations in release-docker.yml. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + # Used in Lightning-AI/tutorials + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ matrix.cuda_version }} - UBUNTU_VERSION=${{ matrix.ubuntu_version }} file: dockers/base-cuda/Dockerfile push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 95 - uses: ravsamhq/notify-slack-action@v1 if: failure() && env.PUSH_TO_HUB == 'true' @@ -126,25 +132,23 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - # nightly: add when there's a release candidate - # - {python_version: "3.9", pytorch_version: "1.12"} + - {python_version: "3.8", pytorch_version: "1.9"} + - {python_version: "3.8", pytorch_version: "1.10"} + - {python_version: "3.9", pytorch_version: "1.11"} + - {python_version: "3.9", 
pytorch_version: "1.12"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} @@ -168,14 +172,14 @@ jobs: # the config used in 'dockers/ci-runner-ipu/Dockerfile' - {python_version: "3.9", pytorch_version: "1.9"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -184,7 +188,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 100 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -199,7 +203,7 @@ jobs: status: ${{ job.status }} token: ${{ secrets.GITHUB_TOKEN }} notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>' # SeanNaren + message_format: '{emoji} *{workflow}* {status_message}, 
see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11 env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} @@ -212,14 +216,14 @@ jobs: # the config used in 'dockers/ci-runner-hpu/Dockerfile' - {gaudi_version: "1.5.0", pytorch_version: "1.11.0"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | DIST=latest @@ -243,10 +247,10 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Build Conda Docker # publish master/release - uses: docker/build-push-action@v2 + uses: docker/build-push-action@v3 with: file: dockers/nvidia/Dockerfile push: false diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 9d87f1a582..6901a24204 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -1,6 +1,5 @@ name: Docker -# https://www.docker.com/blog/first-docker-github-action-is-here -# https://github.com/docker/build-push-action + on: push: branches: [master, "release/*"] @@ -15,8 +14,12 @@ jobs: strategy: fail-fast: false matrix: - python_version: ["3.7", "3.8", "3.9"] - pytorch_version: ["1.9", "1.10"] + include: + # We only release one docker image per PyTorch version. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - name: Checkout uses: actions/checkout@v2 @@ -32,19 +35,29 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} - tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + tags: | + ${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} + latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 55 - name: Publish Latest to Docker uses: docker/build-push-action@v1.1.0 - # only on releases and latest Python and PyTorch - if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.10' + # Only latest Python and PyTorch + if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12' with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ 
steps.get_version.outputs.RELEASE_VERSION }} + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} tags: "latest" timeout-minutes: 55 diff --git a/.gitignore b/.gitignore index 719f291a49..259d9f2711 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,9 @@ hars* artifacts/* *docs/examples* *docs/source-app/api* + +# tutorials +our_model.tar +test.png +saved_models +data/ diff --git a/dockers/README.md b/dockers/README.md index 533c85739f..b1ff9826b6 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -1,36 +1,17 @@ # Docker images -## Builds images form attached Dockerfiles +## Build images from Dockerfiles You can build it on your own, note it takes lots of time, be prepared. ```bash -git clone -docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile . -``` +git clone https://github.com/Lightning-AI/lightning.git -or with specific arguments +# build with the default arguments +docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . -```bash -git clone -docker image build \ - -t pytorch-lightning:base-cuda-py3.9-pt1.10 \ - -f dockers/base-cuda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.10 \ - . -``` - -or nightly version from Conda - -```bash -git clone -docker image build \ - -t pytorch-lightning:base-conda-py3.9-pt1.11 \ - -f dockers/base-conda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.11 \ - . +# build with specific arguments +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 . 
``` To run your docker use @@ -49,7 +30,7 @@ docker image rm pytorch-lightning:latest ## Run docker image with GPUs -To run docker image with access to you GPUs you need to install +To run docker image with access to your GPUs, you need to install ```bash # Add the package repositories @@ -61,10 +42,10 @@ sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit sudo systemctl restart docker ``` -and later run the docker image with `--gpus all` so for example +and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.10 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 ``` ## Run Jupyter server @@ -73,15 +54,11 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in- 1. Build the docker image: ```bash - docker image build \ - -t pytorch-lightning:v1.3.1 \ - -f dockers/nvidia/Dockerfile \ - --build-arg LIGHTNING_VERSION=1.3.1 \ - . + docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 . ``` 1. start the server and map ports: ```bash - docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1 + docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.6.5 ``` 1. Connect in local browser: - copy the generated path e.g. 
`http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6` diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index cb393c91df..c39e665091 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -14,8 +14,9 @@ ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.11 +ARG CUDA_VERSION=11.3.1 -FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} LABEL maintainer="Lightning-AI "