CI/CD: Add CUDA version to docker image tags (#13831)

* append cuda version to tags

* revertme: push to hub

* Update docker readme

* Build base-conda-py3.9-torch1.12-cuda11.3.1

* Use new images in conda tests

* revertme: push to hub

* Revert "revertme: push to hub"

This reverts commit 0f7d534b2a.

* Revert "revertme: push to hub"

This reverts commit 46a05fccbb.

* Run conda if workflow edited

* Run gpu testing if workflow edited

* Use new tags in release/Dockerfile

* Build base-cuda and PL release images with all combinations

* Update release docker

* Update conda from py3.9-torch1.12 to py3.10-torch.1.12

* Fix ubuntu version

* Revert conda

* revertme: push to hub

* Don't build Python 3.10 for now...

* Fix pl release builder

* updating version contribute to the error? https://github.com/docker/buildx/issues/456

* Update actions' versions

* Update slack user to notify

* Don't use 11.6.0 to avoid bagua incompatibility

* Don't use 11.1, and use 11.1.1

* Update .github/workflows/ci-pytorch_test-conda.yml

Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com>

* Update trigger

* Ignore artfacts from tutorials

* Trim docker images to distribute

* Add an image for tutorials

* Update conda image 3.8x1.10

* Try different conda variants

* No need to set cuda for conda jobs

* Update who to notify ipu failure

* Don't push

* update filenaem

Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com>
This commit is contained in:
Akihiro Nitta 2022-08-10 19:37:50 +09:00 committed by GitHub
parent ddb476d334
commit d5f35ece72
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 87 additions and 88 deletions

View File

@ -28,7 +28,7 @@ jobs:
cancelTimeoutInMinutes: "2"
pool: azure-jirka-spot
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12"
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
workspace:
clean: all

View File

@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
'PyTorch - stable':
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12"
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
# how long to run the job before automatically cancelling
timeoutInMinutes: "80"
# how much time to give 'run always even if cancelled tasks' before stopping them
@ -44,7 +44,7 @@ jobs:
- bash: |
CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}')
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*'
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml'
echo $CHANGED_FILES > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES

View File

@ -22,13 +22,11 @@ jobs:
strategy:
fail-fast: false
matrix:
# nightly: add when there's a release candidate
include:
- {python-version: "3.8", pytorch-version: "1.9"}
- {python-version: "3.8", pytorch-version: "1.10"}
- {python-version: "3.9", pytorch-version: "1.11"}
- {python-version: "3.9", pytorch-version: "1.12"}
timeout-minutes: 30
steps:
@ -45,7 +43,7 @@ jobs:
id: skip
shell: bash -l {0}
run: |
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml'
echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES

View File

@ -29,17 +29,22 @@ jobs:
strategy:
fail-fast: false
matrix:
# the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image
python_version: ["3.9"]
pytorch_version: ["1.12"]
include:
# We only release one docker image per PyTorch version.
# The matrix here is the same as the one in release-docker.yml.
- {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- uses: docker/build-push-action@v2
- uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ matrix.cuda_version }}
file: dockers/release/Dockerfile
push: false # pushed in release-docker.yml only when PL is released
timeout-minutes: 50
@ -53,14 +58,14 @@ jobs:
python_version: ["3.7"]
xla_version: ["1.12"]
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v1
- uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- uses: docker/build-push-action@v2
- uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
@ -85,30 +90,31 @@ jobs:
fail-fast: false
matrix:
include:
# the config used in '.azure-pipelines/gpu-tests.yml'
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"}
# latest (used in Tutorials)
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", ubuntu_version: "20.04"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
# These are the base images for PL release docker images,
# so include at least all of the combinations in release-dockers.yml.
- {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
# Used in Lightning-AI/tutorials
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v1
- uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- uses: docker/build-push-action@v2
- uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ matrix.cuda_version }}
UBUNTU_VERSION=${{ matrix.ubuntu_version }}
file: dockers/base-cuda/Dockerfile
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
timeout-minutes: 95
- uses: ravsamhq/notify-slack-action@v1
if: failure() && env.PUSH_TO_HUB == 'true'
@ -126,25 +132,23 @@ jobs:
fail-fast: false
matrix:
include:
- {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
- {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
# nightly: add when there's a release candidate
# - {python_version: "3.9", pytorch_version: "1.12"}
- {python_version: "3.8", pytorch_version: "1.9"}
- {python_version: "3.8", pytorch_version: "1.10"}
- {python_version: "3.9", pytorch_version: "1.11"}
- {python_version: "3.9", pytorch_version: "1.12"}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v1
- uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- uses: docker/build-push-action@v2
- uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ matrix.cuda_version }}
file: dockers/base-conda/Dockerfile
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
@ -168,14 +172,14 @@ jobs:
# the config used in 'dockers/ci-runner-ipu/Dockerfile'
- {python_version: "3.9", pytorch_version: "1.9"}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v1
- uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- uses: docker/build-push-action@v2
- uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
@ -184,7 +188,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
timeout-minutes: 100
- uses: docker/build-push-action@v2
- uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
@ -199,7 +203,7 @@ jobs:
status: ${{ job.status }}
token: ${{ secrets.GITHUB_TOKEN }}
notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }}
message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>' # SeanNaren
message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
@ -212,14 +216,14 @@ jobs:
# the config used in 'dockers/ci-runner-hpu/Dockerfile'
- {gaudi_version: "1.5.0", pytorch_version: "1.11.0"}
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- uses: docker/login-action@v1
- uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- uses: docker/build-push-action@v2
- uses: docker/build-push-action@v3
with:
build-args: |
DIST=latest
@ -243,10 +247,10 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: Checkout
uses: actions/checkout@v2
uses: actions/checkout@v3
- name: Build Conda Docker
# publish master/release
uses: docker/build-push-action@v2
uses: docker/build-push-action@v3
with:
file: dockers/nvidia/Dockerfile
push: false

View File

@ -1,6 +1,5 @@
name: Docker
# https://www.docker.com/blog/first-docker-github-action-is-here
# https://github.com/docker/build-push-action
on:
push:
branches: [master, "release/*"]
@ -15,8 +14,12 @@ jobs:
strategy:
fail-fast: false
matrix:
python_version: ["3.7", "3.8", "3.9"]
pytorch_version: ["1.9", "1.10"]
include:
# We only release one docker image per PyTorch version.
- {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
- {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
steps:
- name: Checkout
uses: actions/checkout@v2
@ -32,19 +35,29 @@ jobs:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
dockerfile: dockers/release/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
build_args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ matrix.cuda_version }}
LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
tags: |
${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
timeout-minutes: 55
- name: Publish Latest to Docker
uses: docker/build-push-action@v1.1.0
# only on releases and latest Python and PyTorch
if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.10'
# Only latest Python and PyTorch
if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12'
with:
repository: pytorchlightning/pytorch_lightning
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
dockerfile: dockers/release/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
build_args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ matrix.cuda_version }}
LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
tags: "latest"
timeout-minutes: 55

6
.gitignore vendored
View File

@ -165,3 +165,9 @@ hars*
artifacts/*
*docs/examples*
*docs/source-app/api*
# tutorials
our_model.tar
test.png
saved_models
data/

View File

@ -1,36 +1,17 @@
# Docker images
## Builds images form attached Dockerfiles
## Build images from Dockerfiles
You can build it on your own, note it takes lots of time, be prepared.
```bash
git clone <git-repository>
docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile .
```
git clone https://github.com/Lightning-AI/lightning.git
or with specific arguments
# build with the default arguments
docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile .
```bash
git clone <git-repository>
docker image build \
-t pytorch-lightning:base-cuda-py3.9-pt1.10 \
-f dockers/base-cuda/Dockerfile \
--build-arg PYTHON_VERSION=3.9 \
--build-arg PYTORCH_VERSION=1.10 \
.
```
or nightly version from Conda
```bash
git clone <git-repository>
docker image build \
-t pytorch-lightning:base-conda-py3.9-pt1.11 \
-f dockers/base-conda/Dockerfile \
--build-arg PYTHON_VERSION=3.9 \
--build-arg PYTORCH_VERSION=1.11 \
.
# build with specific arguments
docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 .
```
To run your docker use
@ -49,7 +30,7 @@ docker image rm pytorch-lightning:latest
## Run docker image with GPUs
To run docker image with access to you GPUs you need to install
To run docker image with access to your GPUs, you need to install
```bash
# Add the package repositories
@ -61,10 +42,10 @@ sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
```
and later run the docker image with `--gpus all` so for example
and later run the docker image with `--gpus all`. For example,
```
docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.10
docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1
```
## Run Jupyter server
@ -73,15 +54,11 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in-
1. Build the docker image:
```bash
docker image build \
-t pytorch-lightning:v1.3.1 \
-f dockers/nvidia/Dockerfile \
--build-arg LIGHTNING_VERSION=1.3.1 \
.
docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 .
```
1. start the server and map ports:
```bash
docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1
docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.6.5
```
1. Connect in local browser:
- copy the generated path e.g. `http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6`

View File

@ -14,8 +14,9 @@
ARG PYTHON_VERSION=3.9
ARG PYTORCH_VERSION=1.11
ARG CUDA_VERSION=11.3.1
FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}
FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}
LABEL maintainer="Lightning-AI <https://github.com/Lightning-AI>"