# lightning/.github/workflows/docker-build.yml

# Label shown for this workflow in the GitHub Actions UI.
name: Docker builds

on:
  # Pushes to the main line or to any release branch.
  push:
    branches:
      - master
      - "release/*"
  # Pull requests targeting those same branches, limited to changes that can affect the images.
  pull_request:
    branches:
      - master
      - "release/*"
    # `ready_for_review` is included because the jobs skip draft PRs,
    # so a PR leaving draft state needs its own trigger.
    types:
      - opened
      - reopened
      - ready_for_review
      - synchronize
    # Keep the `!` exclusions last: pattern order is significant for path filters.
    paths:
      - ".actions/*"
      - ".github/workflows/docker-build.yml"
      - "dockers/**"
      - "requirements/*.txt"
      - "requirements/pytorch/**"
      - "requirements/fabric/**"
      - "setup.py"
      - "!requirements/*/docs.txt"
      - "!*.md"
      - "!**/*.md"
  schedule:
    # every day at 00:00 (midnight)
    - cron: "0 0 * * *"
  release:
    types:
      - published
  # Manual runs, no inputs.
  workflow_dispatch: {}

concurrency:
  # One group per workflow / ref / PR head branch / triggering event.
  group: >-
    ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}-${{ github.event_name }}
  # Only pull-request runs cancel an in-progress run from the same group.
  cancel-in-progress: ${{ github.event_name == 'pull_request' }}

env:
  # 'true' for scheduled and manually dispatched runs; gates pushing of the base CUDA images.
  PUSH_NIGHTLY: >-
    ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
  # 'true' for runs on a tag ref or triggered by a release event; gates pushing of the release images.
  PUSH_RELEASE: >-
    ${{ startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' }}

jobs:
  build-pl:
    # Builds the PL release images. The images generated by this job are not used anywhere in this
    # repository; they are just meant to be available for users.
    # Draft pull requests are skipped.
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    strategy:
      # let the remaining matrix entries finish even if one of them fails
      fail-fast: false
      matrix:
        include:
          # We only release one docker image per PyTorch version.
          # Make sure the matrix here matches the one below.
          # NOTE(review): this list does not currently follow the two rules above - torch 2.1 and 2.2
          # each appear for two Python versions, and `build-cuda` has a py3.11/torch2.3 entry that is
          # missing here. Confirm which side is intended before changing either matrix.
          - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" }
          - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" }
    steps:
      - uses: actions/checkout@v4
        with:
          submodules: true
      - uses: docker/setup-buildx-action@v3
      # Registry credentials are only needed when the image is actually pushed; this condition must
      # stay in sync with the `push` input of the build step at the end of this job.
      - uses: docker/login-action@v3
        if: env.PUSH_RELEASE == 'true' && github.repository_owner == 'Lightning-AI'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Get release version
        if: github.event_name == 'release'
        # For workflows triggered by release, `GITHUB_REF` is the release tag created.
        # `${GITHUB_REF##*/}` keeps only the part after the last `/`.
        run: echo "RELEASE_VERSION=${GITHUB_REF##*/}" >> "$GITHUB_ENV"
      - name: Set tags
        # Computes the comma-separated image tags and exports them as DOCKER_TAGS for the build step.
        shell: python
        run: |
          import os

          repo = "pytorchlightning/pytorch_lightning"
          # only set by the "Get release version" step, i.e. on release events
          ver = os.getenv('RELEASE_VERSION')
          py_ver = "${{ matrix.python_version }}"
          pt_ver = "${{ matrix.pytorch_version }}"
          cuda_ver = "${{ matrix.cuda_version }}"
          suffix = f"py{py_ver}-torch{pt_ver}-cuda{cuda_ver}"

          tags = [f"latest-{suffix}"]
          if ver:
              tags.append(f"{ver}-{suffix}")
          # exactly one matrix entry additionally receives the bare `latest` tag
          if (py_ver, pt_ver, cuda_ver) == ("3.10", "2.1", "12.1.0"):
              tags.append("latest")

          tags = [f"{repo}:{tag}" for tag in tags]
          # end the entry with a newline so anything appended to the env file later starts on its own line
          with open(os.getenv('GITHUB_ENV'), "a") as gh_env:
              gh_env.write("DOCKER_TAGS=" + ",".join(tags) + "\n")

      - uses: docker/build-push-action@v6
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
            CUDA_VERSION=${{ matrix.cuda_version }}
            LIGHTNING_VERSION=${{ env.RELEASE_VERSION }}
          file: dockers/release/Dockerfile
          # Pushed only when PL is released, and only from the upstream organisation - the same
          # condition as the login step above. Everywhere else the image is built but not pushed,
          # instead of failing on an unauthenticated push.
          push: ${{ env.PUSH_RELEASE == 'true' && github.repository_owner == 'Lightning-AI' }}
          tags: ${{ env.DOCKER_TAGS }}
        timeout-minutes: 35

  build-cuda:
    # Builds the base CUDA images and pushes them on nightly (scheduled / manually dispatched) runs.
    # Draft pull requests are skipped.
    if: github.event.pull_request.draft == false
    runs-on: ubuntu-latest
    strategy:
      # let the remaining matrix entries finish even if one of them fails
      fail-fast: false
      matrix:
        include:
          # These are the base images for PL release docker images.
          # Make sure the matrix here matches the one above.
          - { python_version: "3.10", pytorch_version: "2.0", cuda_version: "11.8.0" }
          - { python_version: "3.10", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.10", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.1", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.2", cuda_version: "12.1.0" }
          - { python_version: "3.11", pytorch_version: "2.3", cuda_version: "12.1.0" }
          # - { python_version: "3.12", pytorch_version: "2.2", cuda_version: "12.1.0" }  # todo: pending on `onnxruntime`
    steps:
      - uses: actions/checkout@v4
      - uses: docker/setup-buildx-action@v3
      # Registry credentials are only needed when the image is actually pushed; this condition must
      # stay in sync with the `push` input of the build step below.
      - uses: docker/login-action@v3
        if: env.PUSH_NIGHTLY == 'true' && github.repository_owner == 'Lightning-AI'
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}
      - uses: docker/build-push-action@v6
        with:
          build-args: |
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch_version }}
            CUDA_VERSION=${{ matrix.cuda_version }}
          file: dockers/base-cuda/Dockerfile
          # Push only on nightly runs in the upstream organisation - the same condition as the login
          # step above. Everywhere else the image is built but not pushed, instead of failing on an
          # unauthenticated push.
          push: ${{ env.PUSH_NIGHTLY == 'true' && github.repository_owner == 'Lightning-AI' }}
          tags: "pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}"
        timeout-minutes: 95
      # Report failed nightly builds to Slack; PR and push runs stay silent.
      - uses: ravsamhq/notify-slack-action@v2
        if: failure() && env.PUSH_NIGHTLY == 'true'
        with:
          status: ${{ job.status }}
          token: ${{ secrets.GITHUB_TOKEN }}
          notification_title: ${{ format('CUDA; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }}
          message_format: "{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01A5T7EY9M>" # akihironitta
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}

  build-NGC:
    # Build-only check of the NVIDIA (NGC) image; nothing is pushed from this job.
    # Draft pull requests are skipped.
    if: github.event.pull_request.draft == false
    # fixme: use larger machine or optimize image size
    # runs-on: ubuntu-latest-4-cores
    # then drop continue-on-error
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Build NGC Docker
        # A failing build is tolerated for now and does not fail the job (see the fixme above).
        continue-on-error: true
        uses: docker/build-push-action@v6
        with:
          file: dockers/nvidia/Dockerfile
          push: false
        timeout-minutes: 55