diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 98ba324781..05825847a9 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -31,19 +31,23 @@ subprojects: - "setup.cfg" # includes pytest config checks: # Note: updates here should be applied to the lightning_lite group - - "pl-conda (3.8, 1.10)" - - "pl-conda (3.8, 1.9)" - - "pl-conda (3.9, 1.11)" - - "pl-conda (3.9, 1.12)" - - "pl-cpu (macOS-11, 3.10, latest, stable)" - - "pl-cpu (macOS-11, 3.7, latest, stable)" - - "pl-cpu (macOS-11, 3.7, oldest, stable)" - - "pl-cpu (ubuntu-20.04, 3.10, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, oldest, stable)" - - "pl-cpu (windows-2022, 3.10, latest, stable)" - - "pl-cpu (windows-2022, 3.7, latest, stable)" - - "pl-cpu (windows-2022, 3.7, oldest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.9, oldest)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.12)" + - "pl-cpu (ubuntu-20.04, 3.8, 1.9)" + - "pl-cpu (ubuntu-20.04, 3.8, 1.10)" + - "pl-cpu (ubuntu-20.04, 3.9, 1.11)" + - "pl-cpu (ubuntu-20.04, 3.9, 1.12)" + - "pl-cpu (ubuntu-20.04, 3.10, 1.12)" + - "pl-cpu (macos-11, 3.7, 1.9, oldest)" + - "pl-cpu (macos-11, 3.7, 1.12)" + - "pl-cpu (macos-11, 3.8, 1.10)" + - "pl-cpu (macos-11, 3.9, 1.11)" + - "pl-cpu (macos-11, 3.10, 1.12)" + - "pl-cpu (windows-2022, 3.7, 1.9, oldest)" + - "pl-cpu (windows-2022, 3.7, 1.12)" + - "pl-cpu (windows-2022, 3.8, 1.10)" + - "pl-cpu (windows-2022, 3.9, 1.11)" + - "pl-cpu (windows-2022, 3.10, 1.12)" - "make-doctest (pytorch)" - "make-html (pytorch)" - "mypy" @@ -56,28 +60,27 @@ subprojects: # TODO: since this job cannot run on forks, it cannot be required or it will block all PL PRs from forks #- "test-on-tpus" - - id: "pytorch_lightning: Conda" - paths: - - ".github/workflows/ci-pytorch-test-conda.yml" - checks: - - "pl-conda (3.8, 1.10)" - - "pl-conda (3.8, 1.9)" - - "pl-conda (3.9, 1.11)" - - "pl-conda (3.9, 1.12)" - - id: "pytorch_lightning: CPU" paths: - 
".github/workflows/ci-pytorch-test-full.yml" checks: - - "pl-cpu (macOS-11, 3.10, latest, stable)" - - "pl-cpu (macOS-11, 3.7, latest, stable)" - - "pl-cpu (macOS-11, 3.7, oldest, stable)" - - "pl-cpu (ubuntu-20.04, 3.10, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, oldest, stable)" - - "pl-cpu (windows-2022, 3.10, latest, stable)" - - "pl-cpu (windows-2022, 3.7, latest, stable)" - - "pl-cpu (windows-2022, 3.7, oldest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.9, oldest)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.12)" + - "pl-cpu (ubuntu-20.04, 3.8, 1.9)" + - "pl-cpu (ubuntu-20.04, 3.8, 1.10)" + - "pl-cpu (ubuntu-20.04, 3.9, 1.11)" + - "pl-cpu (ubuntu-20.04, 3.9, 1.12)" + - "pl-cpu (ubuntu-20.04, 3.10, 1.12)" + - "pl-cpu (macos-11, 3.7, 1.9, oldest)" + - "pl-cpu (macos-11, 3.7, 1.12)" + - "pl-cpu (macos-11, 3.8, 1.10)" + - "pl-cpu (macos-11, 3.9, 1.11)" + - "pl-cpu (macos-11, 3.10, 1.12)" + - "pl-cpu (windows-2022, 3.7, 1.9, oldest)" + - "pl-cpu (windows-2022, 3.7, 1.12)" + - "pl-cpu (windows-2022, 3.8, 1.10)" + - "pl-cpu (windows-2022, 3.9, 1.11)" + - "pl-cpu (windows-2022, 3.10, 1.12)" - id: "pytorch_lightning: Slow" paths: @@ -127,10 +130,6 @@ subprojects: - ".github/workflows/*docker*.yml" - "setup.py" checks: - - "build-conda (3.8, 1.9, 11.1.1)" - - "build-conda (3.8, 1.10.1, 11.1.1)" - - "build-conda (3.9, 1.11, 11.3.1)" - - "build-conda (3.9, 1.12, 11.3.1)" - "build-cuda (3.8, 1.9, 11.1.1)" - "build-cuda (3.9, 1.10, 11.3.1)" - "build-cuda (3.9, 1.11, 11.3.1)" @@ -166,15 +165,23 @@ subprojects: - "lightning-lite (GPUs)" - "mypy" # Lite also requires PL checks as it depends on Lite - - "pl-cpu (macOS-11, 3.10, latest, stable)" - - "pl-cpu (macOS-11, 3.7, latest, stable)" - - "pl-cpu (macOS-11, 3.7, oldest, stable)" - - "pl-cpu (ubuntu-20.04, 3.10, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, latest, stable)" - - "pl-cpu (ubuntu-20.04, 3.7, oldest, stable)" - - "pl-cpu (windows-2022, 3.10, latest, stable)" - - "pl-cpu 
(windows-2022, 3.7, latest, stable)" - - "pl-cpu (windows-2022, 3.7, oldest, stable)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.9, oldest)" + - "pl-cpu (ubuntu-20.04, 3.7, 1.12)" + - "pl-cpu (ubuntu-20.04, 3.8, 1.9)" + - "pl-cpu (ubuntu-20.04, 3.8, 1.10)" + - "pl-cpu (ubuntu-20.04, 3.9, 1.11)" + - "pl-cpu (ubuntu-20.04, 3.9, 1.12)" + - "pl-cpu (ubuntu-20.04, 3.10, 1.12)" + - "pl-cpu (macos-11, 3.7, 1.9, oldest)" + - "pl-cpu (macos-11, 3.7, 1.12)" + - "pl-cpu (macos-11, 3.8, 1.10)" + - "pl-cpu (macos-11, 3.9, 1.11)" + - "pl-cpu (macos-11, 3.10, 1.12)" + - "pl-cpu (windows-2022, 3.7, 1.9, oldest)" + - "pl-cpu (windows-2022, 3.7, 1.12)" + - "pl-cpu (windows-2022, 3.8, 1.10)" + - "pl-cpu (windows-2022, 3.9, 1.11)" + - "pl-cpu (windows-2022, 3.10, 1.12)" - "make-doctest (pytorch)" - "make-html (pytorch)" - "pytorch-lightning (GPUs)" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 2067c244f1..b9acc1c060 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -4,23 +4,25 @@ ## Unit and Integration Testing -| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | -| -------------------------- | ------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------- | ------------------- | -| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.9, 1.9), (3.9, 1.12) | linux, mac, windows | -| Test PyTorch with Conda | .github/workflows/ci-pytorch-test-conda.yml | Same as ci-pytorch-test-full.yml but with dependencies installed with conda. 
| CPU | (3.8, 1.9), (3.8, 1.10), (3.8, 1.11), (3.9, 1.12) | linux | -| Test slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.11) | linux, mac, windows | -| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | -| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | -| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | -| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | -| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.12) | linux | +| workflow name | workflow file | action | accelerator\* | +| -------------------------- | ------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | +| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | +| Test PyTorch slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be sped up or simplified. | CPU | +| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | +| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. 
| HPU | +| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | +| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | +| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | - \*Accelerators used in CI + - GPU: 2 x NVIDIA Tesla V100 - TPU: Google GKE TPUv3 - IPU: [Colossus MK1 IPU](https://www.graphcore.ai/products/ipu) - HPU: [Intel Habana Gaudi SYS-420GH-TNGR](https://www.supermicro.com/en/products/system/AI/4U/SYS-420GH-TNGR) which has 8 Gaudi accelerators +- To check which versions of Python or PyTorch are used for testing in our CI, see the corresponding workflow files or the checkgroup config file at [`.github/checkgroup.yml`](../checkgroup.yml). + ## Documentation | workflow file | action | diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index 73f303c6cb..2e9296c3df 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -127,44 +127,6 @@ jobs: env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - build-conda: - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - include: - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.8", pytorch_version: "1.10.1", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} - steps: - - uses: actions/checkout@v3 - - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v2 - if: env.PUSH_TO_HUB == 'true' - with: - username: ${{ secrets.DOCKER_USERNAME }} - password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v3 - with: - 
build-args: | - PYTHON_VERSION=${{ matrix.python_version }} - PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} - file: dockers/base-conda/Dockerfile - push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} - timeout-minutes: 95 - - uses: ravsamhq/notify-slack-action@v2 - if: failure() && env.PUSH_TO_HUB == 'true' - with: - status: ${{ job.status }} - token: ${{ secrets.GITHUB_TOKEN }} - notification_title: ${{ format('Conda; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01A5T7EY9M>' # akihironitta - env: - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} - build-ipu: runs-on: ubuntu-20.04 strategy: diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml deleted file mode 100644 index 0f40d074dc..0000000000 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ /dev/null @@ -1,110 +0,0 @@ -name: Test PyTorch with Conda - -# see: https://help.github.com/en/actions/reference/events-that-trigger-workflows -on: - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - paths: - - ".github/workflows/ci-pytorch-test-conda.yml" - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ ! 
(github.ref == 'refs/heads/master' || startsWith(github.ref, 'refs/heads/release/')) }} - -defaults: - run: - shell: bash -l {0} - -jobs: - pl-conda: - runs-on: ubuntu-20.04 - container: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} - strategy: - fail-fast: false - matrix: - include: - - {python-version: "3.8", pytorch-version: "1.9"} - - {python-version: "3.8", pytorch-version: "1.10"} - - {python-version: "3.9", pytorch-version: "1.11"} - - {python-version: "3.9", pytorch-version: "1.12"} - timeout-minutes: 40 - - steps: - - name: Workaround for https://github.com/actions/checkout/issues/760 - run: git config --global --add safe.directory /__w/lightning/lightning - - - uses: actions/checkout@v3 - - - name: Update base dependencies - env: - PACKAGE_NAME: pytorch - FREEZE_REQUIREMENTS: 1 - run: | - conda info - conda list - pip install -e .[test] - - - name: Freeze PIL (hotfix) - # import of PILLOW_VERSION which they recently removed in v9.0 in favor of __version__ - run: pip install "Pillow<9.0" # It messes with torchvision - - - name: DocTests - working-directory: ./src - run: pytest pytorch_lightning --cov=pytorch_lightning - - - name: Update all dependencies - env: - HOROVOD_BUILD_ARCH_FLAGS: "-mfma" - HOROVOD_WITHOUT_MXNET: 1 - HOROVOD_WITHOUT_TENSORFLOW: 1 - run: | - set -e - pip list - # adjust versions according installed Torch version - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt - pip install -r requirements/pytorch/devel.txt -r requirements/pytorch/strategies.txt --find-links https://download.pytorch.org/whl/torch_stable.html - # set a per-test timeout of 2.5 minutes to fail sooner; this aids with hanging tests - pip install pytest-timeout - pip list - # sanity check - python requirements/pytorch/check-avail-extras.py - - - name: Pull legacy checkpoints - run: bash 
.actions/pull_legacy_checkpoints.sh - - - name: Testing PyTorch - working-directory: tests/tests_pytorch - run: coverage run --source pytorch_lightning -m pytest -v --timeout 150 --durations=50 --junitxml=results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml - - - name: Upload pytest results - uses: actions/upload-artifact@v3 - with: - name: unittest-results-${{ runner.os }}-torch${{ matrix.pytorch-version }} - path: tests/tests_pytorch/results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml - if: failure() - - - name: Statistics - if: success() - working-directory: tests/tests_pytorch - run: | - coverage report - coverage xml - - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v3 - if: success() - # see: https://github.com/actions/toolkit/issues/399 - continue-on-error: true - with: - token: ${{ secrets.CODECOV_TOKEN }} - file: tests/tests_pytorch/coverage.xml - flags: cpu,pytest,torch${{ matrix.pytorch-version }} - name: CPU-coverage - fail_ci_if_error: false diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 6945c4909c..c7b2aa48cb 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -29,17 +29,26 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2022, macOS-11] - python-version: ["3.7", "3.10"] # minimum, maximum - requires: ["oldest", "latest"] - release: ["stable"] - exclude: - # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. - # TODO: Remove the exclusion when dropping PyTorch 1.9 support. 
- - {python-version: "3.10", requires: "oldest"} - # TODO: re-enable RC testing - # include: - # - {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} + include: + - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "ubuntu-20.04", python-version: "3.7", pytorch-version: "1.12"} + - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.9"} # ex-conda + - {os: "ubuntu-20.04", python-version: "3.8", pytorch-version: "1.10"} # ex-conda + - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.11"} # ex-conda + - {os: "ubuntu-20.04", python-version: "3.9", pytorch-version: "1.12"} # ex-conda + - {os: "ubuntu-20.04", python-version: "3.10", pytorch-version: "1.12"} + - {os: "macos-11", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "macos-11", python-version: "3.7", pytorch-version: "1.12"} + - {os: "macos-11", python-version: "3.8", pytorch-version: "1.10"} + - {os: "macos-11", python-version: "3.9", pytorch-version: "1.11"} + - {os: "macos-11", python-version: "3.10", pytorch-version: "1.12"} + - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.9", requires: "oldest"} + - {os: "windows-2022", python-version: "3.7", pytorch-version: "1.12"} + - {os: "windows-2022", python-version: "3.8", pytorch-version: "1.10"} + - {os: "windows-2022", python-version: "3.9", pytorch-version: "1.11"} + - {os: "windows-2022", python-version: "3.10", pytorch-version: "1.12"} + # TODO: re-enable RC testing + # - {os: ubuntu-20.04, python-version: "3.10", release: "pre"} timeout-minutes: 40 @@ -59,7 +68,6 @@ jobs: pip --version pip install -q -r .actions/requirements.txt - # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS if: runner.os == 'macOS' run: | @@ -75,8 +83,6 @@ jobs: run: | python .actions/assistant.py replace_oldest_ver - # Note: This uses an internal pip API and may not always work - # 
https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - name: Get pip cache dir id: pip-cache run: echo "::set-output name=dir::$(pip cache dir)" @@ -97,9 +103,13 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 run: | - flag=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1) - url=$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) - pip install -e .[test] --upgrade $flag --find-links "https://download.pytorch.org/whl/${url}" + # adjust PyTorch versions in requirements files + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/base.txt ${{ matrix.pytorch-version }} + python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt ${{ matrix.pytorch-version }} + # install PL and optional dependencies for testing + pre_option=$(python -c "print('--pre' if '${{matrix.release}}' == 'pre' else '')" 2>&1) + url=https://download.pytorch.org/whl/$(python -c "print('test/cpu/torch_test.html' if '${{matrix.release}}' == 'pre' else 'cpu/torch_stable.html')" 2>&1) + pip install -e . 
-r requirements/pytorch/test.txt -r ./requirements/pytorch/extra.txt $pre_option -f ${url} pip list shell: bash @@ -107,14 +117,6 @@ jobs: working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - - name: Install extra dependencies - run: | - # adjust versions according installed Torch version - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt - pip install -r ./requirements/pytorch/extra.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - pip list - shell: bash - - name: Reinstall Horovod if necessary if: runner.os != 'windows' env: @@ -130,6 +132,7 @@ jobs: fi horovodrun --check-build python -c "import horovod.torch" + pip list shell: bash - name: Cache datasets @@ -139,7 +142,9 @@ jobs: key: pl-dataset - name: Sanity check - run: python requirements/pytorch/check-avail-extras.py + run: | + python -c "import torch; assert torch.__version__.startswith('${{ matrix.pytorch-version }}')" + python requirements/pytorch/check-avail-extras.py - name: Testing Warnings # the stacklevel can only be set on >=3.7 @@ -160,12 +165,6 @@ jobs: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - - name: Prepare Examples - run: | - # adjust versions according installed Torch version - python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt - pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - - name: Run Examples working-directory: ./examples run: python -m pytest test_pl_examples.py -v --durations=10 diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile deleted file mode 100644 index 12953af627..0000000000 --- a/dockers/base-conda/Dockerfile +++ /dev/null @@ -1,160 +0,0 @@ -# 
Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG CUDA_VERSION=11.3.1 - -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 - -ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.9 -ARG CONDA_VERSION=4.11.0 - -SHELL ["/bin/bash", "-c"] -# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ -ENV \ - PATH="$PATH:/root/.local/bin" \ - DEBIAN_FRONTEND=noninteractive \ - TZ=Europe/Prague \ - # CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \ - MKL_THREADING_LAYER=GNU - -RUN \ - # TODO: Remove the manual key installation once the base image is updated. - # https://github.com/NVIDIA/nvidia-docker/issues/1631 - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \ - apt-get update -qq --fix-missing && \ - NCCL_VER=$(dpkg -s libnccl2 | grep '^Version:' | awk -F ' ' '{print $2}' | awk -F '-' '{print $1}' | grep -ve '^\s*$') && \ - CUDA_VERSION_MM="${CUDA_VERSION%.*}" && \ - MAX_ALLOWED_NCCL=2.11.4 && \ - TO_INSTALL_NCCL=$(echo -e "$MAX_ALLOWED_NCCL\n$NCCL_VER" | sort -V | head -n1)-1+cuda${CUDA_VERSION_MM} && \ - apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - wget \ - curl \ - unzip \ - ca-certificates \ - libopenmpi-dev \ - libnccl2=$TO_INSTALL_NCCL \ - libnccl-dev=$TO_INSTALL_NCCL && \ -# Install conda and python. -# NOTE new Conda does not forward the exit status... 
https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b && \ - rm ~/miniconda.sh && \ -# Cleaning - apt-get autoremove -y && \ - apt-get clean && \ - rm -rf /root/.cache && \ - rm -rf /var/lib/apt/lists/* - -ENV \ - PATH="/root/miniconda3/bin:$PATH" \ - LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" \ - CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \ - MKL_THREADING_LAYER=GNU \ - # MAKEFLAGS="-j$(nproc)" \ - MAKEFLAGS="-j2" \ - TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5;8.0" \ - CONDA_ENV=lightning - -COPY environment.yml environment.yml - -# conda init -RUN \ - conda update -n base -c defaults conda && \ - CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \ - conda create -y --name $CONDA_ENV \ - python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision cudatoolkit=${CUDA_VERSION_MM} \ - -c nvidia -c pytorch -c pytorch-test && \ - conda init bash && \ - # NOTE: this requires that the channel is presented in the yaml before packages \ - printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ - python prune.py && \ - rm prune.py && \ - cat environment.yml && \ - conda env update --name $CONDA_ENV --file environment.yml && \ - conda install "Pillow<9.0" && \ - conda clean -ya && \ - rm environment.yml - -ENV \ - PATH=/root/miniconda3/envs/${CONDA_ENV}/bin:$PATH \ - LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" - -COPY ./requirements/pytorch/ ./requirements/pytorch/ -COPY ./.actions/assistant.py assistant.py - -RUN \ - pip list | grep torch && \ - python -c "import torch; print(torch.__version__)" && \ - pip install -q fire && \ - python assistant.py requirements_prune_pkgs torch,torchvision 
&& \ - # Install remaining requirements - pip install --no-cache-dir -r requirements/pytorch/base.txt \ - -r requirements/pytorch/extra.txt \ - -r requirements/pytorch/examples.txt && \ - rm assistant.py - -ENV \ - # if you want this environment to be the default o \ne, uncomment the following line: - CONDA_DEFAULT_ENV=${CONDA_ENV} \ - HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \ - HOROVOD_GPU_OPERATIONS=NCCL \ - HOROVOD_WITH_PYTORCH=1 \ - HOROVOD_WITHOUT_TENSORFLOW=1 \ - HOROVOD_WITHOUT_MXNET=1 \ - HOROVOD_WITH_GLOO=1 \ - HOROVOD_WITH_MPI=1 - -RUN \ - HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \ - export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \ - pip install --no-cache-dir -r requirements/pytorch/strategies.txt - -RUN \ - CUDA_VERSION_MAJOR=$(python -c "import torch ; print(torch.version.cuda.split('.')[0])") && \ - py_ver=$(python -c "print(int('$PYTHON_VERSION'.split('.') >= '3.9'.split('.')))") && \ - # install DALI, needed for examples - # todo: waiting for 1.4 - https://github.com/NVIDIA/DALI/issues/3144#issuecomment-877386691 - if [ $py_ver -eq "0" ]; then \ - pip install --extra-index-url https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda${CUDA_VERSION_MAJOR}0>1.0" ; \ - python -c 'from nvidia.dali.pipeline import Pipeline' ; \ - fi - -RUN \ - # install NVIDIA apex - pip install --no-cache-dir --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \ - python -c "from apex import amp" - -RUN \ - # install Bagua - CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [116,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ - pip install "bagua-cuda$CUDA_VERSION_BAGUA" && \ - if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ - python -c "import bagua; print(bagua.__version__)" - -RUN \ - # Show what 
we have - pip --version && \ - conda info && \ - pip list && \ - python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ - python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ - python requirements/pytorch/check-avail-extras.py && \ - python requirements/pytorch/check-avail-strategies.py && \ - rm -rf requirements/