From ab508dae0c825a9e6d8ae9e3a3e7ac7a2a615e12 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 30 Sep 2020 14:36:02 +0200 Subject: [PATCH] run TPU tests with multiple versions (#3024) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * rename * multi build * multi build * copy * copy * copy * copy * copy * copy * clean * note * docker * formatting Co-authored-by: Adrian Wälchli Co-authored-by: Adrian Wälchli Co-authored-by: William Falcon --- .circleci/config.yml | 27 +++++++++++---------------- .github/workflows/ci_test-tpu.yml | 8 ++++++-- .mergify.yml | 2 +- README.md | 3 +++ dockers/base-xla/Dockerfile | 10 ++++------ dockers/tpu-tests/Dockerfile | 31 ++++++++----------------------- 6 files changed, 33 insertions(+), 48 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 0fd5dd3e22..f9043588ca 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -35,12 +35,8 @@ references: name: Build and push Docker image command: | gcloud --quiet auth configure-docker - cd dockers/tpu-tests - # TODO: How to find the GITHUB_REF in CircleCI? - # $CI_PULL_REQUEST seems to be of form: https://github.com/org/repo-name/pull/11. - # Grab the last bit, e.g. pull/11, convert to pull/11/head, and use it - # for the GITHUB_REF so Docker can pull the latest pending code in PR. - if [ -z "$CI_PULL_REQUEST" ]; then docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" .; else git_ref=$(echo "$CI_PULL_REQUEST" | sed "s/.*pytorch-lightning\///")/head && docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=$git_ref" .; fi + #cd dockers/tpu-tests + docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f ./dockers/tpu-tests/Dockerfile --build-arg "PYTHON_VERSION=$PYTHON_VER" . docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" deploy_cluster: &deploy_cluster @@ -99,9 +95,13 @@ references: jobs: TPU-tests: + parameters: + python: + type: string docker: - image: circleci/python:3.7 environment: + - PYTHON_VER: << parameters.python >> - MAX_CHECKS: 240 - CHECK_SPEEP: 5 steps: @@ -148,19 +148,14 @@ jobs: workflows: version: 2 - build: + tpu-tests: jobs: - build-Docs - TPU-tests: - filters: - branches: - # https://discuss.circleci.com/t/create-separate-steps-jobs-for-pr-forks-versus-branches/13419/4 - #only: - # # only from forks - # - /^pull\/.\d+$/ - ignore: - - master - cleanup: + matrix: + parameters: + python: ["3.6", "3.7"] + tpu-cleanup: triggers: - schedule: # The cron format is: diff --git a/.github/workflows/ci_test-tpu.yml b/.github/workflows/ci_test-tpu.yml index 5295d1fb44..062b57955a 100644 --- a/.github/workflows/ci_test-tpu.yml +++ b/.github/workflows/ci_test-tpu.yml @@ -20,6 +20,10 @@ jobs: setup-build-publish-deploy: name: tpu-testing-job runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + python-version: [3.6, 3.7] # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 50 @@ -61,8 +65,8 @@ jobs: shell: bash - name: Build and Push Docker Image run: | - cd dockers/tpu-tests - docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" . + #cd dockers/tpu-tests + docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f ./dockers/tpu-tests/Dockerfile --build-arg "PYTHON_VERSION=${{ matrix.python-version }}" . docker push "$IMAGE:$GITHUB_RUN_ID" shell: bash diff --git a/.mergify.yml b/.mergify.yml index 440314b69f..44c48f2ddc 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -24,7 +24,7 @@ pull_request_rules: # no requested chnages from any reviewer - "#changes-requested-reviews-by=0" # this serves as ALL check has to pass as we have actually around 40 tests in total - - "#status-success>=53" + - "#status-success>=54" # this is just in case since we rely on GPU tests (note: redundand to the above) - status-success=continuous-integration/drone/pr - "status-success=ci/circleci: TPU-tests" diff --git a/README.md b/README.md index 50f6d32a09..4075b6ebb0 100644 --- a/README.md +++ b/README.md @@ -82,6 +82,8 @@ Get started with our [3 steps guide](https://pytorch-lightning.readthedocs.io/en | System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) | 1.7 (nightly) | | :---: | :---: | :---: | :---: | :---: | :---: | | Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | +| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://35.192.60.23/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://35.192.60.23/PyTorchLightning/pytorch-lightning) | - | +| Linux py3.6 / py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.7 [GPUs**] | - | - |[![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | @@ -91,6 +93,7 @@ Get started with our [3 steps guide](https://pytorch-lightning.readthedocs.io/en - _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_ - _\** tests run on two NVIDIA K80_ - _\*** tests run on Google GKE TPUv2/3_ +- _TPU w/ py3.6/py3.7 means we support Colab and Kaggle env._ diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 68d5666fcd..9e659338ba 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -69,15 +69,13 @@ COPY ./ ./pytorch-lightning/ RUN \ # Install PL dependencies cd pytorch-lightning && \ - # drop Torch + # drop Torch as it was installed with XLA python -c "fname = \"./requirements/base.txt\" ; lines = [line for line in open(fname).readlines() if not line.startswith(\"torch\")] ; open(fname, \"w\").writelines(lines)" && \ - pip install --requirement ./requirements/base.txt --upgrade-strategy only-if-needed && \ - # drop Horovod + # drop Horovod as it is not needed python -c "fname = \"./requirements/extra.txt\" ; lines = [line for line in open(fname).readlines() if not line.startswith(\"horovod\")] ; open(fname, \"w\").writelines(lines)" && \ - pip install --requirement ./requirements/extra.txt --upgrade-strategy only-if-needed && \ - # drop TorchVision + # drop TorchVision as it was installed with XLA python -c "fname = \"./requirements/examples.txt\" ; lines = [line for line in open(fname).readlines() if not line.startswith(\"torchvision\")] ; open(fname, \"w\").writelines(lines)" && \ - pip install --requirement ./requirements/examples.txt --upgrade-strategy only-if-needed && \ + pip install --requirement ./requirements/devel.txt --upgrade-strategy only-if-needed && \ cd .. && \ rm -rf pytorch-lightning && \ rm -rf /root/.cache diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index ab32a3192b..e4f8ef49df 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -1,38 +1,23 @@ ARG PYTHON_VERSION=3.7 +ARG PYTORCH_VERSION=1.6 -FROM pytorchlightning/pytorch_lightning:XLA-extras-py${PYTHON_VERSION} - -# Build args. -ARG GITHUB_REF=refs/heads/master -ARG TEST_IMAGE=0 - -# This Dockerfile installs pytorch/xla 3.7 wheels. There are also 3.6 wheels available; see below. +FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} #SHELL ["/bin/bash", "-c"] -# Install pytorch-lightning at the current PR, plus dependencies. -RUN git clone https://github.com/PyTorchLightning/pytorch-lightning.git && \ - cd pytorch-lightning && \ - echo $GITHUB_REF && \ - git fetch origin $GITHUB_REF:CI && \ - git checkout CI && \ - pip install --requirement ./requirements/base.txt --no-cache-dir +COPY ./ ./pytorch-lightning/ -# If using this image for tests, intall more dependencies and don"t delete -# the source code where the tests live. +# If using this image for tests, intall more dependencies and don"t delete the source code where the tests live. RUN \ + # Install pytorch-lightning at the current PR, plus dependencies. + #pip install -r pytorch-lightning/requirements/base.txt --no-cache-dir && \ # drop Horovod #python -c "fname = 'pytorch-lightning/requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if not line.startswith('horovod')] ; open(fname, 'w').writelines(lines)" && \ - #pip install --requirement pytorch-lightning/requirements/extra.txt --no-cache-dir && \ - if [ $TEST_IMAGE -eq 1 ] ; then \ - pip install --requirement pytorch-lightning/requirements/test.txt --no-cache-dir ; \ - else \ - rm -rf pytorch-lightning ; \ - fi + pip install -r pytorch-lightning/requirements/devel.txt --no-cache-dir --upgrade-strategy only-if-needed #RUN python -c "import pytorch_lightning as pl; print(pl.__version__)" -COPY docker-entrypoint.sh /usr/local/bin/ +COPY ./dockers/tpu-tests/docker-entrypoint.sh /usr/local/bin/ RUN chmod +x /usr/local/bin/docker-entrypoint.sh ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]