From 37a59be21b73baffd68d0cc16bc31dd508d2b4c8 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 23 Sep 2020 01:41:35 +0200 Subject: [PATCH] build more docker configs (#3533) * update build cases * list * matrix * matrix * builds * docker * -j1 * -q * -q * sep * docker * docker * mergify * -j1 * -j1 * horovod * copy --- .github/workflows/ci_dockers.yml | 19 ++++++----- .github/workflows/docker-builds.yml | 28 ++++++++++------ .mergify.yml | 2 +- dockers/base-cuda/Dockerfile | 50 ++++++++++++++++------------- dockers/conda/Dockerfile | 19 +++++------ environment.yml | 4 +-- 6 files changed, 68 insertions(+), 54 deletions(-) diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 5a1de223e7..6f32ac6a8f 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -14,8 +14,8 @@ jobs: strategy: fail-fast: false matrix: - python_version: [3.7] - pytorch_version: [1.6] + python_version: [3.7, 3.8] + pytorch_version: [1.5] steps: - name: Checkout uses: actions/checkout@v2 @@ -27,7 +27,7 @@ jobs: dockerfile: dockers/conda/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }} push: false - timeout-minutes: 40 + timeout-minutes: 50 build-XLA: runs-on: ubuntu-20.04 @@ -35,7 +35,7 @@ jobs: fail-fast: false matrix: python_version: [3.7] - xla_version: ["nightly"] + xla_version: [1.6, "nightly"] steps: - name: Checkout uses: actions/checkout@v2 @@ -47,21 +47,20 @@ jobs: dockerfile: dockers/base-xla/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},XLA_VERSION=${{ matrix.xla_version }} push: false - timeout-minutes: 40 + timeout-minutes: 50 build-cuda: runs-on: ubuntu-20.04 strategy: fail-fast: false matrix: - python_version: [3.7] - pytorch_version: [1.6] - pytorch_channel: [pytorch] - # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations include: - python_version: 3.7 pytorch_version: 1.7 pytorch_channel: pytorch-nightly + - python_version: 3.8 + pytorch_version: 1.5 + pytorch_channel: pytorch steps: - name: Checkout uses: actions/checkout@v2 @@ -73,4 +72,4 @@ jobs: dockerfile: dockers/base-cuda/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }} push: false - timeout-minutes: 40 + timeout-minutes: 50 diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index a3d04538b5..830cc35f4d 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -23,6 +23,7 @@ jobs: - name: Checkout uses: actions/checkout@v2 + # TODO: move this to nightly events - name: Publish Master to Docker # publish master uses: docker/build-push-action@v1.1.0 @@ -34,7 +35,7 @@ jobs: dockerfile: dockers/conda/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }} tags: "nightly-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" - timeout-minutes: 40 + timeout-minutes: 55 - name: Get release version if: startsWith(github.ref, 'refs/tags/') || github.event_name == 'release' @@ -52,7 +53,7 @@ jobs: dockerfile: dockers/conda/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }} tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" - timeout-minutes: 40 + timeout-minutes: 55 build-XLA: runs-on: ubuntu-20.04 @@ -75,20 +76,27 @@ jobs: dockerfile: dockers/base-xla/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},XLA_VERSION=${{ matrix.xla_version }} tags: "base-xla-py${{ matrix.python_version }}-torch${{ matrix.xla_version }}" - timeout-minutes: 35 + timeout-minutes: 55 build-cuda: runs-on: ubuntu-20.04 strategy: fail-fast: false matrix: - python_version: [3.7] - pytorch_version: [1.3, 1.4, 1.5, 1.6] - pytorch_channel: [pytorch] + python_version: [3.6, 3.7, 3.8] + pytorch_version: [1.3, 1.4, 1.5, 1.6, 1.7] + pytorch_channel: ["pytorch", "pytorch-nightly"] # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations - include: - - python_version: 3.7 - pytorch_version: 1.7 + exclude: + - pytorch_version: 1.7 + pytorch_channel: pytorch + - pytorch_version: 1.3 + pytorch_channel: pytorch-nightly + - pytorch_version: 1.4 + pytorch_channel: pytorch-nightly + - pytorch_version: 1.5 + pytorch_channel: pytorch-nightly + - pytorch_version: 1.6 pytorch_channel: pytorch-nightly steps: - name: Checkout @@ -104,4 +112,4 @@ jobs: dockerfile: dockers/base-cuda/Dockerfile build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }} tags: "base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" - timeout-minutes: 40 + timeout-minutes: 55 diff --git a/.mergify.yml b/.mergify.yml index d58afc056b..14cc5af1a9 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -24,7 +24,7 @@ pull_request_rules: # no requested chnages from any reviewer - "#changes-requested-reviews-by=0" # this serves as ALL check has to pass as we have actually around 40 tests in total - - "#status-success>=47" + - "#status-success>=50" # this is just in case since we rely on GPU tests (note: redundand to the above) - status-success=continuous-integration/drone/pr - "status-success=ci/circleci: TPU-tests" diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 97100d05c5..96ea1f71f5 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -9,6 +9,7 @@ ARG CUDNN_VERSION=7 ARG CUDA_VERSION=10.1 FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel +# FROM nvidia/cuda:${CUDA_VERSION}-devel ARG PYTHON_VERSION=3.7 ARG PYTORCH_VERSION=1.6 @@ -17,22 +18,14 @@ ARG CONDA_VERSION=4.7.12 SHELL ["/bin/bash", "-c"] -ENV HOROVOD_GPU_OPERATIONS=NCCL -ENV HOROVOD_WITH_PYTORCH=1 -ENV HOROVOD_WITHOUT_TENSORFLOW=1 -ENV HOROVOD_WITHOUT_MXNET=1 -ENV HOROVOD_WITH_GLOO=1 -ENV HOROVOD_WITHOUT_MPI=1 ENV PATH="$PATH:/root/.local/bin" -# TODO: uncomment in horovod next release, https://github.com/horovod/horovod/pull/2239 -# ENV MAKEFLAGS="-j$(nproc)" RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential \ - cmake \ - git \ - curl \ - ca-certificates \ + build-essential \ + cmake \ + git \ + curl \ + ca-certificates \ && \ # Cleaning apt-get autoremove -y && \ @@ -61,31 +54,44 @@ ENV PATH="${WORKDIR}/miniconda/bin:$PATH" ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/lib:$LD_LIBRARY_PATH" ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" +ENV HOROVOD_GPU_OPERATIONS=NCCL +ENV HOROVOD_WITH_PYTORCH=1 +ENV HOROVOD_WITHOUT_TENSORFLOW=1 +ENV HOROVOD_WITHOUT_MXNET=1 +ENV HOROVOD_WITH_GLOO=1 +ENV HOROVOD_WITHOUT_MPI=1 +# TODO: uncomment in horovod next release, https://github.com/horovod/horovod/pull/2239 +# ENV MAKEFLAGS="-j$(nproc)" + # conda init RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION pytorch=$PYTORCH_VERSION torchvision cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ + python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([l for l in req if 'horovod' not in l])" && \ conda env update --file environment.yml && \ conda clean -ya && \ - rm environment.yml && \ - # Disable cache - conda install "pip>20.1" -y && \ - pip config set global.cache-dir false + rm environment.yml ENV PATH ${WORKDIR}/miniconda/envs/${CONDA_ENV}/bin:$PATH ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" # if you want this environment to be the default one, uncomment the following line: ENV CONDA_DEFAULT_ENV=${CONDA_ENV} -COPY ./requirements/test.txt requirements-tests.txt -COPY ./requirements/examples.txt requirements-examples.txt +COPY --chown=flash ./requirements/extra.txt requirements-extra.txt +COPY --chown=flash ./requirements/test.txt requirements-tests.txt +COPY --chown=flash ./requirements/examples.txt requirements-examples.txt RUN \ - echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \ - source ~/.bashrc && \ + # Disable cache + pip config set global.cache-dir false && \ + #echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + #echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \ + #source ~/.bashrc && \ + # filter only Horovod + python -c "fname = 'requirements-extra.txt' ; req = open(fname).readlines() ; open(fname, 'w').writelines([l for l in req if 'horovod' in l])" && \ # Install all requirements + pip install --global-option="--quiet" -r requirements-extra.txt && \ pip install -r requirements-tests.txt --upgrade-strategy only-if-needed && \ pip install -r requirements-examples.txt --upgrade-strategy only-if-needed && \ rm requirements* && \ diff --git a/dockers/conda/Dockerfile b/dockers/conda/Dockerfile index 1dbd0b6876..3f6c51c182 100644 --- a/dockers/conda/Dockerfile +++ b/dockers/conda/Dockerfile @@ -3,7 +3,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-devel # install versions ARG PYTHON_VERSION=3.7 -ARG PYTORCH_VERSION=1.4 +ARG PYTORCH_VERSION=1.6 ARG PYTORCH_CHANNEL=pytorch ARG LIGHTNING_VERSION="" # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 @@ -45,16 +45,13 @@ ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" COPY --chown=flash environment.yml environment.yml # conda init -RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION && \ +RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION pytorch=$PYTORCH_VERSION torchvision cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \ conda init bash && \ - # conda install -y python=$PYTHON_VERSION && \ - conda install pytorch=$PYTORCH_VERSION cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \ + # NOTE: this requires that the channel is presented in the yaml before packages + python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ conda env update --file environment.yml && \ - rm environment.yml && \ - -# Disable cache - conda install "pip>20.1" && \ - pip config set global.cache-dir false + conda clean -ya && \ + rm environment.yml ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" # if you want this environment to be the default one, uncomment the following line: @@ -65,6 +62,9 @@ COPY --chown=flash ./ ./pytorch-lightning/ # install dependencies RUN \ + # Disable cache + #conda install "pip>20.1" && \ + #pip config set global.cache-dir false && \ if [ -z $LIGHTNING_VERSION ] ; then \ pip install ./pytorch-lightning --upgrade-strategy only-if-needed ; \ rm -rf pytorch-lightning ; \ @@ -75,6 +75,7 @@ RUN \ RUN python --version && \ pip --version && \ + pip list && \ python -c "import pytorch_lightning as pl; print(pl.__version__)" CMD ["/bin/bash"] diff --git a/environment.yml b/environment.yml index c9856baab7..aa86ce0224 100644 --- a/environment.yml +++ b/environment.yml @@ -23,8 +23,8 @@ channels: - pytorch dependencies: - #- python=3.7.6 - - pip>=20.0.2 + - python>=3.6 + - pip - numpy>=1.16.4 - pytorch>=1.3 - future>=0.17.1