Enable the CUDA docker build in CI and add a PyTorch 1.7 (nightly) channel.

- CI: build dockers/base-cuda in ci_dockers.yml, pass PYTORCH_CHANNEL as a
  build arg, and bump the codecov / mergify expected-check counts.
- dockers/base-cuda: build from nvidia/cuda plus Miniconda instead of the
  pytorch/pytorch image; run as the non-root user "flash".
- dockers/conda: make the conda channel configurable via PYTORCH_CHANNEL.
- requirements/examples.txt: drop the torchvision upper bound.

NOTE(review): indentation and blank context lines were restored from a
whitespace-collapsed copy of this patch; verify against the target tree
before applying.

diff --git a/.codecov.yml b/.codecov.yml
index 7f8e8ca280..cc6a5e6a2b 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -23,7 +23,7 @@ codecov:
   strict_yaml_branch: "yaml-config"
   require_ci_to_pass: yes
   notify:
-    after_n_builds: 22
+    after_n_builds: 23
     wait_for_ci: yes
   # https://docs.codecov.io/docs/codecov-yaml#section-expired-reports
   max_report_age: off
@@ -64,4 +64,4 @@ comment:
   layout: header, diff
   require_changes: false
   behavior: default  # update if exists else create new
-  after_n_builds: 22
+  after_n_builds: 23
diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml
index 0ea85eacc3..5a1de223e7 100644
--- a/.github/workflows/ci_dockers.yml
+++ b/.github/workflows/ci_dockers.yml
@@ -49,23 +49,28 @@ jobs:
           push: false
         timeout-minutes: 40

-# TODO: uncomment this with fixing CUDA docker, no need to increase mergify count
-#  build-cuda:
-#    runs-on: ubuntu-20.04
-#    strategy:
-#      fail-fast: false
-#      matrix:
-#        python_version: [3.7]
-#        pytorch_version: [1.5]
-#    steps:
-#      - name: Checkout
-#        uses: actions/checkout@v2
-#
-#      - name: Publish Master to Docker
-#        # publish master
-#        uses: docker/build-push-action@v1.1.0
-#        with:
-#          dockerfile: dockers/base-cuda/Dockerfile
-#          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }}
-#          push: false
-#        timeout-minutes: 40
+  build-cuda:
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python_version: [3.7]
+        pytorch_version: [1.6]
+        pytorch_channel: [pytorch]
+        # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
+        include:
+          - python_version: 3.7
+            pytorch_version: 1.7
+            pytorch_channel: pytorch-nightly
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v2
+
+      - name: Build Docker
+        # build only to validate the Dockerfile; nothing is pushed (push: false)
+        uses: docker/build-push-action@v1.1.0
+        with:
+          dockerfile: dockers/base-cuda/Dockerfile
+          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
+          push: false
+        timeout-minutes: 40
diff --git a/.github/workflows/ci_pt-conda.yml b/.github/workflows/ci_pt-conda.yml
index 1bf6151244..48d3d58024 100644
--- a/.github/workflows/ci_pt-conda.yml
+++ b/.github/workflows/ci_pt-conda.yml
@@ -16,21 +16,24 @@ jobs:
       matrix:
         os: [ubuntu-20.04]
         python-version: [3.7]
-        # todo: add nightly versions
-        pytorch-version: [1.3, 1.4, 1.5, 1.6]  # , 1.7
+        pytorch-version: [1.3, 1.4, 1.5, 1.6]  # , 1.7  # TODO: fix failing test and add 1.7 (nightly) add badge

     # Timeout: https://stackoverflow.com/a/59076067/4521646
     timeout-minutes: 35
     steps:
     - uses: actions/checkout@v2

-    - name: Setup pyTorch
+    - name: Setup PyTorch nightly channel
+      if: matrix.pytorch-version >= 1.7
+      run: |
+        # NOTE: only the first 'pytorch' is replaced, so the channel must be listed in the yaml before the packages
+        python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', 'pytorch-nightly', 1) ; open(fname, 'w').write(req)"
+
+    - name: Setup PyTorch version
       run: |
         python -c "fname = 'environment.yml' ; req = open(fname).read().replace('torch>=1.3', 'torch=${{ matrix.pytorch-version }}') ; open(fname, 'w').write(req)"
         cat environment.yml

-    # TODO: set source for nightly
-
     - name: Cache conda
       uses: actions/cache@v2
       with:
diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml
index 744a8edf48..a3d04538b5 100644
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@@ -83,7 +83,13 @@ jobs:
       fail-fast: false
       matrix:
         python_version: [3.7]
-        pytorch_version: [1.3, 1.4, 1.5, 1.6.0]
+        pytorch_version: [1.3, 1.4, 1.5, 1.6]
+        pytorch_channel: [pytorch]
+        # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
+        include:
+          - python_version: 3.7
+            pytorch_version: 1.7
+            pytorch_channel: pytorch-nightly
     steps:
       - name: Checkout
         uses: actions/checkout@v2
@@ -96,6 +102,6 @@ jobs:
           username: ${{ secrets.DOCKER_USERNAME }}
           password: ${{ secrets.DOCKER_PASSWORD }}
           dockerfile: dockers/base-cuda/Dockerfile
-          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }}
+          build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
           tags: "base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
         timeout-minutes: 40
diff --git a/.mergify.yml b/.mergify.yml
index df45969ecf..d58afc056b 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -24,7 +24,7 @@ pull_request_rules:
       # no requested chnages from any reviewer
       - "#changes-requested-reviews-by=0"
       # this serves as ALL check has to pass as we have actually around 40 tests in total
-      - "#status-success>=44"
+      - "#status-success>=47"
       # this is just in case since we rely on GPU tests (note: redundand to the above)
       - status-success=continuous-integration/drone/pr
       - "status-success=ci/circleci: TPU-tests"
diff --git a/README.md b/README.md
index 0a2a071d0d..7ce30605a8 100644
--- a/README.md
+++ b/README.md
@@ -72,12 +72,12 @@ Get started with our [3 steps guide](https://pytorch-lightning.readthedocs.io/en

 | System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) |
 | :---: | :---: | :---: | :---: | :---: |
-| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
+| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
 | Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://35.192.60.23/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://35.192.60.23/PyTorchLightning/pytorch-lightning) |
 | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) |
 | Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
 | OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
-| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22)
+| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |

 - _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_
 - _\** tests run on two NVIDIA K80_
diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile
index fb0a00f6e3..97100d05c5 100644
--- a/dockers/base-cuda/Dockerfile
+++ b/dockers/base-cuda/Dockerfile
@@ -1,59 +1,98 @@
 # Existing images:
-# --build-arg TORCH_VERSION=1.6.0 --build-arg CUDA_VERSION=10.1
-# --build-arg TORCH_VERSION=1.5 --build-arg CUDA_VERSION=10.1
-# --build-arg TORCH_VERSION=1.4 --build-arg CUDA_VERSION=10.1
-# --build-arg TORCH_VERSION=1.3 --build-arg CUDA_VERSION=10.1
-# --build-arg TORCH_VERSION=1.2 --build-arg CUDA_VERSION=10.0
-# --build-arg TORCH_VERSION=1.1.0 --build-arg CUDA_VERSION=10.0 --build-arg CUDNN_VERSION=7.5
+# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg PYTORCH_CHANNEL=pytorch-nightly --build-arg CUDA_VERSION=10.1
+# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
+# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
+# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
+# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1

-ARG TORCH_VERSION=1.6.0
-ARG CUDA_VERSION=10.1
 ARG CUDNN_VERSION=7
+ARG CUDA_VERSION=10.1

-# TODO: make his imagge from pure Ubuntu + install all NVIDIA drivers
-# FROM nvidia/cuda:${CUDA_VERSION}-base
-FROM pytorch/pytorch:${TORCH_VERSION}-cuda${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel
+FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel
+
+ARG PYTHON_VERSION=3.7
+ARG PYTORCH_VERSION=1.6
+ARG PYTORCH_CHANNEL=pytorch
+ARG CONDA_VERSION=4.7.12

 SHELL ["/bin/bash", "-c"]

-ENV HOROVOD_GPU_ALLREDUCE=NCCL
-ENV HOROVOD_GPU_BROADCAST=NCCL
+ENV HOROVOD_GPU_OPERATIONS=NCCL
 ENV HOROVOD_WITH_PYTORCH=1
 ENV HOROVOD_WITHOUT_TENSORFLOW=1
 ENV HOROVOD_WITHOUT_MXNET=1
 ENV HOROVOD_WITH_GLOO=1
 ENV HOROVOD_WITHOUT_MPI=1
 ENV PATH="$PATH:/root/.local/bin"
-ENV MAKEFLAGS="-j$(nproc)"
+# TODO: uncomment in horovod next release, https://github.com/horovod/horovod/pull/2239
+# ENV MAKEFLAGS="-j$(nproc)"

-COPY ./tests/install_AMP.sh install_AMP.sh
-COPY ./requirements/base.txt requirements.txt
-COPY ./requirements/extra.txt requirements-extra.txt
-COPY ./requirements/test.txt requirements-tests.txt
-COPY ./requirements/examples.txt requirements-examples.txt
-
-RUN apt-get update && \
-    apt-get install -y \
-        git \
-        cmake \
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        cmake \
+        git \
+        curl \
+        ca-certificates \
     && \
-
-# Install AMP
-    bash install_AMP.sh && \
-# Install all requirements
-    pip install -r requirements.txt && \
-    # HOROVOD_BUILD_ARCH_FLAGS="-mfma" && \
-    pip install -r requirements-extra.txt && \
-    pip install -r requirements-examples.txt && \
-    #pip install -r requirements-tests.txt && \
-    rm install_AMP.sh && \
-    rm requirements* && \
-
-# Cleaning
+    # Cleaning
     apt-get autoremove -y && \
     apt-get clean && \
     rm -rf /root/.cache && \
+    rm -rf /var/lib/apt/lists/*

-# Show what we have
+# add non-root user
+RUN useradd --create-home --shell /bin/bash flash
+
+USER flash
+ENV CONDA_ENV=lightning
+ENV WORKDIR=/home/flash
+WORKDIR $WORKDIR
+
+COPY --chown=flash environment.yml environment.yml
+
+# install conda and python
+RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \
+    chmod +x ~/miniconda.sh && \
+    ~/miniconda.sh -b -p ${WORKDIR}/miniconda && \
+    rm ~/miniconda.sh
+
+# add conda to path
+ENV PATH="${WORKDIR}/miniconda/bin:$PATH"
+ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/lib:$LD_LIBRARY_PATH"
+ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda"
+
+# conda init
+RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION pytorch=$PYTORCH_VERSION torchvision cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \
+    conda init bash && \
+    # NOTE: only the first 'pytorch' is replaced, so the channel must be listed in the yaml before the packages
+    python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \
+    conda env update --file environment.yml && \
+    conda clean -ya && \
+    rm environment.yml && \
+    # Disable cache
+    conda install "pip>20.1" -y && \
+    pip config set global.cache-dir false
+
+ENV PATH="${WORKDIR}/miniconda/envs/${CONDA_ENV}/bin:$PATH"
+ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH"
+# mark this environment as the default conda environment
+ENV CONDA_DEFAULT_ENV=${CONDA_ENV}
+
+COPY --chown=flash ./requirements/test.txt requirements-tests.txt
+COPY --chown=flash ./requirements/examples.txt requirements-examples.txt
+
+RUN \
+    echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
+    echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \
+    source ~/.bashrc && \
+    # Install all requirements
+    pip install -r requirements-tests.txt --upgrade-strategy only-if-needed && \
+    pip install -r requirements-examples.txt --upgrade-strategy only-if-needed && \
+    rm requirements* && \
+    # Show what we have
     pip --version && \
+    conda info && \
+    conda list && \
     pip list
+
+CMD ["/bin/bash"]
diff --git a/dockers/conda/Dockerfile b/dockers/conda/Dockerfile
index 266fec5b63..1dbd0b6876 100644
--- a/dockers/conda/Dockerfile
+++ b/dockers/conda/Dockerfile
@@ -4,6 +4,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-devel
 # install versions
 ARG PYTHON_VERSION=3.7
 ARG PYTORCH_VERSION=1.4
+ARG PYTORCH_CHANNEL=pytorch
 ARG LIGHTNING_VERSION=""
 # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
 ARG CONDA_VERSION=4.7.12
@@ -47,7 +48,7 @@ COPY --chown=flash environment.yml environment.yml
 RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION && \
     conda init bash && \
     # conda install -y python=$PYTHON_VERSION && \
-    conda install pytorch=$PYTORCH_VERSION cudatoolkit=$CUDA_VERSION --channel=pytorch && \
+    conda install pytorch=$PYTORCH_VERSION cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \
     conda env update --file environment.yml && \
     rm environment.yml && \

diff --git a/requirements/examples.txt b/requirements/examples.txt
index e126b74504..24506bbba7 100644
--- a/requirements/examples.txt
+++ b/requirements/examples.txt
@@ -1,2 +1,2 @@
-torchvision>=0.4.0, <0.7
+torchvision>=0.4.0
 gym>=0.17.0
\ No newline at end of file