diff --git a/.drone.yml b/.drone.yml index 5c759f042e..bb4d8a74b2 100644 --- a/.drone.yml +++ b/.drone.yml @@ -20,44 +20,21 @@ name: torch-GPU steps: - name: testing - image: pytorchlightning/pytorch_lightning:cuda-extras-py3.7-torch1.5 + image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.5 environment: - SLURM_LOCALID: 0 CODECOV_TOKEN: from_secret: codecov_token MKL_THREADING_LAYER: GNU - HOROVOD_GPU_OPERATIONS: NCCL - HOROVOD_WITH_PYTORCH: 1 - HOROVOD_WITHOUT_TENSORFLOW: 1 - HOROVOD_WITHOUT_MXNET: 1 - HOROVOD_WITH_GLOO: 1 - HOROVOD_WITHOUT_MPI: 1 - - #volumes: - # # Mount pip cache from host - # - name: pip_cache - # path: /opt/conda/lib/python3.7/site-packages commands: - # todo: remove unsets as in correct image Horovod shall be set - - unset HOROVOD_GPU_ALLREDUCE - - unset HOROVOD_GPU_BROADCAST - - export PATH="$PATH:/root/.local/bin" - python --version - - pip install pip -U - pip --version - nvidia-smi - #- bash ./requirements/install_AMP.sh - - apt-get update && apt-get install -y cmake - - pip uninstall -y horovod # todo: this shall not be needed - - pip install -r ./requirements/devel.txt --user -q --upgrade-strategy only-if-needed --no-cache-dir - #- pip install -r ./requirements/docs.txt --user -q - - pip install -r ./requirements/examples.txt --user -q --upgrade-strategy only-if-needed + - pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir - pip list - - python -c "import torch ; print(' & '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) if torch.cuda.is_available() else 'only CPU')" - coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --color=yes --durations=25 # --flake8 - - python -m py.test benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8 + - python -m pytest benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8 #- cd docs; make doctest; make coverage - coverage report # see: 
https://docs.codecov.io/docs/merging-reports @@ -73,8 +50,3 @@ trigger: include: - push - pull_request - -#volumes: -# - name: pip_cache -# host: -# path: /tmp/cache/drone/pip diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 017d4e637b..c8816486f2 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -9,7 +9,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master] jobs: - build-Conda: + build-PL: runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -21,18 +21,16 @@ jobs: uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - - - name: Build Conda Docker + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 + - name: Build PL Docker # publish master uses: docker/build-push-action@v2 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - file: dockers/conda/Dockerfile + file: dockers/release/Dockerfile push: false timeout-minutes: 50 @@ -48,10 +46,8 @@ jobs: uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Build XLA Docker # publish master uses: docker/build-push-action@v2 @@ -70,24 +66,25 @@ jobs: fail-fast: false matrix: include: - #- python_version: 3.7 - # pytorch_version: 1.8 # todo - # pytorch_channel: pytorch-nightly - - python_version: 3.8 + #- python_version: 3.8 + # pytorch_version: 1.7 # todo + - python_version: 3.7 pytorch_version: 1.6 - pytorch_channel: pytorch - python_version: 
3.6 - pytorch_version: 1.5 - pytorch_channel: pytorch + pytorch_version: 1.3 steps: - name: Checkout uses: actions/checkout@v2 - # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 + # for PT 1.3 and 1.4 we need to use CUDA 10.1 + - run: | + cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1) + echo "::set-output name=CUDA::$cuda" + id: extend + # https://github.com/docker/setup-buildx-action + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Build CUDA Docker # publish master uses: docker/build-push-action@v2 @@ -95,8 +92,49 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - PYTORCH_CHANNEL=${{ matrix.pytorch_channel }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile push: false timeout-minutes: 50 + + build-conda: + runs-on: ubuntu-20.04 + strategy: + fail-fast: false + matrix: + include: + - python_version: 3.8 + pytorch_version: 1.6 + - python_version: 3.6 + pytorch_version: 1.4 + #- python_version: 3.7 + # pytorch_version: 1.8 # todo + steps: + - name: Checkout + uses: actions/checkout@v2 + + # for PT 1.3 and 1.4 we need to use CUDA 10.1 + - run: | + cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1) + echo "::set-output name=CUDA::$cuda" + channel=$(python -c "print('pytorch-nightly' if float(${{matrix.pytorch_version}}) > 1.7 else 'pytorch')" 2>&1) + echo "::set-output name=CHANNEL::$channel" + id: extend + + # https://github.com/docker/setup-buildx-action + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: 
docker/setup-buildx-action@v1 + - name: Build CUDA Docker + # publish master + uses: docker/build-push-action@v2 + with: + build-args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + file: dockers/base-conda/Dockerfile + push: false + timeout-minutes: 50 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 3289f5cbec..f652cbb1a4 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -9,14 +9,14 @@ on: # Trigger the workflow on push or pull request, but only for the master bra jobs: conda: - runs-on: ${{ matrix.os }} - container: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} + runs-on: ubuntu-20.04 + container: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }} strategy: fail-fast: false matrix: - os: [ubuntu-20.04] + # os: [ubuntu-20.04] python-version: [3.7] - pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7] + pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # todo # Timeout: https://stackoverflow.com/a/59076067/4521646 timeout-minutes: 35 diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index 1dc2b7c4e0..0ba6f701f6 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -8,7 +8,7 @@ on: types: [created] jobs: - build-Conda: + build-PL: runs-on: ubuntu-20.04 strategy: fail-fast: false @@ -36,7 +36,7 @@ jobs: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - dockerfile: dockers/conda/Dockerfile + dockerfile: dockers/release/Dockerfile 
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }} tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" timeout-minutes: 55 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index c91033b65f..eb10c43936 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -8,6 +8,7 @@ on: # based on https://github.com/pypa/gh-action-pypi-publish jobs: + pypi-release: runs-on: ubuntu-20.04 @@ -47,10 +48,8 @@ jobs: uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v1 - + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Login to DockerHub uses: docker/login-action@v1 with: @@ -78,37 +77,32 @@ jobs: matrix: python_version: [3.6, 3.7, 3.8] pytorch_version: [1.3, 1.4, 1.5, 1.6] # todo: , 1.7 - pytorch_channel: ["pytorch", "pytorch-nightly"] - # https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations exclude: - - pytorch_version: 1.7 - pytorch_channel: pytorch - - pytorch_version: 1.3 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.4 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.5 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.6 - pytorch_channel: pytorch-nightly - - pytorch_version: 1.3 - pytorch_channel: pytorch - python_version: 3.8 + # excludes PT 1.3 as it is missing on pypi + - python_version: 3.8 + pytorch_version: 1.3 + steps: - name: Checkout uses: actions/checkout@v2 # https://github.com/docker/setup-buildx-action - # to use cache-from and cache-to argument of buildx command - - name: Set up 
Docker Buildx - uses: docker/setup-buildx-action@v1 - + # Set up Docker Buildx - to use cache-from and cache-to argument of buildx command + - uses: docker/setup-buildx-action@v1 - name: Login to DockerHub uses: docker/login-action@v1 with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} + # for PT 1.3 and 1.4 we need to use CUDA 10.1 + - run: | + cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1) + echo "::set-output name=CUDA::$cuda" + channel=$(python -c "print('pytorch-nightly' if float(${{matrix.pytorch_version}}) > 1.7 else 'pytorch')" 2>&1) + echo "::set-output name=CHANNEL::$channel" + id: extend + - name: Publish CUDA to Docker Hub # publish master uses: docker/build-push-action@v2 @@ -116,7 +110,7 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - PYTORCH_CHANNEL=${{ matrix.pytorch_channel }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} cache-to: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} file: dockers/base-cuda/Dockerfile @@ -131,7 +125,11 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - file: dockers/conda/Dockerfile + PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }} + CUDA_VERSION=${{ steps.extend.outputs.CUDA }} + cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + cache-to: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + file: dockers/base-conda/Dockerfile push: true - tags: pytorchlightning/pytorch_lightning:nightly-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + tags: 
pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 55 diff --git a/README.md b/README.md index 54552367de..21f4aaab19 100644 --- a/README.md +++ b/README.md @@ -91,8 +91,8 @@ Lightning can automatically export to ONNX or TorchScript for those cases. | System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) | 1.7 (nightly) | | :---: | :---: | :---: | :---: | :---: | :---: | -| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | -| Linux py3.7 [GPUs**] | - | - |[![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | +| Conda py3.7 [linux] | [![PyTorch & 
Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - | +| Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - | | Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - | | Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | | OSX py3.6 / py3.7 | - | [![CI complete 
testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | diff --git a/dockers/README.md b/dockers/README.md index 7b3063e00f..73c40635eb 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -1,4 +1,6 @@ -## Builds +# Docker images + +## Build images from attached Dockerfiles You can build it on your own, note it takes lots of time, be prepared. @@ -31,4 +33,23 @@ and if you do not need it anymore, just clean it: ```bash docker image list docker image rm pytorch-lightning:latest -``` \ No newline at end of file +``` + +### Run docker image with GPUs + +To run docker image with access to your GPUs you need to install +```bash +# Add the package repositories +distribution=$(. /etc/os-release;echo $ID$VERSION_ID) +curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - +curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list + +sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit +sudo systemctl restart docker +``` + +and later run the docker image with `--gpus all` so for example + +``` +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 +``` diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile new file mode 100644 index 0000000000..6a7f03970c --- /dev/null +++ b/dockers/base-conda/Dockerfile @@ -0,0 +1,121 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Existing images: +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.8 --build-arg PYTORCH_CHANNEL=pytorch-nightly +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch + +ARG CUDNN_VERSION=8 +ARG CUDA_VERSION=10.2 + +# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 +# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 + +ARG PYTHON_VERSION=3.7 +ARG PYTORCH_VERSION=1.6 +ARG PYTORCH_CHANNEL=pytorch +ARG CONDA_VERSION=4.7.12 + +SHELL ["/bin/bash", "-c"] + +ENV PATH="$PATH:/root/.local/bin" + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential \ + cmake \ + git \ + curl \ + ca-certificates \ + && \ + +# Install conda and python. +# NOTE new Conda does not forward the exit status... 
https://github.com/conda/conda/issues/8385 + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \ + chmod +x ~/miniconda.sh && \ + ~/miniconda.sh -b && \ + rm ~/miniconda.sh && \ + +# Cleaning + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /root/.cache && \ + rm -rf /var/lib/apt/lists/* + +ENV PATH="/root/miniconda3/bin:$PATH" +ENV LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" +ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" + +ENV HOROVOD_GPU_OPERATIONS=NCCL +ENV HOROVOD_WITH_PYTORCH=1 +ENV HOROVOD_WITHOUT_TENSORFLOW=1 +ENV HOROVOD_WITHOUT_MXNET=1 +ENV HOROVOD_WITH_GLOO=1 +ENV HOROVOD_WITHOUT_MPI=1 +#ENV MAKEFLAGS="-j$(nproc)" +ENV MAKEFLAGS="-j1" +ENV TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5" + +ENV CONDA_ENV=lightning +COPY environment.yml environment.yml + +# conda init +RUN conda create -y --name $CONDA_ENV && \ + conda init bash && \ + # NOTE: this requires that the channel is presented in the yaml before packages + # replace channel to nightly if needed, fix PT version and remove Horovod as it will be installed later + python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ + python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \ + cat environment.yml && \ + conda env update --file environment.yml && \ + conda clean -ya && \ + rm environment.yml + +ENV PATH /root/miniconda3/envs/${CONDA_ENV}/bin:$PATH +ENV
LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" +# if you want this environment to be the default one, uncomment the following line: +ENV CONDA_DEFAULT_ENV=${CONDA_ENV} + +COPY ./requirements/extra.txt requirements-extra.txt +COPY ./requirements/test.txt requirements-test.txt + +RUN \ + # Disable cache + pip config set global.cache-dir false && \ + #echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ + #echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \ + #source ~/.bashrc && \ + # Install remaining requirements + pip install -r requirements-extra.txt --upgrade-strategy only-if-needed && \ + pip install -r requirements-test.txt --upgrade-strategy only-if-needed && \ + rm requirements* + +RUN \ + # install NVIDIA AMP + git clone https://github.com/NVIDIA/apex && \ + pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ + rm -rf apex + +RUN \ + # Show what we have + pip --version && \ + conda info && \ + pip list && \ + python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ + python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index ee892bccb2..e22b5a862a 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -13,106 +13,97 @@ # limitations under the License. 
# Existing images: -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg PYTORCH_CHANNEL=pytorch-nightly --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 -# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg CUDA_VERSION=10.2 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg CUDA_VERSION=10.2 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg CUDA_VERSION=10.2 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg CUDA_VERSION=10.1 +# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg CUDA_VERSION=10.1 -ARG CUDNN_VERSION=7 -ARG CUDA_VERSION=10.1 +ARG CUDNN_VERSION=8 +ARG CUDA_VERSION=10.2 # FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 -# FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu16.04 +FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04 # FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 -# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu16.04 ARG PYTHON_VERSION=3.7 ARG PYTORCH_VERSION=1.6 -ARG PYTORCH_CHANNEL=pytorch -ARG CONDA_VERSION=4.7.12 SHELL ["/bin/bash", "-c"] +# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Europe/Prague ENV PATH="$PATH:/root/.local/bin" +ENV 
CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" -RUN apt-get update && apt-get install -y --no-install-recommends \ +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ build-essential \ + pkg-config \ cmake \ git \ - curl \ + wget \ ca-certificates \ + software-properties-common \ && \ -# Install conda and python. -# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \ - chmod +x ~/miniconda.sh && \ - ~/miniconda.sh -b && \ - rm ~/miniconda.sh && \ + +# Install python + add-apt-repository ppa:deadsnakes/ppa && \ + apt-get install -y \ + python${PYTHON_VERSION} \ + python${PYTHON_VERSION}-distutils \ + python${PYTHON_VERSION}-dev \ + && \ + + update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \ + update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \ + # Cleaning apt-get autoremove -y && \ apt-get clean && \ rm -rf /root/.cache && \ rm -rf /var/lib/apt/lists/* -ENV PATH="/root/miniconda3/bin:$PATH" -ENV LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" -ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" - ENV HOROVOD_GPU_OPERATIONS=NCCL ENV HOROVOD_WITH_PYTORCH=1 ENV HOROVOD_WITHOUT_TENSORFLOW=1 ENV HOROVOD_WITHOUT_MXNET=1 ENV HOROVOD_WITH_GLOO=1 ENV HOROVOD_WITHOUT_MPI=1 +#ENV MAKEFLAGS="-j$(nproc)" +ENV MAKEFLAGS="-j1" +ENV TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5" -ENV CONDA_ENV=lightning -COPY environment.yml environment.yml +COPY ./requirements.txt requirements.txt +COPY ./requirements/ ./requirements/ # conda init -RUN conda create -y --name $CONDA_ENV "cudatoolkit=$CUDA_VERSION" && \ - conda init bash && \ - # NOTE: this requires that the channel is presented in the yaml before packages - # replace channel to nigtly if needed, fix PT version and remove Horovod as it will be installe later - 
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ - python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \ - python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \ - cat environment.yml && \ - conda env update --file environment.yml && \ - conda clean -ya && \ - rm environment.yml - -ENV PATH /root/miniconda3/envs/${CONDA_ENV}/bin:$PATH -ENV LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" -# if you want this environment to be the default one, uncomment the following line: -ENV CONDA_DEFAULT_ENV=${CONDA_ENV} - -COPY ./requirements/extra.txt requirements-extra.txt -COPY ./requirements/test.txt requirements-tests.txt - RUN \ + wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \ + python${PYTHON_VERSION} get-pip.py && \ + rm get-pip.py && \ + # Disable cache pip config set global.cache-dir false && \ - #echo ". 
${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - #echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \ - #source ~/.bashrc && \ + # eventually use pre-release + #pip install "torch==${PYTORCH_VERSION}.*" --pre && \ + # set particular PyTorch version + python -c "import re ; fname = 'requirements.txt' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch==${PYTORCH_VERSION}.*', open(fname).read()) ; open(fname, 'w').write(req)" && \ + + # Install all requirements + pip install -r requirements/devel.txt --upgrade-strategy only-if-needed --use-feature=2020-resolver && \ + rm -rf requirements* + +RUN \ # install NVIDIA AMP git clone https://github.com/NVIDIA/apex && \ - pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ - rm -rf apex && \ - # filter only Horovod - python -c "fname = 'requirements-extra.txt' ; req = open(fname).readlines() ; open(fname, 'w').writelines([l for l in req if 'horovod' in l])" && \ - # Install all requirements - MAKEFLAGS="-j$(nproc)" ; pip install -r requirements-extra.txt && \ - pip install -r requirements-tests.txt --upgrade-strategy only-if-needed && \ - rm requirements* + pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \ + rm -rf apex RUN \ # Show what we have pip --version && \ - conda info && \ pip list && \ python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \ python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__" \ No newline at end of file diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index ebad7f8d72..f44465383a 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -14,6 +14,8 @@ FROM google/cloud-sdk:slim +MAINTAINER PyTorchLightning + # CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . --build-arg PYTHON_VERSION=3.6 # This Dockerfile installs pytorch/xla 3.7 wheels.
There are also 3.6 wheels available; see below. ARG PYTHON_VERSION=3.7 @@ -21,6 +23,7 @@ ARG XLA_VERSION=1.6 SHELL ["/bin/bash", "-c"] +ARG CONDA_VERSION=4.7.12 # for skipping configurations ENV DEBIAN_FRONTEND=noninteractive ENV CONDA_ENV=lightning @@ -40,7 +43,7 @@ RUN apt-get update && \ && \ # Install conda and python. # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385 - curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \ + curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ rm ~/miniconda.sh && \ diff --git a/dockers/conda/Dockerfile b/dockers/release/Dockerfile similarity index 94% rename from dockers/conda/Dockerfile rename to dockers/release/Dockerfile index 17ad4f9c7e..886e794ccd 100644 --- a/dockers/conda/Dockerfile +++ b/dockers/release/Dockerfile @@ -17,6 +17,8 @@ ARG PYTORCH_VERSION=1.5 FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +MAINTAINER PyTorchLightning + ARG LIGHTNING_VERSION="" COPY ./ ./pytorch-lightning/ @@ -37,8 +39,6 @@ RUN \ RUN python --version && \ pip --version && \ pip list && \ - conda info && \ - conda list && \ python -c "import pytorch_lightning as pl; print(pl.__version__)" -CMD ["/bin/bash"] +# CMD ["/bin/bash"] diff --git a/dockers/tpu-tests/Dockerfile b/dockers/tpu-tests/Dockerfile index d0f7321d5f..4d5afa6f46 100644 --- a/dockers/tpu-tests/Dockerfile +++ b/dockers/tpu-tests/Dockerfile @@ -17,6 +17,8 @@ ARG PYTORCH_VERSION=1.6 FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +MAINTAINER PyTorchLightning + #SHELL ["/bin/bash", "-c"] COPY ./ ./pytorch-lightning/ diff --git a/pytorch_lightning/cluster_environments/cluster_environment.py b/pytorch_lightning/cluster_environments/cluster_environment.py index 316048d6b6..ff3436e662 100644 --- 
a/pytorch_lightning/cluster_environments/cluster_environment.py +++ b/pytorch_lightning/cluster_environments/cluster_environment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + class ClusterEnvironment: def __init__(self): diff --git a/pytorch_lightning/cluster_environments/slurm_environment.py b/pytorch_lightning/cluster_environments/slurm_environment.py index 48d1e85476..44cdc22078 100644 --- a/pytorch_lightning/cluster_environments/slurm_environment.py +++ b/pytorch_lightning/cluster_environments/slurm_environment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import os import re from pytorch_lightning import _logger as log diff --git a/pytorch_lightning/cluster_environments/torchelastic_environment.py b/pytorch_lightning/cluster_environments/torchelastic_environment.py index decdd0fd84..d50a10a782 100644 --- a/pytorch_lightning/cluster_environments/torchelastic_environment.py +++ b/pytorch_lightning/cluster_environments/torchelastic_environment.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import os from pytorch_lightning import _logger as log from pytorch_lightning.utilities import rank_zero_warn @@ -44,4 +45,4 @@ class TorchElasticEnvironment(ClusterEnvironment): return port def world_size(self): - return os.environ.get('WORLD_SIZE', None) + return os.environ.get('WORLD_SIZE') diff --git a/requirements/devel.txt b/requirements/devel.txt index 5d0262ec17..a8c5293c8c 100644 --- a/requirements/devel.txt +++ b/requirements/devel.txt @@ -4,7 +4,7 @@ # install all extra dependencies for full package testing -r ./extra.txt -# extended list of dependencies dor development and run lint and tests +# extended list of dependencies for development and run lint and tests -r ./test.txt # install all extra dependencies for running examples