Drone: use nightly build cuda docker images (#3658)

* upgrade PT version

* update docker

* docker

* try 1.5

* badge

* fix typo: dor -> for (#3918)

* prune

* prune

* env

* echo

* try

* notes

* env

* env

* env

* notes

* docker

* prune

* maintainer

* CI

* update

* just 1.5

* CI

* CI

* CI

* CI

* CI

* CI

* CI

* CI

* CI

* CI

* CI

* docker

* CI

* CI

* CI

* CI

* CI

* CI

* CI

* CI

* CI

* push

* try

* prune

* CI

* CI

* CI

* CI

Co-authored-by: Klyukin Valeriy <mr.clyukin@gmail.com>
Co-authored-by: Jeff Yang <ydcjeff@outlook.com>
This commit is contained in:
Jirka Borovec 2020-10-26 11:47:09 +01:00 committed by GitHub
parent 98205fb438
commit ce8abd6255
16 changed files with 304 additions and 155 deletions

View File

@ -20,44 +20,21 @@ name: torch-GPU
steps:
- name: testing
image: pytorchlightning/pytorch_lightning:cuda-extras-py3.7-torch1.5
image: pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.5
environment:
SLURM_LOCALID: 0
CODECOV_TOKEN:
from_secret: codecov_token
MKL_THREADING_LAYER: GNU
HOROVOD_GPU_OPERATIONS: NCCL
HOROVOD_WITH_PYTORCH: 1
HOROVOD_WITHOUT_TENSORFLOW: 1
HOROVOD_WITHOUT_MXNET: 1
HOROVOD_WITH_GLOO: 1
HOROVOD_WITHOUT_MPI: 1
#volumes:
# # Mount pip cache from host
# - name: pip_cache
# path: /opt/conda/lib/python3.7/site-packages
commands:
# todo: remove these unsets once the correct image sets Horovod properly
- unset HOROVOD_GPU_ALLREDUCE
- unset HOROVOD_GPU_BROADCAST
- export PATH="$PATH:/root/.local/bin"
- python --version
- pip install pip -U
- pip --version
- nvidia-smi
#- bash ./requirements/install_AMP.sh
- apt-get update && apt-get install -y cmake
- pip uninstall -y horovod # todo: this should not be needed
- pip install -r ./requirements/devel.txt --user -q --upgrade-strategy only-if-needed --no-cache-dir
#- pip install -r ./requirements/docs.txt --user -q
- pip install -r ./requirements/examples.txt --user -q --upgrade-strategy only-if-needed
- pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed -v --no-cache-dir
- pip list
- python -c "import torch ; print(' & '.join([torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]) if torch.cuda.is_available() else 'only CPU')"
- coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v --color=yes --durations=25 # --flake8
- python -m py.test benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8
- python -m pytest benchmarks pl_examples -v --color=yes --maxfail=2 --durations=0 # --flake8
#- cd docs; make doctest; make coverage
- coverage report
# see: https://docs.codecov.io/docs/merging-reports
@ -73,8 +50,3 @@ trigger:
include:
- push
- pull_request
#volumes:
# - name: pip_cache
# host:
# path: /tmp/cache/drone/pip
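To debug failures from this pipeline outside Drone, the steps above can be approximated locally. A minimal sketch, assuming the repository is checked out in the current directory and the base-cuda image from this diff is available (the image does not ship the sources), with NVIDIA drivers and the container toolkit installed on the host:
```bash
# rough local equivalent of the Drone "testing" step; flags and paths mirror the config above
docker run --rm -it --gpus all -v "$(pwd)":/workspace -w /workspace \
  pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.5 \
  bash -c "pip install -r ./requirements/devel.txt --upgrade-strategy only-if-needed --no-cache-dir && \
           python -m pytest pytorch_lightning tests benchmarks pl_examples -v"
```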

View File

@ -9,7 +9,7 @@ on: # Trigger the workflow on push or pull request, but only for the master bra
branches: [master]
jobs:
build-Conda:
build-PL:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
@ -21,18 +21,16 @@ jobs:
uses: actions/checkout@v2
# https://github.com/docker/setup-buildx-action
# to use cache-from and cache-to argument of buildx command
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Build Conda Docker
# Set up Docker Buildx - to use cache-from and cache-to argument of buildx command
- uses: docker/setup-buildx-action@v1
- name: Build PL Docker
# publish master
uses: docker/build-push-action@v2
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
file: dockers/conda/Dockerfile
file: dockers/release/Dockerfile
push: false
timeout-minutes: 50
@ -48,10 +46,8 @@ jobs:
uses: actions/checkout@v2
# https://github.com/docker/setup-buildx-action
# to use cache-from and cache-to argument of buildx command
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
# Set up Docker Buildx - to use cache-from and cache-to argument of buildx command
- uses: docker/setup-buildx-action@v1
- name: Build XLA Docker
# publish master
uses: docker/build-push-action@v2
@ -70,24 +66,25 @@ jobs:
fail-fast: false
matrix:
include:
#- python_version: 3.7
# pytorch_version: 1.8 # todo
# pytorch_channel: pytorch-nightly
- python_version: 3.8
#- python_version: 3.8
# pytorch_version: 1.7 # todo
- python_version: 3.7
pytorch_version: 1.6
pytorch_channel: pytorch
- python_version: 3.6
pytorch_version: 1.5
pytorch_channel: pytorch
pytorch_version: 1.3
steps:
- name: Checkout
uses: actions/checkout@v2
# https://github.com/docker/setup-buildx-action
# to use cache-from and cache-to argument of buildx command
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
# for PT 1.3 and 1.4 we need to use CUDA 10.1
- run: |
cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1)
echo "::set-output name=CUDA::$cuda"
id: extend
# https://github.com/docker/setup-buildx-action
# Set up Docker Buildx - to use cache-from and cache-to argument of buildx command
- uses: docker/setup-buildx-action@v1
- name: Build CUDA Docker
# publish master
uses: docker/build-push-action@v2
@ -95,8 +92,49 @@ jobs:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
CUDA_VERSION=${{ steps.extend.outputs.CUDA }}
cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
file: dockers/base-cuda/Dockerfile
push: false
timeout-minutes: 50
build-conda:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
include:
- python_version: 3.8
pytorch_version: 1.6
- python_version: 3.6
pytorch_version: 1.4
#- python_version: 3.7
# pytorch_version: 1.8 # todo
steps:
- name: Checkout
uses: actions/checkout@v2
# for PT 1.3 and 1.4 we need to use CUDA 10.1
- run: |
cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1)
echo "::set-output name=CUDA::$cuda"
channel=$(python -c "print('pytorch-nightly' if float(${{matrix.pytorch_version}}) > 1.7 else 'pytorch')" 2>&1)
echo "::set-output name=CHANNEL::$channel"
id: extend
# https://github.com/docker/setup-buildx-action
# Set up Docker Buildx - to use cache-from and cache-to argument of buildx command
- uses: docker/setup-buildx-action@v1
- name: Build CUDA Docker
# publish master
uses: docker/build-push-action@v2
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }}
CUDA_VERSION=${{ steps.extend.outputs.CUDA }}
cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
file: dockers/base-conda/Dockerfile
push: false
timeout-minutes: 50
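The `extend` step above derives the extra build args with inline Python. A small, purely illustrative sketch of that selection logic, runnable locally to sanity-check which CUDA version and conda channel a given PyTorch version maps to (the loop itself is not part of the workflow):
```bash
# mirrors the python -c expressions used in the "extend" step
for pt in 1.3 1.4 1.5 1.6 1.8; do
  cuda=$(python -c "print(10.2 if float($pt) > 1.4 else 10.1)")
  channel=$(python -c "print('pytorch-nightly' if float($pt) > 1.7 else 'pytorch')")
  echo "torch $pt -> CUDA $cuda, channel $channel"
done
```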

View File

@ -9,14 +9,14 @@ on: # Trigger the workflow on push or pull request, but only for the master bra
jobs:
conda:
runs-on: ${{ matrix.os }}
container: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}
runs-on: ubuntu-20.04
container: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python-version }}-torch${{ matrix.pytorch-version }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-20.04]
# os: [ubuntu-20.04]
python-version: [3.7]
pytorch-version: [1.3, 1.4, 1.5, 1.6, 1.7]
pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # todo
# Timeout: https://stackoverflow.com/a/59076067/4521646
timeout-minutes: 35

View File

@ -8,7 +8,7 @@ on:
types: [created]
jobs:
build-Conda:
build-PL:
runs-on: ubuntu-20.04
strategy:
fail-fast: false
@ -36,7 +36,7 @@ jobs:
repository: pytorchlightning/pytorch_lightning
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
dockerfile: dockers/conda/Dockerfile
dockerfile: dockers/release/Dockerfile
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ env.RELEASE_VERSION }}
tags: "${{ env.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
timeout-minutes: 55
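The published images follow the tag pattern in the `tags:` line above. A hedged pull example, using a hypothetical python/torch combination rather than a confirmed matrix entry:
```bash
# "latest-py*-torch*" tags are pushed alongside the versioned "${RELEASE_VERSION}-..." tags
docker pull pytorchlightning/pytorch_lightning:latest-py3.7-torch1.6
```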

View File

@ -8,6 +8,7 @@ on:
# based on https://github.com/pypa/gh-action-pypi-publish
jobs:
pypi-release:
runs-on: ubuntu-20.04
@ -47,10 +48,8 @@ jobs:
uses: actions/checkout@v2
# https://github.com/docker/setup-buildx-action
# to use cache-from and cache-to argument of buildx command
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
# Set up Docker Buildx - to use cache-from and cache-to argument of buildx command
- uses: docker/setup-buildx-action@v1
- name: Login to DockerHub
uses: docker/login-action@v1
with:
@ -78,37 +77,32 @@ jobs:
matrix:
python_version: [3.6, 3.7, 3.8]
pytorch_version: [1.3, 1.4, 1.5, 1.6] # todo: , 1.7
pytorch_channel: ["pytorch", "pytorch-nightly"]
# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
exclude:
- pytorch_version: 1.7
pytorch_channel: pytorch
- pytorch_version: 1.3
pytorch_channel: pytorch-nightly
- pytorch_version: 1.4
pytorch_channel: pytorch-nightly
- pytorch_version: 1.5
pytorch_channel: pytorch-nightly
- pytorch_version: 1.6
pytorch_channel: pytorch-nightly
- pytorch_version: 1.3
pytorch_channel: pytorch
python_version: 3.8
# excludes PT 1.3 as it is missing on pypi
- python_version: 3.8
pytorch_version: 1.3
steps:
- name: Checkout
uses: actions/checkout@v2
# https://github.com/docker/setup-buildx-action
# to use cache-from and cache-to argument of buildx command
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
# Set up Docker Buildx - to use cache-from and cache-to argument of buildx command
- uses: docker/setup-buildx-action@v1
- name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
# for PT 1.3 and 1.4 we need to use CUDA 10.1
- run: |
cuda=$(python -c "print(10.2 if float(${{matrix.pytorch_version}}) > 1.4 else 10.1)" 2>&1)
echo "::set-output name=CUDA::$cuda"
channel=$(python -c "print('pytorch-nightly' if float(${{matrix.pytorch_version}}) > 1.7 else 'pytorch')" 2>&1)
echo "::set-output name=CHANNEL::$channel"
id: extend
- name: Publish CUDA to Docker Hub
# publish master
uses: docker/build-push-action@v2
@ -116,7 +110,7 @@ jobs:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
CUDA_VERSION=${{ steps.extend.outputs.CUDA }}
cache-from: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
cache-to: pytorchlightning/pytorch_lightning:base-cuda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
file: dockers/base-cuda/Dockerfile
@ -131,7 +125,11 @@ jobs:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
file: dockers/conda/Dockerfile
PYTORCH_CHANNEL=${{ steps.extend.outputs.CHANNEL }}
CUDA_VERSION=${{ steps.extend.outputs.CUDA }}
cache-from: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
cache-to: pytorchlightning/pytorch_lightning:base-conda-cache-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
file: dockers/base-conda/Dockerfile
push: true
tags: pytorchlightning/pytorch_lightning:nightly-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
timeout-minutes: 55

View File

@ -91,8 +91,8 @@ Lightning can automatically export to ONNX or TorchScript for those cases.
| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) | 1.7 (nightly) |
| :---: | :---: | :---: | :---: | :---: | :---: |
| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
| Linux py3.7 [GPUs**] | - | - |[![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - |
| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | - |
| Linux py3.7 [GPUs**] | - | - | [![Build Status](http://104.154.220.231/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://104.154.220.231/PyTorchLightning/pytorch-lightning) | - | - |
| Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) | - |
| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |
| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |

View File

@ -1,4 +1,6 @@
## Builds
# Docker images
## Builds images from attached Dockerfiles
You can build them on your own; note that it takes a lot of time, so be prepared.
@ -31,4 +33,23 @@ and if you do not need it anymore, just clean it:
```bash
docker image list
docker image rm pytorch-lightning:latest
```
```
### Run docker image with GPUs
To run the docker image with access to your GPUs, you need to install the NVIDIA Container Toolkit:
```bash
# Add the package repositories
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
```
and then run the docker image with `--gpus all`, for example:
```bash
docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6
```
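A quick way to confirm the toolkit is wired up correctly is to check that PyTorch inside the container sees the GPUs; a minimal sketch, assuming the base-cuda image tag above:
```bash
docker run --rm --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.6 \
  python -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"
```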

View File

@ -0,0 +1,121 @@
# Copyright The PyTorch Lightning team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Existing images:
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.8 --build-arg PYTORCH_CHANNEL=pytorch-nightly
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch
ARG CUDNN_VERSION=8
ARG CUDA_VERSION=10.2
# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04
# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04
ARG PYTHON_VERSION=3.7
ARG PYTORCH_VERSION=1.6
ARG PYTORCH_CHANNEL=pytorch
ARG CONDA_VERSION=4.7.12
SHELL ["/bin/bash", "-c"]
ENV PATH="$PATH:/root/.local/bin"
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
ca-certificates \
&& \
# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh && \
# Cleaning
apt-get autoremove -y && \
apt-get clean && \
rm -rf /root/.cache && \
rm -rf /var/lib/apt/lists/*
ENV PATH="/root/miniconda3/bin:$PATH"
ENV LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH"
ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda"
ENV HOROVOD_GPU_OPERATIONS=NCCL
ENV HOROVOD_WITH_PYTORCH=1
ENV HOROVOD_WITHOUT_TENSORFLOW=1
ENV HOROVOD_WITHOUT_MXNET=1
ENV HOROVOD_WITH_GLOO=1
ENV HOROVOD_WITHOUT_MPI=1
#ENV MAKEFLAGS="-j$(nproc)"
ENV MAKEFLAGS="-j1"
ENV TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5"
ENV CONDA_ENV=lightning
COPY environment.yml environment.yml
# conda init
RUN conda create -y --name $CONDA_ENV && \
conda init bash && \
# NOTE: this requires that the channel appears in the yaml before the packages
# replace the channel with nightly if needed, pin the PT version, and remove Horovod as it will be installed later
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \
python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \
python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \
python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \
cat environment.yml && \
conda env update --file environment.yml && \
conda clean -ya && \
rm environment.yml
ENV PATH /root/miniconda3/envs/${CONDA_ENV}/bin:$PATH
ENV LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH"
# if you want this environment to be the default one, uncomment the following line:
ENV CONDA_DEFAULT_ENV=${CONDA_ENV}
COPY ./requirements/extra.txt requirements-extra.txt
COPY ./requirements/test.txt requirements-test.txt
RUN \
# Disable cache
pip config set global.cache-dir false && \
#echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
#echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \
#source ~/.bashrc && \
# Install remaining requirements
pip install -r requirements-extra.txt --upgrade-strategy only-if-needed && \
pip install -r requirements-test.txt --upgrade-strategy only-if-needed && \
rm requirements*
RUN \
# install NVIDIA AMP
git clone https://github.com/NVIDIA/apex && \
pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \
rm -rf apex
RUN \
# Show what we have
pip --version && \
conda info && \
pip list && \
python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \
python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__"

View File

@ -13,106 +13,97 @@
# limitations under the License.
# Existing images:
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg PYTORCH_CHANNEL=pytorch-nightly --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg CUDA_VERSION=10.2
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg CUDA_VERSION=10.2
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg CUDA_VERSION=10.2
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg CUDA_VERSION=10.1
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg CUDA_VERSION=10.1
ARG CUDNN_VERSION=7
ARG CUDA_VERSION=10.1
ARG CUDNN_VERSION=8
ARG CUDA_VERSION=10.2
# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04
FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu16.04
FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu18.04
# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04
# FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu16.04
ARG PYTHON_VERSION=3.7
ARG PYTORCH_VERSION=1.6
ARG PYTORCH_CHANNEL=pytorch
ARG CONDA_VERSION=4.7.12
SHELL ["/bin/bash", "-c"]
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
ENV DEBIAN_FRONTEND=noninteractive
ENV TZ=Europe/Prague
ENV PATH="$PATH:/root/.local/bin"
ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda"
RUN apt-get update && apt-get install -y --no-install-recommends \
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
pkg-config \
cmake \
git \
curl \
wget \
ca-certificates \
software-properties-common \
&& \
# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh && \
# Install python
add-apt-repository ppa:deadsnakes/ppa && \
apt-get install -y \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-distutils \
python${PYTHON_VERSION}-dev \
&& \
update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \
# Cleaning
apt-get autoremove -y && \
apt-get clean && \
rm -rf /root/.cache && \
rm -rf /var/lib/apt/lists/*
ENV PATH="/root/miniconda3/bin:$PATH"
ENV LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH"
ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda"
ENV HOROVOD_GPU_OPERATIONS=NCCL
ENV HOROVOD_WITH_PYTORCH=1
ENV HOROVOD_WITHOUT_TENSORFLOW=1
ENV HOROVOD_WITHOUT_MXNET=1
ENV HOROVOD_WITH_GLOO=1
ENV HOROVOD_WITHOUT_MPI=1
#ENV MAKEFLAGS="-j$(nproc)"
ENV MAKEFLAGS="-j1"
ENV TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5"
ENV CONDA_ENV=lightning
COPY environment.yml environment.yml
COPY ./requirements.txt requirements.txt
COPY ./requirements/ ./requirements/
# conda init
RUN conda create -y --name $CONDA_ENV "cudatoolkit=$CUDA_VERSION" && \
conda init bash && \
# NOTE: this requires that the channel appears in the yaml before the packages
# replace the channel with nightly if needed, pin the PT version, and remove Horovod as it will be installed later
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \
python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'python[>=]+[\d\.]+', 'python=${PYTHON_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \
python -c "import re ; fname = 'environment.yml' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch=${PYTORCH_VERSION}', open(fname).read()) ; open(fname, 'w').write(req)" && \
python -c "fname = 'environment.yml' ; req = open(fname).readlines() ; open(fname, 'w').writelines([ln for ln in req if 'horovod' not in ln])" && \
cat environment.yml && \
conda env update --file environment.yml && \
conda clean -ya && \
rm environment.yml
ENV PATH /root/miniconda3/envs/${CONDA_ENV}/bin:$PATH
ENV LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH"
# if you want this environment to be the default one, uncomment the following line:
ENV CONDA_DEFAULT_ENV=${CONDA_ENV}
COPY ./requirements/extra.txt requirements-extra.txt
COPY ./requirements/test.txt requirements-tests.txt
RUN \
wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll --no-check-certificate && \
python${PYTHON_VERSION} get-pip.py && \
rm get-pip.py && \
# Disable cache
pip config set global.cache-dir false && \
#echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
#echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \
#source ~/.bashrc && \
# eventually use a pre-release
#pip install "torch==${PYTORCH_VERSION}.*" --pre && \
# set particular PyTorch version
python -c "import re ; fname = 'requirements.txt' ; req = re.sub(r'torch[>=]+[\d\.]+', 'torch==${PYTORCH_VERSION}.*', open(fname).read()) ; open(fname, 'w').write(req)" && \
# Install all requirements
pip install -r requirements/devel.txt --upgrade-strategy only-if-needed --use-feature=2020-resolver && \
rm -rf requirements*
RUN \
# install NVIDIA AMP
git clone https://github.com/NVIDIA/apex && \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \
rm -rf apex && \
# filter only Horovod
python -c "fname = 'requirements-extra.txt' ; req = open(fname).readlines() ; open(fname, 'w').writelines([l for l in req if 'horovod' in l])" && \
# Install all requirements
MAKEFLAGS="-j$(nproc)" ; pip install -r requirements-extra.txt && \
pip install -r requirements-tests.txt --upgrade-strategy only-if-needed && \
rm requirements*
pip install --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex && \
rm -rf apex
RUN \
# Show what we have
pip --version && \
conda info && \
pip list && \
python -c "import sys; assert sys.version[:3] == '$PYTHON_VERSION', sys.version" && \
python -c "import torch; assert torch.__version__[:3] == '$PYTORCH_VERSION', torch.__version__"

View File

@ -14,6 +14,8 @@
FROM google/cloud-sdk:slim
MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>
# CALL: docker image build -t pytorch-lightning:XLA-extras-py3.6 -f dockers/base-xla/Dockerfile . --build-arg PYTHON_VERSION=3.6
# This Dockerfile installs pytorch/xla 3.7 wheels. There are also 3.6 wheels available; see below.
ARG PYTHON_VERSION=3.7
@ -21,6 +23,7 @@ ARG XLA_VERSION=1.6
SHELL ["/bin/bash", "-c"]
ARG CONDA_VERSION=4.7.12
# for skipping configurations
ENV DEBIAN_FRONTEND=noninteractive
ENV CONDA_ENV=lightning
@ -40,7 +43,7 @@ RUN apt-get update && \
&& \
# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh && \

View File

@ -17,6 +17,8 @@ ARG PYTORCH_VERSION=1.5
FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}
MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>
ARG LIGHTNING_VERSION=""
COPY ./ ./pytorch-lightning/
@ -37,8 +39,6 @@ RUN \
RUN python --version && \
pip --version && \
pip list && \
conda info && \
conda list && \
python -c "import pytorch_lightning as pl; print(pl.__version__)"
CMD ["/bin/bash"]
# CMD ["/bin/bash"]

View File

@ -17,6 +17,8 @@ ARG PYTORCH_VERSION=1.6
FROM pytorchlightning/pytorch_lightning:base-xla-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}
MAINTAINER PyTorchLightning <https://github.com/PyTorchLightning>
#SHELL ["/bin/bash", "-c"]
COPY ./ ./pytorch-lightning/

View File

@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class ClusterEnvironment:
def __init__(self):

View File

@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
from pytorch_lightning import _logger as log

View File

@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from pytorch_lightning import _logger as log
from pytorch_lightning.utilities import rank_zero_warn
@ -44,4 +45,4 @@ class TorchElasticEnvironment(ClusterEnvironment):
return port
def world_size(self):
return os.environ.get('WORLD_SIZE', None)
return os.environ.get('WORLD_SIZE')
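The dropped `None` argument is redundant: `os.environ.get`, like any `dict.get`, already returns `None` when the key is absent, so the behaviour is unchanged. A quick check:
```bash
# prints None when WORLD_SIZE is not set, with or without an explicit default
python -c "import os; os.environ.pop('WORLD_SIZE', None); print(os.environ.get('WORLD_SIZE'))"
```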

View File

@ -4,7 +4,7 @@
# install all extra dependencies for full package testing
-r ./extra.txt
# extended list of dependencies dor development and run lint and tests
# extended list of dependencies for development and for running lint and tests
-r ./test.txt
# install all extra dependencies for running examples