stable, dev PyTorch in Dockerfile and conda gh actions (#3074)
* dockerfile and actions file * dockerfile and actions file * added pytorch conda cpu nightly * added pytorch conda cpu nightly * recopy base reqs * gh action `include` torch nightly * add pytorch nightly & conda gh badge * rebase * fix horovod * proposal refactor * Update .github/workflows/ci_pt-conda.yml Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * Update .github/workflows/ci_pt-conda.yml Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> * update * update * fix cmd * filled && * fix * add -y * torchvision >0.7 allowed * explicitly install torchvision * use HOROVOD_GPU_OPERATIONS env variable * CI * skip 1.7 * table Co-authored-by: Jirka Borovec <jirka@pytorchlightning.ai> Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
This commit is contained in:
parent
7b64472ced
commit
8be79a9a96
|
@ -23,7 +23,7 @@ codecov:
|
|||
strict_yaml_branch: "yaml-config"
|
||||
require_ci_to_pass: yes
|
||||
notify:
|
||||
after_n_builds: 22
|
||||
after_n_builds: 23
|
||||
wait_for_ci: yes
|
||||
# https://docs.codecov.io/docs/codecov-yaml#section-expired-reports
|
||||
max_report_age: off
|
||||
|
@ -64,4 +64,4 @@ comment:
|
|||
layout: header, diff
|
||||
require_changes: false
|
||||
behavior: default # update if exists else create new
|
||||
after_n_builds: 22
|
||||
after_n_builds: 23
|
||||
|
|
|
@ -49,23 +49,28 @@ jobs:
|
|||
push: false
|
||||
timeout-minutes: 40
|
||||
|
||||
# TODO: uncomment this with fixing CUDA docker, no need to increase mergify count
|
||||
# build-cuda:
|
||||
# runs-on: ubuntu-20.04
|
||||
# strategy:
|
||||
# fail-fast: false
|
||||
# matrix:
|
||||
# python_version: [3.7]
|
||||
# pytorch_version: [1.5]
|
||||
# steps:
|
||||
# - name: Checkout
|
||||
# uses: actions/checkout@v2
|
||||
#
|
||||
# - name: Publish Master to Docker
|
||||
# # publish master
|
||||
# uses: docker/build-push-action@v1.1.0
|
||||
# with:
|
||||
# dockerfile: dockers/base-cuda/Dockerfile
|
||||
# build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }}
|
||||
# push: false
|
||||
# timeout-minutes: 40
|
||||
build-cuda:
|
||||
runs-on: ubuntu-20.04
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
python_version: [3.7]
|
||||
pytorch_version: [1.6]
|
||||
pytorch_channel: [pytorch]
|
||||
# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
|
||||
include:
|
||||
- python_version: 3.7
|
||||
pytorch_version: 1.7
|
||||
pytorch_channel: pytorch-nightly
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Build Docker
|
||||
# publish master
|
||||
uses: docker/build-push-action@v1.1.0
|
||||
with:
|
||||
dockerfile: dockers/base-cuda/Dockerfile
|
||||
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
|
||||
push: false
|
||||
timeout-minutes: 40
|
||||
|
|
|
@ -16,21 +16,24 @@ jobs:
|
|||
matrix:
|
||||
os: [ubuntu-20.04]
|
||||
python-version: [3.7]
|
||||
# todo: add nightly versions
|
||||
pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7
|
||||
pytorch-version: [1.3, 1.4, 1.5, 1.6] # , 1.7 # TODO: fix failing test and add 1.7 (nightly) add badge
|
||||
|
||||
# Timeout: https://stackoverflow.com/a/59076067/4521646
|
||||
timeout-minutes: 35
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Setup pyTorch
|
||||
- name: Setup PyTorch nightly channel
|
||||
if: matrix.pytorch-version >= 1.7
|
||||
run: |
|
||||
# NOTE: this requires that the channel is listed in the YAML before the packages
|
||||
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', 'pytorch-nightly', 1) ; open(fname, 'w').write(req)"
|
||||
|
||||
- name: Setup PyTorch version
|
||||
run: |
|
||||
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('torch>=1.3', 'torch=${{ matrix.pytorch-version }}') ; open(fname, 'w').write(req)"
|
||||
cat environment.yml
|
||||
|
||||
# TODO: set source for nightly
|
||||
|
||||
- name: Cache conda
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
|
|
|
@ -83,7 +83,13 @@ jobs:
|
|||
fail-fast: false
|
||||
matrix:
|
||||
python_version: [3.7]
|
||||
pytorch_version: [1.3, 1.4, 1.5, 1.6.0]
|
||||
pytorch_version: [1.3, 1.4, 1.5, 1.6]
|
||||
pytorch_channel: [pytorch]
|
||||
# https://docs.github.com/en/actions/reference/workflow-syntax-for-github-actions#example-including-new-combinations
|
||||
include:
|
||||
- python_version: 3.7
|
||||
pytorch_version: 1.7
|
||||
pytorch_channel: pytorch-nightly
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v2
|
||||
|
@ -96,6 +102,6 @@ jobs:
|
|||
username: ${{ secrets.DOCKER_USERNAME }}
|
||||
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||
dockerfile: dockers/base-cuda/Dockerfile
|
||||
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }}
|
||||
build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},PYTORCH_CHANNEL=${{ matrix.pytorch_channel }}
|
||||
tags: "base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
|
||||
timeout-minutes: 40
|
||||
|
|
|
@ -24,7 +24,7 @@ pull_request_rules:
|
|||
# no requested changes from any reviewer
|
||||
- "#changes-requested-reviews-by=0"
|
||||
# this serves as an "all checks have to pass" rule, as we actually have around 40 tests in total
|
||||
- "#status-success>=44"
|
||||
- "#status-success>=47"
|
||||
# this is just in case since we rely on GPU tests (note: redundant with the above)
|
||||
- status-success=continuous-integration/drone/pr
|
||||
- "status-success=ci/circleci: TPU-tests"
|
||||
|
|
|
@ -72,12 +72,12 @@ Get started with our [3 steps guide](https://pytorch-lightning.readthedocs.io/en
|
|||
|
||||
| System / PyTorch ver. | 1.3 (min. req.)* | 1.4 | 1.5 | 1.6 (latest) |
|
||||
| :---: | :---: | :---: | :---: | :---: |
|
||||
| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
|
||||
| Conda py3.7 [linux] | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) | [![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22PyTorch+%26+Conda%22+branch%3Amaster) |
|
||||
| Linux py3.7 [GPUs**] | - | - | - | [![Build Status](http://35.192.60.23/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://35.192.60.23/PyTorchLightning/pytorch-lightning) |
|
||||
| Linux py3.7 [TPUs***] | - | - | - | [![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22TPU+tests%22+branch%3Amaster) |
|
||||
| Linux py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
|
||||
| OSX py3.6 / py3.7 | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
|
||||
| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22)
|
||||
| Windows py3.6 / py3.7 / py3.8 | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | - | [![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
|
||||
|
||||
- _\* `torch>=1.4` is the minimal pytorch version for Python 3.8_
|
||||
- _\** tests run on two NVIDIA K80_
|
||||
|
|
|
@ -1,59 +1,98 @@
|
|||
# Existing images:
|
||||
# --build-arg TORCH_VERSION=1.6.0 --build-arg CUDA_VERSION=10.1
|
||||
# --build-arg TORCH_VERSION=1.5 --build-arg CUDA_VERSION=10.1
|
||||
# --build-arg TORCH_VERSION=1.4 --build-arg CUDA_VERSION=10.1
|
||||
# --build-arg TORCH_VERSION=1.3 --build-arg CUDA_VERSION=10.1
|
||||
# --build-arg TORCH_VERSION=1.2 --build-arg CUDA_VERSION=10.0
|
||||
# --build-arg TORCH_VERSION=1.1.0 --build-arg CUDA_VERSION=10.0 --build-arg CUDNN_VERSION=7.5
|
||||
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.7 --build-arg PYTORCH_CHANNEL=pytorch-nightly --build-arg CUDA_VERSION=10.1
|
||||
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.6 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
|
||||
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.5 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
|
||||
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.4 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
|
||||
# --build-arg PYTHON_VERSION=3.7 --build-arg PYTORCH_VERSION=1.3 --build-arg PYTORCH_CHANNEL=pytorch --build-arg CUDA_VERSION=10.1
|
||||
|
||||
ARG TORCH_VERSION=1.6.0
|
||||
ARG CUDA_VERSION=10.1
|
||||
ARG CUDNN_VERSION=7
|
||||
ARG CUDA_VERSION=10.1
|
||||
|
||||
# TODO: make this image from pure Ubuntu + install all NVIDIA drivers
|
||||
# FROM nvidia/cuda:${CUDA_VERSION}-base
|
||||
FROM pytorch/pytorch:${TORCH_VERSION}-cuda${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel
|
||||
|
||||
ARG PYTHON_VERSION=3.7
|
||||
ARG PYTORCH_VERSION=1.6
|
||||
ARG PYTORCH_CHANNEL=pytorch
|
||||
ARG CONDA_VERSION=4.7.12
|
||||
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
|
||||
ENV HOROVOD_GPU_ALLREDUCE=NCCL
|
||||
ENV HOROVOD_GPU_BROADCAST=NCCL
|
||||
ENV HOROVOD_GPU_OPERATIONS=NCCL
|
||||
ENV HOROVOD_WITH_PYTORCH=1
|
||||
ENV HOROVOD_WITHOUT_TENSORFLOW=1
|
||||
ENV HOROVOD_WITHOUT_MXNET=1
|
||||
ENV HOROVOD_WITH_GLOO=1
|
||||
ENV HOROVOD_WITHOUT_MPI=1
|
||||
ENV PATH="$PATH:/root/.local/bin"
|
||||
ENV MAKEFLAGS="-j$(nproc)"
|
||||
# TODO: uncomment in horovod next release, https://github.com/horovod/horovod/pull/2239
|
||||
# ENV MAKEFLAGS="-j$(nproc)"
|
||||
|
||||
COPY ./tests/install_AMP.sh install_AMP.sh
|
||||
COPY ./requirements/base.txt requirements.txt
|
||||
COPY ./requirements/extra.txt requirements-extra.txt
|
||||
COPY ./requirements/test.txt requirements-tests.txt
|
||||
COPY ./requirements/examples.txt requirements-examples.txt
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y \
|
||||
git \
|
||||
cmake \
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
cmake \
|
||||
git \
|
||||
curl \
|
||||
ca-certificates \
|
||||
&& \
|
||||
|
||||
# Install AMP
|
||||
bash install_AMP.sh && \
|
||||
# Install all requirements
|
||||
pip install -r requirements.txt && \
|
||||
# HOROVOD_BUILD_ARCH_FLAGS="-mfma" && \
|
||||
pip install -r requirements-extra.txt && \
|
||||
pip install -r requirements-examples.txt && \
|
||||
#pip install -r requirements-tests.txt && \
|
||||
rm install_AMP.sh && \
|
||||
rm requirements* && \
|
||||
|
||||
# Cleaning
|
||||
# Cleaning
|
||||
apt-get autoremove -y && \
|
||||
apt-get clean && \
|
||||
rm -rf /root/.cache && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Show what we have
|
||||
# add non-root user
|
||||
RUN useradd --create-home --shell /bin/bash flash
|
||||
|
||||
USER flash
|
||||
ENV CONDA_ENV=lightning
|
||||
ENV WORKDIR=/home/flash
|
||||
WORKDIR $WORKDIR
|
||||
|
||||
COPY --chown=flash environment.yml environment.yml
|
||||
|
||||
# install conda and python
|
||||
RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_VERSION}-Linux-x86_64.sh && \
|
||||
chmod +x ~/miniconda.sh && \
|
||||
~/miniconda.sh -b -p ${WORKDIR}/miniconda && \
|
||||
rm ~/miniconda.sh
|
||||
|
||||
# add conda to path
|
||||
ENV PATH="${WORKDIR}/miniconda/bin:$PATH"
|
||||
ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/lib:$LD_LIBRARY_PATH"
|
||||
ENV CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda"
|
||||
|
||||
# conda init
|
||||
RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION pytorch=$PYTORCH_VERSION torchvision cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \
|
||||
conda init bash && \
|
||||
# NOTE: this requires that the channel is listed in the YAML before the packages
|
||||
python -c "fname = 'environment.yml' ; req = open(fname).read().replace('pytorch', '${PYTORCH_CHANNEL}', 1) ; open(fname, 'w').write(req)" && \
|
||||
conda env update --file environment.yml && \
|
||||
conda clean -ya && \
|
||||
rm environment.yml && \
|
||||
# Disable cache
|
||||
conda install "pip>20.1" -y && \
|
||||
pip config set global.cache-dir false
|
||||
|
||||
ENV PATH ${WORKDIR}/miniconda/envs/${CONDA_ENV}/bin:$PATH
|
||||
ENV LD_LIBRARY_PATH="${WORKDIR}/miniconda/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH"
|
||||
# if you want this environment to be the default one, uncomment the following line:
|
||||
ENV CONDA_DEFAULT_ENV=${CONDA_ENV}
|
||||
|
||||
COPY ./requirements/test.txt requirements-tests.txt
|
||||
COPY ./requirements/examples.txt requirements-examples.txt
|
||||
|
||||
RUN \
|
||||
echo ". ${WORKDIR}/miniconda/etc/profile.d/conda.sh" >> ~/.bashrc && \
|
||||
echo "conda activate ${CONDA_ENV}" >> ~/.bashrc && \
|
||||
source ~/.bashrc && \
|
||||
# Install all requirements
|
||||
pip install -r requirements-tests.txt --upgrade-strategy only-if-needed && \
|
||||
pip install -r requirements-examples.txt --upgrade-strategy only-if-needed && \
|
||||
rm requirements* && \
|
||||
# Show what we have
|
||||
pip --version && \
|
||||
conda info && \
|
||||
conda list && \
|
||||
pip list
|
||||
|
||||
CMD ["bin/bash"]
|
||||
|
|
|
@ -4,6 +4,7 @@ FROM nvidia/cuda:${CUDA_VERSION}-devel
|
|||
# install versions
|
||||
ARG PYTHON_VERSION=3.7
|
||||
ARG PYTORCH_VERSION=1.4
|
||||
ARG PYTORCH_CHANNEL=pytorch
|
||||
ARG LIGHTNING_VERSION=""
|
||||
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
|
||||
ARG CONDA_VERSION=4.7.12
|
||||
|
@ -47,7 +48,7 @@ COPY --chown=flash environment.yml environment.yml
|
|||
RUN conda create -y --name $CONDA_ENV python=$PYTHON_VERSION && \
|
||||
conda init bash && \
|
||||
# conda install -y python=$PYTHON_VERSION && \
|
||||
conda install pytorch=$PYTORCH_VERSION cudatoolkit=$CUDA_VERSION --channel=pytorch && \
|
||||
conda install pytorch=$PYTORCH_VERSION cudatoolkit=$CUDA_VERSION --channel=$PYTORCH_CHANNEL && \
|
||||
conda env update --file environment.yml && \
|
||||
rm environment.yml && \
|
||||
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
torchvision>=0.4.0, <0.7
|
||||
torchvision>=0.4.0
|
||||
gym>=0.17.0
|
Loading…
Reference in New Issue