2020-10-02 09:26:21 +00:00
|
|
|
# Copyright The PyTorch Lightning team.
|
|
|
|
#
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
|
2022-04-21 22:10:42 +00:00
|
|
|
# Selectors for the base image tag; both can be overridden with --build-arg.
ARG UBUNTU_VERSION=20.04
ARG CUDA_VERSION=11.3.1

# TODO: drop the UBUNTU_VERSION arg and always use ubuntu20.04 once CUDA 10.2 is no longer needed.
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
2020-09-17 18:30:39 +00:00
|
|
|
|
2021-11-04 17:26:24 +00:00
|
|
|
# Run every shell-form RUN instruction through bash instead of the default /bin/sh;
# later steps use bash-only syntax such as [[ ... ]] and ${var//pattern/replacement}.
SHELL ["/bin/bash", "-c"]

# Interpreter and framework versions this image is built for (override with --build-arg).
ARG PYTORCH_VERSION=1.8
ARG PYTHON_VERSION=3.9
|
2020-10-26 10:47:09 +00:00
|
|
|
# Non-interactive apt front end plus a fixed time zone, so tzdata does not prompt during the build:
# https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/
# The remaining variables configure the CUDA / PyTorch extension builds performed in later steps.
ENV \
    DEBIAN_FRONTEND=noninteractive \
    TZ=Europe/Prague \
    PATH="$PATH:/root/.local/bin" \
    CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \
    TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5;8.0" \
    MKL_THREADING_LAYER=GNU \
    # Parallelism for make is fixed at two jobs; the nproc-based variant is kept for reference only.
    # MAKEFLAGS="-j$(nproc)"
    MAKEFLAGS="-j2"
|
2020-07-31 12:23:13 +00:00
|
|
|
|
2022-05-02 07:00:44 +00:00
|
|
|
# System packages, the requested Python interpreter, and apt cleanup, all in a single layer.
RUN \
    # TODO: Remove the manual key installation once the base image is updated.
    # https://github.com/NVIDIA/nvidia-docker/issues/1631
    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub && \
    apt-get update -qq --fix-missing && \
    # Build tooling, download utilities and OpenMPI (package list kept in alphabetical order).
    apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        git \
        libopenmpi-dev \
        openmpi-bin \
        pkg-config \
        software-properties-common \
        ssh \
        unzip \
        wget \
    && \
    # Install python from the deadsnakes PPA
    add-apt-repository ppa:deadsnakes/ppa && \
    apt-get install -y \
        python${PYTHON_VERSION} \
        python${PYTHON_VERSION}-dev \
        python${PYTHON_VERSION}-distutils \
    && \
    # Register the interpreter both as `python<major>` and as plain `python`.
    update-alternatives --install /usr/bin/python${PYTHON_VERSION%%.*} python${PYTHON_VERSION%%.*} /usr/bin/python${PYTHON_VERSION} 1 && \
    update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1 && \
    # Cleaning
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /root/.cache /var/lib/apt/lists/*
|
|
|
|
|
2020-10-26 10:47:09 +00:00
|
|
|
# Module search path for the interpreter selected through PYTHON_VERSION.
ENV PYTHONPATH=/usr/lib/python${PYTHON_VERSION}/site-packages

# Inputs for the dependency installation steps below; the RUN instructions that consume them
# delete them again once they are no longer needed.
COPY ./.actions/assistant.py assistant.py
COPY ./requirements/ ./requirements/
COPY ./requirements.txt requirements.txt
|
|
|
|
|
2020-09-17 18:30:39 +00:00
|
|
|
# Bootstrap pip, pin the requirement files to the requested PyTorch version and install them.
RUN \
    # Fetch the pip bootstrap script with TLS certificate verification enabled: the script is
    # executed right away, and the CA bundle is provided by the ca-certificates package
    # installed in the apt step.
    wget https://bootstrap.pypa.io/get-pip.py --progress=bar:force:noscroll && \
    python${PYTHON_VERSION} get-pip.py && \
    rm get-pip.py && \
    pip install -q fire && \
    # CUDA "major.minor" without the dot (e.g. 11.3.1 -> 113), used in the wheel index URL below.
    CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
    # Disable cache
    pip config set global.cache-dir false && \
    # set particular PyTorch version
    python ./requirements/adjust-versions.py requirements.txt ${PYTORCH_VERSION} && \
    python ./requirements/adjust-versions.py requirements/extra.txt ${PYTORCH_VERSION} && \
    python ./requirements/adjust-versions.py requirements/examples.txt ${PYTORCH_VERSION} && \
    # Install all requirements
    pip install -r requirements/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \
    # The requirements/ directory is kept on purpose: a later step still reads requirements/strategies.txt.
    rm -rf requirements.* && \
    rm assistant.py
|
2022-03-10 16:01:08 +00:00
|
|
|
|
|
|
|
# Replace the distribution cmake with CMake 3.20.2 built from source.
RUN \
    apt-get purge -y cmake && \
    wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \
    tar -zxvf cmake-3.20.2.tar.gz && \
    cd cmake-3.20.2 && \
    # Configure without OpenSSL support.
    ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \
    make && \
    make install && \
    # Remove the tarball and the build tree in this same layer; deleting them in a later
    # instruction would not shrink the image.
    cd .. && \
    rm -rf cmake-3.20.2 cmake-3.20.2.tar.gz && \
    cmake --version
|
2022-03-10 16:01:08 +00:00
|
|
|
|
2022-03-12 09:00:20 +00:00
|
|
|
# Horovod build switches: PyTorch only (TensorFlow and MXNet disabled), NCCL for GPU operations,
# with both Gloo and MPI enabled. HOROVOD_CUDA_HOME reuses CUDA_TOOLKIT_ROOT_DIR from the earlier ENV.
ENV \
    HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \
    HOROVOD_GPU_OPERATIONS=NCCL \
    HOROVOD_WITH_PYTORCH=1 \
    HOROVOD_WITH_GLOO=1 \
    HOROVOD_WITH_MPI=1 \
    HOROVOD_WITHOUT_TENSORFLOW=1 \
    HOROVOD_WITHOUT_MXNET=1
|
2022-03-12 09:00:20 +00:00
|
|
|
|
2022-03-10 16:01:08 +00:00
|
|
|
# Install the distributed-strategy requirements and verify the Horovod build.
RUN \
    # CUDA 10.2 doesn't support ampere architecture (8.0).
    # The major version is compared numerically; a string "<" comparison is lexicographic and
    # would, for example, sort "9.2" after "11.0".
    if [[ "${CUDA_VERSION%%.*}" -lt 11 ]]; then export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST//";8.0"/}; echo $TORCH_CUDA_ARCH_LIST; fi && \
    # Derive Horovod's compute-capability list from TORCH_CUDA_ARCH_LIST:
    # ";" becomes "," and the dots are dropped (e.g. "7.0;7.5" -> "70,75").
    HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \
    export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \
    echo $HOROVOD_BUILD_CUDA_CC_LIST && \
    cmake --version && \
    pip install --no-cache-dir -r ./requirements/strategies.txt && \
    horovodrun --check-build && \
    rm -rf requirements/
|
2020-10-26 10:47:09 +00:00
|
|
|
|
2020-11-12 14:03:43 +00:00
|
|
|
# Install NVIDIA DALI (needed for examples) on Python versions older than 3.9.
RUN \
    # Major CUDA version as reported by the installed torch build; selects the DALI wheel name.
    CUDA_VERSION_MAJOR=$(python -c "import torch; print(torch.version.cuda.split('.')[0])") && \
    # py_ver is 1 when PYTHON_VERSION >= 3.9 and 0 otherwise. The components are compared as
    # integers: comparing the raw strings would rank "3.10" below "3.9".
    py_ver=$(python -c "print(int(tuple(map(int, '$PYTHON_VERSION'.split('.'))) >= (3, 9)))") && \
    # install DALI, needed for examples
    # todo: waiting for 1.4 - https://github.com/NVIDIA/DALI/issues/3144#issuecomment-877386691
    if [ "$py_ver" -eq 0 ]; then \
        pip install --extra-index-url https://developer.download.nvidia.com/compute/redist "nvidia-dali-cuda${CUDA_VERSION_MAJOR}0>1.0" ; \
        python -c 'from nvidia.dali.pipeline import Pipeline' ; \
    fi
|
|
|
|
|
2020-10-26 10:47:09 +00:00
|
|
|
# Build and install NVIDIA apex with its C++ and CUDA extensions.
RUN \
    # CUDA 10.2 doesn't support ampere architecture (8.0).
    # The major version is compared numerically; a string "<" comparison is lexicographic and
    # would, for example, sort "9.2" after "11.0".
    if [[ "${CUDA_VERSION%%.*}" -lt 11 ]]; then export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST//";8.0"/}; echo $TORCH_CUDA_ARCH_LIST; fi && \
    # install NVIDIA apex
    pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \
    # Fail the build if the package cannot be imported.
    python -c "from apex import amp"
|
2020-09-30 12:33:22 +00:00
|
|
|
|
2022-04-11 15:29:54 +00:00
|
|
|
# Install Bagua 0.9.0 for the image's CUDA version and confirm that it imports.
RUN \
    # install Bagua
    # The package name carries CUDA "major.minor" without the dot (e.g. 11.3.1 -> bagua-cuda113).
    CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \
    pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \
    # Let bagua_core install its own dependencies, then print the installed version.
    python -c "import bagua_core; bagua_core.install_deps()" && \
    python -c "import bagua; print(bagua.__version__)"
|
|
|
|
|
|
|
|
# Availability check scripts; they are removed again at the end of the RUN step below.
COPY requirements/check-avail-strategies.py check-avail-strategies.py
COPY requirements/check-avail-extras.py check-avail-extras.py

# Final verification of the assembled environment.
RUN \
    # Show what we have
    pip --version && \
    pip list && \
    # The interpreter and torch must match the versions requested through the build args.
    python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \
    python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \
    # Run the availability checks for extras and strategies.
    python check-avail-extras.py && \
    python check-avail-strategies.py && \
    rm check-avail-*.py
|