From 448be607013ae8ce3f9c73d7efb71f25fba93cbd Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sun, 2 Aug 2020 14:14:53 +0200 Subject: [PATCH] update GPU to PT 1.5 (#2779) * update gpu PT 1.6 * fix docker * use PT 1.5 * Update tests/install_AMP.sh Co-authored-by: Nathan Raw Co-authored-by: Nathan Raw --- .drone.yml | 2 +- dockers/cuda-extras/Dockerfile | 25 ++++++++++++++++++++++--- tests/install_AMP.sh | 3 ++- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/.drone.yml b/.drone.yml index edb6f48bbb..67f0c38758 100644 --- a/.drone.yml +++ b/.drone.yml @@ -6,7 +6,7 @@ name: torch-GPU steps: - name: testing - image: pytorchlightning/pytorch_lightning:devel-pt1.4 + image: pytorchlightning/pytorch_lightning:cuda-extras-py3.7-torch1.5 environment: SLURM_LOCALID: 0 diff --git a/dockers/cuda-extras/Dockerfile b/dockers/cuda-extras/Dockerfile index c4bc5cfb64..a1aaff0e7d 100644 --- a/dockers/cuda-extras/Dockerfile +++ b/dockers/cuda-extras/Dockerfile @@ -6,12 +6,16 @@ # --build-arg TORCH_VERSION=1.2 --build-arg CUDA_VERSION=10.0 # --build-arg TORCH_VERSION=1.1.0 --build-arg CUDA_VERSION=10.0 --build-arg CUDNN_VERSION=7.5 -ARG TORCH_VERSION=1.6 +ARG TORCH_VERSION=1.6.0 ARG CUDA_VERSION=10.1 ARG CUDNN_VERSION=7 +# TODO: make this image from pure Ubuntu + install all NVIDIA drivers +# FROM nvidia/cuda:${CUDA_VERSION}-base FROM pytorch/pytorch:${TORCH_VERSION}-cuda${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel +SHELL ["/bin/bash", "-c"] + ENV HOROVOD_GPU_ALLREDUCE=NCCL ENV HOROVOD_GPU_BROADCAST=NCCL ENV HOROVOD_WITH_PYTORCH=1 @@ -28,13 +32,28 @@ COPY ./requirements/extra.txt requirements-extra.txt COPY ./requirements/test.txt requirements-tests.txt COPY ./requirements/examples.txt requirements-examples.txt -RUN apt-get update && apt-get install -y cmake && \ - # Install AMP +RUN apt-get update && \ + apt-get install -y \ + git \ + cmake \ + && \ + +# Install AMP + # TODO: skip this install for PT >= 1.6 bash install_AMP.sh && \ +# Install all requirements pip 
install -r requirements.txt && \ # HOROVOD_BUILD_ARCH_FLAGS="-mfma" && \ pip install -r requirements-extra.txt && \ pip install -r requirements-examples.txt && \ pip install -r requirements-tests.txt && \ + rm install_AMP.sh && \ rm requirements* && \ + +# Cleaning + apt-get autoremove -y && \ + apt-get clean && \ + rm -rf /root/.cache && \ + +# Show what we have pip list diff --git a/tests/install_AMP.sh b/tests/install_AMP.sh index 2c56bb25b7..0c70e0bc34 100644 --- a/tests/install_AMP.sh +++ b/tests/install_AMP.sh @@ -4,6 +4,7 @@ ROOT=$PWD git clone https://github.com/NVIDIA/apex cd apex pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./ -pip install -v --no-cache-dir ./ +# If build with extensions fails, you can run this line to build without extensions +# pip install -v --no-cache-dir ./ cd $ROOT rm -rf apex