diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index e517d5d7c1..0a32aede34 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -95,16 +95,15 @@ jobs: strategy: fail-fast: false matrix: - # the config used in '.github/workflows/ci_test-conda.yml' - python_version: ["3.8"] - pytorch_version: ["1.8", "1.9", "1.10"] + include: + # see: https://pytorch.org/get-started/previous-versions/ + - {python_version: "3.8", pytorch_version: "1.8", cuda_version: "11.1"} + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1"} + - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} steps: - name: Checkout uses: actions/checkout@v2 - - run: | - cuda=$(python -c "from distutils.version import LooseVersion as LVer ; print(11.1 if LVer('${{matrix.pytorch_version}}') > LVer('1.7') else 10.2)" 2>&1) - echo "::set-output name=CUDA::$cuda" - id: extend - name: Build Conda Docker # publish master/release uses: docker/build-push-action@v2 @@ -112,7 +111,7 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ steps.extend.outputs.CUDA }} + CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: false timeout-minutes: 75 diff --git a/.github/workflows/ci_test-conda.yml b/.github/workflows/ci_test-conda.yml index 61a4267b62..c7bb79060c 100644 --- a/.github/workflows/ci_test-conda.yml +++ b/.github/workflows/ci_test-conda.yml @@ -33,14 +33,19 @@ jobs: - uses: actions/checkout@v2 - name: Update dependencies + env: + HOROVOD_BUILD_ARCH_FLAGS: "-mfma" + HOROVOD_WITHOUT_MXNET: 1 + HOROVOD_WITHOUT_TENSORFLOW: 1 run: | + set -e conda info conda list # adjust versions according installed Torch version python ./requirements/adjust-versions.py requirements/extra.txt python ./requirements/adjust-versions.py 
requirements/examples.txt - pip install --requirement requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html - # set a per-test timeout of 2.5 minutes to fail sooner. this aids with hanging tests + pip install -r requirements/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + # set a per-test timeout of 2.5 minutes to fail sooner; this aids with hanging tests pip install pytest-timeout pip list # sanity check diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 006c98e2de..9704139d1d 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -93,7 +93,7 @@ jobs: # the config used in '.azure-pipelines/gpu-tests.yml' - {python_version: "3.7", pytorch_version: "1.8"} # latest (not used) - - {python_version: "3.9", pytorch_version: "1.10"} + - {python_version: "3.9", pytorch_version: "1.11"} steps: - name: Checkout diff --git a/dockers/README.md b/dockers/README.md index 319b78f421..e960e98bc3 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -14,9 +14,9 @@ or with specific arguments ```bash git clone docker image build \ - -t pytorch-lightning:base-cuda-py3.9-pt1.8 \ + -t pytorch-lightning:base-cuda-py3.7-pt1.8 \ -f dockers/base-cuda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ + --build-arg PYTHON_VERSION=3.7 \ --build-arg PYTORCH_VERSION=1.8 \ . ``` @@ -26,10 +26,10 @@ or nightly version from Conda ```bash git clone docker image build \ - -t pytorch-lightning:base-conda-py3.8-pt1.9 \ + -t pytorch-lightning:base-conda-py3.9-pt1.11 \ -f dockers/base-conda/Dockerfile \ - --build-arg PYTHON_VERSION=3.8 \ - --build-arg PYTORCH_VERSION=1.9 \ + --build-arg PYTHON_VERSION=3.9 \ + --build-arg PYTORCH_VERSION=1.11 \ . 
``` diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 686160f4b8..85d5ed345a 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -14,7 +14,7 @@ ARG CUDA_VERSION=11.3.1 -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.8 @@ -59,8 +59,8 @@ ENV \ LD_LIBRARY_PATH="/root/miniconda3/lib:$LD_LIBRARY_PATH" \ CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \ MKL_THREADING_LAYER=GNU \ - MAKEFLAGS="-j$(nproc)" \ - # MAKEFLAGS="-j1" \ + # MAKEFLAGS="-j$(nproc)" \ + MAKEFLAGS="-j2" \ TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5;8.0" \ CONDA_ENV=lightning @@ -84,9 +84,9 @@ ENV \ PATH=/root/miniconda3/envs/${CONDA_ENV}/bin:$PATH \ LD_LIBRARY_PATH="/root/miniconda3/envs/${CONDA_ENV}/lib:$LD_LIBRARY_PATH" +COPY ./requirements.txt requirements.txt COPY ./requirements/extra.txt requirements-extra.txt COPY ./requirements/examples.txt requirements-examples.txt -COPY ./requirements/test.txt requirements-test.txt COPY ./requirements/adjust-versions.py requirements_adjust_versions.py COPY ./.actions/assistant.py assistant.py @@ -95,14 +95,25 @@ RUN \ python -c "import torch; print(torch.__version__)" && \ python requirements_adjust_versions.py requirements-extra.txt && \ python -c "print(' '.join([ln for ln in open('requirements-extra.txt').readlines() if 'horovod' in ln]))" > requirements_horovod.txt && \ + pip install -q fire && \ python assistant.py requirements_prune_pkgs requirements-extra.txt "horovod" && \ python requirements_adjust_versions.py requirements-examples.txt && \ # Install remaining requirements + pip install -r requirements.txt --no-cache-dir --find-links https://download.pytorch.org/whl/test/torch_test.html && \ pip install -r requirements-extra.txt --no-cache-dir --find-links https://download.pytorch.org/whl/test/torch_test.html && \ pip install -r requirements-examples.txt --no-cache-dir --find-links 
https://download.pytorch.org/whl/test/torch_test.html && \ - pip install -r requirements-test.txt --no-cache-dir && \ rm assistant.py +RUN \ + apt-get purge -y cmake && \ + wget -q https://github.com/Kitware/CMake/releases/download/v3.20.2/cmake-3.20.2.tar.gz && \ + tar -zxvf cmake-3.20.2.tar.gz && \ + cd cmake-3.20.2 && \ + ./bootstrap -- -DCMAKE_USE_OPENSSL=OFF && \ + make && \ + make install && \ + cmake --version + ENV \ # if you want this environment to be the default o \ne, uncomment the following line: CONDA_DEFAULT_ENV=${CONDA_ENV} \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 13e60f1cce..739ff591eb 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -ARG CUDA_VERSION=10.2 +ARG CUDA_VERSION=11.1 -FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu18.04 +FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.8 @@ -28,8 +28,8 @@ ENV \ CUDA_TOOLKIT_ROOT_DIR="/usr/local/cuda" \ TORCH_CUDA_ARCH_LIST="3.7;5.0;6.0;7.0;7.5;8.0" \ MKL_THREADING_LAYER=GNU \ - MAKEFLAGS="-j$(nproc)" - # MAKEFLAGS="-j1" + # MAKEFLAGS="-j$(nproc)" + MAKEFLAGS="-j2" RUN apt-get update -qq --fix-missing && \ apt-get install -y --no-install-recommends \ @@ -64,7 +64,7 @@ RUN apt-get update -qq --fix-missing && \ COPY ./requirements.txt requirements.txt COPY ./requirements/ ./requirements/ -COPY ./.github/prune-packages.py requirements/prune_packages.py +COPY ./.actions/assistant.py assistant.py ENV PYTHONPATH=/usr/lib/python${PYTHON_VERSION}/site-packages @@ -73,27 +73,21 @@ RUN \ python${PYTHON_VERSION} get-pip.py && \ rm get-pip.py && \ + pip install -q fire && \ # Disable cache \ - export BAGUA_CUDA_VERSION=${CUDA_VERSION//"."/""} && \ + CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ + export BAGUA_CUDA_VERSION=$CUDA_VERSION_MM && \ pip 
config set global.cache-dir false && \ # set particular PyTorch version python ./requirements/adjust-versions.py requirements.txt ${PYTORCH_VERSION} && \ python ./requirements/adjust-versions.py requirements/extra.txt ${PYTORCH_VERSION} && \ python ./requirements/adjust-versions.py requirements/examples.txt ${PYTORCH_VERSION} && \ python -c "print(' '.join([ln for ln in open('requirements/extra.txt').readlines() if 'horovod' in ln]))" > ./requirements/horovod.txt && \ - python requirements/prune_packages.py requirements/extra.txt "horovod" && \ - # Install all requirements - pip install -r requirements/devel.txt --no-cache-dir && \ - rm -rf requirements.* - -ENV \ - HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \ - HOROVOD_GPU_OPERATIONS=NCCL \ - HOROVOD_WITH_PYTORCH=1 \ - HOROVOD_WITHOUT_TENSORFLOW=1 \ - HOROVOD_WITHOUT_MXNET=1 \ - HOROVOD_WITH_GLOO=1 \ - HOROVOD_WITHOUT_MPI=1 + python assistant.py requirements_prune_pkgs requirements/extra.txt "horovod" && \ + # Install all requirements \ + pip install -r requirements/devel.txt --no-cache-dir --find-links https://download.pytorch.org/whl/cu${CUDA_VERSION_MM}/torch_stable.html && \ + rm -rf requirements.* && \ + rm assistant.py RUN \ apt-get purge -y cmake && \ @@ -105,6 +99,15 @@ RUN \ make install && \ cmake --version +ENV \ + HOROVOD_CUDA_HOME=$CUDA_TOOLKIT_ROOT_DIR \ + HOROVOD_GPU_OPERATIONS=NCCL \ + HOROVOD_WITH_PYTORCH=1 \ + HOROVOD_WITHOUT_TENSORFLOW=1 \ + HOROVOD_WITHOUT_MXNET=1 \ + HOROVOD_WITH_GLOO=1 \ + HOROVOD_WITHOUT_MPI=1 + RUN \ HOROVOD_BUILD_CUDA_CC_LIST=${TORCH_CUDA_ARCH_LIST//";"/","} && \ export HOROVOD_BUILD_CUDA_CC_LIST=${HOROVOD_BUILD_CUDA_CC_LIST//"."/""} && \