From f9b69ce5b0de32bfd2300eb8028fdbebf23fe5a1 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 12 Apr 2022 00:29:54 +0900 Subject: [PATCH] CI: check docker requires (#12677) * check docker requires * ci update * bagua * conda * cuda --- .azure-pipelines/gpu-tests.yml | 5 +++-- .github/workflows/ci_dockers.yml | 7 ++++--- dockers/README.md | 6 +++--- dockers/base-conda/Dockerfile | 25 +++++++++++++++++++++++-- dockers/base-cuda/Dockerfile | 18 ++++++++++++++---- 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 07274d5f37..0a2465b85c 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -52,9 +52,10 @@ jobs: - bash: | python -c "fname = 'requirements/extra.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" - pip install fairscale==0.4.5 + pip install "fairscale>=0.4.5" pip install deepspeed>=0.6.0 - pip install bagua-cuda102==0.9.0 + CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") + pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" pip install . 
--requirement requirements/devel.txt pip list displayName: 'Install dependencies' diff --git a/.github/workflows/ci_dockers.yml b/.github/workflows/ci_dockers.yml index 0a32aede34..b1504464c8 100644 --- a/.github/workflows/ci_dockers.yml +++ b/.github/workflows/ci_dockers.yml @@ -73,9 +73,10 @@ jobs: strategy: fail-fast: false matrix: - # the config used in '.azure-pipelines/gpu-tests.yml' - python_version: ["3.7"] - pytorch_version: ["1.8"] + include: + # the config used in '.azure-pipelines/gpu-tests.yml' + - {python_version: "3.7", pytorch_version: "1.8"} + - {python_version: "3.9", pytorch_version: "1.10"} steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockers/README.md b/dockers/README.md index e960e98bc3..e6a5f613a6 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -14,10 +14,10 @@ or with specific arguments ```bash git clone docker image build \ - -t pytorch-lightning:base-cuda-py3.7-pt1.8 \ + -t pytorch-lightning:base-cuda-py3.9-pt1.10 \ -f dockers/base-cuda/Dockerfile \ - --build-arg PYTHON_VERSION=3.7 \ - --build-arg PYTORCH_VERSION=1.8 \ + --build-arg PYTHON_VERSION=3.9 \ + --build-arg PYTORCH_VERSION=1.10 \ . 
``` diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 5499c613d5..790c997f7b 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -147,6 +147,26 @@ RUN \ pip install --no-cache-dir --global-option="--cuda_ext" https://github.com/NVIDIA/apex/archive/refs/heads/master.zip && \ python -c "from apex import amp" +RUN \ + # install FairScale + pip install fairscale==0.4.5 && \ + python -c "import fairscale; print(fairscale.__version__)" + +RUN \ + # install DeepSpeed + pip install deepspeed==0.6.0 && \ + python -c "import deepspeed; print(deepspeed.__version__)" + +RUN \ + # install Bagua + CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ + pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \ + python -c "import bagua_core; bagua_core.install_deps()" && \ + python -c "import bagua; print(bagua.__version__)" + +COPY requirements/check-avail-extras.py check-avail-extras.py +COPY requirements/check-avail-strategies.py check-avail-strategies.py + RUN \ # Show what we have pip --version && \ @@ -154,5 +174,6 @@ RUN \ pip list && \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ - python -c "import horovod.torch" && \ - python -c "from horovod.torch import nccl_built; nccl_built()" + python check-avail-extras.py && \ + python check-avail-strategies.py && \ + rm check-avail-*.py diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 9c5b0f681c..44ffab8833 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -76,7 +76,6 @@ RUN \ pip install -q fire && \ # Disable cache \ CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - export BAGUA_CUDA_VERSION=$CUDA_VERSION_MM && \ pip config set global.cache-dir false && \ # set particular PyTorch 
version python ./requirements/adjust-versions.py requirements.txt ${PYTORCH_VERSION} && \ @@ -138,14 +137,25 @@ RUN \ RUN \ # install DeepSpeed - pip install deepspeed==0.5.7 && \ + pip install deepspeed==0.6.0 && \ python -c "import deepspeed; print(deepspeed.__version__)" +RUN \ + # install Bagua + CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ + pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \ + python -c "import bagua_core; bagua_core.install_deps()" && \ + python -c "import bagua; print(bagua.__version__)" + +COPY requirements/check-avail-extras.py check-avail-extras.py +COPY requirements/check-avail-strategies.py check-avail-strategies.py + RUN \ # Show what we have pip --version && \ pip list && \ python -c "import sys; ver = sys.version_info ; assert f'{ver.major}.{ver.minor}' == '$PYTHON_VERSION', ver" && \ python -c "import torch; assert torch.__version__.startswith('$PYTORCH_VERSION'), torch.__version__" && \ - python -c "import horovod.torch" && \ - python -c "from horovod.torch import nccl_built; nccl_built()" + python check-avail-extras.py && \ + python check-avail-strategies.py && \ + rm check-avail-*.py