From 70b257c17fbc4d3f21b732215c1463d1d0d7f59e Mon Sep 17 00:00:00 2001 From: Jirka Borovec <6035284+Borda@users.noreply.github.com> Date: Thu, 2 Mar 2023 23:17:29 +0100 Subject: [PATCH] ci/gpu: fix install future & use local cache (#16929) --- .azure/gpu-tests-fabric.yml | 3 ++- .azure/gpu-tests-pytorch.yml | 10 +++++++--- .github/workflows/ci-dockers-pytorch.yml | 3 ++- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml index ad7c22566b..489cd0395e 100644 --- a/.azure/gpu-tests-fabric.yml +++ b/.azure/gpu-tests-fabric.yml @@ -47,11 +47,12 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) FREEZE_REQUIREMENTS: "1" + PIP_CACHE_DIR: "/var/tmp/pip" container: image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1" # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' - options: "--gpus=all --shm-size=2gb" + options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp" strategy: matrix: 'pkg: Fabric': diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml index 35ab204df0..04df94898a 100644 --- a/.azure/gpu-tests-pytorch.yml +++ b/.azure/gpu-tests-pytorch.yml @@ -56,7 +56,7 @@ jobs: scope: "" PACKAGE_NAME: "pytorch" 'PyTorch | future': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch1.13-cuda11.7.1" scope: "future" PACKAGE_NAME: "pytorch" 'Lightning | latest': @@ -67,11 +67,12 @@ jobs: variables: DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' ) FREEZE_REQUIREMENTS: "1" + PIP_CACHE_DIR: "/var/tmp/pip" container: image: $(image) # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' - options: "--gpus=all --shm-size=2gb" + options: "--gpus=all --shm-size=2gb -v /var/tmp:/var/tmp" workspace: clean: all steps: @@ -114,8 +115,11 @@ jobs: displayName: 'Install package & extras' - bash: | + pip install -U -q pip pip uninstall -y torch torchvision - pip install torch torchvision -U --pre --no-cache -f https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}/torch_test.html + pip install "torch==2.0.0" "torchvision==0.15.0" -U --pre \ + -f "https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}/torch_test.html" \ + -f "https://download.pytorch.org/whl/nightly/cu${CUDA_VERSION_MM}/torch_nightly.html" python -c "from torch import __version__ as ver; assert ver.startswith('2.0.0'), ver" condition: eq(variables['scope'], 'future') displayName: 'bump to future' diff --git a/.github/workflows/ci-dockers-pytorch.yml b/.github/workflows/ci-dockers-pytorch.yml index 5d04f23c68..46f5eab1c5 100644 --- a/.github/workflows/ci-dockers-pytorch.yml +++ b/.github/workflows/ci-dockers-pytorch.yml @@ -8,7 +8,7 @@ on: types: [opened, reopened, ready_for_review, synchronize] # added `ready_for_review` since draft is skipped paths: - ".actions/**" - - ".github/workflows/ci-pytorch-dockers.yml" + - ".github/workflows/ci-dockers-pytorch.yml" - "dockers/**" - "requirements/*.txt" - "requirements/pytorch/**" @@ -105,6 +105,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} - {python_version: "3.9", pytorch_version: "1.13", cuda_version: "11.7.1"} + - {python_version: "3.10", pytorch_version: "1.13", cuda_version: "11.7.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2