From fab2ff35ad98ed021ee1d151bb67d3a4088c848b Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Sat, 14 May 2022 10:59:03 +0900 Subject: [PATCH] CI: Azure - multiple configs (#12984) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * CI: Azure - multiple configs * names * benchmark * Apply suggestions from code review Co-authored-by: Akihiro Nitta Co-authored-by: Carlos Mocholí --- .azure-pipelines/gpu-benchmark.yml | 8 +------- .azure-pipelines/gpu-tests.yml | 13 +++++++------ .github/workflows/ci_schema.yml | 8 ++++---- dockers/base-cuda/Dockerfile | 3 +-- tests/strategies/test_ddp_fully_sharded_native.py | 14 +++++++------- 5 files changed, 20 insertions(+), 26 deletions(-) diff --git a/.azure-pipelines/gpu-benchmark.yml b/.azure-pipelines/gpu-benchmark.yml index ea29c3d120..cfccbf7081 100644 --- a/.azure-pipelines/gpu-benchmark.yml +++ b/.azure-pipelines/gpu-benchmark.yml @@ -28,18 +28,12 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-gpus-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all steps: - - bash: | - # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation. 
- pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html - pip list - displayName: 'Install PyTorch LTS' - - bash: | python -m pytest tests/benchmarks -v --durations=0 displayName: 'Testing: benchmarks' diff --git a/.azure-pipelines/gpu-tests.yml b/.azure-pipelines/gpu-tests.yml index 439338156b..3093fafa44 100644 --- a/.azure-pipelines/gpu-tests.yml +++ b/.azure-pipelines/gpu-tests.yml @@ -18,6 +18,12 @@ pr: jobs: - job: pytest + strategy: + matrix: + 'PyTorch - LTS': + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" + 'PyTorch - stable': + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" # how long to run the job before automatically cancelling timeoutInMinutes: "65" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -25,11 +31,8 @@ jobs: pool: azure-gpus-spot - # ToDo: this need to have installed docker in the base image... container: - # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04 - # run on torch 1.8 as it's the LTS version - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8" + image: $(image) # default shm size is 64m. Increase it to avoid: # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8' options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m" @@ -52,8 +55,6 @@ jobs: - bash: | python -c "fname = 'requirements/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") - # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation. 
- pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" pip install . --requirement requirements/devel.txt pip install . --requirement requirements/strategies.txt diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci_schema.yml index acf210c58d..54efaff27a 100644 --- a/.github/workflows/ci_schema.yml +++ b/.github/workflows/ci_schema.yml @@ -16,9 +16,9 @@ jobs: pip install "check-jsonschema>=0.10" - name: GH Workflows - run: | - check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows" + run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows" - name: Azure Pipelines - run: | - check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json" + env: + SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json + run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE" diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 321c02d844..ded351f163 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -12,10 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-ARG CUDA_VERSION=11.3.1 ARG UBUNTU_VERSION=20.04 +ARG CUDA_VERSION=11.3.1 -# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} ARG PYTHON_VERSION=3.9 diff --git a/tests/strategies/test_ddp_fully_sharded_native.py b/tests/strategies/test_ddp_fully_sharded_native.py index cf4973e5ae..934e55af14 100644 --- a/tests/strategies/test_ddp_fully_sharded_native.py +++ b/tests/strategies/test_ddp_fully_sharded_native.py @@ -9,16 +9,16 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 from tests.helpers.boring_model import BoringModel from tests.helpers.runif import RunIf -if _TORCH_GREATER_EQUAL_1_11: +if _TORCH_GREATER_EQUAL_1_12: from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel from torch.distributed.fsdp.wrap import wrap -@RunIf(min_torch="1.11") +@RunIf(min_torch="1.12dev") def test_invalid_on_cpu(tmpdir): """Test to ensure that to raise Misconfiguration for Native FSDP on CPU.""" with pytest.raises( @@ -34,7 +34,7 @@ def test_invalid_on_cpu(tmpdir): @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"}) @mock.patch("torch.cuda.device_count", return_value=1) @mock.patch("torch.cuda.is_available", return_value=True) -@RunIf(min_torch="1.11") +@RunIf(min_torch="1.12dev") def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir): """Test to ensure that plugin native amp plugin raises Misconfiguration error.""" with pytest.raises( @@ -102,7 +102,7 @@ class TestFSDPModel(BoringModel): assert self.layer.module[2].reshard_after_forward is True -@RunIf(min_gpus=2, skip_windows=True, 
standalone=True, min_torch="1.11") +@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev") def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): """Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run.""" @@ -119,7 +119,7 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.11") +@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.12dev") def test_fully_sharded_native_strategy_checkpoint(tmpdir): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" @@ -130,7 +130,7 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir): _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt")) -@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11") +@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev") def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir): """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run."""