CI: Azure - multiple configs (#12984)

* CI: Azure - multiple configs
* names
* benchmark
* Apply suggestions from code review

Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Jirka Borovec authored 2022-05-14 10:59:03 +09:00, committed by GitHub
parent d28e365669
commit fab2ff35ad
5 changed files with 20 additions and 26 deletions

View File

@@ -28,18 +28,12 @@ jobs:
     cancelTimeoutInMinutes: "2"
     pool: azure-gpus-spot
     container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
     workspace:
       clean: all
     steps:
-    - bash: |
-        # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-        pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
-        pip list
-      displayName: 'Install PyTorch LTS'
     - bash: |
         python -m pytest tests/benchmarks -v --durations=0
       displayName: 'Testing: benchmarks'
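The benchmark job now relies on the PyTorch that ships inside the updated image, so only the pytest step remains; `--durations=0` tells pytest to report the runtime of every test, not just the slowest ones. A toy sketch of a test that fits this setup (the test name and time budget are hypothetical, not part of the PR):

```python
import time

def test_toy_benchmark():
    # Stand-in for real model work; with --durations=0, pytest prints
    # this test's wall-clock time in the run summary.
    start = time.monotonic()
    sum(i * i for i in range(1_000_000))
    assert time.monotonic() - start < 5.0  # loose, hypothetical budget
```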

View File

@@ -18,6 +18,12 @@ pr:
 jobs:
   - job: pytest
+    strategy:
+      matrix:
+        'PyTorch - LTS':
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+        'PyTorch - stable':
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
     # how long to run the job before automatically cancelling
     timeoutInMinutes: "65"
     # how much time to give 'run always even if cancelled tasks' before stopping them
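Each entry under `matrix` expands into its own copy of the `pytest` job, with the entry's variables made available to that job; this is how the single `$(image)` reference further down picks up a different container per leg. An illustrative Python sketch of the fan-out (values copied from the matrix above):

```python
matrix = {
    "PyTorch - LTS": "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8",
    "PyTorch - stable": "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11",
}

# One job per matrix entry; inside each, $(image) resolves to that entry's value.
for name, image in matrix.items():
    print(f"job: pytest [{name}] -> container image: {image}")
```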
@@ -25,11 +31,8 @@ jobs:
     pool: azure-gpus-spot
     # TODO: this needs Docker installed in the base image...
     container:
-      # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-      # run on torch 1.8 as it's the LTS version
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      image: $(image)
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
@@ -52,8 +55,6 @@ jobs:
     - bash: |
         python -c "fname = 'requirements/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
-        # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-        pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
         pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
         pip install . --requirement requirements/devel.txt
         pip install . --requirement requirements/strategies.txt
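The first two script lines are dense one-liners: the first rewrites `requirements/strategies.txt` without its horovod entry before the bulk install, and the second derives a compact CUDA tag such as `113` from the CUDA build torch was compiled against, which selects the matching `bagua-cuda` wheel. A more readable Python sketch of the same two steps (not part of the PR):

```python
import torch

# Step 1: drop the horovod line from the strategies requirements file.
fname = "requirements/strategies.txt"
with open(fname) as fh:
    lines = [line for line in fh if "horovod" not in line]
with open(fname, "w") as fh:
    fh.writelines(lines)

# Step 2: "11.3" -> "113"; used above as `pip install "bagua-cuda113>=0.9.0"`.
cuda_version_mm = "".join(torch.version.cuda.split(".")[:2])
```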

View File

@@ -16,9 +16,9 @@ jobs:
           pip install "check-jsonschema>=0.10"
       - name: GH Workflows
-        run: |
-          check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
+        run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
       - name: Azure Pipelines
-        run: |
-          check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json"
+        env:
+          SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
+        run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"
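`check-jsonschema` validates each YAML file against a JSON Schema; the change bumps the Azure Pipelines schema to v1.204.0 and moves the long URL into an `env` entry so the `run` line stays short. A rough Python equivalent of what the Azure Pipelines check does for one file (assumes `requests`, `pyyaml`, and `jsonschema` are installed; the file path is only an example):

```python
import jsonschema
import requests
import yaml

SCHEMA_FILE = (
    "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/"
    "v1.204.0/service-schema.json"
)

schema = requests.get(SCHEMA_FILE, timeout=30).json()
with open(".azure-pipelines/gpu-tests.yml") as fh:  # example path
    document = yaml.safe_load(fh)

# Raises jsonschema.ValidationError if the pipeline definition
# drifts from the service schema.
jsonschema.validate(instance=document, schema=schema)
```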

View File

@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-ARG CUDA_VERSION=11.3.1
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+ARG UBUNTU_VERSION=20.04
+ARG CUDA_VERSION=11.3.1
+# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 ARG PYTHON_VERSION=3.9
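Declaring both `ARG`s before `FROM` lets one Dockerfile build against different CUDA/Ubuntu pairings (CUDA 10.2 base images only exist up to ubuntu18.04, hence the TODO). A hypothetical helper mirroring how the two args combine into the base image tag:

```python
def base_image(cuda_version: str = "11.3.1", ubuntu_version: str = "20.04") -> str:
    # Mirrors the FROM line: nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
    return f"nvidia/cuda:{cuda_version}-devel-ubuntu{ubuntu_version}"

assert base_image() == "nvidia/cuda:11.3.1-devel-ubuntu20.04"
assert base_image("10.2", "18.04") == "nvidia/cuda:10.2-devel-ubuntu18.04"
```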

View File

@@ -9,16 +9,16 @@ from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf

-if _TORCH_GREATER_EQUAL_1_11:
+if _TORCH_GREATER_EQUAL_1_12:
     from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
     from torch.distributed.fsdp.wrap import wrap

-@RunIf(min_torch="1.11")
+@RunIf(min_torch="1.12dev")
 def test_invalid_on_cpu(tmpdir):
     """Test to ensure that a MisconfigurationException is raised for native FSDP on CPU."""
     with pytest.raises(
@@ -34,7 +34,7 @@ def test_invalid_on_cpu(tmpdir):
 @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 @mock.patch("torch.cuda.device_count", return_value=1)
 @mock.patch("torch.cuda.is_available", return_value=True)
-@RunIf(min_torch="1.11")
+@RunIf(min_torch="1.12dev")
 def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir):
     """Test to ensure that the native AMP plugin raises a MisconfigurationException."""
     with pytest.raises(
@@ -102,7 +102,7 @@ class TestFSDPModel(BoringModel):
         assert self.layer.module[2].reshard_after_forward is True

-@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
     """Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run."""
@@ -119,7 +119,7 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))

-@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_checkpoint(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""
@@ -130,7 +130,7 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir):
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))

-@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run."""