ci: parameterize GPU testing (#16697)

Jirka Borovec 2023-02-09 22:39:03 +09:00 committed by GitHub
parent 68850aada4
commit 4f35c7c356
2 changed files with 48 additions and 39 deletions
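
Taken together, the two files below stop hard-coding the tested package and instead run a job matrix that sets PACKAGE_NAME, with a job-level COVERAGE_SCOPE variable derived from it. A minimal sketch of how that mapping resolves, using only the two inline `python -c` expressions added in the hunks below; the coverage_scope helper is illustrative and not part of the change:

    # Sketch of the new COVERAGE_SCOPE resolution per matrix entry.
    # The dict mappings are copied verbatim from the inline expressions in the hunks below.
    def coverage_scope(package_name: str, mapping: dict) -> str:
        """Return the coverage/import source for a PACKAGE_NAME; unmapped names fall through."""
        return mapping.get(package_name, package_name)

    # fabric pipeline (first file below)
    fabric_map = dict(fabric="lightning_fabric")
    assert coverage_scope("fabric", fabric_map) == "lightning_fabric"
    assert coverage_scope("lightning", fabric_map) == "lightning"

    # pytorch pipeline (second file below)
    pytorch_map = dict(pytorch="pytorch_lightning")
    assert coverage_scope("pytorch", pytorch_map) == "pytorch_lightning"
    assert coverage_scope("lightning", pytorch_map) == "lightning"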

View File

@@ -45,15 +45,21 @@ jobs:
   pool: lit-rtx-3090
   variables:
     DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
+    FREEZE_REQUIREMENTS: "1"
+    COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))' )
   container:
     image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
     # default shm size is 64m. Increase it to avoid:
     # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
     options: "--gpus=all --shm-size=2gb"
-  # TODO: package parametrization
+  strategy:
+    matrix:
+      'pkg: Fabric':
+        PACKAGE_NAME: "fabric"
+      'pkg: Lightning':
+        PACKAGE_NAME: "lightning"
   workspace:
     clean: all
-
   steps:
   - bash: |
       echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
@@ -62,6 +68,8 @@ jobs:
     displayName: 'set env. vars'

   - bash: |
+      echo $(DEVICES)
+      echo $(COVERAGE_SCOPE)
       echo $CUDA_VISIBLE_DEVICES
       echo $TORCH_URL
       lspci | egrep 'VGA|3D'
@@ -80,11 +88,7 @@ jobs:
       done
     displayName: 'Adjust dependencies'

-  - bash: |
-      pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
-    env:
-      PACKAGE_NAME: "fabric"
-      FREEZE_REQUIREMENTS: "1"
+  - bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
     displayName: 'Install package & dependencies'

   - bash: |
@@ -94,17 +98,26 @@ jobs:
       python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
     displayName: 'Env details'

+  - bash: python -m pytest lightning_fabric
+    workingDirectory: src
+    condition: eq(variables['PACKAGE_NAME'], 'fabric')
+    displayName: 'Testing: Fabric doctests'
+
   - bash: |
       pip install -q -r .actions/requirements.txt
       python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
         --source_import="lightning.fabric,lightning.pytorch" \
         --target_import="lightning_fabric,pytorch_lightning"
-    displayName: 'Adjust tests'
+      python .actions/assistant.py copy_replace_imports --source_dir="./examples" \
+        --source_import="lightning.fabric,lightning.pytorch" \
+        --target_import="lightning_fabric,pytorch_lightning"
+    condition: eq(variables['PACKAGE_NAME'], 'fabric')
+    displayName: 'Adjust tests & examples'

-  - bash: python -m coverage run --source lightning_fabric -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+  - bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
+    workingDirectory: tests/tests_fabric
     env:
       PL_RUN_CUDA_TESTS: "1"
-    workingDirectory: tests/tests_fabric
     displayName: 'Testing: fabric standard'
     timeoutInMinutes: "10"
@@ -113,6 +126,7 @@ jobs:
     env:
       PL_RUN_CUDA_TESTS: "1"
       PL_STANDALONE_TESTS_SOURCE: "lightning_fabric"
+    condition: eq(variables['PACKAGE_NAME'], 'fabric')
     displayName: 'Testing: fabric standalone tests'
     timeoutInMinutes: "10"
@@ -120,21 +134,13 @@ jobs:
       python -m coverage report
       python -m coverage xml
       python -m coverage html
-      python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
+      python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
+        --flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
       ls -l
     workingDirectory: tests/tests_fabric
     displayName: 'Statistics'

-  - task: PublishTestResults@2
-    displayName: 'Publish test results'
-    inputs:
-      testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
-      testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
-    condition: succeededOrFailed()
-
   - script: |
-      # In order to run the examples, we need to substitute the meta package imports with the standalone package
-      python ../.actions/assistant.py copy_replace_imports --source_dir="./fabric" --source_import="lightning.fabric" --target_import="lightning_fabric.fabric"
       set -e
       bash run_fabric_examples.sh --accelerator=cuda --devices=1
       bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
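
The pipeline above pins each job to specific GPUs by deriving DEVICES from the agent name and exporting it as CUDA_VISIBLE_DEVICES. A small sketch of that derivation; the agent name below is a hypothetical example, and only the split("_")[-1] logic and the ##vso logging command come from the file above:

    # Sketch of the DEVICES derivation: the last "_"-separated token of the Azure agent
    # name is treated as the GPU index list and exported as CUDA_VISIBLE_DEVICES.
    agent_name = "lit-rtx-3090_2,3"      # hypothetical example value of $(Agent.Name)
    devices = agent_name.split("_")[-1]  # -> "2,3", as computed for the DEVICES variable
    print(f"##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]{devices}")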

View File

@@ -40,21 +40,29 @@ pr:
 jobs:
 - job: testing
+  # how long to run the job before automatically cancelling
+  timeoutInMinutes: "80"
+  # how much time to give 'run always even if cancelled tasks' before stopping them
+  cancelTimeoutInMinutes: "2"
   strategy:
     matrix:
       'PyTorch & strategies':  # this uses torch 1.12 as not all strategies support 1.13 yet
         image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
         scope: "strategies"
-      'PyTorch - latest':
+        PACKAGE_NAME: "pytorch"
+      'PyTorch | latest':
         image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
         scope: ""
-  # how long to run the job before automatically cancelling
-  timeoutInMinutes: "80"
-  # how much time to give 'run always even if cancelled tasks' before stopping them
-  cancelTimeoutInMinutes: "2"
+        PACKAGE_NAME: "pytorch"
+      'Lightning pkg':
+        image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
+        scope: ""
+        PACKAGE_NAME: "lightning"
   pool: lit-rtx-3090
   variables:
     DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
+    FREEZE_REQUIREMENTS: "1"
+    COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))' )
   container:
     image: $(image)
     # default shm size is 64m. Increase it to avoid:
@@ -62,7 +70,6 @@ jobs:
     options: "--gpus=all --shm-size=2gb"
   workspace:
     clean: all
-
   steps:
   - bash: |
@@ -75,6 +82,8 @@ jobs:
     displayName: 'set env. vars'

   - bash: |
+      echo $(DEVICES)
+      echo $(COVERAGE_SCOPE)
       echo $CUDA_VISIBLE_DEVICES
       echo $CUDA_VERSION_MM
       echo $PYTORCH_VERSION
@@ -95,9 +104,6 @@ jobs:
     displayName: 'Adjust dependencies'

   - bash: pip install -e .[extra,test,examples] --find-links ${TORCH_URL}
-    env:
-      PACKAGE_NAME: "pytorch"
-      FREEZE_REQUIREMENTS: "1"
     displayName: 'Install package & extras'

   - bash: pip uninstall -y -r requirements/pytorch/strategies.txt
@@ -132,6 +138,7 @@ jobs:

   - bash: python -m pytest pytorch_lightning
     workingDirectory: src
+    condition: eq(variables['PACKAGE_NAME'], 'pytorch')
     displayName: 'Testing: PyTorch doctests'

   - bash: |
@@ -139,12 +146,13 @@ jobs:
       python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
         --source_import="lightning.fabric,lightning.pytorch" \
         --target_import="lightning_fabric,pytorch_lightning"
+    condition: eq(variables['PACKAGE_NAME'], 'pytorch')
     displayName: 'Adjust tests'

-  - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+  - bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
+    workingDirectory: tests/tests_pytorch
     env:
       PL_RUN_CUDA_TESTS: "1"
-    workingDirectory: tests/tests_pytorch
     displayName: 'Testing: PyTorch standard'
     timeoutInMinutes: "35"
@@ -155,6 +163,7 @@ jobs:
       PL_RUN_CUDA_TESTS: "1"
       PL_STANDALONE_TESTS_SOURCE: "pytorch_lightning"
     displayName: 'Testing: PyTorch standalone tests'
+    condition: eq(variables['PACKAGE_NAME'], 'pytorch')
     timeoutInMinutes: "35"

   - bash: bash run_standalone_tasks.sh
@@ -169,18 +178,12 @@ jobs:
       python -m coverage report
       python -m coverage xml
       python -m coverage html
-      python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
+      python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
+        --flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
       ls -l
     workingDirectory: tests/tests_pytorch
     displayName: 'Statistics'

-  - task: PublishTestResults@2
-    displayName: 'Publish test results'
-    inputs:
-      testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
-      testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
-    condition: succeededOrFailed()
-
   - script: |
       set -e
       bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1
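
The per-package gating in both files relies on step conditions of the form eq(variables['PACKAGE_NAME'], '<pkg>'). A toy sketch of how those gates play out for the three matrix entries of the pipeline above; the step_runs helper stands in for Azure's expression engine and is not part of the change:

    # Illustrative only: mimics the added `condition: eq(variables['PACKAGE_NAME'], 'pytorch')`
    # gates, which skip doctest/standalone/"Adjust tests" steps for the 'Lightning pkg' entry.
    def step_runs(condition_pkg: str, package_name: str) -> bool:
        # stand-in for eq(variables['PACKAGE_NAME'], '<condition_pkg>')
        return package_name == condition_pkg

    for entry in ("pytorch", "pytorch", "lightning"):  # PACKAGE_NAME of the three matrix entries
        print(entry, "-> gated steps run:", step_runs("pytorch", entry))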