ci: parameterize GPU testing (#16697)

Jirka Borovec 2023-02-09 22:39:03 +09:00 committed by GitHub
parent 68850aada4
commit 4f35c7c356
2 changed files with 48 additions and 39 deletions
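
Taken together, the two files below stop hard-coding the tested package and instead run a job matrix that sets PACKAGE_NAME, with a job-level COVERAGE_SCOPE variable derived from it. A minimal sketch of how that mapping resolves, using only the two inline `python -c` expressions added in the hunks below; the coverage_scope helper is illustrative and not part of the change:

    # Sketch of the new COVERAGE_SCOPE resolution per matrix entry.
    # The dict mappings are copied verbatim from the inline expressions in the hunks below.
    def coverage_scope(package_name: str, mapping: dict) -> str:
        """Return the coverage/import source for a PACKAGE_NAME; unmapped names fall through."""
        return mapping.get(package_name, package_name)

    # fabric pipeline (first file below)
    fabric_map = dict(fabric="lightning_fabric")
    assert coverage_scope("fabric", fabric_map) == "lightning_fabric"
    assert coverage_scope("lightning", fabric_map) == "lightning"

    # pytorch pipeline (second file below)
    pytorch_map = dict(pytorch="pytorch_lightning")
    assert coverage_scope("pytorch", pytorch_map) == "pytorch_lightning"
    assert coverage_scope("lightning", pytorch_map) == "lightning"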

View File

@@ -45,15 +45,21 @@ jobs:
   pool: lit-rtx-3090
   variables:
     DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
+    FREEZE_REQUIREMENTS: "1"
+    COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))' )
   container:
     image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
     # default shm size is 64m. Increase it to avoid:
     # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
     options: "--gpus=all --shm-size=2gb"
-  # TODO: package parametrization
+  strategy:
+    matrix:
+      'pkg: Fabric':
+        PACKAGE_NAME: "fabric"
+      'pkg: Lightning':
+        PACKAGE_NAME: "lightning"
   workspace:
     clean: all
-
   steps:
   - bash: |
       echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
@@ -62,6 +68,8 @@ jobs:
     displayName: 'set env. vars'

   - bash: |
+      echo $(DEVICES)
+      echo $(COVERAGE_SCOPE)
       echo $CUDA_VISIBLE_DEVICES
       echo $TORCH_URL
       lspci | egrep 'VGA|3D'
@@ -80,11 +88,7 @@ jobs:
       done
     displayName: 'Adjust dependencies'

-  - bash: |
-      pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
-    env:
-      PACKAGE_NAME: "fabric"
-      FREEZE_REQUIREMENTS: "1"
+  - bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
     displayName: 'Install package & dependencies'

   - bash: |
@@ -94,17 +98,26 @@ jobs:
       python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
     displayName: 'Env details'

+  - bash: python -m pytest lightning_fabric
+    workingDirectory: src
+    condition: eq(variables['PACKAGE_NAME'], 'fabric')
+    displayName: 'Testing: Fabric doctests'
+
   - bash: |
       pip install -q -r .actions/requirements.txt
       python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
         --source_import="lightning.fabric,lightning.pytorch" \
         --target_import="lightning_fabric,pytorch_lightning"
-    displayName: 'Adjust tests'
+      python .actions/assistant.py copy_replace_imports --source_dir="./examples" \
+        --source_import="lightning.fabric,lightning.pytorch" \
+        --target_import="lightning_fabric,pytorch_lightning"
+    condition: eq(variables['PACKAGE_NAME'], 'fabric')
+    displayName: 'Adjust tests & examples'

-  - bash: python -m coverage run --source lightning_fabric -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+  - bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
+    workingDirectory: tests/tests_fabric
     env:
       PL_RUN_CUDA_TESTS: "1"
-    workingDirectory: tests/tests_fabric
     displayName: 'Testing: fabric standard'
     timeoutInMinutes: "10"
@@ -113,6 +126,7 @@ jobs:
     env:
       PL_RUN_CUDA_TESTS: "1"
       PL_STANDALONE_TESTS_SOURCE: "lightning_fabric"
+    condition: eq(variables['PACKAGE_NAME'], 'fabric')
     displayName: 'Testing: fabric standalone tests'
     timeoutInMinutes: "10"
@@ -120,21 +134,13 @@ jobs:
       python -m coverage report
       python -m coverage xml
       python -m coverage html
-      python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
+      python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
+        --flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
       ls -l
     workingDirectory: tests/tests_fabric
     displayName: 'Statistics'

-  - task: PublishTestResults@2
-    displayName: 'Publish test results'
-    inputs:
-      testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
-      testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
-    condition: succeededOrFailed()
-
   - script: |
-      # In order to run the examples, we need to substitute the meta package imports with the standalone package
-      python ../.actions/assistant.py copy_replace_imports --source_dir="./fabric" --source_import="lightning.fabric" --target_import="lightning_fabric.fabric"
       set -e
       bash run_fabric_examples.sh --accelerator=cuda --devices=1
       bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
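
The pipeline above pins each job to specific GPUs by deriving DEVICES from the agent name and exporting it as CUDA_VISIBLE_DEVICES. A small sketch of that derivation; the agent name below is a hypothetical example, and only the split("_")[-1] logic and the ##vso logging command come from the file above:

    # Sketch of the DEVICES derivation: the last "_"-separated token of the Azure agent
    # name is treated as the GPU index list and exported as CUDA_VISIBLE_DEVICES.
    agent_name = "lit-rtx-3090_2,3"      # hypothetical example value of $(Agent.Name)
    devices = agent_name.split("_")[-1]  # -> "2,3", as computed for the DEVICES variable
    print(f"##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]{devices}")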

View File

@@ -40,21 +40,29 @@ pr:
 jobs:
 - job: testing
+  # how long to run the job before automatically cancelling
+  timeoutInMinutes: "80"
+  # how much time to give 'run always even if cancelled tasks' before stopping them
+  cancelTimeoutInMinutes: "2"
   strategy:
     matrix:
       'PyTorch & strategies':  # this uses torch 1.12 as not all strategies support 1.13 yet
         image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
         scope: "strategies"
-      'PyTorch - latest':
+        PACKAGE_NAME: "pytorch"
+      'PyTorch | latest':
         image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
         scope: ""
-  # how long to run the job before automatically cancelling
-  timeoutInMinutes: "80"
-  # how much time to give 'run always even if cancelled tasks' before stopping them
-  cancelTimeoutInMinutes: "2"
+        PACKAGE_NAME: "pytorch"
+      'Lightning pkg':
+        image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
+        scope: ""
+        PACKAGE_NAME: "lightning"
   pool: lit-rtx-3090
   variables:
     DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
+    FREEZE_REQUIREMENTS: "1"
+    COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))' )
   container:
     image: $(image)
     # default shm size is 64m. Increase it to avoid:
@@ -62,7 +70,6 @@ jobs:
     options: "--gpus=all --shm-size=2gb"
   workspace:
     clean: all
-
   steps:
   - bash: |
@@ -75,6 +82,8 @@ jobs:
     displayName: 'set env. vars'

   - bash: |
+      echo $(DEVICES)
+      echo $(COVERAGE_SCOPE)
       echo $CUDA_VISIBLE_DEVICES
       echo $CUDA_VERSION_MM
       echo $PYTORCH_VERSION
@@ -95,9 +104,6 @@ jobs:
     displayName: 'Adjust dependencies'

   - bash: pip install -e .[extra,test,examples] --find-links ${TORCH_URL}
-    env:
-      PACKAGE_NAME: "pytorch"
-      FREEZE_REQUIREMENTS: "1"
     displayName: 'Install package & extras'

   - bash: pip uninstall -y -r requirements/pytorch/strategies.txt
@@ -132,6 +138,7 @@ jobs:

   - bash: python -m pytest pytorch_lightning
     workingDirectory: src
+    condition: eq(variables['PACKAGE_NAME'], 'pytorch')
     displayName: 'Testing: PyTorch doctests'

   - bash: |
@@ -139,12 +146,13 @@ jobs:
       python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
         --source_import="lightning.fabric,lightning.pytorch" \
         --target_import="lightning_fabric,pytorch_lightning"
+    condition: eq(variables['PACKAGE_NAME'], 'pytorch')
     displayName: 'Adjust tests'

-  - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+  - bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
+    workingDirectory: tests/tests_pytorch
     env:
       PL_RUN_CUDA_TESTS: "1"
-    workingDirectory: tests/tests_pytorch
     displayName: 'Testing: PyTorch standard'
     timeoutInMinutes: "35"
@@ -155,6 +163,7 @@ jobs:
       PL_RUN_CUDA_TESTS: "1"
       PL_STANDALONE_TESTS_SOURCE: "pytorch_lightning"
     displayName: 'Testing: PyTorch standalone tests'
+    condition: eq(variables['PACKAGE_NAME'], 'pytorch')
     timeoutInMinutes: "35"

   - bash: bash run_standalone_tasks.sh
@@ -169,18 +178,12 @@ jobs:
       python -m coverage report
       python -m coverage xml
       python -m coverage html
-      python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
+      python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
+        --flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
       ls -l
     workingDirectory: tests/tests_pytorch
     displayName: 'Statistics'

-  - task: PublishTestResults@2
-    displayName: 'Publish test results'
-    inputs:
-      testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
-      testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
-    condition: succeededOrFailed()
-
   - script: |
       set -e
       bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1
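
The per-package gating in both files relies on step conditions of the form eq(variables['PACKAGE_NAME'], '<pkg>'). A toy sketch of how those gates play out for the three matrix entries of the pipeline above; the step_runs helper stands in for Azure's expression engine and is not part of the change:

    # Illustrative only: mimics the added `condition: eq(variables['PACKAGE_NAME'], 'pytorch')`
    # gates, which skip doctest/standalone/"Adjust tests" steps for the 'Lightning pkg' entry.
    def step_runs(condition_pkg: str, package_name: str) -> bool:
        # stand-in for eq(variables['PACKAGE_NAME'], '<condition_pkg>')
        return package_name == condition_pkg

    for entry in ("pytorch", "pytorch", "lightning"):  # PACKAGE_NAME of the three matrix entries
        print(entry, "-> gated steps run:", step_runs("pytorch", entry))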