ci: parameterize GPU testing (#16697)

This commit is contained in:
Jirka Borovec 2023-02-09 22:39:03 +09:00 committed by GitHub
parent 68850aada4
commit 4f35c7c356
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 48 additions and 39 deletions

View File

@@ -45,15 +45,21 @@ jobs:
pool: lit-rtx-3090
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
FREEZE_REQUIREMENTS: "1"
COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))' )
container:
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
# default shm size is 64m. Increase it to avoid:
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
options: "--gpus=all --shm-size=2gb"
# TODO: package parametrization
strategy:
matrix:
'pkg: Fabric':
PACKAGE_NAME: "fabric"
'pkg: Lightning':
PACKAGE_NAME: "lightning"
workspace:
clean: all
steps:
- bash: |
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
@@ -62,6 +68,8 @@ jobs:
displayName: 'set env. vars'
- bash: |
echo $(DEVICES)
echo $(COVERAGE_SCOPE)
echo $CUDA_VISIBLE_DEVICES
echo $TORCH_URL
lspci | egrep 'VGA|3D'
@@ -80,11 +88,7 @@ jobs:
done
displayName: 'Adjust dependencies'
- bash: |
pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
env:
PACKAGE_NAME: "fabric"
FREEZE_REQUIREMENTS: "1"
- bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
displayName: 'Install package & dependencies'
- bash: |
@@ -94,17 +98,26 @@ jobs:
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
displayName: 'Env details'
- bash: python -m pytest lightning_fabric
workingDirectory: src
condition: eq(variables['PACKAGE_NAME'], 'fabric')
displayName: 'Testing: Fabric doctests'
- bash: |
pip install -q -r .actions/requirements.txt
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
--source_import="lightning.fabric,lightning.pytorch" \
--target_import="lightning_fabric,pytorch_lightning"
displayName: 'Adjust tests'
python .actions/assistant.py copy_replace_imports --source_dir="./examples" \
--source_import="lightning.fabric,lightning.pytorch" \
--target_import="lightning_fabric,pytorch_lightning"
condition: eq(variables['PACKAGE_NAME'], 'fabric')
displayName: 'Adjust tests & examples'
- bash: python -m coverage run --source lightning_fabric -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
- bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
workingDirectory: tests/tests_fabric
env:
PL_RUN_CUDA_TESTS: "1"
workingDirectory: tests/tests_fabric
displayName: 'Testing: fabric standard'
timeoutInMinutes: "10"
@@ -113,6 +126,7 @@ jobs:
env:
PL_RUN_CUDA_TESTS: "1"
PL_STANDALONE_TESTS_SOURCE: "lightning_fabric"
condition: eq(variables['PACKAGE_NAME'], 'fabric')
displayName: 'Testing: fabric standalone tests'
timeoutInMinutes: "10"
@@ -120,21 +134,13 @@ jobs:
python -m coverage report
python -m coverage xml
python -m coverage html
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
ls -l
workingDirectory: tests/tests_fabric
displayName: 'Statistics'
- task: PublishTestResults@2
displayName: 'Publish test results'
inputs:
testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
condition: succeededOrFailed()
- script: |
# In order to run the examples, we need to substitute the meta package imports with the standalone package
python ../.actions/assistant.py copy_replace_imports --source_dir="./fabric" --source_import="lightning.fabric" --target_import="lightning_fabric.fabric"
set -e
bash run_fabric_examples.sh --accelerator=cuda --devices=1
bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp

View File

@@ -40,21 +40,29 @@ pr:
jobs:
- job: testing
# how long to run the job before automatically cancelling
timeoutInMinutes: "80"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
strategy:
matrix:
'PyTorch & strategies': # this uses torch 1.12 as not all strategies support 1.13 yet
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
scope: "strategies"
'PyTorch - latest':
PACKAGE_NAME: "pytorch"
'PyTorch | latest':
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
scope: ""
# how long to run the job before automatically cancelling
timeoutInMinutes: "80"
# how much time to give 'run always even if cancelled tasks' before stopping them
cancelTimeoutInMinutes: "2"
PACKAGE_NAME: "pytorch"
'Lightning pkg':
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
scope: ""
PACKAGE_NAME: "lightning"
pool: lit-rtx-3090
variables:
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
FREEZE_REQUIREMENTS: "1"
COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))' )
container:
image: $(image)
# default shm size is 64m. Increase it to avoid:
@@ -62,7 +70,6 @@ jobs:
options: "--gpus=all --shm-size=2gb"
workspace:
clean: all
steps:
- bash: |
@@ -75,6 +82,8 @@ jobs:
displayName: 'set env. vars'
- bash: |
echo $(DEVICES)
echo $(COVERAGE_SCOPE)
echo $CUDA_VISIBLE_DEVICES
echo $CUDA_VERSION_MM
echo $PYTORCH_VERSION
@@ -95,9 +104,6 @@ jobs:
displayName: 'Adjust dependencies'
- bash: pip install -e .[extra,test,examples] --find-links ${TORCH_URL}
env:
PACKAGE_NAME: "pytorch"
FREEZE_REQUIREMENTS: "1"
displayName: 'Install package & extras'
- bash: pip uninstall -y -r requirements/pytorch/strategies.txt
@@ -132,6 +138,7 @@ jobs:
- bash: python -m pytest pytorch_lightning
workingDirectory: src
condition: eq(variables['PACKAGE_NAME'], 'pytorch')
displayName: 'Testing: PyTorch doctests'
- bash: |
@@ -139,12 +146,13 @@ jobs:
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
--source_import="lightning.fabric,lightning.pytorch" \
--target_import="lightning_fabric,pytorch_lightning"
condition: eq(variables['PACKAGE_NAME'], 'pytorch')
displayName: 'Adjust tests'
- bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
- bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
workingDirectory: tests/tests_pytorch
env:
PL_RUN_CUDA_TESTS: "1"
workingDirectory: tests/tests_pytorch
displayName: 'Testing: PyTorch standard'
timeoutInMinutes: "35"
@@ -155,6 +163,7 @@ jobs:
PL_RUN_CUDA_TESTS: "1"
PL_STANDALONE_TESTS_SOURCE: "pytorch_lightning"
displayName: 'Testing: PyTorch standalone tests'
condition: eq(variables['PACKAGE_NAME'], 'pytorch')
timeoutInMinutes: "35"
- bash: bash run_standalone_tasks.sh
@@ -169,18 +178,12 @@ jobs:
python -m coverage report
python -m coverage xml
python -m coverage html
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
--flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
ls -l
workingDirectory: tests/tests_pytorch
displayName: 'Statistics'
- task: PublishTestResults@2
displayName: 'Publish test results'
inputs:
testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
condition: succeededOrFailed()
- script: |
set -e
bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1