ci: parameterize GPU testing (#16697)
This commit is contained in:
parent
68850aada4
commit
4f35c7c356
|
@ -45,15 +45,21 @@ jobs:
|
|||
pool: lit-rtx-3090
|
||||
variables:
|
||||
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
|
||||
FREEZE_REQUIREMENTS: "1"
|
||||
COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))' )
|
||||
container:
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
|
||||
# default shm size is 64m. Increase it to avoid:
|
||||
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
|
||||
options: "--gpus=all --shm-size=2gb"
|
||||
# TODO: package parametrization
|
||||
strategy:
|
||||
matrix:
|
||||
'pkg: Fabric':
|
||||
PACKAGE_NAME: "fabric"
|
||||
'pkg: Lightning':
|
||||
PACKAGE_NAME: "lightning"
|
||||
workspace:
|
||||
clean: all
|
||||
|
||||
steps:
|
||||
- bash: |
|
||||
echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
|
||||
|
@ -62,6 +68,8 @@ jobs:
|
|||
displayName: 'set env. vars'
|
||||
|
||||
- bash: |
|
||||
echo $(DEVICES)
|
||||
echo $(COVERAGE_SCOPE)
|
||||
echo $CUDA_VISIBLE_DEVICES
|
||||
echo $TORCH_URL
|
||||
lspci | egrep 'VGA|3D'
|
||||
|
@ -80,11 +88,7 @@ jobs:
|
|||
done
|
||||
displayName: 'Adjust dependencies'
|
||||
|
||||
- bash: |
|
||||
pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
|
||||
env:
|
||||
PACKAGE_NAME: "fabric"
|
||||
FREEZE_REQUIREMENTS: "1"
|
||||
- bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
|
||||
displayName: 'Install package & dependencies'
|
||||
|
||||
- bash: |
|
||||
|
@ -94,17 +98,26 @@ jobs:
|
|||
python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
|
||||
displayName: 'Env details'
|
||||
|
||||
- bash: python -m pytest lightning_fabric
|
||||
workingDirectory: src
|
||||
condition: eq(variables['PACKAGE_NAME'], 'fabric')
|
||||
displayName: 'Testing: Fabric doctests'
|
||||
|
||||
- bash: |
|
||||
pip install -q -r .actions/requirements.txt
|
||||
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
|
||||
--source_import="lightning.fabric,lightning.pytorch" \
|
||||
--target_import="lightning_fabric,pytorch_lightning"
|
||||
displayName: 'Adjust tests'
|
||||
python .actions/assistant.py copy_replace_imports --source_dir="./examples" \
|
||||
--source_import="lightning.fabric,lightning.pytorch" \
|
||||
--target_import="lightning_fabric,pytorch_lightning"
|
||||
condition: eq(variables['PACKAGE_NAME'], 'fabric')
|
||||
displayName: 'Adjust tests & examples'
|
||||
|
||||
- bash: python -m coverage run --source lightning_fabric -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
|
||||
- bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
|
||||
workingDirectory: tests/tests_fabric
|
||||
env:
|
||||
PL_RUN_CUDA_TESTS: "1"
|
||||
workingDirectory: tests/tests_fabric
|
||||
displayName: 'Testing: fabric standard'
|
||||
timeoutInMinutes: "10"
|
||||
|
||||
|
@ -113,6 +126,7 @@ jobs:
|
|||
env:
|
||||
PL_RUN_CUDA_TESTS: "1"
|
||||
PL_STANDALONE_TESTS_SOURCE: "lightning_fabric"
|
||||
condition: eq(variables['PACKAGE_NAME'], 'fabric')
|
||||
displayName: 'Testing: fabric standalone tests'
|
||||
timeoutInMinutes: "10"
|
||||
|
||||
|
@ -120,21 +134,13 @@ jobs:
|
|||
python -m coverage report
|
||||
python -m coverage xml
|
||||
python -m coverage html
|
||||
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
|
||||
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
|
||||
--flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
|
||||
ls -l
|
||||
workingDirectory: tests/tests_fabric
|
||||
displayName: 'Statistics'
|
||||
|
||||
- task: PublishTestResults@2
|
||||
displayName: 'Publish test results'
|
||||
inputs:
|
||||
testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
|
||||
testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
|
||||
condition: succeededOrFailed()
|
||||
|
||||
- script: |
|
||||
# In order to run the examples, we need to substitute the meta package imports with the standalone package
|
||||
python ../.actions/assistant.py copy_replace_imports --source_dir="./fabric" --source_import="lightning.fabric" --target_import="lightning_fabric.fabric"
|
||||
set -e
|
||||
bash run_fabric_examples.sh --accelerator=cuda --devices=1
|
||||
bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
|
||||
|
|
|
@ -40,21 +40,29 @@ pr:
|
|||
|
||||
jobs:
|
||||
- job: testing
|
||||
# how long to run the job before automatically cancelling
|
||||
timeoutInMinutes: "80"
|
||||
# how much time to give 'run always even if cancelled tasks' before stopping them
|
||||
cancelTimeoutInMinutes: "2"
|
||||
strategy:
|
||||
matrix:
|
||||
'PyTorch & strategies': # this uses torch 1.12 as not all strategies support 1.13 yet
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
|
||||
scope: "strategies"
|
||||
'PyTorch - latest':
|
||||
PACKAGE_NAME: "pytorch"
|
||||
'PyTorch | latest':
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
|
||||
scope: ""
|
||||
# how long to run the job before automatically cancelling
|
||||
timeoutInMinutes: "80"
|
||||
# how much time to give 'run always even if cancelled tasks' before stopping them
|
||||
cancelTimeoutInMinutes: "2"
|
||||
PACKAGE_NAME: "pytorch"
|
||||
'Lightning pkg':
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
|
||||
scope: ""
|
||||
PACKAGE_NAME: "lightning"
|
||||
pool: lit-rtx-3090
|
||||
variables:
|
||||
DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
|
||||
FREEZE_REQUIREMENTS: "1"
|
||||
COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))' )
|
||||
container:
|
||||
image: $(image)
|
||||
# default shm size is 64m. Increase it to avoid:
|
||||
|
@ -62,7 +70,6 @@ jobs:
|
|||
options: "--gpus=all --shm-size=2gb"
|
||||
workspace:
|
||||
clean: all
|
||||
|
||||
steps:
|
||||
|
||||
- bash: |
|
||||
|
@ -75,6 +82,8 @@ jobs:
|
|||
displayName: 'set env. vars'
|
||||
|
||||
- bash: |
|
||||
echo $(DEVICES)
|
||||
echo $(COVERAGE_SCOPE)
|
||||
echo $CUDA_VISIBLE_DEVICES
|
||||
echo $CUDA_VERSION_MM
|
||||
echo $PYTORCH_VERSION
|
||||
|
@ -95,9 +104,6 @@ jobs:
|
|||
displayName: 'Adjust dependencies'
|
||||
|
||||
- bash: pip install -e .[extra,test,examples] --find-links ${TORCH_URL}
|
||||
env:
|
||||
PACKAGE_NAME: "pytorch"
|
||||
FREEZE_REQUIREMENTS: "1"
|
||||
displayName: 'Install package & extras'
|
||||
|
||||
- bash: pip uninstall -y -r requirements/pytorch/strategies.txt
|
||||
|
@ -132,6 +138,7 @@ jobs:
|
|||
|
||||
- bash: python -m pytest pytorch_lightning
|
||||
workingDirectory: src
|
||||
condition: eq(variables['PACKAGE_NAME'], 'pytorch')
|
||||
displayName: 'Testing: PyTorch doctests'
|
||||
|
||||
- bash: |
|
||||
|
@ -139,12 +146,13 @@ jobs:
|
|||
python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
|
||||
--source_import="lightning.fabric,lightning.pytorch" \
|
||||
--target_import="lightning_fabric,pytorch_lightning"
|
||||
condition: eq(variables['PACKAGE_NAME'], 'pytorch')
|
||||
displayName: 'Adjust tests'
|
||||
|
||||
- bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
|
||||
- bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
|
||||
workingDirectory: tests/tests_pytorch
|
||||
env:
|
||||
PL_RUN_CUDA_TESTS: "1"
|
||||
workingDirectory: tests/tests_pytorch
|
||||
displayName: 'Testing: PyTorch standard'
|
||||
timeoutInMinutes: "35"
|
||||
|
||||
|
@ -155,6 +163,7 @@ jobs:
|
|||
PL_RUN_CUDA_TESTS: "1"
|
||||
PL_STANDALONE_TESTS_SOURCE: "pytorch_lightning"
|
||||
displayName: 'Testing: PyTorch standalone tests'
|
||||
condition: eq(variables['PACKAGE_NAME'], 'pytorch')
|
||||
timeoutInMinutes: "35"
|
||||
|
||||
- bash: bash run_standalone_tasks.sh
|
||||
|
@ -169,18 +178,12 @@ jobs:
|
|||
python -m coverage report
|
||||
python -m coverage xml
|
||||
python -m coverage html
|
||||
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
|
||||
python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
|
||||
--flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
|
||||
ls -l
|
||||
workingDirectory: tests/tests_pytorch
|
||||
displayName: 'Statistics'
|
||||
|
||||
- task: PublishTestResults@2
|
||||
displayName: 'Publish test results'
|
||||
inputs:
|
||||
testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
|
||||
testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
|
||||
condition: succeededOrFailed()
|
||||
|
||||
- script: |
|
||||
set -e
|
||||
bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1
|
||||
|
|
Loading…
Reference in New Issue