ci: parameterize GPU testing (#16697)

parent 68850aada4
commit 4f35c7c356
@@ -45,15 +45,21 @@ jobs:
     pool: lit-rtx-3090
     variables:
       DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
+      FREEZE_REQUIREMENTS: "1"
+      COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))' )
     container:
       image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
       options: "--gpus=all --shm-size=2gb"
-    # TODO: package parametrization
+    strategy:
+      matrix:
+        'pkg: Fabric':
+          PACKAGE_NAME: "fabric"
+        'pkg: Lightning':
+          PACKAGE_NAME: "lightning"
     workspace:
       clean: all
-
     steps:
     - bash: |
         echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
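Note: the two variables added above are queue-time Python one-liners. A minimal sketch in plain Python of what they evaluate to, assuming a hypothetical agent name such as "lit-rtx-3090_0,1" (not taken from the pipeline):

# Sketch only -- the agent and package names below are made-up examples.
agent_name = "lit-rtx-3090_0,1"      # stands in for $(Agent.Name)
package_name = "fabric"              # stands in for $(PACKAGE_NAME) set by the matrix

# DEVICES: the part of the agent name after the last underscore,
# later exported as CUDA_VISIBLE_DEVICES in the 'set env. vars' step.
devices = agent_name.split("_")[-1]  # -> "0,1"

# COVERAGE_SCOPE: map the short package name to its coverage/import target;
# "fabric" becomes "lightning_fabric", any other value passes through unchanged.
coverage_scope = dict(fabric="lightning_fabric").get(package_name, package_name)

print(devices, coverage_scope)       # 0,1 lightning_fabric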
@@ -62,6 +68,8 @@ jobs:
       displayName: 'set env. vars'

     - bash: |
+        echo $(DEVICES)
+        echo $(COVERAGE_SCOPE)
         echo $CUDA_VISIBLE_DEVICES
         echo $TORCH_URL
         lspci | egrep 'VGA|3D'
@@ -80,11 +88,7 @@ jobs:
         done
       displayName: 'Adjust dependencies'

-    - bash: |
-        pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
-      env:
-        PACKAGE_NAME: "fabric"
-        FREEZE_REQUIREMENTS: "1"
+    - bash: pip install -e .[dev,strategies,examples] --find-links ${TORCH_URL}
       displayName: 'Install package & dependencies'

     - bash: |
@@ -94,17 +98,26 @@ jobs:
         python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
       displayName: 'Env details'

+    - bash: python -m pytest lightning_fabric
+      workingDirectory: src
+      condition: eq(variables['PACKAGE_NAME'], 'fabric')
+      displayName: 'Testing: Fabric doctests'
+
     - bash: |
         pip install -q -r .actions/requirements.txt
         python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
           --source_import="lightning.fabric,lightning.pytorch" \
           --target_import="lightning_fabric,pytorch_lightning"
-      displayName: 'Adjust tests'
+        python .actions/assistant.py copy_replace_imports --source_dir="./examples" \
+          --source_import="lightning.fabric,lightning.pytorch" \
+          --target_import="lightning_fabric,pytorch_lightning"
+      condition: eq(variables['PACKAGE_NAME'], 'fabric')
+      displayName: 'Adjust tests & examples'

-    - bash: python -m coverage run --source lightning_fabric -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+    - bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
+      workingDirectory: tests/tests_fabric
       env:
         PL_RUN_CUDA_TESTS: "1"
-      workingDirectory: tests/tests_fabric
       displayName: 'Testing: fabric standard'
       timeoutInMinutes: "10"
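Note: the 'Adjust tests & examples' step above only runs for the standalone "fabric" matrix entry and rewrites meta-package imports to the standalone package names in both ./tests and ./examples. A rough sketch of the textual substitution this stands for (not the actual .actions/assistant.py implementation; the helper below is illustrative only):

# Illustrative sketch of the import rewrite done before running standalone-package tests.
from pathlib import Path

REPLACEMENTS = {
    "lightning.fabric": "lightning_fabric",
    "lightning.pytorch": "pytorch_lightning",
}

def rewrite_imports(source_dir: str) -> None:
    """Replace meta-package import strings with standalone package names in all .py files."""
    for path in Path(source_dir).rglob("*.py"):
        text = path.read_text()
        for src, tgt in REPLACEMENTS.items():
            text = text.replace(src, tgt)
        path.write_text(text)

rewrite_imports("./tests")     # mirrors --source_dir="./tests"
rewrite_imports("./examples")  # mirrors --source_dir="./examples"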
@@ -113,6 +126,7 @@ jobs:
       env:
         PL_RUN_CUDA_TESTS: "1"
         PL_STANDALONE_TESTS_SOURCE: "lightning_fabric"
+      condition: eq(variables['PACKAGE_NAME'], 'fabric')
       displayName: 'Testing: fabric standalone tests'
       timeoutInMinutes: "10"
@@ -120,21 +134,13 @@ jobs:
         python -m coverage report
         python -m coverage xml
         python -m coverage html
-        python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
+        python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
+          --flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
         ls -l
       workingDirectory: tests/tests_fabric
       displayName: 'Statistics'

-    - task: PublishTestResults@2
-      displayName: 'Publish test results'
-      inputs:
-        testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
-        testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
-      condition: succeededOrFailed()
-
     - script: |
-        # In order to run the examples, we need to substitute the meta package imports with the standalone package
-        python ../.actions/assistant.py copy_replace_imports --source_dir="./fabric" --source_import="lightning.fabric" --target_import="lightning_fabric.fabric"
         set -e
         bash run_fabric_examples.sh --accelerator=cuda --devices=1
         bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp

@@ -40,21 +40,29 @@ pr:

 jobs:
   - job: testing
+    # how long to run the job before automatically cancelling
+    timeoutInMinutes: "80"
+    # how much time to give 'run always even if cancelled tasks' before stopping them
+    cancelTimeoutInMinutes: "2"
     strategy:
       matrix:
         'PyTorch & strategies': # this uses torch 1.12 as not all strategies support 1.13 yet
           image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1"
           scope: "strategies"
-        'PyTorch - latest':
+          PACKAGE_NAME: "pytorch"
+        'PyTorch | latest':
           image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
           scope: ""
-    # how long to run the job before automatically cancelling
-    timeoutInMinutes: "80"
-    # how much time to give 'run always even if cancelled tasks' before stopping them
-    cancelTimeoutInMinutes: "2"
+          PACKAGE_NAME: "pytorch"
+        'Lightning pkg':
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.13-cuda11.7.1"
+          scope: ""
+          PACKAGE_NAME: "lightning"
     pool: lit-rtx-3090
     variables:
       DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
+      FREEZE_REQUIREMENTS: "1"
+      COVERAGE_SCOPE: $( python -c 'n = "$(PACKAGE_NAME)" ; print(dict(pytorch="pytorch_lightning").get(n, n))' )
     container:
       image: $(image)
       # default shm size is 64m. Increase it to avoid:
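Note: the reworked matrix above fans this one job out into three configurations, each pinning a container image, a dependency scope, and a PACKAGE_NAME; COVERAGE_SCOPE then maps "pytorch" to "pytorch_lightning" while "lightning" passes through. A small Python sketch of that expansion, using only values visible in the diff:

# Sketch of the per-job variables derived from the matrix entries above.
matrix = {
    "PyTorch & strategies": {"scope": "strategies", "PACKAGE_NAME": "pytorch"},
    "PyTorch | latest": {"scope": "", "PACKAGE_NAME": "pytorch"},
    "Lightning pkg": {"scope": "", "PACKAGE_NAME": "lightning"},
}

for name, cfg in matrix.items():
    pkg = cfg["PACKAGE_NAME"]
    # Same mapping as the COVERAGE_SCOPE one-liner: "pytorch" -> "pytorch_lightning".
    coverage_scope = dict(pytorch="pytorch_lightning").get(pkg, pkg)
    print(f"{name}: PACKAGE_NAME={pkg} COVERAGE_SCOPE={coverage_scope}")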
@@ -62,7 +70,6 @@ jobs:
       options: "--gpus=all --shm-size=2gb"
     workspace:
       clean: all
-
     steps:

     - bash: |
@@ -75,6 +82,8 @@ jobs:
       displayName: 'set env. vars'

     - bash: |
+        echo $(DEVICES)
+        echo $(COVERAGE_SCOPE)
         echo $CUDA_VISIBLE_DEVICES
         echo $CUDA_VERSION_MM
         echo $PYTORCH_VERSION
@@ -95,9 +104,6 @@ jobs:
       displayName: 'Adjust dependencies'

     - bash: pip install -e .[extra,test,examples] --find-links ${TORCH_URL}
-      env:
-        PACKAGE_NAME: "pytorch"
-        FREEZE_REQUIREMENTS: "1"
       displayName: 'Install package & extras'

     - bash: pip uninstall -y -r requirements/pytorch/strategies.txt
@@ -132,6 +138,7 @@ jobs:

     - bash: python -m pytest pytorch_lightning
       workingDirectory: src
+      condition: eq(variables['PACKAGE_NAME'], 'pytorch')
       displayName: 'Testing: PyTorch doctests'

     - bash: |
@@ -139,12 +146,13 @@ jobs:
         python .actions/assistant.py copy_replace_imports --source_dir="./tests" \
           --source_import="lightning.fabric,lightning.pytorch" \
           --target_import="lightning_fabric,pytorch_lightning"
+      condition: eq(variables['PACKAGE_NAME'], 'pytorch')
       displayName: 'Adjust tests'

-    - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50
+    - bash: python -m coverage run --source $(COVERAGE_SCOPE) -m pytest --ignore benchmarks -v --durations=50
+      workingDirectory: tests/tests_pytorch
       env:
         PL_RUN_CUDA_TESTS: "1"
-      workingDirectory: tests/tests_pytorch
       displayName: 'Testing: PyTorch standard'
       timeoutInMinutes: "35"
@@ -155,6 +163,7 @@ jobs:
         PL_RUN_CUDA_TESTS: "1"
         PL_STANDALONE_TESTS_SOURCE: "pytorch_lightning"
       displayName: 'Testing: PyTorch standalone tests'
+      condition: eq(variables['PACKAGE_NAME'], 'pytorch')
       timeoutInMinutes: "35"

     - bash: bash run_standalone_tasks.sh
@@ -169,18 +178,12 @@ jobs:
         python -m coverage report
         python -m coverage xml
         python -m coverage html
-        python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) --flags=gpu,pytest --name="GPU-coverage" --env=linux,azure
+        python -m codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
+          --flags=gpu,pytest,$(COVERAGE_SCOPE) --name="GPU-coverage" --env=linux,azure
         ls -l
       workingDirectory: tests/tests_pytorch
       displayName: 'Statistics'

-    - task: PublishTestResults@2
-      displayName: 'Publish test results'
-      inputs:
-        testResultsFiles: '$(Build.StagingDirectory)/test-results.xml'
-        testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)'
-      condition: succeededOrFailed()
-
     - script: |
         set -e
         bash run_pl_examples.sh --trainer.accelerator=gpu --trainer.devices=1