CI: Azure - multiple configs (#12984)

* CI: Azure - multiple configs
* names
* benchmark
* Apply suggestions from code review

Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
Jirka Borovec authored 2022-05-14 10:59:03 +09:00, committed by GitHub
parent d28e365669
commit fab2ff35ad
5 changed files with 20 additions and 26 deletions

View File

@@ -28,18 +28,12 @@ jobs:
     cancelTimeoutInMinutes: "2"
     pool: azure-gpus-spot
     container:
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
     workspace:
       clean: all
     steps:
-    - bash: |
-        # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-        pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
-        pip list
-      displayName: 'Install PyTorch LTS'
     - bash: |
         python -m pytest tests/benchmarks -v --durations=0
       displayName: 'Testing: benchmarks'
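The benchmark job now relies on the PyTorch that ships inside the updated image, so only the pytest step remains; `--durations=0` tells pytest to report the runtime of every test, not just the slowest ones. A toy sketch of a test that fits this setup (the test name and time budget are hypothetical, not part of the PR):

```python
import time

def test_toy_benchmark():
    # Stand-in for real model work; with --durations=0, pytest prints
    # this test's wall-clock time in the run summary.
    start = time.monotonic()
    sum(i * i for i in range(1_000_000))
    assert time.monotonic() - start < 5.0  # loose, hypothetical budget
```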

View File

@@ -18,6 +18,12 @@ pr:
 jobs:
   - job: pytest
+    strategy:
+      matrix:
+        'PyTorch - LTS':
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+        'PyTorch - stable':
+          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
     # how long to run the job before automatically cancelling
     timeoutInMinutes: "65"
     # how much time to give 'run always even if cancelled tasks' before stopping them
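Each entry under `matrix` expands into its own copy of the `pytest` job, with the entry's variables made available to that job; this is how the single `$(image)` reference further down picks up a different container per leg. An illustrative Python sketch of the fan-out (values copied from the matrix above):

```python
matrix = {
    "PyTorch - LTS": "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8",
    "PyTorch - stable": "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11",
}

# One job per matrix entry; inside each, $(image) resolves to that entry's value.
for name, image in matrix.items():
    print(f"job: pytest [{name}] -> container image: {image}")
```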
@@ -25,11 +31,8 @@ jobs:
     pool: azure-gpus-spot
     # TODO: this needs Docker installed in the base image...
     container:
-      # base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
-      # run on torch 1.8 as it's the LTS version
-      image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
+      image: $(image)
       # default shm size is 64m. Increase it to avoid:
       # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
       options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
@@ -52,8 +55,6 @@ jobs:
     - bash: |
         python -c "fname = 'requirements/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
         CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
-        # TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
-        pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
         pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
         pip install . --requirement requirements/devel.txt
         pip install . --requirement requirements/strategies.txt
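The first two script lines are dense one-liners: the first rewrites `requirements/strategies.txt` without its horovod entry before the bulk install, and the second derives a compact CUDA tag such as `113` from the CUDA build torch was compiled against, which selects the matching `bagua-cuda` wheel. A more readable Python sketch of the same two steps (not part of the PR):

```python
import torch

# Step 1: drop the horovod line from the strategies requirements file.
fname = "requirements/strategies.txt"
with open(fname) as fh:
    lines = [line for line in fh if "horovod" not in line]
with open(fname, "w") as fh:
    fh.writelines(lines)

# Step 2: "11.3" -> "113"; used above as `pip install "bagua-cuda113>=0.9.0"`.
cuda_version_mm = "".join(torch.version.cuda.split(".")[:2])
```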

View File

@@ -16,9 +16,9 @@ jobs:
           pip install "check-jsonschema>=0.10"
       - name: GH Workflows
-        run: |
-          check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
+        run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
       - name: Azure Pipelines
-        run: |
-          check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json"
+        env:
+          SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
+        run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"
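`check-jsonschema` validates each YAML file against a JSON Schema; the change bumps the Azure Pipelines schema to v1.204.0 and moves the long URL into an `env` entry so the `run` line stays short. A rough Python equivalent of what the Azure Pipelines check does for one file (assumes `requests`, `pyyaml`, and `jsonschema` are installed; the file path is only an example):

```python
import jsonschema
import requests
import yaml

SCHEMA_FILE = (
    "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/"
    "v1.204.0/service-schema.json"
)

schema = requests.get(SCHEMA_FILE, timeout=30).json()
with open(".azure-pipelines/gpu-tests.yml") as fh:  # example path
    document = yaml.safe_load(fh)

# Raises jsonschema.ValidationError if the pipeline definition
# drifts from the service schema.
jsonschema.validate(instance=document, schema=schema)
```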

View File

@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-ARG CUDA_VERSION=11.3.1
-FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
+ARG UBUNTU_VERSION=20.04
+ARG CUDA_VERSION=11.3.1
+# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2
+FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
 ARG PYTHON_VERSION=3.9
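Declaring both `ARG`s before `FROM` lets one Dockerfile build against different CUDA/Ubuntu pairings (CUDA 10.2 base images only exist up to ubuntu18.04, hence the TODO). A hypothetical helper mirroring how the two args combine into the base image tag:

```python
def base_image(cuda_version: str = "11.3.1", ubuntu_version: str = "20.04") -> str:
    # Mirrors the FROM line: nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
    return f"nvidia/cuda:{cuda_version}-devel-ubuntu{ubuntu_version}"

assert base_image() == "nvidia/cuda:11.3.1-devel-ubuntu20.04"
assert base_image("10.2", "18.04") == "nvidia/cuda:10.2-devel-ubuntu18.04"
```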

View File

@@ -9,16 +9,16 @@ from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11
+from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf

-if _TORCH_GREATER_EQUAL_1_11:
+if _TORCH_GREATER_EQUAL_1_12:
     from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
     from torch.distributed.fsdp.wrap import wrap

-@RunIf(min_torch="1.11")
+@RunIf(min_torch="1.12dev")
 def test_invalid_on_cpu(tmpdir):
     """Test to ensure that a MisconfigurationException is raised for native FSDP on CPU."""
     with pytest.raises(
@@ -34,7 +34,7 @@ def test_invalid_on_cpu(tmpdir):
 @mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
 @mock.patch("torch.cuda.device_count", return_value=1)
 @mock.patch("torch.cuda.is_available", return_value=True)
-@RunIf(min_torch="1.11")
+@RunIf(min_torch="1.12dev")
 def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir):
     """Test to ensure that the native AMP plugin raises a MisconfigurationException."""
     with pytest.raises(
@@ -102,7 +102,7 @@ class TestFSDPModel(BoringModel):
         assert self.layer.module[2].reshard_after_forward is True

-@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
     """Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run."""
@@ -119,7 +119,7 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))

-@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_checkpoint(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""
@@ -130,7 +130,7 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir):
     _run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))

-@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
+@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
 def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir):
     """Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run."""