CI: Azure - multiple configs (#12984)
* CI: Azure - multiple configs
* names
* benchmark
* Apply suggestions from code review

Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
Co-authored-by: Carlos Mocholí <carlossmocholi@gmail.com>
This commit is contained in:
parent
d28e365669
commit
fab2ff35ad
|
@ -28,18 +28,12 @@ jobs:
|
|||
cancelTimeoutInMinutes: "2"
|
||||
pool: azure-gpus-spot
|
||||
container:
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
|
||||
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
|
||||
workspace:
|
||||
clean: all
|
||||
|
||||
steps:
|
||||
- bash: |
|
||||
# TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
|
||||
pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
|
||||
pip list
|
||||
displayName: 'Install PyTorch LTS'
|
||||
|
||||
- bash: |
|
||||
python -m pytest tests/benchmarks -v --durations=0
|
||||
displayName: 'Testing: benchmarks'
|
||||
|
|
|
@ -18,6 +18,12 @@ pr:
|
|||
|
||||
jobs:
|
||||
- job: pytest
|
||||
strategy:
|
||||
matrix:
|
||||
'PyTorch - LTS':
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
|
||||
'PyTorch - stable':
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11"
|
||||
# how long to run the job before automatically cancelling
|
||||
timeoutInMinutes: "65"
|
||||
# how much time to give 'run always even if cancelled tasks' before stopping them
|
||||
|
@ -25,11 +31,8 @@ jobs:
|
|||
|
||||
pool: azure-gpus-spot
|
||||
|
||||
# TODO: this needs Docker to be installed in the base image...
|
||||
container:
|
||||
# base ML image: mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.2-cudnn8-ubuntu18.04
|
||||
# run on torch 1.8 as it's the LTS version
|
||||
image: "pytorchlightning/pytorch_lightning:base-cuda-py3.7-torch1.8"
|
||||
image: $(image)
|
||||
# default shm size is 64m. Increase it to avoid:
|
||||
# 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
|
||||
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=512m"
|
||||
|
@ -52,8 +55,6 @@ jobs:
|
|||
- bash: |
|
||||
python -c "fname = 'requirements/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)"
|
||||
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
|
||||
# TODO: Prepare a docker image with 1.8.2 (LTS) installed and remove manual installation.
|
||||
pip install torch==1.8.2+cu102 torchvision==0.9.2+cu102 torchtext==0.9.2 -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html
|
||||
pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
|
||||
pip install . --requirement requirements/devel.txt
|
||||
pip install . --requirement requirements/strategies.txt
|
||||
|
|
|
@ -16,9 +16,9 @@ jobs:
|
|||
pip install "check-jsonschema>=0.10"
|
||||
|
||||
- name: GH Workflows
|
||||
run: |
|
||||
check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
|
||||
run: check-jsonschema .github/workflows/*.yml --builtin-schema "github-workflows"
|
||||
|
||||
- name: Azure Pipelines
|
||||
run: |
|
||||
check-jsonschema .azure-pipelines/*.yml --schemafile "https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.188.1/service-schema.json"
|
||||
env:
|
||||
SCHEMA_FILE: https://raw.githubusercontent.com/microsoft/azure-pipelines-vscode/v1.204.0/service-schema.json
|
||||
run: check-jsonschema .azure-pipelines/*.yml --schemafile "$SCHEMA_FILE"
|
||||
|
|
|
@ -12,10 +12,9 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
ARG CUDA_VERSION=11.3.1
|
||||
ARG UBUNTU_VERSION=20.04
|
||||
ARG CUDA_VERSION=11.3.1
|
||||
|
||||
# TODO: Remove OS arg to always use ubuntu20.04 when dropping CUDA 10.2
|
||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
|
||||
|
||||
ARG PYTHON_VERSION=3.9
|
||||
|
|
|
@ -9,16 +9,16 @@ from pytorch_lightning import Trainer
|
|||
from pytorch_lightning.callbacks import ModelCheckpoint
|
||||
from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy
|
||||
from pytorch_lightning.utilities.exceptions import MisconfigurationException
|
||||
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11
|
||||
from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12
|
||||
from tests.helpers.boring_model import BoringModel
|
||||
from tests.helpers.runif import RunIf
|
||||
|
||||
if _TORCH_GREATER_EQUAL_1_11:
|
||||
if _TORCH_GREATER_EQUAL_1_12:
|
||||
from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
|
||||
from torch.distributed.fsdp.wrap import wrap
|
||||
|
||||
|
||||
@RunIf(min_torch="1.11")
|
||||
@RunIf(min_torch="1.12dev")
|
||||
def test_invalid_on_cpu(tmpdir):
|
||||
"""Test to ensure that to raise Misconfiguration for Native FSDP on CPU."""
|
||||
with pytest.raises(
|
||||
|
@ -34,7 +34,7 @@ def test_invalid_on_cpu(tmpdir):
|
|||
@mock.patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": "0"})
|
||||
@mock.patch("torch.cuda.device_count", return_value=1)
|
||||
@mock.patch("torch.cuda.is_available", return_value=True)
|
||||
@RunIf(min_torch="1.11")
|
||||
@RunIf(min_torch="1.12dev")
|
||||
def test_fsdp_with_sharded_amp(device_count_mock, mock_cuda_available, tmpdir):
|
||||
"""Test to ensure that plugin native amp plugin raises Misconfiguration error."""
|
||||
with pytest.raises(
|
||||
|
@ -102,7 +102,7 @@ class TestFSDPModel(BoringModel):
|
|||
assert self.layer.module[2].reshard_after_forward is True
|
||||
|
||||
|
||||
@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
|
||||
@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
|
||||
def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
|
||||
"""Test to ensure that sync_batchnorm works when using fsdp_native and GPU, and all stages can be run."""
|
||||
|
||||
|
@ -119,7 +119,7 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir):
|
|||
_run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
|
||||
|
||||
|
||||
@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.11")
|
||||
@RunIf(min_gpus=1, skip_windows=True, standalone=True, min_torch="1.12dev")
|
||||
def test_fully_sharded_native_strategy_checkpoint(tmpdir):
|
||||
"""Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run."""
|
||||
|
||||
|
@ -130,7 +130,7 @@ def test_fully_sharded_native_strategy_checkpoint(tmpdir):
|
|||
_run_multiple_stages(trainer, model, os.path.join(tmpdir, "last.ckpt"))
|
||||
|
||||
|
||||
@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.11")
|
||||
@RunIf(min_gpus=2, skip_windows=True, standalone=True, min_torch="1.12dev")
|
||||
def test_fully_sharded_native_strategy_checkpoint_multi_gpus(tmpdir):
|
||||
"""Test to ensure that checkpoint is saved correctly when using multiple GPUs, and all stages can be run."""
|
||||
|
||||
|
|
Loading…
Reference in New Issue