DDP Parity tests as standalone task (#17503)

This commit is contained in:
Adrian Wälchli 2023-05-03 05:36:07 +02:00 committed by GitHub
parent 9eedf7ae4b
commit 249395bfe0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 57 additions and 12 deletions

View File

@ -138,6 +138,13 @@ jobs:
displayName: 'Testing: fabric standalone tests'
timeoutInMinutes: "10"
- bash: bash run_standalone_tasks.sh
workingDirectory: tests/tests_fabric
env:
PL_RUN_CUDA_TESTS: "1"
displayName: 'Testing: fabric standalone tasks'
timeoutInMinutes: "10"
- bash: |
python -m coverage report
python -m coverage xml

View File

@ -15,7 +15,6 @@ import os
import time
from copy import deepcopy
import pytest
import torch
import torch.distributed
import torch.nn.functional
@ -24,7 +23,6 @@ from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from lightning.fabric.fabric import Fabric
from tests_fabric.helpers.runif import RunIf
from tests_fabric.parity.models import ConvNet
from tests_fabric.parity.utils import (
cuda_reset,
@ -125,16 +123,7 @@ def train_fabric_ddp(fabric):
return model.state_dict(), torch.tensor(iteration_timings), memory_stats
@RunIf(standalone=True)
@pytest.mark.usefixtures("reset_deterministic_algorithm", "reset_cudnn_benchmark")
@pytest.mark.parametrize(
"accelerator, devices, tolerance",
[
("cpu", 2, 0.02),
pytest.param("cuda", 2, 0.01, marks=RunIf(min_cuda_gpus=2)),
],
)
def test_parity_ddp(accelerator, devices, tolerance):
def run_parity_test(accelerator: str = "cpu", devices: int = 2, tolerance: float = 0.02):
cuda_reset()
# Launch processes with Fabric and re-use them for the PyTorch training for convenience
@ -169,3 +158,9 @@ def test_parity_ddp(accelerator, devices, tolerance):
if accelerator == "cuda":
assert all(fabric.all_gather(is_cuda_memory_close(memory_torch["start"], memory_fabric["start"])))
assert all(fabric.all_gather(is_cuda_memory_close(memory_torch["end"], memory_fabric["end"])))
if __name__ == "__main__":
    # Allow running this parity test as a standalone script (see
    # run_standalone_tasks.sh); jsonargparse maps CLI flags such as
    # --accelerator/--devices/--tolerance onto run_parity_test's parameters.
    from jsonargparse.cli import CLI

    CLI(run_parity_test)

View File

@ -0,0 +1,43 @@
#!/bin/bash
# Copyright The Lightning AI team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_fabric DIRECTORY
# Make the current directory (the tests_fabric package) and its parent
# importable so that `python -m parity.test_parity_ddp` below resolves.
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
export PYTHONPATH="${PYTHONPATH}:$(pwd)/.."
# Maximum number of attempts retry_command makes before giving up.
MAX_RETRIES=3
# Run a command string, retrying up to MAX_RETRIES times until it exits 0.
# Usage: retry_command "<command>"
# Returns: the exit code of the last attempt (0 on success).
retry_command() {
    # Join all arguments into one scalar with "$*" (not "$@": SC2124 —
    # "$@" in a scalar assignment has unspecified join behavior).
    local command="$*"
    local exit_code=1
    local attempt
    for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
        echo "Run attempt: $attempt"
        # Quote the eval argument so the command string is re-parsed by the
        # shell exactly once, not word-split and glob-expanded first.
        eval "$command"
        exit_code=$?
        if [ "$exit_code" -eq 0 ]; then
            echo "Successfully ran: $command"
            break
        fi
        echo "Attempt $attempt failed."
    done
    if [ "$exit_code" -ne 0 ]; then
        echo "Failed after $MAX_RETRIES attempts: $command"
    fi
    return "$exit_code"
}
# Run the DDP parity benchmarks, retrying flaky runs. The CPU variant allows
# a looser timing tolerance (0.02) than the 2-GPU CUDA variant (0.01).
# NOTE: the original lines nested double quotes inside a double-quoted string
# (--accelerator="cpu"), which silently closed and reopened the outer string;
# it only produced the intended argument by accidental concatenation.
retry_command "python -m parity.test_parity_ddp --accelerator=cpu --devices=2 --tolerance=0.02"
retry_command "python -m parity.test_parity_ddp --accelerator=cuda --devices=2 --tolerance=0.01"