DDP Parity tests as standalone task (#17503)
commit 249395bfe0
parent 9eedf7ae4b
CI pipeline (Azure Pipelines, fabric GPU job):

@@ -138,6 +138,13 @@ jobs:
     displayName: 'Testing: fabric standalone tests'
     timeoutInMinutes: "10"
 
+  - bash: bash run_standalone_tasks.sh
+    workingDirectory: tests/tests_fabric
+    env:
+      PL_RUN_CUDA_TESTS: "1"
+    displayName: 'Testing: fabric standalone tasks'
+    timeoutInMinutes: "10"
+
   - bash: |
       python -m coverage report
       python -m coverage xml
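The new step runs the standalone tasks with PL_RUN_CUDA_TESTS="1" so GPU-only work is enabled on this agent. As a rough illustration of what such a gate boils down to (a sketch, not Lightning's actual helper, which lives in tests_fabric.helpers.runif):

import os

def cuda_tests_enabled() -> bool:
    # Sketch only: CI exports PL_RUN_CUDA_TESTS="1" on GPU agents;
    # anything else leaves CUDA-only code paths disabled.
    return os.getenv("PL_RUN_CUDA_TESTS", "0") == "1"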
tests/tests_fabric/parity/test_parity_ddp.py (path inferred from the `python -m parity.test_parity_ddp` calls below):

@@ -15,7 +15,6 @@ import os
 import time
 from copy import deepcopy
 
-import pytest
 import torch
 import torch.distributed
 import torch.nn.functional
@@ -24,7 +23,6 @@ from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 
 from lightning.fabric.fabric import Fabric
-from tests_fabric.helpers.runif import RunIf
 from tests_fabric.parity.models import ConvNet
 from tests_fabric.parity.utils import (
     cuda_reset,
@@ -125,16 +123,7 @@ def train_fabric_ddp(fabric):
     return model.state_dict(), torch.tensor(iteration_timings), memory_stats
 
 
-@RunIf(standalone=True)
-@pytest.mark.usefixtures("reset_deterministic_algorithm", "reset_cudnn_benchmark")
-@pytest.mark.parametrize(
-    "accelerator, devices, tolerance",
-    [
-        ("cpu", 2, 0.02),
-        pytest.param("cuda", 2, 0.01, marks=RunIf(min_cuda_gpus=2)),
-    ],
-)
-def test_parity_ddp(accelerator, devices, tolerance):
+def run_parity_test(accelerator: str = "cpu", devices: int = 2, tolerance: float = 0.02):
     cuda_reset()
 
     # Launch processes with Fabric and re-use them for the PyTorch training for convenience
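With the pytest decorators gone, the parametrized cases move out of the test module and into the shell script added below; run_parity_test is now a plain function with typed defaults, so it can also be invoked directly (a usage sketch, assuming the module layout implied by the script's invocations):

from parity.test_parity_ddp import run_parity_test

# Equivalent of the old ("cpu", 2, 0.02) pytest parametrization case:
run_parity_test(accelerator="cpu", devices=2, tolerance=0.02)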
@@ -169,3 +158,9 @@ def test_parity_ddp(accelerator, devices, tolerance):
     if accelerator == "cuda":
         assert all(fabric.all_gather(is_cuda_memory_close(memory_torch["start"], memory_fabric["start"])))
         assert all(fabric.all_gather(is_cuda_memory_close(memory_torch["end"], memory_fabric["end"])))
+
+
+if __name__ == "__main__":
+    from jsonargparse.cli import CLI
+
+    CLI(run_parity_test)
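jsonargparse's CLI turns a function signature into a command-line interface: each typed, defaulted parameter becomes a flag. A minimal self-contained sketch of the same pattern (the greet function here is hypothetical, not from this commit):

from jsonargparse import CLI

def greet(name: str = "world", repeat: int = 1):
    for _ in range(repeat):
        print(f"Hello, {name}!")

if __name__ == "__main__":
    # `python greet.py --name=Ada --repeat=2` calls greet("Ada", 2);
    # type hints and defaults drive the parsing and the generated --help.
    CLI(greet)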
tests/tests_fabric/run_standalone_tasks.sh (new file; path inferred from the workingDirectory above):

@@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_fabric DIRECTORY
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export PYTHONPATH="${PYTHONPATH}:$(pwd)/.."
+
+MAX_RETRIES=3
+
+retry_command() {
+    local command="$@"
+    local exit_code=1
+    for ((i=1; i<=$MAX_RETRIES; i++))
+    do
+        echo "Run attempt: $i"
+        eval $command
+        exit_code=$?
+        if [ $exit_code -eq 0 ]; then
+            echo "Successfully ran: $command"
+            break
+        fi
+        echo "Attempt $i failed."
+    done
+    if [ $exit_code -ne 0 ]; then
+        echo "Failed after $MAX_RETRIES attempts: $command"
+    fi
+    return $exit_code
+}
+
+retry_command "python -m parity.test_parity_ddp --accelerator="cpu" --devices=2 --tolerance=0.02"
+retry_command "python -m parity.test_parity_ddp --accelerator="cuda" --devices=2 --tolerance=0.01"
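Note that the nested double quotes on the last two lines collapse in bash ("…--accelerator="cpu"…" passes --accelerator=cpu through eval), so the commands run as intended despite the odd-looking quoting. For readers more fluent in Python than shell, the retry loop is roughly equivalent to this (an illustrative translation, not part of the commit):

import subprocess

MAX_RETRIES = 3

def retry_command(command: str) -> int:
    """Run `command` in a shell, retrying up to MAX_RETRIES times; return the last exit code."""
    exit_code = 1
    for attempt in range(1, MAX_RETRIES + 1):
        print(f"Run attempt: {attempt}")
        exit_code = subprocess.run(command, shell=True).returncode
        if exit_code == 0:
            print(f"Successfully ran: {command}")
            break
        print(f"Attempt {attempt} failed.")
    if exit_code != 0:
        print(f"Failed after {MAX_RETRIES} attempts: {command}")
    return exit_code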