DDP Parity tests as standalone task (#17503)
commit 249395bfe0
parent 9eedf7ae4b
CI pipeline (Azure Pipelines, fabric GPU job):

@@ -138,6 +138,13 @@ jobs:
     displayName: 'Testing: fabric standalone tests'
     timeoutInMinutes: "10"
 
+  - bash: bash run_standalone_tasks.sh
+    workingDirectory: tests/tests_fabric
+    env:
+      PL_RUN_CUDA_TESTS: "1"
+    displayName: 'Testing: fabric standalone tasks'
+    timeoutInMinutes: "10"
+
   - bash: |
       python -m coverage report
       python -m coverage xml
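The new step runs the standalone tasks with PL_RUN_CUDA_TESTS="1" so GPU-only work is enabled on this agent. As a rough illustration of what such a gate boils down to (a sketch, not Lightning's actual helper, which lives in tests_fabric.helpers.runif):

import os

def cuda_tests_enabled() -> bool:
    # Sketch only: CI exports PL_RUN_CUDA_TESTS="1" on GPU agents;
    # anything else leaves CUDA-only code paths disabled.
    return os.getenv("PL_RUN_CUDA_TESTS", "0") == "1"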
tests/tests_fabric/parity/test_parity_ddp.py (path inferred from the `python -m parity.test_parity_ddp` calls below):

@@ -15,7 +15,6 @@ import os
 import time
 from copy import deepcopy
 
-import pytest
 import torch
 import torch.distributed
 import torch.nn.functional
@@ -24,7 +23,6 @@ from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 
 from lightning.fabric.fabric import Fabric
-from tests_fabric.helpers.runif import RunIf
 from tests_fabric.parity.models import ConvNet
 from tests_fabric.parity.utils import (
     cuda_reset,
@@ -125,16 +123,7 @@ def train_fabric_ddp(fabric):
     return model.state_dict(), torch.tensor(iteration_timings), memory_stats
 
 
-@RunIf(standalone=True)
-@pytest.mark.usefixtures("reset_deterministic_algorithm", "reset_cudnn_benchmark")
-@pytest.mark.parametrize(
-    "accelerator, devices, tolerance",
-    [
-        ("cpu", 2, 0.02),
-        pytest.param("cuda", 2, 0.01, marks=RunIf(min_cuda_gpus=2)),
-    ],
-)
-def test_parity_ddp(accelerator, devices, tolerance):
+def run_parity_test(accelerator: str = "cpu", devices: int = 2, tolerance: float = 0.02):
     cuda_reset()
 
     # Launch processes with Fabric and re-use them for the PyTorch training for convenience
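With the pytest decorators gone, the parametrized cases move out of the test module and into the shell script added below; run_parity_test is now a plain function with typed defaults, so it can also be invoked directly (a usage sketch, assuming the module layout implied by the script's invocations):

from parity.test_parity_ddp import run_parity_test

# Equivalent of the old ("cpu", 2, 0.02) pytest parametrization case:
run_parity_test(accelerator="cpu", devices=2, tolerance=0.02)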
@@ -169,3 +158,9 @@ def test_parity_ddp(accelerator, devices, tolerance):
     if accelerator == "cuda":
         assert all(fabric.all_gather(is_cuda_memory_close(memory_torch["start"], memory_fabric["start"])))
         assert all(fabric.all_gather(is_cuda_memory_close(memory_torch["end"], memory_fabric["end"])))
+
+
+if __name__ == "__main__":
+    from jsonargparse.cli import CLI
+
+    CLI(run_parity_test)
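jsonargparse's CLI turns a function signature into a command-line interface: each typed, defaulted parameter becomes a flag. A minimal self-contained sketch of the same pattern (the greet function here is hypothetical, not from this commit):

from jsonargparse import CLI

def greet(name: str = "world", repeat: int = 1):
    for _ in range(repeat):
        print(f"Hello, {name}!")

if __name__ == "__main__":
    # `python greet.py --name=Ada --repeat=2` calls greet("Ada", 2);
    # type hints and defaults drive the parsing and the generated --help.
    CLI(greet)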
tests/tests_fabric/run_standalone_tasks.sh (new file; path inferred from the workingDirectory above):

@@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright The Lightning AI team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_fabric DIRECTORY
+export PYTHONPATH="${PYTHONPATH}:$(pwd)"
+export PYTHONPATH="${PYTHONPATH}:$(pwd)/.."
+
+MAX_RETRIES=3
+
+retry_command() {
+    local command="$@"
+    local exit_code=1
+    for ((i=1; i<=$MAX_RETRIES; i++))
+    do
+        echo "Run attempt: $i"
+        eval $command
+        exit_code=$?
+        if [ $exit_code -eq 0 ]; then
+            echo "Successfully ran: $command"
+            break
+        fi
+        echo "Attempt $i failed."
+    done
+    if [ $exit_code -ne 0 ]; then
+        echo "Failed after $MAX_RETRIES attempts: $command"
+    fi
+    return $exit_code
+}
+
+retry_command "python -m parity.test_parity_ddp --accelerator="cpu" --devices=2 --tolerance=0.02"
+retry_command "python -m parity.test_parity_ddp --accelerator="cuda" --devices=2 --tolerance=0.01"
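Note that the nested double quotes on the last two lines collapse in bash ("…--accelerator="cpu"…" passes --accelerator=cpu through eval), so the commands run as intended despite the odd-looking quoting. For readers more fluent in Python than shell, the retry loop is roughly equivalent to this (an illustrative translation, not part of the commit):

import subprocess

MAX_RETRIES = 3

def retry_command(command: str) -> int:
    """Run `command` in a shell, retrying up to MAX_RETRIES times; return the last exit code."""
    exit_code = 1
    for attempt in range(1, MAX_RETRIES + 1):
        print(f"Run attempt: {attempt}")
        exit_code = subprocess.run(command, shell=True).returncode
        if exit_code == 0:
            print(f"Successfully ran: {command}")
            break
        print(f"Attempt {attempt} failed.")
    if exit_code != 0:
        print(f"Failed after {MAX_RETRIES} attempts: {command}")
    return exit_code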