From 8cf185a06d47d27157718fc9bd927cf14a0736bc Mon Sep 17 00:00:00 2001
From: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Date: Tue, 19 Nov 2024 09:50:26 +0100
Subject: [PATCH] fix: use standalone tests' exit code (#20430)

---
 .azure/gpu-tests-fabric.yml                 |  10 +-
 .azure/gpu-tests-pytorch.yml                |   8 +-
 tests/run_standalone_tests.sh               | 122 ++++++++++++--------
 tests/tests_pytorch/run_standalone_tasks.sh |   8 +-
 4 files changed, 87 insertions(+), 61 deletions(-)

diff --git a/.azure/gpu-tests-fabric.yml b/.azure/gpu-tests-fabric.yml
index ee7fe2e281..cd1cf09373 100644
--- a/.azure/gpu-tests-fabric.yml
+++ b/.azure/gpu-tests-fabric.yml
@@ -134,13 +134,13 @@ jobs:
     condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
     displayName: "Adjust tests & examples"
 
-  - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50
-    workingDirectory: tests/tests_fabric/
+  - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
+    workingDirectory: tests/
     displayName: "Testing: fabric standard"
     timeoutInMinutes: "10"
 
-  - bash: bash ../run_standalone_tests.sh "."
-    workingDirectory: tests/tests_fabric/
+  - bash: bash ./run_standalone_tests.sh "tests_fabric"
+    workingDirectory: tests/
     env:
       PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
     displayName: "Testing: fabric standalone"
@@ -157,7 +157,7 @@ jobs:
       ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
         --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
       ls -l
-    workingDirectory: tests/tests_fabric/
+    workingDirectory: tests/
     displayName: "Statistics"
 
   - script: |
diff --git a/.azure/gpu-tests-pytorch.yml b/.azure/gpu-tests-pytorch.yml
index c014d84663..ae4c1367b6 100644
--- a/.azure/gpu-tests-pytorch.yml
+++ b/.azure/gpu-tests-pytorch.yml
@@ -155,13 +155,13 @@ jobs:
       ls -l checkpoints/
     displayName: "Get legacy checkpoints"
 
-  - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest -v --durations=50
-    workingDirectory: tests/tests_pytorch
+  - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_pytorch/ -v --durations=50
+    workingDirectory: tests/
     displayName: "Testing: PyTorch standard"
     timeoutInMinutes: "35"
 
-  - bash: bash ../run_standalone_tests.sh "."
-    workingDirectory: tests/tests_pytorch
+  - bash: bash ./run_standalone_tests.sh "tests_pytorch"
+    workingDirectory: tests/
     env:
       PL_USE_MOCKED_MNIST: "1"
       PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
diff --git a/tests/run_standalone_tests.sh b/tests/run_standalone_tests.sh
index 8a4d8e180d..9aa54f7350 100755
--- a/tests/run_standalone_tests.sh
+++ b/tests/run_standalone_tests.sh
@@ -19,6 +19,11 @@ set -e
 # It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set
 test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-6}"
 source="${PL_STANDALONE_TESTS_SOURCE:-"lightning"}"
+# this is the directory where the tests are located
+test_dir=$1  # parse the first argument
+COLLECTED_TESTS_FILE="collected_tests.txt"
+
+ls -lh .  # show the contents of the directory
 
 # this environment variable allows special tests to run
 export PL_RUN_STANDALONE_TESTS=1
@@ -26,72 +31,87 @@ export PL_RUN_STANDALONE_TESTS=1
 defaults=" -m coverage run --source ${source} --append -m pytest --no-header -v -s --timeout 120 "
 echo "Using defaults: ${defaults}"
 
-# get the testing location as the first argument
-test_path=$1
-printf "source path: $test_path\n"
+# get the list of parametrizations. we need to call them separately. the last two lines are removed.
+# note: if there's a syntax error, this will fail with some garbled output
+python3 -um pytest $test_dir -q --collect-only --pythonwarnings ignore 2>&1 > $COLLECTED_TESTS_FILE
+# early terminate if collection failed (e.g. syntax error)
+if [[ $? != 0 ]]; then
+  cat $COLLECTED_TESTS_FILE
+  exit 1
+fi
 
-# collect all tests with parametrization based filtering with PL_RUN_STANDALONE_TESTS
-standalone_tests=$(python3 -m pytest $test_path -q --collect-only --pythonwarnings ignore)
-printf "Collected tests: \n $standalone_tests\n"
-# match only lines with tests
-parametrizations=$(perl -nle 'print $& while m{\S+::test_\S+}g' <<< "$standalone_tests")
-# convert the list to be array
-parametrizations_arr=($parametrizations)
-report=''
+# removes the last line of the file
+sed -i '$d' $COLLECTED_TESTS_FILE
 
-rm -f standalone_test_output.txt  # in case it exists, remove it
-rm -f testnames.txt
+# Get test list and run each test individually
+tests=($(grep -oP '\S+::test_\S+' "$COLLECTED_TESTS_FILE"))
+test_count=${#tests[@]}
+# present the collected tests
+printf "collected $test_count tests:\n-------------------\n"
+# replace space with new line
+echo "${tests[@]}" | tr ' ' '\n'
+printf "\n===================\n"
 
-function show_batched_output {
-  if [ -f standalone_test_output.txt ]; then  # if exists
-    cat standalone_test_output.txt
-    # heuristic: stop if there's mentions of errors. this can prevent false negatives when only some of the ranks fail
-    if perl -nle 'print if /error|(?> testnames.txt
-
-  # fix the port to avoid race condition when batched distributed tests select the port randomly
-  export MASTER_PORT=$((29500 + $i % $test_batch_size))
+status=0  # reset the script status
+report=""  # final report
+pids=()  # array of PID for running tests
+test_ids=()  # array of indexes of running tests
+printf "Running $test_count tests in batches of $test_batch_size\n"
+for i in "${!tests[@]}"; do
+  # remove initial "tests/" from the test name
+  test=${tests[$i]/tests\//}
+  printf "Running test $((i+1))/$test_count: $test\n"
 
   # execute the test in the background
-  # redirect to a log file that buffers test output. since the tests will run in the background, we cannot let them
-  # output to std{out,err} because the outputs would be garbled together
-  python3 ${defaults} "$parametrization" &>> standalone_test_output.txt &
-  # save the PID in an array
-  pids[${i}]=$!
-  # add row to the final report
-  report+="Ran\t$parametrization\n"
+  # redirect to a log file that buffers test output. since the tests will run in the background,
+  # we cannot let them output to std{out,err} because the outputs would be garbled together
+  python3 ${defaults} "$test" 2>&1 > "standalone_test_output-$i.txt" &
+  test_ids+=($i)  # save the test's id in an array with running tests
+  pids+=($!)  # save the PID in an array with running tests
 
-  if ((($i + 1) % $test_batch_size == 0)); then
+  # if we reached the batch size, wait for all tests to finish
+  if (( (($i + 1) % $test_batch_size == 0) || $i == $test_count-1 )); then
+    printf "Waiting for batch to finish: $(IFS=' '; echo "${pids[@]}")\n"
     # wait for running tests
-    for pid in ${pids[*]}; do wait $pid; done
-    unset pids  # empty the array
-    show_batched_output
+    for j in "${!test_ids[@]}"; do
+      i=${test_ids[$j]}  # restore the global test's id
+      pid=${pids[$j]}  # restore the particular PID
+      test=${tests[$i]}  # restore the test name
+      printf "Waiting for $test >> standalone_test_output-$i.txt (PID: $pid)\n"
+      wait -n $pid
+      # get the exit status of the test
+      test_status=$?
+      # add row to the final report
+      report+="Ran\t$test\t>> exit:$test_status\n"
+      if [[ $test_status != 0 ]]; then
+        # show the output of the failed test
+        cat "standalone_test_output-$i.txt"
+        # Process exited with a non-zero exit status
+        status=$test_status
+      fi
+    done
+    test_ids=()  # reset the test's id array
+    pids=()  # reset the PID array
   fi
 done
-# wait for leftover tests
-for pid in ${pids[*]}; do wait $pid; done
-show_batched_output
 
 # echo test report
 printf '=%.s' {1..80}
 printf "\n$report"
 printf '=%.s' {1..80}
 printf '\n'
+
+# exit with the worst test result
+exit $status
diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh
index eb2f7f6b22..48bc920ada 100644
--- a/tests/tests_pytorch/run_standalone_tasks.sh
+++ b/tests/tests_pytorch/run_standalone_tasks.sh
@@ -21,7 +21,13 @@ export PL_RUN_STANDALONE_TESTS=1
 # test that a user can manually launch individual processes
 echo "Running manual ddp launch test"
 export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-args="fit --trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
+args="fit --trainer.accelerator gpu \
+--trainer.devices 2 \
+--trainer.strategy ddp \
+--trainer.max_epochs=1 \
+--trainer.limit_train_batches=1 \
+--trainer.limit_val_batches=1 \
+--trainer.limit_test_batches=1"
 MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python strategies/scripts/cli_script.py ${args} &
 MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python strategies/scripts/cli_script.py ${args}
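
For reference, a minimal usage sketch of the patched runner, mirroring the Azure job steps above; the tests_fabric target and the batch-size override are illustrative choices, not prescribed by the patch:

    # run from the repository's tests/ directory, as the Azure jobs now do
    cd tests/

    # optional: shrink the parallel batch size (the script defaults to 6)
    export PL_STANDALONE_TESTS_BATCH_SIZE=2

    # the first argument is the directory to collect standalone tests from
    bash ./run_standalone_tests.sh "tests_fabric"

    # before this patch the runner always exited 0 even when a standalone
    # test failed; it now exits with the worst per-test status, so CI steps
    # such as the Azure tasks above fail as expected
    echo "runner exited with: $?"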