lightning/dockers/base-xla/tpu_workflow_lite.jsonnet

66 lines
2.0 KiB
Plaintext
Raw Normal View History

local base = import 'templates/base.libsonnet';
local tpus = import 'templates/tpus.libsonnet';
local utils = import "templates/utils.libsonnet";
local tputests = base.BaseTest {
frameworkPrefix: 'pl',
modelName: 'tpu-tests',
mode: 'postsubmit',
configMaps: [],
timeout: 6000, # 100 minutes, in seconds.
image: 'pytorchlightning/pytorch_lightning',
imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}',
tpuSettings+: {
softwareVersion: 'pytorch-{PYTORCH_VERSION}',
},
accelerator: tpus.v3_8,
command: utils.scriptCommand(
|||
Fix TPU test CI (#14926) * Fix TPU test CI * +x first * Lite first to uncovert errors faster * Fixes * One more * Simplify XLALauncher wrapping to avoid pickle error * debug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Debug commit successful. Trying local definitions * Require tpu for mock test * ValueError: The number of devices must be either 1 or 8, got 4 instead * Fix mock test * Simplify call, rely on defaults * Skip OSError for now. Maybe upgrading will help * Simplify launch tests, move some to lite * Stricter typing * RuntimeError: Accessing the XLA device before processes have spawned is not allowed. * Revert "RuntimeError: Accessing the XLA device before processes have spawned is not allowed." This reverts commit f65107ebf3e062d497f1033bfbbd59774f2d253f. * Alternative boring solution to the reverted commit * Fix failing test on CUDA machine * Workarounds * Try latest mkl * Revert "Try latest mkl" This reverts commit d06813aa67cc161879775e24be24b735e2925555. * Wrong exception * xfail * Mypy * Comment change * Spawn launch refactor * Accept that we cannot lazy init now * Fix mypy and launch test failures * The base dockerfile already includes mkl-2022.1.0 - what if we use it? * try a different mkl version * Revert mkl version changes Co-authored-by: awaelchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
2022-10-03 13:13:33 +00:00
set +x # turn off tracing, spammy
set -e # exit on error
source ~/.bashrc
conda activate lightning
Fix TPU test CI (#14926) * Fix TPU test CI * +x first * Lite first to uncovert errors faster * Fixes * One more * Simplify XLALauncher wrapping to avoid pickle error * debug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Debug commit successful. Trying local definitions * Require tpu for mock test * ValueError: The number of devices must be either 1 or 8, got 4 instead * Fix mock test * Simplify call, rely on defaults * Skip OSError for now. Maybe upgrading will help * Simplify launch tests, move some to lite * Stricter typing * RuntimeError: Accessing the XLA device before processes have spawned is not allowed. * Revert "RuntimeError: Accessing the XLA device before processes have spawned is not allowed." This reverts commit f65107ebf3e062d497f1033bfbbd59774f2d253f. * Alternative boring solution to the reverted commit * Fix failing test on CUDA machine * Workarounds * Try latest mkl * Revert "Try latest mkl" This reverts commit d06813aa67cc161879775e24be24b735e2925555. * Wrong exception * xfail * Mypy * Comment change * Spawn launch refactor * Accept that we cannot lazy init now * Fix mypy and launch test failures * The base dockerfile already includes mkl-2022.1.0 - what if we use it? * try a different mkl version * Revert mkl version changes Co-authored-by: awaelchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
2022-10-03 13:13:33 +00:00
echo "--- Cloning lightning repo ---"
git clone --single-branch --depth 1 https://github.com/Lightning-AI/lightning.git
cd lightning
# PR triggered it, check it out
if [ -n "{PR_NUMBER}" ]; then # if PR number is not empty
echo "--- Fetch the PR changes ---"
git fetch origin --depth 1 pull/{PR_NUMBER}/head:test/{PR_NUMBER}
echo "--- Checkout PR changes ---"
git -c advice.detachedHead=false checkout {SHA}
fi
echo "--- Install packages ---"
PACKAGE_NAME=lite pip install .[dev]
Fix TPU test CI (#14926) * Fix TPU test CI * +x first * Lite first to uncovert errors faster * Fixes * One more * Simplify XLALauncher wrapping to avoid pickle error * debug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Debug commit successful. Trying local definitions * Require tpu for mock test * ValueError: The number of devices must be either 1 or 8, got 4 instead * Fix mock test * Simplify call, rely on defaults * Skip OSError for now. Maybe upgrading will help * Simplify launch tests, move some to lite * Stricter typing * RuntimeError: Accessing the XLA device before processes have spawned is not allowed. * Revert "RuntimeError: Accessing the XLA device before processes have spawned is not allowed." This reverts commit f65107ebf3e062d497f1033bfbbd59774f2d253f. * Alternative boring solution to the reverted commit * Fix failing test on CUDA machine * Workarounds * Try latest mkl * Revert "Try latest mkl" This reverts commit d06813aa67cc161879775e24be24b735e2925555. * Wrong exception * xfail * Mypy * Comment change * Spawn launch refactor * Accept that we cannot lazy init now * Fix mypy and launch test failures * The base dockerfile already includes mkl-2022.1.0 - what if we use it? * try a different mkl version * Revert mkl version changes Co-authored-by: awaelchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
2022-10-03 13:13:33 +00:00
pip list
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
Fix TPU test CI (#14926) * Fix TPU test CI * +x first * Lite first to uncovert errors faster * Fixes * One more * Simplify XLALauncher wrapping to avoid pickle error * debug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Debug commit successful. Trying local definitions * Require tpu for mock test * ValueError: The number of devices must be either 1 or 8, got 4 instead * Fix mock test * Simplify call, rely on defaults * Skip OSError for now. Maybe upgrading will help * Simplify launch tests, move some to lite * Stricter typing * RuntimeError: Accessing the XLA device before processes have spawned is not allowed. * Revert "RuntimeError: Accessing the XLA device before processes have spawned is not allowed." This reverts commit f65107ebf3e062d497f1033bfbbd59774f2d253f. * Alternative boring solution to the reverted commit * Fix failing test on CUDA machine * Workarounds * Try latest mkl * Revert "Try latest mkl" This reverts commit d06813aa67cc161879775e24be24b735e2925555. * Wrong exception * xfail * Mypy * Comment change * Spawn launch refactor * Accept that we cannot lazy init now * Fix mypy and launch test failures * The base dockerfile already includes mkl-2022.1.0 - what if we use it? * try a different mkl version * Revert mkl version changes Co-authored-by: awaelchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
2022-10-03 13:13:33 +00:00
echo "--- Sanity check TPU availability ---"
python -c "from lightning_lite.accelerators import TPUAccelerator; assert TPUAccelerator.is_available()"
echo "Sanity check passed!"
Fix TPU test CI (#14926) * Fix TPU test CI * +x first * Lite first to uncovert errors faster * Fixes * One more * Simplify XLALauncher wrapping to avoid pickle error * debug * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Debug commit successful. Trying local definitions * Require tpu for mock test * ValueError: The number of devices must be either 1 or 8, got 4 instead * Fix mock test * Simplify call, rely on defaults * Skip OSError for now. Maybe upgrading will help * Simplify launch tests, move some to lite * Stricter typing * RuntimeError: Accessing the XLA device before processes have spawned is not allowed. * Revert "RuntimeError: Accessing the XLA device before processes have spawned is not allowed." This reverts commit f65107ebf3e062d497f1033bfbbd59774f2d253f. * Alternative boring solution to the reverted commit * Fix failing test on CUDA machine * Workarounds * Try latest mkl * Revert "Try latest mkl" This reverts commit d06813aa67cc161879775e24be24b735e2925555. * Wrong exception * xfail * Mypy * Comment change * Spawn launch refactor * Accept that we cannot lazy init now * Fix mypy and launch test failures * The base dockerfile already includes mkl-2022.1.0 - what if we use it? * try a different mkl version * Revert mkl version changes Co-authored-by: awaelchli <aedu.waelchli@gmail.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta <nitta@akihironitta.com>
2022-10-03 13:13:33 +00:00
echo "--- Running Lite tests ---"
cd tests/tests_lite
PL_RUN_TPU_TESTS=1 coverage run --source=lightning_lite -m pytest -vv --durations=0 ./
echo "--- Running standalone Lite tests ---"
PL_STANDALONE_TESTS_SOURCE=lightning_lite PL_STANDALONE_TESTS_BATCH_SIZE=1 bash run_standalone_tests.sh
echo "--- Generating coverage ---"
coverage xml
cat coverage.xml | tr -d '\t'
|||
),
};
tputests.oneshotJob