lightning/.circleci/config.yml

151 lines
4.8 KiB
YAML
Raw Normal View History

# Python CircleCI 2.1 configuration file.
version: 2.1
# Orbs imported by this pipeline; the alias on the left is the prefix used in
# job steps (e.g. `go/install`, `gcp-gke/install`, `codecov/upload`).
orbs:
  gcp-gke: circleci/gcp-gke@1.4.0
  go: circleci/go@1.7.1
  codecov: codecov/codecov@1.1.0
# Pipeline parameters supplied by whatever triggers the pipeline. All are
# strings defaulting to "", so an untriggered pipeline sees empty values.
# NOTE(review): the GHA_ prefix suggests a GitHub Action supplies them - confirm.
parameters:
  # Not referenced elsewhere in this file.
  GHA_Actor:
    type: string
    default: ""
  # Gates the `test-on-tpus` workflow via its `when:` clause.
  GHA_Action:
    type: string
    default: ""
  # Not referenced elsewhere in this file.
  GHA_Event:
    type: string
    default: ""
  # Exported as PR_NUMBER to the `update_jsonnet` step.
  GHA_Meta:
    type: string
    default: ""
# Reusable step definitions. Each entry is a YAML anchor that the jobs further
# down splice into their `steps` lists with `*name`.
references:
  # Installs the package plus the docs requirements, then builds the HTML docs.
  make_docs: &make_docs
    run:
      name: Make Documentation
      command: |
        # the image uses python 2.7 by default, force a different version
        pyenv global 3.7.3
        python --version
        pip install -e . -r requirements/pytorch/docs.txt
        pip list
        cd docs
        make clean
        make html --jobs 2 SPHINXOPTS="-W"

  # Clones ml-testing-accelerators and checks out a pinned commit as the local
  # branch `stable`, so the jsonnet library used below does not drift.
  checkout_ml_testing: &checkout_ml_testing
    run:
      name: Checkout ml-testing-accelerators
      command: |
        git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
        cd ml-testing-accelerators
        git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
        git checkout stable
        cd ..

  # Installs the go-jsonnet CLI. `deploy_cluster` adds $HOME/go/bin to PATH so
  # the binary can be found.
  # NOTE(review): `@latest` is unpinned, so builds are not reproducible.
  install_jsonnet: &install_jsonnet
    run:
      name: Install jsonnet
      command: |
        go install github.com/google/go-jsonnet/cmd/jsonnet@latest

  # Rewrites dockers/tpu-tests/tpu_test_cases.jsonnet in place, substituting the
  # {PYTORCH_VERSION}, {PYTHON_VERSION}, {PR_NUMBER} and {SHA} placeholders, and
  # prints the result. XLA_VER and PYTHON_VER come from the job environment.
  # The two lines of the `python -c` string must keep identical indentation:
  # the second line starts a new Python statement inside the quoted program.
  update_jsonnet: &update_jsonnet
    run:
      name: Update jsonnet
      environment:
        PR_NUMBER: << pipeline.parameters.GHA_Meta >>
      command: |
        export SHA=$(git rev-parse --short HEAD)
        python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
        data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
        cat dockers/tpu-tests/tpu_test_cases.jsonnet

  # Renders the jsonnet into a Kubernetes job, creates it, and polls the job
  # until it reports one failed or one succeeded pod. The polling budget is
  # MAX_CHECKS iterations with CHECK_SLEEP seconds between them, both taken
  # from the job environment; the "30s / 900s" figures in the inline shell
  # comment do not match the values currently set there.
  # Afterwards the logs of the `train` container are saved and, if they contain
  # an XML declaration, split there with csplit: xx00 holds the test logs and
  # xx01 the XML part, which the `stats` step consumes.
  # Exit status: 0 = job succeeded, 1 = job failed, 2 = never finished.
  deploy_cluster: &deploy_cluster
    run:
      name: Deploy the job on the kubernetes cluster
      command: |
        export PATH=$PATH:$HOME/go/bin
        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) && \
        job_name=${job_name#job.batch/}
        job_name=${job_name% created}
        pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
        echo "GKE pod name: $pod_name"
        echo "Waiting on kubernetes job: $job_name"
        i=0 && \
        # N checks spaced 30s apart = 900s total.
        status_code=2 && \
        # Check on the job periodically. Set the status code depending on what
        # happened to the job in Kubernetes. If we try MAX_CHECKS times and
        # still the job hasn't finished, give up and return the starting
        # non-zero status code.
        printf "Waiting for job to finish: " && \
        while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SLEEP; done && \
        echo "Done waiting. Job status code: $status_code" && \
        kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
        if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
        # First portion is the test logs. Print these to Github Action stdout.
        cat xx00 && \
        echo "Done with log retrieval attempt." && \
        exit $status_code

  # Renames the XML half of the split log to coverage.xml for upload.
  # NOTE(review): xx01 only exists when `deploy_cluster` found an XML
  # declaration in the logs; otherwise this `mv` fails the step.
  stats: &stats
    run:
      name: Statistics
      command: |
        mv ./xx01 coverage.xml
jobs:
  # Runs the TPU test suite as a Kubernetes job on GKE and uploads coverage.
  TPU-tests:
    docker:
      # NOTE(review): `circleci/*` images are CircleCI's legacy convenience
      # images; consider the `cimg/python` equivalent.
      - image: circleci/python:3.7
    # Values are quoted so they stay strings: an unquoted version such as 1.10
    # or 3.10 would be read as the float 1.1 / 3.1.
    environment:
      # Substituted for {PYTORCH_VERSION} / {PYTHON_VERSION} by `update_jsonnet`.
      - XLA_VER: "1.12"
      - PYTHON_VER: "3.7"
      # Polling budget used by `deploy_cluster`: number of checks and the
      # seconds slept between them.
      - MAX_CHECKS: "1000"
      - CHECK_SLEEP: "5"
    steps:
      - checkout
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - *install_jsonnet
      - *update_jsonnet
      - *deploy_cluster
      - *stats
      - codecov/upload:
          file: coverage.xml
          flags: tpu,pytest
          upload_name: TPU-coverage
      - store_artifacts:
          path: coverage.xml

  # Builds the HTML documentation and stores it as a browsable artifact.
  # Only referenced from the commented-out `build-docs` workflow.
  build-Docs:
    docker:
      - image: readthedocs/build:latest
    steps:
      - checkout
      - run:
          command: |
            git submodule update --init --recursive
          name: Init git submodule
      - *make_docs
      - store_artifacts:
          # allows us to preview the generated html pages
          path: docs/build/html/
          destination: html
workflows:
  # Docs workflow is currently disabled.
  # build-docs:  # FixMe
  #   when: << pipeline.parameters.GHA_Action >>
  #   jobs:
  #     - build-Docs

  # Gated on the GHA_Action pipeline parameter, which defaults to "".
  test-on-tpus:
    when: << pipeline.parameters.GHA_Action >>
    jobs:
      - TPU-tests