lightning/.circleci/config.yml

173 lines
6.1 KiB
YAML
Executable File

# Python CircleCI 2.1 configuration file.
version: 2.1
orbs:
gcp-gke: circleci/gcp-gke@1.0.4
go: circleci/go@1.3.0
codecov: codecov/codecov@1.1.0
references:
make_docs: &make_docs
run:
name: Make Documentation
command: |
# First run the same pipeline as Read-The-Docs
# apt-get update && apt-get install -y cmake
# using: https://hub.docker.com/r/readthedocs/build
# we need to use py3.7 ot higher becase of an issue with metaclass inheritence
pyenv global 3.7.3
python --version
pip install -r requirements/docs.txt
cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W"
checkout_ml_testing: &checkout_ml_testing
run:
name: Checkout ml-testing-accelerators
command: |
git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
cd ml-testing-accelerators
git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
git checkout stable
cd ..
build_push_docker: &build_push_docker
run:
name: Build and push Docker image
command: |
gcloud --quiet auth configure-docker
#cd dockers/tpu-tests
docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f ./dockers/tpu-tests/Dockerfile --build-arg "PYTHON_VERSION=$PYTHON_VER" --build-arg "PYTORCH_VERSION=$XLA_VER" .
docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
deploy_cluster: &deploy_cluster
run:
name: Deploy the job on the kubernetes cluster
command: |
go get github.com/google/go-jsonnet/cmd/jsonnet
export PATH=$PATH:$HOME/go/bin
python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; fff = open(fname).read().replace('pytorch-VERSION', 'pytorch-$XLA_VER') ; open(fname, 'w').write(fff)"
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -)
job_name=${job_name#job.batch/}
job_name=${job_name% created}
echo "Waiting on kubernetes job: $job_name"
i=0 && \
# N checks spaced 30s apart = 900s total.
status_code=2 && \
# Check on the job periodically. Set the status code depending on what
# happened to the job in Kubernetes. If we try MAX_CHECKS times and
# still the job hasn't finished, give up and return the starting
# non-zero status code.
printf "Waiting for job to finish: " && \
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
echo "Done waiting. Job status code: $status_code" && \
pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
echo "GKE pod name: $pod_name" && \
kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
# First portion is the test logs. Print these to Github Action stdout.
cat xx00 && \
echo "Done with log retrieval attempt." && \
gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
exit $status_code
stats: &stats
run:
name: Statistics
command: |
mv ./xx01 coverage.xml
# TODO: add human readable report
cat coverage.xml
sudo pip install pycobertura
pycobertura show coverage.xml
delete_gke_jobs: &delete_gke_jobs
run:
name: Delete GKE Jobs
command: |
# Match jobs whose age matches patterns like '1h' or '1d', i.e. any job
# that has been around longer than 1hr. First print all columns for
# matches, then execute the delete.
jobs_to_delete=$(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}')
echo $jobs_to_delete
if [ ${#jobs_to_delete} -gt 1 ];
then kubectl delete job $(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}');
fi
jobs:
TPU-tests:
parameters:
python:
type: string
docker:
- image: circleci/python:3.7
environment:
- PYTHON_VER: << parameters.python >>
- XLA_VER: 1.7
- MAX_CHECKS: 240
- CHECK_SPEEP: 5
steps:
- checkout
- go/install
- *checkout_ml_testing
- gcp-gke/install
- gcp-gke/update-kubeconfig-with-credentials:
cluster: $GKE_CLUSTER
perform-login: true
- setup_remote_docker
- *build_push_docker
- *deploy_cluster
- *stats
- codecov/upload:
file: coverage.xml
flags: tpu,pytest
upload_name: TPU-coverage
- store_artifacts:
path: coverage.xml
build-Docs:
docker:
- image: readthedocs/build:latest
steps:
- checkout
- *make_docs
- store_artifacts:
# allows us to preview the generated html pages
path: docs/build/html/
destination: html
cleanup-gke-jobs:
docker:
- image: circleci/python:3.7
steps:
- gcp-gke/install
- gcp-gke/update-kubeconfig-with-credentials:
cluster: $GKE_CLUSTER
perform-login: true
- *delete_gke_jobs
workflows:
version: 2
tpu-tests:
jobs:
- build-Docs
- TPU-tests:
matrix:
parameters:
python: ["3.6", "3.7"]
tpu-cleanup:
triggers:
- schedule:
# The cron format is:
# min (0-59) hour (0-23) monthday (1-31) month (1-12) weekday (0-6, 0=Sun)
# Set to run at the first minute of every hour.
cron: "0 * * * *"
filters:
branches:
only:
- master
jobs:
- cleanup-gke-jobs