2020-07-23 16:13:10 +00:00
# Python CircleCI 2.1 configuration file.
version : 2.1
orbs :
gcp-gke : circleci/gcp-gke@1.0.4
go : circleci/go@1.3.0
codecov : codecov/codecov@1.1.0
2020-07-14 18:04:04 +00:00
references :
2020-07-23 16:13:10 +00:00
make_docs : &make_docs
2020-07-14 18:04:04 +00:00
run :
2020-07-23 16:13:10 +00:00
name : Make Documentation
2020-07-14 18:04:04 +00:00
command : |
2020-07-23 16:13:10 +00:00
# First run the same pipeline as Read-The-Docs
# apt-get update && apt-get install -y cmake
# using: https://hub.docker.com/r/readthedocs/build
# we need to use py3.7 ot higher becase of an issue with metaclass inheritence
pyenv global 3.7.3
python --version
pip install -r requirements/docs.txt
cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W"
2020-07-14 18:04:04 +00:00
2020-07-23 16:13:10 +00:00
checkout_ml_testing : &checkout_ml_testing
2020-07-14 18:04:04 +00:00
run :
2020-07-23 16:13:10 +00:00
name : Checkout ml-testing-accelerators
2020-07-14 18:04:04 +00:00
command : |
2020-07-23 16:13:10 +00:00
git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
cd ml-testing-accelerators
git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
git checkout stable
cd ..
2020-07-14 18:04:04 +00:00
2020-07-23 16:13:10 +00:00
build_push_docker : &build_push_docker
run :
name : Build and push Docker image
command : |
gcloud --quiet auth configure-docker
cd dockers/tpu-tests
# TODO: How to find the GITHUB_REF in CircleCI?
# $CI_PULL_REQUEST seems to be of form: https://github.com/org/repo-name/pull/11.
# Grab the last bit, e.g. pull/11, convert to pull/11/head, and use it
# for the GITHUB_REF so Docker can pull the latest pending code in PR.
git_ref=$(echo "$CI_PULL_REQUEST" | sed "s/.*pytorch-lightning\///")/head
docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=$git_ref" .
#docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" .
docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
deploy_cluster : &deploy_cluster
2020-07-14 18:04:04 +00:00
run :
2020-07-23 16:13:10 +00:00
name : Deploy the job on the kubernetes cluster
2020-07-14 18:04:04 +00:00
command : |
2020-07-23 16:13:10 +00:00
go get github.com/google/go-jsonnet/cmd/jsonnet
export PATH=$PATH:$HOME/go/bin
job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -)
job_name=${job_name#job.batch/}
job_name=${job_name% created}
echo "Waiting on kubernetes job: $job_name"
i=0 && \
# N checks spaced 30s apart = 900s total.
status_code=2 && \
# Check on the job periodically. Set the status code depending on what
# happened to the job in Kubernetes. If we try MAX_CHECKS times and
# still the job hasn't finished, give up and return the starting
# non-zero status code.
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
echo "Done waiting. Job status code: $status_code" && \
# Allow time for logs to flush.
sleep 30 && \
echo "JOB_NAME: $job_name" && \
gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
# First portion is the test logs. Print these to Github Action stdout.
cat xx00 && \
echo "Done with log retrieval attempt." && \
gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
exit $status_code
stats : &stats
run :
name : Statistics
command : |
mv ./xx01 coverage.xml
# TODO: add human readable report
cat coverage.xml
sudo pip install pycobertura
pycobertura show coverage.xml
2020-07-14 18:04:04 +00:00
jobs :
2020-07-23 16:13:10 +00:00
TPU-tests :
docker :
- image : circleci/python:3.7
environment :
- MAX_CHECKS : 60
steps :
- checkout
- go/install
- *checkout_ml_testing
- gcp-gke/install
- gcp-gke/update-kubeconfig-with-credentials :
cluster : $GKE_CLUSTER
perform-login : true
- setup_remote_docker
- *build_push_docker
- *deploy_cluster
- *stats
- codecov/upload :
file : coverage.xml
flags : tpu,pytest
upload_name : TPU-coverage
- store_artifacts :
path : coverage.xml
build-Docs :
2020-07-14 18:04:04 +00:00
docker :
- image : readthedocs/build:latest
steps :
- checkout
- *make_docs
- store_artifacts :
# allows us to preview the generated html pages
path : docs/build/html/
destination : html
workflows :
version : 2
2020-07-23 16:13:10 +00:00
tpu-tests :
2020-07-14 18:04:04 +00:00
jobs :
2020-07-23 16:13:10 +00:00
- build-Docs
- TPU-tests