# Python CircleCI 2.1 configuration file.
#
# Layout:
#   references - reusable `run` steps, shared through YAML anchors.
#   jobs       - `TPU-tests` (build an image, run it as a Kubernetes job,
#                collect logs/coverage) and `build-Docs` (Sphinx HTML build).
#   workflows  - runs both jobs.
version: 2.1

orbs:
  gcp-gke: circleci/gcp-gke@1.0.4
  go: circleci/go@1.3.0
  codecov: codecov/codecov@1.1.0

references:

  # Builds the Sphinx documentation; `-W` turns Sphinx warnings into errors.
  make_docs: &make_docs
    run:
      name: Make Documentation
      command: |
        # First run the same pipeline as Read-The-Docs
        # apt-get update && apt-get install -y cmake
        # using: https://hub.docker.com/r/readthedocs/build
        # we need to use py3.7 or higher because of an issue with metaclass inheritance
        pyenv global 3.7.3
        python --version
        pip install -r requirements/docs.txt
        pip list
        cd docs
        make clean
        make html --jobs 2 SPHINXOPTS="-W"

  # Clones ml-testing-accelerators and checks out a pinned commit as the
  # local branch `stable`, so the jsonnet import path used later is fixed.
  checkout_ml_testing: &checkout_ml_testing
    run:
      name: Checkout ml-testing-accelerators
      command: |
        git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
        cd ml-testing-accelerators
        git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
        git checkout stable
        cd ..

  # Builds the TPU test image and pushes it tagged with the workflow job id.
  # Reads GCR_IMAGE_PATH and XLA_VER from the job/project environment.
  build_push_docker: &build_push_docker
    run:
      name: Build and push Docker image
      environment:
        # Quoted so the version is passed as a string, never as a float.
        - PYTHON_VER: "3.7"
      command: |
        gcloud --quiet auth configure-docker
        #cd dockers/tpu-tests
        docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" \
          -f ./dockers/tpu-tests/Dockerfile \
          --build-arg "PYTHON_VERSION=$PYTHON_VER" \
          --build-arg "PYTORCH_VERSION=$XLA_VER" .
        docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"

  # Renders the jsonnet job template, creates the Kubernetes job, polls it
  # until it fails/succeeds or MAX_CHECKS is exhausted, dumps the pod logs,
  # deletes the pushed image and exits with the resulting status code
  # (0 = succeeded, 1 = failed, 2 = still unfinished after all checks).
  deploy_cluster: &deploy_cluster
    run:
      name: Deploy the job on the kubernetes cluster
      command: |
        go get github.com/google/go-jsonnet/cmd/jsonnet
        export PATH=$PATH:$HOME/go/bin
        python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; fff = open(fname).read().replace('pytorch-VERSION', 'pytorch-$XLA_VER') ; open(fname, 'w').write(fff)"
        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet \
          --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -)
        job_name=${job_name#job.batch/}
        job_name=${job_name% created}
        echo "Waiting on kubernetes job: $job_name"
        i=0 && \
        # MAX_CHECKS checks spaced CHECK_SPEEP seconds apart
        # (240 x 5s = 1200s total with the values set on the TPU-tests job).
        status_code=2 && \
        # Check on the job periodically. Set the status code depending on what
        # happened to the job in Kubernetes. If we try MAX_CHECKS times and
        # still the job hasn't finished, give up and return the starting
        # non-zero status code.
        # NOTE(review): the loop is deliberately kept inside the `&&` chain;
        # `((i++))` returns non-zero when i is 0, which presumably would abort
        # a standalone command under an errexit shell - confirm before splitting.
        printf "Waiting for job to finish: " && \
        while [ $i -lt $MAX_CHECKS ]; do
          ((i++))
          if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
            status_code=1 && break
          elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then
            status_code=0 && break
          else
            printf "."
          fi
          sleep $CHECK_SPEEP
        done && \
        echo "Done waiting. Job status code: $status_code" && \
        pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
        echo "GKE pod name: $pod_name" && \
        kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
        # Split the pod output at the XML declaration: xx00 receives the test
        # logs and xx01 the XML document that the Statistics step renames to
        # coverage.xml. An empty pattern would match every line and never
        # isolate the report.
        # NOTE(review): assumes the container prints the coverage report,
        # starting with `<?xml version="1.0" ?>`, after the test logs - confirm.
        if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
        # First portion is the test logs. Print these to the CircleCI job stdout.
        cat xx00 && \
        echo "Done with log retrieval attempt." && \
        gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
        exit $status_code

  # Turns the second csplit piece (xx01) into coverage.xml and prints a
  # pycobertura summary of it.
  stats: &stats
    run:
      name: Statistics
      command: |
        mv ./xx01 coverage.xml
        # TODO: add human readable report
        cat coverage.xml
        sudo pip install pycobertura
        pycobertura show coverage.xml

jobs:

  TPU-tests:
    docker:
      - image: circleci/python:3.7
    environment:
      # All values are quoted: environment variables are strings, and a bare
      # version such as 1.10 would be read as the float 1.1.
      - XLA_VER: "1.7"
      # Maximum number of job status polls in the deploy step.
      - MAX_CHECKS: "240"
      # Seconds slept between polls (variable name spelled as consumed by
      # the deploy step).
      - CHECK_SPEEP: "5"
    steps:
      - checkout
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - setup_remote_docker
      - *build_push_docker
      - *deploy_cluster
      - *stats
      - codecov/upload:
          file: coverage.xml
          flags: tpu,pytest
          upload_name: TPU-coverage

      - store_artifacts:
          path: coverage.xml

  build-Docs:
    docker:
      - image: readthedocs/build:latest
    steps:
      - checkout
      - *make_docs
      - store_artifacts:
          # allows us to preview the generated html pages
          path: docs/build/html/
          destination: html

workflows:
  version: 2
  tpu-tests:
    jobs:
      - build-Docs
      - TPU-tests