# Python CircleCI 2.1 configuration file.
version: 2.1

orbs:
  gcp-gke: circleci/gcp-gke@1.0.4
  go: circleci/go@1.3.0
  codecov: codecov/codecov@1.1.0

# Workflow steps of the TPU-tests job:
#  1. Checkout
#  2. Install Go
#  3. Checkout ml-testing-accelerators
#  4. GCP GKE install
#  5. Update kubeconfig with credentials
#  6. Install jsonnet
#  7. Update jsonnet
#  8. Deploy the job on the Kubernetes cluster
#  9. Statistics
# 10. Upload coverage to Codecov
# 11. Store the coverage report as a build artifact

# Reusable steps; each anchor is referenced by alias from the jobs below.
references:

  # Builds the Sphinx HTML documentation; "-W" turns warnings into errors.
  make_docs: &make_docs
    run:
      name: Make Documentation
      command: |
        # First run the same pipeline as Read-The-Docs
        # apt-get update && apt-get install -y cmake
        # using: https://hub.docker.com/r/readthedocs/build
        # we need to use py3.7 or higher because of an issue with metaclass inheritance
        pyenv global 3.7.3
        python --version
        pip install -r requirements/docs.txt
        pip list
        cd docs
        make clean
        make html --jobs 2 SPHINXOPTS="-W"

  # Clones ml-testing-accelerators and checks out a pinned commit as the
  # local branch "stable".
  checkout_ml_testing: &checkout_ml_testing
    run:
      name: Checkout ml-testing-accelerators
      command: |
        git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
        cd ml-testing-accelerators
        git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
        git checkout stable
        cd ..

  install_jsonnet: &install_jsonnet
    run:
      name: Install jsonnet
      command: |
        go get github.com/google/go-jsonnet/cmd/jsonnet

  # Fills the {PYTORCH_VERSION}, {PYTHON_VERSION}, {PR_NUMBER} and {SHA}
  # placeholders of the TPU test template in place, then prints the result.
  update_jsonnet: &update_jsonnet
    run:
      name: Update jsonnet
      command: |
        export PR_NUMBER=$(git ls-remote origin "pull/*/head" | grep -F -f <(git rev-parse HEAD) | awk -F'/' '{print $3}')
        export SHA=$(git rev-parse --short HEAD)
        # The line break inside the quoted program is significant: it
        # separates two Python statements.
        python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
        data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
        cat dockers/tpu-tests/tpu_test_cases.jsonnet

  # Creates the Kubernetes job, polls it until it finishes or the check
  # budget runs out, fetches the pod logs and exits with the job's status.
  deploy_cluster: &deploy_cluster
    run:
      name: Deploy the job on the kubernetes cluster
      command: |
        export PATH=$PATH:$HOME/go/bin
        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) && \
        job_name=${job_name#job.batch/}
        job_name=${job_name% created}
        echo "Waiting on kubernetes job: $job_name"
        # Check on the job periodically: up to MAX_CHECKS checks spaced
        # CHECK_SLEEP seconds apart (240 x 5s = 1200s of sleeping with the
        # values set in the TPU-tests job). The status code is set depending
        # on what happened to the job in Kubernetes: 1 if it failed, 0 if it
        # succeeded. If we try MAX_CHECKS times and the job still hasn't
        # finished, give up and return the starting non-zero status code (2).
        i=0 && \
        status_code=2 && \
        printf "Waiting for job to finish: " && \
        while [ "$i" -lt "$MAX_CHECKS" ]; do
          i=$((i + 1))
          if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
            status_code=1 && break
          elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
            status_code=0 && break
          else
            printf "."
          fi
          sleep $CHECK_SLEEP
        done && \
        echo "Done waiting. Job status code: $status_code" && \
        pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
        echo "GKE pod name: $pod_name" && \
        kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
        # Split the pod output at the start of the XML coverage report:
        # xx00 receives the test logs and xx01 the report, which the
        # Statistics step renames to coverage.xml. Without a marker the whole
        # output is treated as test logs.
        # NOTE(review): the split marker was empty in the previous revision
        # (grep '' / csplit '//'); '<?xml' is presumed from the report being
        # read by pycobertura - confirm against the real pod output.
        if grep -q '<?xml' /tmp/full_output.txt; then
          csplit /tmp/full_output.txt '/<?xml/'
        else
          mv /tmp/full_output.txt xx00
        fi && \
        cat xx00 && \
        echo "Done with log retrieval attempt." && \
        exit $status_code

  # Prints the coverage report that the deploy step split out of the pod logs.
  stats: &stats
    run:
      name: Statistics
      command: |
        mv ./xx01 coverage.xml
        # TODO: add human readable report
        cat coverage.xml
        sudo pip install pycobertura
        pycobertura show coverage.xml

jobs:

  TPU-tests:
    docker:
      - image: circleci/python:3.7
    environment:
      # Quoted so that version-like values stay strings instead of floats.
      XLA_VER: "1.8"
      PYTHON_VER: "3.7"
      # Polling budget of the deploy step: number of checks and the pause
      # in seconds between two checks.
      MAX_CHECKS: "240"
      CHECK_SLEEP: "5"
    steps:
      - checkout
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - *install_jsonnet
      - *update_jsonnet
      - *deploy_cluster
      - *stats
      - codecov/upload:
          file: coverage.xml
          flags: tpu,pytest
          upload_name: TPU-coverage
      - store_artifacts:
          path: coverage.xml

  build-Docs:
    docker:
      - image: readthedocs/build:latest
    steps:
      - checkout
      - run:
          command: |
            git submodule update --init --recursive
          name: Init git submodule
      - *make_docs
      - store_artifacts:
          # allows us to preview the generated html pages
          path: docs/build/html/
          destination: html

workflows:
  version: 2
  ci-tests:
    jobs:
      - build-Docs
      - TPU-tests