# Python CircleCI 2.1 configuration file.
#
# Two jobs are defined and both are run by the `tpu-tests` workflow:
#   - build-Docs: builds the HTML documentation and stores it as an artifact.
#   - TPU-tests:  builds a test Docker image, runs it as a Kubernetes job,
#                 reads the job logs back and uploads the coverage report.
version: 2.1

orbs:
  gcp-gke: circleci/gcp-gke@1.0.4
  go: circleci/go@1.3.0
  codecov: codecov/codecov@1.1.0

# Reusable steps; the jobs below pull them in through YAML aliases.
references:

  make_docs: &make_docs
    run:
      name: Make Documentation
      command: |
        # First run the same pipeline as Read-The-Docs
        # apt-get update && apt-get install -y cmake
        # using: https://hub.docker.com/r/readthedocs/build
        # we need to use py3.7 or higher because of an issue with metaclass inheritance
        pyenv global 3.7.3
        python --version
        pip install -r requirements/docs.txt
        # SPHINXOPTS="-W" is passed through to the docs build unchanged.
        cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W"

  checkout_ml_testing: &checkout_ml_testing
    run:
      name: Checkout ml-testing-accelerators
      command: |
        git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
        cd ml-testing-accelerators
        # Pin the checkout to a fixed commit, exposed as a local `stable` branch.
        git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
        git checkout stable
        cd ..

  build_push_docker: &build_push_docker
    run:
      name: Build and push Docker image
      command: |
        gcloud --quiet auth configure-docker
        cd dockers/tpu-tests
        # TODO: How to find the GITHUB_REF in CircleCI?
        # $CI_PULL_REQUEST seems to be of form: https://github.com/org/repo-name/pull/11.
        # Grab the last bit, e.g. pull/11, convert to pull/11/head, and use it
        # for the GITHUB_REF so Docker can pull the latest pending code in PR.
        git_ref=$(echo "$CI_PULL_REQUEST" | sed "s/.*pytorch-lightning\///")/head
        docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=$git_ref" .
        #docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" .
        docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"

  deploy_cluster: &deploy_cluster
    run:
      name: Deploy the job on the kubernetes cluster
      command: |
        go get github.com/google/go-jsonnet/cmd/jsonnet
        export PATH=$PATH:$HOME/go/bin
        # Render the job spec with jsonnet and create it; kubectl answers with
        # "job.batch/<name> created", which is trimmed down to the bare name.
        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -)
        job_name=${job_name#job.batch/}
        job_name=${job_name% created}
        echo "Waiting on kubernetes job: $job_name"
        i=0 && \
        # MAX_CHECKS checks spaced 30s apart (60 checks = 1800s of sleep in total).
        status_code=2 && \
        # Check on the job periodically. Set the status code depending on what
        # happened to the job in Kubernetes. If we try MAX_CHECKS times and
        # still the job hasn't finished, give up and return the starting
        # non-zero status code.
        while [ $i -lt $MAX_CHECKS ]; do
          ((i++))
          if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
            status_code=1 && break
          elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then
            status_code=0 && break
          else
            echo "Job not finished yet"
          fi
          sleep 30
        done && \
        echo "Done waiting. Job status code: $status_code" && \
        # Allow time for logs to flush.
        sleep 30 && \
        echo "JOB_NAME: $job_name" && \
        gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
        # Split the log at the XML declaration: xx00 receives the test logs and
        # xx01 the coverage report that the Statistics step renames to
        # coverage.xml. Without a marker line the whole log becomes xx00.
        # NOTE(review): the marker is assumed to be the declaration line that
        # opens the coverage XML report - confirm against a real job log.
        if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
        # First portion is the test logs. Print these to the CircleCI job output.
        cat xx00 && \
        echo "Done with log retrieval attempt." && \
        gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
        exit $status_code

  stats: &stats
    run:
      name: Statistics
      command: |
        # xx01 is the second piece written by csplit in the deploy step.
        mv ./xx01 coverage.xml
        # TODO: add human readable report
        cat coverage.xml
        sudo pip install pycobertura
        pycobertura show coverage.xml

jobs:

  TPU-tests:
    docker:
      - image: circleci/python:3.7
    environment:
      # Upper bound on the number of 30s polls made by the deploy step.
      MAX_CHECKS: "60"
    steps:
      - checkout
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - setup_remote_docker
      - *build_push_docker
      - *deploy_cluster
      - *stats
      - codecov/upload:
          file: coverage.xml
          flags: tpu,pytest
          upload_name: TPU-coverage
      - store_artifacts:
          path: coverage.xml

  build-Docs:
    docker:
      - image: readthedocs/build:latest
    steps:
      - checkout
      - *make_docs
      - store_artifacts:
          # allows us to preview the generated html pages
          path: docs/build/html/
          destination: html

workflows:
  version: 2
  tpu-tests:
    jobs:
      - build-Docs
      - TPU-tests