# Python CircleCI 2.1 configuration file. version: 2.1 orbs: gcp-gke: circleci/gcp-gke@1.0.4 go: circleci/go@1.3.0 codecov: codecov/codecov@1.1.0 references: make_docs: &make_docs run: name: Make Documentation command: | # First run the same pipeline as Read-The-Docs # apt-get update && apt-get install -y cmake # using: https://hub.docker.com/r/readthedocs/build # we need to use py3.7 ot higher becase of an issue with metaclass inheritence pyenv global 3.7.3 python --version pip install -r requirements/docs.txt cd docs; make clean; make html --debug --jobs 2 SPHINXOPTS="-W" checkout_ml_testing: &checkout_ml_testing run: name: Checkout ml-testing-accelerators command: | git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git cd ml-testing-accelerators git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable git checkout stable cd .. build_push_docker: &build_push_docker run: name: Build and push Docker image command: | gcloud --quiet auth configure-docker cd dockers/tpu-tests # TODO: How to find the GITHUB_REF in CircleCI? # $CI_PULL_REQUEST seems to be of form: https://github.com/org/repo-name/pull/11. # Grab the last bit, e.g. pull/11, convert to pull/11/head, and use it # for the GITHUB_REF so Docker can pull the latest pending code in PR. if [ -z "$CI_PULL_REQUEST" ]; then docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" .; else git_ref=$(echo "$CI_PULL_REQUEST" | sed "s/.*pytorch-lightning\///")/head && docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=$git_ref" .; fi docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" deploy_cluster: &deploy_cluster run: name: Deploy the job on the kubernetes cluster command: | go get github.com/google/go-jsonnet/cmd/jsonnet export PATH=$PATH:$HOME/go/bin job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -) job_name=${job_name#job.batch/} job_name=${job_name% created} echo "Waiting on kubernetes job: $job_name" i=0 && \ # N checks spaced 30s apart = 900s total. status_code=2 && \ # Check on the job periodically. Set the status code depending on what # happened to the job in Kubernetes. If we try MAX_CHECKS times and # still the job hasn't finished, give up and return the starting # non-zero status code. printf "Waiting for job to finish: " && \ while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \ echo "Done waiting. Job status code: $status_code" && \ pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \ echo "GKE pod name: $pod_name" && \ kubectl logs -f $pod_name --container=train > /tmp/full_output.txt if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \ # First portion is the test logs. Print these to Github Action stdout. cat xx00 && \ echo "Done with log retrieval attempt." && \ gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \ exit $status_code stats: &stats run: name: Statistics command: | mv ./xx01 coverage.xml # TODO: add human readable report cat coverage.xml sudo pip install pycobertura pycobertura show coverage.xml delete_gke_jobs: &delete_gke_jobs run: name: Delete GKE Jobs command: | # Match jobs whose age matches patterns like '1h' or '1d', i.e. any job # that has been around longer than 1hr. First print all columns for # matches, then execute the delete. kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}' kubectl delete job $(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}') jobs: TPU-tests: docker: - image: circleci/python:3.7 environment: - MAX_CHECKS: 240 - CHECK_SPEEP: 5 steps: - checkout - go/install - *checkout_ml_testing - gcp-gke/install - gcp-gke/update-kubeconfig-with-credentials: cluster: $GKE_CLUSTER perform-login: true - setup_remote_docker - *build_push_docker - *deploy_cluster - *stats - codecov/upload: file: coverage.xml flags: tpu,pytest upload_name: TPU-coverage - store_artifacts: path: coverage.xml build-Docs: docker: - image: readthedocs/build:latest steps: - checkout - *make_docs - store_artifacts: # allows us to preview the generated html pages path: docs/build/html/ destination: html cleanup-gke-jobs: docker: - image: circleci/python:3.7 steps: - gcp-gke/install - gcp-gke/update-kubeconfig-with-credentials: cluster: $GKE_CLUSTER perform-login: true - *delete_gke_jobs workflows: version: 2 build: jobs: - build-Docs - TPU-tests: filters: branches: # https://discuss.circleci.com/t/create-separate-steps-jobs-for-pr-forks-versus-branches/13419/4 #only: # # only from forks # - /^pull\/.\d+$/ ignore: - master cleanup: triggers: - schedule: # The cron format is: # min (0-59) hour (0-23) monthday (1-31) month (1-12) weekday (0-6, 0=Sun) # Set to run at the first minute of every hour. cron: "0 * * * *" filters: branches: only: - master jobs: - cleanup-gke-jobs