# lightning/.circleci/config.yml

# Python CircleCI 2.1 configuration file.
# Config schema version; 2.1 is the version this file declares for orb usage.
version: 2.1
# Reusable CircleCI packages, pinned to exact versions.
orbs:
  # Supplies the gcp-gke/install and
  # gcp-gke/update-kubeconfig-with-credentials steps used by the jobs below.
  gcp-gke: circleci/gcp-gke@1.0.4
  # Supplies the go/install step used by the TPU-tests job.
  go: circleci/go@1.3.0
  # Supplies the codecov/upload step used by the TPU-tests job.
  codecov: codecov/codecov@1.1.0

references:

  make_docs: &make_docs
    run:
      name: Make Documentation
      command: |
        # Reproduce the Read-The-Docs build pipeline; this step is meant to run
        # in the image from https://hub.docker.com/r/readthedocs/build
        # (disabled) apt-get update && apt-get install -y cmake
        # Python 3.7 or higher is required because of an issue with metaclass
        # inheritance.
        pyenv global 3.7.3
        python --version
        pip install -r requirements/docs.txt
        # Rebuild the HTML docs from a clean tree; SPHINXOPTS="-W" makes Sphinx
        # treat warnings as errors.
        cd docs
        make clean
        make html --debug --jobs 2 SPHINXOPTS="-W"

  checkout_ml_testing: &checkout_ml_testing
    run:
      name: Checkout ml-testing-accelerators
      command: |
        # Clone the ml-testing-accelerators repository into the working
        # directory, then pin it to a fixed commit that is exposed locally as
        # the `stable` branch.
        git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
        git -C ml-testing-accelerators fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
        git -C ml-testing-accelerators checkout stable

  build_push_docker: &build_push_docker
    run:
      name: Build and push Docker image
      command: |
        # Builds the TPU test image from dockers/tpu-tests and pushes it to the
        # registry, tagged with this workflow job's ID.
        gcloud --quiet auth configure-docker
        cd dockers/tpu-tests
        image="$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
        # TODO: How to find the GITHUB_REF in CircleCI?
        # $CI_PULL_REQUEST seems to be of form:
        #   https://github.com/org/repo-name/pull/11
        if [ -z "$CI_PULL_REQUEST" ]; then
          # Not a pull-request build: let the Dockerfile use its default ref.
          docker build --tag "$image" -f Dockerfile --build-arg "TEST_IMAGE=1" .
        else
          # Take the last URL segment (the PR number) and turn it into
          # pull/<number>/head so Docker can pull the latest pending code in
          # the PR. Using the last segment avoids hard-coding the repository
          # name in the pattern.
          git_ref="pull/${CI_PULL_REQUEST##*/}/head"
          docker build --tag "$image" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=$git_ref" .
        fi
        docker push "$image"

  deploy_cluster: &deploy_cluster
    run:
      name: Deploy the job on the kubernetes cluster
      command: |
        # Renders the TPU test job from jsonnet, submits it to the cluster,
        # polls until it fails/succeeds (or the polling budget runs out),
        # prints the pod's test log, deletes the pushed image and finally
        # exits with a status reflecting the job outcome:
        #   0 = job succeeded, 1 = job failed,
        #   2 = job still unfinished after MAX_CHECKS polls.
        # Stated explicitly so any unexpected command failure aborts the step
        # (CircleCI's default bash options are expected to be the same).
        set -eo pipefail
        go get github.com/google/go-jsonnet/cmd/jsonnet
        export PATH=$PATH:$HOME/go/bin
        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image="$GCR_IMAGE_PATH" --ext-str image-tag="$CIRCLE_WORKFLOW_JOB_ID" | kubectl create -f -)
        # kubectl prints "job.batch/<name> created"; keep only <name>.
        job_name=${job_name#job.batch/}
        job_name=${job_name% created}
        echo "Waiting on kubernetes job: $job_name"
        # Poll up to MAX_CHECKS times, sleeping CHECK_SPEEP seconds between
        # polls. With the TPU-tests values (240 checks, 5s apart) that is
        # roughly 1200s in total. "CHECK_SPEEP" (sic) is the name the job
        # environment defines, so it must be read under that spelling here.
        i=0
        # Pessimistic default: kept if the job never reports a final state.
        status_code=2
        printf "Waiting for job to finish: "
        while [ "$i" -lt "$MAX_CHECKS" ]; do
          i=$((i + 1))
          if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
            status_code=1
            break
          elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
            status_code=0
            break
          else
            printf "."
          fi
          sleep "$CHECK_SPEEP"
        done
        echo "Done waiting. Job status code: $status_code"
        # Find the job's pod through the controller-uid label and ask kubectl
        # for the name directly instead of scraping the human-readable table.
        controller_uid=$(kubectl get job "$job_name" -o "jsonpath={.metadata.labels.controller-uid}")
        pod_name=$(kubectl get po -l "controller-uid=$controller_uid" -o "jsonpath={.items[0].metadata.name}")
        echo "GKE pod name: $pod_name"
        kubectl logs -f "$pod_name" --container=train > /tmp/full_output.txt
        # If the log contains an XML declaration, split there: xx00 receives
        # the test log and xx01 the XML part (renamed to coverage.xml by the
        # Statistics step). Otherwise the whole log becomes xx00.
        if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then
          csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'
        else
          mv /tmp/full_output.txt xx00
        fi
        # First portion is the test logs. Print these to the CircleCI step
        # output.
        cat xx00
        echo "Done with log retrieval attempt."
        gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags
        exit "$status_code"

  stats: &stats
    run:
      name: Statistics
      command: |
        # xx01 is the csplit piece that begins at the XML declaration found
        # in the pod log by the deploy step; publish it as coverage.xml.
        mv xx01 coverage.xml
        # TODO: add human readable report
        # Dump the raw XML, then render a summary table with pycobertura.
        cat coverage.xml
        sudo pip install pycobertura
        pycobertura show coverage.xml

  delete_gke_jobs: &delete_gke_jobs
    run:
      name: Delete GKE Jobs
      command: |
        # Match jobs whose age (4th column) contains a pattern like '1h' or
        # '1d', i.e. any job that has been around longer than 1hr.
        # The job list is fetched once, so the rows printed below are exactly
        # the jobs that get deleted.
        jobs_to_delete=$(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}')
        # Quoted so the matching rows keep their line breaks in the log.
        echo "$jobs_to_delete"
        if [ -n "$jobs_to_delete" ]; then
          # The first column of each matching row is the job name.
          echo "$jobs_to_delete" | awk '{print $1}' | xargs kubectl delete job
        fi

jobs:

  # Builds a test image for the current revision, runs it as a Kubernetes job
  # on the GKE cluster, then publishes the coverage report the job produced.
  TPU-tests:
    docker:
      - image: circleci/python:3.7
    environment:
      # Polling budget read by the deploy step: at most MAX_CHECKS status
      # checks with CHECK_SPEEP seconds of sleep between them.
      # "CHECK_SPEEP" (sic) is the spelling the deploy step reads, so it must
      # not be renamed here alone.
      # Values are quoted because environment variables are strings.
      MAX_CHECKS: "240"
      CHECK_SPEEP: "5"
    steps:
      - checkout
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      # Needed so the following step can run docker build / docker push.
      - setup_remote_docker
      - *build_push_docker
      - *deploy_cluster
      - *stats
      - codecov/upload:
          file: coverage.xml
          flags: tpu,pytest
          upload_name: TPU-coverage
      - store_artifacts:
          path: coverage.xml

  # Builds the HTML documentation inside the Read-The-Docs build image and
  # keeps the result as a browsable build artifact.
  build-Docs:
    docker:
      # NOTE(review): `latest` is a floating tag, so the build environment can
      # change between runs — consider pinning.
      - image: readthedocs/build:latest
    steps:
      - checkout
      - *make_docs
      - store_artifacts:
          # allows us to preview the generated html pages
          path: docs/build/html/
          destination: html

  # Connects to the GKE cluster and deletes Kubernetes jobs that have been
  # around for an hour or more. Run by the scheduled `cleanup` workflow.
  cleanup-gke-jobs:
    docker:
      - image: circleci/python:3.7
    steps:
      # No `checkout` step: the steps below only talk to the cluster.
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - *delete_gke_jobs


workflows:
  version: 2
  # Main workflow: documentation build plus the TPU test run.
  build:
    jobs:
      # No filters are set on the docs build.
      - build-Docs
      # TPU tests are skipped for the master branch.
      - TPU-tests:
          filters:
            branches:
              # https://discuss.circleci.com/t/create-separate-steps-jobs-for-pr-forks-versus-branches/13419/4
              #only:
              #  # only from forks
              #  - /^pull\/.\d+$/
              ignore:
                - master
  # Scheduled housekeeping: removes old Kubernetes jobs from the cluster.
  cleanup:
    triggers:
      - schedule:
          # The cron format is:
          # min (0-59) hour (0-23) monthday (1-31) month (1-12) weekday (0-6, 0=Sun)
          # Set to run at minute 0 of every hour.
          cron: "0 * * * *"
          filters:
            branches:
              only:
                - master
    jobs:
      - cleanup-gke-jobs