# lightning/.circleci/config.yml

# Python CircleCI 2.1 configuration file.
version: 2.1
# Orbs whose commands are invoked from the TPU-tests job steps.
orbs:
  # Used for `gcp-gke/install` and `gcp-gke/update-kubeconfig-with-credentials`.
  gcp-gke: circleci/gcp-gke@1.0.4
  # Used for `go/install`.
  go: circleci/go@1.3.0
  # Used for `codecov/upload`.
  codecov: codecov/codecov@1.1.0

# Workflow Steps:
#  1. Checkout
#  2. Install GO
#  3. Checkout ml-testing-accelerators
#  4. GCP GKE install
#  5. Update Kubeconfig with credentials
#  6. Install jsonnet
#  7. Update jsonnet
#  8. Deploy the job on the kubernetes cluster
#  9. Statistics
#  10. Upload coverage results
#  11. Upload coverage to Codecov

references:

  make_docs: &make_docs
    # Builds the HTML documentation with the same pipeline as Read-The-Docs,
    # using the image from https://hub.docker.com/r/readthedocs/build.
    # Python 3.7 or higher is required because of an issue with metaclass
    # inheritance.
    # `SPHINXOPTS="-W"` is passed through `make` to Sphinx; with `-W` Sphinx
    # treats warnings as errors.
    # Prerequisite that is currently left disabled:
    #   apt-get update && apt-get install -y cmake
    run:
      name: Make Documentation
      command: |
        pyenv global 3.7.3
        python --version
        pip install --requirement requirements/docs.txt
        pip list
        cd docs
        make clean
        make html --jobs 2 SPHINXOPTS="-W"

  checkout_ml_testing: &checkout_ml_testing
    # Clones GoogleCloudPlatform/ml-testing-accelerators into the working
    # directory and checks out a fixed commit through a local branch named
    # `stable`. `git -C <dir>` runs each command inside the clone, so the
    # step's own working directory never changes.
    run:
      name: Checkout ml-testing-accelerators
      command: |
        git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
        git -C ml-testing-accelerators fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
        git -C ml-testing-accelerators checkout stable

  install_jsonnet: &install_jsonnet
    # Fetches and builds the go-jsonnet `jsonnet` command with the Go
    # toolchain. The deploy step adds $HOME/go/bin to PATH to find it.
    # NOTE(review): newer Go releases no longer install binaries through
    # `go get`; confirm the Go version set up by `go/install` before upgrading.
    run:
      name: Install jsonnet
      command: go get github.com/google/go-jsonnet/cmd/jsonnet

  update_jsonnet: &update_jsonnet
    # Fills in the placeholders of dockers/tpu-tests/tpu_test_cases.jsonnet in
    # place, then prints the resulting file to the step output:
    #   {PYTORCH_VERSION} <- $XLA_VER
    #   {PYTHON_VERSION}  <- $PYTHON_VER
    #   {PR_NUMBER}       <- third "/"-separated field of the `pull/*/head`
    #                        line reported by `git ls-remote` for HEAD's SHA
    #   {SHA}             <- abbreviated SHA of HEAD
    # The shell expands these variables into the Python source text before
    # Python runs, so the values are assumed not to contain quote characters.
    # The `python -c` program spans two script lines; the second line is a new
    # Python statement and must stay at the script's left margin (same
    # indentation as the other command lines) or Python rejects it.
    # NOTE(review): `export VAR=$(...)` reports the status of `export`, so a
    # commit with no matching pull request presumably just leaves PR_NUMBER
    # empty instead of failing the step - confirm this is intended.
    run:
      name: Update jsonnet
      command: |
       export PR_NUMBER=$(git ls-remote origin "pull/*/head" | grep -F -f <(git rev-parse HEAD) | awk -F'/' '{print $3}')
       export SHA=$(git rev-parse --short HEAD)
       python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
       data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
       cat dockers/tpu-tests/tpu_test_cases.jsonnet

  deploy_cluster: &deploy_cluster
    # Creates the TPU test job on the Kubernetes cluster, waits for it to
    # finish, fetches the pod logs and exits with a code describing the job:
    #   0 = job succeeded, 1 = job failed,
    #   2 = job still unfinished after MAX_CHECKS polls.
    # Reads MAX_CHECKS (number of polls) and CHECK_SPEEP (seconds between
    # polls) from the environment of the job that uses this reference.
    # Leaves xx00 (test logs) and, when the log contains an XML document,
    # xx01 (the XML part) in the working directory for later steps.
    # Every command is its own statement so that, under the `-eo pipefail`
    # bash that CircleCI uses by default, a failing command (for example
    # `kubectl create`) stops the step immediately.
    run:
      name: Deploy the job on the kubernetes cluster
      command: |
        export PATH=$PATH:$HOME/go/bin

        # Render the jsonnet job definition and create the job. kubectl
        # answers "job.batch/<name> created"; strip both ends to get <name>.
        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -)
        job_name=${job_name#job.batch/}
        job_name=${job_name% created}
        echo "Waiting on kubernetes job: $job_name"

        # Poll the job up to MAX_CHECKS times, CHECK_SPEEP seconds apart.
        # status_code keeps its non-zero starting value if the job never
        # reports success or failure within that budget.
        status_code=2
        i=0
        printf "Waiting for job to finish: "
        while [ "$i" -lt "$MAX_CHECKS" ]; do
          i=$((i + 1))
          if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
            status_code=1
            break
          elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
            status_code=0
            break
          else
            printf "."
          fi
          sleep "$CHECK_SPEEP"
        done
        echo "Done waiting. Job status code: $status_code"

        # Find the pod that belongs to the job through its controller-uid
        # label; --no-headers drops the column header row so only pod rows
        # reach awk.
        controller_uid=$(kubectl get job "$job_name" -o "jsonpath={.metadata.labels.controller-uid}")
        pod_name=$(kubectl get po -l "controller-uid=$controller_uid" --no-headers | awk '{print $1}')
        echo "GKE pod name: $pod_name"
        kubectl logs -f "$pod_name" --container=train > /tmp/full_output.txt

        # Split the output at the XML declaration: csplit writes the part
        # before it to xx00 and the rest to xx01. Without an XML document the
        # whole output becomes xx00.
        if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then
          csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'
        else
          mv /tmp/full_output.txt xx00
        fi

        # First portion is the test logs. Print these to the CircleCI step output.
        cat xx00
        echo "Done with log retrieval attempt."
        exit $status_code

  stats: &stats
    # Renames xx01 (presumably the XML part that csplit cut from the pod
    # logs in the deploy step) to coverage.xml, dumps it, then installs
    # pycobertura and prints its summary of the file.
    run:
      name: Statistics
      command: |
        mv xx01 coverage.xml
        # TODO: add human readable report
        cat coverage.xml
        sudo pip install pycobertura
        pycobertura show coverage.xml

jobs:

  TPU-tests:
    # Runs the TPU test suite as a Kubernetes job on GKE and uploads the
    # coverage report it produces.
    docker:
      - image: circleci/python:3.7
    # Values are quoted so that YAML keeps them as exact strings; an unquoted
    # version such as 1.10 would be read as the number 1.1.
    environment:
      # Substituted for {PYTORCH_VERSION} by the update_jsonnet step.
      XLA_VER: "1.8"
      # Substituted for {PYTHON_VERSION} by the update_jsonnet step.
      PYTHON_VER: "3.7"
      # Maximum number of job status polls made by the deploy step.
      MAX_CHECKS: "240"
      # Seconds slept between polls. The spelling is kept as is because the
      # deploy_cluster reference reads `$CHECK_SPEEP`.
      CHECK_SPEEP: "5"
    steps:
      - checkout
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - *install_jsonnet
      - *update_jsonnet
      - *deploy_cluster
      - *stats
      - codecov/upload:
          file: coverage.xml
          flags: tpu,pytest
          upload_name: TPU-coverage
      - store_artifacts:
          path: coverage.xml

  build-Docs:
    # Builds the HTML documentation inside the Read-The-Docs build image and
    # stores the generated pages as a build artifact.
    docker:
      - image: readthedocs/build:latest
    steps:
      - checkout
      - run:
          name: Init git submodule
          command: git submodule update --init --recursive
      - *make_docs
      - store_artifacts:
          # allows us to preview the generated html pages
          path: docs/build/html/
          destination: html

# A single workflow, `ci-tests`, that lists both jobs; no `requires`
# ordering is declared between them.
workflows:
  # NOTE(review): this key is believed to be optional for 2.1 configs - confirm
  # against the CircleCI configuration reference before removing it.
  version: 2
  ci-tests:
    jobs:
      - build-Docs
      - TPU-tests