# lightning/.circleci/config.yml

# Python CircleCI 2.1 configuration file.
# CircleCI config schema version.
version: 2.1
# Orbs whose commands are invoked from the TPU-tests job steps
# (go/install, gcp-gke/*, codecov/upload).
orbs:
  codecov: codecov/codecov@1.1.0
  gcp-gke: circleci/gcp-gke@1.4.0
  go: circleci/go@1.7.1
# Pipeline parameters; each one is a string that defaults to empty.
# NOTE(review): the GHA_ prefix suggests these are supplied by the GitHub
# Action that triggers the pipeline -- confirm against the trigger.
parameters:
  # Not referenced by any job or workflow shown in this file.
  GHA_Actor:
    type: string
    default: ''
  # Used as the `when` condition of the test-on-tpus workflow.
  GHA_Action:
    type: string
    default: ''
  # Not referenced by any job or workflow shown in this file.
  GHA_Event:
    type: string
    default: ''
  # Exported as PR_NUMBER in the "Update jsonnet" step.
  GHA_Meta:
    type: string
    default: ''

references:

  # Reusable step (anchor `make_docs`): selects Python 3.7.3 through pyenv,
  # installs the package in editable mode together with
  # requirements/pytorch/docs.txt, then runs a clean `make html` inside docs/
  # with two parallel jobs and SPHINXOPTS="-W".
  make_docs: &make_docs
    run:
      name: Make Documentation
      command: |
        # the image uses python 2.7 by default, force a different version
        pyenv global 3.7.3
        python --version
        pip install -e . -r requirements/pytorch/docs.txt
        pip list
        cd docs
        make clean
        make html --jobs 2 SPHINXOPTS="-W"

  # Reusable step (anchor `checkout_ml_testing`): clones the
  # ml-testing-accelerators repository into the working directory and checks
  # out one pinned commit, fetched into a local branch named "stable".
  checkout_ml_testing: &checkout_ml_testing
    run:
      name: Checkout ml-testing-accelerators
      command: |
        git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
        cd ml-testing-accelerators
        git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
        git checkout stable
        cd ..

  # Reusable step (anchor `install_jsonnet`): installs the go-jsonnet
  # `jsonnet` command with `go install`.
  # NOTE(review): `@latest` is unpinned, so the installed version can differ
  # between runs -- consider pinning a release tag.
  # NOTE(review): the deploy step adds $HOME/go/bin to PATH, presumably because
  # that is where this binary ends up -- confirm.
  install_jsonnet: &install_jsonnet
    run:
      name: Install jsonnet
      command: |
        go install github.com/google/go-jsonnet/cmd/jsonnet@latest

  # Reusable step (anchor `update_jsonnet`): rewrites
  # dockers/tpu-tests/tpu_test_cases.jsonnet in place, replacing the
  # {PYTORCH_VERSION}, {PYTHON_VERSION}, {PR_NUMBER} and {SHA} placeholders
  # with $XLA_VER, $PYTHON_VER, $PR_NUMBER and the short HEAD commit hash,
  # then prints the resulting file.
  # NOTE(review): the shell splices these values straight into Python source,
  # so a quote character in any of them would break the inline script --
  # confirm the inputs are always plain tokens.
  update_jsonnet: &update_jsonnet
    run:
      name: Update jsonnet
      environment:
        PR_NUMBER: << pipeline.parameters.GHA_Meta >>
      command: |
        export SHA=$(git rev-parse --short HEAD)
        python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
        data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
        cat dockers/tpu-tests/tpu_test_cases.jsonnet

  # Reusable step (anchor `deploy_cluster`): renders the TPU test job with
  # jsonnet, submits it with `kubectl create`, polls the job until it reports a
  # failure or a success (at most MAX_CHECKS polls, CHECK_SLEEP seconds apart),
  # then streams the pod's `train` container log to /tmp/full_output.txt.
  # When the log contains an XML declaration it is split there with csplit
  # (into xx00 and xx01); otherwise the whole log becomes xx00. xx00 is
  # printed, and the step exits with 0 (succeeded), 1 (failed) or 2 (neither
  # was observed while polling).
  # Reads MAX_CHECKS and CHECK_SLEEP from the job environment.
  deploy_cluster: &deploy_cluster
    run:
      name: Deploy the job on the kubernetes cluster
      command: |
        export PATH=$PATH:$HOME/go/bin
        # Stop right away when the job cannot be created; otherwise everything
        # below would run (and poll for a long time) with an empty job name.
        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) || { echo "Failed to create the kubernetes job" >&2; exit 1; }
        job_name=${job_name#job.batch/}
        job_name=${job_name% created}
        pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}')
        echo "GKE pod name: $pod_name"
        echo "Waiting on kubernetes job: $job_name"
        i=0 && \
        # Up to MAX_CHECKS checks spaced CHECK_SLEEP seconds apart.
        status_code=2 && \
        # Check on the job periodically. Set the status code depending on what
        # happened to the job in Kubernetes. If we try MAX_CHECKS times and
        # still the job hasn't finished, give up and return the starting
        # non-zero status code.
        printf "Waiting for job to finish: " && \
        while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SLEEP; done && \
        echo "Done waiting. Job status code: $status_code" && \
        kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
        if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
        # First portion is the test logs. Print these to Github Action stdout.
        cat xx00 && \
        echo "Done with log retrieval attempt." && \
        exit $status_code

  # Reusable step (anchor `stats`): renames ./xx01 to coverage.xml.
  # NOTE(review): xx01 appears to be the second piece written by the csplit
  # call in the deploy step, i.e. the XML part of the pod log -- confirm.
  stats: &stats
    run:
      name: Statistics
      command: |
        mv ./xx01 coverage.xml

jobs:

  # Submits the TPU test job to a GKE cluster, waits for it to finish and
  # uploads the coverage report it produced.
  TPU-tests:
    docker:
      - image: circleci/python:3.7
    environment:
      # Values are quoted so YAML keeps them as strings; an unquoted version
      # such as 1.10 or 3.10 would be read as the float 1.1 / 3.1.
      # XLA_VER and PYTHON_VER are substituted into the jsonnet test
      # definition; MAX_CHECKS and CHECK_SLEEP bound the polling loop of the
      # deploy step.
      - XLA_VER: "1.12"
      - PYTHON_VER: "3.7"
      - MAX_CHECKS: "1000"
      - CHECK_SLEEP: "5"
    steps:
      - checkout
      # Go is needed by the `go install` call of the jsonnet install step.
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      # NOTE(review): $GKE_CLUSTER is not defined in this file; presumably it
      # comes from the project or context environment -- confirm.
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - *install_jsonnet
      - *update_jsonnet
      - *deploy_cluster
      - *stats
      - codecov/upload:
          file: coverage.xml
          flags: tpu,pytest
          upload_name: TPU-coverage

      - store_artifacts:
          path: coverage.xml

  # Builds the HTML documentation and stores it as a build artifact.
  # Not part of any active workflow: its only workflow entry in this file is
  # commented out.
  build-Docs:
    docker:
      - image: readthedocs/build:latest
    steps:
      - checkout
      - run:
          name: Init git submodule
          command: |
            git submodule update --init --recursive
      - *make_docs
      # Keeping the rendered pages as an artifact makes them previewable.
      - store_artifacts:
          path: docs/build/html/
          destination: html

# Workflows. Only test-on-tpus is active; the documentation workflow below is
# commented out and marked FixMe.
workflows:
  #build-docs:  # FixMe
  #  when: << pipeline.parameters.GHA_Action >>
  #  jobs:
  #    - build-Docs
  test-on-tpus:
    # Gated on the GHA_Action pipeline parameter, whose default is the empty
    # string. CircleCI treats an empty string as falsy, so the workflow is
    # skipped unless the trigger supplies a value.
    when: << pipeline.parameters.GHA_Action >>
    jobs:
      - TPU-tests