Add Github Action to run TPU tests. (#2376)

* Add Github Action to run TPU tests. * Trigger new Github Actions run. * Clean up more comments. * Use different fixed version of ml-testing-accelerators and update config to match. * use cluster in us-central1-a * Run 'gcloud logging read' directly without 'echo' to preserve newlines. * cat coverage.xml on the TPU VM side and upload xml on the Github Action side * Use new commit on ml-testing-accelerators so command runs fully. * Preserve newlines in the xml and use if: always() temporarily to upload codecov * Use pytorch_lightning for coverage instead of pytorch-lightning * Remove the debug cat of coverage xml * Apply suggestions from code review * jsonnet rename * name * add codecov flags * add codecov flags * codecov * codecov * revert codecov * Clean up after apt-get and remove old TODOs. * More codefactor cleanups. * drone * drone * disable codecov * cleaning * docker py versions * docker py 3.7 * readme * bash * docker * freeze conda * py3.6 * Stop using apt-get clean. * Dont rm pytorch-lightning * Update docker/tpu/Dockerfile * Longer timeout in the Github Action to wait for GKE to finish. * job1 * job2 * job3 Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka <jirka@pytorchlightning.ai>
2020-07-01 18:44:19 -07:00 · 2020-07-01 18:44:19 -07:00 · 1a40963d1d
parent dcd6000be7
commit 1a40963d1d
7 changed files with 230 additions and 3 deletions
--- a/.codecov.yml
+++ b/.codecov.yml
@ -50,5 +50,4 @@ comment:
  layout: header, diff
  require_changes: false
  behavior: default  # update if exists else create new
-  # branches: *
-
+  # after_n_builds: 2
--- a/.drone.yml
+++ b/.drone.yml
@ -43,7 +43,8 @@ steps:
    - python -m py.test benchmarks pl_examples -v --maxfail=2  --durations=0 # --flake8
    #- cd docs; make doctest; make coverage
    - coverage report
-    - codecov --token $CODECOV_TOKEN  # --pr $DRONE_PULL_REQUEST --build $DRONE_BUILD_NUMBER --branch $DRONE_BRANCH --commit $DRONE_COMMIT --tag $DRONE_TAG
+    # see: https://docs.codecov.io/docs/merging-reports
+    - codecov --token $CODECOV_TOKEN # --pr $DRONE_PULL_REQUEST --build $DRONE_BUILD_NUMBER --branch $DRONE_BRANCH --commit $DRONE_COMMIT --tag $DRONE_TAG
    - python tests/collect_env_details.py

 trigger:
--- a/.github/workflows/tpu-testing.yml
+++ b/.github/workflows/tpu-testing.yml
@ -0,0 +1,113 @@
+name: TPU tests
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+env:
+  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
+  GKE_CLUSTER: lightning-cluster
+  GKE_ZONE: us-central1-a
+  IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image
+
+jobs:
+  setup-build-publish-deploy:
+    name: tpu-testing-job
+    runs-on: ubuntu-latest
+
+    steps:
+    - name: Install Go
+      uses: actions/setup-go@v2
+      with:
+        go-version: 1.14.x
+
+    - name: Checkout Pytorch Lightning
+      uses: actions/checkout@v2
+      with:
+        repository: PyTorchLightning/pytorch-lightning
+        ref: ${{ github.event.pull_request.head.sha }}
+        path: main
+
+    - name: Checkout ml-testing-accelerators
+      uses: actions/checkout@v2
+      with:
+        repository: GoogleCloudPlatform/ml-testing-accelerators
+        path: ml-testing-accelerators
+        ref: 5e88ac24f631c27045e62f0e8d5dfcf34e425e25
+
+    - name: Setup gcloud CLI
+      uses: GoogleCloudPlatform/github-actions/setup-gcloud@master
+      with:
+        version: '290.0.1'
+        service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }}
+        project_id: ${{ secrets.GKE_PROJECT }}
+        export_default_credentials: true
+
+    # Configure Docker to use the gcloud command-line tool as a credential helper for authentication.
+    - name: Configure Docker
+      run: |-
+        gcloud --quiet auth configure-docker
+      shell: bash
+    - name: Build and Push Docker Image
+      run: |
+        cd main/docker/tpu
+        docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" .
+        docker push "$IMAGE:$GITHUB_RUN_ID"
+      shell: bash
+
+    - name: Install jsonnet
+      run: |-
+        go get github.com/google/go-jsonnet/cmd/jsonnet
+      shell: bash
+    # Get the GKE credentials so we can deploy to the cluster
+    # Use either zone or region depending on cluster setup.
+    - run: |-
+        gcloud container clusters get-credentials "$GKE_CLUSTER" --zone "$GKE_ZONE"
+      shell: bash
+
+    - name: Deploy the job on the kubernetes cluster
+      run: |-
+        job_name=$(jsonnet -J ml-testing-accelerators/ main/docker/tpu/tpu_test_cases.jsonnet --ext-str image=$IMAGE --ext-str image-tag=$GITHUB_RUN_ID | kubectl create -f -) && \
+        job_name=${job_name#job.batch/} && \
+        job_name=${job_name% created} && \
+        echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \
+        i=0 && \
+        # 30 checks spaced 30s apart = 900s total.
+        max_checks=30 && \
+        status_code=2 && \
+        # Check on the job periodically. Set the status code depending on what
+        # happened to the job in Kubernetes. If we try max_checks times and
+        # still the job hasn't finished, give up and return the starting
+        # non-zero status code.
+        while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
+        echo "Done waiting. Job status code: $status_code" && \
+        # Allow time for logs to flush.
+        sleep 60 && \
+        echo "JOB_NAME: $job_name" && \
+        echo "GKE_CLUSTER: $GKE_CLUSTER" && \
+        echo "GKE_ZONE: $GKE_ZONE" && \
+        gcloud logging read "resource.type=k8s_container resource.labels.project_id=$PROJECT_ID resource.labels.location=$GKE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$PROJECT_ID > /tmp/full_output.txt && \
+        { csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/' || cp /tmp/full_output.txt xx00; } && \
+        # xx00 holds the test logs (or the whole output when no coverage XML was found). Print it to Github Action stdout.
+        cat xx00 && \
+        echo "Done with log retrieval attempt." && \
+        gcloud container images delete "$IMAGE:$GITHUB_RUN_ID" --force-delete-tags && \
+        exit $status_code
+      shell: bash
+
+    # todo: to be used after enable merging reports from different CIs
+    #- name: Upload coverage to Codecov
+    #  uses: codecov/codecov-action@v1
+    #  if: always()
+    #  with:
+    #    token: ${{ secrets.CODECOV_TOKEN }}
+    #    file: ./xx01
+    #    flags: tpu,pytest
+    #    # env_vars: OS,PYTHON
+    #    # name: codecov-umbrella
+    #    fail_ci_if_error: true
+
--- a/README.md
+++ b/README.md
@ -42,6 +42,7 @@
 | :---: | :---: | :---: | :---: |
 | Conda py3.7 [linux] | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) |
 | Linux py3.7 [GPU] | - | - | [![Build Status](http://35.192.60.23/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://35.192.60.23/PyTorchLightning/pytorch-lightning) |
+| Linux py3.6 [TPU] | - | - | ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg) |
 | Linux py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
 | OSX py3.6 / py3.7 | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
 | Windows py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |[![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |
--- a/docker/tpu/Dockerfile
+++ b/docker/tpu/Dockerfile
@ -0,0 +1,72 @@
+FROM google/cloud-sdk:slim
+
+# Build args.
+ARG GITHUB_REF=refs/heads/master
+ARG TEST_IMAGE=0
+
+# This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7
+# wheels available; see below.
+ENV PYTHON_VERSION=3.6
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+         build-essential \
+         cmake \
+         git \
+         curl \
+         ca-certificates
+
+# Install conda and python.
+# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
+RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh  && \
+    chmod +x ~/miniconda.sh && \
+    ~/miniconda.sh -b && \
+    rm ~/miniconda.sh
+
+ENV PATH=/root/miniconda3/bin:$PATH
+
+RUN conda create -y --name container python=$PYTHON_VERSION
+
+# Run the rest of commands within the new conda env.
+# Use absolute path to appease Codefactor.
+SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
+RUN conda install -y python=$PYTHON_VERSION mkl
+
+RUN pip uninstall -y torch && \
+    # Wheel tags are derived from PYTHON_VERSION (3.6 -> cp36-cp36m). Python 3.7 wheels are also available.
+    gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
+    gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
+    gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
+    pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    apt-get install -y libomp5
+
+ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib
+
+# Install pytorch-lightning at the current PR, plus dependencies.
+RUN git clone https://github.com/PyTorchLightning/pytorch-lightning.git && \
+    cd pytorch-lightning && \
+    git fetch origin $GITHUB_REF:CI && \
+    git checkout CI && \
+    cd .. && \
+    pip install ./pytorch-lightning
+
+# If using this image for tests, install more dependencies and don't delete
+# the source code where the tests live.
+RUN if [ $TEST_IMAGE -eq 1 ] ; then \
+        pip install -r pytorch-lightning/requirements/test.txt ; \
+    else \
+        rm -rf pytorch-lightning ; \
+    fi
+
+RUN conda init bash
+RUN python -c "import pytorch_lightning as pl; print(pl.__version__)"
+
+COPY docker-entrypoint.sh /usr/local/bin/
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh
+
+ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
+CMD ["bash"]
--- a/docker/tpu/docker-entrypoint.sh
+++ b/docker/tpu/docker-entrypoint.sh
@ -0,0 +1,8 @@
+#!/bin/bash
+source ~/.bashrc
+echo "running docker-entrypoint.sh"
+conda activate container
+echo "$KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS"
+echo "printed TPU info"
+export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
+exec "$@"
--- a/docker/tpu/tpu_test_cases.jsonnet
+++ b/docker/tpu/tpu_test_cases.jsonnet
@ -0,0 +1,33 @@
+local base = import 'templates/base.libsonnet';
+local tpus = import 'templates/tpus.libsonnet';
+local utils = import "templates/utils.libsonnet";
+
+local tputests = base.BaseTest {
+  frameworkPrefix: 'pl',
+  modelName: 'tpu-tests',
+  mode: 'postsubmit',
+  configMaps: [],
+
+  timeout: 900, # 15 minutes, in seconds.
+
+  image: std.extVar('image'),
+  imageTag: std.extVar('image-tag'),
+
+  tpuSettings+: {
+    softwareVersion: 'pytorch-nightly',
+  },
+  accelerator: tpus.v3_8,
+
+  command: utils.scriptCommand(
+    |||
+      coverage run --source=pytorch_lightning -m pytest pytorch-lightning/tests/models/test_tpu.py -v
+      test_exit_code=$?
+      echo "\n||| END PYTEST LOGS |||\n"
+      coverage xml
+      cat coverage.xml | tr -d '\t'
+      test $test_exit_code -eq 0
+    |||
+  ),
+};
+
+tputests.oneshotJob