Add Github Action to run TPU tests. (#2376)

* Add Github Action to run TPU tests.

* Trigger new Github Actions run.

* Clean up more comments.

* Use different fixed version of ml-testing-accelerators and update config to match.

* use cluster in us-central1-a

* Run 'gcloud logging read' directly without 'echo' to preserve newlines.

* cat coverage.xml on the TPU VM side and upload xml on the Github Action side

* Use new commit on ml-testing-accelerators so command runs fully.

* Preserve newlines in the xml and use if: always() temporarily to upload codecov

* Use pytorch_lightning for coverage instead of pytorch-lightning

* Remove the debug cat of coverage xml

* Apply suggestions from code review

* jsonnet rename

* name

* add codecov flags

* add codecov flags

* codecov

* codecov

* revert codecov

* Clean up after apt-get and remove old TODOs.

* More codefactor cleanups.

* drone

* drone

* disable codecov

* cleaning

* docker py versions

* docker py 3.7

* readme

* bash

* docker

* freeze conda

* py3.6

* Stop using apt-get clean.

* Don't rm pytorch-lightning

* Update docker/tpu/Dockerfile

* Longer timeout in the Github Action to wait for GKE to finish.

* job1

* job2

* job3

Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com>
Co-authored-by: Jirka <jirka@pytorchlightning.ai>
This commit is contained in:
zcain117 2020-07-01 18:44:19 -07:00 committed by GitHub
parent dcd6000be7
commit 1a40963d1d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 230 additions and 3 deletions

View File

@ -50,5 +50,4 @@ comment:
layout: header, diff
require_changes: false
behavior: default # update if exists else create new
# branches: *
# after_n_builds: 2

View File

@ -43,7 +43,8 @@ steps:
- python -m py.test benchmarks pl_examples -v --maxfail=2 --durations=0 # --flake8
#- cd docs; make doctest; make coverage
- coverage report
- codecov --token $CODECOV_TOKEN # --pr $DRONE_PULL_REQUEST --build $DRONE_BUILD_NUMBER --branch $DRONE_BRANCH --commit $DRONE_COMMIT --tag $DRONE_TAG
# see: https://docs.codecov.io/docs/merging-reports
- codecov --token $CODECOV_TOKEN # --pr $DRONE_PULL_REQUEST --build $DRONE_BUILD_NUMBER --branch $DRONE_BRANCH --commit $DRONE_COMMIT --tag $DRONE_TAG
- python tests/collect_env_details.py
trigger:

113
.github/workflows/tpu-testing.yml vendored Normal file
View File

@ -0,0 +1,113 @@
# Builds a test image for the current commit, runs the TPU test job on a GKE
# cluster, prints the job logs and removes the image again.
name: TPU tests

on:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

env:
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  GKE_CLUSTER: lightning-cluster
  GKE_ZONE: us-central1-a
  IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image

jobs:
  setup-build-publish-deploy:
    name: tpu-testing-job
    runs-on: ubuntu-latest
    steps:
      # Go is only needed to install the jsonnet binary further below.
      - name: Install Go
        uses: actions/setup-go@v2
        with:
          go-version: 1.14.x

      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          ref: ${{ github.event.pull_request.head.sha }}
          path: main

      # Pinned to a fixed commit so the jsonnet templates match the test config.
      - name: Checkout ml-testing-accelerators
        uses: actions/checkout@v2
        with:
          repository: GoogleCloudPlatform/ml-testing-accelerators
          path: ml-testing-accelerators
          ref: '5e88ac24f631c27045e62f0e8d5dfcf34e425e25'

      - name: Setup gcloud CLI
        uses: GoogleCloudPlatform/github-actions/setup-gcloud@master
        with:
          version: '290.0.1'
          service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }}
          project_id: ${{ secrets.GKE_PROJECT }}
          export_default_credentials: true

      # Configure Docker to use the gcloud command-line tool as a credential helper for authentication.
      - name: Configure Docker
        run: |-
          gcloud --quiet auth configure-docker
        shell: bash

      # The image is tagged with the run id so every workflow run gets its own image.
      - name: Build and Push Docker Image
        run: |
          cd main/docker/tpu
          docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" .
          docker push "$IMAGE:$GITHUB_RUN_ID"
        shell: bash

      - name: Install jsonnet
        run: |-
          go get github.com/google/go-jsonnet/cmd/jsonnet
        shell: bash

      # Get the GKE credentials so we can deploy to the cluster
      # Use either zone or region depending on cluster setup.
      - run: |-
          gcloud container clusters get-credentials "$GKE_CLUSTER" --zone "$GKE_ZONE"
        shell: bash

      - name: Deploy the job on the kubernetes cluster
        run: |-
          # Render the job spec and submit it. The job name is recovered from the
          # kubectl output by stripping the "job.batch/" prefix and " created" suffix.
          job_name=$(jsonnet -J ml-testing-accelerators/ main/docker/tpu/tpu_test_cases.jsonnet --ext-str image="$IMAGE" --ext-str image-tag="$GITHUB_RUN_ID" | kubectl create -f -)
          job_name=${job_name#job.batch/}
          job_name=${job_name% created}
          echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER"

          i=0
          # 30 checks spaced 30s apart = 900s total.
          max_checks=30
          status_code=2
          # Check on the job periodically. Set the status code depending on what
          # happened to the job in Kubernetes. If we try max_checks times and
          # still the job hasn't finished, give up and return the starting
          # non-zero status code.
          while [ "$i" -lt "$max_checks" ]; do
            # Plain arithmetic expansion: ((i++)) returns a failing status when i is 0.
            i=$((i + 1))
            if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
              status_code=1
              break
            elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
              status_code=0
              break
            else
              echo "Job not finished yet"
            fi
            sleep 30
          done
          echo "Done waiting. Job status code: $status_code"

          # Allow time for logs to flush.
          sleep 60
          echo "JOB_NAME: $job_name"
          echo "GKE_CLUSTER: $GKE_CLUSTER"
          echo "GKE_ZONE: $GKE_ZONE"

          # Log retrieval is best effort: a failure here must neither hide the
          # job result nor prevent the image cleanup below.
          if gcloud logging read \
            "resource.type=k8s_container resource.labels.project_id=$PROJECT_ID resource.labels.location=$GKE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" \
            --limit 10000000 --order asc --format 'value(textPayload)' \
            --project="$PROJECT_ID" > /tmp/full_output.txt; then
            # Split the output at the start of the coverage XML: xx00 receives the
            # test logs and xx01 the XML report.
            if csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; then
              # First portion is the test logs. Print these to Github Action stdout.
              cat xx00
            else
              # No coverage XML in the output; show everything that was logged.
              cat /tmp/full_output.txt
            fi
          else
            echo "Could not read the job logs."
          fi
          echo "Done with log retrieval attempt."

          # Always remove the per-run image, whatever happened above.
          cleanup_status=0
          gcloud container images delete "$IMAGE:$GITHUB_RUN_ID" --force-delete-tags || cleanup_status=$?

          # The job result takes precedence; a failed cleanup still fails the step.
          if [ "$status_code" -ne 0 ]; then
            exit "$status_code"
          fi
          exit "$cleanup_status"
        shell: bash

      # todo: to be used after enable merging reports from different CIs
      # - name: Upload coverage to Codecov
      #   uses: codecov/codecov-action@v1
      #   if: always()
      #   with:
      #     token: ${{ secrets.CODECOV_TOKEN }}
      #     file: ./xx01
      #     flags: tpu,pytest
      #     # env_vars: OS,PYTHON
      #     # name: codecov-umbrella
      #     fail_ci_if_error: true

View File

@ -42,6 +42,7 @@
| :---: | :---: | :---: | :---: |
| Conda py3.7 [linux] | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) |
| Linux py3.7 [GPU] | - | - | [![Build Status](http://35.192.60.23/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://35.192.60.23/PyTorchLightning/pytorch-lightning) |
| Linux py3.6 [TPU] | - | - | ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg) |
| Linux py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
| OSX py3.6 / py3.7 | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
| Windows py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |[![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |

72
docker/tpu/Dockerfile Normal file
View File

@ -0,0 +1,72 @@
FROM google/cloud-sdk:slim

# Build args.
# GITHUB_REF: git ref of pytorch-lightning that gets installed into the image.
# TEST_IMAGE: set to 1 to keep the sources and install the test requirements.
ARG GITHUB_REF=refs/heads/master
ARG TEST_IMAGE=0

# This Dockerfile installs the pytorch/xla nightly wheels matching
# PYTHON_VERSION; the wheel names below are derived from this value
# (3.6 -> cp36-cp36m). Python 3.7 wheels are also available.
ENV PYTHON_VERSION=3.6

RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        git \
        curl \
        ca-certificates

# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
    chmod +x ~/miniconda.sh && \
    ~/miniconda.sh -b && \
    rm ~/miniconda.sh

ENV PATH=/root/miniconda3/bin:$PATH

RUN conda create -y --name container python=$PYTHON_VERSION

# Run the rest of commands within the new conda env.
# Use absolute path to appease Codefactor.
SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
RUN conda install -y python=$PYTHON_VERSION mkl

# Replace any preinstalled torch with the nightly torch / torch_xla / torchvision
# wheels. The file names are double-quoted so that bash expands
# ${PYTHON_VERSION/./} (3.6 -> 36); bash does not expand single-quoted strings.
RUN pip uninstall -y torch && \
    gsutil cp "gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" . && \
    gsutil cp "gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" . && \
    gsutil cp "gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" . && \
    pip install "torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    pip install "torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    pip install "torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    rm "torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    rm "torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    rm "torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    apt-get install -y libomp5

# Absolute path to the libraries of the conda env created above (a relative
# path would be resolved against the current working directory).
ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib

# Install pytorch-lightning at the current PR, plus dependencies.
RUN git clone https://github.com/PyTorchLightning/pytorch-lightning.git && \
    cd pytorch-lightning && \
    git fetch origin $GITHUB_REF:CI && \
    git checkout CI && \
    cd .. && \
    pip install ./pytorch-lightning

# If using this image for tests, install more dependencies and don't delete
# the source code where the tests live.
RUN if [ $TEST_IMAGE -eq 1 ] ; then \
        pip install -r pytorch-lightning/requirements/test.txt ; \
    else \
        rm -rf pytorch-lightning ; \
    fi

RUN conda init bash

# Sanity check: the build fails here if pytorch_lightning cannot be imported.
RUN python -c "import pytorch_lightning as pl; print(pl.__version__)"

COPY docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh

ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
CMD ["bash"]

View File

@ -0,0 +1,8 @@
#!/bin/bash
# Container entrypoint: activates the "container" conda env, derives
# XRT_TPU_CONFIG from KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS and then hands control
# over to the command passed to the container.

# Load the shell setup from ~/.bashrc before calling `conda activate`.
. ~/.bashrc
echo "running docker-entrypoint.sh"
conda activate container

# Print the endpoint value for debugging.
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
echo "printed TPU info"

# Drop the first 7 characters of the endpoint value
# (presumably a "grpc://" scheme prefix - TODO confirm).
tpu_endpoint="${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
XRT_TPU_CONFIG="tpu_worker;0;${tpu_endpoint}"
export XRT_TPU_CONFIG

# Replace this shell with the requested command.
exec "$@"

View File

@ -0,0 +1,33 @@
local base = import 'templates/base.libsonnet';
local tpus = import 'templates/tpus.libsonnet';
local utils = import 'templates/utils.libsonnet';

# Test definition for the PyTorch Lightning TPU tests. The image and its tag
# are supplied by the caller through the `image` and `image-tag` external
# variables.
local tputests = base.BaseTest {
  frameworkPrefix: 'pl',
  modelName: 'tpu-tests',
  mode: 'postsubmit',
  configMaps: [],

  timeout: 900, # 15 minutes, in seconds.

  image: std.extVar('image'),
  imageTag: std.extVar('image-tag'),

  tpuSettings+: {
    softwareVersion: 'pytorch-nightly',
  },
  accelerator: tpus.v3_8,

  # Runs the TPU tests under coverage, then prints the coverage XML so it ends
  # up in the job logs. The exit code of the test run is saved first and
  # re-checked on the last line, so the script result reflects the tests and
  # not the coverage commands.
  command: utils.scriptCommand(
    |||
      coverage run --source=pytorch_lightning -m pytest pytorch-lightning/tests/models/test_tpu.py -v
      test_exit_code=$?
      # printf instead of echo: bash's echo prints "\n" literally.
      printf '\n||| END PYTEST LOGS |||\n\n'
      coverage xml
      cat coverage.xml | tr -d '\t'
      test $test_exit_code -eq 0
    |||
  ),
};

tputests.oneshotJob