name: TPU tests

# Run on pushes to master and on pull requests that target master.
on:
  push:
    branches:
      - master
  # TODO: temporarily disable TPU testing until we find a way to pass
  # credentials to forked PRs.
  pull_request:
    branches:
      - master
# Workflow-level environment variables.
#   PROJECT_ID   - GCP project, used by the Cloud Logging query.
#   GKE_CLUSTER  - cluster the test job is deployed to.
#   GKE_ZONE     - zone passed to `gcloud container clusters get-credentials`.
#   IMAGE        - repository of the test image; the run id is used as its tag.
env:
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  GKE_CLUSTER: lightning-cluster
  GKE_ZONE: us-central1-a
  IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image
jobs:
  # Single job: build a test image, run it as a Kubernetes job on the GKE
  # cluster, fetch its logs, and publish the coverage report found in them.
  setup-build-publish-deploy:
    name: tpu-testing-job
    runs-on: ubuntu-20.04
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50
    steps:
      # Go is required by the "Install jsonnet" step (`go get`).
      - name: Install Go
        uses: actions/setup-go@v2
        with:
          go-version: '1.14.x'

      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          # Quoted so the version stays a string rather than a YAML float.
          python-version: '3.7'

      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          # NOTE(review): this expression is only populated for pull_request
          # events; confirm the fallback on push events is what is wanted.
          ref: ${{ github.event.pull_request.head.sha }}

      # Checked out into ./ml-testing-accelerators at a pinned commit; the
      # directory is passed to jsonnet as a library path (-J) further down.
      - name: Checkout ml-testing-accelerators
        uses: actions/checkout@v2
        with:
          repository: GoogleCloudPlatform/ml-testing-accelerators
          path: ml-testing-accelerators
          ref: '5e88ac24f631c27045e62f0e8d5dfcf34e425e25'

      # NOTE(review): `@master` tracks a moving branch; consider pinning to a
      # release tag or commit.
      - name: Setup gcloud CLI
        uses: GoogleCloudPlatform/github-actions/setup-gcloud@master
        with:
          version: '290.0.1'
          service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }}
          project_id: ${{ secrets.GKE_PROJECT }}
          export_default_credentials: true

      # Configure Docker to use the gcloud command-line tool as a credential
      # helper for authentication.
      - name: Configure Docker
        run: |-
          gcloud --quiet auth configure-docker
        shell: bash

      # Build the image from dockers/tpu-tests and push it tagged with the
      # workflow run id.
      - name: Build and Push Docker Image
        run: |
          cd dockers/tpu-tests
          docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" .
          docker push "$IMAGE:$GITHUB_RUN_ID"
        shell: bash

      - name: Install jsonnet
        run: |-
          go get github.com/google/go-jsonnet/cmd/jsonnet
        shell: bash

      # Get the GKE credentials so we can deploy to the cluster
      # Use either zone or region depending on cluster setup.
      - run: |-
          gcloud container clusters get-credentials "$GKE_CLUSTER" --zone "$GKE_ZONE"
        shell: bash

      # Render the jsonnet test spec and create the Kubernetes job, poll until
      # it succeeds or fails (or the checks run out), read the container logs
      # from Cloud Logging, split them at the XML declaration (xx00 = test
      # logs, xx01 = coverage XML), delete the pushed image, and exit with
      # 0 (succeeded), 1 (failed) or 2 (not finished after all checks).
      # The whole script is one `&&` chain, so keep every `&& \` continuation
      # intact when editing it.
      - name: Deploy the job on the kubernetes cluster
        run: |-
          job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$IMAGE --ext-str image-tag=$GITHUB_RUN_ID | kubectl create -f -) && \
          job_name=${job_name#job.batch/} && \
          job_name=${job_name% created} && \
          echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \
          i=0 && \
          # 30 checks spaced 30s apart = 900s total.
          max_checks=30 && \
          status_code=2 && \
          # Check on the job periodically. Set the status code depending on what
          # happened to the job in Kubernetes. If we try max_checks times and
          # still the job hasn't finished, give up and return the starting
          # non-zero status code.
          while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
          echo "Done waiting. Job status code: $status_code" && \
          # Allow time for logs to flush.
          sleep 60 && \
          echo "JOB_NAME: $job_name" && \
          echo "GKE_CLUSTER: $GKE_CLUSTER" && \
          echo "GKE_ZONE: $GKE_ZONE" && \
          gcloud logging read "resource.type=k8s_container resource.labels.project_id=$PROJECT_ID resource.labels.location=$GKE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$PROJECT_ID > /tmp/full_output.txt && \
          if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
          # First portion is the test logs. Print these to Github Action stdout.
          cat xx00 && \
          echo "Done with log retrieval attempt." && \
          gcloud container images delete "$IMAGE:$GITHUB_RUN_ID" --force-delete-tags && \
          exit $status_code
        shell: bash

      # xx01 is the second piece written by csplit in the previous step, i.e.
      # everything from the XML declaration onwards. It only exists when the
      # logs contained that declaration.
      - name: Statistics
        if: success()
        run: |
          mv ./xx01 coverage
          # TODO: add human readable report
          cat coverage
          # sudo pip install pycobertura
          # pycobertura show coverage.xml

      # NOTE(review): `@master` tracks a moving branch; consider pinning to a
      # release tag.
      - name: Upload coverage results
        uses: actions/upload-artifact@master
        with:
          name: coverage-TPU
          path: coverage

      # Runs even when earlier steps failed (`always()`); with
      # `fail_ci_if_error` an upload error fails the job.
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        if: always()
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage
          flags: tpu,pytest
          name: TPU-coverage
          fail_ci_if_error: true