---
# Workflow identity and triggers: pushes to master and pull requests
# targeting master.
name: TPU tests

on:
  push:
    branches: [master]
  pull_request:
    branches: [master]

# Environment shared by every step of the workflow.
env:
  # Both values are derived from the GKE_PROJECT repository secret.
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image
  # Cluster the test job is submitted to, and the zone it lives in.
  GKE_CLUSTER: 'lightning-cluster'
  GKE_ZONE: 'us-central1-a'

jobs:
  # Builds a test image, submits it as a Kubernetes job on the GKE cluster,
  # waits for the job to finish, prints its logs and reports its result.
  setup-build-publish-deploy:
    name: tpu-testing-job
    runs-on: ubuntu-latest

    steps:
      # Go is used below to install the jsonnet CLI.
      - name: Install Go
        uses: actions/setup-go@v2
        with:
          go-version: '1.14.x'

      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          # Only populated for pull_request events; empty on push.
          ref: ${{ github.event.pull_request.head.sha }}
          path: main

      - name: Checkout ml-testing-accelerators
        uses: actions/checkout@v2
        with:
          repository: GoogleCloudPlatform/ml-testing-accelerators
          path: ml-testing-accelerators
          # Pinned commit of the jsonnet templates.
          ref: '5e88ac24f631c27045e62f0e8d5dfcf34e425e25'

      # Pinned to the v0 line of the action's current repository instead of
      # the mutable `master` ref of the deprecated GoogleCloudPlatform one.
      # v0 still accepts `service_account_key` and
      # `export_default_credentials`.
      - name: Setup gcloud CLI
        uses: google-github-actions/setup-gcloud@v0
        with:
          version: '290.0.1'
          service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }}
          project_id: ${{ secrets.GKE_PROJECT }}
          export_default_credentials: true

      # Configure Docker to use the gcloud command-line tool as a credential
      # helper for authentication.
      - name: Configure Docker
        run: |-
          gcloud --quiet auth configure-docker
        shell: bash

      # The image is tagged with the run id so concurrent runs do not clash.
      - name: Build and Push Docker Image
        run: |
          cd main/docker/tpu
          docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" .
          docker push "$IMAGE:$GITHUB_RUN_ID"
        shell: bash

      - name: Install jsonnet
        run: |-
          go get github.com/google/go-jsonnet/cmd/jsonnet
        shell: bash

      # Get the GKE credentials so we can deploy to the cluster.
      # Use either zone or region depending on cluster setup.
      - name: Get GKE credentials
        run: |-
          gcloud container clusters get-credentials "$GKE_CLUSTER" --zone "$GKE_ZONE"
        shell: bash

      - name: Deploy the job on the kubernetes cluster
        run: |-
          # Abort immediately if the job cannot be rendered or submitted.
          set -eo pipefail

          # kubectl answers with "job.batch/<name> created"; strip both ends
          # to obtain the bare job name.
          job_name=$(jsonnet -J ml-testing-accelerators/ main/docker/tpu/tpu_test_cases.jsonnet \
            --ext-str image="$IMAGE" --ext-str image-tag="$GITHUB_RUN_ID" | kubectl create -f -)
          job_name=${job_name#job.batch/}
          job_name=${job_name% created}
          echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER"

          # 30 checks spaced 30s apart = 900s total.
          max_checks=30
          # Check on the job periodically. Set the status code depending on
          # what happened to the job in Kubernetes. If we try max_checks times
          # and still the job hasn't finished, give up and return the starting
          # non-zero status code.
          status_code=2
          i=0
          while [ "$i" -lt "$max_checks" ]; do
            # Arithmetic assignment instead of ((i++)): the latter returns a
            # non-zero status when i is 0 and would abort under errexit.
            i=$((i + 1))
            if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
              status_code=1
              break
            elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
              status_code=0
              break
            else
              echo "Job not finished yet"
            fi
            sleep 30
          done
          echo "Done waiting. Job status code: $status_code"

          # Allow time for logs to flush.
          sleep 60
          echo "JOB_NAME: $job_name"
          echo "GKE_CLUSTER: $GKE_CLUSTER"
          echo "GKE_ZONE: $GKE_ZONE"

          # Log retrieval is best effort: a failure here must neither hide
          # the job result nor skip the image cleanup below. csplit fails
          # when the XML marker is absent (e.g. the job died before writing
          # its report); in that case print the unsplit logs instead.
          if gcloud logging read "resource.type=k8s_container resource.labels.project_id=$PROJECT_ID resource.labels.location=$GKE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project="$PROJECT_ID" > /tmp/full_output.txt &&
            csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; then
            # First portion is the test logs. Print these to Github Action stdout.
            cat xx00
          elif [ -s /tmp/full_output.txt ]; then
            echo "No XML report marker found in the logs; printing them unsplit."
            cat /tmp/full_output.txt
          else
            echo "No logs could be retrieved for job: $job_name"
          fi
          echo "Done with log retrieval attempt."

          # Remove the per-run test image, then report the job's own result.
          gcloud container images delete "$IMAGE:$GITHUB_RUN_ID" --force-delete-tags
          exit "$status_code"
        shell: bash

      # TODO: enable once coverage reports from different CIs can be merged.
      # - name: Upload coverage to Codecov
      #   uses: codecov/codecov-action@v1
      #   if: always()
      #   with:
      #     token: ${{ secrets.CODECOV_TOKEN }}
      #     file: ./xx01
      #     flags: tpu,pytest
      #     # env_vars: OS,PYTHON
      #     # name: codecov-umbrella
      #     fail_ci_if_error: true