145 lines
5.5 KiB
YAML
145 lines
5.5 KiB
YAML
name: TPU tests

# Triggered by pushes to `master` and to any `release/*` branch.
on:
  push:
    branches:
      - master
      - "release/*"
  # TODO: TPU testing on pull requests is temporarily disabled until we find
  # a way to pass credentials to forked PRs.
  # pull_request:
  #   branches:
  #     - master

# Workflow-level environment, visible to the shell steps of every job.
env:
  # Google Cloud project plus the GKE cluster/zone that the deploy step
  # fetches credentials for and submits the Kubernetes job to.
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  GKE_CLUSTER: lightning-cluster
  GKE_ZONE: us-central1-a
  # Image repository; the build step appends ":$IMAGETAG" to it.
  IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image
  # Upper bound on how many times the deploy step polls the job status.
  MAX_CHECKS: "360"
  # Seconds slept between two polls (360 polls x 5 s = 30 min of sleeping).
  # NOTE(review): "SPEEP" is a misspelling of "SLEEP"; the deploy step reads
  # the variable under this exact name, so rename both places together.
  CHECK_SPEEP: "5"

jobs:
  # Builds a test image, pushes it to the registry, runs it as a Kubernetes
  # job on the GKE cluster, streams the job logs back and publishes the
  # coverage file extracted from those logs.
  setup-build-publish-deploy:
    name: tpu-testing-job
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        # Quoted so the versions stay strings; a bare value such as 3.10
        # would be parsed as the float 3.1.
        python-version: ["3.6", "3.7"]
        xla-version: ["1.6", "1.7"]
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50

    steps:
      # The tag includes BOTH matrix values. With only the timestamp and the
      # Python version, two matrix cells that differ only in XLA version and
      # start within the same second would push to (and later force-delete)
      # the same image tag.
      - name: Set IMAGETAG
        run: echo "IMAGETAG=$(date +%s)_${{ matrix.python-version }}_${{ matrix.xla-version }}" >> "$GITHUB_ENV"

      # Go is needed below to install the jsonnet command-line tool.
      - name: Install Go
        uses: actions/setup-go@v2
        with:
          go-version: 1.14.x

      # Runner-side Python, used by the deploy step to patch the jsonnet file.
      # This is independent of the Python version baked into the test image.
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: "3.7"

      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          # NOTE(review): this expression is only populated for pull_request
          # events, while this workflow is triggered by `push`; it then
          # evaluates to an empty string and checkout presumably falls back
          # to its default ref - confirm this is the intended commit.
          ref: ${{ github.event.pull_request.head.sha }}

      # Provides the jsonnet library path used by the deploy step; pinned to
      # a fixed commit.
      - name: Checkout ml-testing-accelerators
        uses: actions/checkout@v2
        with:
          repository: GoogleCloudPlatform/ml-testing-accelerators
          path: ml-testing-accelerators
          ref: '5e88ac24f631c27045e62f0e8d5dfcf34e425e25'

      # Pinned to the v0 release line instead of the moving `master` branch
      # of the action's former location; v0 keeps the inputs used here.
      - name: Setup gcloud CLI
        uses: google-github-actions/setup-gcloud@v0
        with:
          version: '290.0.1'
          service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }}
          project_id: ${{ secrets.GKE_PROJECT }}
          export_default_credentials: true

      # Configure Docker to use the gcloud command-line tool as a credential
      # helper for authentication.
      - name: Configure Docker
        run: |-
          gcloud --quiet auth configure-docker
        shell: bash

      # The build context is the checkout root (`.`), not dockers/tpu-tests;
      # only the Dockerfile is taken from that directory.
      - name: Build and Push Docker Image
        env:
          PYTHON_VER: ${{ matrix.python-version }}
          XLA_VER: ${{ matrix.xla-version }}
        run: |
          docker build \
            --tag "$IMAGE:$IMAGETAG" \
            -f ./dockers/tpu-tests/Dockerfile \
            --build-arg "PYTHON_VERSION=$PYTHON_VER" \
            --build-arg "PYTORCH_VERSION=$XLA_VER" \
            .
          docker push "$IMAGE:$IMAGETAG"
        shell: bash

      - name: Install jsonnet
        run: |-
          go get github.com/google/go-jsonnet/cmd/jsonnet
        shell: bash

      # Get the GKE credentials so we can deploy to the cluster.
      # Use either zone or region depending on cluster setup.
      - name: Get GKE credentials
        run: |-
          gcloud container clusters get-credentials "$GKE_CLUSTER" --zone "$GKE_ZONE"
        shell: bash

      # Flow of the script below:
      #   1. substitute the XLA version into the jsonnet test definition;
      #   2. render it and create the Kubernetes job, keeping the job name;
      #   3. poll the job until it reports one failure or one success, or
      #      until MAX_CHECKS polls have been made;
      #   4. stream the pod logs to a file and split them at the XML
      #      declaration into xx00 (test log) and xx01 (consumed by the
      #      "Statistics" step);
      #   5. delete the image and exit with the status derived in step 3.
      - name: Deploy the job on the kubernetes cluster
        env:
          XLA_VER: ${{ matrix.xla-version }}
        run: |-
          python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; ttt = open(fname).read().replace('pytorch-VERSION', 'pytorch-$XLA_VER') ; open(fname, 'w').write(ttt)"
          job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image="$IMAGE" --ext-str image-tag="$IMAGETAG" | kubectl create -f -) && \
          job_name=${job_name#job.batch/} && \
          job_name=${job_name% created} && \
          echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \
          i=0 && \
          # Up to MAX_CHECKS checks spaced CHECK_SPEEP seconds apart (360 x 5s = 1800s with the values set in env).
          status_code=2 && \
          # Check on the job periodically. Set the status code depending on what
          # happened to the job in Kubernetes. If we try MAX_CHECKS times and
          # still the job hasn't finished, give up and return the starting
          # non-zero status code.
          printf "Waiting for job to finish: " && \
          while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
          echo "Done waiting. Job status code: $status_code" && \
          pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
          echo "GKE pod name: $pod_name" && \
          kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
          if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
          # First portion is the test logs. Print these to Github Action stdout.
          cat xx00 && \
          echo "Done with log retrieval attempt." && \
          gcloud container images delete "$IMAGE:$IMAGETAG" --force-delete-tags && \
          exit $status_code
        shell: bash

      # xx01 is the second piece written by csplit in the deploy step, i.e.
      # everything from the XML declaration onwards (presumably the coverage
      # XML report). It only exists when the logs contained that declaration.
      - name: Statistics
        if: success()
        run: |
          mv ./xx01 coverage
          # TODO: add human readable report
          cat coverage
          # sudo pip install pycobertura
          # pycobertura show coverage.xml

      - name: Upload coverage results
        uses: actions/upload-artifact@v2
        with:
          name: coverage-TPU
          path: coverage

      # Attempted even when earlier steps failed (`if: always()`); a failure
      # of this step itself does not fail the job (`continue-on-error`).
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        if: always()
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage
          flags: tpu,pytest
          name: TPU-coverage
          fail_ci_if_error: true