lightning/.github/workflows/tpu-tests.yml

128 lines
4.6 KiB
YAML
Raw Normal View History

2022-10-26 14:24:16 +00:00
name: Test PyTorch - TPU

on:
  # Runs for commits pushed to the long-lived branches.
  push:
    branches:
      - master
      - "release/*"
  # Runs for pull requests that target the long-lived branches and touch
  # any of the listed paths.
  pull_request_target:
    branches:
      - master
      - "release/*"
    # added `ready_for_review` since draft is skipped
    types:
      - opened
      - reopened
      - ready_for_review
      - synchronize
    paths:
      - ".github/workflows/tpu-tests.yml"
      - "requirements/lite/**"
      - "src/lightning_lite/**"
      - "tests/tests_lite/**"
      - "requirements/pytorch/**"
      - "src/pytorch_lightning/**"
      - "tests/tests_pytorch/**"
      - "setup.cfg"  # includes pytest config
      - ".actions/**"
concurrency:
  # One group per workflow + ref + PR head branch (`head_ref` is empty on push).
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  # Cancel superseded runs for pull requests only; runs triggered by a push to
  # master or a release branch are always allowed to finish.
  # The decision is keyed on the event name rather than on `github.ref`:
  # for `pull_request_target`, `github.ref` is the PR *base* branch, so a
  # check like `github.ref == 'refs/heads/master'` is also true for every PR
  # that targets master and would disable cancellation for those PRs.
  cancel-in-progress: ${{ github.event_name == 'pull_request_target' }}
env:
  # Cluster name and zone are passed to the `get-gke-credentials` step below.
  GKE_CLUSTER: "lightning-cluster"
  GKE_ZONE: "us-central1-a"
  # Project identifier, taken from the repository secrets.
  PROJECT_ID: "${{ secrets.GKE_PROJECT }}"
jobs:
  # TODO: package parametrization
  # Renders the TPU test job from a jsonnet template, submits it to the GKE
  # cluster, waits for the Kubernetes job to finish, prints its logs and
  # uploads the coverage report emitted at the end of those logs.
  test-on-tpus:
    runs-on: ubuntu-22.04
    # Skip draft pull requests.
    if: github.event.pull_request.draft == false
    env:
      # Quoted so the version stays a string (an unquoted `3.10` would be read as the float 3.1).
      PYTHON_VER: "3.7"
    timeout-minutes: 100  # should match the timeout in `tpu_test_cases.jsonnet`
    steps:
      # NOTE(review): with `pull_request_target` this job can read repository
      # secrets while checking out the PR head commit. The checked-out
      # `dockers/tpu-tests/tpu_test_cases.jsonnet` is rendered and submitted to
      # the cluster below, so a PR can influence what gets deployed. Confirm
      # that this is acceptable or gated by an approval step.
      - uses: actions/checkout@v3
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VER }}

      # The jsonnet templates import from this repository; it is pinned to a
      # fixed commit, exposed locally as the branch `stable`.
      - name: Checkout ml-testing-accelerators
        run: |
          git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
          cd ml-testing-accelerators
          git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
          git checkout stable

      - uses: actions/setup-go@v3
        with:
          go-version: '1.19'

      - name: Install jsonnet
        run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest

      # Fill the `{PYTORCH_VERSION}`, `{PYTHON_VERSION}`, `{PR_NUMBER}` and
      # `{SHA}` placeholders of the template in place, then print the result.
      - name: Update jsonnet
        env:
          # Quoted so the version stays a string (an unquoted `1.10` would be read as the float 1.1).
          XLA_VER: "1.12"
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
          data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
          cat dockers/tpu-tests/tpu_test_cases.jsonnet
        shell: bash

      - uses: google-github-actions/auth@v0
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}

      # https://docs.github.com/en/actions/deployment/deploying-to-your-cloud-provider/deploying-to-google-kubernetes-engine
      - uses: google-github-actions/get-gke-credentials@v0
        with:
          cluster_name: ${{ env.GKE_CLUSTER }}
          location: ${{ env.GKE_ZONE }}

      - name: Deploy cluster
        run: |
          # `go install` places the jsonnet binary here.
          export PATH=$PATH:$HOME/go/bin
          # `kubectl create` prints "job.batch/<name> created"; strip both ends to keep only <name>.
          job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -)
          job_name=${job_name#job.batch/}
          job_name=${job_name% created}
          controller_uid=$(kubectl get job "$job_name" -o "jsonpath={.metadata.labels.controller-uid}")
          # Drop the header row by line number. The previous filter, `match($0,!/NAME/)`,
          # used the 0/1 result of `!/NAME/` as the pattern, so a pod row was only
          # printed when it happened to contain the digit "1".
          pod_name=$(kubectl get po -l "controller-uid=$controller_uid" | awk 'NR > 1 {print $1}')
          echo "GKE pod name: $pod_name"
          echo "Waiting on kubernetes job: $job_name"
          # 2 = outcome unknown; overwritten by the loop below.
          status_code=2
          # Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes.
          printf "Waiting for job to finish: "
          while true; do
            # `grep` is intentionally used without `-q`: an early exit could make
            # kubectl fail with SIGPIPE, which `pipefail` would report as "no match".
            if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
              status_code=1 && break;
            elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
              status_code=0 && break;
            else
              printf ".";
            fi;
            sleep 5;
          done
          echo "Done waiting. Job status code: $status_code"
          kubectl logs -f "$pod_name" --container=train > /tmp/full_output.txt
          if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then
            # successful run. split the output into logs + coverage report
            csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/';
            cat xx00  # test logs
            mv xx01 coverage.xml
          else
            # failed run, print everything
            cat /tmp/full_output.txt;
          fi
          # Propagate the Kubernetes job result as the step result.
          exit $status_code
        shell: bash

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: tpu,pytest,python${{ env.PYTHON_VER }}
          name: TPU-coverage
          fail_ci_if_error: false