153 lines
5.3 KiB
YAML
153 lines
5.3 KiB
YAML
name: Test PyTorch - TPU

# Triggers: pushes to the long-lived branches, plus pull requests that target
# them and touch any of the TPU-relevant paths listed below.
on:
  push:
    branches:
      - master
      - "release/*"
  pull_request_target:
    branches:
      - master
      - "release/*"
    # `ready_for_review` is listed because draft pull requests are skipped
    types:
      - opened
      - reopened
      - ready_for_review
      - synchronize
    paths:
      # CI tooling and the TPU image/workflow definition
      - ".actions/**"
      - ".github/workflows/tpu-tests.yml"
      - "dockers/base-xla/*"
      # Fabric sources, tests and requirements
      - "requirements/fabric/**"
      - "src/lightning/fabric/**"
      - "src/lightning_fabric/*"
      - "tests/tests_fabric/**"
      # PyTorch sources, tests and requirements
      - "requirements/pytorch/**"
      - "src/lightning/pytorch/**"
      - "src/pytorch_lightning/*"
      - "tests/tests_pytorch/**"
      # includes pytest config
      - "pyproject.toml"
      # Exclusions; these negated patterns stay after the positive ones
      - "!requirements/*/docs.txt"
      - "!*.md"
      - "!**/*.md"
|
|
|
|
# One concurrency group per workflow + ref + PR head ref. A newer run cancels
# an in-progress one unless the ref is `master` or a `release/*` branch.
# NOTE(review): for `pull_request_target` events `github.ref` is documented as
# the PR *base* branch - confirm the guard below behaves as intended for PRs.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/master' && !startsWith(github.ref, 'refs/heads/release/') }}
|
|
|
|
# Workflow-wide settings for the GKE cluster the TPU job is submitted to.
env:
  # Google Cloud project, taken from the repository secrets
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  # Cluster name and zone, read by the `get-gke-credentials` step
  GKE_CLUSTER: "lightning-cluster"
  GKE_ZONE: "us-central1-a"
|
|
|
|
# Every `run:` step uses bash unless the step sets its own `shell:`.
defaults:
  run: {shell: bash}
|
|
|
|
jobs:
  # Renders the per-package jsonnet job definition, submits it to the GKE
  # cluster, waits for the Kubernetes job to finish, prints its logs and
  # uploads the coverage report extracted from them.
  test-on-tpus:
    runs-on: ubuntu-22.04
    if: github.event.pull_request.draft == false
    env:
      # quoted so the version stays a string (an unquoted 3.10 would become 3.1)
      PYTHON_VER: "3.8"
    strategy:
      fail-fast: true
      max-parallel: 1  # run sequential
      matrix:
        # TODO: add also lightning
        pkg-name: ["fabric", "pytorch"]
    timeout-minutes: 100  # should match the timeout in `tpu_workflow.jsonnet`

    steps:
      # NOTE(security): this workflow is triggered by `pull_request_target` and
      # checks out the PR head commit, so files coming from the PR (e.g. the
      # jsonnet template rendered below) are used while repository secrets are
      # in scope. Review who is allowed to trigger this job.
      - uses: actions/checkout@v3
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VER }}

      # Pinned to a fixed commit of ml-testing-accelerators (used as the
      # jsonnet library path in the deploy step).
      - name: Checkout ml-testing-accelerators
        run: |
          git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
          cd ml-testing-accelerators
          git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
          git checkout stable

      - uses: actions/setup-go@v3
        with:
          go-version: '1.19'

      # NOTE(review): `@latest` is unpinned; consider pinning a released
      # version so runs are reproducible.
      - name: Install jsonnet
        run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest

      # Fills the `{PLACEHOLDER}` tokens of the package's jsonnet template
      # in place with the values passed through the environment.
      - name: Update jsonnet
        env:
          SCOPE: ${{ matrix.pkg-name }}
          # quoted so the version stays a string rather than a YAML float
          XLA_VER: "1.12"
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          import os

          fname = f'dockers/base-xla/tpu_workflow_{os.getenv("SCOPE")}.jsonnet'
          # template token -> name of the environment variable holding its value
          substitutions = {
              '{PYTORCH_VERSION}': "XLA_VER",
              '{PYTHON_VERSION}': "PYTHON_VER",
              '{PR_NUMBER}': "PR_NUMBER",
              '{SHA}': "SHA",
          }
          with open(fname) as fo:
              data = fo.read()
          for token, env_name in substitutions.items():
              # default to "" so an unset variable cannot make `str.replace` raise
              data = data.replace(token, os.getenv(env_name, ""))
          with open(fname, "w") as fw:
              fw.write(data)
        shell: python

      - name: Show jsonnet
        env:
          PKG_NAME: ${{ matrix.pkg-name }}
        run: cat "dockers/base-xla/tpu_workflow_${PKG_NAME}.jsonnet"

      - uses: google-github-actions/auth@v1
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}

      # https://docs.github.com/en/actions/deployment/deploying-to-your-cloud-provider/deploying-to-google-kubernetes-engine
      - uses: google-github-actions/get-gke-credentials@v1
        with:
          cluster_name: ${{ env.GKE_CLUSTER }}
          location: ${{ env.GKE_ZONE }}

      - name: Deploy cluster
        env:
          # passed via the environment instead of being templated into the script
          PKG_NAME: ${{ matrix.pkg-name }}
        run: |
          export PATH=$PATH:$HOME/go/bin
          # Submit the rendered job; the prefix/suffix stripping below expects
          # output of the form "job.batch/<name> created".
          job_name=$(jsonnet -J ml-testing-accelerators/ "dockers/base-xla/tpu_workflow_${PKG_NAME}.jsonnet" | kubectl create -f -)
          job_name=${job_name#job.batch/}
          job_name=${job_name% created}
          # Find the pod(s) of the job through its controller-uid label; print
          # only the name column, without the header row.
          controller_uid=$(kubectl get job "$job_name" -o "jsonpath={.metadata.labels.controller-uid}")
          pod_name=$(kubectl get po -l "controller-uid=$controller_uid" --no-headers -o custom-columns=NAME:.metadata.name)
          echo "GKE pod name: $pod_name"
          echo "Waiting on kubernetes job: $job_name"
          # Stays 2 unless the loop below observes a failed (1) or succeeded (0) job.
          status_code=2
          # Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes.
          printf "Waiting for job to finish: "
          while true; do
            if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
              status_code=1 && break;
            elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
              status_code=0 && break;
            else
              printf ".";
            fi;
            sleep 5;
          done
          echo "Done waiting. Job status code: $status_code"
          kubectl logs -f "$pod_name" --container=train > /tmp/full_output.txt
          if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then
            # successful run. split the output into logs + coverage report
            csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/';
            cat xx00  # test logs
            mv xx01 coverage.xml
          else
            # failed run, print everything
            cat /tmp/full_output.txt;
          fi
          exit "$status_code"
        shell: bash

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: tpu,pytest,python${{ env.PYTHON_VER }}
          name: TPU-coverage
          fail_ci_if_error: false
|