2022-10-26 14:24:16 +00:00
# Display name of this workflow.
name: "Test PyTorch - TPU"
2022-10-24 09:19:42 +00:00
2022-10-21 18:01:39 +00:00
# Triggers: pushes to the main/release branches, and pull requests that target
# them and touch TPU-relevant files.
on:
  push:
    branches:
      - master
      - "release/*"
  pull_request_target:
    branches:
      - master
      - "release/*"
    # `ready_for_review` is listed because draft pull requests are skipped
    types:
      - opened
      - reopened
      - ready_for_review
      - synchronize
    # Order matters: the `!` exclusions must stay after the inclusions.
    paths:
      - ".actions/**"
      - ".github/workflows/tpu-tests.yml"
      - "dockers/base-xla/*"
      - "requirements/fabric/**"
      - "src/lightning/fabric/**"
      - "src/lightning_fabric/*"
      - "tests/tests_fabric/**"
      - "requirements/pytorch/**"
      - "src/lightning/pytorch/**"
      - "src/pytorch_lightning/*"
      - "tests/tests_pytorch/**"
      - "pyproject.toml"  # includes pytest config
      - "!requirements/*/docs.txt"
      - "!*.md"
      - "!**/*.md"
2022-10-21 18:01:39 +00:00
# One concurrency group per workflow / ref / PR head branch.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  # Cancel superseded runs for pull requests only; pushes to master/release
  # always run to completion.
  # The previous check compared `github.ref` against master/release, but for
  # `pull_request_target` events `github.ref` is the PR *base* branch (see the
  # GitHub "Events that trigger workflows" docs). With this workflow's branch
  # filter that made the expression always false, so PR runs were never
  # cancelled. Keying on the event name restores the intended behaviour.
  cancel-in-progress: ${{ github.event_name == 'pull_request_target' }}
# Workflow-wide environment variables.
env:
  # taken from the repository secret `GKE_PROJECT`
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  # cluster name and zone passed to the get-gke-credentials step
  GKE_CLUSTER: "lightning-cluster"
  GKE_ZONE: "us-central1-a"
2022-12-06 17:00:15 +00:00
# Every `run` step uses bash unless the step sets its own `shell`.
defaults:
  run:
    shell: bash
2022-10-21 18:01:39 +00:00
jobs:
  # Renders the per-package jsonnet job template, submits it to the GKE
  # cluster with kubectl, waits for the Kubernetes job to finish, prints its
  # logs and uploads the coverage report extracted from them.
  test-on-tpus:
    runs-on: ubuntu-22.04
    # draft pull requests are skipped
    if: github.event.pull_request.draft == false
    env:
      # quoted so the version stays a string and is never parsed as a float
      PYTHON_VER: "3.8"
    strategy:
      fail-fast: true
      max-parallel: 1  # run sequential
      matrix:
        # TODO: add also lightning
        pkg-name: ["fabric", "pytorch"]
    timeout-minutes: 100  # should match the timeout in `tpu_workflow.jsonnet`
    steps:
      # NOTE(review): this workflow is triggered by `pull_request_target` and
      # checks out the PR head SHA, while later steps read repository secrets.
      # Confirm that running PR-provided files in this context is intended.
      - uses: actions/checkout@v3
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      - uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VER }}

      # Clone the helper repository and check out a fixed commit as `stable`.
      - name: Checkout ml-testing-accelerators
        run: |
          git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
          cd ml-testing-accelerators
          git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
          git checkout stable

      - uses: actions/setup-go@v3
        with:
          go-version: '1.19'

      # NOTE(review): `@latest` is unpinned; consider pinning a release tag so
      # runs are reproducible.
      - name: Install jsonnet
        run: go install github.com/google/go-jsonnet/cmd/jsonnet@latest

      # Fill the placeholders of the package-specific jsonnet template in place.
      - name: Update jsonnet
        env:
          SCOPE: ${{ matrix.pkg-name }}
          # quoted so the version stays a string and is never parsed as a float
          XLA_VER: "1.12"
          PR_NUMBER: ${{ github.event.pull_request.number }}
          SHA: ${{ github.event.pull_request.head.sha }}
        run: |
          import os
          fname = f'dockers/base-xla/tpu_workflow_{os.getenv("SCOPE")}.jsonnet'
          with open(fname) as fo:
              data = fo.read()
          data = data.replace('{PYTORCH_VERSION}', os.getenv("XLA_VER"))
          data = data.replace('{PYTHON_VERSION}', os.getenv("PYTHON_VER"))
          data = data.replace('{PR_NUMBER}', os.getenv("PR_NUMBER"))
          data = data.replace('{SHA}', os.getenv("SHA"))
          with open(fname, "w") as fw:
              fw.write(data)
        shell: python

      - name: Show jsonnet
        run: cat dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet

      - uses: google-github-actions/auth@v1
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}

      # https://docs.github.com/en/actions/deployment/deploying-to-your-cloud-provider/deploying-to-google-kubernetes-engine
      - uses: google-github-actions/get-gke-credentials@v1
        with:
          cluster_name: ${{ env.GKE_CLUSTER }}
          location: ${{ env.GKE_ZONE }}

      # Submit the rendered job, poll until it succeeds or fails, then dump
      # the logs of its `train` container. The step exits with 0 on success
      # and 1 on failure of the Kubernetes job.
      - name: Deploy cluster
        run: |
          export PATH=$PATH:$HOME/go/bin
          job_name=$(jsonnet -J ml-testing-accelerators/ dockers/base-xla/tpu_workflow_${{ matrix.pkg-name }}.jsonnet | kubectl create -f -)
          # strip the "job.batch/" prefix and " created" suffix from the kubectl output
          job_name=${job_name#job.batch/}
          job_name=${job_name% created}
          controller_uid=$(kubectl get job "$job_name" -o "jsonpath={.metadata.labels.controller-uid}")
          # --no-headers drops the header row, so the first column is the pod name
          pod_name=$(kubectl get po -l "controller-uid=$controller_uid" --no-headers | awk '{print $1}')
          echo "GKE pod name: $pod_name"
          echo "Waiting on kubernetes job: $job_name"
          status_code=2
          # Check on the job periodically. Set the status code depending on what happened to the job in Kubernetes.
          printf "Waiting for job to finish: "
          while true; do
            if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
              status_code=1 && break;
            elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
              status_code=0 && break;
            else
              printf ".";
            fi;
            sleep 5;
          done
          echo "Done waiting. Job status code: $status_code"
          kubectl logs -f "$pod_name" --container=train > /tmp/full_output.txt
          if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt; then
            # successful run. split the output into logs + coverage report
            csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/';
            cat xx00  # test logs
            mv xx01 coverage.xml
          else
            # failed run, print everything
            cat /tmp/full_output.txt;
          fi
          exit $status_code
        shell: bash

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: tpu,pytest,python${{ env.PYTHON_VER }}
          name: TPU-coverage
          fail_ci_if_error: false