2020-07-23 16:13:10 +00:00
# Python CircleCI 2.1 configuration file.
version: 2.1

# Orbs that supply the reusable steps used by the TPU-tests job.
orbs:
  # gcp-gke/install, gcp-gke/update-kubeconfig-with-credentials
  gcp-gke: circleci/gcp-gke@1.4.0
  # go/install
  go: circleci/go@1.7.1
  # codecov/upload
  codecov: codecov/codecov@1.1.0
2022-09-07 13:55:45 +00:00
# Pipeline parameters. Each one is a string that defaults to empty.
# NOTE(review): the GHA_ prefix suggests they are set by a triggering
# GitHub Action - confirm against the caller.
parameters:
  # Not referenced anywhere else in the visible config.
  GHA_Actor:
    type: string
    default: ''
  # Used as the `when` condition of the test-on-tpus workflow.
  GHA_Action:
    type: string
    default: ''
  # Not referenced anywhere else in the visible config.
  GHA_Event:
    type: string
    default: ''
  # Exposed to the "Update jsonnet" step as the PR_NUMBER variable.
  GHA_Meta:
    type: string
    default: ''
2021-07-19 14:13:21 +00:00
2020-07-14 18:04:04 +00:00
# Reusable step definitions. Jobs include them through the YAML aliases
# (*make_docs, *checkout_ml_testing, ...).
references:

  # Installs the docs requirements and runs `make html` inside docs/.
  make_docs: &make_docs
    run:
      name: Make Documentation
      command: |
        # the image uses python 2.7 by default, force a different version
        pyenv global 3.7.3
        python --version
        pip install -e . -r requirements/pytorch/docs.txt
        pip list
        cd docs
        make clean
        make html --jobs 2 SPHINXOPTS="-W"

  # Clones ml-testing-accelerators and checks out one pinned commit under
  # the local branch name `stable`.
  checkout_ml_testing: &checkout_ml_testing
    run:
      name: Checkout ml-testing-accelerators
      command: |
        git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
        cd ml-testing-accelerators
        git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
        git checkout stable
        cd ..

  # Installs the jsonnet CLI via `go install` (unpinned: @latest).
  install_jsonnet: &install_jsonnet
    run:
      name: Install jsonnet
      command: |
        go install github.com/google/go-jsonnet/cmd/jsonnet@latest

  # Substitutes the {PYTORCH_VERSION}, {PYTHON_VERSION}, {PR_NUMBER} and
  # {SHA} placeholders in the TPU test-case template in place, then prints
  # the result.
  update_jsonnet: &update_jsonnet
    run:
      name: Update jsonnet
      environment:
        # Quoted so the value is always a string, even when the parameter
        # is left at its empty default.
        PR_NUMBER: "<< pipeline.parameters.GHA_Meta >>"
      command: |
        export SHA=$(git rev-parse --short HEAD)
        python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; data = open(fname).read().replace('{PYTORCH_VERSION}', '$XLA_VER')
        data = data.replace('{PYTHON_VERSION}', '$PYTHON_VER').replace('{PR_NUMBER}', '$PR_NUMBER').replace('{SHA}', '$SHA') ; open(fname, 'w').write(data)"
        cat dockers/tpu-tests/tpu_test_cases.jsonnet

  # Creates the Kubernetes job from the rendered jsonnet, polls it until it
  # succeeds, fails or the check budget is used up, then fetches the pod
  # logs and exits with the resulting status code (0 succeeded, 1 failed,
  # 2 never finished).
  deploy_cluster: &deploy_cluster
    run:
      name: Deploy the job on the kubernetes cluster
      command: |
        export PATH=$PATH:$HOME/go/bin
        job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) && \
        job_name=${job_name#job.batch/}
        job_name=${job_name% created}
        # Drop the header row and keep the first column (the pod name).
        pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk '!/NAME/ {print $1}')
        echo "GKE pod name: $pod_name"
        echo "Waiting on kubernetes job: $job_name"
        # Check on the job up to $MAX_CHECKS times, $CHECK_SLEEP seconds
        # apart. Set the status code depending on what happened to the job
        # in Kubernetes. If the job still hasn't finished after the last
        # check, give up and return the starting non-zero status code.
        i=0 && \
        status_code=2 && \
        printf "Waiting for job to finish: " && \
        while [ $i -lt $MAX_CHECKS ]; do
          ((i++))
          if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
            status_code=1 && break
          elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then
            status_code=0 && break
          else
            printf "."
          fi
          sleep $CHECK_SLEEP
        done && \
        echo "Done waiting. Job status code: $status_code" && \
        kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
        if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
        # First portion is the test logs. Print these to Github Action stdout.
        cat xx00 && \
        echo "Done with log retrieval attempt." && \
        exit $status_code

  # Renames xx01 to coverage.xml. xx01 is presumably the XML portion that
  # the csplit call in deploy_cluster splits off the pod log - confirm.
  stats: &stats
    run:
      name: Statistics
      command: |
        mv ./xx01 coverage.xml
2020-07-14 18:04:04 +00:00
jobs:

  # Deploys the TPU test run as a Kubernetes job on GKE, waits for it, and
  # uploads the coverage report it produced.
  TPU-tests:
    docker:
      # NOTE(review): circleci/* convenience images are the legacy line;
      # confirm whether this should move to a cimg/python tag.
      - image: circleci/python:3.7
    # Values are quoted so version-like numbers stay exact strings
    # (an unquoted 1.10 would be read as the float 1.1).
    environment:
      XLA_VER: "1.12"
      PYTHON_VER: "3.7"
      # Upper bound on status polls in the deploy step.
      MAX_CHECKS: "1000"
      # Seconds slept between two polls.
      CHECK_SLEEP: "5"
    steps:
      - checkout
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - *install_jsonnet
      - *update_jsonnet
      - *deploy_cluster
      - *stats
      - codecov/upload:
          file: coverage.xml
          flags: tpu,pytest
          upload_name: TPU-coverage
      - store_artifacts:
          path: coverage.xml

  # Builds the HTML documentation and stores it as an artifact. No active
  # workflow references this job (the build-docs workflow is commented out).
  build-Docs:
    docker:
      - image: readthedocs/build:latest
    steps:
      - checkout
      - run:
          name: Init git submodule
          command: |
            git submodule update --init --recursive
      - *make_docs
      - store_artifacts:
          # allows us to preview the generated html pages
          path: docs/build/html/
          destination: html
workflows:
  # Disabled docs workflow, kept for reference.
  # build-docs:  # FixMe
  #   when: << pipeline.parameters.GHA_Action >>
  #   jobs:
  #     - build-Docs

  # Runs only when the GHA_Action pipeline parameter is set; its default is
  # the empty string.
  test-on-tpus:
    when: << pipeline.parameters.GHA_Action >>
    jobs:
      - TPU-tests