164 lines
5.1 KiB
YAML
164 lines
5.1 KiB
YAML
# Python CircleCI 2.1 configuration file.
|
|
version: 2.1

# Orbs that provide the `codecov/...`, `gcp-gke/...` and `go/...` steps
# used by the jobs in this file.
orbs:
  codecov: 'codecov/codecov@1.1.0'
  gcp-gke: 'circleci/gcp-gke@1.0.4'
  go: 'circleci/go@1.3.0'
|
|
|
|
# Tag and branch filters.
# NOTE(review): no job or workflow in this file reads `trigger`; presumably
# it was carried over from another CI system's config - confirm it is needed.
trigger:
  tags:
    include:
      - "*"
  branches:
    include:
      - 'master'
      - 'release/*'
      - 'refs/tags/*'
|
|
# Branch patterns for pull requests.
# NOTE(review): no job or workflow in this file reads `pr` - confirm it is
# still needed.
pr:
  - 'master'
  - 'release/*'
|
|
|
|
# Workflow Steps:
|
|
# 1. Checkout
|
|
# 2. Install GO
|
|
# 3. Checkout ml-testing-accelerators
|
|
# 4. GCP GKE install
|
|
# 5. Update Kubeconfig with credentials
|
|
# 6. Install jsonnet
|
|
# 7. Update jsonnet
|
|
# 8. Deploy the job on the kubernetes cluster
|
|
# 9. Statistics
|
|
# 10. Upload coverage results
|
|
# 11. Upload coverage to Codecov
|
|
|
|
references:
|
|
|
|
# Reusable step: install the documentation requirements and build the HTML
# docs inside the `docs` directory.
make_docs: &make_docs
  run:
    name: Make Documentation
    command: |
      # The image defaults to python 2.7, so select another interpreter first.
      pyenv global 3.7.3
      python --version
      pip install --requirement requirements/docs.txt
      pip list
      cd docs
      make clean
      make html -j 2 SPHINXOPTS="-W"
|
|
|
|
# Reusable step: clone ml-testing-accelerators next to the project checkout
# and pin it to a fixed commit through a local `stable` branch.
checkout_ml_testing: &checkout_ml_testing
  run:
    name: Checkout ml-testing-accelerators
    command: |
      git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
      # `git -C <dir>` runs the command inside the clone, so the step's
      # working directory never changes.
      git -C ml-testing-accelerators fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
      git -C ml-testing-accelerators checkout stable
|
|
|
|
# Reusable step: fetch the go-jsonnet command-line tool with the Go toolchain.
# The deploy step appends $HOME/go/bin to PATH before calling `jsonnet`.
install_jsonnet: &install_jsonnet
  run:
    name: Install jsonnet
    command: go get github.com/google/go-jsonnet/cmd/jsonnet
|
|
|
|
# Reusable step: fill the placeholders of the TPU test-case jsonnet template
# in place, then print the result.
update_jsonnet: &update_jsonnet
  run:
    name: Update jsonnet
    command: |
      # Pull-request number whose head ref points at the current commit.
      export PR_NUMBER=$(git ls-remote origin "pull/*/head" | grep -F -f <(git rev-parse HEAD) | awk -F'/' '{print $3}')
      # Abbreviated hash of the current commit.
      export SHA=$(git rev-parse --short HEAD)
      # The shell expands $XLA_VER, $PYTHON_VER, $PR_NUMBER and $SHA into the
      # python source before python runs; replacements are applied in order.
      python -c "
      fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet'
      data = open(fname).read()
      for placeholder, value in (
          ('{PYTORCH_VERSION}', '$XLA_VER'),
          ('{PYTHON_VERSION}', '$PYTHON_VER'),
          ('{PR_NUMBER}', '$PR_NUMBER'),
          ('{SHA}', '$SHA'),
      ):
          data = data.replace(placeholder, value)
      open(fname, 'w').write(data)
      "
      cat dockers/tpu-tests/tpu_test_cases.jsonnet
|
|
|
|
# Reusable step: create the Kubernetes job from the rendered jsonnet, poll it
# until it fails, succeeds or the check budget runs out, fetch the pod logs,
# split off the coverage XML, and exit with the job's status code
# (0 = succeeded, 1 = failed, 2 = still unfinished after MAX_CHECKS checks).
deploy_cluster: &deploy_cluster
  run:
    name: Deploy the job on the kubernetes cluster
    command: |
      export PATH=$PATH:$HOME/go/bin
      # Render the test case and create the job. The `job.batch/` prefix and
      # the ` created` suffix are stripped from kubectl's output to leave the
      # bare job name.
      job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) && \
      job_name=${job_name#job.batch/}
      job_name=${job_name% created}
      echo "Waiting on kubernetes job: $job_name"
      i=0 && \
      # Up to MAX_CHECKS checks spaced CHECK_SPEEP seconds apart (both come
      # from the job environment; the variable name is spelled that way there).
      status_code=2 && \
      # Check on the job periodically. Set the status code depending on what
      # happened to the job in Kubernetes. If we try MAX_CHECKS times and
      # still the job hasn't finished, give up and return the starting
      # non-zero status code.
      printf "Waiting for job to finish: " && \
      while [ "$i" -lt "$MAX_CHECKS" ]; do
        # Plain arithmetic assignment: unlike `((i++))` it never returns a
        # non-zero status when the counter is 0.
        i=$((i + 1))
        if kubectl get jobs "$job_name" -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
          status_code=1 && break
        elif kubectl get jobs "$job_name" -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1"; then
          status_code=0 && break
        else
          printf "."
        fi
        sleep "$CHECK_SPEEP"
      done && \
      echo "Done waiting. Job status code: $status_code" && \
      # Pods of the job are selected through the job's controller-uid label;
      # --no-headers drops the column header so only pod rows reach awk.
      pod_name=$(kubectl get po --no-headers -l controller-uid="$(kubectl get job "$job_name" -o "jsonpath={.metadata.labels.controller-uid}")" | awk '{print $1}') && \
      echo "GKE pod name: $pod_name" && \
      kubectl logs -f "$pod_name" --container=train > /tmp/full_output.txt
      # When the output contains an XML declaration, split it there: xx00 holds
      # everything before it and xx01 the rest (the Statistics step renames
      # xx01 to coverage.xml). Otherwise the whole output becomes xx00.
      if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
      # First portion is the test logs. Print these to the CI step's stdout.
      cat xx00 && \
      echo "Done with log retrieval attempt." && \
      exit $status_code
|
|
|
|
# Reusable step: rename the `xx01` file left in the working directory to
# coverage.xml, print it raw, then render it with pycobertura.
stats: &stats
  run:
    name: Statistics
    command: |
      mv xx01 coverage.xml
      # TODO: add human readable report
      cat coverage.xml
      sudo pip install pycobertura
      pycobertura show coverage.xml
|
|
|
|
jobs:

  # Runs the TPU test job on the GKE cluster and publishes its coverage.
  TPU-tests:
    docker:
      - image: circleci/python:3.7
    # Values are quoted so they stay strings: an unquoted 1.8 / 3.7 is parsed
    # as a float, and a later bump to e.g. 1.10 would silently become 1.1.
    environment:
      # Substituted for {PYTORCH_VERSION} / {PYTHON_VERSION} in the jsonnet
      # template by the "Update jsonnet" step.
      - XLA_VER: "1.8"
      - PYTHON_VER: "3.7"
      # Polling budget of the deploy step: at most MAX_CHECKS status checks,
      # CHECK_SPEEP seconds apart. The deploy script reads the variable under
      # exactly this spelling, so it must not be renamed here alone.
      - MAX_CHECKS: "240"
      - CHECK_SPEEP: "5"
    steps:
      - checkout
      - go/install
      - *checkout_ml_testing
      - gcp-gke/install
      - gcp-gke/update-kubeconfig-with-credentials:
          cluster: $GKE_CLUSTER
          perform-login: true
      - *install_jsonnet
      - *update_jsonnet
      - *deploy_cluster
      - *stats
      - codecov/upload:
          file: coverage.xml
          flags: tpu,pytest
          upload_name: TPU-coverage

      - store_artifacts:
          path: coverage.xml

  # Builds the HTML documentation and stores it as a browsable artifact.
  build-Docs:
    docker:
      - image: readthedocs/build:latest
    steps:
      - checkout
      - run:
          name: Init git submodule
          command: |
            git submodule update --init --recursive
      - *make_docs
      - store_artifacts:
          # allows us to preview the generated html pages
          path: docs/build/html/
          destination: html
|
|
|
|
# A single workflow, `ci-tests`, listing the documentation build and the
# TPU test job.
workflows:
  version: 2
  ci-tests:
    jobs:
      - build-Docs
      - TPU-tests
|