Add Github Action to run TPU tests. (#2376)
* Add Github Action to run TPU tests. * Trigger new Github Actions run. * Clean up more comments. * Use different fixed version of ml-testing-accelerators and update config to match. * use cluster in us-central1-a * Run 'gcloud logging read' directly without 'echo' to preserve newlines. * cat coverage.xml on the TPU VM side and upload xml on the Github Action side * Use new commit on ml-testing-accelerators so command runs fully. * Preserve newlines in the xml and use if: always() temporarily to upload codecov * Use pytorch_lightning for coverage instead of pytorch-lightning * Remove the debug cat of coverage xml * Apply suggestions from code review * jsonnet rename * name * add codecov flags * add codecov flags * codecov * codecov * revert codecov * Clean up after apt-get and remove old TODOs. * More codefactor cleanups. * drone * drone * disable codecov * cleaning * docker py versions * docker py 3.7 * readme * bash * docker * freeze conda * py3.6 * Stop using apt-get clean. * Dont rm pytorch-lightning * Update docker/tpu/Dockerfile * Longer timeout in the Github Action to wait for GKE to finish. * job1 * job2 * job3 Co-authored-by: Jirka Borovec <Borda@users.noreply.github.com> Co-authored-by: Jirka <jirka@pytorchlightning.ai>
This commit is contained in:
parent
dcd6000be7
commit
1a40963d1d
|
@ -50,5 +50,4 @@ comment:
|
|||
layout: header, diff
|
||||
require_changes: false
|
||||
behavior: default # update if exists else create new
|
||||
# branches: *
|
||||
|
||||
# after_n_builds: 2
|
||||
|
|
|
@ -43,7 +43,8 @@ steps:
|
|||
- python -m py.test benchmarks pl_examples -v --maxfail=2 --durations=0 # --flake8
|
||||
#- cd docs; make doctest; make coverage
|
||||
- coverage report
|
||||
- codecov --token $CODECOV_TOKEN # --pr $DRONE_PULL_REQUEST --build $DRONE_BUILD_NUMBER --branch $DRONE_BRANCH --commit $DRONE_COMMIT --tag $DRONE_TAG
|
||||
# see: https://docs.codecov.io/docs/merging-reports
|
||||
- codecov --token $CODECOV_TOKEN # --pr $DRONE_PULL_REQUEST --build $DRONE_BUILD_NUMBER --branch $DRONE_BRANCH --commit $DRONE_COMMIT --tag $DRONE_TAG
|
||||
- python tests/collect_env_details.py
|
||||
|
||||
trigger:
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
name: TPU tests
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
pull_request:
|
||||
branches:
|
||||
- master
|
||||
|
||||
env:
|
||||
PROJECT_ID: ${{ secrets.GKE_PROJECT }}
|
||||
GKE_CLUSTER: lightning-cluster
|
||||
GKE_ZONE: us-central1-a
|
||||
IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image
|
||||
|
||||
jobs:
|
||||
setup-build-publish-deploy:
|
||||
name: tpu-testing-job
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Install Go
|
||||
uses: actions/setup-go@v2
|
||||
with:
|
||||
go-version: 1.14.x
|
||||
|
||||
- name: Checkout Pytorch Lightning
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
repository: PyTorchLightning/pytorch-lightning
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
path: main
|
||||
|
||||
- name: Checkout ml-testing-accelerators
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
repository: GoogleCloudPlatform/ml-testing-accelerators
|
||||
path: ml-testing-accelerators
|
||||
ref: 5e88ac24f631c27045e62f0e8d5dfcf34e425e25
|
||||
|
||||
- name: Setup gcloud CLI
|
||||
uses: GoogleCloudPlatform/github-actions/setup-gcloud@master
|
||||
with:
|
||||
version: '290.0.1'
|
||||
service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }}
|
||||
project_id: ${{ secrets.GKE_PROJECT }}
|
||||
export_default_credentials: true
|
||||
|
||||
# Configure Docker to use the gcloud command-line tool as a credential helper for authentication.
|
||||
- name: Configure Docker
|
||||
run: |-
|
||||
gcloud --quiet auth configure-docker
|
||||
shell: bash
|
||||
- name: Build and Push Docker Image
|
||||
run: |
|
||||
cd main/docker/tpu
|
||||
docker build --tag "$IMAGE:$GITHUB_RUN_ID" -f Dockerfile --build-arg "GITHUB_REF=$GITHUB_REF" --build-arg "TEST_IMAGE=1" .
|
||||
docker push "$IMAGE:$GITHUB_RUN_ID"
|
||||
shell: bash
|
||||
|
||||
- name: Install jsonnet
|
||||
run: |-
|
||||
go get github.com/google/go-jsonnet/cmd/jsonnet
|
||||
shell: bash
|
||||
# Get the GKE credentials so we can deploy to the cluster
|
||||
# Use either zone or region depending on cluster setup.
|
||||
- run: |-
|
||||
gcloud container clusters get-credentials "$GKE_CLUSTER" --zone "$GKE_ZONE"
|
||||
shell: bash
|
||||
|
||||
- name: Deploy the job on the kubernetes cluster
|
||||
run: |-
|
||||
job_name=$(jsonnet -J ml-testing-accelerators/ main/docker/tpu/tpu_test_cases.jsonnet --ext-str image=$IMAGE --ext-str image-tag=$GITHUB_RUN_ID | kubectl create -f -) && \
|
||||
job_name=${job_name#job.batch/} && \
|
||||
job_name=${job_name% created} && \
|
||||
echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \
|
||||
i=0 && \
|
||||
# 30 checks spaced 30s apart = 900s total.
|
||||
max_checks=30 && \
|
||||
status_code=2 && \
|
||||
# Check on the job periodically. Set the status code depending on what
|
||||
# happened to the job in Kubernetes. If we try max_checks times and
|
||||
# still the job hasn't finished, give up and return the starting
|
||||
# non-zero status code.
|
||||
while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
|
||||
echo "Done waiting. Job status code: $status_code" && \
|
||||
# Allow time for logs to flush.
|
||||
sleep 60 && \
|
||||
echo "JOB_NAME: $job_name" && \
|
||||
echo "GKE_CLUSTER: $GKE_CLUSTER" && \
|
||||
echo "GKE_ZONE: $GKE_ZONE" && \
|
||||
gcloud logging read "resource.type=k8s_container resource.labels.project_id=$PROJECT_ID resource.labels.location=$GKE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$PROJECT_ID > /tmp/full_output.txt && \
|
||||
csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/' && \
|
||||
# First portion is the test logs. Print these to Github Action stdout.
|
||||
cat xx00 && \
|
||||
echo "Done with log retrieval attempt." && \
|
||||
gcloud container images delete "$IMAGE:$GITHUB_RUN_ID" --force-delete-tags && \
|
||||
exit $status_code
|
||||
shell: bash
|
||||
|
||||
# TODO: to be used after enabling merging of reports from different CIs
|
||||
#- name: Upload coverage to Codecov
|
||||
# uses: codecov/codecov-action@v1
|
||||
# if: always()
|
||||
# with:
|
||||
# token: ${{ secrets.CODECOV_TOKEN }}
|
||||
# file: ./xx01
|
||||
# flags: tpu,pytest
|
||||
# # env_vars: OS,PYTHON
|
||||
# # name: codecov-umbrella
|
||||
# fail_ci_if_error: true
|
||||
|
|
@ -42,6 +42,7 @@
|
|||
| :---: | :---: | :---: | :---: |
|
||||
| Conda py3.7 [linux] | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) | ![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg) |
|
||||
| Linux py3.7 [GPU] | - | - | [![Build Status](http://35.192.60.23/api/badges/PyTorchLightning/pytorch-lightning/status.svg)](http://35.192.60.23/PyTorchLightning/pytorch-lightning) |
|
||||
| Linux py3.6 [TPU] | - | - | ![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg) |
|
||||
| Linux py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
|
||||
| OSX py3.6 / py3.7 | - | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |
|
||||
| Windows py3.6 / py3.7 / py3.8 | [![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) |[![CI testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20testing/badge.svg?event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions?query=workflow%3A%22CI+testing%22) | - |
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
FROM google/cloud-sdk:slim

# Build args.
ARG GITHUB_REF=refs/heads/master
ARG TEST_IMAGE=0

# This Dockerfile installs pytorch/xla 3.6 wheels (matching PYTHON_VERSION
# below). 3.7 wheels are also available; see the wheel-install step.
ENV PYTHON_VERSION=3.6

RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        cmake \
        git \
        curl \
        ca-certificates

# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
    chmod +x ~/miniconda.sh && \
    ~/miniconda.sh -b && \
    rm ~/miniconda.sh

ENV PATH=/root/miniconda3/bin:$PATH

RUN conda create -y --name container python=$PYTHON_VERSION

# Run the rest of commands within the new conda env.
# Use absolute path to appease Codefactor.
SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
RUN conda install -y python=$PYTHON_VERSION mkl

# Install the nightly torch/xla wheels for this PYTHON_VERSION.
# Double quotes (not single) are required so bash expands
# ${PYTHON_VERSION/./} (e.g. 3.6 -> 36); with single quotes the literal
# placeholder would be passed to gsutil/pip and the step would fail.
RUN pip uninstall -y torch && \
    # Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m
    gsutil cp "gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" . && \
    gsutil cp "gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" . && \
    gsutil cp "gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" . && \
    pip install "torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    pip install "torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    pip install "torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    rm "torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    rm "torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    rm "torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
    apt-get install -y libomp5

# Absolute path (leading '/') so the dynamic loader resolves the conda env's
# libraries regardless of the working directory at runtime.
ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib

# Install pytorch-lightning at the current PR, plus dependencies.
RUN git clone https://github.com/PyTorchLightning/pytorch-lightning.git && \
    cd pytorch-lightning && \
    git fetch origin $GITHUB_REF:CI && \
    git checkout CI && \
    cd .. && \
    pip install ./pytorch-lightning

# If using this image for tests, install more dependencies and don't delete
# the source code where the tests live.
RUN if [ $TEST_IMAGE -eq 1 ] ; then \
        pip install -r pytorch-lightning/requirements/test.txt ; \
    else \
        rm -rf pytorch-lightning ; \
    fi

RUN conda init bash
# Smoke-test the install: fails the build if pytorch_lightning can't import.
RUN python -c "import pytorch_lightning as pl; print(pl.__version__)"

COPY docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh

ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
CMD ["bash"]
|
|
@ -0,0 +1,8 @@
|
|||
#!/bin/bash
# Container entrypoint: activate the conda env, export the TPU endpoint
# configuration expected by torch_xla, then exec the requested command.
source ~/.bashrc
echo "running docker-entrypoint.sh"
conda activate container
# Quote the expansion so an empty or multi-word value is printed verbatim
# instead of being word-split / glob-expanded.
echo "$KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS"
echo "printed TPU info"
# ${VAR:7} strips the first 7 characters — presumably a "grpc://" scheme
# prefix on the endpoint list injected by GKE; TODO(review) confirm format.
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
# Replace this shell with the container's command so signals reach it directly.
exec "$@"
|
|
@ -0,0 +1,33 @@
|
|||
local base = import 'templates/base.libsonnet';
local tpus = import 'templates/tpus.libsonnet';
local utils = import "templates/utils.libsonnet";

# One-shot Kubernetes job that runs the pytorch-lightning TPU test suite
# inside the image built by the TPU-tests GitHub Action.
local tputests = base.BaseTest {
  frameworkPrefix: 'pl',
  modelName: 'tpu-tests',
  mode: 'postsubmit',
  configMaps: [],

  timeout: 900, # 15 minutes, in seconds.

  # Image name and tag are passed from the workflow via --ext-str.
  image: std.extVar('image'),
  imageTag: std.extVar('image-tag'),

  tpuSettings+: {
    softwareVersion: 'pytorch-nightly',
  },
  accelerator: tpus.v3_8,

  command: utils.scriptCommand(
    |||
      coverage run --source=pytorch_lightning -m pytest pytorch-lightning/tests/models/test_tpu.py -v
      test_exit_code=$?
      # printf, not plain echo: echo without -e would print the literal
      # backslash-n instead of the intended blank lines around the marker.
      printf '\n||| END PYTEST LOGS |||\n\n'
      coverage xml
      # Strip tabs so the XML survives log retrieval; read the file directly
      # instead of a useless `cat | tr` pipeline.
      tr -d '\t' < coverage.xml
      # Propagate the pytest result as the job's exit status.
      test $test_exit_code -eq 0
    |||
  ),
};

tputests.oneshotJob
|
Loading…
Reference in New Issue