# Builds a TPU test image, runs the test suite as a Kubernetes job on a GKE
# cluster, streams the job logs back and uploads the coverage report.
name: TPU tests

on:
  push:
    branches: [master, "release/*"]
  # TODO: temporarily disable TPU testing on pull requests until we find a way
  # to pass credentials to forked PRs
  # pull_request:
  #   branches:
  #     - master

env:
  PROJECT_ID: ${{ secrets.GKE_PROJECT }}
  GKE_CLUSTER: lightning-cluster
  GKE_ZONE: us-central1-a
  IMAGE: gcr.io/${{ secrets.GKE_PROJECT }}/tpu-testing-image
  # Maximum number of job-status polls, and the pause (seconds) between polls.
  MAX_CHECKS: "360"
  CHECK_SLEEP: "5"

jobs:
  setup-build-publish-deploy:
    name: tpu-testing-job
    runs-on: ubuntu-20.04
    strategy:
      fail-fast: false
      matrix:
        # Quoted so the versions stay strings (e.g. "3.10" must not become 3.1).
        python-version: ["3.6", "3.7"]
        xla-version: ["1.6", "1.7"]
    # Timeout: https://stackoverflow.com/a/59076067/4521646
    timeout-minutes: 50

    steps:
      # The tag includes both matrix axes so that jobs which share a Python
      # version but differ in XLA version never push/delete the same image tag.
      - name: Set IMAGETAG
        run: echo "IMAGETAG=$(date +%s)_${{ matrix.python-version }}_${{ matrix.xla-version }}" >> $GITHUB_ENV

      - name: Install Go
        uses: actions/setup-go@v2
        with:
          go-version: 1.14.x

      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: "3.7"

      - name: Checkout Pytorch Lightning
        uses: actions/checkout@v2
        with:
          repository: PyTorchLightning/pytorch-lightning
          # Empty on push events; only set when triggered by a pull request.
          ref: ${{ github.event.pull_request.head.sha }}

      - name: Checkout ml-testing-accelerators
        uses: actions/checkout@v2
        with:
          repository: GoogleCloudPlatform/ml-testing-accelerators
          path: ml-testing-accelerators
          ref: 5e88ac24f631c27045e62f0e8d5dfcf34e425e25

      # NOTE(review): this action is referenced by the mutable `master` ref;
      # consider pinning it to a release tag or commit SHA.
      - name: Setup gcloud CLI
        uses: GoogleCloudPlatform/github-actions/setup-gcloud@master
        with:
          version: '290.0.1'
          service_account_key: ${{ secrets.GKE_SA_KEY_BASE64 }}
          project_id: ${{ secrets.GKE_PROJECT }}
          export_default_credentials: true

      # Configure Docker to use the gcloud command-line tool as a credential helper for authentication.
      - name: Configure Docker
        run: |-
          gcloud --quiet auth configure-docker
        shell: bash

      - name: Build and Push Docker Image
        env:
          PYTHON_VER: ${{ matrix.python-version }}
          XLA_VER: ${{ matrix.xla-version }}
        run: |
          #cd dockers/tpu-tests
          docker build --tag "$IMAGE:$IMAGETAG" -f ./dockers/tpu-tests/Dockerfile --build-arg "PYTHON_VERSION=$PYTHON_VER" --build-arg "PYTORCH_VERSION=$XLA_VER" .
          docker push "$IMAGE:$IMAGETAG"
        shell: bash

      - name: Install jsonnet
        run: |-
          go get github.com/google/go-jsonnet/cmd/jsonnet
        shell: bash

      # Get the GKE credentials so we can deploy to the cluster
      # Use either zone or region depending on cluster setup.
      - run: |-
          gcloud container clusters get-credentials "$GKE_CLUSTER" --zone "$GKE_ZONE"
        shell: bash

      - name: Deploy the job on the kubernetes cluster
        env:
          XLA_VER: ${{ matrix.xla-version }}
        run: |-
          python -c "fname = 'dockers/tpu-tests/tpu_test_cases.jsonnet' ; ttt = open(fname).read().replace('pytorch-VERSION', 'pytorch-$XLA_VER') ; open(fname, 'w').write(ttt)"
          job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet --ext-str image=$IMAGE --ext-str image-tag=$IMAGETAG | kubectl create -f -) && \
          job_name=${job_name#job.batch/} && \
          job_name=${job_name% created} && \
          echo "Waiting on kubernetes job: $job_name in cluster: $GKE_CLUSTER" && \
          i=0 && \
          # MAX_CHECKS checks spaced CHECK_SLEEP seconds apart (360 x 5s = 1800s total).
          status_code=2 && \
          # Check on the job periodically. Set the status code depending on what
          # happened to the job in Kubernetes. If we try MAX_CHECKS times and
          # still the job hasn't finished, give up and return the starting
          # non-zero status code.
          printf "Waiting for job to finish: " && \
          while [ $i -lt $MAX_CHECKS ]; do
            ((i++));
            if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then
              status_code=1 && break;
            elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then
              status_code=0 && break;
            else
              printf "." ;
            fi;
            sleep $CHECK_SLEEP;
          done && \
          echo "Done waiting. Job status code: $status_code" && \
          pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
          echo "GKE pod name: $pod_name" && \
          kubectl logs -f $pod_name --container=train > /tmp/full_output.txt
          # Split the pod log at the XML declaration: xx00 receives the test
          # logs and xx01 the coverage report consumed by the "Statistics" step.
          # NOTE(review): the marker is assumed to be the XML header printed by
          # the test container's coverage report - confirm against its output.
          if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
          # First portion is the test logs. Print these to Github Action stdout.
          cat xx00 && \
          echo "Done with log retrieval attempt." && \
          gcloud container images delete "$IMAGE:$IMAGETAG" --force-delete-tags && \
          exit $status_code
        shell: bash

      # xx01 only exists when the log contained a coverage report (see above).
      - name: Statistics
        if: success()
        run: |
          mv ./xx01 coverage
          # TODO: add human readable report
          cat coverage
          # sudo pip install pycobertura
          # pycobertura show coverage.xml

      - name: Upload coverage results
        uses: actions/upload-artifact@v2
        with:
          name: coverage-TPU
          path: coverage

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v1
        # see: https://github.com/actions/toolkit/issues/399
        continue-on-error: true
        if: always()
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage
          flags: tpu,pytest
          name: TPU-coverage
          fail_ci_if_error: true