From 580a5bd1dffed6ee5cda55ebda2ee4eb1b73fc45 Mon Sep 17 00:00:00 2001
From: zcain117
Date: Tue, 11 Aug 2020 16:30:56 -0700
Subject: [PATCH] Use kubectl to get logs from TPU CI instead of gcloud logging. (#2918)

* Use kubectl to get logs from TPU CI instead of gcloud logging.

* Update Github Action to read logs from kubectl rather than gcloud logging.
---
Review notes (text between the "---" line above and the first "diff --git"
line is commentary; it is not part of the commit message):

* Layout: this patch was restored from a copy whose newlines had been
  collapsed. Line boundaries inside the hunks follow the hunk headers
  (-63,10 +63,9 and -93,12 +93,9). The leading indentation of the hunk
  lines is reconstructed -- TODO confirm against the target files before
  applying.
* Pod lookup: the earlier filter `awk 'match($0,!/NAME/) {print $1}'`
  evaluates `!/NAME/` to 0 or 1 and then uses that digit as the regex, so
  the header row was skipped only because it contains no "0" and the pod
  row was printed only because it contains a "1". The pod name is now
  requested directly with `-o jsonpath={.items[0].metadata.name}`, which
  needs no header filtering and fails the step when no pod matches.
* Chaining: `kubectl logs` is followed by `&& \` again, like the removed
  `gcloud logging read` line was, so the log-splitting step no longer runs
  after an earlier step in the chain has failed.
* Quoting: `$job_name` and `$pod_name` are quoted and the backtick
  substitution is replaced by `$(...)`.
* NOTE(review): the empty patterns in the `grep -q ''` / `csplit ... '//'`
  context line look like a marker string that was lost in transit --
  confirm against the target files.
* NOTE(review): the post-image blob ids on the "index" lines predate the
  edits described above.

 .circleci/config.yml              | 7 +++----
 .github/workflows/tpu-testing.yml | 9 +++------
 2 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 591cf88d42..eb44da7d8d 100755
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -63,10 +63,9 @@ references:
         printf "Waiting for job to finish: " && \
         while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
         echo "Done waiting. Job status code: $status_code" && \
-        # Allow time for logs to flush.
-        sleep 30 && \
-        echo "JOB_NAME: $job_name" && \
-        gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
+        pod_name=$(kubectl get pods -l "controller-uid=$(kubectl get job "$job_name" -o 'jsonpath={.metadata.labels.controller-uid}')" -o 'jsonpath={.items[0].metadata.name}') && \
+        echo "GKE pod name: $pod_name" && \
+        kubectl logs -f "$pod_name" --container=train > /tmp/full_output.txt && \
         if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \
         # First portion is the test logs. Print these to Github Action stdout.
         cat xx00 && \
diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml
index 30c6b1124b..f8cea13f35 100644
--- a/.github/workflows/tpu-testing.yml
+++ b/.github/workflows/tpu-testing.yml
@@ -93,12 +93,9 @@ jobs:
           printf "Waiting for job to finish: " && \
           while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
           echo "Done waiting. Job status code: $status_code" && \
-          # Allow time for logs to flush.
-          sleep 30 && \
-          echo "JOB_NAME: $job_name" && \
-          echo "GKE_CLUSTER: $GKE_CLUSTER" && \
-          echo "GKE_ZONE: $GKE_ZONE" && \
-          gcloud logging read "resource.type=k8s_container resource.labels.project_id=$PROJECT_ID resource.labels.location=$GKE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$PROJECT_ID > /tmp/full_output.txt && \
+          pod_name=$(kubectl get pods -l "controller-uid=$(kubectl get job "$job_name" -o 'jsonpath={.metadata.labels.controller-uid}')" -o 'jsonpath={.items[0].metadata.name}') && \
+          echo "GKE pod name: $pod_name" && \
+          kubectl logs -f "$pod_name" --container=train > /tmp/full_output.txt && \
           if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \
           # First portion is the test logs. Print these to Github Action stdout.
           cat xx00 && \