From 40337cce583c085bd1ccd86a689dc925b16b0c97 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 28 Jul 2020 21:52:23 +0200 Subject: [PATCH] freeze PT 1.5 for Horovod issue (#2744) * freeze pt 1.5 * torchtext * Apply suggestions from code review Co-authored-by: Peter Yu <2057325+yukw777@users.noreply.github.com> * timeout Co-authored-by: Peter Yu <2057325+yukw777@users.noreply.github.com> --- .circleci/config.yml | 2 +- .github/workflows/tpu-testing.yml | 2 +- requirements/base.txt | 2 +- requirements/extra.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4afe18875c..ca0e7660c6 100755 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -66,7 +66,7 @@ references: while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. 
- sleep 10 && \ + sleep 30 && \ echo "JOB_NAME: $job_name" && \ gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \ if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \ diff --git a/.github/workflows/tpu-testing.yml b/.github/workflows/tpu-testing.yml index 245e424181..30c6b1124b 100644 --- a/.github/workflows/tpu-testing.yml +++ b/.github/workflows/tpu-testing.yml @@ -94,7 +94,7 @@ jobs: while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \ echo "Done waiting. Job status code: $status_code" && \ # Allow time for logs to flush. 
- sleep 10 && \ + sleep 30 && \ echo "JOB_NAME: $job_name" && \ echo "GKE_CLUSTER: $GKE_CLUSTER" && \ echo "GKE_ZONE: $GKE_ZONE" && \ diff --git a/requirements/base.txt b/requirements/base.txt index 4072df9466..8eff26906e 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,7 +1,7 @@ # the default package dependencies numpy>=1.16.4 -torch>=1.3 +torch>=1.3, <1.6 # TODO: temporary freeze for Horovod incompatibility with 1.6 tensorboard>=1.14 future>=0.17.1 # required for builtins in setup.py # pyyaml>=3.13 diff --git a/requirements/extra.txt b/requirements/extra.txt index e245a9512d..113c35b466 100644 --- a/requirements/extra.txt +++ b/requirements/extra.txt @@ -11,4 +11,4 @@ horovod>=0.19.1 omegaconf>=2.0.0 # scipy>=0.13.3 scikit-learn>=0.20.0 -torchtext>=0.3.1 +torchtext>=0.3.1, <0.7 # TODO: temporary fix for compatibility