freeze PT 1.5 for Horovod issue (#2744)

* freeze pt 1.5

* torchtext

* Apply suggestions from code review

Co-authored-by: Peter Yu <2057325+yukw777@users.noreply.github.com>

* timeout

Co-authored-by: Peter Yu <2057325+yukw777@users.noreply.github.com>
This commit is contained in:
Jirka Borovec 2020-07-28 21:52:23 +02:00 committed by GitHub
parent bc9348f2c4
commit 40337cce58
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 4 additions and 4 deletions

View File

@ -66,7 +66,7 @@ references:
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
echo "Done waiting. Job status code: $status_code" && \
# Allow time for logs to flush.
sleep 10 && \
sleep 30 && \
echo "JOB_NAME: $job_name" && \
gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \

View File

@ -94,7 +94,7 @@ jobs:
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
echo "Done waiting. Job status code: $status_code" && \
# Allow time for logs to flush.
sleep 10 && \
sleep 30 && \
echo "JOB_NAME: $job_name" && \
echo "GKE_CLUSTER: $GKE_CLUSTER" && \
echo "GKE_ZONE: $GKE_ZONE" && \

View File

@ -1,7 +1,7 @@
# the default package dependencies
numpy>=1.16.4
torch>=1.3
torch>=1.3, <1.6 # TODO: temporary freeze for Horovod incompatibility with 1.6
tensorboard>=1.14
future>=0.17.1 # required for builtins in setup.py
# pyyaml>=3.13

View File

@ -11,4 +11,4 @@ horovod>=0.19.1
omegaconf>=2.0.0
# scipy>=0.13.3
scikit-learn>=0.20.0
torchtext>=0.3.1
torchtext>=0.3.1, <0.7 # TODO: temporary fix for compatibility