freeze PT 1.5 for Horovod issue (#2744)
* freeze pt 1.5
* torchtext
* Apply suggestions from code review

Co-authored-by: Peter Yu <2057325+yukw777@users.noreply.github.com>

* timeout

Co-authored-by: Peter Yu <2057325+yukw777@users.noreply.github.com>
This commit is contained in:
parent
bc9348f2c4
commit
40337cce58
|
@ -66,7 +66,7 @@ references:
|
|||
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \
|
||||
echo "Done waiting. Job status code: $status_code" && \
|
||||
# Allow time for logs to flush.
|
||||
sleep 10 && \
|
||||
sleep 30 && \
|
||||
echo "JOB_NAME: $job_name" && \
|
||||
gcloud logging read "resource.type=k8s_container resource.labels.project_id=$GOOGLE_PROJECT_ID resource.labels.location=$GOOGLE_COMPUTE_ZONE resource.labels.cluster_name=$GKE_CLUSTER resource.labels.namespace_name=default resource.labels.pod_name:$job_name" --limit 10000000 --order asc --format 'value(textPayload)' --project=$GOOGLE_PROJECT_ID > /tmp/full_output.txt && \
|
||||
if grep -q '<?xml version="1.0" ?>' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '/<?xml version="1.0" ?>/'; else mv /tmp/full_output.txt xx00; fi && \
|
||||
|
|
|
@ -94,7 +94,7 @@ jobs:
|
|||
while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "." ; fi; sleep $CHECK_SPEEP; done && \
|
||||
echo "Done waiting. Job status code: $status_code" && \
|
||||
# Allow time for logs to flush.
|
||||
sleep 10 && \
|
||||
sleep 30 && \
|
||||
echo "JOB_NAME: $job_name" && \
|
||||
echo "GKE_CLUSTER: $GKE_CLUSTER" && \
|
||||
echo "GKE_ZONE: $GKE_ZONE" && \
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# the default package dependencies
|
||||
|
||||
numpy>=1.16.4
|
||||
torch>=1.3
|
||||
torch>=1.3, <1.6 # TODO: temporary freeze for Horovod incompatibility with 1.6
|
||||
tensorboard>=1.14
|
||||
future>=0.17.1 # required for builtins in setup.py
|
||||
# pyyaml>=3.13
|
||||
|
|
|
@ -11,4 +11,4 @@ horovod>=0.19.1
|
|||
omegaconf>=2.0.0
|
||||
# scipy>=0.13.3
|
||||
scikit-learn>=0.20.0
|
||||
torchtext>=0.3.1
|
||||
torchtext>=0.3.1, <0.7 # TODO: temporary fix for compatibility
|
||||
|
|
Loading…
Reference in New Issue