From 4046298a99afac2f99d20aee272aa29102457b38 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 9 Nov 2021 12:31:38 -0800 Subject: [PATCH] client: fix overly aggressive project-wide file transfer backoff policy. Old: if we get more than 3 upload or download failures for a project, do a backoff for each failure. Problem: if we start a bunch of transfers (say the N output files of a job) and they all fail, we back off for too long, e.g.: try N uploads back off 2^N minutes try N uploads back off 4^N minutes ... New: on a failure, ignore it if we're already backed off. So the behavior should be something like: try N uploads back off 1 minute try N uploads back off 2 minutes ... --- client/client_types.cpp | 39 ++++++++++++++++++++++++++------------- client/client_types.h | 11 +++++++++-- client/project.cpp | 2 ++ 3 files changed, 37 insertions(+), 15 deletions(-) diff --git a/client/client_types.cpp b/client/client_types.cpp index 7a8696e7d5..2977ef7863 100644 --- a/client/client_types.cpp +++ b/client/client_types.cpp @@ -70,24 +70,37 @@ bool FILE_XFER_BACKOFF::ok_to_transfer() { return (dt <= 0); } +// A transfer has failed. +// Back off transfers (project-wide) if needed. +// void FILE_XFER_BACKOFF::file_xfer_failed(PROJECT* p) { + // If we're already backed off, ignore this failure. + // If we start several transfers at once + // (say, N output files of a job) and they all fail, + // we don't want to back off N times, which could be hours. + // + if (gstate.now < next_xfer_time) { + return; + } + file_xfer_failures++; if (file_xfer_failures < FILE_XFER_FAILURE_LIMIT) { next_xfer_time = 0; - } else { - double backoff = calculate_exponential_backoff( - file_xfer_failures, - gstate.pers_retry_delay_min, - gstate.pers_retry_delay_max - ); - if (log_flags.file_xfer_debug) { - msg_printf(p, MSG_INFO, - "[file_xfer] project-wide xfer delay for %f sec", - backoff - ); - } - next_xfer_time = gstate.now + backoff; + return; } + double backoff = calculate_exponential_backoff( + file_xfer_failures, + gstate.pers_retry_delay_min, + gstate.pers_retry_delay_max + ); + if (log_flags.file_xfer_debug) { + msg_printf(p, MSG_INFO, + "[file_xfer] project-wide %s delay for %f sec", + is_upload?"upload":"download", + backoff + ); + } + next_xfer_time = gstate.now + backoff; } void FILE_XFER_BACKOFF::file_xfer_succeeded() { diff --git a/client/client_types.h b/client/client_types.h index 1f224e798c..fa24575c75 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -188,18 +188,25 @@ struct FILE_REF { int write(MIOFILE&); }; -// file xfer backoff state for a project and direction (up/down) -// if file_xfer_failures exceeds FILE_XFER_FAILURE_LIMIT, +// File xfer backoff state for a project and direction (up/down). +// If we get more than FILE_XFER_FAILURE_LIMIT (3) consecutive failures, // we switch from a per-file to a project-wide backoff policy // (separately for the up/down directions) +// E.g. if we have 100 files to upload and the first 3 fail, +// we don't try the other 97 immediately. +// // NOTE: this refers to transient failures, not permanent. // + #define FILE_XFER_FAILURE_LIMIT 3 + struct FILE_XFER_BACKOFF { int file_xfer_failures; // count of consecutive failures double next_xfer_time; // when to start trying again + bool is_upload; + bool ok_to_transfer(); void file_xfer_failed(PROJECT*); void file_xfer_succeeded(); diff --git a/client/project.cpp b/client/project.cpp index 20f0085cdf..493fe92d6c 100644 --- a/client/project.cpp +++ b/client/project.cpp @@ -125,6 +125,8 @@ void PROJECT::init() { gpu_ec = 0; gpu_time = 0; app_configs.clear(); + upload_backoff.is_upload = true; + download_backoff.is_upload = false; #ifdef SIM idle_time = 0;