client: fix overly aggressive project-wide file transfer backoff policy.

Old: if we get more than 3 upload or download failures for a project,
do a backoff for each failure.

Problem: if we start a bunch of transfers (say the N output files of a job)
and they all fail, we back off for too long, e.g.:
try N uploads
back off 2^N minutes
try N uploads
back off 4^N minutes
...

New: on a failure, ignore it if we're already backed off.
So the behavior should be something like:
try N uploads
back off 1 minute
try N uploads
back off 2 minutes
...
This commit is contained in:
David Anderson 2021-11-09 12:31:38 -08:00
parent a80d131a8c
commit 4046298a99
3 changed files with 37 additions and 15 deletions

View File

@ -70,24 +70,37 @@ bool FILE_XFER_BACKOFF::ok_to_transfer() {
return (dt <= 0); return (dt <= 0);
} }
// A transfer has failed.
// Back off transfers (project-wide) if needed.
//
void FILE_XFER_BACKOFF::file_xfer_failed(PROJECT* p) { void FILE_XFER_BACKOFF::file_xfer_failed(PROJECT* p) {
// If we're already backed off, ignore this failure.
// If we start several transfers at once
// (say, N output files of a job) and they all fail,
// we don't want to back off N times, which could be hours.
//
if (gstate.now < next_xfer_time) {
return;
}
file_xfer_failures++; file_xfer_failures++;
if (file_xfer_failures < FILE_XFER_FAILURE_LIMIT) { if (file_xfer_failures < FILE_XFER_FAILURE_LIMIT) {
next_xfer_time = 0; next_xfer_time = 0;
} else { return;
double backoff = calculate_exponential_backoff(
file_xfer_failures,
gstate.pers_retry_delay_min,
gstate.pers_retry_delay_max
);
if (log_flags.file_xfer_debug) {
msg_printf(p, MSG_INFO,
"[file_xfer] project-wide xfer delay for %f sec",
backoff
);
}
next_xfer_time = gstate.now + backoff;
} }
double backoff = calculate_exponential_backoff(
file_xfer_failures,
gstate.pers_retry_delay_min,
gstate.pers_retry_delay_max
);
if (log_flags.file_xfer_debug) {
msg_printf(p, MSG_INFO,
"[file_xfer] project-wide %s delay for %f sec",
is_upload?"upload":"download",
backoff
);
}
next_xfer_time = gstate.now + backoff;
} }
void FILE_XFER_BACKOFF::file_xfer_succeeded() { void FILE_XFER_BACKOFF::file_xfer_succeeded() {

View File

@ -188,18 +188,25 @@ struct FILE_REF {
int write(MIOFILE&); int write(MIOFILE&);
}; };
// file xfer backoff state for a project and direction (up/down) // File xfer backoff state for a project and direction (up/down).
// if file_xfer_failures exceeds FILE_XFER_FAILURE_LIMIT, // If we get more than FILE_XFER_FAILURE_LIMIT (3) consecutive failures,
// we switch from a per-file to a project-wide backoff policy // we switch from a per-file to a project-wide backoff policy
// (separately for the up/down directions) // (separately for the up/down directions)
// E.g. if we have 100 files to upload and the first 3 fail,
// we don't try the other 97 immediately.
//
// NOTE: this refers to transient failures, not permanent. // NOTE: this refers to transient failures, not permanent.
// //
#define FILE_XFER_FAILURE_LIMIT 3 #define FILE_XFER_FAILURE_LIMIT 3
struct FILE_XFER_BACKOFF { struct FILE_XFER_BACKOFF {
int file_xfer_failures; int file_xfer_failures;
// count of consecutive failures // count of consecutive failures
double next_xfer_time; double next_xfer_time;
// when to start trying again // when to start trying again
bool is_upload;
bool ok_to_transfer(); bool ok_to_transfer();
void file_xfer_failed(PROJECT*); void file_xfer_failed(PROJECT*);
void file_xfer_succeeded(); void file_xfer_succeeded();

View File

@ -125,6 +125,8 @@ void PROJECT::init() {
gpu_ec = 0; gpu_ec = 0;
gpu_time = 0; gpu_time = 0;
app_configs.clear(); app_configs.clear();
upload_backoff.is_upload = true;
download_backoff.is_upload = false;
#ifdef SIM #ifdef SIM
idle_time = 0; idle_time = 0;