Merge pull request #4575 from BOINC/dpa_backoff

client: fix overly aggressive project-wide file transfer backoff policy.
This commit is contained in:
David Anderson 2021-12-07 19:39:40 -08:00 committed by GitHub
commit 6a8fe25033
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 37 additions and 15 deletions

View File

@ -70,24 +70,37 @@ bool FILE_XFER_BACKOFF::ok_to_transfer() {
return (dt <= 0);
}
// A transfer in this direction (see is_upload) has failed.
// Count the failure and, once FILE_XFER_FAILURE_LIMIT consecutive
// failures have accumulated, back off transfers project-wide.
//
// p: the project; used here only to tag the debug log message.
//
void FILE_XFER_BACKOFF::file_xfer_failed(PROJECT* p) {
    // If we're already backed off, ignore this failure.
    // If we start several transfers at once
    // (say, N output files of a job) and they all fail,
    // we don't want to back off N times, which could be hours.
    //
    if (gstate.now < next_xfer_time) {
        return;
    }

    file_xfer_failures++;

    // Below the limit: clear any project-wide delay and stop here,
    // so that the backoff code below is not reached.
    //
    if (file_xfer_failures < FILE_XFER_FAILURE_LIMIT) {
        next_xfer_time = 0;
        return;
    }

    // At or past the limit: delay all transfers in this direction.
    // The delay is derived from the failure count and bounded by the
    // pers_retry_delay_min/max values passed to the helper.
    //
    double backoff = calculate_exponential_backoff(
        file_xfer_failures,
        gstate.pers_retry_delay_min,
        gstate.pers_retry_delay_max
    );
    if (log_flags.file_xfer_debug) {
        msg_printf(p, MSG_INFO,
            "[file_xfer] project-wide %s delay for %f sec",
            is_upload?"upload":"download",
            backoff
        );
    }
    next_xfer_time = gstate.now + backoff;
}
void FILE_XFER_BACKOFF::file_xfer_succeeded() {

View File

@ -188,18 +188,25 @@ struct FILE_REF {
int write(MIOFILE&);
};
// File xfer backoff state for a project and direction (up/down).
// If we get FILE_XFER_FAILURE_LIMIT (3) or more consecutive failures,
// we switch from a per-file to a project-wide backoff policy
// (separately for the up/down directions).
// E.g. if we have 100 files to upload and the first 3 fail,
// we don't try the other 97 immediately.
//
// NOTE: this refers to transient failures, not permanent.
//
#define FILE_XFER_FAILURE_LIMIT 3
struct FILE_XFER_BACKOFF {
int file_xfer_failures;
// count of consecutive failures
double next_xfer_time;
// when to start trying again
bool is_upload;
bool ok_to_transfer();
void file_xfer_failed(PROJECT*);
void file_xfer_succeeded();

View File

@ -125,6 +125,8 @@ void PROJECT::init() {
gpu_ec = 0;
gpu_time = 0;
app_configs.clear();
upload_backoff.is_upload = true;
download_backoff.is_upload = false;
#ifdef SIM
idle_time = 0;