mirror of https://github.com/BOINC/boinc.git
client: fix overly aggressive project-wide file transfer backoff policy.
Old: if we get more than 3 upload or download failures for a project, do a backoff for each failure.
Problem: if we start a bunch of transfers (say, the N output files of a job) and they all fail, we back off for too long, e.g.:
  - try N uploads
  - back off 2^N minutes
  - try N uploads
  - back off 4^N minutes
  - ...
New: on a failure, ignore it if we're already backed off. So the behavior should be something like:
  - try N uploads
  - back off 1 minute
  - try N uploads
  - back off 2 minutes
  - ...
This commit is contained in:
parent
a80d131a8c
commit
4046298a99
|
@ -70,24 +70,37 @@ bool FILE_XFER_BACKOFF::ok_to_transfer() {
|
||||||
return (dt <= 0);
|
return (dt <= 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// A transfer has failed.
|
||||||
|
// Back off transfers (project-wide) if needed.
|
||||||
|
//
|
||||||
void FILE_XFER_BACKOFF::file_xfer_failed(PROJECT* p) {
|
void FILE_XFER_BACKOFF::file_xfer_failed(PROJECT* p) {
|
||||||
|
// If we're already backed off, ignore this failure.
|
||||||
|
// If we start several transfers at once
|
||||||
|
// (say, N output files of a job) and they all fail,
|
||||||
|
// we don't want to back off N times, which could be hours.
|
||||||
|
//
|
||||||
|
if (gstate.now < next_xfer_time) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
file_xfer_failures++;
|
file_xfer_failures++;
|
||||||
if (file_xfer_failures < FILE_XFER_FAILURE_LIMIT) {
|
if (file_xfer_failures < FILE_XFER_FAILURE_LIMIT) {
|
||||||
next_xfer_time = 0;
|
next_xfer_time = 0;
|
||||||
} else {
|
return;
|
||||||
double backoff = calculate_exponential_backoff(
|
|
||||||
file_xfer_failures,
|
|
||||||
gstate.pers_retry_delay_min,
|
|
||||||
gstate.pers_retry_delay_max
|
|
||||||
);
|
|
||||||
if (log_flags.file_xfer_debug) {
|
|
||||||
msg_printf(p, MSG_INFO,
|
|
||||||
"[file_xfer] project-wide xfer delay for %f sec",
|
|
||||||
backoff
|
|
||||||
);
|
|
||||||
}
|
|
||||||
next_xfer_time = gstate.now + backoff;
|
|
||||||
}
|
}
|
||||||
|
double backoff = calculate_exponential_backoff(
|
||||||
|
file_xfer_failures,
|
||||||
|
gstate.pers_retry_delay_min,
|
||||||
|
gstate.pers_retry_delay_max
|
||||||
|
);
|
||||||
|
if (log_flags.file_xfer_debug) {
|
||||||
|
msg_printf(p, MSG_INFO,
|
||||||
|
"[file_xfer] project-wide %s delay for %f sec",
|
||||||
|
is_upload?"upload":"download",
|
||||||
|
backoff
|
||||||
|
);
|
||||||
|
}
|
||||||
|
next_xfer_time = gstate.now + backoff;
|
||||||
}
|
}
|
||||||
|
|
||||||
void FILE_XFER_BACKOFF::file_xfer_succeeded() {
|
void FILE_XFER_BACKOFF::file_xfer_succeeded() {
|
||||||
|
|
|
@ -188,18 +188,25 @@ struct FILE_REF {
|
||||||
int write(MIOFILE&);
|
int write(MIOFILE&);
|
||||||
};
|
};
|
||||||
|
|
||||||
// file xfer backoff state for a project and direction (up/down)
|
// File xfer backoff state for a project and direction (up/down).
|
||||||
// if file_xfer_failures exceeds FILE_XFER_FAILURE_LIMIT,
|
// If we get more than FILE_XFER_FAILURE_LIMIT (3) consecutive failures,
|
||||||
// we switch from a per-file to a project-wide backoff policy
|
// we switch from a per-file to a project-wide backoff policy
|
||||||
// (separately for the up/down directions)
|
// (separately for the up/down directions)
|
||||||
|
// E.g. if we have 100 files to upload and the first 3 fail,
|
||||||
|
// we don't try the other 97 immediately.
|
||||||
|
//
|
||||||
// NOTE: this refers to transient failures, not permanent.
|
// NOTE: this refers to transient failures, not permanent.
|
||||||
//
|
//
|
||||||
|
|
||||||
#define FILE_XFER_FAILURE_LIMIT 3
|
#define FILE_XFER_FAILURE_LIMIT 3
|
||||||
|
|
||||||
struct FILE_XFER_BACKOFF {
|
struct FILE_XFER_BACKOFF {
|
||||||
int file_xfer_failures;
|
int file_xfer_failures;
|
||||||
// count of consecutive failures
|
// count of consecutive failures
|
||||||
double next_xfer_time;
|
double next_xfer_time;
|
||||||
// when to start trying again
|
// when to start trying again
|
||||||
|
bool is_upload;
|
||||||
|
|
||||||
bool ok_to_transfer();
|
bool ok_to_transfer();
|
||||||
void file_xfer_failed(PROJECT*);
|
void file_xfer_failed(PROJECT*);
|
||||||
void file_xfer_succeeded();
|
void file_xfer_succeeded();
|
||||||
|
|
|
@ -125,6 +125,8 @@ void PROJECT::init() {
|
||||||
gpu_ec = 0;
|
gpu_ec = 0;
|
||||||
gpu_time = 0;
|
gpu_time = 0;
|
||||||
app_configs.clear();
|
app_configs.clear();
|
||||||
|
upload_backoff.is_upload = true;
|
||||||
|
download_backoff.is_upload = false;
|
||||||
|
|
||||||
#ifdef SIM
|
#ifdef SIM
|
||||||
idle_time = 0;
|
idle_time = 0;
|
||||||
|
|
Loading…
Reference in New Issue