diff --git a/client/pers_file_xfer.C b/client/pers_file_xfer.C index 6a316a27f0..5b95f0ae86 100644 --- a/client/pers_file_xfer.C +++ b/client/pers_file_xfer.C @@ -225,7 +225,7 @@ int PERS_FILE_XFER::retry_and_backoff(unsigned int cur_time) { } if (log_flags.file_xfer_debug) { printf( - "exponential back off is %f, current_time is %s\n", exp_backoff,asctime( newtime ) + "exponential back off is %d, current_time is %s\n", (int) exp_backoff,asctime( newtime ) ); } return 0; diff --git a/client/pers_file_xfer.h b/client/pers_file_xfer.h index 22c42e2e5c..5271768699 100644 --- a/client/pers_file_xfer.h +++ b/client/pers_file_xfer.h @@ -37,7 +37,7 @@ #define PERS_GIVEUP (60*60*24*7*2) // 2 weeks */ #define PERS_RETRY_DELAY_MIN 1 #define PERS_RETRY_DELAY_MAX 30 -#define PERS_GIVEUP 200 +#define PERS_GIVEUP 50 // give up on xfer if this time elapses since last byte xferred class PERS_FILE_XFER { diff --git a/client/scheduler_op.C b/client/scheduler_op.C index a924c4d0ba..8a53777acf 100644 --- a/client/scheduler_op.C +++ b/client/scheduler_op.C @@ -16,7 +16,8 @@ // // Contributor(s): // - +#include +#include #include #include @@ -53,7 +54,7 @@ int SCHEDULER_OP::init_get_work() { else { project = gstate.next_project_master_pending(); if (project) { - if (retval=init_master_fetch(project)) { + if ((retval=init_master_fetch(project))) { sprintf(err_msg, "init_master_fetch failed, error %d\n", retval); backoff(project, err_msg); } @@ -103,8 +104,8 @@ int SCHEDULER_OP::init_op_project(double ns) { // TODO: integrate with other backoff sources // int SCHEDULER_OP::set_min_rpc_time(PROJECT* p) { - int x = RETRY_BASE_PERIOD; - int i; + double x; + int exp_backoff; int n = p->nrpc_failures; if (n > RETRY_CAP) n = RETRY_CAP; @@ -114,17 +115,22 @@ int SCHEDULER_OP::set_min_rpc_time(PROJECT* p) { if (log_flags.sched_op_debug) { printf("we've hit the limit on master_url fetches\n"); } - p->min_rpc_time = time(0) + MASTER_FETCH_INTERVAL; - x = MASTER_FETCH_INTERVAL; + //backoff e^MASTER_FETCH_INTERVAL * random + exp_backoff = (int) exp(((double)rand()/(double)RAND_MAX)*MASTER_FETCH_INTERVAL); + p->min_rpc_time = time(0) + exp_backoff; + } else { - for (i=0; imin_rpc_time = time(0) + x; + //backoff RETRY_BASE_PERIOD * e^nrpc_failures * random + x = RETRY_BASE_PERIOD * exp(((double)rand()/(double)RAND_MAX) * n); + exp_backoff = (int)max(PERS_RETRY_DELAY_MIN,min(PERS_RETRY_DELAY_MAX,(int) x)); + p->min_rpc_time = time(0) + exp_backoff; + } if (log_flags.sched_op_debug) { printf( "setting min RPC time for %s to %d seconds from now\n", - p->master_url, x + p->master_url, exp_backoff ); } return 0; @@ -305,7 +311,7 @@ bool SCHEDULER_OP::poll() { } project = gstate.next_project_master_pending(); if (project) { - if (retval = init_master_fetch(project)) + if ((retval = init_master_fetch(project))) backoff(project, "Master file fetch failed\n"); } else { state = SCHEDULER_OP_STATE_IDLE; @@ -389,7 +395,7 @@ bool SCHEDULER_OP::poll() { if (scheduler_op_done) { project = gstate.next_project_master_pending(); if (project) { - if (retval = init_master_fetch(project)) { + if ((retval = init_master_fetch(project))) { if (log_flags.sched_op_debug) { printf("Scheduler op: init_master_fetch failed.\n" ); } diff --git a/client/scheduler_op.h b/client/scheduler_op.h index 156ed8e295..b06193c005 100644 --- a/client/scheduler_op.h +++ b/client/scheduler_op.h @@ -36,15 +36,20 @@ // constants related to scheduler RPC policy -#define MASTER_FETCH_PERIOD 3 +#define MASTER_FETCH_PERIOD 10 // fetch and parse master URL if nrpc_failures is a multiple of this #define RETRY_BASE_PERIOD 1 // after failure, back off 2^nrpc_failures times this times random -#define RETRY_CAP 3 +#define RETRY_CAP 10 // cap on nrpc_failures in the above formula #define MASTER_FETCH_RETRY_CAP 3 //cap on how many times we will contact master_url before moving into a state in which we will not exponentially backoff anymore but rather contact the master URL at the frequency below #define MASTER_FETCH_INTERVAL 5 +//This is the Max on the time to wait after we've contacted the Master URL MASTER_FETCH_RETRY_CAP times. + +//The next two constants are used to bound RPC exponential waiting. +#define PERS_RETRY_DELAY_MIN 1 +#define PERS_RETRY_DELAY_MAX 30 #define SCHEDULER_OP_STATE_IDLE 0 #define SCHEDULER_OP_STATE_GET_MASTER 1 diff --git a/test/test.inc b/test/test.inc index c280b51ec4..3bb4c94983 100644 --- a/test/test.inc +++ b/test/test.inc @@ -348,15 +348,14 @@ class Project { } //moves the download directory to temp. This is used to test exponential backoff on the client side. - function delete_downloaddir($time = null,$download_dir_number = null) + function delete_downloaddir($time = null,$download_dir_num = null) { if($time != null) { echo "\nsleeping for $time seconds"; PassThru("sleep $time"); } - - PassThru("mv $this->project_dir/download$download_dir_num $this->project_dir/download_moved$download_dir_num"); + PassThru("mv $this->project_dir/download$download_dir_num $this->project_dir/download_moved$download_dir_num"); } diff --git a/test/test_download_backoff.php b/test/test_download_backoff.php index de8efad84c..4f45a585f2 100644 --- a/test/test_download_backoff.php +++ b/test/test_download_backoff.php @@ -52,6 +52,6 @@ $result->stderr_out = "APP: upper_case: starting, argc 1"; $result->exit_status = 0; $project->check_results(2, $result); - $project->compare_file("uc_wu_0_0", "uc_correct_output"); - $project->compare_file("uc_wu_1_0", "uc_correct_output"); + $project->compare_file("uc_multiple_download_wu_0_0", "uc_correct_output"); + $project->compare_file("uc_multiple_download_wu_1_0", "uc_correct_output"); ?> diff --git a/test/test_masterurl_failure.php b/test/test_masterurl_failure.php index 8a33f4af87..5a5c9198a1 100644 --- a/test/test_masterurl_failure.php +++ b/test/test_masterurl_failure.php @@ -2,7 +2,6 @@ delete_scheduler(); $pid = $host->run_asynch("-exit_when_idle"); //reinstall scheduler after 500 seconds - $project->reinstall_scheduler(); + $project->reinstall_scheduler(100); $status = 0; //wait until the host has stopped running pcntl_waitpid($pid,$status,0);