diff --git a/checkin_notes b/checkin_notes index d069182e4c..18d66fc93c 100644 --- a/checkin_notes +++ b/checkin_notes @@ -10725,3 +10725,19 @@ David 24 Dec 2009 client/ work_fetch.cpp + +David 24 Dec 2009 + - client: add logic to work fetch so that each project + will have enough jobs to use its share of resource instances. + This avoids situations where e.g. on a 2-CPU system + a project has 75% resource share and 1 CPU job, + and its STD increases without bound. + + Did a general cleanup of the logic for computing + work request sizes (seconds and instances). + + client/ + work_fetch.cpp,h + rr_sim.cpp + sched/ + handle_request.cpp diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp index eaf6ba0103..a0ed790fde 100644 --- a/client/rr_sim.cpp +++ b/client/rr_sim.cpp @@ -266,7 +266,9 @@ void CLIENT_STATE::rr_simulation() { p = rp->project; p->pwf.has_runnable_jobs = true; + p->cpu_pwf.nused_total += rp->avp->avg_ncpus; if (rp->uses_cuda()) { + p->cuda_pwf.nused_total += rp->avp->ncudas; p->cuda_pwf.has_runnable_jobs = true; if (cuda_work_fetch.sim_nused < coproc_cuda->count) { sim_status.activate(rp, 0); @@ -275,6 +277,7 @@ void CLIENT_STATE::rr_simulation() { cuda_work_fetch.pending.push_back(rp); } } else if (rp->uses_ati()) { + p->ati_pwf.nused_total += rp->avp->natis; p->ati_pwf.has_runnable_jobs = true; if (ati_work_fetch.sim_nused < coproc_ati->count) { sim_status.activate(rp, 0); diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp index 5dd78b4b67..c65ee4e9d5 100644 --- a/client/work_fetch.cpp +++ b/client/work_fetch.cpp @@ -104,6 +104,7 @@ void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT* p, int rsc_type) { fetchable_share = 0; has_runnable_jobs = false; sim_nused = 0; + nused_total = 0; deadlines_missed = 0; } @@ -204,10 +205,20 @@ bool RSC_PROJECT_WORK_FETCH::overworked() { // If resource is saturated for less than work_buf_total(), // get work for it from the non-overworked project with greatest LTD. #define FETCH_IF_PROJECT_STARVED 3 - // If any project is not overworked and has no runnable jobs - // (for any resource, not just this one) + // If any project is not overworked and has too few jobs + // to use its instance share, // get work from the one with greatest LTD. +static char* criterion_name(int criterion) { + switch (criterion) { + case FETCH_IF_IDLE_INSTANCE: return "idle instance"; + case FETCH_IF_MAJOR_SHORTFALL: return "major shortfall"; + case FETCH_IF_MINOR_SHORTFALL: return "minor shortfall"; + case FETCH_IF_PROJECT_STARVED: return "starved"; + } + return "unknown"; +} + // Choose the best project to ask for work for this resource, // given the specific criterion // @@ -241,7 +252,7 @@ PROJECT* RSC_WORK_FETCH::choose_project(int criterion) { break; case FETCH_IF_PROJECT_STARVED: if (rpwf.overworked()) continue; - if (p->pwf.has_runnable_jobs) continue; + if (rpwf.nused_total >= ninstances*rpwf.fetchable_share) continue; break; } if (pbest) { @@ -254,67 +265,68 @@ PROJECT* RSC_WORK_FETCH::choose_project(int criterion) { } if (!pbest) return NULL; + if (log_flags.work_fetch_debug) { + msg_printf(pbest, MSG_INFO, + "chosen: %s %s", criterion_name(criterion), rsc_name(rsc_type) + ); + } + // decide how much work to request from each resource // work_fetch.clear_request(); switch (criterion) { case FETCH_IF_IDLE_INSTANCE: - if (log_flags.work_fetch_debug) { - msg_printf(pbest, MSG_INFO, - "chosen: %s idle instance", rsc_name(rsc_type) - ); - } - req = share_request(pbest); - if (req > shortfall) req = shortfall; - set_request(pbest, req); + set_request(pbest); break; case FETCH_IF_MAJOR_SHORTFALL: - if (log_flags.work_fetch_debug) { - msg_printf(pbest, MSG_INFO, - "chosen: %s major shortfall", rsc_name(rsc_type) - ); - } - req = share_request(pbest); - if (req > shortfall) req = shortfall; - set_request(pbest, req); + case FETCH_IF_PROJECT_STARVED: + set_request(pbest); break; case FETCH_IF_MINOR_SHORTFALL: - if (log_flags.work_fetch_debug) { - msg_printf(pbest, MSG_INFO, - "chosen: %s minor shortfall", rsc_name(rsc_type) - ); - } - work_fetch.set_shortfall_requests(pbest); - break; - case FETCH_IF_PROJECT_STARVED: - if (log_flags.work_fetch_debug) { - msg_printf(pbest, MSG_INFO, - "chosen: %s starved", rsc_name(rsc_type) - ); - } - req = share_request(pbest); - set_request(pbest, req); + // in this case, potentially request work for all resources + // + work_fetch.set_all_requests(pbest); break; } return pbest; } -void WORK_FETCH::set_shortfall_requests(PROJECT* p) { - cpu_work_fetch.set_shortfall_request(p); - if (coproc_cuda && gpus_usable) { - cuda_work_fetch.set_shortfall_request(p); - } - if (coproc_ati && gpus_usable) { - ati_work_fetch.set_shortfall_request(p); - } -} - -void RSC_WORK_FETCH::set_shortfall_request(PROJECT* p) { +// request this project's share of shortfall and instances +// +void RSC_WORK_FETCH::set_request(PROJECT* p) { if (!shortfall) return; RSC_PROJECT_WORK_FETCH& w = project_state(p); if (!w.may_have_work) return; if (w.overworked()) return; - set_request(p, shortfall); + double dcf = p->duration_correction_factor; + if (dcf < 0.02 || dcf > 80.0) { + // if project's DCF is too big or small, + // its completion time estimates are useless; just ask for 1 second + // + req_secs = 1; + } else { + req_secs = shortfall * w.fetchable_share; + } + + // the number of additional instances needed to have our share + // + double x1 = (ninstances * w.fetchable_share) - w.nused_total; + + // our share of the idle instances + // + double x2 = nidle_now * w.fetchable_share; + + req_instances = std::max(x1, x2); +} + +void WORK_FETCH::set_all_requests(PROJECT* p) { + cpu_work_fetch.set_request(p); + if (coproc_cuda && gpus_usable) { + cuda_work_fetch.set_request(p); + } + if (coproc_ati && gpus_usable) { + ati_work_fetch.set_request(p); + } } void WORK_FETCH::set_overall_debts() { @@ -454,7 +466,7 @@ void WORK_FETCH::compute_work_request(PROJECT* p) { // Otherwise we can have a situation where a GPU is idle, // we ask only for GPU work, and the project never has any // - work_fetch.set_shortfall_requests(pbest); + work_fetch.set_all_requests(pbest); return; } @@ -548,27 +560,6 @@ PROJECT* WORK_FETCH::choose_project() { return p; } -double RSC_WORK_FETCH::share_request(PROJECT* p) { - double dcf = p->duration_correction_factor; - if (dcf < 0.02 || dcf > 80.0) { - // if project's DCF is too big or small, - // its completion time estimates are useless; just ask for 1 second - // - return 1; - } else { - // otherwise ask for the project's share - // - RSC_PROJECT_WORK_FETCH& w = project_state(p); - return gstate.work_buf_total()*w.fetchable_share; - } -} - -void RSC_WORK_FETCH::set_request(PROJECT* p, double r) { - RSC_PROJECT_WORK_FETCH& w = project_state(p); - req_secs = r; - req_instances = (int)ceil(w.fetchable_share*nidle_now); -} - void WORK_FETCH::accumulate_inst_sec(ACTIVE_TASK* atp, double dt) { APP_VERSION* avp = atp->result->avp; PROJECT* p = atp->result->project; diff --git a/client/work_fetch.h b/client/work_fetch.h index d1fc675a6a..0e9a9267bb 100644 --- a/client/work_fetch.h +++ b/client/work_fetch.h @@ -62,6 +62,7 @@ struct RSC_PROJECT_WORK_FETCH { // determines how many instances this project deserves bool has_runnable_jobs; double sim_nused; + double nused_total; // sum of instances over all runnable jobs int deadlines_missed; int deadlines_missed_copy; // copy of the above used during schedule_cpus() @@ -185,7 +186,7 @@ struct RSC_WORK_FETCH { // the following specify the work request for this resource // double req_secs; - int req_instances; + double req_instances; // debt accounting double secs_this_debt_interval; @@ -204,9 +205,7 @@ struct RSC_WORK_FETCH { void update_short_term_debts(); void print_state(const char*); void clear_request(); - void set_request(PROJECT*, double); - double share_request(PROJECT*); - void set_shortfall_request(PROJECT*); + void set_request(PROJECT*); bool may_have_work(PROJECT*); RSC_WORK_FETCH() { memset(this, 0, sizeof(*this)); @@ -243,7 +242,7 @@ struct WORK_FETCH { PROJECT*, SCHEDULER_REPLY*, std::vectornew_results ); void set_initial_work_request(); - void set_shortfall_requests(PROJECT*); + void set_all_requests(PROJECT*); void print_state(); void init(); void rr_init(); diff --git a/sched/handle_request.cpp b/sched/handle_request.cpp index 170e8c039a..89234b1fcd 100644 --- a/sched/handle_request.cpp +++ b/sched/handle_request.cpp @@ -336,7 +336,7 @@ int authenticate_user() { ); goto make_new_host; } - + } else { // Here no hostid was given, or the ID was bad. // Look up the user, then create a new host record @@ -480,7 +480,7 @@ got_host: user.update_field(buf); } } - + return 0; } @@ -585,7 +585,7 @@ int send_result_abort() { DB_IN_PROGRESS_RESULT result; std::string result_names; unsigned int i; - + if (g_request->other_results.size() == 0) { return 0; } @@ -595,6 +595,7 @@ int send_result_abort() { for (i=0; iother_results.size(); i++) { OTHER_RESULT& orp=g_request->other_results[i]; orp.abort = true; + // if the host has a result not in the DB, abort it orp.abort_if_not_started = false; orp.reason = ABORT_REASON_NOT_FOUND; if (i > 0) result_names.append(", "); @@ -619,7 +620,7 @@ int send_result_abort() { orp.abort = true; orp.abort_if_not_started = false; orp.reason = ABORT_REASON_WU_CANCELLED; - } else if ( result.assimilate_state == ASSIMILATE_DONE ) { + } else if (result.assimilate_state == ASSIMILATE_DONE) { // if the WU has been assimilated, abort if not started // orp.abort = false; @@ -658,8 +659,8 @@ int send_result_abort() { log_messages.printf(MSG_NORMAL, "[HOST#%d]: Send result_abort for result %s; reason: %s\n", g_reply->host.id, orp.name, reason_str(orp.reason) - ); - // send user message + ); + // send user message char buf[256]; sprintf(buf, "Result %s is no longer usable", orp.name); g_reply->insert_message(buf, "high"); @@ -668,10 +669,10 @@ int send_result_abort() { log_messages.printf(MSG_NORMAL, "[HOST#%d]: Send result_abort_if_unstarted for result %s; reason %d\n", g_reply->host.id, orp.name, orp.reason - ); + ); } } - + return aborts_sent; } @@ -857,11 +858,11 @@ void warn_user_if_core_client_upgrade_scheduled() { remaining /= 3600; if (0 < remaining) { - + char msg[512]; int days = remaining / 24; int hours = remaining % 24; - + sprintf(msg, "Starting in %d days and %d hours, project will require a minimum " "BOINC core client version of %d.%d.0. You are currently using " @@ -1275,7 +1276,7 @@ void process_request(char* code_sign_key) { send_result_abort(); } } - + if (requesting_work()) { if (!send_code_sign_key(code_sign_key)) { ok_to_send_work = false;