From b5124fe729059e955d2440eff2dbee4012247b5e Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 21 Jan 2010 00:14:56 +0000 Subject: [PATCH] - client: brute-force attempt at eliminating domino-effect preemption: if job A is unstarted and EDF, and there's a job B that is later in the list, is started, has the same app version, and has the same arrival time, move A after B. - client: remove the "temp_dcf" mechanism, which had the same goal but didn't work. - client: in computing overall debt for a project, subtract a term that reflects pending work. This should reduce repeated fetches from the same project. - client simulator: tweaks svn path=/trunk/boinc/; revision=20223 --- checkin_notes | 24 ++++++++ client/client_types.h | 2 +- client/cpu_sched.cpp | 74 ++++++++++++++++++++++++- client/rr_sim.cpp | 10 ++-- client/sim.cpp | 101 ++++++++++++++++++++++++++-------- client/sim.h | 7 ++- client/sim_util.cpp | 15 +++-- client/work_fetch.cpp | 46 ++++++++++++++-- client/work_fetch.h | 8 ++- html/user/language_select.php | 29 +++++----- 10 files changed, 259 insertions(+), 57 deletions(-) diff --git a/checkin_notes b/checkin_notes index 1ddd523581..5d7e07baca 100644 --- a/checkin_notes +++ b/checkin_notes @@ -562,3 +562,27 @@ David 19 Jan 2010 client/ sim.cpp rr_sim.cpp + +David 19 Jan 2010 + - client: brute-force attempt at eliminating domino-effect preemption: + if job A is unstarted and EDF, + and there's a job B that is later in the list, + is started, has the same app version, + and has the same arrival time, + move A after B. + - client: remove the "temp_dcf" mechanism, + which had the same goal but didn't work. + - client: in computing overall debt for a project, + subtract a term that reflects pending work. + This should reduce repeated fetches from the same project. 
+ - client simulator: tweaks + + client/ + client_types.h + cpu_sched.cpp + rr_sim.cpp + sim.cpp,h + sim_util.cpp + work_fetch.cpp,h + html/user/ + language_select.php diff --git a/client/client_types.h b/client/client_types.h index 830c1216ee..f34d129f49 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -434,7 +434,7 @@ struct APP_VERSION { double max_working_set_size; // max working set of tasks using this app version. // temp var used in schedule_cpus() - double temp_dcf; + //double temp_dcf; APP_VERSION(){} ~APP_VERSION(){} diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp index e06f2102d1..1653538f68 100644 --- a/client/cpu_sched.cpp +++ b/client/cpu_sched.cpp @@ -42,6 +42,7 @@ #include #include +#include #include "str_util.h" #include "util.h" @@ -59,6 +60,7 @@ #endif using std::vector; +using std::list; #define DEADLINE_CUSHION 0 // try to finish jobs this much in advance of their deadline @@ -817,6 +819,67 @@ static void promote_multi_thread_jobs(vector& runnable_jobs) { } } +// if job A is unstarted and EDF, +// and there's a job later in the list that is started +// and has the same arrival time and app version, +// move A after B +// +static void demote_unstarted_edf(vector& runnable_jobs) { + list x; + RESULT *rp, *rp2; + + // transfer to a list + // + unsigned int i; + for (i=0; i::iterator p = x.end(); p != x.begin(); --p) { + rp = *p; + if (rp->not_started()) continue; + list::iterator q = p; + --q; + while (1) { + rp2 = *q; + if (rp2->not_started() + && (rp2->received_time==rp->received_time) + && (rp2->avp==rp->avp) + && rp2->edf_scheduled + ) { + list::iterator p2 = p; + p2++; + x.insert(p2, rp2); + + if (q == x.begin()) { + x.erase(q); + break; + } + list::iterator q2 = q; + --q; + x.erase(q2); + } else { + if (q == x.begin()) { + break; + } + --q; + } + + } + } + + // transfer back to vector + // + runnable_jobs.clear(); + for (list::iterator p = x.begin(); p!= x.end(); ++p) { + runnable_jobs.push_back(*p); + } +} + // 
return true if r0 is more important to run than r1 // static inline bool more_important(RESULT* r0, RESULT* r1) { @@ -843,7 +906,7 @@ static inline bool more_important(RESULT* r0, RESULT* r1) { if (!unfin0 && unfin1) return false; // favor jobs selected first by schedule_cpus() - // (e.g., because their project has high debt) + // (e.g., because their project has high STD) // if (r0->seqno < r1->seqno) return true; if (r0->seqno > r1->seqno) return false; @@ -1196,6 +1259,15 @@ bool CLIENT_STATE::enforce_schedule() { // append_unfinished_time_slice(runnable_jobs); + // Move each unstarted EDF job after a later, started job + // that has the same app version and arrival time. + // This is a (crude) mechanism to avoid the situation + // where there's a set of EDF unstarted jobs, + // each one runs for a little and leaves EDF + // and is preempted by the next. + // + demote_unstarted_edf(runnable_jobs); + // sort to-run list by decreasing importance // std::sort( diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp index 9b3804132d..8284bcde37 100644 --- a/client/rr_sim.cpp +++ b/client/rr_sim.cpp @@ -221,6 +221,7 @@ void CLIENT_STATE::print_deadline_misses() { } } +#if 0 // compute a per-app-version "temporary DCF" based on the elapsed time // and fraction done of running jobs // @@ -238,6 +239,7 @@ void compute_temp_dcf() { } } } +#endif void CLIENT_STATE::rr_simulation() { PROJECT* p, *pbest; @@ -248,7 +250,7 @@ void CLIENT_STATE::rr_simulation() { double ar = available_ram(); work_fetch.rr_init(); - compute_temp_dcf(); + //compute_temp_dcf(); if (log_flags.rr_simulation) { msg_printf(0, MSG_INFO, @@ -338,7 +340,8 @@ void CLIENT_STATE::rr_simulation() { for (i=0; irrsim_finish_delay = rp->avp->temp_dcf*rp->rrsim_flops_left/rp->rrsim_flops; + //rp->rrsim_finish_delay = rp->avp->temp_dcf*rp->rrsim_flops_left/rp->rrsim_flops; + rp->rrsim_finish_delay = rp->rrsim_flops_left/rp->rrsim_flops; if (!rpbest || rp->rrsim_finish_delay < rpbest->rrsim_finish_delay) { rpbest = rp; } 
@@ -348,10 +351,9 @@ void CLIENT_STATE::rr_simulation() { if (log_flags.rr_simulation) { msg_printf(pbest, MSG_INFO, - "[rr_sim] %.2f: %s finishes after %.2f (%f * %.2fG/%.2fG)", + "[rr_sim] %.2f: %s finishes after %.2f (%.2fG/%.2fG)", sim_now - now, rpbest->name, rpbest->rrsim_finish_delay, - rpbest->avp->temp_dcf, rpbest->rrsim_flops_left/1e9, rpbest->rrsim_flops/1e9 ); } diff --git a/client/sim.cpp b/client/sim.cpp index 9c12243516..e2f673bb3f 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -189,10 +189,13 @@ void CLIENT_STATE::get_workload(vector& ip_results) { init_ip_results(work_buf_min(), ncpus, ip_results); } +// simulate trying to do an RPC +// return false if we didn't actually do one +// bool CLIENT_STATE::simulate_rpc(PROJECT* _p) { char buf[256]; SIM_PROJECT* p = (SIM_PROJECT*) _p; - static double last_time=0; + static double last_time=-1e9; vector ip_results; int infeasible_count = 0; @@ -206,11 +209,13 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* _p) { } last_time = now; - sprintf(buf, "RPC to %s; asking for %f
", - p->project_name, cpu_work_fetch.req_secs + sprintf(buf, "RPC to %s; asking for %f/%.2f
", + p->project_name, cpu_work_fetch.req_secs, cpu_work_fetch.req_instances ); html_msg += buf; + msg_printf(0, MSG_INFO, buf); + handle_completed_results(); if (server_uses_workload) { @@ -219,7 +224,8 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* _p) { bool sent_something = false; double work_left = cpu_work_fetch.req_secs; - while (work_left > 0) { + double instances_needed = cpu_work_fetch.req_instances; + while (work_left > 0 || instances_needed>0) { RESULT* rp = new RESULT; WORKUNIT* wup = new WORKUNIT; make_job(p, wup, rp); @@ -246,17 +252,18 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* _p) { ); html_msg += buf; work_left -= p->duration_correction_factor*wup->rsc_fpops_est/host_info.p_fpops; + instances_needed -= 1; } if (cpu_work_fetch.req_secs > 0 && !sent_something) { p->backoff(); - return false; - } else { - p->nrpc_failures = 0; + } + p->nrpc_failures = 0; + if (sent_something) { request_schedule_cpus("simulate_rpc"); request_work_fetch("simulate_rpc"); - return true; } + return true; } void SIM_PROJECT::backoff() { @@ -269,23 +276,62 @@ void SIM_PROJECT::backoff() { bool CLIENT_STATE::scheduler_rpc_poll() { PROJECT *p; + bool action = false; + static double last_time=0; + static double last_work_fetch_time = 0; + double elapsed_time; + + // check only every 5 sec + // + if (now - last_time < SCHEDULER_RPC_POLL_PERIOD) { + msg_printf(NULL, MSG_INFO, "RPC poll: not time %f - %f < %f", + now, last_time, SCHEDULER_RPC_POLL_PERIOD + ); + return false; + } + last_time = now; msg_printf(NULL, MSG_INFO, "RPC poll start"); - p = next_project_sched_rpc_pending(); - if (p) { - return simulate_rpc(p); - } + while (1) { + p = next_project_sched_rpc_pending(); + if (p) { + work_fetch.compute_work_request(p); + action = simulate_rpc(p); + break; + } - p = find_project_with_overdue_results(); - if (p) { - return simulate_rpc(p); + p = find_project_with_overdue_results(); + if (p) { + work_fetch.compute_work_request(p); + action = simulate_rpc(p); + break; + } + + // 
should we check work fetch? Do this at most once/minute + + if (must_check_work_fetch) { + last_work_fetch_time = 0; + } + elapsed_time = now - last_work_fetch_time; + if (elapsed_time < WORK_FETCH_PERIOD) { + return false; + } + must_check_work_fetch = false; + last_work_fetch_time = now; + + p = work_fetch.choose_project(); + if (p) { + action = simulate_rpc(p); + break; + } + break; } - p = work_fetch.choose_project(); - if (p) { - return simulate_rpc(p); + if (action) { + msg_printf(p, MSG_INFO, "RPC poll: did an RPC"); + } else { + msg_printf(0, MSG_INFO, "RPC poll: didn't do an RPC"); } - msg_printf(NULL, MSG_INFO, "RPC poll: nothing to do"); - return false; + return action; } bool ACTIVE_TASK_SET::poll() { @@ -303,9 +349,9 @@ bool ACTIVE_TASK_SET::poll() { for (i=0; iidle = true; - sprintf(buf, "%s STD: %f min RPC
", + sprintf(buf, "%s STD: %f LTD %f
", p->project_name, p->cpu_pwf.short_term_debt, - time_to_string(p->min_rpc_time) + p->pwf.overall_debt ); gstate.html_msg += buf; } @@ -606,11 +652,16 @@ void CLIENT_STATE::html_end(bool show_next) { void CLIENT_STATE::simulate() { bool action; - now = 0; + double start = START_TIME; + now = start; html_start(false); + msg_printf(0, MSG_INFO, + "starting simultion. delta %f duration %f", delta, duration + ); while (1) { running = host_info.available.sample(now); while (1) { + msg_printf(0, MSG_INFO, "polling"); action = active_tasks.poll(); if (running) { action |= handle_finished_apps(); @@ -618,9 +669,11 @@ void CLIENT_STATE::simulate() { action |= enforce_schedule(); action |= scheduler_rpc_poll(); } + msg_printf(0, MSG_INFO, action?"did action":"did no action"); if (!action) break; } now += delta; + msg_printf(0, MSG_INFO, "took time step"); for (unsigned int i=0; itask_state() == PROCESS_EXECUTING) { @@ -628,7 +681,7 @@ void CLIENT_STATE::simulate() { } } html_rec(); - if (now > duration) break; + if (now > start + duration) break; } html_end(false); } diff --git a/client/sim.h b/client/sim.h index d968f75f92..53b00ca010 100644 --- a/client/sim.h +++ b/client/sim.h @@ -90,7 +90,7 @@ public: double lambda; int parse(XML_PARSER&, char* end_tag); bool sample(double); - void init(); + void init(double); RANDOM_PROCESS(); }; @@ -313,6 +313,8 @@ extern bool dual_dcf; extern bool work_fetch_old; extern bool gpus_usable; +#define SCHEDULER_RPC_POLL_PERIOD 5.0 + #define WORK_FETCH_PERIOD 60 #define CPU_SCHED_ENFORCE_PERIOD 60 @@ -325,3 +327,6 @@ extern bool gpus_usable; #define MAX_STD (86400) // maximum short-term debt + +#define START_TIME 946684800 + // Jan 1 2000 diff --git a/client/sim_util.cpp b/client/sim_util.cpp index 9f86f61c77..7facaf93c3 100644 --- a/client/sim_util.cpp +++ b/client/sim_util.cpp @@ -395,6 +395,10 @@ bool RANDOM_PROCESS::sample(double t) { value = true; } } + msg_printf(0, MSG_INFO, + "value: %d lambda: %f t %f time_left %f", + value, 
lambda, t, time_left + ); return value; } @@ -402,7 +406,8 @@ RANDOM_PROCESS::RANDOM_PROCESS() { frac = 1; } -void RANDOM_PROCESS::init() { +void RANDOM_PROCESS::init(double st) { + last_time = st; value = true; time_left = exponential(lambda); off_lambda = lambda/frac - lambda; @@ -502,23 +507,23 @@ int SIM_HOST::parse(XML_PARSER& xp) { bool is_tag; int retval; - p_ncpus = 1; connection_interval = 0; + p_ncpus = 1; while(!xp.get(tag, sizeof(tag), is_tag)) { if (!is_tag) return ERR_XML_PARSE; if (!strcmp(tag, "/host")) return 0; else if (xp.parse_double(tag, "p_fpops", p_fpops)) continue; else if (xp.parse_double(tag, "m_nbytes", m_nbytes)) continue; - else if (xp.parse_double(tag, "connection_interval", connection_interval)) continue; else if (xp.parse_int(tag, "p_ncpus", p_ncpus)) continue; + else if (xp.parse_double(tag, "connection_interval", connection_interval)) continue; else if (!strcmp(tag, "available")) { retval = available.parse(xp, "/available"); if (retval) return retval; - available.init(); + available.init(START_TIME); } else if (!strcmp(tag, "idle")) { retval = idle.parse(xp, "/idle"); if (retval) return retval; - idle.init(); + idle.init(START_TIME); } else { printf("unrecognized: %s\n", tag); return ERR_XML_PARSE; diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp index 2b0b6781ee..859efa848a 100644 --- a/client/work_fetch.cpp +++ b/client/work_fetch.cpp @@ -513,12 +513,13 @@ void RSC_WORK_FETCH::update_long_term_debts() { double offset; double delta_limit = secs_this_debt_interval; if (max_debt > -2*delta_limit) { + if (fabs(max_debt) < 1e-6) max_debt = 0; offset = max_debt?-max_debt:0; // avoid -0 } else { offset = 2*delta_limit; } if (log_flags.debt_debug) { - msg_printf(0, MSG_INFO, "[debt] %s LTD: adding offset %.2f", + msg_printf(0, MSG_INFO, "[debt] %s LTD: adding offset %f", rsc_name(rsc_type), offset ); } @@ -658,15 +659,43 @@ void WORK_FETCH::set_all_requests(PROJECT* p) { } } +// Compute an "overall long-term debt" for each 
project. +// This is a sum of per-resource terms, scaled by the relative speed of the resource. +// The term for a resource is its LTD plus an estimate of queued work. +// void WORK_FETCH::set_overall_debts() { - for (unsigned i=0; ipwf.overall_debt = p->cpu_pwf.long_term_debt; + unsigned int i; + PROJECT* p; + RESULT* rp; + APP_VERSION* avp; + + for (i=0; icpu_pwf.queue_est = 0; + p->cuda_pwf.queue_est = 0; + p->ati_pwf.queue_est = 0; + } + for (i=0; iproject; + if (!rp->nearly_runnable()) continue; + if (p->non_cpu_intensive) continue; + double dt = rp->estimated_time_remaining(false); + avp = rp->avp; + p->cpu_pwf.queue_est += dt*avp->avg_ncpus; + p->cuda_pwf.queue_est += dt*avp->ncudas; + p->ati_pwf.queue_est += dt*avp->natis; + } + for (i=0; ipwf.overall_debt = p->cpu_pwf.long_term_debt - p->cpu_pwf.queue_est/gstate.ncpus; if (coproc_cuda) { - p->pwf.overall_debt += cuda_work_fetch.speed*p->cuda_pwf.long_term_debt; + p->pwf.overall_debt += cuda_work_fetch.relative_speed* + (p->cuda_pwf.long_term_debt - p->cuda_pwf.queue_est/coproc_cuda->count); } if (coproc_ati) { - p->pwf.overall_debt += ati_work_fetch.speed*p->ati_pwf.long_term_debt; + p->pwf.overall_debt += ati_work_fetch.relative_speed* + (p->ati_pwf.long_term_debt - p->ati_pwf.queue_est/coproc_ati->count); } } } @@ -769,6 +798,10 @@ PROJECT* WORK_FETCH::non_cpu_intensive_project_needing_work() { PROJECT* WORK_FETCH::choose_project() { PROJECT* p = 0; + if (log_flags.work_fetch_debug) { + msg_printf(0, MSG_INFO, "[wfd]: work fetch start"); + } + p = non_cpu_intensive_project_needing_work(); if (p) return p; @@ -893,6 +926,7 @@ void WORK_FETCH::compute_shares() { if (!p->pwf.can_fetch_work) continue; if (p->cpu_pwf.may_have_work) { p->cpu_pwf.fetchable_share = p->resource_share/cpu_work_fetch.total_fetchable_share; + msg_printf(p, MSG_INFO, "FS: %f = %f/%f\n", p->cpu_pwf.fetchable_share, p->resource_share, cpu_work_fetch.total_fetchable_share); } if (coproc_cuda && p->cuda_pwf.may_have_work) { 
p->cuda_pwf.fetchable_share = p->resource_share/cuda_work_fetch.total_fetchable_share; diff --git a/client/work_fetch.h b/client/work_fetch.h index 0e9a9267bb..df2c522723 100644 --- a/client/work_fetch.h +++ b/client/work_fetch.h @@ -45,10 +45,14 @@ struct RSC_PROJECT_WORK_FETCH { // the following used by debt accounting double anticipated_debt; + // short-term debt, adjusted by scheduled jobs double secs_this_debt_interval; inline void reset_debt_accounting() { secs_this_debt_interval = 0; } + double queue_est; + // an estimate of instance-secs of queued work; + // a temp used in computing overall debts // the following are used by rr_simulation() // @@ -157,7 +161,7 @@ struct BUSY_TIME_ESTIMATOR { struct RSC_WORK_FETCH { int rsc_type; int ninstances; - double speed; // total FLOPS relative to CPU total FLOPS + double relative_speed; // total FLOPS relative to CPU total FLOPS // the following used/set by rr_simulation(): // @@ -180,7 +184,7 @@ struct RSC_WORK_FETCH { void init(int t, int n, double sp) { rsc_type = t; ninstances = n; - speed = sp; + relative_speed = sp; busy_time_estimator.init(n); } // the following specify the work request for this resource diff --git a/html/user/language_select.php b/html/user/language_select.php index b151dfdc47..6a1197e8ab 100644 --- a/html/user/language_select.php +++ b/html/user/language_select.php @@ -41,24 +41,27 @@ if ($set_lang){ page_head(tra("Language selection")); echo "

", - tra("This web site is available in several languages. ". - "The currently selected language is: %1 (%2).", - "".tra("LANG_NAME_INTERNATIONAL")."", - tra("LANG_NAME_NATIVE")), + tra( + "This web site is available in several languages. The currently selected language is: %1 (%2).", + "".tra("LANG_NAME_INTERNATIONAL")."", + tra("LANG_NAME_NATIVE") + ), "

", "

", - tra("Normally the choice of language ". - "is determined by your browser's language setting, ". - "which is: %1. ". - "You can change this setting using:", "$prefs"), + tra( + "Normally the choice of language is determined by your browser's language setting, which is: %1. You can change this setting using:", + "$prefs" + ), "

    ", - "
  • ", tra("Firefox: Tools/Options/General"), - "
  • ", tra("Microsoft IE: Tools/Internet Options/Languages"), + "
  • ", + tra("Firefox: Tools/Options/General"), + "
  • ", + tra("Microsoft IE: Tools/Internet Options/Languages"), "
", "

", - tra("Or you can select a language by clicking on one of the links. ". - "This will send your browser a cookie; ". - "make sure your browser accepts cookies from our domain."), + tra( + "Or you can select a language by clicking on one of the links. This will send your browser a cookie; make sure your browser accepts cookies from our domain." + ), "

" ;