From 330a25893f8f857663b8a7d71ebd8c0a922e08ed Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 4 Apr 2013 10:25:56 -0700 Subject: [PATCH] - client emulator: parse max_concurrent in client_state.xml. This gives you a way to simulate the effects of app_config.xml - client: piggyback requests for resources even if we're backed off from them - client: change resource backoff logic Old: if we requested work and didn't get any, back off from resources for which we requested work New: for each resource type T: if we requested work for T and didn't get any, back off from T Also, don't back off if we're already backed off (i.e. if this is a piggyback request) Also, only back off if the RPC was due to an automatic and potentially rapid source (namely: work fetch, result report, trickle up) - client: fix small work fetch bug --- client/client_types.cpp | 4 ++ client/client_types.h | 1 + client/sim.cpp | 7 +++- client/work_fetch.cpp | 85 +++++++++++++++++++++++------------------ client/work_fetch.h | 1 + 5 files changed, 60 insertions(+), 38 deletions(-) diff --git a/client/client_types.cpp b/client/client_types.cpp index 7ed53bd958..4014422be1 100644 --- a/client/client_types.cpp +++ b/client/client_types.cpp @@ -143,6 +143,10 @@ int APP::parse(XML_PARSER& xp) { fpops.parse(xp, "/fpops"); continue; } + if (xp.parse_int("max_concurrent", max_concurrent)) { + if (max_concurrent) have_max_concurrent = true; + continue; + } if (xp.match_tag("checkpoint_period")) { checkpoint_period.parse(xp, "/checkpoint_period"); continue; diff --git a/client/client_types.h b/client/client_types.h index d889ac4fa0..c507972b3a 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -247,6 +247,7 @@ struct APP { int max_concurrent; // Limit on # of concurrent jobs of this app; 0 if none // Specified in app_config.xml + // Can also specify in client_state.xml (for client emulator) int n_concurrent; // temp during job scheduling, to enforce max_concurrent int non_excluded_instances[MAX_RSC]; 
diff --git a/client/sim.cpp b/client/sim.cpp index 126b5eb81e..6b1e1b8de6 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -1168,12 +1168,17 @@ void show_app(APP* app) { fprintf(summary_file, " app %s\n" " job params: fpops_est %.0fG fpops mean %.0fG std_dev %.0fG\n" - " latency %.2f weight %.2f\n", + " latency %.2f weight %.2f", app->name, app->fpops_est/1e9, app->fpops.mean/1e9, app->fpops.std_dev/1e9, app->latency_bound, app->weight ); + if (app->max_concurrent) { + fprintf(summary_file, " max_concurrent %d\n", app->max_concurrent); + } else { + fprintf(summary_file, "\n"); + } for (unsigned int i=0; iapp != app) continue; diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp index fcc6338cc9..4c8f72803c 100644 --- a/client/work_fetch.cpp +++ b/client/work_fetch.cpp @@ -41,7 +41,7 @@ using std::vector; RSC_WORK_FETCH rsc_work_fetch[MAX_RSC]; WORK_FETCH work_fetch; -inline bool dont_fetch(PROJECT* p, int rsc_type) { +static inline bool dont_fetch(PROJECT* p, int rsc_type) { if (p->no_rsc_pref[rsc_type]) return true; if (p->no_rsc_config[rsc_type]) return true; if (p->no_rsc_apps[rsc_type]) return true; @@ -187,7 +187,6 @@ static bool wacky_dcf(PROJECT* p) { // don't request anything if project is backed off. 
// void RSC_WORK_FETCH::set_request(PROJECT* p) { - if (dont_fetch(p, rsc_type)) return; // if backup project, fetch 1 job per idle instance // @@ -202,8 +201,6 @@ void RSC_WORK_FETCH::set_request(PROJECT* p) { return; } RSC_PROJECT_WORK_FETCH& w = project_state(p); - if (!w.may_have_work) return; - if (w.anon_skip) return; double non_excl_inst = ninstances - w.ncoprocs_excluded; if (shortfall) { if (wacky_dcf(p)) { @@ -476,7 +473,6 @@ void WORK_FETCH::piggyback_work_request(PROJECT* p) { bool check_higher_priority_projects = true; if (p->sched_rpc_pending && config.fetch_on_update) { check_higher_priority_projects = false; - return; } setup(); @@ -508,7 +504,7 @@ void WORK_FETCH::piggyback_work_request(PROJECT* p) { DEBUG(msg_printf(p, MSG_INFO, "piggyback: %s can't fetch work", p2->project_name);) continue; } - if (rwf.can_fetch(p2)) { + if (rwf.can_fetch(p2) && !rwf.backed_off(p2)) { DEBUG(msg_printf(p, MSG_INFO, "piggyback: better proj %s", p2->project_name);) break; } @@ -544,6 +540,16 @@ static bool higher_priority(PROJECT *p1, PROJECT *p2) { return (p1->sched_priority > p2->sched_priority); } +// check resource-level backoff +// +bool RSC_WORK_FETCH::backed_off(PROJECT* p) { + if (project_state(p).backoff_time > gstate.now) { + DEBUG(msg_printf(p, MSG_INFO, "skip: backoff");) + return true; + } + return false; +} + // a variety of checks for whether we should ask this project // for work of this type // @@ -556,12 +562,7 @@ bool RSC_WORK_FETCH::can_fetch(PROJECT *p) { return false; } - // check resource-level backoff - // - if (project_state(p).backoff_time > gstate.now) { - DEBUG(msg_printf(p, MSG_INFO, "skip: backoff");) - return false; - } + RSC_PROJECT_WORK_FETCH& rpwf = project_state(p); // if project has zero resource share, // only fetch work if a device is idle @@ -585,16 +586,15 @@ bool RSC_WORK_FETCH::can_fetch(PROJECT *p) { // TODO: THIS IS FAIRLY CRUDE. Making it smarter would require // computing shortfall etc. 
on a per-project basis // - int nexcl = p->rsc_pwf[rsc_type].ncoprocs_excluded; + int nexcl = rpwf.ncoprocs_excluded; if (rsc_type && nexcl) { int n_not_excluded = ninstances - nexcl; - if (p->rsc_pwf[rsc_type].queue_est > (gstate.work_buf_min() * n_not_excluded)/ninstances) { + if (rpwf.queue_est > (gstate.work_buf_min() * n_not_excluded)/ninstances) { DEBUG(msg_printf(p, MSG_INFO, "skip: too much work");) return false; } } - RSC_PROJECT_WORK_FETCH& rpwf = project_state(p); if (rpwf.anon_skip) { DEBUG(msg_printf(p, MSG_INFO, "skip: anon");) return false; @@ -690,7 +690,7 @@ PROJECT* WORK_FETCH::choose_project() { for (int i=0; i new_results ) { - bool got_rsc[MAX_RSC]; + bool got_work[MAX_RSC]; + bool requested_work[MAX_RSC]; for (int i=0; i 0); + } + for (unsigned int i=0; iavp->gpu_usage.rsc_type] = true; } - // if didn't get any jobs, back off on requested resource types - // - if (!new_results.size()) { - // but not if RPC was requested by project + for (int i=0; isched_rpc_pending != RPC_REASON_PROJECT_REQ) { - for (int i=0; irsc_pwf[i].backoff_time < gstate.now) { + switch (p->sched_rpc_pending) { + case RPC_REASON_RESULTS_DUE: + case RPC_REASON_NEED_WORK: + case RPC_REASON_TRICKLE_UP: p->rsc_pwf[i].resource_backoff(p, rsc_name(i)); } } } - return; - } - - // if we did get jobs, clear backoff on resource types - // - for (unsigned int i=0; iavp->gpu_usage.rsc_type] = true; - } - for (int i=0; irsc_pwf[i].clear_backoff(); + // if we did get jobs, clear backoff + // + if (got_work[i]) { + p->rsc_pwf[i].clear_backoff(); + } } } diff --git a/client/work_fetch.h b/client/work_fetch.h index 19840f257a..a8ba32e327 100644 --- a/client/work_fetch.h +++ b/client/work_fetch.h @@ -253,6 +253,7 @@ struct RSC_WORK_FETCH { void set_request_excluded(PROJECT*); bool may_have_work(PROJECT*); bool can_fetch(PROJECT*); + bool backed_off(PROJECT*); bool uses_starved_excluded_instances(PROJECT*); RSC_WORK_FETCH() { rsc_type = 0;