diff --git a/client/cs_scheduler.cpp b/client/cs_scheduler.cpp index f29ac5b69d..de2772fed5 100644 --- a/client/cs_scheduler.cpp +++ b/client/cs_scheduler.cpp @@ -491,7 +491,7 @@ bool CLIENT_STATE::scheduler_rpc_poll() { && !(config.fetch_minimal_work && had_or_requested_work) ) { - p = work_fetch.choose_project(true, NULL); + p = work_fetch.choose_project(); if (p) { if (actively_uploading(p)) { if (log_flags.work_fetch_debug) { diff --git a/client/sim.cpp b/client/sim.cpp index 3cd7921c79..66729eb75a 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -525,7 +525,7 @@ bool CLIENT_STATE::scheduler_rpc_poll() { p = find_project_with_overdue_results(false); if (p) { - //printf("doing RPC to %s to report results\n", p->project_name); + msg_printf(p, MSG_INFO, "doing RPC to report results"); work_fetch.piggyback_work_request(p); action = simulate_rpc(p); break; @@ -546,6 +546,7 @@ bool CLIENT_STATE::scheduler_rpc_poll() { p = work_fetch.choose_project(true, NULL); if (p) { + msg_printf(p, MSG_INFO, "doing RPC to get work"); action = simulate_rpc(p); break; } @@ -870,7 +871,7 @@ void show_resource(int rsc_type) { found = true; fprintf(html_out, "\n" - "%s\n", + "%s\n", rsc_type?"":"" ); } @@ -882,7 +883,7 @@ void show_resource(int rsc_type) { fprintf(html_out, "%s\n", ninst, colors[p->index%NCOLORS], - rp->rr_sim_misses_deadline?"*":"", + rp->edf_scheduled?"*":"", rp->name, rp->sim_flops_left/1e9, buf @@ -1006,8 +1007,17 @@ void set_initial_rec() { } } +static bool compare_names(PROJECT* p1, PROJECT* p2) { + return (strcmp(p1->project_name, p2->project_name) < 0); +} + void write_recs() { fprintf(rec_file, "%f ", gstate.now); + std::sort( + gstate.projects.begin(), + gstate.projects.end(), + compare_names + ); for (unsigned int i=0; ipwf.rec); diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp index f5ce61b6cb..c0835dfde9 100644 --- a/client/work_fetch.cpp +++ b/client/work_fetch.cpp @@ -33,6 +33,9 @@ #include "work_fetch.h" +//#define DEBUG(x) x +#define DEBUG(X) + using std::vector; RSC_WORK_FETCH rsc_work_fetch[MAX_RSC]; @@ -180,142 +183,6 @@ static bool wacky_dcf(PROJECT* p) { return (dcf < 0.02 || dcf > 80.0); } -// If this resource is below min buffer level, -// return the highest-priority project that may have jobs for it. -// -// It the resource has instanced starved because of exclusions, -// return the highest-priority project that may have jobs -// and doesn't exclude those instances. -// -// Only choose a project if the buffer is below min level; -// if strict_hyst is true, relax this to max level -// -// If backoff_exempt_project is non-NULL, -// don't enforce resource backoffs for that project; -// this is for when we're going to do a scheduler RPC anyway -// and we're deciding whether to piggyback a work request -// -PROJECT* RSC_WORK_FETCH::choose_project_hyst( - bool strict_hyst, - PROJECT* backoff_exempt_project -) { - PROJECT* pbest = NULL; - bool buffer_low = true; - if (strict_hyst) { - if (saturated_time > gstate.work_buf_min()) buffer_low = false; - } else { - if (saturated_time > gstate.work_buf_total()) buffer_low = false; - } - - if (log_flags.work_fetch_debug) { - msg_printf(0, MSG_INFO, - "[work_fetch] choose_project() for %s: buffer_low: %s; sim_excluded_instances %d\n", - rsc_name(rsc_type), buffer_low?"yes":"no", sim_excluded_instances - ); - } - - if (!buffer_low && !sim_excluded_instances) return NULL; - - for (unsigned i=0; ipwf.cant_fetch_work_reason) { - //msg_printf(p, MSG_INFO, "skip: cfwr %d", p->pwf.cant_fetch_work_reason); - continue; - } - - // see whether work fetch for this resource is banned - // by prefs, config, project, or acct mgr - // - if (dont_fetch(p, rsc_type)) { - //msg_printf(p, MSG_INFO, "skip: dont_fetch"); - continue; - } - - // check backoff - // - if (p != backoff_exempt_project) { - if (project_state(p).backoff_time > gstate.now) { - //msg_printf(p, MSG_INFO, "skip: backoff"); - continue; - } - } - - // if project has zero resource share, - // only fetch work if a device is idle - // - if (p->resource_share == 0 && nidle_now == 0) { - //msg_printf(p, MSG_INFO, "skip: zero share"); - continue; - } - - // if project has excluded GPUs of this type, - // we need to avoid fetching work just because there's an idle instance - // or a shortfall; - // fetching work might not alleviate either of these, - // and we'd end up fetching unbounded work. - // At the same time, we want to respect work buf params if possible. - // - // Current policy: - // don't fetch work if remaining time of this project's jobs - // exceeds work_buf_min * (#usable instances / #instances) - // - // TODO: THIS IS FAIRLY CRUDE. Making it smarter would require - // computing shortfall etc. on a per-project basis - // - int nexcl = p->rsc_pwf[rsc_type].ncoprocs_excluded; - if (rsc_type && nexcl) { - int n_not_excluded = ninstances - nexcl; - if (p->rsc_pwf[rsc_type].queue_est > (gstate.work_buf_min() * n_not_excluded)/ninstances) { - //msg_printf(p, MSG_INFO, "skip: too much work"); - continue; - } - } - - RSC_PROJECT_WORK_FETCH& rpwf = project_state(p); - if (rpwf.anon_skip) { - //msg_printf(p, MSG_INFO, "skip: anon"); - continue; - } - - // if we're sending work only because of exclusion starvation, - // make sure this project can use the starved instances - // - if (!buffer_low) { - if ((sim_excluded_instances & rpwf.non_excluded_instances) == 0) { - //msg_printf(p, MSG_INFO, "skip: excl"); - continue; - } - } - - if (pbest) { - if (pbest->sched_priority > p->sched_priority) { - //msg_printf(p, MSG_INFO, "skip: prio"); - continue; - } - } - pbest = p; - } - if (!pbest) { - if (log_flags.work_fetch_debug) { - msg_printf(0, MSG_INFO, - "[work_fetch] no eligible project for %s", - rsc_name(rsc_type) - ); - } - return NULL; - } - work_fetch.clear_request(); - if (buffer_low) { - work_fetch.set_all_requests_hyst(pbest, rsc_type); - } else { - set_request_excluded(pbest); - } - return pbest; -} - // request this project's share of shortfall and instances. // don't request anything if project is backed off. // @@ -483,6 +350,7 @@ void WORK_FETCH::rr_init() { } } +#if 0 // if the given project is highest-priority among the projects // eligible for the resource, set request fields // @@ -546,6 +414,7 @@ void WORK_FETCH::set_all_requests(PROJECT* p) { } } } +#endif void WORK_FETCH::print_state() { msg_printf(0, MSG_INFO, "[work_fetch] ------- start work fetch state -------"); @@ -590,7 +459,7 @@ void WORK_FETCH::clear_request() { // decide if we should piggy-back a work fetch request. // void WORK_FETCH::piggyback_work_request(PROJECT* p) { - clear_request(); + DEBUG(msg_printf(p, MSG_INFO, "piggyback_work_request()");) if (config.fetch_minimal_work && gstate.had_or_requested_work) return; if (p->dont_request_more_work) return; if (p->non_cpu_intensive) { @@ -601,29 +470,53 @@ void WORK_FETCH::piggyback_work_request(PROJECT* p) { } // if project was updated from manager and config says so, - // always fetch work if needed + // fetch work for a resource even if there are higher-prio projects + // able to fetch it // + bool check_higher_priority_projects = true; if (p->sched_rpc_pending && config.fetch_on_update) { - set_all_requests_hyst(p, -1); + check_higher_priority_projects = false; return; } - compute_cant_fetch_work_reason(); - PROJECT* bestp = choose_project(false, p); - if (p != bestp) { - if (p->pwf.cant_fetch_work_reason == 0) { - if (bestp) { - p->pwf.cant_fetch_work_reason = CANT_FETCH_WORK_NOT_HIGHEST_PRIORITY; - if (log_flags.work_fetch_debug) { - msg_printf(0, MSG_INFO, - "[work_fetch] not piggybacking work req: %s has higher priority", - bestp->get_project_name() - ); - } - } else { - p->pwf.cant_fetch_work_reason = CANT_FETCH_WORK_DONT_NEED; - } + + setup(); + + // For each resource, scan projects in decreasing priority, + // seeing if there's one that's higher-priority than this + // able to fetch work for the resource. + // If not, and the resource needs topping off, do so + // + for (int i=0; iproject_name);) + break; + } + } + if (p != p2) continue; + } + DEBUG(msg_printf(p, MSG_INFO, "piggyback: requesting %s", rsc_name(i));) + if (buffer_low) { + rwf.set_request(p); + } else { + rwf.set_request_excluded(p); } - clear_request(); } } @@ -643,13 +536,117 @@ PROJECT* WORK_FETCH::non_cpu_intensive_project_needing_work() { return 0; } -// choose a project to fetch work from, -// and set the request fields of resource objects. +static bool higher_priority(PROJECT *p1, PROJECT *p2) { + return (p1->sched_priority > p2->sched_priority); +} + +// a variety of checks for whether we should ask this project +// for work of this type // -PROJECT* WORK_FETCH::choose_project( - bool strict_hyst, - PROJECT* backoff_exempt_project -) { +bool RSC_WORK_FETCH::can_fetch(PROJECT *p) { + // see whether work fetch for this resource is banned + // by prefs, config, project, or acct mgr + // + if (dont_fetch(p, rsc_type)) { + DEBUG(msg_printf(p, MSG_INFO, "skip: dont_fetch");) + return false; + } + + // check backoff + // + if (project_state(p).backoff_time > gstate.now) { + DEBUG(msg_printf(p, MSG_INFO, "skip: backoff");) + return false; + } + + // if project has zero resource share, + // only fetch work if a device is idle + // + if (p->resource_share == 0 && nidle_now == 0) { + DEBUG(msg_printf(p, MSG_INFO, "skip: zero share");) + return false; + } + + // if project has excluded GPUs of this type, + // we need to avoid fetching work just because there's an idle instance + // or a shortfall; + // fetching work might not alleviate either of these, + // and we'd end up fetching unbounded work. + // At the same time, we want to respect work buf params if possible. + // + // Current policy: + // don't fetch work if remaining time of this project's jobs + // exceeds work_buf_min * (#usable instances / #instances) + // + // TODO: THIS IS FAIRLY CRUDE. Making it smarter would require + // computing shortfall etc. on a per-project basis + // + int nexcl = p->rsc_pwf[rsc_type].ncoprocs_excluded; + if (rsc_type && nexcl) { + int n_not_excluded = ninstances - nexcl; + if (p->rsc_pwf[rsc_type].queue_est > (gstate.work_buf_min() * n_not_excluded)/ninstances) { + DEBUG(msg_printf(p, MSG_INFO, "skip: too much work");) + return false; + } + } + + RSC_PROJECT_WORK_FETCH& rpwf = project_state(p); + if (rpwf.anon_skip) { + DEBUG(msg_printf(p, MSG_INFO, "skip: anon");) + return false; + } + return true; +} + +// return true if there is exclusion starvation +// and this project can use the starved instances +// +bool RSC_WORK_FETCH::uses_starved_excluded_instances(PROJECT* p) { + RSC_PROJECT_WORK_FETCH& rpwf = project_state(p); + if (!sim_excluded_instances) return false; + if ((sim_excluded_instances & rpwf.non_excluded_instances) == 0) { + DEBUG(msg_printf(p, MSG_INFO, "skip: excl");) + return false; + } + return true; +} + +// setup for choose_project() and piggyback() +// +void WORK_FETCH::setup() { + gstate.compute_nuploading_results(); + + rr_simulation(); + compute_shares(); + project_priority_init(true); + clear_request(); + + // Decrement the priority of projects that have work queued. + // Specifically, subtract + // (FLOPs queued for P)/(FLOPs of max queue) + // which will generally be between 0 and 1. + // This is a little arbitrary but I can't think of anything better. + // + double max_queued_flops = gstate.work_buf_total()*total_peak_flops(); + for (unsigned int i=0; iproject; + p->sched_priority -= rp->estimated_flops_remaining()/max_queued_flops; + } + std::sort( + gstate.projects.begin(), + gstate.projects.end(), + higher_priority + ); +} + +// Choose a project to fetch work from, +// and set the request fields of resource objects. +// If "strict_hyst" is set, require that some resource be below min buf; +// otherwise require below max buf +// (or +// +PROJECT* WORK_FETCH::choose_project() { PROJECT* p; if (log_flags.work_fetch_debug) { @@ -659,34 +656,86 @@ PROJECT* WORK_FETCH::choose_project( p = non_cpu_intensive_project_needing_work(); if (p) return p; - gstate.compute_nuploading_results(); + setup(); - rr_simulation(); - compute_shares(); - project_priority_init(true); + for (int i=0; iproject; - p->sched_priority -= rp->estimated_flops_remaining()/max_queued_flops; - } - - p = 0; - if (gpus_usable) { - for (int i=1; ipwf.cant_fetch_work_reason) { + DEBUG(msg_printf(p, MSG_INFO, "skip: cfwr %d", p->pwf.cant_fetch_work_reason);) + continue; + } + + // For each resource type: + // - See if we can ask this project for work of that type; + // if so set a flag so that lower-priority projects + // won't request it + // - If so, see if work is needed for this type; + // if so, set "found" + // + int rsc_index = -1; + for (int i=0; i= 0) { + for (int i=0; isecs_this_rec_interval = 0; } + // temp in choose_project() + PROJECT* found_project; // a project able to ask for this work + void rr_init(); void update_stats(double sim_now, double dt, double buf_end); void update_busy_time(double dur, double nused); - PROJECT* choose_project_hyst(bool strict_hyst, PROJECT*); void supplement(PROJECT*); RSC_PROJECT_WORK_FETCH& project_state(PROJECT*); void print_state(const char*); @@ -250,6 +252,8 @@ struct RSC_WORK_FETCH { void set_request(PROJECT*); void set_request_excluded(PROJECT*); bool may_have_work(PROJECT*); + bool can_fetch(PROJECT*); + bool uses_starved_excluded_instances(PROJECT*); RSC_WORK_FETCH() { rsc_type = 0; ninstances = 0; @@ -287,11 +291,9 @@ struct PROJECT_WORK_FETCH { // global work fetch state // struct WORK_FETCH { - PROJECT* choose_project(bool strict_hyst, PROJECT*); + void setup(); + PROJECT* choose_project(); // Find a project to ask for work. - // If strict is false consider requesting work - // even if buffer is above min level - // or project is backed off for a resource type PROJECT* non_cpu_intensive_project_needing_work(); void piggyback_work_request(PROJECT*); // we're going to contact this project anyway;
#devsJob nameGFLOPs left
#devsJob name (* = high priority)GFLOPs left
GPU
%.2f%s%s%.0f