diff --git a/client/cs_scheduler.cpp b/client/cs_scheduler.cpp
index f29ac5b69d..de2772fed5 100644
--- a/client/cs_scheduler.cpp
+++ b/client/cs_scheduler.cpp
@@ -491,7 +491,7 @@ bool CLIENT_STATE::scheduler_rpc_poll() {
&& !(config.fetch_minimal_work && had_or_requested_work)
) {
- p = work_fetch.choose_project(true, NULL);
+ p = work_fetch.choose_project();
if (p) {
if (actively_uploading(p)) {
if (log_flags.work_fetch_debug) {
diff --git a/client/sim.cpp b/client/sim.cpp
index 3cd7921c79..66729eb75a 100644
--- a/client/sim.cpp
+++ b/client/sim.cpp
@@ -525,7 +525,7 @@ bool CLIENT_STATE::scheduler_rpc_poll() {
p = find_project_with_overdue_results(false);
if (p) {
- //printf("doing RPC to %s to report results\n", p->project_name);
+ msg_printf(p, MSG_INFO, "doing RPC to report results");
work_fetch.piggyback_work_request(p);
action = simulate_rpc(p);
break;
@@ -546,6 +546,7 @@ bool CLIENT_STATE::scheduler_rpc_poll() {
p = work_fetch.choose_project(true, NULL);
if (p) {
+ msg_printf(p, MSG_INFO, "doing RPC to get work");
action = simulate_rpc(p);
break;
}
@@ -870,7 +871,7 @@ void show_resource(int rsc_type) {
found = true;
fprintf(html_out,
"
\n"
- "#devs | Job name | GFLOPs left | %s
\n",
+ "#devs | Job name (* = high priority) | GFLOPs left | %s
\n",
rsc_type?"GPU | ":""
);
}
@@ -882,7 +883,7 @@ void show_resource(int rsc_type) {
fprintf(html_out, "%.2f | %s%s | %.0f | %s
\n",
ninst,
colors[p->index%NCOLORS],
- rp->rr_sim_misses_deadline?"*":"",
+ rp->edf_scheduled?"*":"",
rp->name,
rp->sim_flops_left/1e9,
buf
@@ -1006,8 +1007,17 @@ void set_initial_rec() {
}
}
+static bool compare_names(PROJECT* p1, PROJECT* p2) {
+ return (strcmp(p1->project_name, p2->project_name) < 0);
+}
+
void write_recs() {
fprintf(rec_file, "%f ", gstate.now);
+ std::sort(
+ gstate.projects.begin(),
+ gstate.projects.end(),
+ compare_names
+ );
    for (unsigned int i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
        fprintf(rec_file, "%f ", p->pwf.rec);
diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp
index f5ce61b6cb..c0835dfde9 100644
--- a/client/work_fetch.cpp
+++ b/client/work_fetch.cpp
@@ -33,6 +33,9 @@
#include "work_fetch.h"
+//#define DEBUG(X) X
+#define DEBUG(X)
+
using std::vector;
RSC_WORK_FETCH rsc_work_fetch[MAX_RSC];
@@ -180,142 +183,6 @@ static bool wacky_dcf(PROJECT* p) {
return (dcf < 0.02 || dcf > 80.0);
}
-// If this resource is below min buffer level,
-// return the highest-priority project that may have jobs for it.
-//
-// It the resource has instanced starved because of exclusions,
-// return the highest-priority project that may have jobs
-// and doesn't exclude those instances.
-//
-// Only choose a project if the buffer is below min level;
-// if strict_hyst is true, relax this to max level
-//
-// If backoff_exempt_project is non-NULL,
-// don't enforce resource backoffs for that project;
-// this is for when we're going to do a scheduler RPC anyway
-// and we're deciding whether to piggyback a work request
-//
-PROJECT* RSC_WORK_FETCH::choose_project_hyst(
- bool strict_hyst,
- PROJECT* backoff_exempt_project
-) {
- PROJECT* pbest = NULL;
- bool buffer_low = true;
- if (strict_hyst) {
- if (saturated_time > gstate.work_buf_min()) buffer_low = false;
- } else {
- if (saturated_time > gstate.work_buf_total()) buffer_low = false;
- }
-
- if (log_flags.work_fetch_debug) {
- msg_printf(0, MSG_INFO,
- "[work_fetch] choose_project() for %s: buffer_low: %s; sim_excluded_instances %d\n",
- rsc_name(rsc_type), buffer_low?"yes":"no", sim_excluded_instances
- );
- }
-
- if (!buffer_low && !sim_excluded_instances) return NULL;
-
-    for (unsigned i=0; i<gstate.projects.size(); i++) {
-        PROJECT* p = gstate.projects[i];
-        if (p->pwf.cant_fetch_work_reason) {
- //msg_printf(p, MSG_INFO, "skip: cfwr %d", p->pwf.cant_fetch_work_reason);
- continue;
- }
-
- // see whether work fetch for this resource is banned
- // by prefs, config, project, or acct mgr
- //
- if (dont_fetch(p, rsc_type)) {
- //msg_printf(p, MSG_INFO, "skip: dont_fetch");
- continue;
- }
-
- // check backoff
- //
- if (p != backoff_exempt_project) {
- if (project_state(p).backoff_time > gstate.now) {
- //msg_printf(p, MSG_INFO, "skip: backoff");
- continue;
- }
- }
-
- // if project has zero resource share,
- // only fetch work if a device is idle
- //
- if (p->resource_share == 0 && nidle_now == 0) {
- //msg_printf(p, MSG_INFO, "skip: zero share");
- continue;
- }
-
- // if project has excluded GPUs of this type,
- // we need to avoid fetching work just because there's an idle instance
- // or a shortfall;
- // fetching work might not alleviate either of these,
- // and we'd end up fetching unbounded work.
- // At the same time, we want to respect work buf params if possible.
- //
- // Current policy:
- // don't fetch work if remaining time of this project's jobs
- // exceeds work_buf_min * (#usable instances / #instances)
- //
- // TODO: THIS IS FAIRLY CRUDE. Making it smarter would require
- // computing shortfall etc. on a per-project basis
- //
- int nexcl = p->rsc_pwf[rsc_type].ncoprocs_excluded;
- if (rsc_type && nexcl) {
- int n_not_excluded = ninstances - nexcl;
- if (p->rsc_pwf[rsc_type].queue_est > (gstate.work_buf_min() * n_not_excluded)/ninstances) {
- //msg_printf(p, MSG_INFO, "skip: too much work");
- continue;
- }
- }
-
- RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
- if (rpwf.anon_skip) {
- //msg_printf(p, MSG_INFO, "skip: anon");
- continue;
- }
-
- // if we're sending work only because of exclusion starvation,
- // make sure this project can use the starved instances
- //
- if (!buffer_low) {
- if ((sim_excluded_instances & rpwf.non_excluded_instances) == 0) {
- //msg_printf(p, MSG_INFO, "skip: excl");
- continue;
- }
- }
-
- if (pbest) {
- if (pbest->sched_priority > p->sched_priority) {
- //msg_printf(p, MSG_INFO, "skip: prio");
- continue;
- }
- }
- pbest = p;
- }
- if (!pbest) {
- if (log_flags.work_fetch_debug) {
- msg_printf(0, MSG_INFO,
- "[work_fetch] no eligible project for %s",
- rsc_name(rsc_type)
- );
- }
- return NULL;
- }
- work_fetch.clear_request();
- if (buffer_low) {
- work_fetch.set_all_requests_hyst(pbest, rsc_type);
- } else {
- set_request_excluded(pbest);
- }
- return pbest;
-}
-
// request this project's share of shortfall and instances.
// don't request anything if project is backed off.
//
@@ -483,6 +350,7 @@ void WORK_FETCH::rr_init() {
}
}
+#if 0
// if the given project is highest-priority among the projects
// eligible for the resource, set request fields
//
@@ -546,6 +414,7 @@ void WORK_FETCH::set_all_requests(PROJECT* p) {
}
}
}
+#endif
void WORK_FETCH::print_state() {
msg_printf(0, MSG_INFO, "[work_fetch] ------- start work fetch state -------");
@@ -590,7 +459,7 @@ void WORK_FETCH::clear_request() {
// decide if we should piggy-back a work fetch request.
//
void WORK_FETCH::piggyback_work_request(PROJECT* p) {
- clear_request();
+ DEBUG(msg_printf(p, MSG_INFO, "piggyback_work_request()");)
if (config.fetch_minimal_work && gstate.had_or_requested_work) return;
if (p->dont_request_more_work) return;
if (p->non_cpu_intensive) {
@@ -601,29 +470,53 @@ void WORK_FETCH::piggyback_work_request(PROJECT* p) {
}
// if project was updated from manager and config says so,
- // always fetch work if needed
+ // fetch work for a resource even if there are higher-prio projects
+ // able to fetch it
//
+ bool check_higher_priority_projects = true;
if (p->sched_rpc_pending && config.fetch_on_update) {
- set_all_requests_hyst(p, -1);
+ check_higher_priority_projects = false;
-        return;
}
- compute_cant_fetch_work_reason();
- PROJECT* bestp = choose_project(false, p);
- if (p != bestp) {
- if (p->pwf.cant_fetch_work_reason == 0) {
- if (bestp) {
- p->pwf.cant_fetch_work_reason = CANT_FETCH_WORK_NOT_HIGHEST_PRIORITY;
- if (log_flags.work_fetch_debug) {
- msg_printf(0, MSG_INFO,
- "[work_fetch] not piggybacking work req: %s has higher priority",
- bestp->get_project_name()
- );
- }
- } else {
- p->pwf.cant_fetch_work_reason = CANT_FETCH_WORK_DONT_NEED;
- }
+
+ setup();
+
+ // For each resource, scan projects in decreasing priority,
+ // seeing if there's one that's higher-priority than this
+ // able to fetch work for the resource.
+ // If not, and the resource needs topping off, do so
+ //
+ for (int i=0; iproject_name);)
+ break;
+ }
+ }
+ if (p != p2) continue;
+ }
+ DEBUG(msg_printf(p, MSG_INFO, "piggyback: requesting %s", rsc_name(i));)
+ if (buffer_low) {
+ rwf.set_request(p);
+ } else {
+ rwf.set_request_excluded(p);
}
- clear_request();
}
}
@@ -643,13 +536,117 @@ PROJECT* WORK_FETCH::non_cpu_intensive_project_needing_work() {
return 0;
}
-// choose a project to fetch work from,
-// and set the request fields of resource objects.
+static bool higher_priority(PROJECT *p1, PROJECT *p2) {
+ return (p1->sched_priority > p2->sched_priority);
+}
+
+// a variety of checks for whether we should ask this project
+// for work of this type
//
-PROJECT* WORK_FETCH::choose_project(
- bool strict_hyst,
- PROJECT* backoff_exempt_project
-) {
+bool RSC_WORK_FETCH::can_fetch(PROJECT *p) {
+ // see whether work fetch for this resource is banned
+ // by prefs, config, project, or acct mgr
+ //
+ if (dont_fetch(p, rsc_type)) {
+ DEBUG(msg_printf(p, MSG_INFO, "skip: dont_fetch");)
+ return false;
+ }
+
+ // check backoff
+ //
+ if (project_state(p).backoff_time > gstate.now) {
+ DEBUG(msg_printf(p, MSG_INFO, "skip: backoff");)
+ return false;
+ }
+
+ // if project has zero resource share,
+ // only fetch work if a device is idle
+ //
+ if (p->resource_share == 0 && nidle_now == 0) {
+ DEBUG(msg_printf(p, MSG_INFO, "skip: zero share");)
+ return false;
+ }
+
+ // if project has excluded GPUs of this type,
+ // we need to avoid fetching work just because there's an idle instance
+ // or a shortfall;
+ // fetching work might not alleviate either of these,
+ // and we'd end up fetching unbounded work.
+ // At the same time, we want to respect work buf params if possible.
+ //
+ // Current policy:
+ // don't fetch work if remaining time of this project's jobs
+ // exceeds work_buf_min * (#usable instances / #instances)
+ //
+ // TODO: THIS IS FAIRLY CRUDE. Making it smarter would require
+ // computing shortfall etc. on a per-project basis
+ //
+ int nexcl = p->rsc_pwf[rsc_type].ncoprocs_excluded;
+ if (rsc_type && nexcl) {
+ int n_not_excluded = ninstances - nexcl;
+ if (p->rsc_pwf[rsc_type].queue_est > (gstate.work_buf_min() * n_not_excluded)/ninstances) {
+ DEBUG(msg_printf(p, MSG_INFO, "skip: too much work");)
+ return false;
+ }
+ }
+
+ RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
+ if (rpwf.anon_skip) {
+ DEBUG(msg_printf(p, MSG_INFO, "skip: anon");)
+ return false;
+ }
+ return true;
+}
+
+// return true if there is exclusion starvation
+// and this project can use the starved instances
+//
+bool RSC_WORK_FETCH::uses_starved_excluded_instances(PROJECT* p) {
+ RSC_PROJECT_WORK_FETCH& rpwf = project_state(p);
+ if (!sim_excluded_instances) return false;
+ if ((sim_excluded_instances & rpwf.non_excluded_instances) == 0) {
+ DEBUG(msg_printf(p, MSG_INFO, "skip: excl");)
+ return false;
+ }
+ return true;
+}
+
+// setup for choose_project() and piggyback()
+//
+void WORK_FETCH::setup() {
+ gstate.compute_nuploading_results();
+
+ rr_simulation();
+ compute_shares();
+ project_priority_init(true);
+ clear_request();
+
+ // Decrement the priority of projects that have work queued.
+ // Specifically, subtract
+ // (FLOPs queued for P)/(FLOPs of max queue)
+ // which will generally be between 0 and 1.
+ // This is a little arbitrary but I can't think of anything better.
+ //
+ double max_queued_flops = gstate.work_buf_total()*total_peak_flops();
+    for (unsigned int i=0; i<gstate.results.size(); i++) {
+        RESULT* rp = gstate.results[i];
+        PROJECT* p = rp->project;
+        p->sched_priority -= rp->estimated_flops_remaining()/max_queued_flops;
+    }
+ std::sort(
+ gstate.projects.begin(),
+ gstate.projects.end(),
+ higher_priority
+ );
+}
+
+// Choose a project to fetch work from,
+// and set the request fields of resource objects.
+// If "strict_hyst" is set, require that some resource be below min buf;
+// otherwise require below max buf
+// (or
+//
+PROJECT* WORK_FETCH::choose_project() {
PROJECT* p;
if (log_flags.work_fetch_debug) {
@@ -659,34 +656,86 @@ PROJECT* WORK_FETCH::choose_project(
p = non_cpu_intensive_project_needing_work();
if (p) return p;
- gstate.compute_nuploading_results();
+ setup();
- rr_simulation();
- compute_shares();
- project_priority_init(true);
+ for (int i=0; iproject;
- p->sched_priority -= rp->estimated_flops_remaining()/max_queued_flops;
- }
-
- p = 0;
- if (gpus_usable) {
- for (int i=1; ipwf.cant_fetch_work_reason) {
+ DEBUG(msg_printf(p, MSG_INFO, "skip: cfwr %d", p->pwf.cant_fetch_work_reason);)
+ continue;
+ }
+
+ // For each resource type:
+ // - See if we can ask this project for work of that type;
+ // if so set a flag so that lower-priority projects
+ // won't request it
+ // - If so, see if work is needed for this type;
+ // if so, set "found"
+ //
+ int rsc_index = -1;
+ for (int i=0; i= 0) {
+ for (int i=0; isecs_this_rec_interval = 0;
}
+ // temp in choose_project()
+ PROJECT* found_project; // a project able to ask for this work
+
void rr_init();
void update_stats(double sim_now, double dt, double buf_end);
void update_busy_time(double dur, double nused);
- PROJECT* choose_project_hyst(bool strict_hyst, PROJECT*);
void supplement(PROJECT*);
RSC_PROJECT_WORK_FETCH& project_state(PROJECT*);
void print_state(const char*);
@@ -250,6 +252,8 @@ struct RSC_WORK_FETCH {
void set_request(PROJECT*);
void set_request_excluded(PROJECT*);
bool may_have_work(PROJECT*);
+ bool can_fetch(PROJECT*);
+ bool uses_starved_excluded_instances(PROJECT*);
RSC_WORK_FETCH() {
rsc_type = 0;
ninstances = 0;
@@ -287,11 +291,9 @@ struct PROJECT_WORK_FETCH {
// global work fetch state
//
struct WORK_FETCH {
- PROJECT* choose_project(bool strict_hyst, PROJECT*);
+ void setup();
+ PROJECT* choose_project();
// Find a project to ask for work.
- // If strict is false consider requesting work
- // even if buffer is above min level
- // or project is backed off for a resource type
PROJECT* non_cpu_intensive_project_needing_work();
void piggyback_work_request(PROJECT*);
// we're going to contact this project anyway;