client: scheduling and work fetch tweaks for GPU exclusion cases

Scheduling: if a resource has exclusions, put all jobs in the run list;
otherwise we might fail to have a job for a GPU instance, and starve it.

Work fetch: allow work fetch from zero-share projects if the resource
has instances that are idle because of GPU exclusions.
This commit is contained in:
David Anderson 2014-05-24 15:18:41 -07:00
parent 02003bc2da
commit 8d009ce3b3
2 changed files with 10 additions and 12 deletions

View File

@ -170,23 +170,19 @@ struct PROC_RESOURCES {
void schedule(RESULT* rp, ACTIVE_TASK* atp, const char* description) { void schedule(RESULT* rp, ACTIVE_TASK* atp, const char* description) {
if (log_flags.cpu_sched_debug) { if (log_flags.cpu_sched_debug) {
msg_printf(rp->project, MSG_INFO, msg_printf(rp->project, MSG_INFO,
"[cpu_sched_debug] scheduling %s (%s) (prio %f)", "[cpu_sched_debug] add to run list: %s (%s) (prio %f)",
rp->name, description, rp->name, description,
rp->project->sched_priority rp->project->sched_priority
); );
} }
if (rp->uses_coprocs()) { if (rp->uses_coprocs()) {
// if this job is currently running, // if the resource type has exclusions, don't reserve instances.
// and the resource type has exclusions, // It means that the run list will include all jobs
// don't reserve instances; // for that resource type.
// This allows more jobs in the run list // Inefficient, but necessary to avoid starvation cases.
// and avoids a starvation case
// //
int rt = rp->avp->gpu_usage.rsc_type; int rt = rp->avp->gpu_usage.rsc_type;
bool dont_reserve = bool dont_reserve = rsc_work_fetch[rt].has_exclusions;
rsc_work_fetch[rt].has_exclusions
&& atp != NULL
&& atp->is_gpu_task_running();
if (!dont_reserve) { if (!dont_reserve) {
reserve_coprocs(*rp); reserve_coprocs(*rp);
} }

View File

@ -270,7 +270,9 @@ void RSC_WORK_FETCH::print_state(const char* name) {
shortfall, nidle_now, saturated_time, shortfall, nidle_now, saturated_time,
busy_time_estimator.get_busy_time() busy_time_estimator.get_busy_time()
); );
//msg_printf(0, MSG_INFO, "[work_fetch] sim used inst %d sim excl inst %d", sim_used_instances, sim_excluded_instances); msg_printf(0, MSG_INFO, "[work_fetch] sim used inst %d sim excl inst %d",
sim_used_instances, sim_excluded_instances
);
for (unsigned int i=0; i<gstate.projects.size(); i++) { for (unsigned int i=0; i<gstate.projects.size(); i++) {
char buf[256]; char buf[256];
PROJECT* p = gstate.projects[i]; PROJECT* p = gstate.projects[i];
@ -603,7 +605,7 @@ int RSC_WORK_FETCH::cant_fetch(PROJECT *p) {
// if project has zero resource share, // if project has zero resource share,
// only fetch work if a device is idle // only fetch work if a device is idle
// //
if (p->resource_share == 0 && nidle_now == 0) { if (p->resource_share == 0 && nidle_now == 0 && sim_excluded_instances==0) {
WF_DEBUG(msg_printf(p, MSG_INFO, "skip: zero share");) WF_DEBUG(msg_printf(p, MSG_INFO, "skip: zero share");)
return DONT_FETCH_ZERO_SHARE; return DONT_FETCH_ZERO_SHARE;
} }