client: scheduling and work fetch tweaks for GPU exclusion cases

Scheduling: if a resource type has exclusions, put all jobs for that
resource type in the run list; otherwise we might fail to have a job
for a GPU instance, and starve it.

Work fetch: allow work fetch from zero-share projects if the resource
has instances that are idle because of GPU exclusions.
This commit is contained in:
David Anderson 2014-05-24 15:18:41 -07:00
parent 02003bc2da
commit 8d009ce3b3
2 changed files with 10 additions and 12 deletions

View File

@ -170,23 +170,19 @@ struct PROC_RESOURCES {
void schedule(RESULT* rp, ACTIVE_TASK* atp, const char* description) {
if (log_flags.cpu_sched_debug) {
msg_printf(rp->project, MSG_INFO,
"[cpu_sched_debug] scheduling %s (%s) (prio %f)",
"[cpu_sched_debug] add to run list: %s (%s) (prio %f)",
rp->name, description,
rp->project->sched_priority
);
}
if (rp->uses_coprocs()) {
// if this job is currently running,
// and the resource type has exclusions,
// don't reserve instances;
// This allows more jobs in the run list
// and avoids a starvation case
// if the resource type has exclusions, don't reserve instances.
// It means that the run list will include all jobs
// for that resource type.
// Inefficient, but necessary to avoid starvation cases.
//
int rt = rp->avp->gpu_usage.rsc_type;
bool dont_reserve =
rsc_work_fetch[rt].has_exclusions
&& atp != NULL
&& atp->is_gpu_task_running();
bool dont_reserve = rsc_work_fetch[rt].has_exclusions;
if (!dont_reserve) {
reserve_coprocs(*rp);
}

View File

@ -270,7 +270,9 @@ void RSC_WORK_FETCH::print_state(const char* name) {
shortfall, nidle_now, saturated_time,
busy_time_estimator.get_busy_time()
);
//msg_printf(0, MSG_INFO, "[work_fetch] sim used inst %d sim excl inst %d", sim_used_instances, sim_excluded_instances);
msg_printf(0, MSG_INFO, "[work_fetch] sim used inst %d sim excl inst %d",
sim_used_instances, sim_excluded_instances
);
for (unsigned int i=0; i<gstate.projects.size(); i++) {
char buf[256];
PROJECT* p = gstate.projects[i];
@ -603,7 +605,7 @@ int RSC_WORK_FETCH::cant_fetch(PROJECT *p) {
// if project has zero resource share,
// only fetch work if a device is idle
//
if (p->resource_share == 0 && nidle_now == 0) {
if (p->resource_share == 0 && nidle_now == 0 && sim_excluded_instances==0) {
WF_DEBUG(msg_printf(p, MSG_INFO, "skip: zero share");)
return DONT_FETCH_ZERO_SHARE;
}