From 8d009ce3b3a511289ea641ee917cb1e90ddd82aa Mon Sep 17 00:00:00 2001 From: David Anderson Date: Sat, 24 May 2014 15:18:41 -0700 Subject: [PATCH] client: scheduling and work fetch tweaks for GPU exclusion cases Scheduling: if a resource has exclusions, put all jobs in the run list; otherwise we might fail to have a job for a GPU instance, and starve it. Work fetch: allow work fetch from zero-share projects if the resource has instances that are idle because of GPU exclusion --- client/cpu_sched.cpp | 16 ++++++---------- client/work_fetch.cpp | 6 ++++-- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp index 29eb03b8a9..584ad4d6a9 100644 --- a/client/cpu_sched.cpp +++ b/client/cpu_sched.cpp @@ -170,23 +170,19 @@ struct PROC_RESOURCES { void schedule(RESULT* rp, ACTIVE_TASK* atp, const char* description) { if (log_flags.cpu_sched_debug) { msg_printf(rp->project, MSG_INFO, - "[cpu_sched_debug] scheduling %s (%s) (prio %f)", + "[cpu_sched_debug] add to run list: %s (%s) (prio %f)", rp->name, description, rp->project->sched_priority ); } if (rp->uses_coprocs()) { - // if this job is currently running, - // and the resource type has exclusions, - // don't reserve instances; - // This allows more jobs in the run list - // and avoids a starvation case + // if the resource type has exclusions, don't reserve instances. + // It means that the run list will include all jobs + // for that resource type. + // Inefficient, but necessary to avoid starvation cases. // int rt = rp->avp->gpu_usage.rsc_type; - bool dont_reserve = - rsc_work_fetch[rt].has_exclusions - && atp != NULL - && atp->is_gpu_task_running(); + bool dont_reserve = rsc_work_fetch[rt].has_exclusions; if (!dont_reserve) { reserve_coprocs(*rp); } diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp index acb0575a23..ee7ed0624e 100644 --- a/client/work_fetch.cpp +++ b/client/work_fetch.cpp @@ -270,7 +270,9 @@ void RSC_WORK_FETCH::print_state(const char* name) { shortfall, nidle_now, saturated_time, busy_time_estimator.get_busy_time() ); - //msg_printf(0, MSG_INFO, "[work_fetch] sim used inst %d sim excl inst %d", sim_used_instances, sim_excluded_instances); + msg_printf(0, MSG_INFO, "[work_fetch] sim used inst %d sim excl inst %d", + sim_used_instances, sim_excluded_instances + ); for (unsigned int i=0; iresource_share == 0 && nidle_now == 0) { + if (p->resource_share == 0 && nidle_now == 0 && sim_excluded_instances==0) { WF_DEBUG(msg_printf(p, MSG_INFO, "skip: zero share");) return DONT_FETCH_ZERO_SHARE; }