From 17e44af601da218df6da0d90f065029f86b9de65 Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Tue, 11 Feb 2014 12:33:13 -0800
Subject: [PATCH] Client: fix job scheduling bug that could starve CPUs

Job scheduling has 2 phases:
  make_run_list(): build a sorted list of runnable jobs
  enforce_run_list(): go through the list and run jobs

The run list in general contains more jobs than can actually be run.
This is intentional. There are lots of reasons why enforce_run_list()
might not be able to run a particular job, and we don't know these
during make_run_list(). So we need to give enforce_run_list() a
surplus of choices.

The problem: make_run_list() was accounting the RAM usage of jobs in
the list, and stopping when this exceeded physical RAM. This led to a
situation where we added a bunch of GPU jobs to the list - more than
could actually be run - and this caused too few CPU jobs to be put in
the list.

Oddly, the comment at the start of cpu_sched.cpp said that RAM usage
was ignored by make_run_list(); this was not the case.

Anyway, I removed RAM accounting from make_run_list().
---
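
Notes: below is a minimal, compilable sketch of the two-phase pattern
described in the commit message. Everything in it (Job, the sort keys,
both function bodies) is a simplified stand-in for illustration, not
the client's actual RESULT/ACTIVE_TASK types or the real logic in
client/cpu_sched.cpp.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Job {                 // stand-in for the client's RESULT
        const char* name;
        bool uses_gpu;
        bool deadline_danger;
        double avg_ncpus;        // CPUs the job would occupy
    };

    // Phase 1: build an ordered run list. It deliberately contains
    // more jobs than can run; per-job feasibility (RAM, coproc
    // assignment, ...) is NOT checked here - that's phase 2's job.
    std::vector<Job> make_run_list(std::vector<Job> jobs) {
        std::sort(jobs.begin(), jobs.end(), [](const Job& a, const Job& b) {
            if (a.uses_gpu != b.uses_gpu) return a.uses_gpu;  // GPU jobs first
            return a.deadline_danger && !b.deadline_danger;   // then deadline danger
        });
        return jobs;
    }

    // Phase 2: walk the list in priority order and run what fits,
    // skipping (not aborting the scan on) jobs that can't run.
    std::vector<Job> enforce_run_list(const std::vector<Job>& run_list, double ncpus) {
        std::vector<Job> running;
        double used = 0;
        for (const Job& j : run_list) {
            if (used + j.avg_ncpus > ncpus) continue;  // can't run this one; try the next
            used += j.avg_ncpus;
            running.push_back(j);
        }
        return running;
    }

    int main() {
        std::vector<Job> jobs = {
            {"cpu1", false, false, 1}, {"gpu1", true, false, 0.5},
            {"cpu2", false, true, 1},  {"gpu2", true, false, 0.5},
        };
        // With 2 CPUs, the surplus list still fills the machine: the
        // GPU jobs and the deadline-danger CPU job run; cpu1 waits.
        for (const Job& j : enforce_run_list(make_run_list(jobs), 2)) {
            std::printf("running %s\n", j.name);
        }
    }

The point of the fix, in these terms: when phase 1 stopped early on a
global RAM total, the surplus could end up being all GPU jobs, leaving
phase 2 with no CPU jobs to choose from. Dropping RAM accounting from
phase 1 restores the surplus; per-job constraints such as RAM belong
in phase 2, which simply skips jobs it can't run.
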
 client/cpu_sched.cpp | 32 +++++++++++---------------------
 1 file changed, 11 insertions(+), 21 deletions(-)

diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp
index e6164647b1..029ca32558 100644
--- a/client/cpu_sched.cpp
+++ b/client/cpu_sched.cpp
@@ -17,7 +17,7 @@
 // CPU scheduling logic.
 //
-// - create an ordered "run list" (schedule_cpus).
+// - create an ordered "run list" (make_run_list()).
 //   The ordering is roughly as follows:
 //   - GPU jobs first, then CPU jobs
 //   - for a given resource, jobs in deadline danger first
@@ -39,8 +39,8 @@
 // - sort the list according to "more_important()"
 // - shuffle the list to avoid starving multi-thread jobs
 //
-// - scan through the resulting list,
-//   running the jobs and preempting other jobs.
+// - scan through the resulting list, running the jobs and preempting
+//   other jobs (enforce_run_list).
 //   Don't run a job if
 //   - its GPUs can't be assigned (possible if need >1 GPU)
 //   - it's a multi-thread job, and CPU usage would be #CPUs+1 or more
@@ -99,7 +99,6 @@ struct PROC_RESOURCES {
     double ncpus_used_st;   // #CPUs of GPU or single-thread jobs
     double ncpus_used_mt;   // #CPUs of multi-thread jobs
     COPROCS pr_coprocs;
-    double ram_left;

     void init() {
         ncpus = gstate.ncpus;
@@ -107,7 +106,6 @@ struct PROC_RESOURCES {
         ncpus_used_mt = 0;
         pr_coprocs.clone(coprocs, false);
         pr_coprocs.clear_usage();
-        ram_left = gstate.available_ram();
         if (have_max_concurrent) {
             max_concurrent_init();
         }
@@ -116,7 +114,11 @@ struct PROC_RESOURCES {
     // should we stop scanning jobs?
     //
     inline bool stop_scan_cpu() {
-        return ncpus_used_st >= ncpus;
+        if (ncpus_used_st >= ncpus) return true;
+        if (ncpus_used_mt >= 2*ncpus) return true;
+        // kind of arbitrary, but need to have some limit
+        // in case there are only MT jobs, and lots of them
+        return false;
     }

     inline bool stop_scan_coproc(int rsc_type) {
@@ -131,7 +133,6 @@ struct PROC_RESOURCES {
     // (i.e add it to the runnable list; not actually run it)
     //
     bool can_schedule(RESULT* rp, ACTIVE_TASK* atp) {
-        double wss;
         if (max_concurrent_exceeded(rp)) return false;
         if (atp) {
             // don't schedule if something's pending
@@ -154,11 +155,7 @@ struct PROC_RESOURCES {
                 }
                 atp->needs_shmem = false;
             }
-            wss = atp->procinfo.working_set_size_smoothed;
-        } else {
-            wss = rp->avp->max_working_set_size;
         }
-        if (wss > ram_left) return false;
         if (rp->schedule_backoff > gstate.now) return false;
         if (rp->uses_coprocs()) {
             if (gpu_suspend_reason) return false;
@@ -211,13 +208,6 @@ struct PROC_RESOURCES {
         } else {
             ncpus_used_st += rp->avp->avg_ncpus;
         }
-        double wss;
-        if (atp) {
-            wss = atp->procinfo.working_set_size_smoothed;
-        } else {
-            wss = rp->avp->max_working_set_size;
-        }
-        ram_left -= wss;

         adjust_rec_sched(rp);
         max_concurrent_inc(rp);
@@ -1093,7 +1083,7 @@ void CLIENT_STATE::append_unfinished_time_slice(vector<RESULT*> &run_list) {
 //   That's the only kind of suspended GPU job.
 // CORPOC::usage[]: for each instance, its usage
 //
-// enforce_schedule() calls assign_coprocs(),
+// enforce_run_list() calls assign_coprocs(),
 // which assigns coproc instances to scheduled jobs,
 // and prunes jobs for which we can't make an assignment
 // (the job list is in order of decreasing priority)
@@ -1536,7 +1526,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
 #endif

     if (log_flags.cpu_sched_debug) {
-        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule(): start");
+        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list(): start");
         msg_printf(0, MSG_INFO, "[cpu_sched_debug] preliminary job list:");
         print_job_list(run_list);
     }
@@ -1901,7 +1891,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
         set_client_state_dirty("enforce_cpu_schedule");
     }
     if (log_flags.cpu_sched_debug) {
-        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule: end");
+        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list: end");
     }
     if (coproc_start_deferred) {
         if (log_flags.cpu_sched_debug) {
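
A note on the new stop_scan_cpu() cutoff above: the 2*ncpus bound on
multi-thread CPU usage is arbitrary (as the in-code comment says), but
without some bound a host with only MT jobs, and lots of them, would
keep growing the run list without limit. A standalone sketch of the
cutoff arithmetic, with plain parameters standing in for the
PROC_RESOURCES members:

    #include <cstdio>

    // Stop building the run list once single-thread jobs can fill the
    // CPUs, or once multi-thread jobs account for twice the CPU count.
    bool stop_scan_cpu(double ncpus, double ncpus_used_st, double ncpus_used_mt) {
        if (ncpus_used_st >= ncpus) return true;
        if (ncpus_used_mt >= 2 * ncpus) return true;
        return false;
    }

    int main() {
        // On a 4-CPU host with only MT jobs in the list:
        std::printf("%d\n", stop_scan_cpu(4, 0, 7));  // 0: keep scanning
        std::printf("%d\n", stop_scan_cpu(4, 0, 8));  // 1: stop (8 = 2*4)
    }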