Client: fix job scheduling bug that could starve CPUs

Job scheduling has 2 phases:
    make_run_list(): build a sorted list of runnable jobs
    enforce_run_list(): go through the list and run jobs
The run list in general contains more jobs than can actually be run.
This is intentional.
There are lots of reasons why enforce_run_list() might not be able
to run a particular job, and we don't know these during make_run_list().
So we need to give enforce_run_list() a surplus of choices.
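
As a rough illustration of the two-phase shape (a minimal standalone
sketch, not the real client code; Job, can_run_now(), and run() are
invented stand-ins):

    // Phase 1 builds a surplus of sorted candidates; phase 2 walks
    // the list and skips whatever turns out to be unrunnable.
    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Job { double priority; bool uses_gpu; };

    // Stand-in for the many runtime checks phase 1 can't predict.
    bool can_run_now(const Job& j) { return !j.uses_gpu; }
    void run(const Job& j) { printf("running job, priority %.1f\n", j.priority); }

    // Phase 1: a priority-sorted list with more jobs than can run.
    std::vector<Job> make_run_list(std::vector<Job> runnable) {
        std::sort(runnable.begin(), runnable.end(),
            [](const Job& a, const Job& b) { return a.priority > b.priority; });
        return runnable;    // deliberately oversized
    }

    // Phase 2: run the best jobs that pass the runtime checks.
    void enforce_run_list(const std::vector<Job>& run_list, int ncpus) {
        int started = 0;
        for (const Job& j : run_list) {
            if (started >= ncpus) break;
            if (!can_run_now(j)) continue;   // this is why we need the surplus
            run(j);
            started++;
        }
    }

    int main() {
        std::vector<Job> jobs = {{3, true}, {2, false}, {1, false}};
        enforce_run_list(make_run_list(jobs), 2);
    }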

The problem: make_run_list() was tracking the RAM usage of the jobs it
added to the list, and stopping when the total exceeded physical RAM.
So when we added a surplus of GPU jobs to the list -
more than could actually be run -
they consumed the whole RAM budget,
and too few CPU jobs were put in the list, leaving CPUs idle.
A toy reproduction follows.
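
To make the failure mode concrete, here's a standalone toy with
assumed numbers (8 GB of RAM, 1 GB working sets; none of this is the
real code):

    #include <cstdio>
    #include <vector>

    struct Job { char type; double wss; };       // working set size, GB

    int main() {
        double ram_left = 8.0;                   // physical RAM budget
        std::vector<Job> jobs;
        for (int i = 0; i < 12; i++) jobs.push_back({'G', 1.0});  // GPU surplus
        for (int i = 0; i < 8;  i++) jobs.push_back({'C', 1.0});  // CPU jobs
        int gpu = 0, cpu = 0;
        // GPU jobs are scanned first, as in make_run_list().
        for (const Job& j : jobs) {
            if (j.wss > ram_left) continue;      // the old RAM test
            ram_left -= j.wss;                   // the old RAM accounting
            if (j.type == 'G') gpu++; else cpu++;
        }
        // Prints "run list: 8 GPU jobs, 0 CPU jobs": the GPU surplus ate
        // the whole RAM budget before any CPU job was considered.
        printf("run list: %d GPU jobs, %d CPU jobs\n", gpu, cpu);
    }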

Oddly, the comment at the start of cpu_sched.cpp said that RAM usage
was ignored by make_run_list(); this was not the case.

Anyway, I removed RAM accounting from make_run_list(),
and added a CPU-count cap to stop_scan_cpu() so the list stays bounded
even if there are only MT jobs, and lots of them.
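
To get a feel for that cap, a standalone toy with an assumed 8-CPU
host (not the real client code):

    #include <cstdio>

    int main() {
        double ncpus = 8;
        double ncpus_used_st = 0, ncpus_used_mt = 0;
        // same logic as the new stop_scan_cpu()
        auto stop_scan_cpu = [&]() {
            if (ncpus_used_st >= ncpus) return true;
            if (ncpus_used_mt >= 2*ncpus) return true;
                // kind of arbitrary, but bounds the scan even if
                // the list is all multi-thread jobs
            return false;
        };
        // add 4-CPU multi-thread jobs until the scan would stop
        int n = 0;
        while (!stop_scan_cpu()) {
            ncpus_used_mt += 4;
            n++;
        }
        printf("scan stops after %d MT jobs (%g CPUs)\n", n, ncpus_used_mt);
        // prints: scan stops after 4 MT jobs (16 CPUs)
    }
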
David Anderson 2014-02-11 12:33:13 -08:00
parent 2a84d20eae
commit 17e44af601
1 changed file with 11 additions and 21 deletions


cpu_sched.cpp

@@ -17,7 +17,7 @@
 // CPU scheduling logic.
 //
-// - create an ordered "run list" (schedule_cpus).
+// - create an ordered "run list" (make_run_list()).
 // The ordering is roughly as follows:
 // - GPU jobs first, then CPU jobs
 // - for a given resource, jobs in deadline danger first
@@ -39,8 +39,8 @@
 // - sort the list according to "more_important()"
 // - shuffle the list to avoid starving multi-thread jobs
 //
-// - scan through the resulting list,
-//   running the jobs and preempting other jobs.
+// - scan through the resulting list, running the jobs and preempting
+//   other jobs (enforce_run_list).
 // Don't run a job if
 // - its GPUs can't be assigned (possible if need >1 GPU)
 // - it's a multi-thread job, and CPU usage would be #CPUs+1 or more
@@ -99,7 +99,6 @@ struct PROC_RESOURCES {
     double ncpus_used_st;   // #CPUs of GPU or single-thread jobs
     double ncpus_used_mt;   // #CPUs of multi-thread jobs
     COPROCS pr_coprocs;
-    double ram_left;

     void init() {
         ncpus = gstate.ncpus;
@@ -107,7 +106,6 @@ struct PROC_RESOURCES {
         ncpus_used_mt = 0;
         pr_coprocs.clone(coprocs, false);
         pr_coprocs.clear_usage();
-        ram_left = gstate.available_ram();
         if (have_max_concurrent) {
             max_concurrent_init();
         }
@@ -116,7 +114,11 @@ struct PROC_RESOURCES {
     // should we stop scanning jobs?
     //
     inline bool stop_scan_cpu() {
-        return ncpus_used_st >= ncpus;
+        if (ncpus_used_st >= ncpus) return true;
+        if (ncpus_used_mt >= 2*ncpus) return true;
+            // kind of arbitrary, but need to have some limit
+            // in case there are only MT jobs, and lots of them
+        return false;
     }

     inline bool stop_scan_coproc(int rsc_type) {
@@ -131,7 +133,6 @@ struct PROC_RESOURCES {
     // (i.e add it to the runnable list; not actually run it)
     //
     bool can_schedule(RESULT* rp, ACTIVE_TASK* atp) {
-        double wss;
         if (max_concurrent_exceeded(rp)) return false;
         if (atp) {
             // don't schedule if something's pending
@@ -154,11 +155,7 @@ struct PROC_RESOURCES {
                 }
                 atp->needs_shmem = false;
             }
-            wss = atp->procinfo.working_set_size_smoothed;
-        } else {
-            wss = rp->avp->max_working_set_size;
         }
-        if (wss > ram_left) return false;
         if (rp->schedule_backoff > gstate.now) return false;
         if (rp->uses_coprocs()) {
             if (gpu_suspend_reason) return false;
@@ -211,13 +208,6 @@ struct PROC_RESOURCES {
         } else {
             ncpus_used_st += rp->avp->avg_ncpus;
         }
-        double wss;
-        if (atp) {
-            wss = atp->procinfo.working_set_size_smoothed;
-        } else {
-            wss = rp->avp->max_working_set_size;
-        }
-        ram_left -= wss;

         adjust_rec_sched(rp);
         max_concurrent_inc(rp);
@@ -1093,7 +1083,7 @@ void CLIENT_STATE::append_unfinished_time_slice(vector<RESULT*> &run_list) {
 // That's the only kind of suspended GPU job.
 // CORPOC::usage[]: for each instance, its usage
 //
-// enforce_schedule() calls assign_coprocs(),
+// enforce_run_list() calls assign_coprocs(),
 // which assigns coproc instances to scheduled jobs,
 // and prunes jobs for which we can't make an assignment
 // (the job list is in order of decreasing priority)
@@ -1536,7 +1526,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
 #endif
     if (log_flags.cpu_sched_debug) {
-        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule(): start");
+        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list(): start");
         msg_printf(0, MSG_INFO, "[cpu_sched_debug] preliminary job list:");
         print_job_list(run_list);
     }
@@ -1901,7 +1891,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
         set_client_state_dirty("enforce_cpu_schedule");
     }
     if (log_flags.cpu_sched_debug) {
-        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule: end");
+        msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list: end");
     }
     if (coproc_start_deferred) {
         if (log_flags.cpu_sched_debug) {