Client: fix job scheduling bug that could starve CPUs

Job scheduling has 2 phases:
    make_run_list(): build a sorted list of runnable jobs
    enforce_run_list(): go through the list and run the jobs
The run list in general contains more jobs than can actually be run.
This is intentional.
There are lots of reasons why enforce_run_list() might not be able
to run a particular job, and we don't know these during make_run_list().
So we need to give enforce_run_list() a surplus of choices.

The problem: make_run_list() was accounting RAM usage of jobs in the list,
and stopping when this exceeded physical RAM.
This led to a situation where we added a bunch of GPU jobs to the list -
more than could actually be run -
and this caused too few CPU jobs to be put in the list.

Oddly, the comment at the start of cpu_sched.cpp said that RAM usage
was ignored by make_run_list(); this was not the case.

Anyway, I removed RAM accounting from make_run_list().
This commit is contained in:
David Anderson 2014-02-11 12:33:13 -08:00
parent 2a84d20eae
commit 17e44af601
1 changed file with 11 additions and 21 deletions

View File

@ -17,7 +17,7 @@
// CPU scheduling logic.
//
// - create an ordered "run list" (schedule_cpus).
// - create an ordered "run list" (make_run_list()).
// The ordering is roughly as follows:
// - GPU jobs first, then CPU jobs
// - for a given resource, jobs in deadline danger first
@ -39,8 +39,8 @@
// - sort the list according to "more_important()"
// - shuffle the list to avoid starving multi-thread jobs
//
// - scan through the resulting list,
// running the jobs and preempting other jobs.
// - scan through the resulting list, running the jobs and preempting
// other jobs (enforce_run_list).
// Don't run a job if
// - its GPUs can't be assigned (possible if need >1 GPU)
// - it's a multi-thread job, and CPU usage would be #CPUs+1 or more
@ -99,7 +99,6 @@ struct PROC_RESOURCES {
double ncpus_used_st; // #CPUs of GPU or single-thread jobs
double ncpus_used_mt; // #CPUs of multi-thread jobs
COPROCS pr_coprocs;
double ram_left;
void init() {
ncpus = gstate.ncpus;
@ -107,7 +106,6 @@ struct PROC_RESOURCES {
ncpus_used_mt = 0;
pr_coprocs.clone(coprocs, false);
pr_coprocs.clear_usage();
ram_left = gstate.available_ram();
if (have_max_concurrent) {
max_concurrent_init();
}
@ -116,7 +114,11 @@ struct PROC_RESOURCES {
// should we stop scanning jobs?
//
inline bool stop_scan_cpu() {
return ncpus_used_st >= ncpus;
if (ncpus_used_st >= ncpus) return true;
if (ncpus_used_mt >= 2*ncpus) return true;
// kind of arbitrary, but need to have some limit
// in case there are only MT jobs, and lots of them
return false;
}
inline bool stop_scan_coproc(int rsc_type) {
@ -131,7 +133,6 @@ struct PROC_RESOURCES {
// (i.e add it to the runnable list; not actually run it)
//
bool can_schedule(RESULT* rp, ACTIVE_TASK* atp) {
double wss;
if (max_concurrent_exceeded(rp)) return false;
if (atp) {
// don't schedule if something's pending
@ -154,11 +155,7 @@ struct PROC_RESOURCES {
}
atp->needs_shmem = false;
}
wss = atp->procinfo.working_set_size_smoothed;
} else {
wss = rp->avp->max_working_set_size;
}
if (wss > ram_left) return false;
if (rp->schedule_backoff > gstate.now) return false;
if (rp->uses_coprocs()) {
if (gpu_suspend_reason) return false;
@ -211,13 +208,6 @@ struct PROC_RESOURCES {
} else {
ncpus_used_st += rp->avp->avg_ncpus;
}
double wss;
if (atp) {
wss = atp->procinfo.working_set_size_smoothed;
} else {
wss = rp->avp->max_working_set_size;
}
ram_left -= wss;
adjust_rec_sched(rp);
max_concurrent_inc(rp);
@ -1093,7 +1083,7 @@ void CLIENT_STATE::append_unfinished_time_slice(vector<RESULT*> &run_list) {
// That's the only kind of suspended GPU job.
// COPROC::usage[]: for each instance, its usage
//
// enforce_schedule() calls assign_coprocs(),
// enforce_run_list() calls assign_coprocs(),
// which assigns coproc instances to scheduled jobs,
// and prunes jobs for which we can't make an assignment
// (the job list is in order of decreasing priority)
@ -1536,7 +1526,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
#endif
if (log_flags.cpu_sched_debug) {
msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule(): start");
msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list(): start");
msg_printf(0, MSG_INFO, "[cpu_sched_debug] preliminary job list:");
print_job_list(run_list);
}
@ -1901,7 +1891,7 @@ bool CLIENT_STATE::enforce_run_list(vector<RESULT*>& run_list) {
set_client_state_dirty("enforce_cpu_schedule");
}
if (log_flags.cpu_sched_debug) {
msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_schedule: end");
msg_printf(0, MSG_INFO, "[cpu_sched_debug] enforce_run_list: end");
}
if (coproc_start_deferred) {
if (log_flags.cpu_sched_debug) {