diff --git a/checkin_notes b/checkin_notes index 55bbeca1e7..52c9135576 100644 --- a/checkin_notes +++ b/checkin_notes @@ -7713,3 +7713,52 @@ David 01 Nov 2010 client/ cpu_sched.cpp + +David 01 Nov 2010 + - client: fix problems with job scheduling policy. + Old: job scheduling has 2 phases. + In the first phase (schedule_cpus()) we make a list of jobs, + with deadline-miss and high-STD jobs first. + Keep track of the RAM used, + and skip jobs that would exceed available RAM. + Stop scanning when the # of CPUs used by jobs in the list + exceeds the # of actual CPUs. + + In the 2nd phase (enforce_schedule()), we add currently running jobs + (which may be in the middle of a time slice) to the list, + and reorder to give priority to such jobs, + and possibly also to multi-thread jobs. + We then run and/or preempt jobs, keeping track of RAM used. + + Problems: + - suppose we add an EDF 1-CPU job to the list, then a MT job. + We'll stop at that point because #CPUs is exceeded. + But enforce_schedule() won't run the MT job, + and CPUs will be idle. + - Because the list may be reordered, skipping jobs based + on RAM is not correct, and may cause deadlines to be missed. + + New: + - when making the job list, keep track of #CPUs used + by MT jobs and non-MT jobs separately. + Stop the scan only if the non-MT count exceeds #CPUs. + This ensures that we have enough jobs to use all the CPUs, + even if the MT jobs can't be run for whatever reason. + - don't skip jobs because of RAM usage + - skip MT jobs if the MT CPU count is at least #CPUs + + Notes: + - ignoring RAM usage in phase 1 can cause idleness in some cases, + e.g. suppose there are 4 GB of RAM and the list has + jobs that use 3 GB, but there are also some jobs that use 1 GB. + I'm not sure how to fix this. + - Maybe the 2-phase approach is not a good idea. + We did it this way for efficiency, + so that we don't have to recompute the job list + each time a job checkpoints. 
+ But this is probably not a concern, + and I like the idea of a simpler approach, + e.g. reducing the policy to a single comparison function. + + client/ + cpu_sched.cpp diff --git a/client/cpu_sched.cpp b/client/cpu_sched.cpp index 9cff03fb78..76a1b372f3 100644 --- a/client/cpu_sched.cpp +++ b/client/cpu_sched.cpp @@ -71,14 +71,14 @@ using std::list; // struct PROC_RESOURCES { int ncpus; - double ncpus_used; - double ram_left; + double ncpus_used_st; // #CPUs of GPU or single-thread jobs + double ncpus_used_mt; // #CPUs of multi-thread jobs COPROCS coprocs; // should we stop scanning jobs? // inline bool stop_scan_cpu() { - return ncpus_used >= ncpus; + return ncpus_used_st >= ncpus; } inline bool stop_scan_coproc(int rsc_type) { @@ -94,9 +94,7 @@ struct PROC_RESOURCES { if (rp->schedule_backoff > gstate.now) return false; if (rp->uses_coprocs()) { if (gpu_suspend_reason) return false; - if (sufficient_coprocs( - *rp->avp, log_flags.cpu_sched_debug) - ) { + if (sufficient_coprocs(*rp->avp, log_flags.cpu_sched_debug)) { return true; } else { if (log_flags.cpu_sched_debug) { @@ -106,10 +104,10 @@ struct PROC_RESOURCES { } return false; } + } else if (rp->avp->avg_ncpus > 1) { + return (ncpus_used_mt + rp->avp->avg_ncpus < ncpus); } else { - // otherwise, only if CPUs are available - // - return (ncpus_used < ncpus); + return (ncpus_used_st < ncpus); } } @@ -119,7 +117,13 @@ struct PROC_RESOURCES { reserve_coprocs( *rp->avp, log_flags.cpu_sched_debug, "cpu_sched_debug" ); - ncpus_used += rp->avp->avg_ncpus; + if (rp->uses_coprocs()) { + ncpus_used_st += rp->avp->avg_ncpus; + } else if (rp->avp->avg_ncpus > 1) { + ncpus_used_mt += rp->avp->avg_ncpus; + } else { + ncpus_used_st += rp->avp->avg_ncpus; + } } bool sufficient_coprocs(APP_VERSION& av, bool log_flag) { @@ -409,7 +413,9 @@ RESULT* first_coproc_result(int rsc_type) { return best; } -// Return earliest-deadline result for given resource type. 
+// Return earliest-deadline result for given resource type; +// return only results projected to miss their deadline, +// or from projects with extreme DCF // static RESULT* earliest_deadline_result(int rsc_type) { RESULT *best_result = NULL; @@ -624,7 +630,6 @@ bool CLIENT_STATE::possibly_schedule_cpus() { } // Check whether the job can be run: -// - it will fit in RAM // - we have enough shared-mem segments (old Mac problem) // If so, update proc_rsc and anticipated debts, and return true // @@ -633,20 +638,6 @@ static bool schedule_if_possible( const char* description ) { if (atp) { - // see if it fits in available RAM - // - if (atp->procinfo.working_set_size_smoothed > proc_rsc.ram_left) { - if (log_flags.cpu_sched_debug) { - msg_printf(rp->project, MSG_INFO, - "[cpu_sched] %s working set too large: %.2fMB", - rp->name, atp->procinfo.working_set_size_smoothed/MEGA - ); - } - atp->too_large = true; - return false; - } - atp->too_large = false; - if (gstate.retry_shmem_time > gstate.now) { if (atp->app_client_shm.shm == NULL) { if (log_flags.cpu_sched_debug) { @@ -660,17 +651,6 @@ static bool schedule_if_possible( } atp->needs_shmem = false; } - proc_rsc.ram_left -= atp->procinfo.working_set_size_smoothed; - } else { - if (rp->avp->max_working_set_size > proc_rsc.ram_left) { - if (log_flags.cpu_sched_debug) { - msg_printf(rp->project, MSG_INFO, - "[cpu_sched] %s projected working set too large: %.2fMB", - rp->name, rp->avp->max_working_set_size/MEGA - ); - } - return false; - } } if (log_flags.cpu_sched_debug) { @@ -761,8 +741,8 @@ void CLIENT_STATE::schedule_cpus() { bool can_run; proc_rsc.ncpus = ncpus; - proc_rsc.ncpus_used = 0; - proc_rsc.ram_left = available_ram(); + proc_rsc.ncpus_used_st = 0; + proc_rsc.ncpus_used_mt = 0; proc_rsc.coprocs.clone(host_info.coprocs, false); if (log_flags.cpu_sched_debug) { @@ -819,9 +799,17 @@ void CLIENT_STATE::schedule_cpus() { ordered_scheduled_results.clear(); + // first, add GPU jobs + add_coproc_jobs(RSC_TYPE_CUDA, 
proc_rsc); add_coproc_jobs(RSC_TYPE_ATI, proc_rsc); + // then add CPU jobs. + // Note: the jobs that actually get run are not necessarily + // an initial segment of this list; + // e.g. a multithread job may not get run because it has + // a high-priority single-thread job ahead of it. + // choose CPU jobs from projects with CPU deadline misses // #ifdef SIM @@ -874,6 +862,9 @@ static inline bool in_ordered_scheduled_results(ACTIVE_TASK* atp) { // if find a MT job J, and X < ncpus, move J before all non-MT jobs // But don't promote a MT job ahead of a job in EDF // +// This is needed because there may always be a 1-CPU job +// in the middle of its time-slice, and MT jobs could starve. +// static void promote_multi_thread_jobs(vector& runnable_jobs) { double cpus_used = 0; vector::iterator first_non_mt = runnable_jobs.end();