- client: fix problems with job scheduling policy.

Old: job scheduling has 2 phases.
        In the first phase (schedule_cpus()) we make a list of jobs,
        with deadline-miss and high-STD jobs first.
        Keep track of the RAM used,
        and skip jobs that would exceed available RAM.
        Stop scanning when the # of CPUs used by jobs in the list
        exceeds the # of actual CPUs.

        In the 2nd phase (enforce_schedule()), we add currently running jobs
        (which may be in the middle of a time slice) to the list,
        and reorder to give priority to such jobs,
        and possibly also to multi-thread jobs.
        We then run and/or preempt jobs, keeping track of RAM used.

    Problems:
        - suppose we add an EDF 1-CPU job to the list, then a MT job.
            We'll stop at that point because #CPUs is exceeded.
            But enforce_schedule() won't run the MT job,
            and CPUs will be idle.
        - Because the list may be reordered, skipping jobs based
            on RAM is not correct, and may cause deadlines to be missed.

    New:
        - when making the job list, keep track of #CPUs used
            by MT jobs and non-MT jobs separately.
            Stop the scan only if the non-MT count exceeds #CPUs.
            This ensures that we have enough jobs to use all the CPUs,
            even if the MT jobs can't be run for whatever reason.
        - don't skip jobs because of RAM usage
        - skip MT jobs if the MT CPU count is at least #CPUs

    Notes:
        - ignoring RAM usage in phase 1 can cause idleness in some cases,
            e.g. suppose there are 4 GB of RAM and the list has
            jobs that use 3 GB, but there are also some jobs that use 1 GB.
            I'm not sure how to fix this.
        - Maybe the 2-phase approach is not a good idea.
            We did it this way for efficiency,
            so that we don't have to recompute the job list
            each time a job checkpoints.
            But this is probably not a concern,
            and I like the idea of a simpler approach,
            e.g. reducing the policy to a single comparison function.


svn path=/trunk/boinc/; revision=22615
This commit is contained in:
David Anderson 2010-11-03 17:13:51 +00:00
parent 4e10f8bb88
commit 2d6d69ac86
2 changed files with 79 additions and 39 deletions

View File

@ -7713,3 +7713,52 @@ David 01 Nov 2010
client/
cpu_sched.cpp
David 01 Nov 2010
- client: fix problems with job scheduling policy.
Old: job scheduling has 2 phases.
In the first phase (schedule_cpus()) we make a list of jobs,
with deadline-miss and high-STD jobs first.
Keep track of the RAM used,
and skip jobs that would exceed available RAM.
Stop scanning when the # of CPUs used by jobs in the list
exceeds the # of actual CPUs.
In the 2nd phase (enforce_schedule()), we add currently running jobs
(which may be in the middle of a time slice) to the list,
and reorder to give priority to such jobs,
and possibly also to multi-thread jobs.
We then run and/or preempt jobs, keeping track of RAM used.
Problems:
- suppose we add an EDF 1-CPU job to the list, then a MT job.
We'll stop at that point because #CPUs is exceeded.
But enforce_schedule() won't run the MT job,
and CPUs will be idle.
- Because the list may be reordered, skipping jobs based
on RAM is not correct, and may cause deadlines to be missed.
New:
- when making the job list, keep track of #CPUs used
by MT jobs and non-MT jobs separately.
Stop the scan only if the non-MT count exceeds #CPUs.
This ensures that we have enough jobs to use all the CPUs,
even if the MT jobs can't be run for whatever reason.
- don't skip jobs because of RAM usage
- skip MT jobs if the MT CPU count is at least #CPUs
Notes:
- ignoring RAM usage in phase 1 can cause idleness in some cases,
e.g. suppose there are 4 GB of RAM and the list has
jobs that use 3 GB, but there are also some jobs that use 1 GB.
I'm not sure how to fix this.
- Maybe the 2-phase approach is not a good idea.
We did it this way for efficiency,
so that we don't have to recompute the job list
each time a job checkpoints.
But this is probably not a concern,
and I like the idea of a simpler approach,
e.g. reducing the policy to a single comparison function.
client/
cpu_sched.cpp

View File

@ -71,14 +71,14 @@ using std::list;
//
struct PROC_RESOURCES {
int ncpus;
double ncpus_used;
double ram_left;
double ncpus_used_st; // #CPUs of GPU or single-thread jobs
double ncpus_used_mt; // #CPUs of multi-thread jobs
COPROCS coprocs;
// should we stop scanning jobs?
//
inline bool stop_scan_cpu() {
return ncpus_used >= ncpus;
return ncpus_used_st >= ncpus;
}
inline bool stop_scan_coproc(int rsc_type) {
@ -94,9 +94,7 @@ struct PROC_RESOURCES {
if (rp->schedule_backoff > gstate.now) return false;
if (rp->uses_coprocs()) {
if (gpu_suspend_reason) return false;
if (sufficient_coprocs(
*rp->avp, log_flags.cpu_sched_debug)
) {
if (sufficient_coprocs(*rp->avp, log_flags.cpu_sched_debug)) {
return true;
} else {
if (log_flags.cpu_sched_debug) {
@ -106,10 +104,10 @@ struct PROC_RESOURCES {
}
return false;
}
} else if (rp->avp->avg_ncpus > 1) {
return (ncpus_used_mt + rp->avp->avg_ncpus < ncpus);
} else {
// otherwise, only if CPUs are available
//
return (ncpus_used < ncpus);
return (ncpus_used_st < ncpus);
}
}
@ -119,7 +117,13 @@ struct PROC_RESOURCES {
reserve_coprocs(
*rp->avp, log_flags.cpu_sched_debug, "cpu_sched_debug"
);
ncpus_used += rp->avp->avg_ncpus;
if (rp->uses_coprocs()) {
ncpus_used_st += rp->avp->avg_ncpus;
} else if (rp->avp->avg_ncpus > 1) {
ncpus_used_mt += rp->avp->avg_ncpus;
} else {
ncpus_used_st += rp->avp->avg_ncpus;
}
}
bool sufficient_coprocs(APP_VERSION& av, bool log_flag) {
@ -409,7 +413,9 @@ RESULT* first_coproc_result(int rsc_type) {
return best;
}
// Return earliest-deadline result for given resource type.
// Return earliest-deadline result for given resource type;
// return only results projected to miss their deadline,
// or from projects with extreme DCF
//
static RESULT* earliest_deadline_result(int rsc_type) {
RESULT *best_result = NULL;
@ -624,7 +630,6 @@ bool CLIENT_STATE::possibly_schedule_cpus() {
}
// Check whether the job can be run:
// - it will fit in RAM
// - we have enough shared-mem segments (old Mac problem)
// If so, update proc_rsc and anticipated debts, and return true
//
@ -633,20 +638,6 @@ static bool schedule_if_possible(
const char* description
) {
if (atp) {
// see if it fits in available RAM
//
if (atp->procinfo.working_set_size_smoothed > proc_rsc.ram_left) {
if (log_flags.cpu_sched_debug) {
msg_printf(rp->project, MSG_INFO,
"[cpu_sched] %s working set too large: %.2fMB",
rp->name, atp->procinfo.working_set_size_smoothed/MEGA
);
}
atp->too_large = true;
return false;
}
atp->too_large = false;
if (gstate.retry_shmem_time > gstate.now) {
if (atp->app_client_shm.shm == NULL) {
if (log_flags.cpu_sched_debug) {
@ -660,17 +651,6 @@ static bool schedule_if_possible(
}
atp->needs_shmem = false;
}
proc_rsc.ram_left -= atp->procinfo.working_set_size_smoothed;
} else {
if (rp->avp->max_working_set_size > proc_rsc.ram_left) {
if (log_flags.cpu_sched_debug) {
msg_printf(rp->project, MSG_INFO,
"[cpu_sched] %s projected working set too large: %.2fMB",
rp->name, rp->avp->max_working_set_size/MEGA
);
}
return false;
}
}
if (log_flags.cpu_sched_debug) {
@ -761,8 +741,8 @@ void CLIENT_STATE::schedule_cpus() {
bool can_run;
proc_rsc.ncpus = ncpus;
proc_rsc.ncpus_used = 0;
proc_rsc.ram_left = available_ram();
proc_rsc.ncpus_used_st = 0;
proc_rsc.ncpus_used_mt = 0;
proc_rsc.coprocs.clone(host_info.coprocs, false);
if (log_flags.cpu_sched_debug) {
@ -819,9 +799,17 @@ void CLIENT_STATE::schedule_cpus() {
ordered_scheduled_results.clear();
// first, add GPU jobs
add_coproc_jobs(RSC_TYPE_CUDA, proc_rsc);
add_coproc_jobs(RSC_TYPE_ATI, proc_rsc);
// then add CPU jobs.
// Note: the jobs that actually get run are not necessarily
// an initial segment of this list;
// e.g. a multithread job may not get run because it has
// a high-priority single-thread job ahead of it.
// choose CPU jobs from projects with CPU deadline misses
//
#ifdef SIM
@ -874,6 +862,9 @@ static inline bool in_ordered_scheduled_results(ACTIVE_TASK* atp) {
// if find a MT job J, and X < ncpus, move J before all non-MT jobs
// But don't promote a MT job ahead of a job in EDF
//
// This is needed because there may always be a 1-CPU jobs
// in the middle of its time-slice, and MT jobs could starve.
//
static void promote_multi_thread_jobs(vector<RESULT*>& runnable_jobs) {
double cpus_used = 0;
vector<RESULT*>::iterator first_non_mt = runnable_jobs.end();