- client: fix CPU scheduling logic related to coprocessors

Old: when checking whether an app can be run,
        check for sufficient coprocessors relative to
        the current coprocessor usage.
        Bug: if there are 2 CUDA jobs,
        the scheduler will decide to run both.
        enforce_scheduler() will only be able to run one,
        and the other CPU will be idle.
    New: include coprocessor usage (along with RAM and CPUs)
        in the check, and do a simulated reservation.
        In the above scenario, the scheduler will select
        one CUDA app and one non-CUDA app.

svn path=/trunk/boinc/; revision=15904
This commit is contained in:
David Anderson 2008-08-20 17:34:18 +00:00
parent a5f80bc2e6
commit 87cf35f89b
4 changed files with 80 additions and 35 deletions

View File

@ -6868,3 +6868,23 @@ David 20 Aug 2008
lib/ lib/
filesys.C filesys.C
David 20 Aug 2008
- client: fix CPU scheduling logic related to coprocessors
Old: when checking whether an app can be run,
check for sufficient coprocessors relative to
the current coprocessor usage.
Bug: if there are 2 CUDA jobs,
the scheduler will decide to run both.
enforce_scheduler() will only be able to run one,
and the other CPU will be idle.
New: include coprocessor usage (along with RAM and CPUs)
in the check, and do a simulated reservation.
In the above scenario, the scheduler will select
one CUDA app and one non-CUDA app.
client/
cpu_sched.C
app.C
lib/
coproc.h

View File

@ -598,7 +598,7 @@ int ACTIVE_TASK::parse(MIOFILE& fin) {
void ACTIVE_TASK::reserve_coprocs() { void ACTIVE_TASK::reserve_coprocs() {
gstate.coprocs.reserve_coprocs( gstate.coprocs.reserve_coprocs(
app_version->coprocs, this, log_flags.cpu_sched_debug app_version->coprocs, this, log_flags.cpu_sched_debug, ""
); );
coprocs_reserved = true; coprocs_reserved = true;
} }

View File

@ -68,12 +68,13 @@ using std::vector;
#define DEADLINE_CUSHION 0 #define DEADLINE_CUSHION 0
// try to finish jobs this much in advance of their deadline // try to finish jobs this much in advance of their deadline
bool COPROCS::sufficient_coprocs(COPROCS& needed, bool verbose) { bool COPROCS::sufficient_coprocs(
COPROCS& needed, bool verbose, const char* prefix) {
for (unsigned int i=0; i<needed.coprocs.size(); i++) { for (unsigned int i=0; i<needed.coprocs.size(); i++) {
COPROC* cp = needed.coprocs[i]; COPROC* cp = needed.coprocs[i];
COPROC* cp2 = lookup(cp->type); COPROC* cp2 = lookup(cp->type);
if (!cp2) { if (!cp2) {
msg_printf(NULL, MSG_INFO, msg_printf(NULL, MSG_INTERNAL_ERROR,
"Missing a %s coprocessor", cp->type "Missing a %s coprocessor", cp->type
); );
return false; return false;
@ -81,8 +82,8 @@ bool COPROCS::sufficient_coprocs(COPROCS& needed, bool verbose) {
if (cp2->used + cp->count > cp2->count) { if (cp2->used + cp->count > cp2->count) {
if (verbose) { if (verbose) {
msg_printf(NULL, MSG_INFO, msg_printf(NULL, MSG_INFO,
"insufficient coproc %s (%d + %d > %d)", "%sinsufficient coproc %s (%d + %d > %d)",
cp2->type, cp2->used, cp->count, cp2->count prefix, cp2->type, cp2->used, cp->count, cp2->count
); );
} }
return false; return false;
@ -91,7 +92,9 @@ bool COPROCS::sufficient_coprocs(COPROCS& needed, bool verbose) {
return true; return true;
} }
void COPROCS::reserve_coprocs(COPROCS& needed, void* owner, bool verbose) { void COPROCS::reserve_coprocs(
COPROCS& needed, void* owner, bool verbose, const char* prefix
) {
for (unsigned int i=0; i<needed.coprocs.size(); i++) { for (unsigned int i=0; i<needed.coprocs.size(); i++) {
COPROC* cp = needed.coprocs[i]; COPROC* cp = needed.coprocs[i];
COPROC* cp2 = lookup(cp->type); COPROC* cp2 = lookup(cp->type);
@ -103,7 +106,7 @@ void COPROCS::reserve_coprocs(COPROCS& needed, void* owner, bool verbose) {
} }
if (verbose) { if (verbose) {
msg_printf(NULL, MSG_INFO, msg_printf(NULL, MSG_INFO,
"reserving %d of coproc %s", cp->count, cp2->type "%sreserving %d of coproc %s", prefix, cp->count, cp2->type
); );
} }
cp2->used += cp->count; cp2->used += cp->count;
@ -533,15 +536,20 @@ void CLIENT_STATE::print_deadline_misses() {
} }
} }
// Resources tracked while schedule_cpus() chooses jobs to run.
// schedule_if_possible() checks a candidate job against these
// and updates them when the job is accepted.
//
struct PROC_RESOURCES {
double ncpus_used;
    // sum of avg_ncpus over the jobs scheduled so far; starts at 0
double ram_left;
    // starts at available_ram(); reduced by the smoothed working-set size
    // of each scheduled job that already has an active task
COPROCS coprocs;
    // filled via clone() from gstate.coprocs;
    // simulated coprocessor reservations are made on this copy
};
static bool schedule_if_possible( static bool schedule_if_possible(
RESULT* rp, double& ncpus_used, double& ram_left, double rrs, double expected_payoff RESULT* rp, PROC_RESOURCES& proc_rsc, double rrs, double expected_payoff
) { ) {
ACTIVE_TASK* atp; ACTIVE_TASK* atp;
atp = gstate.lookup_active_task_by_result(rp); atp = gstate.lookup_active_task_by_result(rp);
if (!atp || atp->task_state() == PROCESS_UNINITIALIZED) { if (!proc_rsc.coprocs.sufficient_coprocs(
if (!gstate.coprocs.sufficient_coprocs( rp->avp->coprocs, log_flags.cpu_sched_debug, "(CPU sched sim) ")
rp->avp->coprocs, log_flags.cpu_sched_debug)
) { ) {
if (log_flags.cpu_sched_debug) { if (log_flags.cpu_sched_debug) {
msg_printf(rp->project, MSG_INFO, msg_printf(rp->project, MSG_INFO,
@ -550,11 +558,10 @@ static bool schedule_if_possible(
} }
return false; return false;
} }
}
if (atp) { if (atp) {
// see if it fits in available RAM // see if it fits in available RAM
// //
if (atp->procinfo.working_set_size_smoothed > ram_left) { if (atp->procinfo.working_set_size_smoothed > proc_rsc.ram_left) {
if (log_flags.cpu_sched_debug) { if (log_flags.cpu_sched_debug) {
msg_printf(rp->project, MSG_INFO, msg_printf(rp->project, MSG_INFO,
"[cpu_sched_debug] %s misses deadline but too large: %.2fMB", "[cpu_sched_debug] %s misses deadline but too large: %.2fMB",
@ -568,20 +575,28 @@ static bool schedule_if_possible(
if (gstate.retry_shmem_time > gstate.now) { if (gstate.retry_shmem_time > gstate.now) {
if (atp->app_client_shm.shm == NULL) { if (atp->app_client_shm.shm == NULL) {
if (log_flags.cpu_sched_debug) {
msg_printf(rp->project, MSG_INFO,
"[cpu_sched_debug] waiting for shared mem: %s",
rp->name
);
}
atp->needs_shmem = true; atp->needs_shmem = true;
return false; return false;
} }
atp->needs_shmem = false; atp->needs_shmem = false;
} }
ram_left -= atp->procinfo.working_set_size_smoothed; proc_rsc.ram_left -= atp->procinfo.working_set_size_smoothed;
} }
if (log_flags.cpu_sched_debug) { if (log_flags.cpu_sched_debug) {
msg_printf(rp->project, MSG_INFO, msg_printf(rp->project, MSG_INFO,
"[cpu_sched_debug] scheduling %s", "[cpu_sched_debug] scheduling %s", rp->name
rp->name
); );
} }
ncpus_used += rp->avp->avg_ncpus; proc_rsc.coprocs.reserve_coprocs(
rp->avp->coprocs, rp, log_flags.cpu_sched_debug, "(CPU sched sim) "
);
proc_rsc.ncpus_used += rp->avp->avg_ncpus;
rp->project->anticipated_debt -= (rp->project->resource_share / rrs) * expected_payoff; rp->project->anticipated_debt -= (rp->project->resource_share / rrs) * expected_payoff;
return true; return true;
} }
@ -595,7 +610,11 @@ void CLIENT_STATE::schedule_cpus() {
double expected_payoff; double expected_payoff;
unsigned int i; unsigned int i;
double rrs = runnable_resource_share(); double rrs = runnable_resource_share();
double ncpus_used = 0; PROC_RESOURCES proc_rsc;
proc_rsc.ncpus_used = 0;
proc_rsc.ram_left = available_ram();
proc_rsc.coprocs.clone(gstate.coprocs);
if (log_flags.cpu_sched_debug) { if (log_flags.cpu_sched_debug) {
msg_printf(0, MSG_INFO, "[cpu_sched_debug] schedule_cpus(): start"); msg_printf(0, MSG_INFO, "[cpu_sched_debug] schedule_cpus(): start");
@ -629,19 +648,18 @@ void CLIENT_STATE::schedule_cpus() {
expected_payoff = global_prefs.cpu_scheduling_period(); expected_payoff = global_prefs.cpu_scheduling_period();
ordered_scheduled_results.clear(); ordered_scheduled_results.clear();
double ram_left = available_ram();
// First choose results from projects with P.deadlines_missed>0 // First choose results from projects with P.deadlines_missed>0
// //
#ifdef SIM #ifdef SIM
if (!cpu_sched_rr_only) { if (!cpu_sched_rr_only) {
#endif #endif
while (ncpus_used < ncpus) { while (proc_rsc.ncpus_used < ncpus) {
rp = earliest_deadline_result(); rp = earliest_deadline_result();
if (!rp) break; if (!rp) break;
rp->already_selected = true; rp->already_selected = true;
if (!schedule_if_possible(rp, ncpus_used, ram_left, rrs, expected_payoff)) continue; if (!schedule_if_possible(rp, proc_rsc, rrs, expected_payoff)) continue;
rp->project->deadlines_missed--; rp->project->deadlines_missed--;
rp->edf_scheduled = true; rp->edf_scheduled = true;
@ -653,11 +671,11 @@ void CLIENT_STATE::schedule_cpus() {
// Next, choose results from projects with large debt // Next, choose results from projects with large debt
// //
while (ncpus_used < ncpus) { while (proc_rsc.ncpus_used < ncpus) {
assign_results_to_projects(); assign_results_to_projects();
rp = largest_debt_project_best_result(); rp = largest_debt_project_best_result();
if (!rp) break; if (!rp) break;
if (!schedule_if_possible(rp, ncpus_used, ram_left, rrs, expected_payoff)) continue; if (!schedule_if_possible(rp, proc_rsc, rrs, expected_payoff)) continue;
ordered_scheduled_results.push_back(rp); ordered_scheduled_results.push_back(rp);
} }
@ -998,7 +1016,7 @@ bool CLIENT_STATE::enforce_schedule() {
switch (atp->task_state()) { switch (atp->task_state()) {
case PROCESS_UNINITIALIZED: case PROCESS_UNINITIALIZED:
if (!coprocs.sufficient_coprocs( if (!coprocs.sufficient_coprocs(
atp->app_version->coprocs, log_flags.cpu_sched_debug atp->app_version->coprocs, log_flags.cpu_sched_debug, ""
)){ )){
continue; continue;
} }
@ -1098,10 +1116,14 @@ struct RR_SIM_STATUS {
COPROCS coprocs; COPROCS coprocs;
inline bool can_run(RESULT* rp) { inline bool can_run(RESULT* rp) {
return coprocs.sufficient_coprocs(rp->avp->coprocs, log_flags.rr_simulation); return coprocs.sufficient_coprocs(
rp->avp->coprocs, log_flags.rr_simulation, ""
);
} }
inline void activate(RESULT* rp) { inline void activate(RESULT* rp) {
coprocs.reserve_coprocs(rp->avp->coprocs, rp, log_flags.rr_simulation); coprocs.reserve_coprocs(
rp->avp->coprocs, rp, log_flags.rr_simulation, ""
);
active.push_back(rp); active.push_back(rp);
} }
// remove *rpbest from active set, // remove *rpbest from active set,

View File

@ -72,11 +72,14 @@ struct COPROCS {
std::vector<std::string> get(); std::vector<std::string> get();
int parse(FILE*); int parse(FILE*);
COPROC* lookup(char*); COPROC* lookup(char*);
bool sufficient_coprocs(COPROCS&, bool verbose); bool sufficient_coprocs(COPROCS&, bool verbose, const char* prefix);
void reserve_coprocs(COPROCS&, void*, bool verbose); void reserve_coprocs(COPROCS&, void*, bool verbose, const char* prefix);
void free_coprocs(COPROCS&, void*, bool verbose); void free_coprocs(COPROCS&, void*, bool verbose);
// used in round-robin simulator, to avoid messing w/ master copy // Copy a coproc set, setting usage to zero.
// used in round-robin simulator and CPU scheduler,
// to avoid messing w/ master copy
//
void clone(COPROCS& c) { void clone(COPROCS& c) {
for (unsigned int i=0; i<c.coprocs.size(); i++) { for (unsigned int i=0; i<c.coprocs.size(); i++) {
COPROC* cp = c.coprocs[i]; COPROC* cp = c.coprocs[i];