mirror of https://github.com/BOINC/boinc.git
- client: fix CPU scheduling logic related to coprocessors
Old: when checking whether an app can be run, check for sufficient coprocessors relative to the current coprocessor usage. Bug: if there are 2 CUDA jobs, the scheduler will decide to run both. enforce_schedule() will only be able to run one, and the other CPU will be idle. New: include coprocessor usage (along with RAM and CPUs) in the check, and do a simulated reservation. In the above scenario, the scheduler will select one CUDA app and one non-CUDA app.
svn path=/trunk/boinc/; revision=15904
Commit: 87cf35f89b
Parent: a5f80bc2e6
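The essence of the change: instead of only checking live coprocessor usage, the scheduler now works against a private, simulated copy of the coprocessor counts and reserves from that copy as it selects each job, so a second CUDA job stops looking runnable once the first one has claimed the only GPU. The following is a minimal, self-contained sketch of that idea using simplified, hypothetical types (SimResources and Job are invented for illustration; the actual client uses PROC_RESOURCES together with COPROCS::clone(), sufficient_coprocs(), and reserve_coprocs(), as the diff below shows):

// Hypothetical simplified sketch of the "simulated reservation" approach.
// SimResources and Job are stand-ins for illustration only; the real client
// uses PROC_RESOURCES and the COPROCS methods shown in the diff below.
#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Job {
    std::string name;
    std::map<std::string, int> coprocs_needed;   // e.g. {"CUDA": 1}
};

struct SimResources {
    std::map<std::string, int> total;   // installed coprocessors, by type
    std::map<std::string, int> used;    // reserved so far in this scheduling pass

    bool sufficient(const Job& j) const {
        for (const auto& need : j.coprocs_needed) {
            auto t = total.find(need.first);
            if (t == total.end()) return false;        // missing coprocessor type
            auto u = used.find(need.first);
            int in_use = (u == used.end()) ? 0 : u->second;
            if (in_use + need.second > t->second) return false;   // would over-commit
        }
        return true;
    }
    void reserve(const Job& j) {
        for (const auto& need : j.coprocs_needed) used[need.first] += need.second;
    }
};

int main() {
    // One CUDA device; two CUDA jobs and one CPU-only job are runnable.
    SimResources rsc;
    rsc.total["CUDA"] = 1;
    std::vector<Job> runnable = {
        {"cuda_job_1", {{"CUDA", 1}}},
        {"cuda_job_2", {{"CUDA", 1}}},
        {"cpu_job", {}},
    };
    for (const Job& j : runnable) {
        if (!rsc.sufficient(j)) continue;   // the old code skipped this simulated check
        rsc.reserve(j);                     // simulated reservation against the copy
        std::cout << "schedule " << j.name << "\n";
    }
    // Prints cuda_job_1 and cpu_job: only one CUDA job is selected,
    // matching the "one CUDA app and one non-CUDA app" outcome described above.
    return 0;
}

In the real code, CLIENT_STATE::schedule_cpus() builds the simulated state with proc_rsc.coprocs.clone(gstate.coprocs), and schedule_if_possible() performs the sufficiency check and the simulated reservation; see the client/cpu_sched.C hunks below.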
checkin_notes
@@ -6868,3 +6868,23 @@ David 20 Aug 2008
 	lib/
 		filesys.C
 
+David 20 Aug 2008
+	- client: fix CPU scheduling logic related to coprocessors
+		Old: when checking whether an app can be run,
+		check for sufficient coprocessors relative to
+		the current coprocessor usage.
+		Bug: if there are 2 CUDA jobs,
+		the scheduler will decide to run both.
+		enforce_schedule() will only be able to run one,
+		and the other CPU will be idle.
+		New: include coprocessor usage (along with RAM and CPUs)
+		in the check, and do a simulated reservation.
+		In the above scenario, the scheduler will select
+		one CUDA app and one non-CUDA app.
+
+	client/
+		cpu_sched.C
+		app.C
+	lib/
+		coproc.h
+
client/app.C
@@ -598,7 +598,7 @@ int ACTIVE_TASK::parse(MIOFILE& fin) {
 
 void ACTIVE_TASK::reserve_coprocs() {
     gstate.coprocs.reserve_coprocs(
-        app_version->coprocs, this, log_flags.cpu_sched_debug
+        app_version->coprocs, this, log_flags.cpu_sched_debug, ""
     );
     coprocs_reserved = true;
 }
client/cpu_sched.C
@@ -68,12 +68,13 @@ using std::vector;
 #define DEADLINE_CUSHION 0
     // try to finish jobs this much in advance of their deadline
 
-bool COPROCS::sufficient_coprocs(COPROCS& needed, bool verbose) {
+bool COPROCS::sufficient_coprocs(
+    COPROCS& needed, bool verbose, const char* prefix) {
     for (unsigned int i=0; i<needed.coprocs.size(); i++) {
         COPROC* cp = needed.coprocs[i];
         COPROC* cp2 = lookup(cp->type);
         if (!cp2) {
-            msg_printf(NULL, MSG_INFO,
+            msg_printf(NULL, MSG_INTERNAL_ERROR,
                 "Missing a %s coprocessor", cp->type
             );
             return false;
@@ -81,8 +82,8 @@ bool COPROCS::sufficient_coprocs(COPROCS& needed, bool verbose) {
         if (cp2->used + cp->count > cp2->count) {
             if (verbose) {
                 msg_printf(NULL, MSG_INFO,
-                    "insufficient coproc %s (%d + %d > %d)",
-                    cp2->type, cp2->used, cp->count, cp2->count
+                    "%sinsufficient coproc %s (%d + %d > %d)",
+                    prefix, cp2->type, cp2->used, cp->count, cp2->count
                 );
             }
             return false;
@@ -91,7 +92,9 @@ bool COPROCS::sufficient_coprocs(COPROCS& needed, bool verbose) {
     return true;
 }
 
-void COPROCS::reserve_coprocs(COPROCS& needed, void* owner, bool verbose) {
+void COPROCS::reserve_coprocs(
+    COPROCS& needed, void* owner, bool verbose, const char* prefix
+) {
     for (unsigned int i=0; i<needed.coprocs.size(); i++) {
         COPROC* cp = needed.coprocs[i];
         COPROC* cp2 = lookup(cp->type);
@@ -103,7 +106,7 @@ void COPROCS::reserve_coprocs(COPROCS& needed, void* owner, bool verbose) {
         }
         if (verbose) {
             msg_printf(NULL, MSG_INFO,
-                "reserving %d of coproc %s", cp->count, cp2->type
+                "%sreserving %d of coproc %s", prefix, cp->count, cp2->type
             );
         }
         cp2->used += cp->count;
@@ -533,15 +536,20 @@ void CLIENT_STATE::print_deadline_misses() {
     }
 }
 
+struct PROC_RESOURCES {
+    double ncpus_used;
+    double ram_left;
+    COPROCS coprocs;
+};
+
 static bool schedule_if_possible(
-    RESULT* rp, double& ncpus_used, double& ram_left, double rrs, double expected_payoff
+    RESULT* rp, PROC_RESOURCES& proc_rsc, double rrs, double expected_payoff
 ) {
     ACTIVE_TASK* atp;
 
     atp = gstate.lookup_active_task_by_result(rp);
-    if (!atp || atp->task_state() == PROCESS_UNINITIALIZED) {
-        if (!gstate.coprocs.sufficient_coprocs(
-            rp->avp->coprocs, log_flags.cpu_sched_debug)
+    if (!proc_rsc.coprocs.sufficient_coprocs(
+        rp->avp->coprocs, log_flags.cpu_sched_debug, "(CPU sched sim) ")
     ) {
         if (log_flags.cpu_sched_debug) {
             msg_printf(rp->project, MSG_INFO,
@@ -550,11 +558,10 @@ static bool schedule_if_possible(
         }
         return false;
     }
-    }
     if (atp) {
         // see if it fits in available RAM
         //
-        if (atp->procinfo.working_set_size_smoothed > ram_left) {
+        if (atp->procinfo.working_set_size_smoothed > proc_rsc.ram_left) {
            if (log_flags.cpu_sched_debug) {
                msg_printf(rp->project, MSG_INFO,
                    "[cpu_sched_debug] %s misses deadline but too large: %.2fMB",
@@ -568,20 +575,28 @@ static bool schedule_if_possible(
 
         if (gstate.retry_shmem_time > gstate.now) {
             if (atp->app_client_shm.shm == NULL) {
+                if (log_flags.cpu_sched_debug) {
+                    msg_printf(rp->project, MSG_INFO,
+                        "[cpu_sched_debug] waiting for shared mem: %s",
+                        rp->name
+                    );
+                }
                 atp->needs_shmem = true;
                 return false;
             }
             atp->needs_shmem = false;
         }
-        ram_left -= atp->procinfo.working_set_size_smoothed;
+        proc_rsc.ram_left -= atp->procinfo.working_set_size_smoothed;
     }
     if (log_flags.cpu_sched_debug) {
         msg_printf(rp->project, MSG_INFO,
-            "[cpu_sched_debug] scheduling %s",
-            rp->name
+            "[cpu_sched_debug] scheduling %s", rp->name
         );
     }
-    ncpus_used += rp->avp->avg_ncpus;
+    proc_rsc.coprocs.reserve_coprocs(
+        rp->avp->coprocs, rp, log_flags.cpu_sched_debug, "(CPU sched sim) "
+    );
+    proc_rsc.ncpus_used += rp->avp->avg_ncpus;
     rp->project->anticipated_debt -= (rp->project->resource_share / rrs) * expected_payoff;
     return true;
 }
@@ -595,7 +610,11 @@ void CLIENT_STATE::schedule_cpus() {
     double expected_payoff;
     unsigned int i;
     double rrs = runnable_resource_share();
-    double ncpus_used = 0;
+    PROC_RESOURCES proc_rsc;
+
+    proc_rsc.ncpus_used = 0;
+    proc_rsc.ram_left = available_ram();
+    proc_rsc.coprocs.clone(gstate.coprocs);
 
     if (log_flags.cpu_sched_debug) {
         msg_printf(0, MSG_INFO, "[cpu_sched_debug] schedule_cpus(): start");
@@ -629,19 +648,18 @@ void CLIENT_STATE::schedule_cpus() {
 
     expected_payoff = global_prefs.cpu_scheduling_period();
     ordered_scheduled_results.clear();
-    double ram_left = available_ram();
 
     // First choose results from projects with P.deadlines_missed>0
     //
 #ifdef SIM
     if (!cpu_sched_rr_only) {
 #endif
-    while (ncpus_used < ncpus) {
+    while (proc_rsc.ncpus_used < ncpus) {
         rp = earliest_deadline_result();
         if (!rp) break;
         rp->already_selected = true;
 
-        if (!schedule_if_possible(rp, ncpus_used, ram_left, rrs, expected_payoff)) continue;
+        if (!schedule_if_possible(rp, proc_rsc, rrs, expected_payoff)) continue;
 
         rp->project->deadlines_missed--;
         rp->edf_scheduled = true;
@@ -653,11 +671,11 @@ void CLIENT_STATE::schedule_cpus() {
 
     // Next, choose results from projects with large debt
     //
-    while (ncpus_used < ncpus) {
+    while (proc_rsc.ncpus_used < ncpus) {
         assign_results_to_projects();
         rp = largest_debt_project_best_result();
         if (!rp) break;
-        if (!schedule_if_possible(rp, ncpus_used, ram_left, rrs, expected_payoff)) continue;
+        if (!schedule_if_possible(rp, proc_rsc, rrs, expected_payoff)) continue;
         ordered_scheduled_results.push_back(rp);
     }
 
@@ -998,7 +1016,7 @@ bool CLIENT_STATE::enforce_schedule() {
         switch (atp->task_state()) {
         case PROCESS_UNINITIALIZED:
             if (!coprocs.sufficient_coprocs(
-                atp->app_version->coprocs, log_flags.cpu_sched_debug
+                atp->app_version->coprocs, log_flags.cpu_sched_debug, ""
             )){
                 continue;
             }
@@ -1098,10 +1116,14 @@ struct RR_SIM_STATUS {
     COPROCS coprocs;
 
     inline bool can_run(RESULT* rp) {
-        return coprocs.sufficient_coprocs(rp->avp->coprocs, log_flags.rr_simulation);
+        return coprocs.sufficient_coprocs(
+            rp->avp->coprocs, log_flags.rr_simulation, ""
+        );
     }
     inline void activate(RESULT* rp) {
-        coprocs.reserve_coprocs(rp->avp->coprocs, rp, log_flags.rr_simulation);
+        coprocs.reserve_coprocs(
+            rp->avp->coprocs, rp, log_flags.rr_simulation, ""
+        );
         active.push_back(rp);
     }
     // remove *rpbest from active set,
lib/coproc.h
@@ -72,11 +72,14 @@ struct COPROCS {
     std::vector<std::string> get();
     int parse(FILE*);
    COPROC* lookup(char*);
-    bool sufficient_coprocs(COPROCS&, bool verbose);
-    void reserve_coprocs(COPROCS&, void*, bool verbose);
+    bool sufficient_coprocs(COPROCS&, bool verbose, const char* prefix);
+    void reserve_coprocs(COPROCS&, void*, bool verbose, const char* prefix);
     void free_coprocs(COPROCS&, void*, bool verbose);
 
-    // used in round-robin simulator, to avoid messing w/ master copy
+    // Copy a coproc set, setting usage to zero.
+    // used in round-robin simulator and CPU scheduler,
+    // to avoid messing w/ master copy
+    //
     void clone(COPROCS& c) {
         for (unsigned int i=0; i<c.coprocs.size(); i++) {
             COPROC* cp = c.coprocs[i];