mirror of https://github.com/BOINC/boinc.git
client: improve work fetch in presence of max concurrent
Re-enable work buffering in the presence of max concurrent constraints. See https://boinc.berkeley.edu/trac/wiki/WorkFetchMaxConcurrent
This commit is contained in:
parent
fbed9f3290
commit
81a880c74d
|
@ -99,6 +99,8 @@ int APP_CONFIGS::config_app_versions(PROJECT* p, bool show_warnings) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// clear app- and project-level counters to enforce max concurrent limits
|
||||||
|
//
|
||||||
void max_concurrent_init() {
|
void max_concurrent_init() {
|
||||||
for (unsigned int i=0; i<gstate.apps.size(); i++) {
|
for (unsigned int i=0; i<gstate.apps.size(); i++) {
|
||||||
gstate.apps[i]->app_n_concurrent = 0;
|
gstate.apps[i]->app_n_concurrent = 0;
|
||||||
|
|
|
@ -99,6 +99,7 @@ struct RR_SIM {
|
||||||
active.push_back(rp);
|
active.push_back(rp);
|
||||||
rsc_work_fetch[0].sim_nused += rp->avp->avg_ncpus;
|
rsc_work_fetch[0].sim_nused += rp->avp->avg_ncpus;
|
||||||
p->rsc_pwf[0].sim_nused += rp->avp->avg_ncpus;
|
p->rsc_pwf[0].sim_nused += rp->avp->avg_ncpus;
|
||||||
|
|
||||||
int rt = rp->avp->gpu_usage.rsc_type;
|
int rt = rp->avp->gpu_usage.rsc_type;
|
||||||
if (rt) {
|
if (rt) {
|
||||||
rsc_work_fetch[rt].sim_nused += rp->avp->gpu_usage.usage;
|
rsc_work_fetch[rt].sim_nused += rp->avp->gpu_usage.usage;
|
||||||
|
@ -118,7 +119,15 @@ struct RR_SIM {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
max_concurrent_inc(rp);
|
if (have_max_concurrent) {
|
||||||
|
max_concurrent_inc(rp);
|
||||||
|
if (p->rsc_pwf[0].sim_nused > p->rsc_pwf[0].max_nused) {
|
||||||
|
p->rsc_pwf[0].max_nused = p->rsc_pwf[0].sim_nused;
|
||||||
|
}
|
||||||
|
if (rt && p->rsc_pwf[rt].sim_nused > p->rsc_pwf[0].max_nused) {
|
||||||
|
p->rsc_pwf[rt].max_nused = p->rsc_pwf[0].sim_nused;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void init_pending_lists();
|
void init_pending_lists();
|
||||||
|
@ -236,6 +245,7 @@ void RR_SIM::pick_jobs_to_run(double reltime) {
|
||||||
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
||||||
PROJECT* p = gstate.projects[i];
|
PROJECT* p = gstate.projects[i];
|
||||||
p->pwf.rec_temp_save = p->pwf.rec_temp;
|
p->pwf.rec_temp_save = p->pwf.rec_temp;
|
||||||
|
p->pwf.at_max_concurrent_limit = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
rsc_work_fetch[0].sim_nused = 0;
|
rsc_work_fetch[0].sim_nused = 0;
|
||||||
|
@ -254,11 +264,11 @@ void RR_SIM::pick_jobs_to_run(double reltime) {
|
||||||
//
|
//
|
||||||
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
||||||
PROJECT* p = gstate.projects[i];
|
PROJECT* p = gstate.projects[i];
|
||||||
if (p->pwf.at_max_concurrent_limit) continue;
|
|
||||||
RSC_PROJECT_WORK_FETCH& rsc_pwf = p->rsc_pwf[rt];
|
RSC_PROJECT_WORK_FETCH& rsc_pwf = p->rsc_pwf[rt];
|
||||||
if (rsc_pwf.pending.size() ==0) continue;
|
if (rsc_pwf.pending.size() ==0) continue;
|
||||||
rsc_pwf.pending_iter = rsc_pwf.pending.begin();
|
rsc_pwf.pending_iter = rsc_pwf.pending.begin();
|
||||||
rsc_pwf.sim_nused = 0;
|
rsc_pwf.sim_nused = 0;
|
||||||
|
rsc_pwf.max_nused = 0;
|
||||||
p->pwf.rec_temp = p->pwf.rec;
|
p->pwf.rec_temp = p->pwf.rec;
|
||||||
p->compute_sched_priority();
|
p->compute_sched_priority();
|
||||||
project_heap.push_back(p);
|
project_heap.push_back(p);
|
||||||
|
@ -418,6 +428,30 @@ static void handle_missed_deadline(RESULT* rpbest, double diff, double ar) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// update "MC shortfall" for projects with max concurrent restrictions
|
||||||
|
//
|
||||||
|
static void mc_update_stats(double sim_now, double dt, double buf_end) {
|
||||||
|
for (unsigned int i=0; i<gstate.projects.size(); i++) {
|
||||||
|
PROJECT* p = gstate.projects[i];
|
||||||
|
if (!p->app_configs.project_has_mc) continue;
|
||||||
|
for (int rt=0; rt<coprocs.n_rsc; rt++) {
|
||||||
|
RSC_PROJECT_WORK_FETCH& rsc_pwf = p->rsc_pwf[rt];
|
||||||
|
RSC_WORK_FETCH& rwf = rsc_work_fetch[rt];
|
||||||
|
double x = rsc_pwf.max_nused - rsc_pwf.sim_nused;
|
||||||
|
x = std::min(x, rwf.ninstances - rwf.sim_nused);
|
||||||
|
if (x > 1e-6 && sim_now < buf_end) {
|
||||||
|
double dt2;
|
||||||
|
if (sim_now + dt > buf_end) {
|
||||||
|
dt2 = buf_end - sim_now;
|
||||||
|
} else {
|
||||||
|
dt2 = dt;
|
||||||
|
}
|
||||||
|
rsc_pwf.mc_shortfall += x*dt2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// do a round_robin simulation,
|
// do a round_robin simulation,
|
||||||
// for either CPU scheduling (to find deadline misses)
|
// for either CPU scheduling (to find deadline misses)
|
||||||
// or work fetch (do compute idleness and shortfall)
|
// or work fetch (do compute idleness and shortfall)
|
||||||
|
@ -557,9 +591,14 @@ void RR_SIM::simulate() {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// update shortfall and saturated time for each resource
|
||||||
|
//
|
||||||
for (int i=0; i<coprocs.n_rsc; i++) {
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
||||||
rsc_work_fetch[i].update_stats(sim_now, delta_t, buf_end);
|
rsc_work_fetch[i].update_stats(sim_now, delta_t, buf_end);
|
||||||
}
|
}
|
||||||
|
if (have_max_concurrent) {
|
||||||
|
mc_update_stats(sim_now, delta_t, buf_end);
|
||||||
|
}
|
||||||
|
|
||||||
// update project REC
|
// update project REC
|
||||||
//
|
//
|
||||||
|
@ -612,6 +651,9 @@ void RR_SIM::simulate() {
|
||||||
for (int i=0; i<coprocs.n_rsc; i++) {
|
for (int i=0; i<coprocs.n_rsc; i++) {
|
||||||
rsc_work_fetch[i].update_stats(sim_now, d_time, buf_end);
|
rsc_work_fetch[i].update_stats(sim_now, d_time, buf_end);
|
||||||
}
|
}
|
||||||
|
if (have_max_concurrent) {
|
||||||
|
mc_update_stats(sim_now, d_time, buf_end);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -76,6 +76,7 @@ void RSC_PROJECT_WORK_FETCH::rr_init() {
|
||||||
sim_nused = 0;
|
sim_nused = 0;
|
||||||
nused_total = 0;
|
nused_total = 0;
|
||||||
deadlines_missed = 0;
|
deadlines_missed = 0;
|
||||||
|
mc_shortfall = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void RSC_PROJECT_WORK_FETCH::resource_backoff(PROJECT* p, const char* name) {
|
void RSC_PROJECT_WORK_FETCH::resource_backoff(PROJECT* p, const char* name) {
|
||||||
|
@ -177,6 +178,9 @@ void RSC_WORK_FETCH::rr_init() {
|
||||||
sim_used_instances = 0;
|
sim_used_instances = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// update shortfall and saturated time for a given resource;
|
||||||
|
// called at each time step in RR sim
|
||||||
|
//
|
||||||
void RSC_WORK_FETCH::update_stats(double sim_now, double dt, double buf_end) {
|
void RSC_WORK_FETCH::update_stats(double sim_now, double dt, double buf_end) {
|
||||||
double idle = ninstances - sim_nused;
|
double idle = ninstances - sim_nused;
|
||||||
if (idle > 1e-6 && sim_now < buf_end) {
|
if (idle > 1e-6 && sim_now < buf_end) {
|
||||||
|
@ -222,6 +226,15 @@ void RSC_WORK_FETCH::set_request(PROJECT* p) {
|
||||||
}
|
}
|
||||||
RSC_PROJECT_WORK_FETCH& w = project_state(p);
|
RSC_PROJECT_WORK_FETCH& w = project_state(p);
|
||||||
double non_excl_inst = ninstances - w.ncoprocs_excluded;
|
double non_excl_inst = ninstances - w.ncoprocs_excluded;
|
||||||
|
|
||||||
|
// if this project has max concurrent,
|
||||||
|
// use the project-specific "MC shortfall" instead of global shortfall
|
||||||
|
//
|
||||||
|
if (p->app_configs.project_has_mc) {
|
||||||
|
RSC_PROJECT_WORK_FETCH& rsc_pwf = p->rsc_pwf[rsc_type];
|
||||||
|
shortfall = rsc_pwf.mc_shortfall;
|
||||||
|
}
|
||||||
|
|
||||||
if (shortfall) {
|
if (shortfall) {
|
||||||
if (wacky_dcf(p)) {
|
if (wacky_dcf(p)) {
|
||||||
// if project's DCF is too big or small,
|
// if project's DCF is too big or small,
|
||||||
|
|
|
@ -93,6 +93,7 @@ struct RSC_PROJECT_WORK_FETCH {
|
||||||
int n_runnable_jobs;
|
int n_runnable_jobs;
|
||||||
double sim_nused;
|
double sim_nused;
|
||||||
// # of instances used at this point in the simulation
|
// # of instances used at this point in the simulation
|
||||||
|
// Used for GPU exclusion logic
|
||||||
double nused_total; // sum of instances over all runnable jobs
|
double nused_total; // sum of instances over all runnable jobs
|
||||||
int ncoprocs_excluded;
|
int ncoprocs_excluded;
|
||||||
// number of excluded instances
|
// number of excluded instances
|
||||||
|
@ -114,6 +115,13 @@ struct RSC_PROJECT_WORK_FETCH {
|
||||||
// If zero, it's OK to ask this project for this type of work.
|
// If zero, it's OK to ask this project for this type of work.
|
||||||
// If nonzero, the reason why it's not OK
|
// If nonzero, the reason why it's not OK
|
||||||
|
|
||||||
|
// stuff for max concurrent logic
|
||||||
|
//
|
||||||
|
double max_nused;
|
||||||
|
// max # instances used so far in simulation.
|
||||||
|
double mc_shortfall;
|
||||||
|
// project's shortfall for this resources, given MC limits
|
||||||
|
|
||||||
RSC_PROJECT_WORK_FETCH() {
|
RSC_PROJECT_WORK_FETCH() {
|
||||||
backoff_time = 0;
|
backoff_time = 0;
|
||||||
backoff_interval = 0;
|
backoff_interval = 0;
|
||||||
|
@ -218,6 +226,7 @@ struct RSC_WORK_FETCH {
|
||||||
double nidle_now;
|
double nidle_now;
|
||||||
// # idle instances now (at the beginning of RR sim)
|
// # idle instances now (at the beginning of RR sim)
|
||||||
double sim_nused;
|
double sim_nused;
|
||||||
|
// # instance used at this point in RR sim
|
||||||
COPROC_INSTANCE_BITMAP sim_used_instances;
|
COPROC_INSTANCE_BITMAP sim_used_instances;
|
||||||
// bitmap of instances used in simulation,
|
// bitmap of instances used in simulation,
|
||||||
// taking into account GPU exclusions
|
// taking into account GPU exclusions
|
||||||
|
|
|
@ -729,7 +729,9 @@ int APP_CONFIG::parse(XML_PARSER& xp, MSG_VEC& mv, LOG_FLAGS& log_flags) {
|
||||||
if (xp.match_tag("/app")) return 0;
|
if (xp.match_tag("/app")) return 0;
|
||||||
if (xp.parse_str("name", name, 256)) continue;
|
if (xp.parse_str("name", name, 256)) continue;
|
||||||
if (xp.parse_int("max_concurrent", max_concurrent)) {
|
if (xp.parse_int("max_concurrent", max_concurrent)) {
|
||||||
if (max_concurrent) have_max_concurrent = true;
|
if (max_concurrent) {
|
||||||
|
have_max_concurrent = true;
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (xp.match_tag("gpu_versions")) {
|
if (xp.match_tag("gpu_versions")) {
|
||||||
|
@ -800,6 +802,9 @@ int APP_CONFIGS::parse(XML_PARSER& xp, MSG_VEC& mv, LOG_FLAGS& log_flags) {
|
||||||
int retval = ac.parse(xp, mv, log_flags);
|
int retval = ac.parse(xp, mv, log_flags);
|
||||||
if (retval) return retval;
|
if (retval) return retval;
|
||||||
app_configs.push_back(ac);
|
app_configs.push_back(ac);
|
||||||
|
if (ac.max_concurrent) {
|
||||||
|
project_has_mc = true;
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (xp.match_tag("app_version")) {
|
if (xp.match_tag("app_version")) {
|
||||||
|
@ -812,6 +817,7 @@ int APP_CONFIGS::parse(XML_PARSER& xp, MSG_VEC& mv, LOG_FLAGS& log_flags) {
|
||||||
if (xp.parse_int("project_max_concurrent", n)) {
|
if (xp.parse_int("project_max_concurrent", n)) {
|
||||||
if (n >= 0) {
|
if (n >= 0) {
|
||||||
have_max_concurrent = true;
|
have_max_concurrent = true;
|
||||||
|
project_has_mc = true;
|
||||||
project_max_concurrent = n;
|
project_max_concurrent = n;
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
|
|
|
@ -240,6 +240,8 @@ struct APP_CONFIGS {
|
||||||
std::vector<APP_CONFIG> app_configs;
|
std::vector<APP_CONFIG> app_configs;
|
||||||
std::vector<APP_VERSION_CONFIG> app_version_configs;
|
std::vector<APP_VERSION_CONFIG> app_version_configs;
|
||||||
int project_max_concurrent;
|
int project_max_concurrent;
|
||||||
|
bool project_has_mc;
|
||||||
|
// have app- or project-level max concurrent restriction
|
||||||
bool report_results_immediately;
|
bool report_results_immediately;
|
||||||
|
|
||||||
int parse(XML_PARSER&, MSG_VEC&, LOG_FLAGS&);
|
int parse(XML_PARSER&, MSG_VEC&, LOG_FLAGS&);
|
||||||
|
@ -250,6 +252,7 @@ struct APP_CONFIGS {
|
||||||
app_configs.clear();
|
app_configs.clear();
|
||||||
app_version_configs.clear();
|
app_version_configs.clear();
|
||||||
project_max_concurrent = 0;
|
project_max_concurrent = 0;
|
||||||
|
project_has_mc = false;
|
||||||
report_results_immediately = false;
|
report_results_immediately = false;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue