mirror of https://github.com/BOINC/boinc.git
- client: add logic to work fetch so that each project
  will have enough jobs to use its share of resource instances.
  This avoids situations where e.g. on a 2-CPU system
  a project has 75% resource share and 1 CPU job,
  and its STD increases without bound.

  Did a general cleanup of the logic for computing
  work request sizes (seconds and instances).

svn path=/trunk/boinc/; revision=20036
This commit is contained in:
parent 868c18d2b2
commit 876522c6aa
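The scenario in the commit message can be made concrete. Below is a minimal standalone sketch, not part of the commit, restating the arithmetic of the starvation check the diff introduces: the old test skipped a project as soon as it had any runnable job; the new test, nused_total >= ninstances*fetchable_share, skips it only once its jobs can use its whole instance share.

    // Standalone sketch (hypothetical values, not BOINC code): the
    // instance-share arithmetic behind FETCH_IF_PROJECT_STARVED.
    // Host: 2 CPUs; project: 75% resource share, one 1-CPU job running.
    #include <cstdio>

    int main() {
        bool   has_runnable_jobs = true;  // old test: any runnable job at all?
        double ninstances = 2;            // CPU instances on the host
        double fetchable_share = 0.75;    // project's share among fetchable projects
        double nused_total = 1;           // instances its runnable jobs can use

        // The project "deserves" 0.75 * 2 = 1.5 instances but uses only 1;
        // if no more work is fetched, its short-term debt (STD) grows without bound.
        bool old_skip = has_runnable_jobs;                           // true: never fetch
        bool new_skip = nused_total >= ninstances*fetchable_share;   // 1 >= 1.5: false, fetch
        printf("old check: %s, new check: %s\n",
            old_skip ? "skip (starves)" : "fetch",
            new_skip ? "skip" : "fetch"
        );
        return 0;
    }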
checkin_notes:

@@ -10725,3 +10725,19 @@ David 24 Dec 2009
     client/
         work_fetch.cpp
 
+David 24 Dec 2009
+    - client: add logic to work fetch so that each project
+        will have enough jobs to use its share of resource instances.
+        This avoids situations where e.g. on a 2-CPU system
+        a project has 75% resource share and 1 CPU job,
+        and its STD increases without bound.
+
+        Did a general cleanup of the logic for computing
+        work request sizes (seconds and instances).
+
+    client/
+        work_fetch.cpp,h
+        rr_sim.cpp
+    sched/
+        handle_request.cpp
+
client/rr_sim.cpp:

@@ -266,7 +266,9 @@ void CLIENT_STATE::rr_simulation() {
         p = rp->project;
         p->pwf.has_runnable_jobs = true;
+        p->cpu_pwf.nused_total += rp->avp->avg_ncpus;
         if (rp->uses_cuda()) {
+            p->cuda_pwf.nused_total += rp->avp->ncudas;
             p->cuda_pwf.has_runnable_jobs = true;
             if (cuda_work_fetch.sim_nused < coproc_cuda->count) {
                 sim_status.activate(rp, 0);
@@ -275,6 +277,7 @@ void CLIENT_STATE::rr_simulation() {
                 cuda_work_fetch.pending.push_back(rp);
             }
         } else if (rp->uses_ati()) {
+            p->ati_pwf.nused_total += rp->avp->natis;
             p->ati_pwf.has_runnable_jobs = true;
             if (ati_work_fetch.sim_nused < coproc_ati->count) {
                 sim_status.activate(rp, 0);
client/work_fetch.cpp:

@@ -104,6 +104,7 @@ void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT* p, int rsc_type) {
     fetchable_share = 0;
     has_runnable_jobs = false;
     sim_nused = 0;
+    nused_total = 0;
     deadlines_missed = 0;
 }
 
@@ -204,10 +205,20 @@ bool RSC_PROJECT_WORK_FETCH::overworked() {
     // If resource is saturated for less than work_buf_total(),
     // get work for it from the non-overworked project with greatest LTD.
 #define FETCH_IF_PROJECT_STARVED   3
-    // If any project is not overworked and has no runnable jobs
-    // (for any resource, not just this one)
+    // If any project is not overworked and has too few jobs
+    // to use its instance share,
     // get work from the one with greatest LTD.
 
+static char* criterion_name(int criterion) {
+    switch (criterion) {
+    case FETCH_IF_IDLE_INSTANCE: return "idle instance";
+    case FETCH_IF_MAJOR_SHORTFALL: return "major shortfall";
+    case FETCH_IF_MINOR_SHORTFALL: return "minor shortfall";
+    case FETCH_IF_PROJECT_STARVED: return "starved";
+    }
+    return "unknown";
+}
+
 // Choose the best project to ask for work for this resource,
 // given the specific criterion
 //
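For orientation, the four FETCH_IF_* criteria are tried in priority order by the work-fetch driver. A hedged sketch of that loop follows; the order and the major-shortfall comment are inferred from the criteria's numbering and comments, and are not shown in this diff.

    // Sketch of the assumed caller priority; the real driver is
    // WORK_FETCH::choose_project(), not shown in this commit's hunks.
    PROJECT* pick_for_resource(RSC_WORK_FETCH& rsc) {
        static const int order[] = {
            FETCH_IF_IDLE_INSTANCE,    // an instance is idle right now
            FETCH_IF_MAJOR_SHORTFALL,  // saturated for less than the min buffer (assumed)
            FETCH_IF_MINOR_SHORTFALL,  // saturated for less than work_buf_total()
            FETCH_IF_PROJECT_STARVED   // a project can't use its instance share
        };
        for (unsigned int i=0; i<4; i++) {
            PROJECT* p = rsc.choose_project(order[i]);
            if (p) return p;
        }
        return NULL;
    }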
@@ -241,7 +252,7 @@ PROJECT* RSC_WORK_FETCH::choose_project(int criterion) {
             break;
         case FETCH_IF_PROJECT_STARVED:
             if (rpwf.overworked()) continue;
-            if (p->pwf.has_runnable_jobs) continue;
+            if (rpwf.nused_total >= ninstances*rpwf.fetchable_share) continue;
             break;
         }
         if (pbest) {
@@ -254,67 +265,68 @@ PROJECT* RSC_WORK_FETCH::choose_project(int criterion) {
     }
     if (!pbest) return NULL;
 
+    if (log_flags.work_fetch_debug) {
+        msg_printf(pbest, MSG_INFO,
+            "chosen: %s %s", criterion_name(criterion), rsc_name(rsc_type)
+        );
+    }
+
     // decide how much work to request from each resource
     //
     work_fetch.clear_request();
     switch (criterion) {
     case FETCH_IF_IDLE_INSTANCE:
-        if (log_flags.work_fetch_debug) {
-            msg_printf(pbest, MSG_INFO,
-                "chosen: %s idle instance", rsc_name(rsc_type)
-            );
-        }
-        req = share_request(pbest);
-        if (req > shortfall) req = shortfall;
-        set_request(pbest, req);
+        set_request(pbest);
         break;
     case FETCH_IF_MAJOR_SHORTFALL:
-        if (log_flags.work_fetch_debug) {
-            msg_printf(pbest, MSG_INFO,
-                "chosen: %s major shortfall", rsc_name(rsc_type)
-            );
-        }
-        req = share_request(pbest);
-        if (req > shortfall) req = shortfall;
-        set_request(pbest, req);
+    case FETCH_IF_PROJECT_STARVED:
+        set_request(pbest);
         break;
     case FETCH_IF_MINOR_SHORTFALL:
-        if (log_flags.work_fetch_debug) {
-            msg_printf(pbest, MSG_INFO,
-                "chosen: %s minor shortfall", rsc_name(rsc_type)
-            );
-        }
-        work_fetch.set_shortfall_requests(pbest);
-        break;
-    case FETCH_IF_PROJECT_STARVED:
-        if (log_flags.work_fetch_debug) {
-            msg_printf(pbest, MSG_INFO,
-                "chosen: %s starved", rsc_name(rsc_type)
-            );
-        }
-        req = share_request(pbest);
-        set_request(pbest, req);
+        // in this case, potentially request work for all resources
+        //
+        work_fetch.set_all_requests(pbest);
         break;
     }
     return pbest;
 }
 
-void WORK_FETCH::set_shortfall_requests(PROJECT* p) {
-    cpu_work_fetch.set_shortfall_request(p);
-    if (coproc_cuda && gpus_usable) {
-        cuda_work_fetch.set_shortfall_request(p);
-    }
-    if (coproc_ati && gpus_usable) {
-        ati_work_fetch.set_shortfall_request(p);
-    }
-}
-
-void RSC_WORK_FETCH::set_shortfall_request(PROJECT* p) {
-    if (!shortfall) return;
+// request this project's share of shortfall and instances
+//
+void RSC_WORK_FETCH::set_request(PROJECT* p) {
     RSC_PROJECT_WORK_FETCH& w = project_state(p);
-    if (!w.may_have_work) return;
-    if (w.overworked()) return;
-    set_request(p, shortfall);
+    double dcf = p->duration_correction_factor;
+    if (dcf < 0.02 || dcf > 80.0) {
+        // if project's DCF is too big or small,
+        // its completion time estimates are useless; just ask for 1 second
+        //
+        req_secs = 1;
+    } else {
+        req_secs = shortfall * w.fetchable_share;
+    }
+
+    // the number of additional instances needed to have our share
+    //
+    double x1 = (ninstances * w.fetchable_share) - w.nused_total;
+
+    // our share of the idle instances
+    //
+    double x2 = nidle_now * w.fetchable_share;
+
+    req_instances = std::max(x1, x2);
 }
 
+void WORK_FETCH::set_all_requests(PROJECT* p) {
+    cpu_work_fetch.set_request(p);
+    if (coproc_cuda && gpus_usable) {
+        cuda_work_fetch.set_request(p);
+    }
+    if (coproc_ati && gpus_usable) {
+        ati_work_fetch.set_request(p);
+    }
+}
+
 void WORK_FETCH::set_overall_debts() {
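The new RSC_WORK_FETCH::set_request() above folds the old share_request() (seconds, with the DCF sanity clamp) together with a new instance count. A standalone sketch of its arithmetic with sample numbers; the values are hypothetical, and the variable names mirror the fields used in the diff.

    // Standalone sketch of set_request()'s arithmetic with sample numbers.
    #include <algorithm>
    #include <cstdio>

    int main() {
        // per-resource state (hypothetical values)
        double shortfall = 10000;       // unfilled buffer, in instance-seconds
        double ninstances = 4;          // instances of this resource
        double nidle_now = 1;           // instances idle right now

        // per-project state
        double fetchable_share = 0.5;   // share among projects fetchable for this resource
        double nused_total = 1;         // instances its runnable jobs can use
        double dcf = 1.0;               // duration correction factor

        // seconds: the project's share of the shortfall, unless DCF is useless
        double req_secs = (dcf < 0.02 || dcf > 80.0)
            ? 1                                 // token 1-second request
            : shortfall * fetchable_share;      // 5000 s

        double x1 = ninstances*fetchable_share - nused_total;  // 1: needed to reach share
        double x2 = nidle_now*fetchable_share;                 // 0.5: share of idle instances
        double req_instances = std::max(x1, x2);               // 1

        printf("req_secs=%.0f req_instances=%.1f\n", req_secs, req_instances);
        return 0;
    }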
@@ -454,7 +466,7 @@ void WORK_FETCH::compute_work_request(PROJECT* p) {
         // Otherwise we can have a situation where a GPU is idle,
         // we ask only for GPU work, and the project never has any
         //
-        work_fetch.set_shortfall_requests(pbest);
+        work_fetch.set_all_requests(pbest);
         return;
     }
 
@@ -548,27 +560,6 @@ PROJECT* WORK_FETCH::choose_project() {
     return p;
 }
 
-double RSC_WORK_FETCH::share_request(PROJECT* p) {
-    double dcf = p->duration_correction_factor;
-    if (dcf < 0.02 || dcf > 80.0) {
-        // if project's DCF is too big or small,
-        // its completion time estimates are useless; just ask for 1 second
-        //
-        return 1;
-    } else {
-        // otherwise ask for the project's share
-        //
-        RSC_PROJECT_WORK_FETCH& w = project_state(p);
-        return gstate.work_buf_total()*w.fetchable_share;
-    }
-}
-
-void RSC_WORK_FETCH::set_request(PROJECT* p, double r) {
-    RSC_PROJECT_WORK_FETCH& w = project_state(p);
-    req_secs = r;
-    req_instances = (int)ceil(w.fetchable_share*nidle_now);
-}
-
 void WORK_FETCH::accumulate_inst_sec(ACTIVE_TASK* atp, double dt) {
     APP_VERSION* avp = atp->result->avp;
     PROJECT* p = atp->result->project;
client/work_fetch.h:

@@ -62,6 +62,7 @@ struct RSC_PROJECT_WORK_FETCH {
         // determines how many instances this project deserves
     bool has_runnable_jobs;
     double sim_nused;
+    double nused_total;     // sum of instances over all runnable jobs
     int deadlines_missed;
     int deadlines_missed_copy;
         // copy of the above used during schedule_cpus()
@@ -185,7 +186,7 @@ struct RSC_WORK_FETCH {
     // the following specify the work request for this resource
     //
     double req_secs;
-    int req_instances;
+    double req_instances;
 
     // debt accounting
     double secs_this_debt_interval;
@@ -204,9 +205,7 @@ struct RSC_WORK_FETCH {
     void update_short_term_debts();
     void print_state(const char*);
     void clear_request();
-    void set_request(PROJECT*, double);
-    double share_request(PROJECT*);
-    void set_shortfall_request(PROJECT*);
+    void set_request(PROJECT*);
     bool may_have_work(PROJECT*);
     RSC_WORK_FETCH() {
         memset(this, 0, sizeof(*this));
@@ -243,7 +242,7 @@ struct WORK_FETCH {
         PROJECT*, SCHEDULER_REPLY*, std::vector<RESULT*>new_results
     );
     void set_initial_work_request();
-    void set_shortfall_requests(PROJECT*);
+    void set_all_requests(PROJECT*);
     void print_state();
     void init();
     void rr_init();
sched/handle_request.cpp:

@@ -336,7 +336,7 @@ int authenticate_user() {
             );
             goto make_new_host;
         }
-
+
     } else {
         // Here no hostid was given, or the ID was bad.
         // Look up the user, then create a new host record
@@ -480,7 +480,7 @@ got_host:
             user.update_field(buf);
         }
     }
-
+
     return 0;
 }
@@ -585,7 +585,7 @@ int send_result_abort() {
     DB_IN_PROGRESS_RESULT result;
     std::string result_names;
     unsigned int i;
-
+
     if (g_request->other_results.size() == 0) {
         return 0;
     }
@@ -595,6 +595,7 @@ int send_result_abort() {
     for (i=0; i<g_request->other_results.size(); i++) {
         OTHER_RESULT& orp=g_request->other_results[i];
         orp.abort = true;
+        // if the host has a result not in the DB, abort it
         orp.abort_if_not_started = false;
         orp.reason = ABORT_REASON_NOT_FOUND;
         if (i > 0) result_names.append(", ");
@@ -619,7 +620,7 @@ int send_result_abort() {
             orp.abort = true;
             orp.abort_if_not_started = false;
             orp.reason = ABORT_REASON_WU_CANCELLED;
-        } else if ( result.assimilate_state == ASSIMILATE_DONE ) {
+        } else if (result.assimilate_state == ASSIMILATE_DONE) {
             // if the WU has been assimilated, abort if not started
             //
             orp.abort = false;
@@ -658,8 +659,8 @@ int send_result_abort() {
         log_messages.printf(MSG_NORMAL,
             "[HOST#%d]: Send result_abort for result %s; reason: %s\n",
             g_reply->host.id, orp.name, reason_str(orp.reason)
-            );
-            // send user message
+        );
+        // send user message
         char buf[256];
         sprintf(buf, "Result %s is no longer usable", orp.name);
         g_reply->insert_message(buf, "high");
@@ -668,10 +669,10 @@ int send_result_abort() {
         log_messages.printf(MSG_NORMAL,
             "[HOST#%d]: Send result_abort_if_unstarted for result %s; reason %d\n",
             g_reply->host.id, orp.name, orp.reason
-            );
+        );
         }
     }
-
+
     return aborts_sent;
 }
 
@@ -857,11 +858,11 @@ void warn_user_if_core_client_upgrade_scheduled() {
     remaining /= 3600;
 
     if (0 < remaining) {
-
+
         char msg[512];
         int days = remaining / 24;
         int hours = remaining % 24;
-
+
         sprintf(msg,
             "Starting in %d days and %d hours, project will require a minimum "
             "BOINC core client version of %d.%d.0. You are currently using "
@@ -1275,7 +1276,7 @@ void process_request(char* code_sign_key) {
             send_result_abort();
         }
     }
-
+
     if (requesting_work()) {
         if (!send_code_sign_key(code_sign_key)) {
             ok_to_send_work = false;