- client: add logic to work fetch so that each project
    will have enough jobs to use its share of resource instances.
    This avoids situations where, e.g., on a 2-CPU system
    a project has a 75% resource share and a single 1-CPU job,
    and its STD (short-term debt) increases without bound.

    Did a general cleanup of the logic for computing
    work request sizes (seconds and instances).

svn path=/trunk/boinc/; revision=20036
David Anderson 2009-12-24 20:40:27 +00:00
parent 868c18d2b2
commit 876522c6aa
5 changed files with 94 additions and 84 deletions
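
The arithmetic behind the fix, as a minimal sketch: the names follow the patch (nused_total, fetchable_share, ninstances), but the standalone function and main() are illustrative, not committed code.

#include <cstdio>

// A project is "starved" for a resource if the instances its runnable
// jobs use (nused_total) fall short of its share of the resource's
// instances. The real test lives in RSC_WORK_FETCH::choose_project()
// under the FETCH_IF_PROJECT_STARVED criterion.
static bool project_starved(double nused_total, double fetchable_share, int ninstances) {
    return nused_total < ninstances * fetchable_share;
}

int main() {
    // The case from the commit message: 2 CPUs, 75% resource share,
    // a single 1-CPU job. The project deserves 1.5 instances but uses
    // only 1, so it is starved and work is fetched for it.
    printf("%s\n", project_starved(1.0, 0.75, 2) ? "starved" : "not starved");
    return 0;
}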

checkin_notes

@@ -10725,3 +10725,19 @@ David 24 Dec 2009
client/
work_fetch.cpp
David 24 Dec 2009
- client: add logic to work fetch so that each project
will have enough jobs to use its share of resource instances.
This avoids situations where e.g. on a 2-CPU system
a project has 75% resource share and 1 CPU job,
and its STD increases without bound.
Did a general cleanup of the logic for computing
work request sizes (seconds and instances).
client/
work_fetch.cpp,h
rr_sim.cpp
sched/
handle_request.cpp

client/rr_sim.cpp

@@ -266,7 +266,9 @@ void CLIENT_STATE::rr_simulation() {
p = rp->project;
p->pwf.has_runnable_jobs = true;
p->cpu_pwf.nused_total += rp->avp->avg_ncpus;
if (rp->uses_cuda()) {
p->cuda_pwf.nused_total += rp->avp->ncudas;
p->cuda_pwf.has_runnable_jobs = true;
if (cuda_work_fetch.sim_nused < coproc_cuda->count) {
sim_status.activate(rp, 0);
@@ -275,6 +277,7 @@ void CLIENT_STATE::rr_simulation() {
cuda_work_fetch.pending.push_back(rp);
}
} else if (rp->uses_ati()) {
p->ati_pwf.nused_total += rp->avp->natis;
p->ati_pwf.has_runnable_jobs = true;
if (ati_work_fetch.sim_nused < coproc_ati->count) {
sim_status.activate(rp, 0);
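
The hunk above feeds the new starvation test. A rough sketch of the accumulation, with hypothetical stand-in types but field names mirroring the patch (avg_ncpus, ncudas, nused_total):

struct SimJob { double avg_ncpus; double ncudas; };
struct ProjectUsage { double cpu_nused_total; double cuda_nused_total; };

void accumulate(ProjectUsage& p, const SimJob& job) {
    // Every runnable job counts toward nused_total, whether or not the
    // simulation can run it right now; sim_nused, by contrast, only
    // counts jobs activated within the instance limit.
    p.cpu_nused_total += job.avg_ncpus;
    if (job.ncudas > 0) {
        p.cuda_nused_total += job.ncudas;
    }
}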

client/work_fetch.cpp

@@ -104,6 +104,7 @@ void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT* p, int rsc_type) {
fetchable_share = 0;
has_runnable_jobs = false;
sim_nused = 0;
nused_total = 0;
deadlines_missed = 0;
}
@@ -204,10 +205,20 @@ bool RSC_PROJECT_WORK_FETCH::overworked() {
// If resource is saturated for less than work_buf_total(),
// get work for it from the non-overworked project with greatest LTD.
#define FETCH_IF_PROJECT_STARVED 3
// If any project is not overworked and has no runnable jobs
// (for any resource, not just this one)
// If any project is not overworked and has too few jobs
// to use its instance share,
// get work from the one with greatest LTD.
static const char* criterion_name(int criterion) {
switch (criterion) {
case FETCH_IF_IDLE_INSTANCE: return "idle instance";
case FETCH_IF_MAJOR_SHORTFALL: return "major shortfall";
case FETCH_IF_MINOR_SHORTFALL: return "minor shortfall";
case FETCH_IF_PROJECT_STARVED: return "starved";
}
return "unknown";
}
// Choose the best project to ask for work for this resource,
// given the specific criterion
//
@@ -241,7 +252,7 @@ PROJECT* RSC_WORK_FETCH::choose_project(int criterion) {
break;
case FETCH_IF_PROJECT_STARVED:
if (rpwf.overworked()) continue;
if (p->pwf.has_runnable_jobs) continue;
if (rpwf.nused_total >= ninstances*rpwf.fetchable_share) continue;
break;
}
if (pbest) {
@@ -254,67 +265,68 @@ PROJECT* RSC_WORK_FETCH::choose_project(int criterion) {
}
if (!pbest) return NULL;
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s %s", criterion_name(criterion), rsc_name(rsc_type)
);
}
// decide how much work to request from each resource
//
work_fetch.clear_request();
switch (criterion) {
case FETCH_IF_IDLE_INSTANCE:
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s idle instance", rsc_name(rsc_type)
);
}
req = share_request(pbest);
if (req > shortfall) req = shortfall;
set_request(pbest, req);
set_request(pbest);
break;
case FETCH_IF_MAJOR_SHORTFALL:
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s major shortfall", rsc_name(rsc_type)
);
}
req = share_request(pbest);
if (req > shortfall) req = shortfall;
set_request(pbest, req);
case FETCH_IF_PROJECT_STARVED:
set_request(pbest);
break;
case FETCH_IF_MINOR_SHORTFALL:
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s minor shortfall", rsc_name(rsc_type)
);
}
work_fetch.set_shortfall_requests(pbest);
break;
case FETCH_IF_PROJECT_STARVED:
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s starved", rsc_name(rsc_type)
);
}
req = share_request(pbest);
set_request(pbest, req);
// in this case, potentially request work for all resources
//
work_fetch.set_all_requests(pbest);
break;
}
return pbest;
}
void WORK_FETCH::set_shortfall_requests(PROJECT* p) {
cpu_work_fetch.set_shortfall_request(p);
if (coproc_cuda && gpus_usable) {
cuda_work_fetch.set_shortfall_request(p);
}
if (coproc_ati && gpus_usable) {
ati_work_fetch.set_shortfall_request(p);
}
}
void RSC_WORK_FETCH::set_shortfall_request(PROJECT* p) {
// request this project's share of shortfall and instances
//
void RSC_WORK_FETCH::set_request(PROJECT* p) {
if (!shortfall) return;
RSC_PROJECT_WORK_FETCH& w = project_state(p);
if (!w.may_have_work) return;
if (w.overworked()) return;
set_request(p, shortfall);
double dcf = p->duration_correction_factor;
if (dcf < 0.02 || dcf > 80.0) {
// if project's DCF is too big or small,
// its completion time estimates are useless; just ask for 1 second
//
req_secs = 1;
} else {
req_secs = shortfall * w.fetchable_share;
}
// the number of additional instances needed to have our share
//
double x1 = (ninstances * w.fetchable_share) - w.nused_total;
// our share of the idle instances
//
double x2 = nidle_now * w.fetchable_share;
req_instances = std::max(x1, x2);
}
void WORK_FETCH::set_all_requests(PROJECT* p) {
cpu_work_fetch.set_request(p);
if (coproc_cuda && gpus_usable) {
cuda_work_fetch.set_request(p);
}
if (coproc_ati && gpus_usable) {
ati_work_fetch.set_request(p);
}
}
void WORK_FETCH::set_overall_debts() {
@@ -454,7 +466,7 @@ void WORK_FETCH::compute_work_request(PROJECT* p) {
// Otherwise we can have a situation where a GPU is idle,
// we ask only for GPU work, and the project never has any
//
work_fetch.set_shortfall_requests(pbest);
work_fetch.set_all_requests(pbest);
return;
}
@@ -548,27 +560,6 @@ PROJECT* WORK_FETCH::choose_project() {
return p;
}
double RSC_WORK_FETCH::share_request(PROJECT* p) {
double dcf = p->duration_correction_factor;
if (dcf < 0.02 || dcf > 80.0) {
// if project's DCF is too big or small,
// its completion time estimates are useless; just ask for 1 second
//
return 1;
} else {
// otherwise ask for the project's share
//
RSC_PROJECT_WORK_FETCH& w = project_state(p);
return gstate.work_buf_total()*w.fetchable_share;
}
}
void RSC_WORK_FETCH::set_request(PROJECT* p, double r) {
RSC_PROJECT_WORK_FETCH& w = project_state(p);
req_secs = r;
req_instances = (int)ceil(w.fetchable_share*nidle_now);
}
void WORK_FETCH::accumulate_inst_sec(ACTIVE_TASK* atp, double dt) {
APP_VERSION* avp = atp->result->avp;
PROJECT* p = atp->result->project;
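
Putting the new RSC_WORK_FETCH::set_request() together as a standalone sketch: the field names follow the patch, but the free function and its signature are illustrative.

#include <algorithm>

struct Request { double secs; double instances; };

// Sketch of the unified request computation: seconds come from this
// project's share of the shortfall (with a sanity clamp on DCF);
// instances are whichever is larger, the instances still needed to
// reach the project's share or its share of the idle instances.
Request compute_request(
    double shortfall, double fetchable_share, double nused_total,
    double nidle_now, int ninstances, double dcf
) {
    Request r = {0, 0};
    if (!shortfall) return r;
    if (dcf < 0.02 || dcf > 80.0) {
        // DCF out of range: completion-time estimates are useless,
        // so ask for a token 1 second of work
        r.secs = 1;
    } else {
        r.secs = shortfall * fetchable_share;
    }
    double x1 = ninstances * fetchable_share - nused_total;  // to reach our share
    double x2 = nidle_now * fetchable_share;                 // our share of idle
    r.instances = std::max(x1, x2);
    return r;
}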

client/work_fetch.h

@@ -62,6 +62,7 @@ struct RSC_PROJECT_WORK_FETCH {
// determines how many instances this project deserves
bool has_runnable_jobs;
double sim_nused;
double nused_total; // sum of instances over all runnable jobs
int deadlines_missed;
int deadlines_missed_copy;
// copy of the above used during schedule_cpus()
@@ -185,7 +186,7 @@ struct RSC_WORK_FETCH {
// the following specify the work request for this resource
//
double req_secs;
int req_instances;
double req_instances;
// debt accounting
double secs_this_debt_interval;
@@ -204,9 +205,7 @@ struct RSC_WORK_FETCH {
void update_short_term_debts();
void print_state(const char*);
void clear_request();
void set_request(PROJECT*, double);
double share_request(PROJECT*);
void set_shortfall_request(PROJECT*);
void set_request(PROJECT*);
bool may_have_work(PROJECT*);
RSC_WORK_FETCH() {
memset(this, 0, sizeof(*this));
@@ -243,7 +242,7 @@ struct WORK_FETCH {
PROJECT*, SCHEDULER_REPLY*, std::vector<RESULT*> new_results
);
void set_initial_work_request();
void set_shortfall_requests(PROJECT*);
void set_all_requests(PROJECT*);
void print_state();
void init();
void rr_init();
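
Why req_instances becomes a double: with fractional resource shares the request itself is fractional. Sample numbers below, not from the patch.

#include <cstdio>

int main() {
    double nidle_now = 2;           // two idle CPU instances
    double fetchable_share = 0.75;  // this project's share
    // the old int field would have had to round this up or down
    printf("req_instances = %.1f\n", nidle_now * fetchable_share);  // 1.5
    return 0;
}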

sched/handle_request.cpp

@@ -336,7 +336,7 @@ int authenticate_user() {
);
goto make_new_host;
}
} else {
// Here no hostid was given, or the ID was bad.
// Look up the user, then create a new host record
@@ -480,7 +480,7 @@ got_host:
user.update_field(buf);
}
}
return 0;
}
@@ -585,7 +585,7 @@ int send_result_abort() {
DB_IN_PROGRESS_RESULT result;
std::string result_names;
unsigned int i;
if (g_request->other_results.size() == 0) {
return 0;
}
@@ -595,6 +595,7 @@ int send_result_abort() {
for (i=0; i<g_request->other_results.size(); i++) {
OTHER_RESULT& orp=g_request->other_results[i];
orp.abort = true;
// if the host has a result not in the DB, abort it
orp.abort_if_not_started = false;
orp.reason = ABORT_REASON_NOT_FOUND;
if (i > 0) result_names.append(", ");
@@ -619,7 +620,7 @@ int send_result_abort() {
orp.abort = true;
orp.abort_if_not_started = false;
orp.reason = ABORT_REASON_WU_CANCELLED;
} else if ( result.assimilate_state == ASSIMILATE_DONE ) {
} else if (result.assimilate_state == ASSIMILATE_DONE) {
// if the WU has been assimilated, abort if not started
//
orp.abort = false;
@@ -658,8 +659,8 @@ int send_result_abort() {
log_messages.printf(MSG_NORMAL,
"[HOST#%d]: Send result_abort for result %s; reason: %s\n",
g_reply->host.id, orp.name, reason_str(orp.reason)
);
// send user message
);
// send user message
char buf[256];
sprintf(buf, "Result %s is no longer usable", orp.name);
g_reply->insert_message(buf, "high");
@@ -668,10 +669,10 @@ int send_result_abort() {
log_messages.printf(MSG_NORMAL,
"[HOST#%d]: Send result_abort_if_unstarted for result %s; reason %d\n",
g_reply->host.id, orp.name, orp.reason
);
);
}
}
return aborts_sent;
}
@@ -857,11 +858,11 @@ void warn_user_if_core_client_upgrade_scheduled() {
remaining /= 3600;
if (0 < remaining) {
char msg[512];
int days = remaining / 24;
int hours = remaining % 24;
sprintf(msg,
"Starting in %d days and %d hours, project will require a minimum "
"BOINC core client version of %d.%d.0. You are currently using "
@@ -1275,7 +1276,7 @@ void process_request(char* code_sign_key) {
send_result_abort();
}
}
if (requesting_work()) {
if (!send_code_sign_key(code_sign_key)) {
ok_to_send_work = false;
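
For reference, the three abort cases visible in send_result_abort(), restated as a sketch: OTHER_RESULT is reduced to the fields the hunks touch, the enum values are illustrative, and the assignments in the assimilated branch are partly inferred from the surrounding comment.

struct OtherResult { bool abort; bool abort_if_not_started; int reason; };

enum {
    ABORT_REASON_NOT_FOUND = 1,     // values illustrative
    ABORT_REASON_WU_CANCELLED = 2,
    ABORT_REASON_ASSIMILATED = 3    // hypothetical constant
};

void decide_abort(OtherResult& orp, bool in_db, bool wu_cancelled, bool assimilated) {
    if (!in_db) {
        // the host reports a result not in the DB: abort it outright
        orp.abort = true;
        orp.abort_if_not_started = false;
        orp.reason = ABORT_REASON_NOT_FOUND;
    } else if (wu_cancelled) {
        orp.abort = true;
        orp.abort_if_not_started = false;
        orp.reason = ABORT_REASON_WU_CANCELLED;
    } else if (assimilated) {
        // the WU has been assimilated: abort only if not started
        orp.abort = false;
        orp.abort_if_not_started = true;   // inferred
        orp.reason = ABORT_REASON_ASSIMILATED;
    }
}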