mirror of https://github.com/BOINC/boinc.git
client: if using a dynamic AM and have idle resources, tell the AM
See issue #2452. When the client uses a dynamic account manager (AM), it may starve, i.e. have idle resources, for example because the assigned projects are down. Detect this and send "starvation RPCs" to the AM, giving details about the situation. To avoid doing this too often, use exponential backoff between these RPCs.
This commit is contained in:
parent
a9dac7ee5e
commit
99dd146bbb
|
@ -197,6 +197,24 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
|
|||
p->disk_usage,
|
||||
p->disk_share
|
||||
);
|
||||
|
||||
// send starvation-related info
|
||||
//
|
||||
if (ami.send_rec) {
|
||||
fprintf(f,
|
||||
" <nrpc_failures>%d</nrpc_failures>",
|
||||
p->nrpc_failures
|
||||
);
|
||||
for (int j=0; j<coprocs.n_rsc; j++) {
|
||||
if (p->sched_req_no_work[j]) {
|
||||
fprintf(f,
|
||||
" <sched_req_no_work>%s</sched_req_no_work>\n",
|
||||
coprocs.coprocs[j].type
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (p->attached_via_acct_mgr) {
|
||||
fprintf(f,
|
||||
" <account_key>%s</account_key>\n",
|
||||
|
@ -885,6 +903,9 @@ void ACCT_MGR_INFO::clear() {
|
|||
no_project_notices = false;
|
||||
cookie_required = false;
|
||||
user_keywords.clear();
|
||||
first_starved = 0;
|
||||
starved_rpc_backoff = 0;
|
||||
starved_rpc_min_time = 0;
|
||||
}
|
||||
|
||||
ACCT_MGR_INFO::ACCT_MGR_INFO() {
|
||||
|
@ -1013,6 +1034,11 @@ int ACCT_MGR_INFO::init() {
|
|||
return 0;
|
||||
}
|
||||
|
||||
#define STARVED_RPC_DELAY 600
|
||||
// do RPC after this much starvation
|
||||
|
||||
// called once a second
|
||||
//
|
||||
bool ACCT_MGR_INFO::poll() {
|
||||
if (!using_am()) return false;
|
||||
if (gstate.acct_mgr_op.gui_http->is_busy()) {
|
||||
|
@ -1027,5 +1053,43 @@ bool ACCT_MGR_INFO::poll() {
|
|||
gstate.acct_mgr_op.do_rpc(*this, false);
|
||||
return true;
|
||||
}
|
||||
|
||||
// if not dynamic AM, we're done
|
||||
//
|
||||
if (!send_rec) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// See if some resource is starved with the current set of projects,
|
||||
// and if so possibly do a "starved" RPC asking for different projects
|
||||
|
||||
// do this check once a minute
|
||||
//
|
||||
static int idle_timer = 0;
|
||||
if (++idle_timer < 60) {
|
||||
return false;
|
||||
}
|
||||
idle_timer = 0;
|
||||
get_nidle();
|
||||
if (any_resource_idle()) {
|
||||
if (first_starved == 0) {
|
||||
first_starved = gstate.now;
|
||||
starved_rpc_backoff = STARVED_RPC_DELAY;
|
||||
starved_rpc_min_time = gstate.now + STARVED_RPC_DELAY;
|
||||
} else {
|
||||
if (gstate.now < starved_rpc_min_time) {
|
||||
return false;
|
||||
}
|
||||
gstate.acct_mgr_op.do_rpc(*this, false);
|
||||
starved_rpc_backoff *= 2;
|
||||
if (starved_rpc_backoff > 86400) {
|
||||
starved_rpc_backoff = 86400;
|
||||
}
|
||||
starved_rpc_min_time = gstate.now + starved_rpc_backoff;
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
first_starved = 0;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -61,7 +61,7 @@ struct ACCT_MGR_INFO : PROJ_AM {
|
|||
bool no_project_notices;
|
||||
// if set, don't show notices from projects
|
||||
|
||||
// TODO: get rid of the following
|
||||
// TODO: get rid of the following here and in the manager
|
||||
bool cookie_required;
|
||||
// use of cookies is required during initial signup
|
||||
// NOTE: This bool gets dropped after the client has
|
||||
|
@ -75,6 +75,16 @@ struct ACCT_MGR_INFO : PROJ_AM {
|
|||
bool send_rec;
|
||||
// send REC in AM RPCs
|
||||
USER_KEYWORDS user_keywords;
|
||||
// user's yes/no keywords.
|
||||
// These are conveyed to projects in scheduler requests
|
||||
|
||||
// vars related to starvation prevention,
|
||||
// where we issue a "starved RPC" if a resource has been idle
|
||||
// for more than 10 min
|
||||
|
||||
double first_starved; // start of starvation interval
|
||||
double starved_rpc_backoff; // interval between starved RPCs
|
||||
double starved_rpc_min_time; // earliest time to do a starved RPC
|
||||
|
||||
inline bool using_am() {
|
||||
if (!strlen(master_url)) return false;
|
||||
|
|
|
@ -932,8 +932,10 @@ int CLIENT_STATE::handle_scheduler_reply(
|
|||
workunits.push_back(wup);
|
||||
}
|
||||
double est_rsc_runtime[MAX_RSC];
|
||||
bool got_work_for_rsc[MAX_RSC];
|
||||
for (int j=0; j<coprocs.n_rsc; j++) {
|
||||
est_rsc_runtime[j] = 0;
|
||||
got_work_for_rsc[j] = false;
|
||||
}
|
||||
for (i=0; i<sr.results.size(); i++) {
|
||||
RESULT* rp2 = lookup_result(project, sr.results[i].name);
|
||||
|
@ -981,9 +983,11 @@ int CLIENT_STATE::handle_scheduler_reply(
|
|||
rp->abort_inactive(EXIT_MISSING_COPROC);
|
||||
} else {
|
||||
rp->set_state(RESULT_NEW, "handle_scheduler_reply");
|
||||
got_work_for_rsc[0] = true;
|
||||
int rt = rp->avp->gpu_usage.rsc_type;
|
||||
if (rt > 0) {
|
||||
est_rsc_runtime[rt] += rp->estimated_runtime();
|
||||
got_work_for_rsc[rt] = true;
|
||||
gpus_usable = true;
|
||||
// trigger a check of whether GPU is actually usable
|
||||
} else {
|
||||
|
@ -995,6 +999,20 @@ int CLIENT_STATE::handle_scheduler_reply(
|
|||
new_results.push_back(rp);
|
||||
results.push_back(rp);
|
||||
}
|
||||
|
||||
// find the resources for which we requested work and didn't get any
|
||||
// This is currently used for AM starvation mechanism.
|
||||
//
|
||||
if (!sr.too_recent) {
|
||||
for (int j=0; j<coprocs.n_rsc; j++) {
|
||||
RSC_WORK_FETCH& rwf = rsc_work_fetch[j];
|
||||
project->sched_req_no_work[j] =
|
||||
(rwf.req_secs>0 || rwf.req_instances>0)
|
||||
&& !got_work_for_rsc[j]
|
||||
;
|
||||
}
|
||||
}
|
||||
|
||||
sort_results();
|
||||
|
||||
if (log_flags.sched_op_debug) {
|
||||
|
|
|
@ -53,6 +53,7 @@ void PROJECT::init() {
|
|||
no_rsc_config[i] = false;
|
||||
no_rsc_apps[i] = false;
|
||||
no_rsc_ams[i] = false;
|
||||
sched_req_no_work[i] = false;
|
||||
}
|
||||
safe_strcpy(host_venue, "");
|
||||
using_venue_specific_prefs = false;
|
||||
|
|
|
@ -176,6 +176,9 @@ struct PROJECT : PROJ_AM {
|
|||
// This provides only the illusion of security.
|
||||
bool use_symlinks;
|
||||
bool report_results_immediately;
|
||||
bool sched_req_no_work[MAX_RSC];
|
||||
// the last sched request asked for work for resource i
|
||||
// and didn't get any
|
||||
|
||||
// items sent in scheduler replies,
|
||||
// requesting that various things be sent in subsequent requests
|
||||
|
|
|
@ -558,3 +558,51 @@ void rr_simulation() {
|
|||
RR_SIM rr_sim;
|
||||
rr_sim.simulate();
|
||||
}
|
||||
|
||||
// Compute the number of idle instances of each resource
|
||||
// Put results in global state (rsc_work_fetch)
|
||||
//
|
||||
void get_nidle() {
|
||||
int nidle_rsc = coprocs.n_rsc;
|
||||
for (int i=1; i<coprocs.n_rsc; i++) {
|
||||
rsc_work_fetch[i].nidle_now = coprocs.coprocs[i].count;
|
||||
}
|
||||
for (unsigned int i=0; i<gstate.results.size(); i++) {
|
||||
RESULT* rp = gstate.results[i];
|
||||
if (!rp->nearly_runnable()) continue;
|
||||
if (rp->some_download_stalled()) continue;
|
||||
APP_VERSION* avp = rp->avp;
|
||||
if (rsc_work_fetch[0].nidle_now) {
|
||||
rsc_work_fetch[0].nidle_now -= avp->avg_ncpus;
|
||||
if (rsc_work_fetch[0].nidle_now <= 0) {
|
||||
nidle_rsc--;
|
||||
rsc_work_fetch[0].nidle_now = 0;
|
||||
}
|
||||
}
|
||||
int j = avp->gpu_usage.rsc_type;
|
||||
if (!j) {
|
||||
continue;
|
||||
}
|
||||
if (rsc_work_fetch[j].nidle_now) {
|
||||
rsc_work_fetch[j].nidle_now -= avp->gpu_usage.usage;
|
||||
if (rsc_work_fetch[j].nidle_now <= 0) {
|
||||
nidle_rsc--;
|
||||
rsc_work_fetch[j].nidle_now = 0;
|
||||
}
|
||||
}
|
||||
if (nidle_rsc == 0) {
|
||||
// no idle resources - no need to look further
|
||||
//
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool any_resource_idle() {
|
||||
for (int i=1; i<coprocs.n_rsc; i++) {
|
||||
if (rsc_work_fetch[i].nidle_now > 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
// This file is part of BOINC.
|
||||
// http://boinc.berkeley.edu
|
||||
// Copyright (C) 2008 University of California
|
||||
// Copyright (C) 2018 University of California
|
||||
//
|
||||
// BOINC is free software; you can redistribute it and/or modify it
|
||||
// under the terms of the GNU Lesser General Public License
|
||||
|
@ -18,8 +18,9 @@
|
|||
#ifndef BOINC_RR_SIM_H
|
||||
#define BOINC_RR_SIM_H
|
||||
|
||||
|
||||
// Construct an RR_SIM and run its simulate() method.
extern void rr_simulation();
|
||||
extern void print_deadline_misses();
|
||||
// Compute the number of idle instances of each resource;
// results are stored in the global rsc_work_fetch[] array.
extern void get_nidle();
|
||||
// Return true if rsc_work_fetch[i].nidle_now > 0 for some i >= 1.
extern bool any_resource_idle();
|
||||
|
||||
#endif
|
||||
|
|
|
@ -558,6 +558,7 @@ void SCHEDULER_REPLY::clear() {
|
|||
send_job_log = 0;
|
||||
scheduler_version = 0;
|
||||
got_rss_feeds = false;
|
||||
too_recent = false;
|
||||
}
|
||||
|
||||
SCHEDULER_REPLY::SCHEDULER_REPLY() {
|
||||
|
@ -834,6 +835,9 @@ int SCHEDULER_REPLY::parse(FILE* in, PROJECT* project) {
|
|||
file_deletes.push_back(delete_file_name);
|
||||
} else if (xp.parse_str("message", msg_buf, sizeof(msg_buf))) {
|
||||
parse_attr(attr_buf, "priority", pri_buf, sizeof(pri_buf));
|
||||
if (strstr(msg_buf, "too recent")) {
|
||||
too_recent = true;
|
||||
}
|
||||
USER_MESSAGE um(msg_buf, pri_buf);
|
||||
messages.push_back(um);
|
||||
continue;
|
||||
|
|
|
@ -134,6 +134,8 @@ struct SCHEDULER_REPLY {
|
|||
std::vector<std::string> trickle_up_urls;
|
||||
bool got_rss_feeds;
|
||||
// whether scheduler reply included <rss_feeds>
|
||||
bool too_recent;
|
||||
// whether reply included "too recent" message
|
||||
|
||||
void clear();
|
||||
SCHEDULER_REPLY();
|
||||
|
|
Loading…
Reference in New Issue