mirror of https://github.com/BOINC/boinc.git
Merge pull request #2460 from BOINC/dpa_am
client: if using a dynamic AM and have idle resources, tell the AM
Commit f65dc21d9d
client/acct_mgr.cpp

@@ -169,12 +169,6 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
             " <detach_when_done>%d</detach_when_done>\n"
             " <ended>%d</ended>\n"
             " <resource_share>%f</resource_share>\n"
-            " <cpu_ec>%f</cpu_ec>\n"
-            " <cpu_time>%f</cpu_time>\n"
-            " <gpu_ec>%f</gpu_ec>\n"
-            " <gpu_time>%f</gpu_time>\n"
-            " <njobs_success>%d</njobs_success>\n"
-            " <njobs_error>%d</njobs_error>\n"
             " <disk_usage>%f</disk_usage>\n"
             " <disk_share>%f</disk_share>\n",
             p->master_url,
@@ -188,15 +182,39 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
             p->detach_when_done?1:0,
             p->ended?1:0,
             p->resource_share,
-            p->cpu_ec,
-            p->cpu_time,
-            p->gpu_ec,
-            p->gpu_time,
-            p->njobs_success,
-            p->njobs_error,
             p->disk_usage,
             p->disk_share
         );
+
+        // send work and starvation-related info
+        //
+        if (ami.dynamic) {
+            fprintf(f,
+                " <nrpc_failures>%d</nrpc_failures>\n"
+                " <cpu_ec>%f</cpu_ec>\n"
+                " <cpu_time>%f</cpu_time>\n"
+                " <gpu_ec>%f</gpu_ec>\n"
+                " <gpu_time>%f</gpu_time>\n"
+                " <njobs_success>%d</njobs_success>\n"
+                " <njobs_error>%d</njobs_error>\n",
+                p->nrpc_failures,
+                p->cpu_ec,
+                p->cpu_time,
+                p->gpu_ec,
+                p->gpu_time,
+                p->njobs_success,
+                p->njobs_error
+            );
+            for (int j=0; j<coprocs.n_rsc; j++) {
+                if (p->sched_req_no_work[j]) {
+                    fprintf(f,
+                        " <sched_req_no_work>%s</sched_req_no_work>\n",
+                        coprocs.coprocs[j].type
+                    );
+                }
+            }
+        }

         if (p->attached_via_acct_mgr) {
             fprintf(f,
                 " <account_key>%s</account_key>\n",
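
Taken together, the two hunks above move the per-project EC, time, and
job-count fields out of the always-sent block: they are now reported only
when the AM has declared itself dynamic, plus one <sched_req_no_work>
element per resource that asked for work and got none. A minimal standalone
sketch of the added output (field names come from the diff; the values and
the "NVIDIA" type string are made up):

    #include <cstdio>

    int main() {
        // made-up stats for one attached project
        int nrpc_failures = 0, njobs_success = 12, njobs_error = 1;
        double cpu_ec = 3456.7, cpu_time = 89000.0, gpu_ec = 0, gpu_time = 0;
        printf(
            " <nrpc_failures>%d</nrpc_failures>\n"
            " <cpu_ec>%f</cpu_ec>\n"
            " <cpu_time>%f</cpu_time>\n"
            " <gpu_ec>%f</gpu_ec>\n"
            " <gpu_time>%f</gpu_time>\n"
            " <njobs_success>%d</njobs_success>\n"
            " <njobs_error>%d</njobs_error>\n",
            nrpc_failures, cpu_ec, cpu_time, gpu_ec, gpu_time,
            njobs_success, njobs_error
        );
        // one element per resource whose last request got no work
        printf(" <sched_req_no_work>%s</sched_req_no_work>\n", "NVIDIA");
        return 0;
    }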
@@ -369,6 +387,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
     safe_strcpy(host_venue, "");
     safe_strcpy(ami.opaque, "");
     ami.no_project_notices = false;
+    ami.dynamic = false;
     rss_feeds.clear();
     if (!xp.parse_start("acct_mgr_reply")) return ERR_XML_PARSE;
     while (!xp.get_tag()) {
@@ -388,6 +407,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
         if (xp.parse_string("error", error_str)) continue;
         if (xp.parse_string("error_msg", error_str)) continue;
         if (xp.parse_double("repeat_sec", repeat_sec)) continue;
+        if (xp.parse_bool("dynamic", ami.dynamic)) continue;
         if (xp.parse_string("message", message)) {
             msg_printf(NULL, MSG_INFO, "Account manager: %s", message.c_str());
             continue;
@@ -595,6 +615,7 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) {
     safe_strcpy(gstate.acct_mgr_info.password_hash, ami.password_hash);
     safe_strcpy(gstate.acct_mgr_info.authenticator, ami.authenticator);
     gstate.acct_mgr_info.no_project_notices = ami.no_project_notices;
+    gstate.acct_mgr_info.dynamic = ami.dynamic;

     // process projects
     //
@@ -822,7 +843,7 @@ int ACCT_MGR_INFO::write_info() {
         fclose(f);
     }

-    if (strlen(login_name)) {
+    if (strlen(login_name) || strlen(authenticator)) {
         f = fopen(ACCT_MGR_LOGIN_FILENAME, "w");
         if (!f) {
             msg_printf(NULL, MSG_USER_ALERT,
@@ -852,11 +873,13 @@ int ACCT_MGR_INFO::write_info() {
         " <next_rpc_time>%f</next_rpc_time>\n"
         " <opaque>\n%s\n"
         " </opaque>\n"
-        " <no_project_notices>%d</no_project_notices>\n",
+        " <no_project_notices>%d</no_project_notices>\n"
+        " <dynamic>%d</dynamic>\n",
         previous_host_cpid,
         next_rpc_time,
         opaque,
-        no_project_notices?1:0
+        no_project_notices?1:0,
+        dynamic?1:0
     );
     user_keywords.write(f);
     fprintf(f,
@@ -885,6 +908,10 @@ void ACCT_MGR_INFO::clear() {
     no_project_notices = false;
     cookie_required = false;
     user_keywords.clear();
+    first_starved = 0;
+    starved_rpc_backoff = 0;
+    starved_rpc_min_time = 0;
+    dynamic = false;
 }

 ACCT_MGR_INFO::ACCT_MGR_INFO() {
@@ -923,6 +950,7 @@ int ACCT_MGR_INFO::parse_login_file(FILE* p) {
             continue;
         }
         else if (xp.parse_bool("no_project_notices", no_project_notices)) continue;
+        else if (xp.parse_bool("dynamic", dynamic)) continue;
         else if (xp.match_tag("user_keywords")) {
             retval = user_keywords.parse(xp);
             if (retval) {
@@ -1013,6 +1041,11 @@ int ACCT_MGR_INFO::init() {
     return 0;
 }

+#define STARVED_RPC_DELAY 600
+    // do RPC after this much starvation
+
+// called once a second
+//
 bool ACCT_MGR_INFO::poll() {
     if (!using_am()) return false;
     if (gstate.acct_mgr_op.gui_http->is_busy()) {
@@ -1027,5 +1060,47 @@ bool ACCT_MGR_INFO::poll() {
         gstate.acct_mgr_op.do_rpc(*this, false);
         return true;
     }
+
+    // if not dynamic AM, we're done
+    //
+    if (!dynamic) {
+        return false;
+    }
+
+    // See if some resource is starved with the current set of projects,
+    // and if so possibly do a "starved" RPC asking for different projects
+
+    // do this check once a minute
+    //
+    static int idle_timer = 0;
+    if (++idle_timer < 60) {
+        return false;
+    }
+    idle_timer = 0;
+    get_nidle();
+    if (any_resource_idle()) {
+        if (first_starved == 0) {
+            first_starved = gstate.now;
+            starved_rpc_backoff = STARVED_RPC_DELAY;
+            starved_rpc_min_time = gstate.now + STARVED_RPC_DELAY;
+        } else {
+            if (gstate.now < starved_rpc_min_time) {
+                return false;
+            }
+            msg_printf(NULL, MSG_INFO,
+                "Some devices idle - requesting new projects from %s",
+                gstate.acct_mgr_info.project_name
+            );
+            gstate.acct_mgr_op.do_rpc(*this, false);
+            starved_rpc_backoff *= 2;
+            if (starved_rpc_backoff > 86400) {
+                starved_rpc_backoff = 86400;
+            }
+            starved_rpc_min_time = gstate.now + starved_rpc_backoff;
+            return true;
+        }
+    } else {
+        first_starved = 0;
+    }
     return false;
 }
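
The resulting schedule: the first starved RPC goes out roughly
STARVED_RPC_DELAY (600 s) after starvation is first noticed, and each
subsequent interval doubles, capped at 86400 s (24 h). A standalone sketch
of the timeline, assuming detection at t = 0 and continuous starvation
(not client code):

    #include <cstdio>

    int main() {
        double backoff = 600;    // STARVED_RPC_DELAY
        double t = 600;          // first RPC ~600 s after detection
        for (int i = 1; i <= 8; i++) {
            printf("starved RPC #%d at t = %.0f s\n", i, t);
            backoff *= 2;                          // double the interval,
            if (backoff > 86400) backoff = 86400;  // capped at 24 h
            t += backoff;
        }
        return 0;
    }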

client/acct_mgr.h
@@ -61,7 +61,7 @@ struct ACCT_MGR_INFO : PROJ_AM {
     bool no_project_notices;
         // if set, don't show notices from projects

-    // TODO: get rid of the following
+    // TODO: get rid of the following here and in the manager
     bool cookie_required;
         // use of cookies are required during initial signup
         // NOTE: This bool gets dropped after the client has
@@ -72,9 +72,21 @@ struct ACCT_MGR_INFO : PROJ_AM {
         // what login name and password they have been assigned

     bool password_error;
-    bool send_rec;
-        // send REC in AM RPCs
+    bool dynamic;
+        // This AM dynamically decides what projects to assign.
+        // - send EC in AM RPCs
+        // - send starvation info if idle resources
     USER_KEYWORDS user_keywords;
         // user's yes/no keywords.
         // These are conveyed to projects in scheduler requests
+
+    // vars related to starvation prevention,
+    // where we issue a "starved RPC" if a resource has been idle
+    // for more than 10 min
+
+    double first_starved;           // start of starvation interval
+    double starved_rpc_backoff;     // interval between starved RPCs
+    double starved_rpc_min_time;    // earliest time to do a starved RPC

     inline bool using_am() {
         if (!strlen(master_url)) return false;

client/cs_scheduler.cpp
@@ -932,8 +932,10 @@ int CLIENT_STATE::handle_scheduler_reply(
         workunits.push_back(wup);
     }
     double est_rsc_runtime[MAX_RSC];
+    bool got_work_for_rsc[MAX_RSC];
     for (int j=0; j<coprocs.n_rsc; j++) {
         est_rsc_runtime[j] = 0;
+        got_work_for_rsc[j] = false;
     }
     for (i=0; i<sr.results.size(); i++) {
         RESULT* rp2 = lookup_result(project, sr.results[i].name);
@@ -981,9 +983,11 @@ int CLIENT_STATE::handle_scheduler_reply(
             rp->abort_inactive(EXIT_MISSING_COPROC);
         } else {
             rp->set_state(RESULT_NEW, "handle_scheduler_reply");
+            got_work_for_rsc[0] = true;
             int rt = rp->avp->gpu_usage.rsc_type;
             if (rt > 0) {
                 est_rsc_runtime[rt] += rp->estimated_runtime();
+                got_work_for_rsc[rt] = true;
                 gpus_usable = true;
                     // trigger a check of whether GPU is actually usable
             } else {
@@ -995,6 +999,21 @@ int CLIENT_STATE::handle_scheduler_reply(
         new_results.push_back(rp);
         results.push_back(rp);
     }
+
+    // find the resources for which we requested work and didn't get any
+    // This is currently used for AM starvation mechanism.
+    //
+    if (!sr.too_recent) {
+        for (int j=0; j<coprocs.n_rsc; j++) {
+            RSC_WORK_FETCH& rwf = rsc_work_fetch[j];
+            if (got_work_for_rsc[j]) {
+                project->sched_req_no_work[j] = false;
+            } else if (rwf.req_secs>0 || rwf.req_instances>0) {
+                project->sched_req_no_work[j] = true;
+            }
+        }
+    }
+
     sort_results();

     if (log_flags.sched_op_debug) {
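
The update rule above in isolation: sched_req_no_work[j] is set only if
resource j actually asked for work (nonzero req_secs or req_instances) and
got none, cleared when work arrives, and left untouched when the scheduler
answered "too recent". A restatement as a hypothetical helper (update_flag
is not a client function):

    // flag corresponds to project->sched_req_no_work[j]
    void update_flag(
        bool& flag, bool got_work, double req_secs, double req_instances,
        bool too_recent
    ) {
        if (too_recent) return;    // reply carried no information
        if (got_work) {
            flag = false;
        } else if (req_secs > 0 || req_instances > 0) {
            flag = true;
        }
    }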

client/gui_rpc_server_ops.cpp
@@ -945,6 +945,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
     bool use_config_file = false;
     bool bad_arg = false;
     bool url_found=false, name_found=false, password_found = false;
+    ACCT_MGR_INFO ami;

     while (!grc.xp.get_tag()) {
         if (grc.xp.parse_string("url", url)) {
@@ -970,10 +971,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
                     "Not using account manager"
                 );
             } else {
-                url = gstate.acct_mgr_info.master_url;
-                name = gstate.acct_mgr_info.login_name;
-                password_hash = gstate.acct_mgr_info.password_hash;
-                authenticator = gstate.acct_mgr_info.authenticator;
+                ami = gstate.acct_mgr_info;
             }
         } else {
             bad_arg = !url_found || !name_found || !password_found;
@@ -986,7 +984,11 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
                     // Remove 'hash:'
                     password_hash = password.substr(5);
                 }
             }
+            safe_strcpy(ami.master_url, url.c_str());
+            safe_strcpy(ami.login_name, name.c_str());
+            safe_strcpy(ami.password_hash, password_hash.c_str());
+            safe_strcpy(ami.authenticator, authenticator.c_str());
         }
     }

     if (bad_arg) {
@@ -997,11 +999,6 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
     ){
         grc.mfout.printf("<error>attached to a different AM - detach first</error>\n");
     } else {
-        ACCT_MGR_INFO ami;
-        safe_strcpy(ami.master_url, url.c_str());
-        safe_strcpy(ami.login_name, name.c_str());
-        safe_strcpy(ami.password_hash, password_hash.c_str());
-        safe_strcpy(ami.authenticator, authenticator.c_str());
         gstate.acct_mgr_op.do_rpc(ami, true);
         grc.mfout.printf("<success/>\n");
     }

client/project.cpp
@@ -53,6 +53,7 @@ void PROJECT::init() {
         no_rsc_config[i] = false;
         no_rsc_apps[i] = false;
         no_rsc_ams[i] = false;
+        sched_req_no_work[i] = false;
     }
     safe_strcpy(host_venue, "");
     using_venue_specific_prefs = false;

client/project.h
@@ -176,6 +176,9 @@ struct PROJECT : PROJ_AM {
         // This provides only the illusion of security.
     bool use_symlinks;
     bool report_results_immediately;
+    bool sched_req_no_work[MAX_RSC];
+        // the last sched request asked for work for resource i
+        // and didn't get any

     // items sent in scheduler replies,
     // requesting that various things be sent in subsequent requests

client/rr_sim.cpp
@@ -558,3 +558,51 @@ void rr_simulation() {
     RR_SIM rr_sim;
     rr_sim.simulate();
 }
+
+// Compute the number of idle instances of each resource
+// Put results in global state (rsc_work_fetch)
+//
+void get_nidle() {
+    int nidle_rsc = coprocs.n_rsc;
+    for (int i=1; i<coprocs.n_rsc; i++) {
+        rsc_work_fetch[i].nidle_now = coprocs.coprocs[i].count;
+    }
+    for (unsigned int i=0; i<gstate.results.size(); i++) {
+        RESULT* rp = gstate.results[i];
+        if (!rp->nearly_runnable()) continue;
+        if (rp->some_download_stalled()) continue;
+        APP_VERSION* avp = rp->avp;
+        if (rsc_work_fetch[0].nidle_now) {
+            rsc_work_fetch[0].nidle_now -= avp->avg_ncpus;
+            if (rsc_work_fetch[0].nidle_now <= 0) {
+                nidle_rsc--;
+                rsc_work_fetch[0].nidle_now = 0;
+            }
+        }
+        int j = avp->gpu_usage.rsc_type;
+        if (!j) {
+            continue;
+        }
+        if (rsc_work_fetch[j].nidle_now) {
+            rsc_work_fetch[j].nidle_now -= avp->gpu_usage.usage;
+            if (rsc_work_fetch[j].nidle_now <= 0) {
+                nidle_rsc--;
+                rsc_work_fetch[j].nidle_now = 0;
+            }
+        }
+        if (nidle_rsc == 0) {
+            // no idle resources - no need to look further
+            //
+            break;
+        }
+    }
+}
+
+bool any_resource_idle() {
+    for (int i=1; i<coprocs.n_rsc; i++) {
+        if (rsc_work_fetch[i].nidle_now > 0) {
+            return true;
+        }
+    }
+    return false;
+}
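
A worked toy example of the accounting in get_nidle(): nidle_now starts at
the instance count and is decremented by the usage of each nearly-runnable
job, clamping at zero. With made-up numbers (2 GPUs, two queued jobs using
1.0 and 0.5 instances):

    #include <cstdio>

    int main() {
        double nidle_now = 2.0;             // coprocs.coprocs[j].count
        double job_usage[] = {1.0, 0.5};    // avp->gpu_usage.usage per job
        for (double u : job_usage) {
            if (nidle_now) {
                nidle_now -= u;
                if (nidle_now <= 0) nidle_now = 0;
            }
        }
        // 0.5 instance still idle, so any_resource_idle() would return true
        printf("idle GPU instances: %.1f\n", nidle_now);
        return 0;
    }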

client/rr_sim.h
@@ -1,6 +1,6 @@
 // This file is part of BOINC.
 // http://boinc.berkeley.edu
-// Copyright (C) 2008 University of California
+// Copyright (C) 2018 University of California
 //
 // BOINC is free software; you can redistribute it and/or modify it
 // under the terms of the GNU Lesser General Public License
@@ -18,8 +18,9 @@
 #ifndef BOINC_RR_SIM_H
 #define BOINC_RR_SIM_H

 extern void rr_simulation();
 extern void print_deadline_misses();
+extern void get_nidle();
+extern bool any_resource_idle();

 #endif

client/scheduler_op.cpp
@@ -558,6 +558,7 @@ void SCHEDULER_REPLY::clear() {
     send_job_log = 0;
     scheduler_version = 0;
     got_rss_feeds = false;
+    too_recent = false;
 }

 SCHEDULER_REPLY::SCHEDULER_REPLY() {
@@ -834,6 +835,9 @@ int SCHEDULER_REPLY::parse(FILE* in, PROJECT* project) {
             file_deletes.push_back(delete_file_name);
         } else if (xp.parse_str("message", msg_buf, sizeof(msg_buf))) {
             parse_attr(attr_buf, "priority", pri_buf, sizeof(pri_buf));
+            if (strstr(msg_buf, "too recent")) {
+                too_recent = true;
+            }
             USER_MESSAGE um(msg_buf, pri_buf);
             messages.push_back(um);
             continue;
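
The scheduler reply has no structured flag for this case, so the client
matches the message text; the "too recent" substring is taken from the
diff, and the exact wording comes from the project's scheduler. In
isolation:

    #include <cstring>

    // true if the scheduler brushed off the request as too frequent
    bool reply_too_recent(const char* msg_buf) {
        return strstr(msg_buf, "too recent") != nullptr;
    }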

client/scheduler_op.h
@@ -134,6 +134,8 @@ struct SCHEDULER_REPLY {
     std::vector<std::string> trickle_up_urls;
     bool got_rss_feeds;
         // whether scheduler reply included <rss_feeds>
+    bool too_recent;
+        // whether reply included "too recent" message

     void clear();
     SCHEDULER_REPLY();