Merge pull request #2460 from BOINC/dpa_am

client: if using a dynamic AM and have idle resources, tell the AM
Kevin Reed, 2018-04-13 17:36:57 -05:00 (committed by GitHub)
commit f65dc21d9d
10 changed files with 192 additions and 30 deletions
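
Overview: the client now recognizes "dynamic" account managers, i.e. AMs that
decide on their own which projects to assign to the client. The AM opts in via
its RPC reply, and the client then includes per-project credit/job statistics
and starvation info in later AM RPCs. A sketch of the exchange, with
illustrative values (the field set is taken from the diffs below; the
surrounding elements are abbreviated):

    AM reply:
        <acct_mgr_reply>
            ...
            <dynamic>1</dynamic>
        </acct_mgr_reply>

    Later AM request, per attached project (here with an idle NVIDIA GPU):
        <project>
            ...
            <nrpc_failures>0</nrpc_failures>
            <cpu_ec>12345.6</cpu_ec>
            <cpu_time>67890.1</cpu_time>
            <gpu_ec>0.0</gpu_ec>
            <gpu_time>0.0</gpu_time>
            <njobs_success>42</njobs_success>
            <njobs_error>1</njobs_error>
            <sched_req_no_work>NVIDIA</sched_req_no_work>
        </project>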

client/acct_mgr.cpp

@@ -169,12 +169,6 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
" <detach_when_done>%d</detach_when_done>\n"
" <ended>%d</ended>\n"
" <resource_share>%f</resource_share>\n"
" <cpu_ec>%f</cpu_ec>\n"
" <cpu_time>%f</cpu_time>\n"
" <gpu_ec>%f</gpu_ec>\n"
" <gpu_time>%f</gpu_time>\n"
" <njobs_success>%d</njobs_success>\n"
" <njobs_error>%d</njobs_error>\n"
" <disk_usage>%f</disk_usage>\n"
" <disk_share>%f</disk_share>\n",
p->master_url,
@@ -188,15 +182,39 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
p->detach_when_done?1:0,
p->ended?1:0,
p->resource_share,
p->cpu_ec,
p->cpu_time,
p->gpu_ec,
p->gpu_time,
p->njobs_success,
p->njobs_error,
p->disk_usage,
p->disk_share
);
// send work- and starvation-related info
//
if (ami.dynamic) {
fprintf(f,
" <nrpc_failures>%d</nrpc_failures>\n"
" <cpu_ec>%f</cpu_ec>\n"
" <cpu_time>%f</cpu_time>\n"
" <gpu_ec>%f</gpu_ec>\n"
" <gpu_time>%f</gpu_time>\n"
" <njobs_success>%d</njobs_success>\n"
" <njobs_error>%d</njobs_error>\n",
p->nrpc_failures,
p->cpu_ec,
p->cpu_time,
p->gpu_ec,
p->gpu_time,
p->njobs_success,
p->njobs_error
);
for (int j=0; j<coprocs.n_rsc; j++) {
if (p->sched_req_no_work[j]) {
fprintf(f,
" <sched_req_no_work>%s</sched_req_no_work>\n",
coprocs.coprocs[j].type
);
}
}
}
if (p->attached_via_acct_mgr) {
fprintf(f,
" <account_key>%s</account_key>\n",
@@ -369,6 +387,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
safe_strcpy(host_venue, "");
safe_strcpy(ami.opaque, "");
ami.no_project_notices = false;
ami.dynamic = false;
rss_feeds.clear();
if (!xp.parse_start("acct_mgr_reply")) return ERR_XML_PARSE;
while (!xp.get_tag()) {
@@ -388,6 +407,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
if (xp.parse_string("error", error_str)) continue;
if (xp.parse_string("error_msg", error_str)) continue;
if (xp.parse_double("repeat_sec", repeat_sec)) continue;
if (xp.parse_bool("dynamic", ami.dynamic)) continue;
if (xp.parse_string("message", message)) {
msg_printf(NULL, MSG_INFO, "Account manager: %s", message.c_str());
continue;
@@ -595,6 +615,7 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) {
safe_strcpy(gstate.acct_mgr_info.password_hash, ami.password_hash);
safe_strcpy(gstate.acct_mgr_info.authenticator, ami.authenticator);
gstate.acct_mgr_info.no_project_notices = ami.no_project_notices;
gstate.acct_mgr_info.dynamic = ami.dynamic;
// process projects
//
@@ -822,7 +843,7 @@ int ACCT_MGR_INFO::write_info() {
fclose(f);
}
if (strlen(login_name)) {
if (strlen(login_name) || strlen(authenticator)) {
f = fopen(ACCT_MGR_LOGIN_FILENAME, "w");
if (!f) {
msg_printf(NULL, MSG_USER_ALERT,
@@ -852,11 +873,13 @@ int ACCT_MGR_INFO::write_info() {
" <next_rpc_time>%f</next_rpc_time>\n"
" <opaque>\n%s\n"
" </opaque>\n"
" <no_project_notices>%d</no_project_notices>\n",
" <no_project_notices>%d</no_project_notices>\n"
" <dynamic>%d</dynamic>\n",
previous_host_cpid,
next_rpc_time,
opaque,
no_project_notices?1:0
no_project_notices?1:0,
dynamic?1:0
);
user_keywords.write(f);
fprintf(f,
@@ -885,6 +908,10 @@ void ACCT_MGR_INFO::clear() {
no_project_notices = false;
cookie_required = false;
user_keywords.clear();
first_starved = 0;
starved_rpc_backoff = 0;
starved_rpc_min_time = 0;
dynamic = false;
}
ACCT_MGR_INFO::ACCT_MGR_INFO() {
@@ -923,6 +950,7 @@ int ACCT_MGR_INFO::parse_login_file(FILE* p) {
continue;
}
else if (xp.parse_bool("no_project_notices", no_project_notices)) continue;
else if (xp.parse_bool("dynamic", dynamic)) continue;
else if (xp.match_tag("user_keywords")) {
retval = user_keywords.parse(xp);
if (retval) {
@@ -1013,6 +1041,11 @@ int ACCT_MGR_INFO::init() {
return 0;
}
#define STARVED_RPC_DELAY 600
// do RPC after this much starvation
// called once a second
//
bool ACCT_MGR_INFO::poll() {
if (!using_am()) return false;
if (gstate.acct_mgr_op.gui_http->is_busy()) {
@@ -1027,5 +1060,47 @@ bool ACCT_MGR_INFO::poll() {
gstate.acct_mgr_op.do_rpc(*this, false);
return true;
}
// if not dynamic AM, we're done
//
if (!dynamic) {
return false;
}
// See if some resource is starved with the current set of projects,
// and if so possibly do a "starved" RPC asking for different projects
// do this check once a minute
//
static int idle_timer = 0;
if (++idle_timer < 60) {
return false;
}
idle_timer = 0;
get_nidle();
if (any_resource_idle()) {
if (first_starved == 0) {
first_starved = gstate.now;
starved_rpc_backoff = STARVED_RPC_DELAY;
starved_rpc_min_time = gstate.now + STARVED_RPC_DELAY;
} else {
if (gstate.now < starved_rpc_min_time) {
return false;
}
msg_printf(NULL, MSG_INFO,
"Some devices idle - requesting new projects from %s",
gstate.acct_mgr_info.project_name
);
gstate.acct_mgr_op.do_rpc(*this, false);
starved_rpc_backoff *= 2;
if (starved_rpc_backoff > 86400) {
starved_rpc_backoff = 86400;
}
starved_rpc_min_time = gstate.now + starved_rpc_backoff;
return true;
}
} else {
first_starved = 0;
}
return false;
}
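
Taken together, poll() gives starved RPCs a capped exponential backoff.
Assuming a GPU goes idle at time T and stays idle, and each RPC still yields
no work, the timeline looks like this (times are approximate, since the
starvation check runs once a minute):

    T            idle detected; first_starved = T, backoff = 600 s
    T +  600 s   starved RPC #1; backoff -> 1200 s
    T + 1800 s   starved RPC #2; backoff -> 2400 s
    T + 4200 s   starved RPC #3; backoff -> 4800 s
    ...          backoff is capped at 86400 s, so the steady state is
                 one starved RPC per day

As soon as no resource is idle, first_starved is reset, and the schedule
starts over at 600 s the next time starvation is detected.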

client/acct_mgr.h

@@ -61,7 +61,7 @@ struct ACCT_MGR_INFO : PROJ_AM {
bool no_project_notices;
// if set, don't show notices from projects
// TODO: get rid of the following
// TODO: get rid of the following here and in the manager
bool cookie_required;
// use of cookies is required during initial signup
// NOTE: This bool gets dropped after the client has
@@ -72,9 +72,21 @@ struct ACCT_MGR_INFO : PROJ_AM {
// what login name and password they have been assigned
bool password_error;
bool send_rec;
// send REC in AM RPCs
bool dynamic;
// This AM dynamically decides what projects to assign.
// - send EC in AM RPCs
// - send starvation info if resources are idle
USER_KEYWORDS user_keywords;
// user's yes/no keywords.
// These are conveyed to projects in scheduler requests
// vars related to starvation prevention,
// where we issue a "starved RPC" if a resource has been idle
// for more than 10 min
double first_starved; // start of starvation interval
double starved_rpc_backoff; // interval between starved RPCs
double starved_rpc_min_time; // earliest time to do a starved RPC
inline bool using_am() {
if (!strlen(master_url)) return false;

client/cs_scheduler.cpp

@@ -932,8 +932,10 @@ int CLIENT_STATE::handle_scheduler_reply(
workunits.push_back(wup);
}
double est_rsc_runtime[MAX_RSC];
bool got_work_for_rsc[MAX_RSC];
for (int j=0; j<coprocs.n_rsc; j++) {
est_rsc_runtime[j] = 0;
got_work_for_rsc[j] = false;
}
for (i=0; i<sr.results.size(); i++) {
RESULT* rp2 = lookup_result(project, sr.results[i].name);
@@ -981,9 +983,11 @@ int CLIENT_STATE::handle_scheduler_reply(
rp->abort_inactive(EXIT_MISSING_COPROC);
} else {
rp->set_state(RESULT_NEW, "handle_scheduler_reply");
got_work_for_rsc[0] = true;
int rt = rp->avp->gpu_usage.rsc_type;
if (rt > 0) {
est_rsc_runtime[rt] += rp->estimated_runtime();
got_work_for_rsc[rt] = true;
gpus_usable = true;
// trigger a check of whether GPU is actually usable
} else {
@@ -995,6 +999,21 @@ int CLIENT_STATE::handle_scheduler_reply(
new_results.push_back(rp);
results.push_back(rp);
}
// find the resources for which we requested work and didn't get any
// This is currently used for the AM starvation mechanism.
//
if (!sr.too_recent) {
for (int j=0; j<coprocs.n_rsc; j++) {
RSC_WORK_FETCH& rwf = rsc_work_fetch[j];
if (got_work_for_rsc[j]) {
project->sched_req_no_work[j] = false;
} else if (rwf.req_secs>0 || rwf.req_instances>0) {
project->sched_req_no_work[j] = true;
}
}
}
sort_results();
if (log_flags.sched_op_debug) {
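
The update rule for sched_req_no_work, applied per resource j whenever the
reply was not a "too recent" brush-off (rwf = rsc_work_fetch[j]):

    got work for j                                  -> flag cleared
    no work, but some was requested
      (rwf.req_secs > 0 || rwf.req_instances > 0)   -> flag set
    no work, and none was requested                 -> flag unchanged

Leaving the flag unchanged in the last case matters: a request that didn't
ask for resource j says nothing about whether j is starved.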

client/gui_rpc_server_ops.cpp

@@ -945,6 +945,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
bool use_config_file = false;
bool bad_arg = false;
bool url_found=false, name_found=false, password_found = false;
ACCT_MGR_INFO ami;
while (!grc.xp.get_tag()) {
if (grc.xp.parse_string("url", url)) {
@@ -970,10 +971,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
"Not using account manager"
);
} else {
url = gstate.acct_mgr_info.master_url;
name = gstate.acct_mgr_info.login_name;
password_hash = gstate.acct_mgr_info.password_hash;
authenticator = gstate.acct_mgr_info.authenticator;
ami = gstate.acct_mgr_info;
}
} else {
bad_arg = !url_found || !name_found || !password_found;
@@ -986,7 +984,11 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
// Remove 'hash:'
password_hash = password.substr(5);
}
}
safe_strcpy(ami.master_url, url.c_str());
safe_strcpy(ami.login_name, name.c_str());
safe_strcpy(ami.password_hash, password_hash.c_str());
safe_strcpy(ami.authenticator, authenticator.c_str());
}
}
if (bad_arg) {
@@ -997,11 +999,6 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
){
grc.mfout.printf("<error>attached to a different AM - detach first</error>\n");
} else {
ACCT_MGR_INFO ami;
safe_strcpy(ami.master_url, url.c_str());
safe_strcpy(ami.login_name, name.c_str());
safe_strcpy(ami.password_hash, password_hash.c_str());
safe_strcpy(ami.authenticator, authenticator.c_str());
gstate.acct_mgr_op.do_rpc(ami, true);
grc.mfout.printf("<success/>\n");
}
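
The refactor above replaces four field-by-field copies with a whole-struct
copy, so existing state in gstate.acct_mgr_info (including the new dynamic
flag) rides along when the GUI re-attaches using the stored credentials. The
GUI RPC request that takes this path looks like:

    <acct_mgr_rpc>
        <use_config_file/>
    </acct_mgr_rpc>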

client/project.cpp

@@ -53,6 +53,7 @@ void PROJECT::init() {
no_rsc_config[i] = false;
no_rsc_apps[i] = false;
no_rsc_ams[i] = false;
sched_req_no_work[i] = false;
}
safe_strcpy(host_venue, "");
using_venue_specific_prefs = false;

client/project.h

@@ -176,6 +176,9 @@ struct PROJECT : PROJ_AM {
// This provides only the illusion of security.
bool use_symlinks;
bool report_results_immediately;
bool sched_req_no_work[MAX_RSC];
// the last sched request asked for work for resource i
// and didn't get any
// items sent in scheduler replies,
// requesting that various things be sent in subsequent requests

client/rr_sim.cpp

@@ -558,3 +558,51 @@ void rr_simulation() {
RR_SIM rr_sim;
rr_sim.simulate();
}
// Compute the number of idle instances of each resource
// Put results in global state (rsc_work_fetch)
//
void get_nidle() {
int nidle_rsc = coprocs.n_rsc;
// CPU instances start out idle too; the loop below handles GPUs
rsc_work_fetch[0].nidle_now = gstate.ncpus;
for (int i=1; i<coprocs.n_rsc; i++) {
rsc_work_fetch[i].nidle_now = coprocs.coprocs[i].count;
}
for (unsigned int i=0; i<gstate.results.size(); i++) {
RESULT* rp = gstate.results[i];
if (!rp->nearly_runnable()) continue;
if (rp->some_download_stalled()) continue;
APP_VERSION* avp = rp->avp;
if (rsc_work_fetch[0].nidle_now) {
rsc_work_fetch[0].nidle_now -= avp->avg_ncpus;
if (rsc_work_fetch[0].nidle_now <= 0) {
nidle_rsc--;
rsc_work_fetch[0].nidle_now = 0;
}
}
int j = avp->gpu_usage.rsc_type;
if (!j) {
continue;
}
if (rsc_work_fetch[j].nidle_now) {
rsc_work_fetch[j].nidle_now -= avp->gpu_usage.usage;
if (rsc_work_fetch[j].nidle_now <= 0) {
nidle_rsc--;
rsc_work_fetch[j].nidle_now = 0;
}
}
if (nidle_rsc == 0) {
// no idle resources - no need to look further
//
break;
}
}
}
bool any_resource_idle() {
for (int i=1; i<coprocs.n_rsc; i++) {
if (rsc_work_fetch[i].nidle_now > 0) {
return true;
}
}
return false;
}
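
A worked example of the bookkeeping above: assume a host with 4 CPUs and one
NVIDIA GPU, and two nearly-runnable results that each use 1 CPU and no GPU.

    initial:   nidle_now = 4 (CPU), 1 (GPU); nidle_rsc = 2
    result 1:  CPU nidle_now 4 -> 3
    result 2:  CPU nidle_now 3 -> 2
    final:     2 CPU instances and the whole GPU are idle

any_resource_idle() scans only the coprocessors (its loop starts at i=1),
finds the GPU's nidle_now > 0, and returns true, so a dynamic AM gets a
starved RPC even though the idle CPUs alone would not trigger one.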

client/rr_sim.h

@@ -1,6 +1,6 @@
// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
// Copyright (C) 2018 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
@@ -18,8 +18,9 @@
#ifndef BOINC_RR_SIM_H
#define BOINC_RR_SIM_H
extern void rr_simulation();
extern void print_deadline_misses();
extern void get_nidle();
extern bool any_resource_idle();
#endif

client/scheduler_op.cpp

@@ -558,6 +558,7 @@ void SCHEDULER_REPLY::clear() {
send_job_log = 0;
scheduler_version = 0;
got_rss_feeds = false;
too_recent = false;
}
SCHEDULER_REPLY::SCHEDULER_REPLY() {
@@ -834,6 +835,9 @@ int SCHEDULER_REPLY::parse(FILE* in, PROJECT* project) {
file_deletes.push_back(delete_file_name);
} else if (xp.parse_str("message", msg_buf, sizeof(msg_buf))) {
parse_attr(attr_buf, "priority", pri_buf, sizeof(pri_buf));
if (strstr(msg_buf, "too recent")) {
too_recent = true;
}
USER_MESSAGE um(msg_buf, pri_buf);
messages.push_back(um);
continue;
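
Note that too_recent is derived by a substring match on the scheduler's
human-readable message rather than by a dedicated XML tag. Assuming the stock
scheduler wording, the trigger looks like:

    <message priority="low">Not sending work - last request too recent: 42 sec</message>

The usual hazard of matching on display strings applies: a wording change on
the server side would silently disable this detection.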

client/scheduler_op.h

@@ -134,6 +134,8 @@ struct SCHEDULER_REPLY {
std::vector<std::string> trickle_up_urls;
bool got_rss_feeds;
// whether scheduler reply included <rss_feeds>
bool too_recent;
// whether reply included "too recent" message
void clear();
SCHEDULER_REPLY();