Debug and finish starvation prevention for dynamic AMs.

- Add an official notion of a dynamic AM.
  It is conveyed in the AM reply message and stored in the AM account file.
- Report accounting info only to dynamic AMs
- Fix bug where account info file wasn't written for AMs using authenticators
- When handling the GUI RPC that triggers an AM RPC, use the stored info for the current AM (e.g. whether it's dynamic)
This commit is contained in:
David Anderson 2018-04-08 19:46:39 -07:00
parent 99dd146bbb
commit 644f0ae6ba
3 changed files with 38 additions and 32 deletions

View File

@ -169,12 +169,6 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
" <detach_when_done>%d</detach_when_done>\n"
" <ended>%d</ended>\n"
" <resource_share>%f</resource_share>\n"
" <cpu_ec>%f</cpu_ec>\n"
" <cpu_time>%f</cpu_time>\n"
" <gpu_ec>%f</gpu_ec>\n"
" <gpu_time>%f</gpu_time>\n"
" <njobs_success>%d</njobs_success>\n"
" <njobs_error>%d</njobs_error>\n"
" <disk_usage>%f</disk_usage>\n"
" <disk_share>%f</disk_share>\n",
p->master_url,
@ -188,22 +182,28 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
p->detach_when_done?1:0,
p->ended?1:0,
p->resource_share,
p->cpu_ec,
p->cpu_time,
p->gpu_ec,
p->gpu_time,
p->njobs_success,
p->njobs_error,
p->disk_usage,
p->disk_share
);
// send starvation-related info
// send work and starvation-related info
//
if (ami.send_rec) {
if (ami.dynamic) {
fprintf(f,
" <nrpc_failures>%d</nrpc_failures>",
p->nrpc_failures
" <nrpc_failures>%d</nrpc_failures>"
" <cpu_ec>%f</cpu_ec>\n"
" <cpu_time>%f</cpu_time>\n"
" <gpu_ec>%f</gpu_ec>\n"
" <gpu_time>%f</gpu_time>\n"
" <njobs_success>%d</njobs_success>\n"
" <njobs_error>%d</njobs_error>\n",
p->nrpc_failures,
p->cpu_ec,
p->cpu_time,
p->gpu_ec,
p->gpu_time,
p->njobs_success,
p->njobs_error
);
for (int j=0; j<coprocs.n_rsc; j++) {
if (p->sched_req_no_work[j]) {
@ -387,6 +387,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
safe_strcpy(host_venue, "");
safe_strcpy(ami.opaque, "");
ami.no_project_notices = false;
ami.dynamic = false;
rss_feeds.clear();
if (!xp.parse_start("acct_mgr_reply")) return ERR_XML_PARSE;
while (!xp.get_tag()) {
@ -406,6 +407,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
if (xp.parse_string("error", error_str)) continue;
if (xp.parse_string("error_msg", error_str)) continue;
if (xp.parse_double("repeat_sec", repeat_sec)) continue;
if (xp.parse_bool("dynamic", ami.dynamic)) continue;
if (xp.parse_string("message", message)) {
msg_printf(NULL, MSG_INFO, "Account manager: %s", message.c_str());
continue;
@ -613,6 +615,7 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) {
safe_strcpy(gstate.acct_mgr_info.password_hash, ami.password_hash);
safe_strcpy(gstate.acct_mgr_info.authenticator, ami.authenticator);
gstate.acct_mgr_info.no_project_notices = ami.no_project_notices;
gstate.acct_mgr_info.dynamic = ami.dynamic;
// process projects
//
@ -840,7 +843,7 @@ int ACCT_MGR_INFO::write_info() {
fclose(f);
}
if (strlen(login_name)) {
if (strlen(login_name) || strlen(authenticator)) {
f = fopen(ACCT_MGR_LOGIN_FILENAME, "w");
if (!f) {
msg_printf(NULL, MSG_USER_ALERT,
@ -870,11 +873,13 @@ int ACCT_MGR_INFO::write_info() {
" <next_rpc_time>%f</next_rpc_time>\n"
" <opaque>\n%s\n"
" </opaque>\n"
" <no_project_notices>%d</no_project_notices>\n",
" <no_project_notices>%d</no_project_notices>\n"
" <dynamic>%d</dynamic>\n",
previous_host_cpid,
next_rpc_time,
opaque,
no_project_notices?1:0
no_project_notices?1:0,
dynamic?1:0
);
user_keywords.write(f);
fprintf(f,
@ -906,6 +911,7 @@ void ACCT_MGR_INFO::clear() {
first_starved = 0;
starved_rpc_backoff = 0;
starved_rpc_min_time = 0;
dynamic = false;
}
ACCT_MGR_INFO::ACCT_MGR_INFO() {
@ -944,6 +950,7 @@ int ACCT_MGR_INFO::parse_login_file(FILE* p) {
continue;
}
else if (xp.parse_bool("no_project_notices", no_project_notices)) continue;
else if (xp.parse_bool("dynamic", dynamic)) continue;
else if (xp.match_tag("user_keywords")) {
retval = user_keywords.parse(xp);
if (retval) {
@ -1056,7 +1063,7 @@ bool ACCT_MGR_INFO::poll() {
// if not dynamic AM, we're done
//
if (!send_rec) {
if (!dynamic) {
return false;
}

View File

@ -72,8 +72,10 @@ struct ACCT_MGR_INFO : PROJ_AM {
// what login name and password they have been assigned
bool password_error;
bool send_rec;
// send REC in AM RPCs
bool dynamic;
// This AM dynamically decides what projects to assign.
// - send EC in AM RPCs
// - send starvation info if idle resources
USER_KEYWORDS user_keywords;
// user's yes/no keywords.
// These are conveyed to projects in scheduler requests

View File

@ -945,6 +945,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
bool use_config_file = false;
bool bad_arg = false;
bool url_found=false, name_found=false, password_found = false;
ACCT_MGR_INFO ami;
while (!grc.xp.get_tag()) {
if (grc.xp.parse_string("url", url)) {
@ -970,10 +971,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
"Not using account manager"
);
} else {
url = gstate.acct_mgr_info.master_url;
name = gstate.acct_mgr_info.login_name;
password_hash = gstate.acct_mgr_info.password_hash;
authenticator = gstate.acct_mgr_info.authenticator;
ami = gstate.acct_mgr_info;
}
} else {
bad_arg = !url_found || !name_found || !password_found;
@ -986,7 +984,11 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
// Remove 'hash:'
password_hash = password.substr(5);
}
}
safe_strcpy(ami.master_url, url.c_str());
safe_strcpy(ami.login_name, name.c_str());
safe_strcpy(ami.password_hash, password_hash.c_str());
safe_strcpy(ami.authenticator, authenticator.c_str());
}
}
if (bad_arg) {
@ -997,11 +999,6 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
){
grc.mfout.printf("<error>attached to a different AM - detach first</error>\n");
} else {
ACCT_MGR_INFO ami;
safe_strcpy(ami.master_url, url.c_str());
safe_strcpy(ami.login_name, name.c_str());
safe_strcpy(ami.password_hash, password_hash.c_str());
safe_strcpy(ami.authenticator, authenticator.c_str());
gstate.acct_mgr_op.do_rpc(ami, true);
grc.mfout.printf("<success/>\n");
}