From 99dd146bbba204a18bb79a8a30c1f7a58cea3e1a Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Sat, 7 Apr 2018 15:42:19 -0700
Subject: [PATCH 1/5] client: if using a dynamic AM and have idle resources, tell the AM

See issue #2452.
If using a dynamic AM, the client may starve, e.g. because the assigned
projects are down.  Detect this, and send "starvation RPCs" to the AM
giving details about the situation.  But don't do this too often; use
exponential backoff.
---
 client/acct_mgr.cpp     | 64 +++++++++++++++++++++++++++++++++++++++++
 client/acct_mgr.h       | 12 +++++++-
 client/cs_scheduler.cpp | 18 ++++++++++++
 client/project.cpp      |  1 +
 client/project.h        |  3 ++
 client/rr_sim.cpp       | 48 +++++++++++++++++++++++++++++++
 client/rr_sim.h         |  5 ++--
 client/scheduler_op.cpp |  4 +++
 client/scheduler_op.h   |  2 ++
 9 files changed, 154 insertions(+), 3 deletions(-)

diff --git a/client/acct_mgr.cpp b/client/acct_mgr.cpp
index d708bcf60c..e84275ba06 100644
--- a/client/acct_mgr.cpp
+++ b/client/acct_mgr.cpp
@@ -197,6 +197,24 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
             p->disk_usage,
             p->disk_share
         );
+
+        // send starvation-related info
+        //
+        if (ami.send_rec) {
+            fprintf(f,
+                " %d",
+                p->nrpc_failures
+            );
+            for (int j=0; j<coprocs.n_rsc; j++) {
+                if (p->sched_req_no_work[j]) {
+                    fprintf(f,
+                        " %s\n",
+                        coprocs.coprocs[j].type
+                    );
+                }
+            }
+        }
+
         if (p->attached_via_acct_mgr) {
             fprintf(f,
                 " %s\n",
@@ -885,6 +903,9 @@ void ACCT_MGR_INFO::clear() {
     no_project_notices = false;
     cookie_required = false;
     user_keywords.clear();
+    first_starved = 0;
+    starved_rpc_backoff = 0;
+    starved_rpc_min_time = 0;
 }
 
 ACCT_MGR_INFO::ACCT_MGR_INFO() {
@@ -1013,6 +1034,11 @@ int ACCT_MGR_INFO::init() {
     return 0;
 }
 
+#define STARVED_RPC_DELAY 600
+    // do RPC after this much starvation
+
+// called once a second
+//
 bool ACCT_MGR_INFO::poll() {
     if (!using_am()) return false;
     if (gstate.acct_mgr_op.gui_http->is_busy()) {
@@ -1027,5 +1053,43 @@ bool ACCT_MGR_INFO::poll() {
         gstate.acct_mgr_op.do_rpc(*this, false);
         return true;
     }
+
+    // if not dynamic AM, we're done
+    //
+    if (!send_rec) {
+        return false;
+    }
+
+    // See if some resource is starved with the current set of projects,
+    // and if so possibly do a "starved" RPC asking for different projects
+
+    // do this check once a minute
+    //
+    static int idle_timer = 0;
+    if (++idle_timer < 60) {
+        return false;
+    }
+    idle_timer = 0;
+    get_nidle();
+    if (any_resource_idle()) {
+        if (first_starved == 0) {
+            first_starved = gstate.now;
+            starved_rpc_backoff = STARVED_RPC_DELAY;
+            starved_rpc_min_time = gstate.now + STARVED_RPC_DELAY;
+        } else {
+            if (gstate.now < starved_rpc_min_time) {
+                return false;
+            }
+            gstate.acct_mgr_op.do_rpc(*this, false);
+            starved_rpc_backoff *= 2;
+            if (starved_rpc_backoff > 86400) {
+                starved_rpc_backoff = 86400;
+            }
+            starved_rpc_min_time = gstate.now + starved_rpc_backoff;
+            return true;
+        }
+    } else {
+        first_starved = 0;
+    }
     return false;
 }
diff --git a/client/acct_mgr.h b/client/acct_mgr.h
index ba2638e646..ddbe22ffb9 100644
--- a/client/acct_mgr.h
+++ b/client/acct_mgr.h
@@ -61,7 +61,7 @@ struct ACCT_MGR_INFO : PROJ_AM {
     bool no_project_notices;
         // if set, don't show notices from projects
 
-    // TODO: get rid of the following
+    // TODO: get rid of the following here and in the manager
     bool cookie_required;
        // use of cookies are required during initial signup
        // NOTE: This bool gets dropped after the client has
@@ -75,6 +75,16 @@ struct ACCT_MGR_INFO : PROJ_AM {
     bool send_rec;
        // send REC in AM RPCs
     USER_KEYWORDS user_keywords;
+        // user's yes/no keywords.
+        // These are conveyed to projects in scheduler requests
+
+    // vars related to starvation prevention,
+    // where we issue a "starved RPC" if a resource has been idle
+    // for more than 10 min
+
+    double first_starved;           // start of starvation interval
+    double starved_rpc_backoff;     // interval between starved RPCs
+    double starved_rpc_min_time;    // earliest time to do a starved RPC
 
     inline bool using_am() {
         if (!strlen(master_url)) return false;
diff --git a/client/cs_scheduler.cpp b/client/cs_scheduler.cpp
index 750bb887fc..eed5079365 100644
--- a/client/cs_scheduler.cpp
+++ b/client/cs_scheduler.cpp
@@ -932,8 +932,10 @@ int CLIENT_STATE::handle_scheduler_reply(
         workunits.push_back(wup);
     }
     double est_rsc_runtime[MAX_RSC];
+    bool got_work_for_rsc[MAX_RSC];
     for (int j=0; jabort_inactive(EXIT_MISSING_COPROC);
             } else {
                 rp->set_state(RESULT_NEW, "handle_scheduler_reply");
+                got_work_for_rsc[0] = true;
                 int rt = rp->avp->gpu_usage.rsc_type;
                 if (rt > 0) {
                     est_rsc_runtime[rt] += rp->estimated_runtime();
+                    got_work_for_rsc[rt] = true;
                     gpus_usable = true;
                     // trigger a check of whether GPU is actually usable
                 } else {
@@ -995,6 +999,20 @@ int CLIENT_STATE::handle_scheduler_reply(
         new_results.push_back(rp);
         results.push_back(rp);
     }
+
+    // find the resources for which we requested work and didn't get any
+    // This is currently used for AM starvation mechanism.
+    //
+    if (!sr.too_recent) {
+        for (int j=0; j<coprocs.n_rsc; j++) {
+            RSC_WORK_FETCH& rwf = rsc_work_fetch[j];
+            project->sched_req_no_work[j] =
+                (rwf.req_secs>0 || rwf.req_instances>0)
+                && !got_work_for_rsc[j]
+            ;
+        }
+    }
+
     sort_results();
 
     if (log_flags.sched_op_debug) {
diff --git a/client/project.cpp b/client/project.cpp
index eab070619c..3059a479e2 100644
--- a/client/project.cpp
+++ b/client/project.cpp
@@ -53,6 +53,7 @@ void PROJECT::init() {
         no_rsc_config[i] = false;
         no_rsc_apps[i] = false;
         no_rsc_ams[i] = false;
+        sched_req_no_work[i] = false;
     }
     safe_strcpy(host_venue, "");
     using_venue_specific_prefs = false;
diff --git a/client/project.h b/client/project.h
index 27428aac7f..8f863bc88d 100644
--- a/client/project.h
+++ b/client/project.h
@@ -176,6 +176,9 @@ struct PROJECT : PROJ_AM {
         // This provides only the illusion of security.
     bool use_symlinks;
     bool report_results_immediately;
+    bool sched_req_no_work[MAX_RSC];
+        // the last sched request asked for work for resource i
+        // and didn't get any
 
     // items sent in scheduler replies,
     // requesting that various things be sent subsequent requests
diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp
index 71242fe597..b4eebeb420 100644
--- a/client/rr_sim.cpp
+++ b/client/rr_sim.cpp
@@ -558,3 +558,51 @@ void rr_simulation() {
     RR_SIM rr_sim;
     rr_sim.simulate();
 }
+
+// Compute the number of idle instances of each resource
+// Put results in global state (rsc_work_fetch)
+//
+void get_nidle() {
+    int nidle_rsc = coprocs.n_rsc;
+    for (int i=1; inearly_runnable()) continue;
+        if (rp->some_download_stalled()) continue;
+        APP_VERSION* avp = rp->avp;
+        if (rsc_work_fetch[0].nidle_now) {
+            rsc_work_fetch[0].nidle_now -= avp->avg_ncpus;
+            if (rsc_work_fetch[0].nidle_now <= 0) {
+                nidle_rsc--;
+                rsc_work_fetch[0].nidle_now = 0;
+            }
+        }
+        int j = avp->gpu_usage.rsc_type;
+        if (!j) {
+            continue;
+        }
+        if (rsc_work_fetch[j].nidle_now) {
+            rsc_work_fetch[j].nidle_now -= avp->gpu_usage.usage;
+            if (rsc_work_fetch[j].nidle_now <= 0) {
+                nidle_rsc--;
+                rsc_work_fetch[j].nidle_now = 0;
+            }
+        }
+        if (nidle_rsc == 0) {
+            // no idle resources - no need to look further
+            //
+            break;
+        }
+    }
+}
+
+bool any_resource_idle() {
+    for (int i=1; i<coprocs.n_rsc; i++) {
+        if (rsc_work_fetch[i].nidle_now > 0) {
+            return true;
+        }
+    }
+    return false;
+}
diff --git a/client/rr_sim.h b/client/rr_sim.h
index 18d7c2785b..d8c6c1901d 100644
--- a/client/rr_sim.h
+++ b/client/rr_sim.h
@@ -1,6 +1,6 @@
 // This file is part of BOINC.
 // http://boinc.berkeley.edu
-// Copyright (C) 2008 University of California
+// Copyright (C) 2018 University of California
 //
 // BOINC is free software; you can redistribute it and/or modify it
 // under the terms of the GNU Lesser General Public License
@@ -18,8 +18,9 @@
 #ifndef BOINC_RR_SIM_H
 #define BOINC_RR_SIM_H
 
-
 extern void rr_simulation();
 extern void print_deadline_misses();
+extern void get_nidle();
+extern bool any_resource_idle();
 
 #endif
diff --git a/client/scheduler_op.cpp b/client/scheduler_op.cpp
index afc9ba31b9..f8db720ad8 100644
--- a/client/scheduler_op.cpp
+++ b/client/scheduler_op.cpp
@@ -558,6 +558,7 @@ void SCHEDULER_REPLY::clear() {
     send_job_log = 0;
     scheduler_version = 0;
     got_rss_feeds = false;
+    too_recent = false;
 }
 
 SCHEDULER_REPLY::SCHEDULER_REPLY() {
@@ -834,6 +835,9 @@ int SCHEDULER_REPLY::parse(FILE* in, PROJECT* project) {
             file_deletes.push_back(delete_file_name);
         } else if (xp.parse_str("message", msg_buf, sizeof(msg_buf))) {
             parse_attr(attr_buf, "priority", pri_buf, sizeof(pri_buf));
+            if (strstr(msg_buf, "too recent")) {
+                too_recent = true;
+            }
             USER_MESSAGE um(msg_buf, pri_buf);
             messages.push_back(um);
             continue;
diff --git a/client/scheduler_op.h b/client/scheduler_op.h
index 7125ba3c78..576a556b82 100644
--- a/client/scheduler_op.h
+++ b/client/scheduler_op.h
@@ -134,6 +134,8 @@ struct SCHEDULER_REPLY {
     std::vector<std::string> trickle_up_urls;
     bool got_rss_feeds;
         // whether scheduler reply included
+    bool too_recent;
+        // whether reply included "too recent" message
 
     void clear();
     SCHEDULER_REPLY();

From 644f0ae6ba05f506aeaf83a75b01cac471bdfc79 Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Sun, 8 Apr 2018 19:46:39 -0700
Subject: [PATCH 2/5] Debug and finish starvation prevention for dynamic AMs.

- Add an official notion of dynamic AM.
  Conveyed in AM reply message, stored in AM account file.
- Report accounting info only to dynamic AMs
- Fix bug where account info file wasn't written for AMs using authenticators
- In handling GUI RPC to do AM RPC, use info for current AM, e.g. if it's dynamic
---
 client/acct_mgr.cpp           | 47 ++++++++++++++++++++---------------
 client/acct_mgr.h             |  6 +++--
 client/gui_rpc_server_ops.cpp | 17 ++++++-------
 3 files changed, 38 insertions(+), 32 deletions(-)

diff --git a/client/acct_mgr.cpp b/client/acct_mgr.cpp
index e84275ba06..a4d2192d61 100644
--- a/client/acct_mgr.cpp
+++ b/client/acct_mgr.cpp
@@ -169,12 +169,6 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
             " %d\n"
             " %d\n"
             " %f\n"
-            " %f\n"
-            " %f\n"
-            " %f\n"
-            " %f\n"
-            " %d\n"
-            " %d\n"
             " %f\n"
             " %f\n",
             p->master_url,
@@ -188,22 +182,28 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
             p->detach_when_done?1:0,
             p->ended?1:0,
             p->resource_share,
-            p->cpu_ec,
-            p->cpu_time,
-            p->gpu_ec,
-            p->gpu_time,
-            p->njobs_success,
-            p->njobs_error,
             p->disk_usage,
             p->disk_share
         );
 
-        // send starvation-related info
+        // send work and starvation-related info
         //
-        if (ami.send_rec) {
+        if (ami.dynamic) {
             fprintf(f,
-                " %d",
-                p->nrpc_failures
+                " %d"
+                " %f\n"
+                " %f\n"
+                " %f\n"
+                " %f\n"
+                " %d\n"
+                " %d\n",
+                p->nrpc_failures,
+                p->cpu_ec,
+                p->cpu_time,
+                p->gpu_ec,
+                p->gpu_time,
+                p->njobs_success,
+                p->njobs_error
             );
             for (int j=0; j<coprocs.n_rsc; j++) {
                 if (p->sched_req_no_work[j]) {
@@ -387,6 +387,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
     safe_strcpy(host_venue, "");
     safe_strcpy(ami.opaque, "");
     ami.no_project_notices = false;
+    ami.dynamic = false;
     rss_feeds.clear();
     if (!xp.parse_start("acct_mgr_reply")) return ERR_XML_PARSE;
     while (!xp.get_tag()) {
@@ -406,6 +407,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
         if (xp.parse_string("error", error_str)) continue;
         if (xp.parse_string("error_msg", error_str)) continue;
         if (xp.parse_double("repeat_sec", repeat_sec)) continue;
+        if (xp.parse_bool("dynamic", ami.dynamic)) continue;
         if (xp.parse_string("message", message)) {
             msg_printf(NULL, MSG_INFO, "Account manager: %s", message.c_str());
             continue;
@@ -613,6 +615,7 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) {
         safe_strcpy(gstate.acct_mgr_info.password_hash, ami.password_hash);
         safe_strcpy(gstate.acct_mgr_info.authenticator, ami.authenticator);
         gstate.acct_mgr_info.no_project_notices = ami.no_project_notices;
+        gstate.acct_mgr_info.dynamic = ami.dynamic;
 
     // process projects
     //
@@ -840,7 +843,7 @@ int ACCT_MGR_INFO::write_info() {
         fclose(f);
     }
 
-    if (strlen(login_name)) {
+    if (strlen(login_name) || strlen(authenticator)) {
         f = fopen(ACCT_MGR_LOGIN_FILENAME, "w");
         if (!f) {
             msg_printf(NULL, MSG_USER_ALERT,
@@ -870,11 +873,13 @@ int ACCT_MGR_INFO::write_info() {
             " %f\n"
             " \n%s\n"
             " \n"
-            " %d\n",
+            " %d\n"
+            " %d\n",
             previous_host_cpid,
             next_rpc_time,
             opaque,
-            no_project_notices?1:0
+            no_project_notices?1:0,
+            dynamic?1:0
         );
         user_keywords.write(f);
         fprintf(f,
@@ -906,6 +911,7 @@ void ACCT_MGR_INFO::clear() {
     first_starved = 0;
     starved_rpc_backoff = 0;
     starved_rpc_min_time = 0;
+    dynamic = false;
 }
 
 ACCT_MGR_INFO::ACCT_MGR_INFO() {
@@ -944,6 +950,7 @@ int ACCT_MGR_INFO::parse_login_file(FILE* p) {
             continue;
         } else if (xp.parse_bool("no_project_notices", no_project_notices)) continue;
+        else if (xp.parse_bool("dynamic", dynamic)) continue;
         else if (xp.match_tag("user_keywords")) {
             retval = user_keywords.parse(xp);
             if (retval) {
@@ -1056,7 +1063,7 @@ bool ACCT_MGR_INFO::poll() {
 
     // if not dynamic AM, we're done
     //
-    if (!send_rec) {
+    if (!dynamic) {
         return false;
     }
 
diff --git a/client/acct_mgr.h b/client/acct_mgr.h
index ddbe22ffb9..ab4ce99a21 100644
--- a/client/acct_mgr.h
+++ b/client/acct_mgr.h
@@ -72,8 +72,10 @@ struct ACCT_MGR_INFO : PROJ_AM {
         // what login name and password they have been assigned
     bool password_error;
-    bool send_rec;
-        // send REC in AM RPCs
+    bool dynamic;
+        // This AM dynamically decides what projects to assign.
+        // - send EC in AM RPCs
+        // - send starvation info if idle resources
     USER_KEYWORDS user_keywords;
         // user's yes/no keywords.
         // These are conveyed to projects in scheduler requests
diff --git a/client/gui_rpc_server_ops.cpp b/client/gui_rpc_server_ops.cpp
index a950cccf16..5f1318a1e3 100644
--- a/client/gui_rpc_server_ops.cpp
+++ b/client/gui_rpc_server_ops.cpp
@@ -945,6 +945,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
     bool use_config_file = false;
     bool bad_arg = false;
     bool url_found=false, name_found=false, password_found = false;
+    ACCT_MGR_INFO ami;
 
     while (!grc.xp.get_tag()) {
         if (grc.xp.parse_string("url", url)) {
@@ -970,10 +971,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
                 "Not using account manager"
            );
        } else {
-            url = gstate.acct_mgr_info.master_url;
-            name = gstate.acct_mgr_info.login_name;
-            password_hash = gstate.acct_mgr_info.password_hash;
-            authenticator = gstate.acct_mgr_info.authenticator;
+            ami = gstate.acct_mgr_info;
        }
    } else {
        bad_arg = !url_found || !name_found || !password_found;
@@ -986,7 +984,11 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
                // Remove 'hash:'
                password_hash = password.substr(5);
            }
-        }
+            safe_strcpy(ami.master_url, url.c_str());
+            safe_strcpy(ami.login_name, name.c_str());
+            safe_strcpy(ami.password_hash, password_hash.c_str());
+            safe_strcpy(ami.authenticator, authenticator.c_str());
+        }
    }
 
    if (bad_arg) {
@@ -997,11 +999,6 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
        ){
            grc.mfout.printf("attached to a different AM - detach first\n");
        } else {
-            ACCT_MGR_INFO ami;
-            safe_strcpy(ami.master_url, url.c_str());
-            safe_strcpy(ami.login_name, name.c_str());
-            safe_strcpy(ami.password_hash, password_hash.c_str());
-            safe_strcpy(ami.authenticator, authenticator.c_str());
            gstate.acct_mgr_op.do_rpc(ami, true);
            grc.mfout.printf("\n");
        }

From f38a5d67e5cabbd28e69d8b929c82887c6818b25 Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Mon, 9 Apr 2018 11:47:42 -0700
Subject: [PATCH 3/5] client: fix logic for assigning sched_req_no_work: if we didn't ask for work, don't change it

---
 client/acct_mgr.cpp     | 2 +-
 client/cs_scheduler.cpp | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/client/acct_mgr.cpp b/client/acct_mgr.cpp
index a4d2192d61..45227454d8 100644
--- a/client/acct_mgr.cpp
+++ b/client/acct_mgr.cpp
@@ -190,7 +190,7 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
         //
         if (ami.dynamic) {
             fprintf(f,
-                " %d"
+                " %d\n"
                 " %f\n"
                 " %f\n"
                 " %f\n"
diff --git a/client/cs_scheduler.cpp b/client/cs_scheduler.cpp
index eed5079365..10cc724217 100644
--- a/client/cs_scheduler.cpp
+++ b/client/cs_scheduler.cpp
@@ -1006,9 +1006,10 @@ int CLIENT_STATE::handle_scheduler_reply(
     if (!sr.too_recent) {
         for (int j=0; j<coprocs.n_rsc; j++) {
             RSC_WORK_FETCH& rwf = rsc_work_fetch[j];
-            project->sched_req_no_work[j] =
-                (rwf.req_secs>0 || rwf.req_instances>0)
-                && !got_work_for_rsc[j]
+            if (got_work_for_rsc[j]) {
+                project->sched_req_no_work[j] = false;
+            } else if (rwf.req_secs>0 || rwf.req_instances>0) {
+                project->sched_req_no_work[j] = true;
             ;
         }
     }

From f2c89d0c4a06cd87f57971933ae990ae08149d61 Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Mon, 9 Apr 2018 11:50:34 -0700
Subject: [PATCH 4/5] fix typo

---
 client/cs_scheduler.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/client/cs_scheduler.cpp b/client/cs_scheduler.cpp
index 10cc724217..f44037e71b 100644
--- a/client/cs_scheduler.cpp
+++ b/client/cs_scheduler.cpp
@@ -1010,7 +1010,7 @@ int CLIENT_STATE::handle_scheduler_reply(
                 project->sched_req_no_work[j] = false;
             } else if (rwf.req_secs>0 || rwf.req_instances>0) {
                 project->sched_req_no_work[j] = true;
-            ;
+            }
         }
     }
 

From 2eb2b63b25df66dee365c41e9c74f4d13e7785d0 Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Mon, 9 Apr 2018 18:14:53 -0700
Subject: [PATCH 5/5] client: print message if do starvation AM RPC

---
 client/acct_mgr.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/client/acct_mgr.cpp b/client/acct_mgr.cpp
index 45227454d8..511f0c64f3 100644
--- a/client/acct_mgr.cpp
+++ b/client/acct_mgr.cpp
@@ -1087,6 +1087,10 @@ bool ACCT_MGR_INFO::poll() {
             if (gstate.now < starved_rpc_min_time) {
                 return false;
             }
+            msg_printf(NULL, MSG_INFO,
+                "Some devices idle - requesting new projects from %s",
+                gstate.acct_mgr_info.project_name
+            );
             gstate.acct_mgr_op.do_rpc(*this, false);
             starved_rpc_backoff *= 2;
             if (starved_rpc_backoff > 86400) {