diff --git a/client/acct_mgr.cpp b/client/acct_mgr.cpp
index d708bcf60c..511f0c64f3 100644
--- a/client/acct_mgr.cpp
+++ b/client/acct_mgr.cpp
@@ -169,12 +169,6 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
             "      <detach_when_done>%d</detach_when_done>\n"
             "      <ended>%d</ended>\n"
             "      <resource_share>%f</resource_share>\n"
-            "      <cpu_ec>%f</cpu_ec>\n"
-            "      <cpu_time>%f</cpu_time>\n"
-            "      <gpu_ec>%f</gpu_ec>\n"
-            "      <gpu_time>%f</gpu_time>\n"
-            "      <njobs_success>%d</njobs_success>\n"
-            "      <njobs_error>%d</njobs_error>\n"
             "      <disk_usage>%f</disk_usage>\n"
             "      <disk_share>%f</disk_share>\n",
             p->master_url,
@@ -188,15 +182,39 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
             p->detach_when_done?1:0,
             p->ended?1:0,
             p->resource_share,
-            p->cpu_ec,
-            p->cpu_time,
-            p->gpu_ec,
-            p->gpu_time,
-            p->njobs_success,
-            p->njobs_error,
             p->disk_usage,
             p->disk_share
         );
+
+        // send work and starvation-related info
+        //
+        if (ami.dynamic) {
+            fprintf(f,
+                "      <nrpc_failures>%d</nrpc_failures>\n"
+                "      <cpu_ec>%f</cpu_ec>\n"
+                "      <cpu_time>%f</cpu_time>\n"
+                "      <gpu_ec>%f</gpu_ec>\n"
+                "      <gpu_time>%f</gpu_time>\n"
+                "      <njobs_success>%d</njobs_success>\n"
+                "      <njobs_error>%d</njobs_error>\n",
+                p->nrpc_failures,
+                p->cpu_ec,
+                p->cpu_time,
+                p->gpu_ec,
+                p->gpu_time,
+                p->njobs_success,
+                p->njobs_error
+            );
+            for (int j=0; j<coprocs.n_rsc; j++) {
+                if (p->sched_req_no_work[j]) {
+                    fprintf(f,
+                        "      <sched_req_no_work>%s</sched_req_no_work>\n",
+                        coprocs.coprocs[j].type
+                    );
+                }
+            }
+        }
+
         if (p->attached_via_acct_mgr) {
             fprintf(f,
                 "      <account_key>%s</account_key>\n",
@@ -369,6 +387,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
     safe_strcpy(host_venue, "");
     safe_strcpy(ami.opaque, "");
     ami.no_project_notices = false;
+    ami.dynamic = false;
     rss_feeds.clear();
     if (!xp.parse_start("acct_mgr_reply")) return ERR_XML_PARSE;
     while (!xp.get_tag()) {
@@ -388,6 +407,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
         if (xp.parse_string("error", error_str)) continue;
         if (xp.parse_string("error_msg", error_str)) continue;
         if (xp.parse_double("repeat_sec", repeat_sec)) continue;
+        if (xp.parse_bool("dynamic", ami.dynamic)) continue;
         if (xp.parse_string("message", message)) {
             msg_printf(NULL, MSG_INFO, "Account manager: %s", message.c_str());
             continue;
@@ -595,6 +615,7 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) {
     safe_strcpy(gstate.acct_mgr_info.password_hash, ami.password_hash);
     safe_strcpy(gstate.acct_mgr_info.authenticator, ami.authenticator);
     gstate.acct_mgr_info.no_project_notices = ami.no_project_notices;
+    gstate.acct_mgr_info.dynamic = ami.dynamic;
 
     // process projects
     //
@@ -822,7 +843,7 @@ int ACCT_MGR_INFO::write_info() {
         fclose(f);
     }
 
-    if (strlen(login_name)) {
+    if (strlen(login_name) || strlen(authenticator)) {
         f = fopen(ACCT_MGR_LOGIN_FILENAME, "w");
         if (!f) {
             msg_printf(NULL, MSG_USER_ALERT,
@@ -852,11 +873,13 @@
            "   <next_rpc_time>%f</next_rpc_time>\n"
            "   <opaque>\n%s\n"
            "   </opaque>\n"
-           "   <no_project_notices>%d</no_project_notices>\n",
+           "   <no_project_notices>%d</no_project_notices>\n"
+           "   <dynamic>%d</dynamic>\n",
            previous_host_cpid,
            next_rpc_time,
            opaque,
-           no_project_notices?1:0
+           no_project_notices?1:0,
+           dynamic?1:0
        );
        user_keywords.write(f);
        fprintf(f,
@@ -885,6 +908,10 @@ void ACCT_MGR_INFO::clear() {
     no_project_notices = false;
     cookie_required = false;
     user_keywords.clear();
+    first_starved = 0;
+    starved_rpc_backoff = 0;
+    starved_rpc_min_time = 0;
+    dynamic = false;
 }
 
 ACCT_MGR_INFO::ACCT_MGR_INFO() {
@@ -923,6 +950,7 @@ int ACCT_MGR_INFO::parse_login_file(FILE* p) {
             continue;
         }
         else if (xp.parse_bool("no_project_notices", no_project_notices)) continue;
+        else if (xp.parse_bool("dynamic", dynamic)) continue;
        else if (xp.match_tag("user_keywords")) {
            retval = user_keywords.parse(xp);
            if (retval) {
@@ -1013,6 +1041,11 @@ int ACCT_MGR_INFO::init() {
     return 0;
 }
 
+#define STARVED_RPC_DELAY 600
+    // do RPC after this much starvation
+
+// called once a second
+//
 bool ACCT_MGR_INFO::poll() {
     if (!using_am()) return false;
     if (gstate.acct_mgr_op.gui_http->is_busy()) {
@@ -1027,5 +1060,47 @@
         gstate.acct_mgr_op.do_rpc(*this, false);
         return true;
     }
+
+    // if not dynamic AM, we're done
+    //
+    if (!dynamic) {
+        return false;
+    }
+
+    // See if some resource is starved with the current set of projects,
+    // and if so possibly do a "starved" RPC asking for different projects
+
+    // do this check once a minute
+    //
+    static int idle_timer = 0;
+    if (++idle_timer < 60) {
+        return false;
+    }
+    idle_timer = 0;
+    get_nidle();
+    if (any_resource_idle()) {
+        if (first_starved == 0) {
+            first_starved = gstate.now;
+            starved_rpc_backoff = STARVED_RPC_DELAY;
+            starved_rpc_min_time = gstate.now + STARVED_RPC_DELAY;
+        } else {
+            if (gstate.now < starved_rpc_min_time) {
+                return false;
+            }
+            msg_printf(NULL, MSG_INFO,
+                "Some devices idle - requesting new projects from %s",
+                gstate.acct_mgr_info.project_name
+            );
+            gstate.acct_mgr_op.do_rpc(*this, false);
+            starved_rpc_backoff *= 2;
+            if (starved_rpc_backoff > 86400) {
+                starved_rpc_backoff = 86400;
+            }
+            starved_rpc_min_time = gstate.now + starved_rpc_backoff;
+            return true;
+        }
+    } else {
+        first_starved = 0;
+    }
     return false;
 }
diff --git a/client/acct_mgr.h b/client/acct_mgr.h
index ba2638e646..ab4ce99a21 100644
--- a/client/acct_mgr.h
+++ b/client/acct_mgr.h
@@ -61,7 +61,7 @@ struct ACCT_MGR_INFO : PROJ_AM {
     bool no_project_notices;
         // if set, don't show notices from projects
 
-    // TODO: get rid of the following
+    // TODO: get rid of the following here and in the manager
     bool cookie_required;
         // use of cookies are required during initial signup
         // NOTE: This bool gets dropped after the client has
@@ -72,9 +72,21 @@ struct ACCT_MGR_INFO : PROJ_AM {
         // what login name and password they have been assigned
     bool password_error;
 
-    bool send_rec;
-        // send REC in AM RPCs
+    bool dynamic;
+        // This AM dynamically decides what projects to assign.
+        // - send EC in AM RPCs
+        // - send starvation info if idle resources
     USER_KEYWORDS user_keywords;
+        // user's yes/no keywords.
+        // These are conveyed to projects in scheduler requests
+
+    // vars related to starvation prevention,
+    // where we issue a "starved RPC" if a resource has been idle
+    // for more than 10 min
+
+    double first_starved;           // start of starvation interval
+    double starved_rpc_backoff;     // interval between starved RPCs
+    double starved_rpc_min_time;    // earliest time to do a starved RPC
 
     inline bool using_am() {
         if (!strlen(master_url)) return false;
diff --git a/client/cs_scheduler.cpp b/client/cs_scheduler.cpp
index 750bb887fc..f44037e71b 100644
--- a/client/cs_scheduler.cpp
+++ b/client/cs_scheduler.cpp
@@ -932,8 +932,10 @@ int CLIENT_STATE::handle_scheduler_reply(
         workunits.push_back(wup);
     }
     double est_rsc_runtime[MAX_RSC];
+    bool got_work_for_rsc[MAX_RSC];
     for (int j=0; j<MAX_RSC; j++) {
         est_rsc_runtime[j] = 0;
+        got_work_for_rsc[j] = false;
     }
     for (i=0; i<sr.results.size(); i++) {
         RESULT* rp2 = lookup_result(project, sr.results[i].name);
@@ -981,9 +983,11 @@ int CLIENT_STATE::handle_scheduler_reply(
             rp->abort_inactive(EXIT_MISSING_COPROC);
         } else {
             rp->set_state(RESULT_NEW, "handle_scheduler_reply");
+            got_work_for_rsc[0] = true;
             int rt = rp->avp->gpu_usage.rsc_type;
             if (rt > 0) {
                 est_rsc_runtime[rt] += rp->estimated_runtime();
+                got_work_for_rsc[rt] = true;
                 gpus_usable = true;
                 // trigger a check of whether GPU is actually usable
             } else {
@@ -995,6 +999,21 @@ int CLIENT_STATE::handle_scheduler_reply(
         new_results.push_back(rp);
         results.push_back(rp);
     }
+
+    // find the resources for which we requested work and didn't get any
+    // This is currently used for AM starvation mechanism.
+    //
+    if (!sr.too_recent) {
+        for (int j=0; j<coprocs.n_rsc; j++) {
+            RSC_WORK_FETCH& rwf = rsc_work_fetch[j];
+            if (got_work_for_rsc[j]) {
+                project->sched_req_no_work[j] = false;
+            } else if (rwf.req_secs>0 || rwf.req_instances>0) {
+                project->sched_req_no_work[j] = true;
+            }
+        }
+    }
+
     sort_results();
 
     if (log_flags.sched_op_debug) {
diff --git a/client/gui_rpc_server_ops.cpp b/client/gui_rpc_server_ops.cpp
index a950cccf16..5f1318a1e3 100644
--- a/client/gui_rpc_server_ops.cpp
+++ b/client/gui_rpc_server_ops.cpp
@@ -945,6 +945,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
     bool use_config_file = false;
     bool bad_arg = false;
     bool url_found=false, name_found=false, password_found = false;
+    ACCT_MGR_INFO ami;
 
     while (!grc.xp.get_tag()) {
         if (grc.xp.parse_string("url", url)) {
@@ -970,10 +971,7 @@
                 "Not using account manager"
             );
         } else {
-            url = gstate.acct_mgr_info.master_url;
-            name = gstate.acct_mgr_info.login_name;
-            password_hash = gstate.acct_mgr_info.password_hash;
-            authenticator = gstate.acct_mgr_info.authenticator;
+            ami = gstate.acct_mgr_info;
         }
     } else {
         bad_arg = !url_found || !name_found || !password_found;
@@ -986,7 +984,11 @@
                 // Remove 'hash:'
                 password_hash = password.substr(5);
             }
-        }
+            safe_strcpy(ami.master_url, url.c_str());
+            safe_strcpy(ami.login_name, name.c_str());
+            safe_strcpy(ami.password_hash, password_hash.c_str());
+            safe_strcpy(ami.authenticator, authenticator.c_str());
+        }
     }
 
     if (bad_arg) {
@@ -997,11 +999,6 @@
     ){
         grc.mfout.printf("<error>attached to a different AM - detach first</error>\n");
     } else {
-        ACCT_MGR_INFO ami;
-        safe_strcpy(ami.master_url, url.c_str());
-        safe_strcpy(ami.login_name, name.c_str());
-        safe_strcpy(ami.password_hash, password_hash.c_str());
-        safe_strcpy(ami.authenticator, authenticator.c_str());
         gstate.acct_mgr_op.do_rpc(ami, true);
         grc.mfout.printf("<success/>\n");
     }
diff --git a/client/project.cpp b/client/project.cpp
index eab070619c..3059a479e2 100644
--- a/client/project.cpp
+++ b/client/project.cpp
@@ -53,6 +53,7 @@ void PROJECT::init() {
         no_rsc_config[i] = false;
         no_rsc_apps[i] = false;
         no_rsc_ams[i] = false;
+        sched_req_no_work[i] = false;
     }
     safe_strcpy(host_venue, "");
     using_venue_specific_prefs = false;
diff --git a/client/project.h b/client/project.h
index 27428aac7f..8f863bc88d 100644
--- a/client/project.h
+++ b/client/project.h
@@ -176,6 +176,9 @@ struct PROJECT : PROJ_AM {
         // This provides only the illusion of security.
     bool use_symlinks;
     bool report_results_immediately;
+    bool sched_req_no_work[MAX_RSC];
+        // the last sched request asked for work for resource i
+        // and didn't get any
 
     // items sent in scheduler replies,
     // requesting that various things be sent subsequent requests
diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp
index 71242fe597..b4eebeb420 100644
--- a/client/rr_sim.cpp
+++ b/client/rr_sim.cpp
@@ -558,3 +558,51 @@ void rr_simulation() {
     RR_SIM rr_sim;
     rr_sim.simulate();
 }
+
+// Compute the number of idle instances of each resource
+// Put results in global state (rsc_work_fetch)
+//
+void get_nidle() {
+    int nidle_rsc = coprocs.n_rsc;
+    for (int i=1; i<coprocs.n_rsc; i++) {
+        rsc_work_fetch[i].nidle_now = coprocs.coprocs[i].count;
+    }
+    rsc_work_fetch[0].nidle_now = gstate.ncpus;
+    for (RESULT* rp: gstate.results) {
+        if (!rp->nearly_runnable()) continue;
+        if (rp->some_download_stalled()) continue;
+        APP_VERSION* avp = rp->avp;
+        if (rsc_work_fetch[0].nidle_now) {
+            rsc_work_fetch[0].nidle_now -= avp->avg_ncpus;
+            if (rsc_work_fetch[0].nidle_now <= 0) {
+                nidle_rsc--;
+                rsc_work_fetch[0].nidle_now = 0;
+            }
+        }
+        int j = avp->gpu_usage.rsc_type;
+        if (!j) {
+            continue;
+        }
+        if (rsc_work_fetch[j].nidle_now) {
+            rsc_work_fetch[j].nidle_now -= avp->gpu_usage.usage;
+            if (rsc_work_fetch[j].nidle_now <= 0) {
+                nidle_rsc--;
+                rsc_work_fetch[j].nidle_now = 0;
+            }
+        }
+        if (nidle_rsc == 0) {
+            // no idle resources - no need to look further
+            //
+            break;
+        }
+    }
+}
+
+bool any_resource_idle() {
+    for (int i=1; i<coprocs.n_rsc; i++) {
+        if (rsc_work_fetch[i].nidle_now > 0) {
+            return true;
+        }
+    }
+    return false;
+}
diff --git a/client/rr_sim.h b/client/rr_sim.h
index 18d7c2785b..d8c6c1901d 100644
--- a/client/rr_sim.h
+++ b/client/rr_sim.h
@@ -1,6 +1,6 @@
 // This file is part of BOINC.
 // http://boinc.berkeley.edu
-// Copyright (C) 2008 University of California
+// Copyright (C) 2018 University of California
 //
 // BOINC is free software; you can redistribute it and/or modify it
 // under the terms of the GNU Lesser General Public License
@@ -18,8 +18,9 @@
 #ifndef BOINC_RR_SIM_H
 #define BOINC_RR_SIM_H
 
-
 extern void rr_simulation();
 extern void print_deadline_misses();
+extern void get_nidle();
+extern bool any_resource_idle();
 
 #endif
diff --git a/client/scheduler_op.cpp b/client/scheduler_op.cpp
index afc9ba31b9..f8db720ad8 100644
--- a/client/scheduler_op.cpp
+++ b/client/scheduler_op.cpp
@@ -558,6 +558,7 @@ void SCHEDULER_REPLY::clear() {
     send_job_log = 0;
     scheduler_version = 0;
     got_rss_feeds = false;
+    too_recent = false;
 }
 
 SCHEDULER_REPLY::SCHEDULER_REPLY() {
@@ -834,6 +835,9 @@ int SCHEDULER_REPLY::parse(FILE* in, PROJECT* project) {
             file_deletes.push_back(delete_file_name);
         } else if (xp.parse_str("message", msg_buf, sizeof(msg_buf))) {
             parse_attr(attr_buf, "priority", pri_buf, sizeof(pri_buf));
+            if (strstr(msg_buf, "too recent")) {
+                too_recent = true;
+            }
             USER_MESSAGE um(msg_buf, pri_buf);
             messages.push_back(um);
             continue;
diff --git a/client/scheduler_op.h b/client/scheduler_op.h
index 7125ba3c78..576a556b82 100644
--- a/client/scheduler_op.h
+++ b/client/scheduler_op.h
@@ -134,6 +134,8 @@ struct SCHEDULER_REPLY {
     std::vector<std::string> trickle_up_urls;
     bool got_rss_feeds;
         // whether scheduler reply included <rss_feeds>
+    bool too_recent;
+        // whether reply included "too recent" message
 
     void clear();
     SCHEDULER_REPLY();
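
A minimal standalone sketch of the starved-RPC backoff schedule implemented in ACCT_MGR_INFO::poll() above. The 600-second STARVED_RPC_DELAY and the 86400-second (one-day) cap are taken from the patch; the driver loop and printout are hypothetical scaffolding for checking the arithmetic, not part of the client.

#include <cstdio>

int main() {
    const double STARVED_RPC_DELAY = 600;   // from the patch: first RPC ~10 min into starvation
    const double BACKOFF_CAP = 86400;       // from the patch: back off to at most one day
    double backoff = STARVED_RPC_DELAY;     // models starved_rpc_backoff, set on first detection
    double t = STARVED_RPC_DELAY;           // models starved_rpc_min_time, relative to first_starved

    // Mirror poll(): after each starved RPC, double the backoff (capped)
    // and schedule the next RPC that far in the future.
    for (int i = 1; i <= 8; i++) {
        printf("starved RPC #%d at t=%.0f s\n", i, t);
        backoff *= 2;
        if (backoff > BACKOFF_CAP) backoff = BACKOFF_CAP;
        t += backoff;
    }
    return 0;
}

If a resource stays idle, starved RPCs fire at t = 600, 1800, 4200, 9000, ... seconds after starvation is first detected, converging to one RPC per day, which bounds the extra load a persistently starved client puts on the account manager.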