diff --git a/client/acct_mgr.cpp b/client/acct_mgr.cpp
index d708bcf60c..511f0c64f3 100644
--- a/client/acct_mgr.cpp
+++ b/client/acct_mgr.cpp
@@ -169,12 +169,6 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
" %d\n"
" %d\n"
" %f\n"
- " %f\n"
- " %f\n"
- " %f\n"
- " %f\n"
- " %d\n"
- " %d\n"
" %f\n"
" %f\n",
p->master_url,
@@ -188,15 +182,39 @@ int ACCT_MGR_OP::do_rpc(ACCT_MGR_INFO& _ami, bool _via_gui) {
p->detach_when_done?1:0,
p->ended?1:0,
p->resource_share,
- p->cpu_ec,
- p->cpu_time,
- p->gpu_ec,
- p->gpu_time,
- p->njobs_success,
- p->njobs_error,
p->disk_usage,
p->disk_share
);
+
+ // send work and starvation-related info
+ //
+ if (ami.dynamic) {
+ fprintf(f,
+ " %d\n"
+ " %f\n"
+ " %f\n"
+ " %f\n"
+ " %f\n"
+ " %d\n"
+ " %d\n",
+ p->nrpc_failures,
+ p->cpu_ec,
+ p->cpu_time,
+ p->gpu_ec,
+ p->gpu_time,
+ p->njobs_success,
+ p->njobs_error
+ );
+ for (int j=0; j<coprocs.n_rsc; j++) {
+     if (p->sched_req_no_work[j]) {
+         fprintf(f,
+             "      <sched_req_no_work>%s</sched_req_no_work>\n",
+             coprocs.coprocs[j].type
+         );
+ }
+ }
+ }
+
if (p->attached_via_acct_mgr) {
fprintf(f,
" %s\n",
@@ -369,6 +387,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
safe_strcpy(host_venue, "");
safe_strcpy(ami.opaque, "");
ami.no_project_notices = false;
+ ami.dynamic = false;
rss_feeds.clear();
if (!xp.parse_start("acct_mgr_reply")) return ERR_XML_PARSE;
while (!xp.get_tag()) {
@@ -388,6 +407,7 @@ int ACCT_MGR_OP::parse(FILE* f) {
if (xp.parse_string("error", error_str)) continue;
if (xp.parse_string("error_msg", error_str)) continue;
if (xp.parse_double("repeat_sec", repeat_sec)) continue;
+ if (xp.parse_bool("dynamic", ami.dynamic)) continue;
if (xp.parse_string("message", message)) {
msg_printf(NULL, MSG_INFO, "Account manager: %s", message.c_str());
continue;
@@ -595,6 +615,7 @@ void ACCT_MGR_OP::handle_reply(int http_op_retval) {
safe_strcpy(gstate.acct_mgr_info.password_hash, ami.password_hash);
safe_strcpy(gstate.acct_mgr_info.authenticator, ami.authenticator);
gstate.acct_mgr_info.no_project_notices = ami.no_project_notices;
+ gstate.acct_mgr_info.dynamic = ami.dynamic;
// process projects
//
@@ -822,7 +843,7 @@ int ACCT_MGR_INFO::write_info() {
fclose(f);
}
- if (strlen(login_name)) {
+ if (strlen(login_name) || strlen(authenticator)) {
f = fopen(ACCT_MGR_LOGIN_FILENAME, "w");
if (!f) {
msg_printf(NULL, MSG_USER_ALERT,
@@ -852,11 +873,13 @@ int ACCT_MGR_INFO::write_info() {
" %f\n"
" \n%s\n"
" \n"
- " %d\n",
+ " %d\n"
+ " %d\n",
previous_host_cpid,
next_rpc_time,
opaque,
- no_project_notices?1:0
+ no_project_notices?1:0,
+ dynamic?1:0
);
user_keywords.write(f);
fprintf(f,
@@ -885,6 +908,10 @@ void ACCT_MGR_INFO::clear() {
no_project_notices = false;
cookie_required = false;
user_keywords.clear();
+ first_starved = 0;
+ starved_rpc_backoff = 0;
+ starved_rpc_min_time = 0;
+ dynamic = false;
}
ACCT_MGR_INFO::ACCT_MGR_INFO() {
@@ -923,6 +950,7 @@ int ACCT_MGR_INFO::parse_login_file(FILE* p) {
continue;
}
else if (xp.parse_bool("no_project_notices", no_project_notices)) continue;
+ else if (xp.parse_bool("dynamic", dynamic)) continue;
else if (xp.match_tag("user_keywords")) {
retval = user_keywords.parse(xp);
if (retval) {
@@ -1013,6 +1041,11 @@ int ACCT_MGR_INFO::init() {
return 0;
}
+#define STARVED_RPC_DELAY 600
+ // do RPC after this much starvation
+
+// called once a second
+//
bool ACCT_MGR_INFO::poll() {
if (!using_am()) return false;
if (gstate.acct_mgr_op.gui_http->is_busy()) {
@@ -1027,5 +1060,47 @@ bool ACCT_MGR_INFO::poll() {
gstate.acct_mgr_op.do_rpc(*this, false);
return true;
}
+
+ // if not dynamic AM, we're done
+ //
+ if (!dynamic) {
+ return false;
+ }
+
+ // See if some resource is starved with the current set of projects,
+ // and if so possibly do a "starved" RPC asking for different projects
+
+ // do this check once a minute
+ //
+ static int idle_timer = 0;
+ if (++idle_timer < 60) {
+ return false;
+ }
+ idle_timer = 0;
+ get_nidle();
+ if (any_resource_idle()) {
+ if (first_starved == 0) {
+ first_starved = gstate.now;
+ starved_rpc_backoff = STARVED_RPC_DELAY;
+ starved_rpc_min_time = gstate.now + STARVED_RPC_DELAY;
+ } else {
+ if (gstate.now < starved_rpc_min_time) {
+ return false;
+ }
+ msg_printf(NULL, MSG_INFO,
+ "Some devices idle - requesting new projects from %s",
+ gstate.acct_mgr_info.project_name
+ );
+ gstate.acct_mgr_op.do_rpc(*this, false);
+ starved_rpc_backoff *= 2;
+ if (starved_rpc_backoff > 86400) {
+ starved_rpc_backoff = 86400;
+ }
+ starved_rpc_min_time = gstate.now + starved_rpc_backoff;
+ return true;
+ }
+ } else {
+ first_starved = 0;
+ }
return false;
}
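
A note on the schedule this poll() logic produces: starvation is checked once a minute, the first starved RPC fires STARVED_RPC_DELAY (600 sec) after starvation begins, and starved_rpc_backoff then doubles per RPC up to a one-day cap. A minimal standalone sketch of the resulting timeline (plain C++, not BOINC code; the constants are copied from the patch, everything else is invented for illustration):

    #include <algorithm>
    #include <cstdio>

    int main() {
        const double delay = 600;      // STARVED_RPC_DELAY from the patch
        double backoff = delay;
        double t = delay;              // first starved RPC, ~10 min in
        for (int i = 0; i < 8; i++) {
            printf("starved RPC #%d at t = %.0f sec\n", i + 1, t);
            backoff = std::min(backoff * 2, 86400.0);   // double, cap at one day
            t += backoff;
        }
        return 0;
    }

This yields RPCs at roughly 10, 30, 70, 150 minutes and so on, assuming the resource stays idle and each RPC fails to bring it work.
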
diff --git a/client/acct_mgr.h b/client/acct_mgr.h
index ba2638e646..ab4ce99a21 100644
--- a/client/acct_mgr.h
+++ b/client/acct_mgr.h
@@ -61,7 +61,7 @@ struct ACCT_MGR_INFO : PROJ_AM {
bool no_project_notices;
// if set, don't show notices from projects
- // TODO: get rid of the following
+ // TODO: get rid of the following here and in the manager
bool cookie_required;
// use of cookies are required during initial signup
// NOTE: This bool gets dropped after the client has
@@ -72,9 +72,21 @@ struct ACCT_MGR_INFO : PROJ_AM {
// what login name and password they have been assigned
bool password_error;
- bool send_rec;
- // send REC in AM RPCs
+ bool dynamic;
+ // This AM dynamically decides what projects to assign.
+ // - send EC in AM RPCs
+ // - send starvation info if idle resources
USER_KEYWORDS user_keywords;
+ // user's yes/no keywords.
+ // These are conveyed to projects in scheduler requests
+
+ // vars related to starvation prevention,
+ // where we issue a "starved RPC" if a resource has been idle
+ // for more than 10 min
+
+ double first_starved; // start of starvation interval
+ double starved_rpc_backoff; // interval between starved RPCs
+ double starved_rpc_min_time; // earliest time to do a starved RPC
inline bool using_am() {
if (!strlen(master_url)) return false;
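
The three starved_* fields above are runtime-only state: clear() resets them and write_info() does not persist them. The dynamic flag, by contrast, is saved to the account manager login file and re-read by parse_login_file(). A sketch of the resulting file fragment, with illustrative values and tag names as written by write_info() above:

    <acct_mgr_login>
        ...
        <next_rpc_time>1700000000.000000</next_rpc_time>
        <opaque>
        ...
        </opaque>
        <no_project_notices>0</no_project_notices>
        <dynamic>1</dynamic>
        ...
    </acct_mgr_login>
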
diff --git a/client/cs_scheduler.cpp b/client/cs_scheduler.cpp
index 750bb887fc..f44037e71b 100644
--- a/client/cs_scheduler.cpp
+++ b/client/cs_scheduler.cpp
@@ -932,8 +932,10 @@ int CLIENT_STATE::handle_scheduler_reply(
workunits.push_back(wup);
}
double est_rsc_runtime[MAX_RSC];
+ bool got_work_for_rsc[MAX_RSC];
for (int j=0; j<coprocs.n_rsc; j++) {
    est_rsc_runtime[j] = 0;
+     got_work_for_rsc[j] = false;
}
@@ ... @@ int CLIENT_STATE::handle_scheduler_reply(
rp->abort_inactive(EXIT_MISSING_COPROC);
} else {
rp->set_state(RESULT_NEW, "handle_scheduler_reply");
+ got_work_for_rsc[0] = true;
int rt = rp->avp->gpu_usage.rsc_type;
if (rt > 0) {
est_rsc_runtime[rt] += rp->estimated_runtime();
+ got_work_for_rsc[rt] = true;
gpus_usable = true;
// trigger a check of whether GPU is actually usable
} else {
@@ -995,6 +999,21 @@ int CLIENT_STATE::handle_scheduler_reply(
new_results.push_back(rp);
results.push_back(rp);
}
+
+ // find the resources for which we requested work and didn't get any
+ // This is currently used for the AM starvation mechanism.
+ //
+ if (!sr.too_recent) {
+ for (int j=0; j<coprocs.n_rsc; j++) {
+     RSC_WORK_FETCH& rwf = rsc_work_fetch[j];
+     if (got_work_for_rsc[j]) {
+         project->sched_req_no_work[j] = false;
+ } else if (rwf.req_secs>0 || rwf.req_instances>0) {
+ project->sched_req_no_work[j] = true;
+ }
+ }
+ }
+
sort_results();
if (log_flags.sched_op_debug) {
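
The per-resource bookkeeping added above reduces to a small decision rule; here it is as a hypothetical helper (a sketch, not part of the patch — the name is invented, the semantics mirror the loop in handle_scheduler_reply()). The subtle point is the missing else branch: when no work was requested for a resource, the flag is left alone, so it keeps the verdict from the last request that actually asked:

    // req_secs/req_instances mirror RSC_WORK_FETCH::req_secs/req_instances
    static void update_sched_req_no_work(
        bool& flag, bool got_work, double req_secs, double req_instances
    ) {
        if (got_work) {
            flag = false;                       // got work for this resource
        } else if (req_secs > 0 || req_instances > 0) {
            flag = true;                        // asked for work, got none
        }
        // else: didn't ask; leave the flag unchanged
    }

The whole update is also skipped when the reply was "too recent", since in that case getting no work says nothing about work availability.
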
diff --git a/client/gui_rpc_server_ops.cpp b/client/gui_rpc_server_ops.cpp
index a950cccf16..5f1318a1e3 100644
--- a/client/gui_rpc_server_ops.cpp
+++ b/client/gui_rpc_server_ops.cpp
@@ -945,6 +945,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
bool use_config_file = false;
bool bad_arg = false;
bool url_found=false, name_found=false, password_found = false;
+ ACCT_MGR_INFO ami;
while (!grc.xp.get_tag()) {
if (grc.xp.parse_string("url", url)) {
@@ -970,10 +971,7 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
"Not using account manager"
);
} else {
- url = gstate.acct_mgr_info.master_url;
- name = gstate.acct_mgr_info.login_name;
- password_hash = gstate.acct_mgr_info.password_hash;
- authenticator = gstate.acct_mgr_info.authenticator;
+ ami = gstate.acct_mgr_info;
}
} else {
bad_arg = !url_found || !name_found || !password_found;
@@ -986,7 +984,11 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
// Remove 'hash:'
password_hash = password.substr(5);
}
- }
+ safe_strcpy(ami.master_url, url.c_str());
+ safe_strcpy(ami.login_name, name.c_str());
+ safe_strcpy(ami.password_hash, password_hash.c_str());
+ safe_strcpy(ami.authenticator, authenticator.c_str());
+ }
}
if (bad_arg) {
@@ -997,11 +999,6 @@ static void handle_acct_mgr_rpc(GUI_RPC_CONN& grc) {
){
grc.mfout.printf("attached to a different AM - detach first\n");
} else {
- ACCT_MGR_INFO ami;
- safe_strcpy(ami.master_url, url.c_str());
- safe_strcpy(ami.login_name, name.c_str());
- safe_strcpy(ami.password_hash, password_hash.c_str());
- safe_strcpy(ami.authenticator, authenticator.c_str());
gstate.acct_mgr_op.do_rpc(ami, true);
grc.mfout.printf("\n");
}
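
The refactor above changes the use_config_file path to copy the entire saved ACCT_MGR_INFO rather than four individual strings, so a re-attach over GUI RPC preserves fields such as dynamic instead of starting from a default-constructed struct. For reference, a sketch of the request this handler parses — the url/name/password tags match the parse calls, while the wrapper element and the example values are assumptions:

    <acct_mgr_rpc>
       <url>https://am.example.com/</url>
       <name>alice</name>
       <password>hash:0123456789abcdef</password>
    </acct_mgr_rpc>

The hash: prefix marks an already-hashed password, which the handler strips (the substr(5) above).
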
diff --git a/client/project.cpp b/client/project.cpp
index eab070619c..3059a479e2 100644
--- a/client/project.cpp
+++ b/client/project.cpp
@@ -53,6 +53,7 @@ void PROJECT::init() {
no_rsc_config[i] = false;
no_rsc_apps[i] = false;
no_rsc_ams[i] = false;
+ sched_req_no_work[i] = false;
}
safe_strcpy(host_venue, "");
using_venue_specific_prefs = false;
diff --git a/client/project.h b/client/project.h
index 27428aac7f..8f863bc88d 100644
--- a/client/project.h
+++ b/client/project.h
@@ -176,6 +176,9 @@ struct PROJECT : PROJ_AM {
// This provides only the illusion of security.
bool use_symlinks;
bool report_results_immediately;
+ bool sched_req_no_work[MAX_RSC];
+ // the last sched request asked for work for resource i
+ // and didn't get any
// items sent in scheduler replies,
// requesting that various things be sent subsequent requests
diff --git a/client/rr_sim.cpp b/client/rr_sim.cpp
index 71242fe597..b4eebeb420 100644
--- a/client/rr_sim.cpp
+++ b/client/rr_sim.cpp
@@ -558,3 +558,51 @@ void rr_simulation() {
RR_SIM rr_sim;
rr_sim.simulate();
}
+
+// Compute the number of idle instances of each resource
+// Put results in global state (rsc_work_fetch)
+//
+void get_nidle() {
+ int nidle_rsc = coprocs.n_rsc;
+ for (int i=1; i<coprocs.n_rsc; i++) {
+     rsc_work_fetch[i].nidle_now = coprocs.coprocs[i].count;
+ }
+ rsc_work_fetch[0].nidle_now = gstate.n_usable_cpus;
+ for (RESULT* rp: gstate.results) {
+     if (!rp->nearly_runnable()) continue;
+ if (rp->some_download_stalled()) continue;
+ APP_VERSION* avp = rp->avp;
+ if (rsc_work_fetch[0].nidle_now) {
+ rsc_work_fetch[0].nidle_now -= avp->avg_ncpus;
+ if (rsc_work_fetch[0].nidle_now <= 0) {
+ nidle_rsc--;
+ rsc_work_fetch[0].nidle_now = 0;
+ }
+ }
+ int j = avp->gpu_usage.rsc_type;
+ if (!j) {
+ continue;
+ }
+ if (rsc_work_fetch[j].nidle_now) {
+ rsc_work_fetch[j].nidle_now -= avp->gpu_usage.usage;
+ if (rsc_work_fetch[j].nidle_now <= 0) {
+ nidle_rsc--;
+ rsc_work_fetch[j].nidle_now = 0;
+ }
+ }
+ if (nidle_rsc == 0) {
+ // no idle resources - no need to look further
+ //
+ break;
+ }
+ }
+}
+
+bool any_resource_idle() {
+ for (int i=1; i<coprocs.n_rsc; i++) {
+     if (rsc_work_fetch[i].nidle_now > 0) {
+ return true;
+ }
+ }
+ return false;
+}
diff --git a/client/rr_sim.h b/client/rr_sim.h
index 18d7c2785b..d8c6c1901d 100644
--- a/client/rr_sim.h
+++ b/client/rr_sim.h
@@ -1,6 +1,6 @@
// This file is part of BOINC.
// http://boinc.berkeley.edu
-// Copyright (C) 2008 University of California
+// Copyright (C) 2018 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
@@ -18,8 +18,9 @@
#ifndef BOINC_RR_SIM_H
#define BOINC_RR_SIM_H
-
extern void rr_simulation();
extern void print_deadline_misses();
+extern void get_nidle();
+extern bool any_resource_idle();
#endif
diff --git a/client/scheduler_op.cpp b/client/scheduler_op.cpp
index afc9ba31b9..f8db720ad8 100644
--- a/client/scheduler_op.cpp
+++ b/client/scheduler_op.cpp
@@ -558,6 +558,7 @@ void SCHEDULER_REPLY::clear() {
send_job_log = 0;
scheduler_version = 0;
got_rss_feeds = false;
+ too_recent = false;
}
SCHEDULER_REPLY::SCHEDULER_REPLY() {
@@ -834,6 +835,9 @@ int SCHEDULER_REPLY::parse(FILE* in, PROJECT* project) {
file_deletes.push_back(delete_file_name);
} else if (xp.parse_str("message", msg_buf, sizeof(msg_buf))) {
parse_attr(attr_buf, "priority", pri_buf, sizeof(pri_buf));
+ if (strstr(msg_buf, "too recent")) {
+ too_recent = true;
+ }
USER_MESSAGE um(msg_buf, pri_buf);
messages.push_back(um);
continue;
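
The detection above is a substring match, so any scheduler message containing "too recent" sets the flag. In practice this is the scheduler's minimum-RPC-delay refusal; the wording below is an assumption from memory of the server code, and the match only needs the quoted phrase:

    <message priority="low">Not sending work - last request too recent: 42 sec</message>
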
diff --git a/client/scheduler_op.h b/client/scheduler_op.h
index 7125ba3c78..576a556b82 100644
--- a/client/scheduler_op.h
+++ b/client/scheduler_op.h
@@ -134,6 +134,8 @@ struct SCHEDULER_REPLY {
std::vector<std::string> trickle_up_urls;
bool got_rss_feeds;
// whether scheduler reply included <rss_feeds>
+ bool too_recent;
+ // whether reply included "too recent" message
void clear();
SCHEDULER_REPLY();