diff --git a/checkin_notes b/checkin_notes
index ae5992bdaf..af48c59bb8 100644
--- a/checkin_notes
+++ b/checkin_notes
@@ -2655,3 +2655,29 @@ David 4 Mar 2009
server_types.cpp,h
sched_send.cpp
sched_plan.cpp
+
+David 5 Mar 2009
+ - scheduler: add support for anonymous-platform coproc apps.
+ Old: although the request message contained all info
+ about the app version (flops, coproc usage etc.)
+ the server ignored this info,
+ and assumed that all anonymous platform apps were CPU apps.
+ With 6.6 client, this could produce infinite work fetch:
+ - client uses anon platform, has coproc app
+ - client has idle CPU, requests CPU work
+ - scheduler sends it jobs, thinking they will be done by CPU app
+ - client asks for more work etc.
+ New: scheduler parses full info on anon platform app versions:
+ plan class, FLOPS, coprocs.
+ It uses this info to make scheduling decisions;
+ in particular, if the request is for CUDA work,
+ it will only send jobs that use a CUDA app version.
+ The records it returns contain info
+ (plan_class) that tells the client which app_version to use.
+ This will work correctly even if the client has multiple app versions
+ for the same app (e.g., a CPU version and a GPU version)
+
+ sched/
+ handle_request.cpp
+ server_types.cpp,h
+ sched_send.cpp
diff --git a/sched/handle_request.cpp b/sched/handle_request.cpp
index 98fa11dda4..5eaa68aaa2 100644
--- a/sched/handle_request.cpp
+++ b/sched/handle_request.cpp
@@ -1446,7 +1446,8 @@ static void log_user_messages() {
for (unsigned int i=0; imessages.size(); i++) {
USER_MESSAGE um = g_reply->messages[i];
log_messages.printf(MSG_DEBUG,
- "[HOST#%d] MSG(%4s) %s \n", g_reply->host.id, um.priority.c_str(), um.message.c_str()
+ "[HOST#%d] MSG(%4s) %s\n",
+ g_reply->host.id, um.priority.c_str(), um.message.c_str()
);
}
}
@@ -1484,8 +1485,9 @@ void handle_request(FILE* fin, FILE* fout, char* code_sign_key) {
}
sreply.write(fout, sreq);
- log_messages.printf(MSG_NORMAL, "Scheduler ran %f seconds\n", dtime()-start_time);
-
+ log_messages.printf(MSG_NORMAL,
+ "Scheduler ran %.3f seconds\n", dtime()-start_time
+ );
if (strlen(config.sched_lockfile_dir)) {
unlock_sched();
diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp
index 7d805aba34..50cf85a4d6 100644
--- a/sched/sched_send.cpp
+++ b/sched/sched_send.cpp
@@ -54,6 +54,9 @@
#include "boinc_fcgi.h"
#endif
+// if host sends us an impossible RAM size, use this instead
+//
+const double DEFAULT_RAM_SIZE = 64000000;
void send_work_matchmaker();
@@ -123,19 +126,76 @@ inline int max_wus_in_progress_multiplier() {
return n;
}
-const double DEFAULT_RAM_SIZE = 64000000;
- // if host sends us an impossible RAM size, use this instead
-
-bool SCHEDULER_REQUEST::has_version(APP& app) {
- unsigned int i;
-
- for (i=0; i= app.min_version) {
- return true;
+// For new-style requests (rsc_spec_request set), return true only if the
+// app version uses a resource we need work for: CUDA if it uses any CUDA
+// devices, CPU otherwise.  For all other requests, always return true.
+bool need_this_resource(HOST_USAGE& host_usage) {
+    if (g_wreq->rsc_spec_request) {
+        if (host_usage.ncudas) {
+            if (!g_wreq->need_cuda()) {
+                if (config.debug_version_select) {
+                    log_messages.printf(MSG_NORMAL,
+                        "[version] Don't need CUDA jobs, skipping\n"
+                    );
+                }
+                return false;
+            }
+        } else {
+            if (!g_wreq->need_cpu()) {
+                if (config.debug_version_select) {
+                    log_messages.printf(MSG_NORMAL,
+                        "[version] Don't need CPU jobs, skipping\n"
+                    );
+                }
+                return false;
+            }
}
}
- return false;
+    return true;
+}
+
+// Scan the client's anonymous-platform app versions and return the
+// fastest (highest FLOPS) usable one for this app, or NULL if none.
+// A version is usable if its app name matches, it is at least
+// app.min_version, and it uses a resource we need work for.
+//
+CLIENT_APP_VERSION* get_app_version_anonymous(APP& app) {
+    unsigned int i;
+    CLIENT_APP_VERSION* best = NULL;
+    bool found = false;
+    char message[256];
+
+    for (i=0; i<g_request->client_app_versions.size(); i++) {
+        CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];
+        if (strcmp(cav.app_name, app.name)) continue;
+        if (cav.version_num < app.min_version) continue;
+
+        // assign the outer flag; redeclaring it here would shadow it,
+        // and the "no version" message below would always be sent
+        //
+        found = true;
+        if (!need_this_resource(cav.host_usage)) continue;
+        if (!best || cav.host_usage.flops > best->host_usage.flops) {
+            best = &cav;
+        }
+    }
+
+    // log the miss only when no version was actually chosen
+    //
+    if (!best && config.debug_send) {
+        log_messages.printf(MSG_NORMAL,
+            "[send] Didn't find anonymous platform app for %s\n",
+            app.name
+        );
+    }
+    if (!found) {
+        snprintf(message, sizeof(message),
+            "Your app_info.xml file doesn't have a version of %s.",
+            app.user_friendly_name
+        );
+        g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
+    }
+    return best;
}
// return BEST_APP_VERSION for the given host, or NULL if none
@@ -155,7 +215,7 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
while (bavi != g_wreq->best_app_versions.end()) {
bavp = *bavi;
if (bavp->appid == wu.appid) {
- if (!bavp->avp) return NULL;
+ if (!bavp->present) return NULL;
// if we previously chose a CUDA app but don't need more CUDA work,
// delete record, fall through, and find another version
@@ -183,35 +243,31 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
bavp = new BEST_APP_VERSION;
bavp->appid = wu.appid;
if (g_wreq->anonymous_platform) {
- found = g_request->has_version(*app);
- if (!found) {
- if (config.debug_send) {
- log_messages.printf(MSG_NORMAL,
- "[send] Didn't find anonymous platform app for %s\n", app->name
- );
- sprintf(message,
- "Your app_info.xml file doesn't have a version of %s.",
- app->user_friendly_name
- );
- g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
- }
- bavp->avp = 0;
+ CLIENT_APP_VERSION* cavp = get_app_version_anonymous(*app);
+ if (!cavp) {
+ bavp->present = false;
} else {
+ bavp->present = true;
if (config.debug_send) {
log_messages.printf(MSG_NORMAL,
"[send] Found anonymous platform app for %s\n", app->name
);
}
- // TODO: anonymous platform apps should be able to tell us
- // how fast they are and how many CPUs and coprocs they use.
- // For now, assume they use 1 CPU
+ bavp->host_usage = cavp->host_usage;
+
+ // if client didn't tell us about the app version,
+ // assume it uses 1 CPU
//
- bavp->host_usage.sequential_app(g_reply->host.p_fpops);
- bavp->avp = (APP_VERSION*)1; // arbitrary nonzero value;
- // means the client already has the app version
+ if (bavp->host_usage.flops == 0) {
+ bavp->host_usage.flops = g_reply->host.p_fpops;
+ }
+ if (bavp->host_usage.avg_ncpus == 0 && bavp->host_usage.ncudas == 0) {
+ bavp->host_usage.avg_ncpus = 1;
+ }
+ bavp->cavp = cavp;
}
g_wreq->best_app_versions.push_back(bavp);
- if (!bavp->avp) return NULL;
+ if (!bavp->present) return NULL;
return bavp;
}
@@ -223,8 +279,6 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
bavp->avp = NULL;
bool no_version_for_platform = true;
int app_plan_reject = 0;
- bool no_cuda_requested = false;
- bool no_cpu_requested = false;
for (i=0; iplatforms.list.size(); i++) {
PLATFORM* p = g_request->platforms.list[i];
for (j=0; jnapp_versions; j++) {
@@ -258,30 +312,8 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
host_usage.sequential_app(g_reply->host.p_fpops);
}
- // for new-style requests, check that the app version is relevant
- //
- if (g_wreq->rsc_spec_request) {
- if (host_usage.ncudas) {
- if (!g_wreq->need_cuda()) {
- if (config.debug_version_select) {
- log_messages.printf(MSG_NORMAL,
- "[version] Don't need CUDA jobs, skipping\n"
- );
- }
- no_cuda_requested = true;
- continue;
- }
- } else {
- if (!g_wreq->need_cpu()) {
- if (config.debug_version_select) {
- log_messages.printf(MSG_NORMAL,
- "[version] Don't need CPU jobs, skipping\n"
- );
- }
- no_cpu_requested = true;
- continue;
- }
- }
+ if (!need_this_resource(host_usage)) {
+ continue;
}
if (host_usage.flops > bavp->host_usage.flops) {
bavp->host_usage = host_usage;
@@ -344,20 +376,6 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
);
g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
}
- if (no_cpu_requested) {
- sprintf(message,
- "CPU app exists for %s but no CPU work requested",
- app->user_friendly_name
- );
- g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
- }
- if (no_cuda_requested) {
- sprintf(message,
- "CUDA app exists for %s but no CUDA work requested",
- app->user_friendly_name
- );
- g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
- }
return NULL;
}
return bavp;
@@ -906,12 +924,11 @@ int add_wu_to_reply(
WORKUNIT wu2, wu3;
APP_VERSION* avp = bavp->avp;
- if (avp == (APP_VERSION*)1) avp = NULL;
// add the app, app_version, and workunit to the reply,
// but only if they aren't already there
//
- if (!bavp->anonymous_platform) {
+ if (avp) {
APP_VERSION av2=*avp, *avp2=&av2;
if (strlen(config.replace_download_url_by_timezone)) {
@@ -1986,8 +2003,8 @@ void JOB_SET::add_job(JOB& job) {
disk_usage += job.disk_usage;
if (config.debug_send) {
log_messages.printf(MSG_NORMAL,
- "[send] added job to set. est_time %f disk_usage %f\n",
- est_time, disk_usage
+ "[send] added job to set. est_time %.2f disk_usage %.2fGB\n",
+ est_time, disk_usage/GIGA
);
}
}
diff --git a/sched/server_types.cpp b/sched/server_types.cpp
index 1a3357c7b9..e53de639ab 100644
--- a/sched/server_types.cpp
+++ b/sched/server_types.cpp
@@ -57,10 +57,21 @@ void remove_quotes(char* p) {
int CLIENT_APP_VERSION::parse(FILE* f) {
char buf[256];
+ memset(this, 0, sizeof(CLIENT_APP_VERSION)); // zero all fields so anything missing from the request reads as 0
while (fgets(buf, sizeof(buf), f)) {
if (match_tag(buf, "")) return 0;
if (parse_str(buf, "", app_name, 256)) continue;
+ if (parse_str(buf, "", plan_class, 256)) continue; // NOTE(review): tag literals look empty in this hunk - confirm against the real file
if (parse_int(buf, "", version_num)) continue;
+ if (parse_double(buf, "", host_usage.flops)) continue;
+ if (match_tag(buf, "")) {
+ COPROCS coprocs;
+ coprocs.parse(f);
+ COPROC* cp = coprocs.lookup("CUDA"); // only the CUDA entry is consulted
+ if (cp) {
+ host_usage.ncudas = cp->count; // keep just the device count
+ }
+ }
}
return ERR_XML_PARSE;
}
@@ -572,7 +583,7 @@ int SCHEDULER_REPLY::write(FILE* fout, SCHEDULER_REQUEST& sreq) {
fprintf(fout, "%f\n", request_delay);
}
log_messages.printf(MSG_NORMAL,
- "Sending reply to [HOST#%d]: %d results, delay req %f\n",
+ "Sending reply to [HOST#%d]: %d results, delay req %.2f\n",
host.id, wreq.nresults, request_delay
);
@@ -920,7 +931,7 @@ int RESULT::write_to_client(FILE* fout) {
fputs(buf, fout);
APP_VERSION* avp = bavp->avp;
- if (avp == (APP_VERSION*)1) avp = NULL;
+ CLIENT_APP_VERSION* cavp = bavp->cavp;
if (avp) {
PLATFORM* pp = ssp->lookup_platform_id(avp->platformid);
fprintf(fout,
@@ -929,7 +940,14 @@ int RESULT::write_to_client(FILE* fout) {
" %s\n",
pp->name, avp->version_num, avp->plan_class
);
+ } else if (cavp) {
+ fprintf(fout,
+ " %d\n"
+ " %s\n",
+ cavp->version_num, cavp->plan_class
+ );
}
+
fputs("\n", fout);
return 0;
}
diff --git a/sched/server_types.h b/sched/server_types.h
index ffdfd51aea..9ba5873fe1 100644
--- a/sched/server_types.h
+++ b/sched/server_types.h
@@ -85,15 +85,6 @@ struct HOST_USAGE {
~HOST_USAGE(){}
};
-// keep track of the best app_version for each app for this host
-//
-struct BEST_APP_VERSION {
- int appid;
- APP_VERSION* avp; // NULL if none exists
- HOST_USAGE host_usage;
- bool anonymous_platform; // client has app_version
-};
-
// summary of a client's request for work, and our response to it
// Note: this is zeroed out in SCHEDULER_REPLY constructor
//
@@ -179,6 +170,8 @@ struct WORK_REQ {
std::vector no_work_messages;
std::vector best_app_versions;
+ // various reasons for not sending jobs (used to explain why)
+ //
bool no_allowed_apps_available;
bool excessive_work_buf;
bool hr_reject_temp;
@@ -187,10 +180,11 @@ struct WORK_REQ {
bool gpu_too_slow;
bool no_gpus_prefs;
bool daily_result_quota_exceeded;
- int total_max_results_day;
- // host.max_results_day * (NCPUS + NCUDA*cuda_multiplier)
bool cache_size_exceeded;
bool no_jobs_available; // project has no work right now
+
+ int total_max_results_day;
+ // host.max_results_day * (NCPUS + NCUDA*cuda_multiplier)
int nresults_on_host;
// How many results from this project are in progress on the host.
// Initially this is the number of "other_results"
@@ -222,10 +216,33 @@ struct MSG_FROM_HOST_DESC {
struct CLIENT_APP_VERSION {
char app_name[256];
int version_num;
+ char plan_class[256]; // written back with each result so the client can tell which app_version to use
+ HOST_USAGE host_usage; // flops and ncudas are filled in by parse(); other fields stay zero
int parse(FILE*);
};
+// keep track of the best app_version for each app for this host
+//
+struct BEST_APP_VERSION {
+ int appid;
+
+ bool present; // false if no usable version was found; NOTE(review): confirm the non-anonymous path sets this too
+
+ // populated if anonymous platform:
+ CLIENT_APP_VERSION* cavp;
+
+ // populated otherwise:
+ APP_VERSION* avp;
+ HOST_USAGE host_usage; // resource usage of the chosen version; set for anonymous platform as well
+
+ BEST_APP_VERSION() {
+ present = false;
+ cavp = NULL;
+ avp = NULL;
+ }
+};
+
// subset of global prefs used by scheduler
//
struct GLOBAL_PREFS {
@@ -343,7 +360,6 @@ struct SCHEDULER_REQUEST {
SCHEDULER_REQUEST();
~SCHEDULER_REQUEST();
const char* parse(FILE*);
- bool has_version(APP& app);
int write(FILE*); // write request info to file: not complete
};