diff --git a/checkin_notes b/checkin_notes index ae5992bdaf..af48c59bb8 100644 --- a/checkin_notes +++ b/checkin_notes @@ -2655,3 +2655,29 @@ David 4 Mar 2009 server_types.cpp,h sched_send.cpp sched_plan.cpp + +David 5 Mar 2009 + - scheduler: add support for anonymous-platform coproc apps. + Old: although the request message contained all info + about the app version (flops, coproc usage etc.) + the server ignored this info, + and assumed that all anonymous platform apps where CPU. + With 6.6 client, this could produce infinite work fetch: + - client uses anon platform, has coproc app + - client has idle CPU, requests CPU work + - scheduler sends it jobs, thinking they will be done by CPU app + - client asks for more work etc. + New: scheduler parses full info on anon platform app versions: + plan class, FLOPS, coprocs. + It uses this info to make scheduling decisions; + in particular, if the request is for CUDA work, + if will only send jobs that use a CUDA app version. + The records it returns contain info + (plan_class) that tells the client which app_version to use. + This will work correctly even if the client has multiple app versions + for the same app (e.g., a CPU version and a GPU version) + + sched/ + handle_request.cpp + server_types.cpp,h + sched_send.cpp diff --git a/sched/handle_request.cpp b/sched/handle_request.cpp index 98fa11dda4..5eaa68aaa2 100644 --- a/sched/handle_request.cpp +++ b/sched/handle_request.cpp @@ -1446,7 +1446,8 @@ static void log_user_messages() { for (unsigned int i=0; imessages.size(); i++) { USER_MESSAGE um = g_reply->messages[i]; log_messages.printf(MSG_DEBUG, - "[HOST#%d] MSG(%4s) %s \n", g_reply->host.id, um.priority.c_str(), um.message.c_str() + "[HOST#%d] MSG(%4s) %s\n", + g_reply->host.id, um.priority.c_str(), um.message.c_str() ); } } @@ -1484,8 +1485,9 @@ void handle_request(FILE* fin, FILE* fout, char* code_sign_key) { } sreply.write(fout, sreq); - log_messages.printf(MSG_NORMAL, "Scheduler ran %f seconds\n", dtime()-start_time); - + log_messages.printf(MSG_NORMAL, + "Scheduler ran %.3f seconds\n", dtime()-start_time + ); if (strlen(config.sched_lockfile_dir)) { unlock_sched(); diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 7d805aba34..50cf85a4d6 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -54,6 +54,9 @@ #include "boinc_fcgi.h" #endif +// if host sends us an impossible RAM size, use this instead +// +const double DEFAULT_RAM_SIZE = 64000000; void send_work_matchmaker(); @@ -123,19 +126,76 @@ inline int max_wus_in_progress_multiplier() { return n; } -const double DEFAULT_RAM_SIZE = 64000000; - // if host sends us an impossible RAM size, use this instead - -bool SCHEDULER_REQUEST::has_version(APP& app) { - unsigned int i; - - for (i=0; i= app.min_version) { - return true; +// for new-style requests, check that the app version uses a +// resource for which we need work +// +bool need_this_resource(HOST_USAGE& host_usage) { + if (g_wreq->rsc_spec_request) { + if (host_usage.ncudas) { + if (!g_wreq->need_cuda()) { + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] Don't need CUDA jobs, skipping\n" + ); + } + return false; + } + } else { + if (!g_wreq->need_cpu()) { + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] Don't need CPU jobs, skipping\n" + ); + } + return false;; + } } } - return false; + return true; +} + +// scan through client's anonymous apps and pick the best one +// +CLIENT_APP_VERSION* get_app_version_anonymous(APP& app) { + unsigned int i; + CLIENT_APP_VERSION* best = NULL; + bool found = false; + char message[256]; + + for (i=0; iclient_app_versions.size(); i++) { + CLIENT_APP_VERSION& cav = g_request->client_app_versions[i]; + if (strcmp(cav.app_name, app.name)) { + continue; + } + if (cav.version_num < app.min_version) { + continue; + } + bool found = true; + if (!need_this_resource(cav.host_usage)) { + continue; + } + if (best) { + if (cav.host_usage.flops > best->host_usage.flops) { + best = &cav; + } + } else { + best = &cav; + } + } + if (config.debug_send) { + log_messages.printf(MSG_NORMAL, + "[send] Didn't find anonymous platform app for %s\n", + app.name + ); + } + if (!found) { + sprintf(message, + "Your app_info.xml file doesn't have a version of %s.", + app.user_friendly_name + ); + g_wreq->insert_no_work_message(USER_MESSAGE(message, "high")); + } + return best; } // return BEST_APP_VERSION for the given host, or NULL if none @@ -155,7 +215,7 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) { while (bavi != g_wreq->best_app_versions.end()) { bavp = *bavi; if (bavp->appid == wu.appid) { - if (!bavp->avp) return NULL; + if (!bavp->present) return NULL; // if we previously chose a CUDA app but don't need more CUDA work, // delete record, fall through, and find another version @@ -183,35 +243,31 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) { bavp = new BEST_APP_VERSION; bavp->appid = wu.appid; if (g_wreq->anonymous_platform) { - found = g_request->has_version(*app); - if (!found) { - if (config.debug_send) { - log_messages.printf(MSG_NORMAL, - "[send] Didn't find anonymous platform app for %s\n", app->name - ); - sprintf(message, - "Your app_info.xml file doesn't have a version of %s.", - app->user_friendly_name - ); - g_wreq->insert_no_work_message(USER_MESSAGE(message, "high")); - } - bavp->avp = 0; + CLIENT_APP_VERSION* cavp = get_app_version_anonymous(*app); + if (!cavp) { + bavp->present = false; } else { + bavp->present = true; if (config.debug_send) { log_messages.printf(MSG_NORMAL, "[send] Found anonymous platform app for %s\n", app->name ); } - // TODO: anonymous platform apps should be able to tell us - // how fast they are and how many CPUs and coprocs they use. - // For now, assume they use 1 CPU + bavp->host_usage = cavp->host_usage; + + // if client didn't tell us about the app version, + // assume it uses 1 CPU // - bavp->host_usage.sequential_app(g_reply->host.p_fpops); - bavp->avp = (APP_VERSION*)1; // arbitrary nonzero value; - // means the client already has the app version + if (bavp->host_usage.flops == 0) { + bavp->host_usage.flops = g_reply->host.p_fpops; + } + if (bavp->host_usage.avg_ncpus == 0 && bavp->host_usage.ncudas == 0) { + bavp->host_usage.avg_ncpus = 1; + } + bavp->cavp = cavp; } g_wreq->best_app_versions.push_back(bavp); - if (!bavp->avp) return NULL; + if (!bavp->present) return NULL; return bavp; } @@ -223,8 +279,6 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) { bavp->avp = NULL; bool no_version_for_platform = true; int app_plan_reject = 0; - bool no_cuda_requested = false; - bool no_cpu_requested = false; for (i=0; iplatforms.list.size(); i++) { PLATFORM* p = g_request->platforms.list[i]; for (j=0; jnapp_versions; j++) { @@ -258,30 +312,8 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) { host_usage.sequential_app(g_reply->host.p_fpops); } - // for new-style requests, check that the app version is relevant - // - if (g_wreq->rsc_spec_request) { - if (host_usage.ncudas) { - if (!g_wreq->need_cuda()) { - if (config.debug_version_select) { - log_messages.printf(MSG_NORMAL, - "[version] Don't need CUDA jobs, skipping\n" - ); - } - no_cuda_requested = true; - continue; - } - } else { - if (!g_wreq->need_cpu()) { - if (config.debug_version_select) { - log_messages.printf(MSG_NORMAL, - "[version] Don't need CPU jobs, skipping\n" - ); - } - no_cpu_requested = true; - continue; - } - } + if (!need_this_resource(host_usage)) { + continue; } if (host_usage.flops > bavp->host_usage.flops) { bavp->host_usage = host_usage; @@ -344,20 +376,6 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) { ); g_wreq->insert_no_work_message(USER_MESSAGE(message, "high")); } - if (no_cpu_requested) { - sprintf(message, - "CPU app exists for %s but no CPU work requested", - app->user_friendly_name - ); - g_wreq->insert_no_work_message(USER_MESSAGE(message, "high")); - } - if (no_cuda_requested) { - sprintf(message, - "CUDA app exists for %s but no CUDA work requested", - app->user_friendly_name - ); - g_wreq->insert_no_work_message(USER_MESSAGE(message, "high")); - } return NULL; } return bavp; @@ -906,12 +924,11 @@ int add_wu_to_reply( WORKUNIT wu2, wu3; APP_VERSION* avp = bavp->avp; - if (avp == (APP_VERSION*)1) avp = NULL; // add the app, app_version, and workunit to the reply, // but only if they aren't already there // - if (!bavp->anonymous_platform) { + if (avp) { APP_VERSION av2=*avp, *avp2=&av2; if (strlen(config.replace_download_url_by_timezone)) { @@ -1986,8 +2003,8 @@ void JOB_SET::add_job(JOB& job) { disk_usage += job.disk_usage; if (config.debug_send) { log_messages.printf(MSG_NORMAL, - "[send] added job to set. est_time %f disk_usage %f\n", - est_time, disk_usage + "[send] added job to set. est_time %.2f disk_usage %.2fGB\n", + est_time, disk_usage/GIGA ); } } diff --git a/sched/server_types.cpp b/sched/server_types.cpp index 1a3357c7b9..e53de639ab 100644 --- a/sched/server_types.cpp +++ b/sched/server_types.cpp @@ -57,10 +57,21 @@ void remove_quotes(char* p) { int CLIENT_APP_VERSION::parse(FILE* f) { char buf[256]; + memset(this, 0, sizeof(CLIENT_APP_VERSION)); while (fgets(buf, sizeof(buf), f)) { if (match_tag(buf, "")) return 0; if (parse_str(buf, "", app_name, 256)) continue; + if (parse_str(buf, "", plan_class, 256)) continue; if (parse_int(buf, "", version_num)) continue; + if (parse_double(buf, "", host_usage.flops)) continue; + if (match_tag(buf, "")) { + COPROCS coprocs; + coprocs.parse(f); + COPROC* cp = coprocs.lookup("CUDA"); + if (cp) { + host_usage.ncudas = cp->count; + } + } } return ERR_XML_PARSE; } @@ -572,7 +583,7 @@ int SCHEDULER_REPLY::write(FILE* fout, SCHEDULER_REQUEST& sreq) { fprintf(fout, "%f\n", request_delay); } log_messages.printf(MSG_NORMAL, - "Sending reply to [HOST#%d]: %d results, delay req %f\n", + "Sending reply to [HOST#%d]: %d results, delay req %.2f\n", host.id, wreq.nresults, request_delay ); @@ -920,7 +931,7 @@ int RESULT::write_to_client(FILE* fout) { fputs(buf, fout); APP_VERSION* avp = bavp->avp; - if (avp == (APP_VERSION*)1) avp = NULL; + CLIENT_APP_VERSION* cavp = bavp->cavp; if (avp) { PLATFORM* pp = ssp->lookup_platform_id(avp->platformid); fprintf(fout, @@ -929,7 +940,14 @@ int RESULT::write_to_client(FILE* fout) { " %s\n", pp->name, avp->version_num, avp->plan_class ); + } else if (cavp) { + fprintf(fout, + " %d\n" + " %s\n", + cavp->version_num, cavp->plan_class + ); } + fputs("\n", fout); return 0; } diff --git a/sched/server_types.h b/sched/server_types.h index ffdfd51aea..9ba5873fe1 100644 --- a/sched/server_types.h +++ b/sched/server_types.h @@ -85,15 +85,6 @@ struct HOST_USAGE { ~HOST_USAGE(){} }; -// keep track of the best app_version for each app for this host -// -struct BEST_APP_VERSION { - int appid; - APP_VERSION* avp; // NULL if none exists - HOST_USAGE host_usage; - bool anonymous_platform; // client has app_version -}; - // summary of a client's request for work, and our response to it // Note: this is zeroed out in SCHEDULER_REPLY constructor // @@ -179,6 +170,8 @@ struct WORK_REQ { std::vector no_work_messages; std::vector best_app_versions; + // various reasons for not sending jobs (used to explain why) + // bool no_allowed_apps_available; bool excessive_work_buf; bool hr_reject_temp; @@ -187,10 +180,11 @@ struct WORK_REQ { bool gpu_too_slow; bool no_gpus_prefs; bool daily_result_quota_exceeded; - int total_max_results_day; - // host.max_results_day * (NCPUS + NCUDA*cuda_multiplier) bool cache_size_exceeded; bool no_jobs_available; // project has no work right now + + int total_max_results_day; + // host.max_results_day * (NCPUS + NCUDA*cuda_multiplier) int nresults_on_host; // How many results from this project are in progress on the host. // Initially this is the number of "other_results" @@ -222,10 +216,33 @@ struct MSG_FROM_HOST_DESC { struct CLIENT_APP_VERSION { char app_name[256]; int version_num; + char plan_class[256]; + HOST_USAGE host_usage; int parse(FILE*); }; +// keep track of the best app_version for each app for this host +// +struct BEST_APP_VERSION { + int appid; + + bool present; + + // populated if anonymous platform: + CLIENT_APP_VERSION* cavp; + + // populated otherwise: + APP_VERSION* avp; + HOST_USAGE host_usage; + + BEST_APP_VERSION() { + present = false; + cavp = NULL; + avp = NULL; + } +}; + // subset of global prefs used by scheduler // struct GLOBAL_PREFS { @@ -343,7 +360,6 @@ struct SCHEDULER_REQUEST { SCHEDULER_REQUEST(); ~SCHEDULER_REQUEST(); const char* parse(FILE*); - bool has_version(APP& app); int write(FILE*); // write request info to file: not complete };