- scheduler: add support for anonymous-platform coproc apps.

Old: although the request message contained all info about the app version (flops, coproc usage etc.) the server ignored this info, and assumed that all anonymous platform apps where CPU. With 6.6 client, this could produce infinite work fetch: - client uses anon platform, has coproc app - client has idle CPU, requests CPU work - scheduler sends it jobs, thinking they will be done by CPU app - client asks for more work etc. New: scheduler parses full info on anon platform app versions: plan class, FLOPS, coprocs. It uses this info to make scheduling decisions; in particular, if the request is for CUDA work, if will only send jobs that use a CUDA app version. The <result> records it returns contain info (plan_class) that tells the client which app_version to use. This will work correctly even if the client has multiple app versions for the same app (e.g., a CPU version and a GPU version) svn path=/trunk/boinc/; revision=17506
2009-03-05 17:30:10 +00:00 · 2009-03-05 17:30:10 +00:00 · e6f3027567
parent e974d06544
commit e6f3027567
5 changed files with 172 additions and 93 deletions
--- a/26
+++ b/26
@ -2655,3 +2655,29 @@ David  4 Mar 2009
        server_types.cpp,h
        sched_send.cpp
        sched_plan.cpp
+
+David  5 Mar 2009
+    - scheduler: add support for anonymous-platform coproc apps.
+        Old: although the request message contained all info
+            about the app version (flops, coproc usage etc.)
+            the server ignored this info,
+            and assumed that all anonymous platform apps where CPU.
+            With 6.6 client, this could produce infinite work fetch:
+            - client uses anon platform, has coproc app
+            - client has idle CPU, requests CPU work
+            - scheduler sends it jobs, thinking they will be done by CPU app
+            - client asks for more work etc.
+        New: scheduler parses full info on anon platform app versions:
+            plan class, FLOPS, coprocs.
+            It uses this info to make scheduling decisions;
+            in particular, if the request is for CUDA work,
+            if will only send jobs that use a CUDA app version.
+            The <result> records it returns contain info
+            (plan_class) that tells the client which app_version to use.
+        This will work correctly even if the client has multiple app versions
+        for the same app (e.g., a CPU version and a GPU version)
+
+    sched/
+        handle_request.cpp
+        server_types.cpp,h
+        sched_send.cpp
--- a/sched/handle_request.cpp
+++ b/sched/handle_request.cpp
@ -1446,7 +1446,8 @@ static void log_user_messages() {
    for (unsigned int i=0; i<g_reply->messages.size(); i++) {
        USER_MESSAGE um = g_reply->messages[i];
        log_messages.printf(MSG_DEBUG,
-            "[HOST#%d] MSG(%4s) %s \n", g_reply->host.id, um.priority.c_str(), um.message.c_str()
+            "[HOST#%d] MSG(%4s) %s\n",
+            g_reply->host.id, um.priority.c_str(), um.message.c_str()
        );
    }
 }
@ -1484,8 +1485,9 @@ void handle_request(FILE* fin, FILE* fout, char* code_sign_key) {
    }

    sreply.write(fout, sreq);
-    log_messages.printf(MSG_NORMAL, "Scheduler ran %f seconds\n", dtime()-start_time);
-
+    log_messages.printf(MSG_NORMAL,
+        "Scheduler ran %.3f seconds\n", dtime()-start_time
+    );

    if (strlen(config.sched_lockfile_dir)) {
        unlock_sched();
--- a/sched/sched_send.cpp
+++ b/sched/sched_send.cpp
@ -54,6 +54,9 @@
 #include "boinc_fcgi.h"
 #endif

+// if host sends us an impossible RAM size, use this instead
+//
+const double DEFAULT_RAM_SIZE = 64000000;

 void send_work_matchmaker();

@ -123,19 +126,76 @@ inline int max_wus_in_progress_multiplier() {
    return n;
 }

-const double DEFAULT_RAM_SIZE = 64000000;
-    // if host sends us an impossible RAM size, use this instead
-
-bool SCHEDULER_REQUEST::has_version(APP& app) {
-    unsigned int i;
-
-    for (i=0; i<client_app_versions.size(); i++) {
-        CLIENT_APP_VERSION& cav = client_app_versions[i];
-        if (!strcmp(cav.app_name, app.name) && cav.version_num >= app.min_version) {
-            return true;
+// for new-style requests, check that the app version uses a
+// resource for which we need work
+//
+bool need_this_resource(HOST_USAGE& host_usage) {
+    if (g_wreq->rsc_spec_request) {
+        if (host_usage.ncudas) {
+            if (!g_wreq->need_cuda()) {
+                if (config.debug_version_select) {
+                    log_messages.printf(MSG_NORMAL,
+                        "[version] Don't need CUDA jobs, skipping\n"
+                    );
+                }
+                return false;
+            }
+        } else {
+            if (!g_wreq->need_cpu()) {
+                if (config.debug_version_select) {
+                    log_messages.printf(MSG_NORMAL,
+                        "[version] Don't need CPU jobs, skipping\n"
+                    );
+                }
+                return false;;
+            }
        }
    }
-    return false;
+    return true;
+}
+
+// scan through client's anonymous apps and pick the best one
+//
+CLIENT_APP_VERSION* get_app_version_anonymous(APP& app) {
+    unsigned int i;
+    CLIENT_APP_VERSION* best = NULL;
+    bool found = false;
+    char message[256];
+
+    for (i=0; i<g_request->client_app_versions.size(); i++) {
+        CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];
+        if (strcmp(cav.app_name, app.name)) {
+            continue;
+        }
+        if (cav.version_num < app.min_version) {
+            continue;
+        }
+        bool found = true;
+        if (!need_this_resource(cav.host_usage)) {
+            continue;
+        }
+        if (best) {
+            if (cav.host_usage.flops > best->host_usage.flops) {
+                best = &cav;
+            }
+        } else {
+            best = &cav;
+        }
+    }
+    if (config.debug_send) {
+        log_messages.printf(MSG_NORMAL,
+            "[send] Didn't find anonymous platform app for %s\n",
+            app.name
+        );
+    }
+    if (!found) {
+        sprintf(message,
+            "Your app_info.xml file doesn't have a version of %s.",
+            app.user_friendly_name
+        );
+        g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
+    }
+    return best;
 }

 // return BEST_APP_VERSION for the given host, or NULL if none
@ -155,7 +215,7 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
    while (bavi != g_wreq->best_app_versions.end()) {
        bavp = *bavi;
        if (bavp->appid == wu.appid) {
-            if (!bavp->avp) return NULL;
+            if (!bavp->present) return NULL;

            // if we previously chose a CUDA app but don't need more CUDA work,
            // delete record, fall through, and find another version
@ -183,35 +243,31 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
    bavp = new BEST_APP_VERSION;
    bavp->appid = wu.appid;
    if (g_wreq->anonymous_platform) {
-        found = g_request->has_version(*app);
-        if (!found) {
-            if (config.debug_send) {
-                log_messages.printf(MSG_NORMAL,
-                    "[send] Didn't find anonymous platform app for %s\n", app->name
-                );
-                sprintf(message,
-                    "Your app_info.xml file doesn't have a version of %s.",
-                    app->user_friendly_name
-                );
-                g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
-            }
-            bavp->avp = 0;
+        CLIENT_APP_VERSION* cavp = get_app_version_anonymous(*app);
+        if (!cavp) {
+            bavp->present = false;
        } else {
+            bavp->present = true;
            if (config.debug_send) {
                log_messages.printf(MSG_NORMAL,
                    "[send] Found anonymous platform app for %s\n", app->name
                );
            }
-            // TODO: anonymous platform apps should be able to tell us
-            // how fast they are and how many CPUs and coprocs they use.
-            // For now, assume they use 1 CPU
+            bavp->host_usage = cavp->host_usage;
+
+            // if client didn't tell us about the app version,
+            // assume it uses 1 CPU
            //
-            bavp->host_usage.sequential_app(g_reply->host.p_fpops);
-            bavp->avp = (APP_VERSION*)1;    // arbitrary nonzero value;
-                // means the client already has the app version
+            if (bavp->host_usage.flops == 0) {
+                bavp->host_usage.flops = g_reply->host.p_fpops;
+            }
+            if (bavp->host_usage.avg_ncpus == 0 && bavp->host_usage.ncudas == 0) {
+                bavp->host_usage.avg_ncpus = 1;
+            }
+            bavp->cavp = cavp;
        }
        g_wreq->best_app_versions.push_back(bavp);
-        if (!bavp->avp) return NULL;
+        if (!bavp->present) return NULL;
        return bavp;
    }

@ -223,8 +279,6 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
    bavp->avp = NULL;
    bool no_version_for_platform = true;
    int app_plan_reject = 0;
-    bool no_cuda_requested = false;
-    bool no_cpu_requested = false;
    for (i=0; i<g_request->platforms.list.size(); i++) {
        PLATFORM* p = g_request->platforms.list[i];
        for (j=0; j<ssp->napp_versions; j++) {
@ -258,30 +312,8 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
                host_usage.sequential_app(g_reply->host.p_fpops);
            }

-            // for new-style requests, check that the app version is relevant
-            //
-            if (g_wreq->rsc_spec_request) {
-                if (host_usage.ncudas) {
-                    if (!g_wreq->need_cuda()) {
-                        if (config.debug_version_select) {
-                            log_messages.printf(MSG_NORMAL,
-                                "[version] Don't need CUDA jobs, skipping\n"
-                            );
-                        }
-                        no_cuda_requested = true;
-                        continue;
-                    }
-                } else {
-                    if (!g_wreq->need_cpu()) {
-                        if (config.debug_version_select) {
-                            log_messages.printf(MSG_NORMAL,
-                                "[version] Don't need CPU jobs, skipping\n"
-                            );
-                        }
-                        no_cpu_requested = true;
-                        continue;
-                    }
-                }
+            if (!need_this_resource(host_usage)) {
+                continue;
            }
            if (host_usage.flops > bavp->host_usage.flops) {
                bavp->host_usage = host_usage;
@ -344,20 +376,6 @@ BEST_APP_VERSION* get_app_version(WORKUNIT& wu) {
            );
            g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
        }
-        if (no_cpu_requested) {
-            sprintf(message,
-                "CPU app exists for %s but no CPU work requested",
-                app->user_friendly_name
-            );
-            g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
-        }
-        if (no_cuda_requested) {
-            sprintf(message,
-                "CUDA app exists for %s but no CUDA work requested",
-                app->user_friendly_name
-            );
-            g_wreq->insert_no_work_message(USER_MESSAGE(message, "high"));
-        }
        return NULL;
    }
    return bavp;
@ -906,12 +924,11 @@ int add_wu_to_reply(
    WORKUNIT wu2, wu3;
    
    APP_VERSION* avp = bavp->avp;
-    if (avp == (APP_VERSION*)1) avp = NULL;

    // add the app, app_version, and workunit to the reply,
    // but only if they aren't already there
    //
-    if (!bavp->anonymous_platform) {
+    if (avp) {
        APP_VERSION av2=*avp, *avp2=&av2;
        
        if (strlen(config.replace_download_url_by_timezone)) {
@ -1986,8 +2003,8 @@ void JOB_SET::add_job(JOB& job) {
    disk_usage += job.disk_usage;
    if (config.debug_send) {
        log_messages.printf(MSG_NORMAL,
-            "[send] added job to set.  est_time %f disk_usage %f\n",
-            est_time, disk_usage
+            "[send] added job to set.  est_time %.2f disk_usage %.2fGB\n",
+            est_time, disk_usage/GIGA
        );
    }
 }
--- a/sched/server_types.cpp
+++ b/sched/server_types.cpp
@ -57,10 +57,21 @@ void remove_quotes(char* p) {
 int CLIENT_APP_VERSION::parse(FILE* f) {
    char buf[256];

+    memset(this, 0, sizeof(CLIENT_APP_VERSION));
    while (fgets(buf, sizeof(buf), f)) {
        if (match_tag(buf, "</app_version>")) return 0;
        if (parse_str(buf, "<app_name>", app_name, 256)) continue;
+        if (parse_str(buf, "<plan_class>", plan_class, 256)) continue;
        if (parse_int(buf, "<version_num>", version_num)) continue;
+        if (parse_double(buf, "<flops>", host_usage.flops)) continue;
+        if (match_tag(buf, "<coprocs>")) {
+            COPROCS coprocs;
+            coprocs.parse(f);
+            COPROC* cp = coprocs.lookup("CUDA");
+            if (cp) {
+                host_usage.ncudas = cp->count;
+            }
+        }
    }
    return ERR_XML_PARSE;
 }
@ -572,7 +583,7 @@ int SCHEDULER_REPLY::write(FILE* fout, SCHEDULER_REQUEST& sreq) {
        fprintf(fout, "<request_delay>%f</request_delay>\n", request_delay);
    }
    log_messages.printf(MSG_NORMAL,
-        "Sending reply to [HOST#%d]: %d results, delay req %f\n",
+        "Sending reply to [HOST#%d]: %d results, delay req %.2f\n",
        host.id, wreq.nresults, request_delay
    );

@ -920,7 +931,7 @@ int RESULT::write_to_client(FILE* fout) {
    fputs(buf, fout);

    APP_VERSION* avp = bavp->avp;
-    if (avp == (APP_VERSION*)1) avp = NULL;
+    CLIENT_APP_VERSION* cavp = bavp->cavp;
    if (avp) {
        PLATFORM* pp = ssp->lookup_platform_id(avp->platformid);
        fprintf(fout,
@ -929,7 +940,14 @@ int RESULT::write_to_client(FILE* fout) {
            "    <plan_class>%s</plan_class>\n",
            pp->name, avp->version_num, avp->plan_class
        );
+    } else if (cavp) {
+        fprintf(fout,
+            "    <version_num>%d</version_num>\n"
+            "    <plan_class>%s</plan_class>\n",
+            cavp->version_num, cavp->plan_class
+        );
    }
+
    fputs("</result>\n", fout);
    return 0;
 }
--- a/sched/server_types.h
+++ b/sched/server_types.h
@ -85,15 +85,6 @@ struct HOST_USAGE {
    ~HOST_USAGE(){}
 };

-// keep track of the best app_version for each app for this host
-//
-struct BEST_APP_VERSION {
-    int appid;
-    APP_VERSION* avp;       // NULL if none exists
-    HOST_USAGE host_usage;
-    bool anonymous_platform;    // client has app_version
-};
-
 // summary of a client's request for work, and our response to it
 // Note: this is zeroed out in SCHEDULER_REPLY constructor
 //
@ -179,6 +170,8 @@ struct WORK_REQ {
    std::vector<USER_MESSAGE> no_work_messages;
    std::vector<BEST_APP_VERSION*> best_app_versions;

+    // various reasons for not sending jobs (used to explain why)
+    //
    bool no_allowed_apps_available;
    bool excessive_work_buf;
    bool hr_reject_temp;
@ -187,10 +180,11 @@ struct WORK_REQ {
    bool gpu_too_slow;
    bool no_gpus_prefs;
    bool daily_result_quota_exceeded;
-    int total_max_results_day;
-        // host.max_results_day * (NCPUS + NCUDA*cuda_multiplier)
    bool cache_size_exceeded;
    bool no_jobs_available;     // project has no work right now
+
+    int total_max_results_day;
+        // host.max_results_day * (NCPUS + NCUDA*cuda_multiplier)
    int nresults_on_host;
        // How many results from this project are in progress on the host.
        // Initially this is the number of "other_results"
@ -222,10 +216,33 @@ struct MSG_FROM_HOST_DESC {
 struct CLIENT_APP_VERSION {
    char app_name[256];
    int version_num;
+    char plan_class[256];
+    HOST_USAGE host_usage;

    int parse(FILE*);
 };

+// keep track of the best app_version for each app for this host
+//
+struct BEST_APP_VERSION {
+    int appid;
+
+    bool present;
+
+    // populated if anonymous platform:
+    CLIENT_APP_VERSION* cavp;
+
+    // populated otherwise:
+    APP_VERSION* avp;
+    HOST_USAGE host_usage;
+
+    BEST_APP_VERSION() {
+        present = false;
+        cavp = NULL;
+        avp = NULL;
+    }
+};
+
 // subset of global prefs used by scheduler
 //
 struct GLOBAL_PREFS {
@ -343,7 +360,6 @@ struct SCHEDULER_REQUEST {
    SCHEDULER_REQUEST();
    ~SCHEDULER_REQUEST();
    const char* parse(FILE*);
-    bool has_version(APP& app);
    int write(FILE*); // write request info to file: not complete
 };