- scheduler: add <have_cuda_apps> config flag.

If set, the "effective NCPUS" (which is used to scale
    daily_result_quota and max_wus_in_progress)
    is max'd with the # of CUDA GPUs.

svn path=/trunk/boinc/; revision=16246
David Anderson 2008-10-21 23:16:07 +00:00
parent 1a409685e5
commit 5039207e2c
8 changed files with 62 additions and 34 deletions

checkin_notes

@@ -8511,3 +8511,16 @@ David 21 Oct 2008
- make "make distclean" work in all directories
samples/*/Makefile
David 21 Oct 2008
- scheduler: add <have_cuda_apps> config flag.
If set, the "effective NCPUS" (which is used to scale
daily_result_quota and max_wus_in_progress)
is max'd with the # of CUDA GPUs.
sched/
sched_array.cpp
sched_config.cpp,h
sched_locality.cpp
sched_send.cpp,h
server_types.h
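
As a usage note (not part of this diff): a project that hosts CUDA applications enables the new flag in the scheduler's config.xml. A minimal sketch, assuming the usual <boinc><config> layout that SCHED_CONFIG::parse() reads; the surrounding elements are placeholders:

    <boinc>
        <config>
            <!-- ... other scheduler options ... -->
            <have_cuda_apps/>
        </config>
    </boinc>

Given the parse_bool() call added in sched_config.cpp below, <have_cuda_apps>1</have_cuda_apps> should be accepted as well.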

sched/sched_array.cpp

@@ -54,7 +54,7 @@ void scan_work_array(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
rnd_off = rand() % ssp->max_wu_results;
for (j=0; j<ssp->max_wu_results; j++) {
i = (j+rnd_off) % ssp->max_wu_results;
if (!reply.work_needed()) break;
if (!work_needed(sreq, reply, false)) break;
WU_RESULT& wu_result = ssp->wu_results[i];

sched/sched_config.cpp

@@ -184,6 +184,7 @@ int SCHED_CONFIG::parse(FILE* f) {
if (xp.parse_bool(tag, "use_credit_multiplier", use_credit_multiplier)) continue;
if (xp.parse_bool(tag, "multiple_clients_per_host", multiple_clients_per_host)) continue;
if (xp.parse_bool(tag, "no_vista_sandbox", no_vista_sandbox)) continue;
if (xp.parse_bool(tag, "have_cuda_apps", have_cuda_apps)) continue;
if (xp.parse_bool(tag, "debug_version_select", debug_version_select)) continue;

sched/sched_config.h

@@ -125,6 +125,7 @@ public:
bool use_credit_multiplier;
bool multiple_clients_per_host;
bool no_vista_sandbox;
bool have_cuda_apps;
// log flags
//

sched/sched_locality.cpp

@@ -488,7 +488,7 @@ static int send_results_for_file(
nsent = 0;
if (!reply.work_needed(true)) {
if (!work_needed(sreq, reply, true)) {
return 0;
}
@@ -519,7 +519,7 @@ static int send_results_for_file(
for (i=0; i<100; i++) { // avoid infinite loop
int query_retval;
if (!reply.work_needed(true)) break;
if (!work_needed(sreq, reply, true)) break;
log_messages.printf(MSG_DEBUG,
"in_send_results_for_file(%s, %d) prev_result.id=%d\n", filename, i, prev_result.id
@@ -740,7 +740,7 @@ static int send_new_file_work_deterministic_seeded(
if (retval==ERR_NO_APP_VERSION || retval==ERR_INSUFFICIENT_RESOURCE) return retval;
if (nsent>0 || !reply.work_needed(true)) break;
if (nsent>0 || !work_needed(sreq, reply, true)) break;
// construct a name which is lexically greater than the name of any result
// which uses this file.
sprintf(min_resultname, "%s__~", filename);
@@ -793,7 +793,7 @@ static int send_new_file_work_deterministic(
// continue deterministic search at the lexically first possible
// filename, up to the randomly chosen one
if (!getfile_retval && reply.work_needed(true)) {
if (!getfile_retval && work_needed(sreq, reply, true)) {
send_new_file_work_deterministic_seeded(
sreq, reply, nsent, "", start_filename
);
@@ -834,7 +834,7 @@ static int send_new_file_work(
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
) {
while (reply.work_needed(true)) {
while (work_needed(sreq, reply, true)) {
int retval_sow, retval_snfwws;
double frac=((double)rand())/(double)RAND_MAX;
int now = time(0);
@@ -857,7 +857,7 @@ static int send_new_file_work(
if (retval_sow==ERR_NO_APP_VERSION || retval_sow==ERR_INSUFFICIENT_RESOURCE) return retval_sow;
while (reply.work_needed(true) && retry<5) {
while (work_needed(sreq, reply, true) && retry<5) {
log_messages.printf(MSG_DEBUG,
"send_new_file_work(%d): try to send from working set\n", retry
);
@@ -867,7 +867,7 @@ static int send_new_file_work(
}
if (reply.work_needed(true)) {
if (work_needed(sreq, reply, true)) {
log_messages.printf(MSG_DEBUG,
"send_new_file_work(): try deterministic method\n"
);
@@ -878,7 +878,7 @@ static int send_new_file_work(
return 1;
}
}
} // while reply.work_needed(true)
} // while work_needed(sreq, reply, true)
return 0;
}
@@ -898,7 +898,7 @@ static int send_old_work(
DB_RESULT result;
int now=time(0);
if (!reply.work_needed(true)) {
if (!work_needed(sreq, reply, true)) {
return 0;
}
@@ -1067,7 +1067,7 @@ void send_work_locality(
int k = (i+j)%nfiles;
int retval_srff;
if (!reply.work_needed(true)) break;
if (!work_needed(sreq, reply, true)) break;
FILE_INFO& fi = sreq.file_infos[k];
retval_srff=send_results_for_file(
fi.name, nsent, sreq, reply, false
@@ -1081,7 +1081,7 @@ void send_work_locality(
// If the work was not sent for some other (dynamic) reason, such as
// insufficient cpu, then DON'T delete the file.
//
if (nsent == 0 && reply.work_needed(true) && config.file_deletion_strategy == 1) {
if (nsent == 0 && work_needed(sreq, reply, true) && config.file_deletion_strategy == 1) {
reply.file_deletes.push_back(fi);
log_messages.printf(MSG_DEBUG,
"[HOST#%d]: delete file %s (not needed)\n", reply.host.id, fi.name
@@ -1104,7 +1104,7 @@ void send_work_locality(
// send new files if needed
//
if (reply.work_needed(true)) {
if (work_needed(sreq, reply, true)) {
send_new_file_work(sreq, reply);
}
}

sched/sched_send.cpp

@@ -77,10 +77,21 @@ const char* infeasible_string(int code) {
const int MIN_SECONDS_TO_SEND = 0;
const int MAX_SECONDS_TO_SEND = (28*SECONDS_IN_DAY);
inline int effective_ncpus(HOST& host) {
int ncpus = host.p_ncpus;
// return a number that
// - is the # of CPUs in EDF simulation
// - scales the daily result quota
// - scales max_wus_in_progress
inline int effective_ncpus(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
int ncpus = reply.host.p_ncpus;
if (ncpus > config.max_ncpus) ncpus = config.max_ncpus;
if (ncpus < 1) ncpus = 1;
if (config.have_cuda_apps) {
COPROC* cp = sreq.coprocs.lookup("cuda");
if (cp && cp->count > ncpus) {
ncpus = cp->count;
}
}
return ncpus;
}
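
For intuition, a small standalone sketch (not part of the diff; all numbers hypothetical) of how the value returned by effective_ncpus() scales the two limits that work_needed() checks further down:

    // Illustration only: a host with 2 CPUs and 4 CUDA GPUs, on a project
    // that sets <have_cuda_apps>. Names mirror the scheduler's fields; the
    // values are made up.
    #include <algorithm>
    #include <cstdio>

    int main() {
        int p_ncpus = 2;              // host.p_ncpus
        int cuda_gpus = 4;            // count from sreq.coprocs.lookup("cuda")
        int max_results_day = 100;    // host.max_results_day
        int max_wus_in_progress = 2;  // config.max_wus_in_progress

        // with <have_cuda_apps> set, effective NCPUS is max'd with the GPU count
        int ncpus = std::max(p_ncpus, cuda_gpus);                          // 4

        printf("daily result quota:  %d\n", ncpus * max_results_day);      // 400
        printf("in-progress job cap: %d\n", ncpus * max_wus_in_progress);  // 8
        return 0;
    }

The point of the flag: a GPU-heavy host is no longer throttled by a small CPU count when the project actually has CUDA work to send.
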
@@ -653,7 +664,7 @@ int wu_is_infeasible_fast(
}
IP_RESULT candidate("", wu.delay_bound, est_cpu);
strcpy(candidate.name, wu.name);
if (check_candidate(candidate, effective_ncpus(reply.host), request.ip_results)) {
if (check_candidate(candidate, effective_ncpus(request, reply), request.ip_results)) {
// it passed the feasibility test,
// but don't add it to the workload yet;
// wait until we commit to sending it
@@ -833,47 +844,49 @@ void unlock_sema() {
// and we haven't exceeded results per RPC limit,
// and we haven't exceeded results per day limit
//
bool SCHEDULER_REPLY::work_needed(bool locality_sched) {
bool work_needed(
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, bool locality_sched
) {
if (locality_sched) {
// if we've failed to send a result because of a transient condition,
// return false to preserve invariant
//
if (wreq.disk.insufficient || wreq.speed.insufficient || wreq.mem.insufficient || wreq.no_allowed_apps_available) {
if (reply.wreq.disk.insufficient || reply.wreq.speed.insufficient || reply.wreq.mem.insufficient || reply.wreq.no_allowed_apps_available) {
return false;
}
}
if (wreq.seconds_to_fill <= 0) return false;
if (wreq.disk_available <= 0) {
if (reply.wreq.seconds_to_fill <= 0) return false;
if (reply.wreq.disk_available <= 0) {
return false;
}
if (wreq.nresults >= config.max_wus_to_send) return false;
if (reply.wreq.nresults >= config.max_wus_to_send) return false;
int ncpus = effective_ncpus(host);
int ncpus = effective_ncpus(sreq, reply);
// host.max_results_day is between 1 and config.daily_result_quota inclusive
// wreq.daily_result_quota is between ncpus
// and ncpus*host.max_results_day inclusive
//
if (config.daily_result_quota) {
if (host.max_results_day == 0 || host.max_results_day>config.daily_result_quota) {
host.max_results_day = config.daily_result_quota;
if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) {
reply.host.max_results_day = config.daily_result_quota;
}
wreq.daily_result_quota = ncpus*host.max_results_day;
if (host.nresults_today >= wreq.daily_result_quota) {
wreq.daily_result_quota_exceeded = true;
reply.wreq.daily_result_quota = ncpus*reply.host.max_results_day;
if (reply.host.nresults_today >= reply.wreq.daily_result_quota) {
reply.wreq.daily_result_quota_exceeded = true;
return false;
}
}
if (config.max_wus_in_progress) {
if (wreq.nresults_on_host >= config.max_wus_in_progress*ncpus) {
if (reply.wreq.nresults_on_host >= config.max_wus_in_progress*ncpus) {
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"in-progress job limit exceeded; %d > %d*%d\n",
wreq.nresults_on_host, config.max_wus_in_progress, ncpus
reply.wreq.nresults_on_host, config.max_wus_in_progress, ncpus
);
}
wreq.cache_size_exceeded = true;
reply.wreq.cache_size_exceeded = true;
return false;
}
}
@@ -1022,7 +1035,7 @@ int add_result_to_reply(
result.bavp = bavp;
reply.insert_result(result);
reply.wreq.seconds_to_fill -= wu_seconds_filled;
request.estimated_delay += wu_seconds_filled/effective_ncpus(reply.host);
request.estimated_delay += wu_seconds_filled/effective_ncpus(request, reply);
reply.wreq.nresults++;
reply.wreq.nresults_on_host++;
if (!resent_result) reply.host.nresults_today++;
@@ -1407,7 +1420,7 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
if (config.workload_sim && sreq.have_other_results_list) {
init_ip_results(
sreq.global_prefs.work_buf_min(), effective_ncpus(reply.host), sreq.ip_results
sreq.global_prefs.work_buf_min(), effective_ncpus(sreq, reply), sreq.ip_results
);
}
@@ -1450,7 +1463,7 @@ struct JOB_SET {
disk_usage = 0;
disk_limit = reply.wreq.disk_available;
max_jobs = config.max_wus_to_send;
int ncpus = effective_ncpus(reply.host), n;
int ncpus = effective_ncpus(sreq, reply), n;
if (config.daily_result_quota) {
if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) {

sched/sched_send.h

@@ -62,3 +62,4 @@ extern void lock_sema();
extern void unlock_sema();
extern const char* infeasible_string(int);
extern bool app_not_selected(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
extern bool work_needed(SCHEDULER_REQUEST&, SCHEDULER_REPLY&, bool);

sched/server_types.h

@@ -363,7 +363,6 @@ struct SCHEDULER_REPLY {
void insert_workunit_unique(WORKUNIT&);
void insert_result(RESULT&);
void insert_message(USER_MESSAGE&);
bool work_needed(bool locality_sched=false);
void set_delay(double);
void got_good_result(); // adjust max_results_day
void got_bad_result(); // adjust max_results_day