- client: add logic to work fetch so that each project
    will have enough jobs to use its share of resource instances.
    This avoids situations where, e.g., on a 2-CPU system
    a project has a 75% resource share and a single 1-CPU job,
    and its STD (short-term debt) increases without bound.

    Did a general cleanup of the logic for computing
    work request sizes (seconds and instances).

svn path=/trunk/boinc/; revision=20036
David Anderson 2009-12-24 20:40:27 +00:00
parent 868c18d2b2
commit 876522c6aa
5 changed files with 94 additions and 84 deletions
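
The arithmetic behind the fix, as a minimal sketch: the names follow the patch (nused_total, fetchable_share, ninstances), but the standalone function and main() are illustrative, not committed code.

#include <cstdio>

// A project is "starved" for a resource if the instances its runnable
// jobs use (nused_total) fall short of its share of the resource's
// instances. The real test lives in RSC_WORK_FETCH::choose_project()
// under the FETCH_IF_PROJECT_STARVED criterion.
static bool project_starved(double nused_total, double fetchable_share, int ninstances) {
    return nused_total < ninstances * fetchable_share;
}

int main() {
    // The case from the commit message: 2 CPUs, 75% resource share,
    // a single 1-CPU job. The project deserves 1.5 instances but uses
    // only 1, so it is starved and work is fetched for it.
    printf("%s\n", project_starved(1.0, 0.75, 2) ? "starved" : "not starved");
    return 0;
}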

checkin_notes

@@ -10725,3 +10725,19 @@ David 24 Dec 2009
client/
work_fetch.cpp
David 24 Dec 2009
- client: add logic to work fetch so that each project
will have enough jobs to use its share of resource instances.
This avoids situations where e.g. on a 2-CPU system
a project has 75% resource share and 1 CPU job,
and its STD increases without bound.
Did a general cleanup of the logic for computing
work request sizes (seconds and instances).
client/
work_fetch.cpp,h
rr_sim.cpp
sched/
handle_request.cpp

client/rr_sim.cpp

@@ -266,7 +266,9 @@ void CLIENT_STATE::rr_simulation() {
p = rp->project;
p->pwf.has_runnable_jobs = true;
p->cpu_pwf.nused_total += rp->avp->avg_ncpus;
if (rp->uses_cuda()) {
p->cuda_pwf.nused_total += rp->avp->ncudas;
p->cuda_pwf.has_runnable_jobs = true;
if (cuda_work_fetch.sim_nused < coproc_cuda->count) {
sim_status.activate(rp, 0);
@@ -275,6 +277,7 @@ void CLIENT_STATE::rr_simulation() {
cuda_work_fetch.pending.push_back(rp);
}
} else if (rp->uses_ati()) {
p->ati_pwf.nused_total += rp->avp->natis;
p->ati_pwf.has_runnable_jobs = true;
if (ati_work_fetch.sim_nused < coproc_ati->count) {
sim_status.activate(rp, 0);
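
The hunk above feeds the new starvation test. A rough sketch of the accumulation, with hypothetical stand-in types but field names mirroring the patch (avg_ncpus, ncudas, nused_total):

struct SimJob { double avg_ncpus; double ncudas; };
struct ProjectUsage { double cpu_nused_total; double cuda_nused_total; };

void accumulate(ProjectUsage& p, const SimJob& job) {
    // Every runnable job counts toward nused_total, whether or not the
    // simulation can run it right now; sim_nused, by contrast, only
    // counts jobs activated within the instance limit.
    p.cpu_nused_total += job.avg_ncpus;
    if (job.ncudas > 0) {
        p.cuda_nused_total += job.ncudas;
    }
}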

client/work_fetch.cpp

@@ -104,6 +104,7 @@ void RSC_PROJECT_WORK_FETCH::rr_init(PROJECT* p, int rsc_type) {
fetchable_share = 0;
has_runnable_jobs = false;
sim_nused = 0;
nused_total = 0;
deadlines_missed = 0;
}
@@ -204,10 +205,20 @@ bool RSC_PROJECT_WORK_FETCH::overworked() {
// If resource is saturated for less than work_buf_total(),
// get work for it from the non-overworked project with greatest LTD.
#define FETCH_IF_PROJECT_STARVED 3
// If any project is not overworked and has no runnable jobs
// (for any resource, not just this one)
// If any project is not overworked and has too few jobs
// to use its instance share,
// get work from the one with greatest LTD.
static const char* criterion_name(int criterion) {
switch (criterion) {
case FETCH_IF_IDLE_INSTANCE: return "idle instance";
case FETCH_IF_MAJOR_SHORTFALL: return "major shortfall";
case FETCH_IF_MINOR_SHORTFALL: return "minor shortfall";
case FETCH_IF_PROJECT_STARVED: return "starved";
}
return "unknown";
}
// Choose the best project to ask for work for this resource,
// given the specific criterion
//
@@ -241,7 +252,7 @@ PROJECT* RSC_WORK_FETCH::choose_project(int criterion) {
break;
case FETCH_IF_PROJECT_STARVED:
if (rpwf.overworked()) continue;
if (p->pwf.has_runnable_jobs) continue;
if (rpwf.nused_total >= ninstances*rpwf.fetchable_share) continue;
break;
}
if (pbest) {
@@ -254,67 +265,68 @@ PROJECT* RSC_WORK_FETCH::choose_project(int criterion) {
}
if (!pbest) return NULL;
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s %s", criterion_name(criterion), rsc_name(rsc_type)
);
}
// decide how much work to request from each resource
//
work_fetch.clear_request();
switch (criterion) {
case FETCH_IF_IDLE_INSTANCE:
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s idle instance", rsc_name(rsc_type)
);
}
req = share_request(pbest);
if (req > shortfall) req = shortfall;
set_request(pbest, req);
set_request(pbest);
break;
case FETCH_IF_MAJOR_SHORTFALL:
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s major shortfall", rsc_name(rsc_type)
);
}
req = share_request(pbest);
if (req > shortfall) req = shortfall;
set_request(pbest, req);
case FETCH_IF_PROJECT_STARVED:
set_request(pbest);
break;
case FETCH_IF_MINOR_SHORTFALL:
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s minor shortfall", rsc_name(rsc_type)
);
}
work_fetch.set_shortfall_requests(pbest);
break;
case FETCH_IF_PROJECT_STARVED:
if (log_flags.work_fetch_debug) {
msg_printf(pbest, MSG_INFO,
"chosen: %s starved", rsc_name(rsc_type)
);
}
req = share_request(pbest);
set_request(pbest, req);
// in this case, potentially request work for all resources
//
work_fetch.set_all_requests(pbest);
break;
}
return pbest;
}
void WORK_FETCH::set_shortfall_requests(PROJECT* p) {
cpu_work_fetch.set_shortfall_request(p);
if (coproc_cuda && gpus_usable) {
cuda_work_fetch.set_shortfall_request(p);
}
if (coproc_ati && gpus_usable) {
ati_work_fetch.set_shortfall_request(p);
}
}
void RSC_WORK_FETCH::set_shortfall_request(PROJECT* p) {
// request this project's share of shortfall and instances
//
void RSC_WORK_FETCH::set_request(PROJECT* p) {
if (!shortfall) return;
RSC_PROJECT_WORK_FETCH& w = project_state(p);
if (!w.may_have_work) return;
if (w.overworked()) return;
set_request(p, shortfall);
double dcf = p->duration_correction_factor;
if (dcf < 0.02 || dcf > 80.0) {
// if project's DCF is too big or small,
// its completion time estimates are useless; just ask for 1 second
//
req_secs = 1;
} else {
req_secs = shortfall * w.fetchable_share;
}
// the number of additional instances needed to have our share
//
double x1 = (ninstances * w.fetchable_share) - w.nused_total;
// our share of the idle instances
//
double x2 = nidle_now * w.fetchable_share;
req_instances = std::max(x1, x2);
}
void WORK_FETCH::set_all_requests(PROJECT* p) {
cpu_work_fetch.set_request(p);
if (coproc_cuda && gpus_usable) {
cuda_work_fetch.set_request(p);
}
if (coproc_ati && gpus_usable) {
ati_work_fetch.set_request(p);
}
}
void WORK_FETCH::set_overall_debts() {
@@ -454,7 +466,7 @@ void WORK_FETCH::compute_work_request(PROJECT* p) {
// Otherwise we can have a situation where a GPU is idle,
// we ask only for GPU work, and the project never has any
//
work_fetch.set_shortfall_requests(pbest);
work_fetch.set_all_requests(pbest);
return;
}
@@ -548,27 +560,6 @@ PROJECT* WORK_FETCH::choose_project() {
return p;
}
double RSC_WORK_FETCH::share_request(PROJECT* p) {
double dcf = p->duration_correction_factor;
if (dcf < 0.02 || dcf > 80.0) {
// if project's DCF is too big or small,
// its completion time estimates are useless; just ask for 1 second
//
return 1;
} else {
// otherwise ask for the project's share
//
RSC_PROJECT_WORK_FETCH& w = project_state(p);
return gstate.work_buf_total()*w.fetchable_share;
}
}
void RSC_WORK_FETCH::set_request(PROJECT* p, double r) {
RSC_PROJECT_WORK_FETCH& w = project_state(p);
req_secs = r;
req_instances = (int)ceil(w.fetchable_share*nidle_now);
}
void WORK_FETCH::accumulate_inst_sec(ACTIVE_TASK* atp, double dt) {
APP_VERSION* avp = atp->result->avp;
PROJECT* p = atp->result->project;
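
Putting the new RSC_WORK_FETCH::set_request() together as a standalone sketch: the field names follow the patch, but the free function and its signature are illustrative.

#include <algorithm>

struct Request { double secs; double instances; };

// Sketch of the unified request computation: seconds come from this
// project's share of the shortfall (with a sanity clamp on DCF);
// instances are whichever is larger, the instances still needed to
// reach the project's share or its share of the idle instances.
Request compute_request(
    double shortfall, double fetchable_share, double nused_total,
    double nidle_now, int ninstances, double dcf
) {
    Request r = {0, 0};
    if (!shortfall) return r;
    if (dcf < 0.02 || dcf > 80.0) {
        // DCF out of range: completion-time estimates are useless,
        // so ask for a token 1 second of work
        r.secs = 1;
    } else {
        r.secs = shortfall * fetchable_share;
    }
    double x1 = ninstances * fetchable_share - nused_total;  // to reach our share
    double x2 = nidle_now * fetchable_share;                 // our share of idle
    r.instances = std::max(x1, x2);
    return r;
}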

client/work_fetch.h

@@ -62,6 +62,7 @@ struct RSC_PROJECT_WORK_FETCH {
// determines how many instances this project deserves
bool has_runnable_jobs;
double sim_nused;
double nused_total; // sum of instances over all runnable jobs
int deadlines_missed;
int deadlines_missed_copy;
// copy of the above used during schedule_cpus()
@@ -185,7 +186,7 @@ struct RSC_WORK_FETCH {
// the following specify the work request for this resource
//
double req_secs;
int req_instances;
double req_instances;
// debt accounting
double secs_this_debt_interval;
@@ -204,9 +205,7 @@ struct RSC_WORK_FETCH {
void update_short_term_debts();
void print_state(const char*);
void clear_request();
void set_request(PROJECT*, double);
double share_request(PROJECT*);
void set_shortfall_request(PROJECT*);
void set_request(PROJECT*);
bool may_have_work(PROJECT*);
RSC_WORK_FETCH() {
memset(this, 0, sizeof(*this));
@@ -243,7 +242,7 @@ struct WORK_FETCH {
PROJECT*, SCHEDULER_REPLY*, std::vector<RESULT*> new_results
);
void set_initial_work_request();
void set_shortfall_requests(PROJECT*);
void set_all_requests(PROJECT*);
void print_state();
void init();
void rr_init();
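
Why req_instances becomes a double: with fractional resource shares the request itself is fractional. Sample numbers below, not from the patch.

#include <cstdio>

int main() {
    double nidle_now = 2;           // two idle CPU instances
    double fetchable_share = 0.75;  // this project's share
    // the old int field would have had to round this up or down
    printf("req_instances = %.1f\n", nidle_now * fetchable_share);  // 1.5
    return 0;
}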

sched/handle_request.cpp

@@ -336,7 +336,7 @@ int authenticate_user() {
);
goto make_new_host;
}
} else {
// Here no hostid was given, or the ID was bad.
// Look up the user, then create a new host record
@@ -480,7 +480,7 @@ got_host:
user.update_field(buf);
}
}
return 0;
}
@@ -585,7 +585,7 @@ int send_result_abort() {
DB_IN_PROGRESS_RESULT result;
std::string result_names;
unsigned int i;
if (g_request->other_results.size() == 0) {
return 0;
}
@@ -595,6 +595,7 @@ int send_result_abort() {
for (i=0; i<g_request->other_results.size(); i++) {
OTHER_RESULT& orp=g_request->other_results[i];
orp.abort = true;
// if the host has a result not in the DB, abort it
orp.abort_if_not_started = false;
orp.reason = ABORT_REASON_NOT_FOUND;
if (i > 0) result_names.append(", ");
@@ -619,7 +620,7 @@ int send_result_abort() {
orp.abort = true;
orp.abort_if_not_started = false;
orp.reason = ABORT_REASON_WU_CANCELLED;
} else if ( result.assimilate_state == ASSIMILATE_DONE ) {
} else if (result.assimilate_state == ASSIMILATE_DONE) {
// if the WU has been assimilated, abort if not started
//
orp.abort = false;
@@ -658,8 +659,8 @@ int send_result_abort() {
log_messages.printf(MSG_NORMAL,
"[HOST#%d]: Send result_abort for result %s; reason: %s\n",
g_reply->host.id, orp.name, reason_str(orp.reason)
);
// send user message
);
// send user message
char buf[256];
sprintf(buf, "Result %s is no longer usable", orp.name);
g_reply->insert_message(buf, "high");
@@ -668,10 +669,10 @@ int send_result_abort() {
log_messages.printf(MSG_NORMAL,
"[HOST#%d]: Send result_abort_if_unstarted for result %s; reason %d\n",
g_reply->host.id, orp.name, orp.reason
);
);
}
}
return aborts_sent;
}
@@ -857,11 +858,11 @@ void warn_user_if_core_client_upgrade_scheduled() {
remaining /= 3600;
if (0 < remaining) {
char msg[512];
int days = remaining / 24;
int hours = remaining % 24;
sprintf(msg,
"Starting in %d days and %d hours, project will require a minimum "
"BOINC core client version of %d.%d.0. You are currently using "
@@ -1275,7 +1276,7 @@ void process_request(char* code_sign_key) {
send_result_abort();
}
}
if (requesting_work()) {
if (!send_code_sign_key(code_sign_key)) {
ok_to_send_work = false;
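
For reference, the three abort cases visible in send_result_abort(), restated as a sketch: OTHER_RESULT is reduced to the fields the hunks touch, the enum values are illustrative, and the assignments in the assimilated branch are partly inferred from the surrounding comment.

struct OtherResult { bool abort; bool abort_if_not_started; int reason; };

enum {
    ABORT_REASON_NOT_FOUND = 1,     // values illustrative
    ABORT_REASON_WU_CANCELLED = 2,
    ABORT_REASON_ASSIMILATED = 3    // hypothetical constant
};

void decide_abort(OtherResult& orp, bool in_db, bool wu_cancelled, bool assimilated) {
    if (!in_db) {
        // the host reports a result not in the DB: abort it outright
        orp.abort = true;
        orp.abort_if_not_started = false;
        orp.reason = ABORT_REASON_NOT_FOUND;
    } else if (wu_cancelled) {
        orp.abort = true;
        orp.abort_if_not_started = false;
        orp.reason = ABORT_REASON_WU_CANCELLED;
    } else if (assimilated) {
        // the WU has been assimilated: abort only if not started
        orp.abort = false;
        orp.abort_if_not_started = true;   // inferred
        orp.reason = ABORT_REASON_ASSIMILATED;
    }
}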