- client: brute-force attempt at eliminating domino-effect preemption:

if job A is unstarted and EDF,
    and there's a job B later in the list that is started,
    has the same app version,
    and has the same arrival time,
    then move A after B.
- client: remove the "temp_dcf" mechanism,
    which had the same goal but didn't work.
- client: in computing overall debt for a project,
    subtract a term that reflects pending work.
    This should reduce repeated fetches from the same project.
- client simulator: tweaks
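
In formula terms, the overall-debt change in the third bullet (implemented in set_overall_debts() in client/work_fetch.cpp below) works out to roughly the following sketch; the names are simplified stand-ins for the fields used in the diff, and queue_est is the estimated instance-seconds of nearly runnable work queued for a resource:

    // sketch only; see set_overall_debts() below for the actual code
    overall_debt = cpu_LTD - cpu_queue_est/ncpus;
    if (have_cuda) overall_debt += cuda_relative_speed*(cuda_LTD - cuda_queue_est/ncuda);
    if (have_ati)  overall_debt += ati_relative_speed*(ati_LTD - ati_queue_est/nati);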

svn path=/trunk/boinc/; revision=20223
David Anderson 2010-01-21 00:14:56 +00:00
parent 3d61129049
commit b5124fe729
10 changed files with 259 additions and 57 deletions

checkin_notes

@ -562,3 +562,27 @@ David 19 Jan 2010
client/
sim.cpp
rr_sim.cpp
David 19 Jan 2010
- client: brute-force attempt at eliminating domino-effect preemption:
if job A is unstarted and EDF,
and there's a job B that is later in the list,
is started, has the same app version,
and has the same arrival time,
move A after B.
- client: remove the "temp_dcf" mechanism,
which had the same goal but didn't work.
- client: in computing overall debt for a project,
subtract a term that reflects pending work.
This should reduce repeated fetches from the same project.
- client simulator: tweaks
client/
client_types.h
cpu_sched.cpp
rr_sim.cpp
sim.cpp,h
sim_util.cpp
work_fetch.cpp,h
html/user/
language_select.php

client/client_types.h

@ -434,7 +434,7 @@ struct APP_VERSION {
double max_working_set_size;
// max working set of tasks using this app version.
// temp var used in schedule_cpus()
double temp_dcf;
//double temp_dcf;
APP_VERSION(){}
~APP_VERSION(){}

client/cpu_sched.cpp

@ -42,6 +42,7 @@
#include <string>
#include <cstring>
#include <list>
#include "str_util.h"
#include "util.h"
@ -59,6 +60,7 @@
#endif
using std::vector;
using std::list;
#define DEADLINE_CUSHION 0
// try to finish jobs this much in advance of their deadline
@ -817,6 +819,67 @@ static void promote_multi_thread_jobs(vector<RESULT*>& runnable_jobs) {
}
}
// if job A is unstarted and EDF,
// and there's a job B later in the list that is started
// and has the same arrival time and app version,
// move A after B
//
static void demote_unstarted_edf(vector<RESULT*>& runnable_jobs) {
list<RESULT*> x;
RESULT *rp, *rp2;
// transfer to a list
//
unsigned int i;
for (i=0; i<runnable_jobs.size(); i++) {
x.push_back(runnable_jobs[i]);
}
// scan backwards through the list.
// if find a started job, scan backwards from there,
// looking for jobs to demote
//
for (list<RESULT*>::iterator p = x.end(); p != x.begin(); --p) {
if (p == x.end()) continue;     // don't dereference end()
rp = *p;
if (rp->not_started()) continue;
list<RESULT*>::iterator q = p;
--q;
while (1) {
rp2 = *q;
if (rp2->not_started()
&& (rp2->received_time==rp->received_time)
&& (rp2->avp==rp->avp)
&& rp2->edf_scheduled
) {
list<RESULT*>::iterator p2 = p;
p2++;
x.insert(p2, rp2);
if (q == x.begin()) {
x.erase(q);
break;
}
list<RESULT*>::iterator q2 = q;
--q;
x.erase(q2);
} else {
if (q == x.begin()) {
break;
}
--q;
}
}
}
// transfer back to vector
//
runnable_jobs.clear();
for (list<RESULT*>::iterator p = x.begin(); p!= x.end(); ++p) {
runnable_jobs.push_back(*p);
}
}
// return true if r0 is more important to run than r1
//
static inline bool more_important(RESULT* r0, RESULT* r1) {
@ -843,7 +906,7 @@ static inline bool more_important(RESULT* r0, RESULT* r1) {
if (!unfin0 && unfin1) return false;
// favor jobs selected first by schedule_cpus()
// (e.g., because their project has high debt)
// (e.g., because their project has high STD)
//
if (r0->seqno < r1->seqno) return true;
if (r0->seqno > r1->seqno) return false;
@ -1196,6 +1259,15 @@ bool CLIENT_STATE::enforce_schedule() {
//
append_unfinished_time_slice(runnable_jobs);
// If an unstarted EDF job has the same app version and arrival time
// as a started job, move it after that job in the list
// (see demote_unstarted_edf() above).
// This is a (crude) mechanism to avoid the situation
// where there's a set of unstarted EDF jobs,
// each of which runs for a little while, leaves EDF,
// and is preempted by the next.
//
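// Illustration with a hypothetical job list (not from this commit):
// suppose the list is [A, B, C], where B is started and A and C are
// unstarted EDF jobs with B's app version and arrival time.
// Without demotion, A preempts B, runs briefly, leaves EDF,
// and is in turn preempted by C (the domino effect).
// demote_unstarted_edf() reorders the list to [B, A, C], so B keeps running.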
demote_unstarted_edf(runnable_jobs);
// sort to-run list by decreasing importance
//
std::sort(

client/rr_sim.cpp

@ -221,6 +221,7 @@ void CLIENT_STATE::print_deadline_misses() {
}
}
#if 0
// compute a per-app-version "temporary DCF" based on the elapsed time
// and fraction done of running jobs
//
@ -238,6 +239,7 @@ void compute_temp_dcf() {
}
}
}
#endif
void CLIENT_STATE::rr_simulation() {
PROJECT* p, *pbest;
@ -248,7 +250,7 @@ void CLIENT_STATE::rr_simulation() {
double ar = available_ram();
work_fetch.rr_init();
compute_temp_dcf();
//compute_temp_dcf();
if (log_flags.rr_simulation) {
msg_printf(0, MSG_INFO,
@ -338,7 +340,8 @@ void CLIENT_STATE::rr_simulation() {
for (i=0; i<sim_status.active.size(); i++) {
rp = sim_status.active[i];
set_rrsim_flops(rp);
rp->rrsim_finish_delay = rp->avp->temp_dcf*rp->rrsim_flops_left/rp->rrsim_flops;
//rp->rrsim_finish_delay = rp->avp->temp_dcf*rp->rrsim_flops_left/rp->rrsim_flops;
rp->rrsim_finish_delay = rp->rrsim_flops_left/rp->rrsim_flops;
if (!rpbest || rp->rrsim_finish_delay < rpbest->rrsim_finish_delay) {
rpbest = rp;
}
@ -348,10 +351,9 @@ void CLIENT_STATE::rr_simulation() {
if (log_flags.rr_simulation) {
msg_printf(pbest, MSG_INFO,
"[rr_sim] %.2f: %s finishes after %.2f (%f * %.2fG/%.2fG)",
"[rr_sim] %.2f: %s finishes after %.2f (%.2fG/%.2fG)",
sim_now - now,
rpbest->name, rpbest->rrsim_finish_delay,
rpbest->avp->temp_dcf,
rpbest->rrsim_flops_left/1e9, rpbest->rrsim_flops/1e9
);
}

client/sim.cpp

@ -189,10 +189,13 @@ void CLIENT_STATE::get_workload(vector<IP_RESULT>& ip_results) {
init_ip_results(work_buf_min(), ncpus, ip_results);
}
// simulate trying to do an RPC
// return false if we didn't actually do one
//
bool CLIENT_STATE::simulate_rpc(PROJECT* _p) {
char buf[256];
SIM_PROJECT* p = (SIM_PROJECT*) _p;
static double last_time=0;
static double last_time=-1e9;
vector<IP_RESULT> ip_results;
int infeasible_count = 0;
@ -206,11 +209,13 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* _p) {
}
last_time = now;
sprintf(buf, "RPC to %s; asking for %f<br>",
p->project_name, cpu_work_fetch.req_secs
sprintf(buf, "RPC to %s; asking for %f/%.2f<br>",
p->project_name, cpu_work_fetch.req_secs, cpu_work_fetch.req_instances
);
html_msg += buf;
msg_printf(0, MSG_INFO, buf);
handle_completed_results();
if (server_uses_workload) {
@ -219,7 +224,8 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* _p) {
bool sent_something = false;
double work_left = cpu_work_fetch.req_secs;
while (work_left > 0) {
double instances_needed = cpu_work_fetch.req_instances;
while (work_left > 0 || instances_needed>0) {
RESULT* rp = new RESULT;
WORKUNIT* wup = new WORKUNIT;
make_job(p, wup, rp);
@ -246,17 +252,18 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* _p) {
);
html_msg += buf;
work_left -= p->duration_correction_factor*wup->rsc_fpops_est/host_info.p_fpops;
instances_needed -= 1;
}
if (cpu_work_fetch.req_secs > 0 && !sent_something) {
p->backoff();
return false;
} else {
}
p->nrpc_failures = 0;
if (sent_something) {
request_schedule_cpus("simulate_rpc");
request_work_fetch("simulate_rpc");
return true;
}
return true;
}
void SIM_PROJECT::backoff() {
@ -269,23 +276,62 @@ void SIM_PROJECT::backoff() {
bool CLIENT_STATE::scheduler_rpc_poll() {
PROJECT *p;
bool action = false;
static double last_time=0;
static double last_work_fetch_time = 0;
double elapsed_time;
// check only every 5 sec
//
if (now - last_time < SCHEDULER_RPC_POLL_PERIOD) {
msg_printf(NULL, MSG_INFO, "RPC poll: not time %f - %f < %f",
now, last_time, SCHEDULER_RPC_POLL_PERIOD
);
return false;
}
last_time = now;
msg_printf(NULL, MSG_INFO, "RPC poll start");
while (1) {
p = next_project_sched_rpc_pending();
if (p) {
return simulate_rpc(p);
work_fetch.compute_work_request(p);
action = simulate_rpc(p);
break;
}
p = find_project_with_overdue_results();
if (p) {
return simulate_rpc(p);
work_fetch.compute_work_request(p);
action = simulate_rpc(p);
break;
}
// should we check work fetch? Do this at most once/minute
if (must_check_work_fetch) {
last_work_fetch_time = 0;
}
elapsed_time = now - last_work_fetch_time;
if (elapsed_time < WORK_FETCH_PERIOD) {
return false;
}
must_check_work_fetch = false;
last_work_fetch_time = now;
p = work_fetch.choose_project();
if (p) {
return simulate_rpc(p);
action = simulate_rpc(p);
break;
}
msg_printf(NULL, MSG_INFO, "RPC poll: nothing to do");
return false;
break;
}
if (action) {
msg_printf(p, MSG_INFO, "RPC poll: did an RPC");
} else {
msg_printf(0, MSG_INFO, "RPC poll: didn't do an RPC");
}
return action;
}
bool ACTIVE_TASK_SET::poll() {
@ -303,9 +349,9 @@ bool ACTIVE_TASK_SET::poll() {
for (i=0; i<gstate.projects.size(); i++) {
p = (SIM_PROJECT*) gstate.projects[i];
p->idle = true;
sprintf(buf, "%s STD: %f min RPC<br>",
sprintf(buf, "%s STD: %f LTD %f<br>",
p->project_name, p->cpu_pwf.short_term_debt,
time_to_string(p->min_rpc_time)
p->pwf.overall_debt
);
gstate.html_msg += buf;
}
@ -606,11 +652,16 @@ void CLIENT_STATE::html_end(bool show_next) {
void CLIENT_STATE::simulate() {
bool action;
now = 0;
double start = START_TIME;
now = start;
html_start(false);
msg_printf(0, MSG_INFO,
"starting simultion. delta %f duration %f", delta, duration
);
while (1) {
running = host_info.available.sample(now);
while (1) {
msg_printf(0, MSG_INFO, "polling");
action = active_tasks.poll();
if (running) {
action |= handle_finished_apps();
@ -618,9 +669,11 @@ void CLIENT_STATE::simulate() {
action |= enforce_schedule();
action |= scheduler_rpc_poll();
}
msg_printf(0, MSG_INFO, action?"did action":"did no action");
if (!action) break;
}
now += delta;
msg_printf(0, MSG_INFO, "took time step");
for (unsigned int i=0; i<active_tasks.active_tasks.size(); i++) {
ACTIVE_TASK* atp = active_tasks.active_tasks[i];
if (atp->task_state() == PROCESS_EXECUTING) {
@ -628,7 +681,7 @@ void CLIENT_STATE::simulate() {
}
}
html_rec();
if (now > duration) break;
if (now > start + duration) break;
}
html_end(false);
}

client/sim.h

@ -90,7 +90,7 @@ public:
double lambda;
int parse(XML_PARSER&, char* end_tag);
bool sample(double);
void init();
void init(double);
RANDOM_PROCESS();
};
@ -313,6 +313,8 @@ extern bool dual_dcf;
extern bool work_fetch_old;
extern bool gpus_usable;
#define SCHEDULER_RPC_POLL_PERIOD 5.0
#define WORK_FETCH_PERIOD 60
#define CPU_SCHED_ENFORCE_PERIOD 60
@ -325,3 +327,6 @@ extern bool gpus_usable;
#define MAX_STD (86400)
// maximum short-term debt
#define START_TIME 946684800
// Jan 1 2000

client/sim_util.cpp

@ -395,6 +395,10 @@ bool RANDOM_PROCESS::sample(double t) {
value = true;
}
}
msg_printf(0, MSG_INFO,
"value: %d lambda: %f t %f time_left %f",
value, lambda, t, time_left
);
return value;
}
@ -402,7 +406,8 @@ RANDOM_PROCESS::RANDOM_PROCESS() {
frac = 1;
}
void RANDOM_PROCESS::init() {
void RANDOM_PROCESS::init(double st) {
last_time = st;
value = true;
time_left = exponential(lambda);
off_lambda = lambda/frac - lambda;
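
The hunks above show only fragments of RANDOM_PROCESS::sample() and init(). For context on the new init(double st) signature, here is a minimal self-contained sketch, under the assumption (not confirmed by this diff) that RANDOM_PROCESS models on/off availability with exponentially distributed period lengths: "on" periods have mean lambda, "off" periods have mean off_lambda, so the long-run on-fraction is frac.

    // Sketch of an on/off availability process (editorial assumption,
    // not the committed RANDOM_PROCESS code).
    #include <cmath>
    #include <cstdlib>

    struct ON_OFF_PROCESS {
        double lambda;       // mean length of an "on" period
        double frac;         // long-run fraction of time spent "on"
        double off_lambda;   // mean length of an "off" period
        double last_time;    // time of the previous sample
        double time_left;    // time until the next on/off transition
        bool value;          // current state: true = "on"

        // draw from an exponential distribution with the given mean
        static double exponential(double mean) {
            double u = (std::rand() + 1.0)/((double)RAND_MAX + 2.0);
            return -mean*std::log(u);
        }
        void init(double st) {
            last_time = st;
            value = true;
            time_left = exponential(lambda);
            off_lambda = lambda/frac - lambda;
        }
        // advance the process to time t; return whether it is "on" at t
        bool sample(double t) {
            double dt = t - last_time;
            last_time = t;
            while (dt > time_left) {
                dt -= time_left;
                value = !value;
                time_left = exponential(value ? lambda : off_lambda);
            }
            time_left -= dt;
            return value;
        }
    };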
@ -502,23 +507,23 @@ int SIM_HOST::parse(XML_PARSER& xp) {
bool is_tag;
int retval;
p_ncpus = 1;
connection_interval = 0;
p_ncpus = 1;
while(!xp.get(tag, sizeof(tag), is_tag)) {
if (!is_tag) return ERR_XML_PARSE;
if (!strcmp(tag, "/host")) return 0;
else if (xp.parse_double(tag, "p_fpops", p_fpops)) continue;
else if (xp.parse_double(tag, "m_nbytes", m_nbytes)) continue;
else if (xp.parse_double(tag, "connection_interval", connection_interval)) continue;
else if (xp.parse_int(tag, "p_ncpus", p_ncpus)) continue;
else if (xp.parse_double(tag, "connection_interval", connection_interval)) continue;
else if (!strcmp(tag, "available")) {
retval = available.parse(xp, "/available");
if (retval) return retval;
available.init();
available.init(START_TIME);
} else if (!strcmp(tag, "idle")) {
retval = idle.parse(xp, "/idle");
if (retval) return retval;
idle.init();
idle.init(START_TIME);
} else {
printf("unrecognized: %s\n", tag);
return ERR_XML_PARSE;

client/work_fetch.cpp

@ -513,12 +513,13 @@ void RSC_WORK_FETCH::update_long_term_debts() {
double offset;
double delta_limit = secs_this_debt_interval;
if (max_debt > -2*delta_limit) {
if (fabs(max_debt) < 1e-6) max_debt = 0;
offset = max_debt?-max_debt:0; // avoid -0
} else {
offset = 2*delta_limit;
}
if (log_flags.debt_debug) {
msg_printf(0, MSG_INFO, "[debt] %s LTD: adding offset %.2f",
msg_printf(0, MSG_INFO, "[debt] %s LTD: adding offset %f",
rsc_name(rsc_type), offset
);
}
@ -658,15 +659,43 @@ void WORK_FETCH::set_all_requests(PROJECT* p) {
}
}
// Compute an "overall long-term debt" for each project.
// This is a sum of per-resource terms, scaled by the relative speed of the resource.
// The term for a resource is its LTD minus an estimate of queued work.
//
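// Example with hypothetical numbers: on a 4-CPU host, a project with
// CPU LTD 3600 and 8000 instance-seconds of queued CPU work gets
// overall_debt = 3600 - 8000/4 = 1600, so queued work lowers the
// project's standing in the next work-fetch decision.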
void WORK_FETCH::set_overall_debts() {
for (unsigned i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
p->pwf.overall_debt = p->cpu_pwf.long_term_debt;
unsigned int i;
PROJECT* p;
RESULT* rp;
APP_VERSION* avp;
for (i=0; i<gstate.projects.size(); i++) {
p = gstate.projects[i];
p->cpu_pwf.queue_est = 0;
p->cuda_pwf.queue_est = 0;
p->ati_pwf.queue_est = 0;
}
for (i=0; i<gstate.results.size(); i++) {
rp = gstate.results[i];
p = rp->project;
if (!rp->nearly_runnable()) continue;
if (p->non_cpu_intensive) continue;
double dt = rp->estimated_time_remaining(false);
avp = rp->avp;
p->cpu_pwf.queue_est += dt*avp->avg_ncpus;
p->cuda_pwf.queue_est += dt*avp->ncudas;
p->ati_pwf.queue_est += dt*avp->natis;
}
for (i=0; i<gstate.projects.size(); i++) {
p = gstate.projects[i];
p->pwf.overall_debt = p->cpu_pwf.long_term_debt - p->cpu_pwf.queue_est/gstate.ncpus;
if (coproc_cuda) {
p->pwf.overall_debt += cuda_work_fetch.speed*p->cuda_pwf.long_term_debt;
p->pwf.overall_debt += cuda_work_fetch.relative_speed*
(p->cuda_pwf.long_term_debt - p->cuda_pwf.queue_est/coproc_cuda->count);
}
if (coproc_ati) {
p->pwf.overall_debt += ati_work_fetch.speed*p->ati_pwf.long_term_debt;
p->pwf.overall_debt += ati_work_fetch.relative_speed*
(p->ati_pwf.long_term_debt - p->ati_pwf.queue_est/coproc_ati->count);
}
}
}
@ -769,6 +798,10 @@ PROJECT* WORK_FETCH::non_cpu_intensive_project_needing_work() {
PROJECT* WORK_FETCH::choose_project() {
PROJECT* p = 0;
if (log_flags.work_fetch_debug) {
msg_printf(0, MSG_INFO, "[wfd]: work fetch start");
}
p = non_cpu_intensive_project_needing_work();
if (p) return p;
@ -893,6 +926,7 @@ void WORK_FETCH::compute_shares() {
if (!p->pwf.can_fetch_work) continue;
if (p->cpu_pwf.may_have_work) {
p->cpu_pwf.fetchable_share = p->resource_share/cpu_work_fetch.total_fetchable_share;
msg_printf(p, MSG_INFO, "FS: %f = %f/%f\n", p->cpu_pwf.fetchable_share, p->resource_share, cpu_work_fetch.total_fetchable_share);
}
if (coproc_cuda && p->cuda_pwf.may_have_work) {
p->cuda_pwf.fetchable_share = p->resource_share/cuda_work_fetch.total_fetchable_share;

client/work_fetch.h

@ -45,10 +45,14 @@ struct RSC_PROJECT_WORK_FETCH {
// the following used by debt accounting
double anticipated_debt;
// short-term debt, adjusted by scheduled jobs
double secs_this_debt_interval;
inline void reset_debt_accounting() {
secs_this_debt_interval = 0;
}
double queue_est;
// an estimate of instance-secs of queued work;
// a temp used in computing overall debts
// the following are used by rr_simulation()
//
@ -157,7 +161,7 @@ struct BUSY_TIME_ESTIMATOR {
struct RSC_WORK_FETCH {
int rsc_type;
int ninstances;
double speed; // total FLOPS relative to CPU total FLOPS
double relative_speed; // total FLOPS relative to CPU total FLOPS
// the following used/set by rr_simulation():
//
@ -180,7 +184,7 @@ struct RSC_WORK_FETCH {
void init(int t, int n, double sp) {
rsc_type = t;
ninstances = n;
speed = sp;
relative_speed = sp;
busy_time_estimator.init(n);
}
// the following specify the work request for this resource

html/user/language_select.php

@ -41,24 +41,27 @@ if ($set_lang){
page_head(tra("Language selection"));
echo "<p>",
tra("This web site is available in several languages. ".
"The currently selected language is: %1 (%2).",
tra(
"This web site is available in several languages. The currently selected language is: %1 (%2).",
"<em>".tra("LANG_NAME_INTERNATIONAL")."</em>",
tra("LANG_NAME_NATIVE")),
tra("LANG_NAME_NATIVE")
),
"</p>",
"<p>",
tra("Normally the choice of language ".
"is determined by your browser's language setting, ".
"which is: %1. ".
"You can change this setting using:", "<b>$prefs</b>"),
tra(
"Normally the choice of language is determined by your browser's language setting, which is: %1. You can change this setting using:",
"<b>$prefs</b>"
),
"</p><ul>",
"<li>", tra("Firefox: Tools/Options/General"),
"<li>", tra("Microsoft IE: Tools/Internet Options/Languages"),
"<li>",
tra("Firefox: Tools/Options/General"),
"<li>",
tra("Microsoft IE: Tools/Internet Options/Languages"),
"</ul>",
"<p>",
tra("Or you can select a language by clicking on one of the links. ".
"This will send your browser a cookie; ".
"make sure your browser accepts cookies from our domain."),
tra(
"Or you can select a language by clicking on one of the links. This will send your browser a cookie; make sure your browser accepts cookies from our domain."
),
"</p>"
;