- validator: if credit calculation returns an error,

wait 6 hours before retrying svn path=/trunk/boinc/; revision=22418
2010-09-28 20:17:09 +00:00 · 2010-09-28 20:17:09 +00:00 · 794214208f
parent 02da1b01c0
commit 794214208f
7 changed files with 177 additions and 147 deletions
--- a/15
+++ b/15
@ -1,4 +1,3 @@
-
 David  3 Jan 2010
    - client: ATI available RAM wasn't being detected correctly

@ -6848,6 +6847,19 @@ David  27 Sept 2010
        cs_statefile.cpp
        scheduler_op.cpp
        time_stats.cpp
+<<<<<<< .mine
+
+David  28 Sept 2010
+    - validator: if credit calculation returns an error,
+        wait 6 hours before retrying
+
+    sched/
+        validator.cpp
+    client/
+        sim.cpp,h
+        client_state.h
+        cs_statefile.cpp
+=======

 Rom    28 Sept 2010
    - MGR: Make the new BOINC skin the default skin.
@ -6871,3 +6883,4 @@ Rom    28 Sept 2010
    /
        configure.ac
        version.h
+>>>>>>> .r22417
--- a/client/client_state.h
+++ b/client/client_state.h
@ -467,8 +467,7 @@ struct CLIENT_STATE {
    double monotony();

    void do_client_simulation();
-    void make_job(PROJECT*, WORKUNIT*, RESULT*);
-    void handle_completed_results();
+    void handle_completed_results(PROJECT*);
    void get_workload(vector<IP_RESULT>&);
    void simulate();
    bool simulate_rpc(PROJECT*);
--- a/client/cs_statefile.cpp
+++ b/client/cs_statefile.cpp
@ -388,7 +388,15 @@ int CLIENT_STATE::parse_state_file() {
            continue;
        }
        if (match_tag(buf, "<host_info>")) {
+            FILE* f2 = fopen("temp", "w");
+#ifdef SIM
+            fprintf(f2, "FOOBAR\n");
+            retval = host_info.parse(mf, false);
+#else
+            fprintf(f2, "BLAH\n");
            retval = host_info.parse(mf, true);
+#endif
+            fclose(f2);
            if (retval) {
                msg_printf(NULL, MSG_INTERNAL_ERROR,
                    "Can't parse host info in state file"
--- a/client/sim.cpp
+++ b/client/sim.cpp
@ -88,37 +88,70 @@ void PROJECT::update_dcf_stats(RESULT* rp) {
    return;
 }

+APP* choose_app(vector<APP*>& apps) {
+    double x = drand();
+    double sum = 0;
+    unsigned int i;
+
+    for (i=0; i<apps.size(); i++) {
+        sum += apps[i]->weight;
+    }
+    for (i=0; i<apps.size(); i++) {
+        APP* app = apps[i];
+        x -= app->weight/sum;
+        if (x <= 0) {
+            return app;
+        }
+    }
+    return apps.back();
+}
+
+bool app_version_needs_work(APP_VERSION* avp) {
+    if (avp->ncudas) {
+        return (cuda_work_fetch.req_secs>0 || cuda_work_fetch.req_instances>0);
+    }
+    if (avp->natis) {
+        return (ati_work_fetch.req_secs>0 || ati_work_fetch.req_instances>0);
+    }
+    return (cpu_work_fetch.req_secs>0 || cpu_work_fetch.req_instances>0);
+}
+
+bool has_app_version_needing_work(APP* app) {
+    for (unsigned int i=0; i<gstate.app_versions.size(); i++) {
+        APP_VERSION* avp = gstate.app_versions[i];
+        if (avp->app != app) continue;
+        if (app_version_needs_work(avp)) return true;
+    }
+    return false;
+}
+
+// choose a version for this app for which we need work
+//
+APP_VERSION* choose_app_version(APP* app) {
+    APP_VERSION* best_avp = NULL;
+    for (unsigned int i=0; i<gstate.app_versions.size(); i++) {
+        APP_VERSION* avp = gstate.app_versions[i];
+        if (avp->app != app) continue;
+        if (!app_version_needs_work(avp)) continue;
+        if (!best_avp) {
+            best_avp = avp;
+        } else if (avp->flops > best_avp->flops) {
+            best_avp = avp;
+        }
+    }
+    return best_avp;
+}
+
 // generate a job; pick a random app for this project,
 // and pick a FLOP count from its distribution
 //
-void CLIENT_STATE::make_job(PROJECT* p, WORKUNIT* wup, RESULT* rp) {
-    APP* ap1, *ap=0;
-    double net_fpops = host_info.p_fpops;
-    double x = drand();
-    unsigned int i;
-
-    for (i=0; i<apps.size();i++) {
-        ap1 = apps[i];
-        if (ap1->project != p) continue;
-        x -= ap1->weight;
-        if (x <= 0) {
-            ap = ap1;
-            break;
-        }
-    }
-    if (!ap) {
-        printf("ERROR - NO APP\n");
-        exit(1);
-    }
+void make_job(
+    PROJECT* p, WORKUNIT* wup, RESULT* rp, vector<APP*>app_list
+) {
+    APP* app = choose_app(app_list);
+    APP_VERSION* avp = choose_app_version(app);
    rp->clear();
-    rp->avp = 0;
-    for (i=0; i<gstate.app_versions.size(); i++) {
-        APP_VERSION* avp = gstate.app_versions[i];
-        if (avp->app == ap) {
-            rp->avp = avp;
-            break;
-        }
-    }
+    rp->avp = avp;
    if (!rp->avp) {
        printf("ERROR - NO APP VERSION\n");
        exit(1);
@ -127,23 +160,23 @@ void CLIENT_STATE::make_job(PROJECT* p, WORKUNIT* wup, RESULT* rp) {
    rp->wup = wup;
    sprintf(rp->name, "%s_%d", p->project_name, p->result_index++);
    wup->project = p;
-    wup->rsc_fpops_est = ap->fpops_est;
-    double ops = ap->fpops.sample();
+    wup->rsc_fpops_est = app->fpops_est;
+    double ops = app->fpops.sample();
    if (ops < 0) ops = 0;
-    rp->final_cpu_time = ops/net_fpops;
-    rp->report_deadline = now + ap->latency_bound;
+    rp->final_cpu_time = ops/avp->flops;
+    rp->report_deadline = gstate.now + app->latency_bound;
 }

 // process ready-to-report results
 //
-void CLIENT_STATE::handle_completed_results() {
+void CLIENT_STATE::handle_completed_results(PROJECT* p) {
    char buf[256];
    vector<RESULT*>::iterator result_iter;

    result_iter = results.begin();
    while (result_iter != results.end()) {
        RESULT* rp = *result_iter;
-        if (rp->ready_to_report) {
+        if (rp->project == p && rp->ready_to_report) {
            sprintf(buf, "result %s reported; %s<br>",
                rp->name,
                (gstate.now > rp->report_deadline)?
@ -185,6 +218,31 @@ void CLIENT_STATE::get_workload(vector<IP_RESULT>& ip_results) {
    init_ip_results(work_buf_min(), ncpus, ip_results);
 }

+void get_apps_needing_work(PROJECT* p, vector<APP*>& apps) {
+    apps.clear();
+    for (unsigned int i=0; i<gstate.apps.size(); i++) {
+        APP* app = gstate.apps[i];
+        if (app->project != p) continue;
+        if (!has_app_version_needing_work(app)) continue;
+        apps.push_back(app);
+    }
+}
+
+void decrement_request_rsc(
+    RSC_WORK_FETCH& rwf, double ninstances, double est_runtime
+) {
+    rwf.req_secs -= est_runtime * ninstances;
+    rwf.req_instances -= ninstances;
+}
+
+void decrement_request(RESULT* rp) {
+    APP_VERSION* avp = rp->avp;
+    double est_runtime = rp->wup->rsc_fpops_est/avp->flops;
+    decrement_request_rsc(cpu_work_fetch, avp->avg_ncpus, est_runtime);
+    decrement_request_rsc(cuda_work_fetch, avp->ncudas, est_runtime);
+    decrement_request_rsc(ati_work_fetch, avp->natis, est_runtime);
+}
+
 // simulate trying to do an RPC;
 // return true if we actually did one
 //
@ -193,6 +251,7 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
    static double last_time=-1e9;
    vector<IP_RESULT> ip_results;
    int infeasible_count = 0;
+    vector<RESULT*> new_results;

    double diff = now - last_time;
    if (diff && diff < connection_interval) {
@ -204,25 +263,32 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
    }
    last_time = now;

-    work_fetch.request_string(buf);
+    // save request params for WORK_FETCH::handle_reply
+    double save_cpu_req_secs = cpu_work_fetch.req_secs;
+    host_info.coprocs.cuda.req_secs = cuda_work_fetch.req_secs;
+    host_info.coprocs.ati.req_secs = ati_work_fetch.req_secs;
+
+
+    work_fetch.request_string(buf2);
    sprintf(buf, "RPC to %s: %s<br>", p->project_name, buf2);
    html_msg += buf;

    msg_printf(0, MSG_INFO, buf);

-    handle_completed_results();
+    handle_completed_results(p);

    if (server_uses_workload) {
        get_workload(ip_results);
    }

    bool sent_something = false;
-    double work_left = cpu_work_fetch.req_secs;
-    double instances_needed = cpu_work_fetch.req_instances;
-    while (work_left > 0 || instances_needed>0) {
+    while (1) {
+        vector<APP*> apps;
+        get_apps_needing_work(p, apps);
+        if (apps.empty()) break;
        RESULT* rp = new RESULT;
        WORKUNIT* wup = new WORKUNIT;
-        make_job(p, wup, rp);
+        make_job(p, wup, rp, apps);

        if (server_uses_workload) {
            IP_RESULT c(rp->name, rp->report_deadline, rp->final_cpu_time);
@ -241,17 +307,18 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
        sent_something = true;
        rp->set_state(RESULT_FILES_DOWNLOADED, "simulate_rpc");
        results.push_back(rp);
+        new_results.push_back(rp);
        sprintf(buf, "got job %s: CPU time %.2f, deadline %s<br>",
            rp->name, rp->final_cpu_time, time_to_string(rp->report_deadline)
        );
        html_msg += buf;
-        work_left -= p->duration_correction_factor*wup->rsc_fpops_est/host_info.p_fpops;
-        instances_needed -= 1;
+        decrement_request(rp);
    }

-    if (cpu_work_fetch.req_secs > 0 && !sent_something) {
-        p->backoff();
-    }
+
+    SCHEDULER_REPLY sr;
+    cpu_work_fetch.req_secs = save_cpu_req_secs;
+    work_fetch.handle_reply(p, &sr, new_results);
    p->nrpc_failures = 0;
    if (sent_something) {
        request_schedule_cpus("simulate_rpc");
@ -260,6 +327,13 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
    return true;
 }

+SCHEDULER_REPLY::SCHEDULER_REPLY() {
+    cpu_backoff = 0;
+    cuda_backoff = 0;
+    ati_backoff = 0;
+}
+SCHEDULER_REPLY::~SCHEDULER_REPLY() {}
+
 void PROJECT::backoff() {
    nrpc_failures++;
    double backoff = calculate_exponential_backoff(
@ -619,14 +693,16 @@ void CLIENT_STATE::html_rec() {
        int n=0;
        for (unsigned int i=0; i<active_tasks.active_tasks.size(); i++) {
            ACTIVE_TASK* atp = active_tasks.active_tasks[i];
+            int np = atp->result->avp->avg_ncpus;
+            if (np < 1) np = 1;
            if (atp->task_state() == PROCESS_EXECUTING) {
                PROJECT* p = atp->result->project;
-                fprintf(html_out, "<td bgcolor=%s>%s%s: %.2f</td>",
-                    colors[p->index],
+                fprintf(html_out, "<td colspan=%d bgcolor=%s>%s%s: %.2f</td>",
+                    np, colors[p->index],
                    atp->result->rr_sim_misses_deadline?"*":"",
                    atp->result->name, atp->cpu_time_left
                );
-                n++;
+                n += np;
            }
        }
        while (n<ncpus) {
@ -819,8 +895,12 @@ void CLIENT_STATE::do_client_simulation() {
    add_platform("client simulator");
    parse_state_file();
    read_global_prefs();
+    cull_projects();
+    int j=0;
    for (unsigned int i=0; i<projects.size(); i++) {
-        projects[i]->index = i;
+        if (!projects[i]->dont_request_more_work) {
+            projects[i]->index = j++;
+        }
    }

    gstate.now = 86400;
--- a/client/sim.h
+++ b/client/sim.h
@ -47,6 +47,13 @@ struct PROJECT_RESULTS {
    double cpu_wasted;
    int nresults_met_deadline;
    int nresults_missed_deadline;
+
+    PROJECT_RESULTS() {
+        cpu_used = 0;
+        cpu_wasted = 0;
+        nresults_met_deadline = 0;
+        nresults_missed_deadline = 0;
+    }
 };

 struct NORMAL_DIST {
--- a/doc/sim_form.php
+++ b/doc/sim_form.php
@ -6,102 +6,22 @@ function show_form() {
    echo "
    <form action=sim_form.php method=post>

-    <b>sim_projects.xml</b> (describes a set of projects)
+    <b>client_state.xml</b>
    <br>
-    <textarea name=projects rows=10 cols=80><projects>
-    <project>
-        <project_name>P1</project_name>
-        <resource_share>100</resource_share>
-        <app>
-            <latency_bound>15000</latency_bound>
-            <fpops_est>3600</fpops_est>
-            <fpops>
-                <mean>3600</mean>
-                <stdev>10</stdev>
-            </fpops>
-            <working_set>1e7</working_set>
-        </app>
-        <available>
-            <frac>.7</frac>
-            <lambda>1000</lambda>
-        </available>
-    </project>
-    <project>
-        <project_name>P2</project_name>
-        <resource_share>50</resource_share>
-        <app>
-            <latency_bound>10000</latency_bound>
-            <fpops_est>1800</fpops_est>
-            <fpops>
-                <mean>1800</mean>
-                <stdev>10</stdev>
-            </fpops>
-            <working_set>1e7</working_set>
-        </app>
-        <available>
-            <frac>.7</frac>
-            <lambda>1000</lambda>
-        </available>
-    </project>
-</projects></textarea>
+    <textarea name=client_state rows=10 cols=80>
+</textarea>

    <p>
-    <b>sim_host.xml</b>: (describes the simulated host)
+    <b>global_prefs.xml:</b> (the host's preferences)
    <br>
-    <textarea name=host rows=10 cols=80><host>
-    <p_fpops>1</p_fpops>
-    <m_nbytes>1e9</m_nbytes>
-    <connection_interval>600</connection_interval>
-    <p_ncpus>2</p_ncpus>
-    <available>
-        <frac>.8</frac>
-        <lambda>1000</lambda>
-    </available>
-</host></textarea>
-
-    <p>
-    <b>sim_prefs.xml:</b> (the host's preferences)
-    <br>
-    <textarea name=prefs rows=10 cols=80><global_preferences>
-    <source_project>http://isaac.ssl.berkeley.edu/alpha/</source_project>
-    <source_scheduler>isaac.ssl.berkeley.edu/alpha_cgi/cgi</source_scheduler>
-<mod_time>1170192285</mod_time>
-<run_if_user_active/>
-<idle_time_to_run>3</idle_time_to_run>
-<leave_apps_in_memory/>
-<cpu_scheduling_period_minutes>1</cpu_scheduling_period_minutes>
-<hangup_if_dialed/>
-<work_buf_min_days>0.1</work_buf_min_days>
-<work_buf_additional_days>0</work_buf_additional_days>
-<max_cpus>4</max_cpus>
-<cpu_usage_limit>100</cpu_usage_limit>
-<disk_interval>180</disk_interval>
-<disk_max_used_gb>100</disk_max_used_gb>
-<disk_max_used_pct>50</disk_max_used_pct>
-<disk_min_free_gb>2</disk_min_free_gb>
-<vm_max_used_pct>75</vm_max_used_pct>
-<ram_max_used_busy_pct>50</ram_max_used_busy_pct>
-<ram_max_used_idle_pct>90</ram_max_used_idle_pct>
-<max_bytes_sec_down>200000</max_bytes_sec_down>
-<max_bytes_sec_up>200000</max_bytes_sec_up>
-</global_preferences></textarea>
+    <textarea name=global_prefs rows=10 cols=80>
+</textarea>

    <p>
    <b>cc_config.xml:</b> (the client configuration options)
    <br>
-    <textarea name=cc_config rows=10 cols=80><cc_config>
-    <log_flags>
-        <mem_usage_debug>0</mem_usage_debug>
-        <cpu_sched_debug>1</cpu_sched_debug>
-        <cpu_sched>1</cpu_sched>
-        <rr_simulation>0</rr_simulation>
-        <benchmark_debug>1</benchmark_debug>
-        <task_debug>1</task_debug>
-        <work_fetch_debug>1</work_fetch_debug>
-        <app_msg_send>0</app_msg_send>
-        <unparsed_xml/>
-    </log_flags>
-</cc_config></textarea>
+    <textarea name=cc_config rows=10 cols=80>
+</textarea>

    <p>
    <b>
@ -140,13 +60,14 @@ function show_form() {
 if ($_POST['submit']) {
    chdir("sim");

-    $x1 = file_put_contents("sim_projects.xml", $_POST['projects']);
-    $x2 = file_put_contents("sim_host.xml", $_POST['host']);
-    $x3 = file_put_contents("sim_prefs.xml", $_POST['prefs']);
-    $x4 = file_put_contents("cc_config.xml", $_POST['cc_config']);
-    if (!$x1 || !$x2 || !$x3 || !$x4) {
-        echo "Can't write files - check permissions\n";
-        exit();
+    if (!file_put_contents("client_state.xml", $_POST['client_state'])) {
+        echo "Can't write client_state.xml - check permissions\n"; exit();
+    }
+    if (!file_put_contents("global_prefs.xml", $_POST['global_prefs'])) {
+        echo "Can't write global_prefs.xml - check permissions\n"; exit();
+    }
+    if (!file_put_contents("cc_config.xml", $_POST['cc_config'])) {
+        echo "Can't write cc_config.xml - check permissions\n"; exit();
    }
    $duration = $_POST['duration'];

--- a/sched/validator.cpp
+++ b/sched/validator.cpp
@ -373,7 +373,8 @@ int handle_wu(
                        "[WU#%d %s] assign_credit_set() returned %d\n",
                        wu.id, wu.name, retval
                    );
-                    return retval;
+                    transition_time = DELAYED;
+                    goto leave;
                }
            }

@ -530,6 +531,7 @@ int handle_wu(
        }
    }

+leave:
    --log_messages;

    switch (transition_time) {