From 794214208f6279c43bc094a647440234e1fd4d09 Mon Sep 17 00:00:00 2001
From: David Anderson <davea@ssl.berkeley.edu>
Date: Tue, 28 Sep 2010 20:17:09 +0000
Subject: [PATCH] - validator: if credit calculation returns an error, wait
 6 hours before retrying

svn path=/trunk/boinc/; revision=22418
---
 checkin_notes           |  15 +++-
 client/client_state.h   |   3 +-
 client/cs_statefile.cpp |   8 ++
 client/sim.cpp          | 176 +++++++++++++++++++++++++++++-----------
 client/sim.h            |   7 ++
 doc/sim_form.php        | 111 ++++---------------------
 sched/validator.cpp     |   4 +-
 7 files changed, 177 insertions(+), 147 deletions(-)

diff --git a/checkin_notes b/checkin_notes
index 73a855567e..455579a0e4 100644
--- a/checkin_notes
+++ b/checkin_notes
@@ -1,4 +1,3 @@
-
 David  3 Jan 2010
     - client: ATI available RAM wasn't being detected correctly
 
@@ -6848,6 +6847,17 @@ David  27 Sept 2010
         cs_statefile.cpp
         scheduler_op.cpp
         time_stats.cpp
+
+David  28 Sept 2010
+    - validator: if credit calculation returns an error,
+        wait 6 hours before retrying
+
+    sched/
+        validator.cpp
+    client/
+        sim.cpp,h
+        client_state.h
+        cs_statefile.cpp
 
 Rom    28 Sept 2010
     - MGR: Make the new BOINC skin the default skin.
diff --git a/client/client_state.h b/client/client_state.h
index 3f71756919..72ddd4c299 100644
--- a/client/client_state.h
+++ b/client/client_state.h
@@ -467,8 +467,7 @@ struct CLIENT_STATE {
     double monotony();
 
     void do_client_simulation();
-    void make_job(PROJECT*, WORKUNIT*, RESULT*);
-    void handle_completed_results();
+    void handle_completed_results(PROJECT*);
     void get_workload(vector<IP_RESULT>&);
     void simulate();
     bool simulate_rpc(PROJECT*);
diff --git a/client/cs_statefile.cpp b/client/cs_statefile.cpp
index 80d19abfa5..c4bed05040 100644
--- a/client/cs_statefile.cpp
+++ b/client/cs_statefile.cpp
@@ -388,7 +388,12 @@ int CLIENT_STATE::parse_state_file() {
             continue;
         }
         if (match_tag(buf, "<host_info>")) {
+            // simulator (SIM) builds pass false as the 2nd arg; the client passes true
+#ifdef SIM
+            retval = host_info.parse(mf, false);
+#else
             retval = host_info.parse(mf, true);
+#endif
             if (retval) {
                 msg_printf(NULL, MSG_INTERNAL_ERROR,
                     "Can't parse host info in state file"
diff --git a/client/sim.cpp b/client/sim.cpp
index 30766b11fd..c2c96dbe5f 100644
--- a/client/sim.cpp
+++ b/client/sim.cpp
@@ -88,37 +88,70 @@ void PROJECT::update_dcf_stats(RESULT* rp) {
     return;
 }
 
+APP* choose_app(vector<APP*>& apps) {
+    double x = drand();
+    double sum = 0;
+    unsigned int i;
+
+    for (i=0; i<apps.size(); i++) {
+        sum += apps[i]->weight;
+    }
+    for (i=0; i<apps.size(); i++) {
+        APP* app = apps[i];
+        x -= app->weight/sum;
+        if (x <= 0) {
+            return app;
+        }
+    }
+    return apps.back();
+}
+
+bool app_version_needs_work(APP_VERSION* avp) {
+    if (avp->ncudas) {
+        return (cuda_work_fetch.req_secs>0 || cuda_work_fetch.req_instances>0);
+    }
+    if (avp->natis) {
+        return (ati_work_fetch.req_secs>0 || ati_work_fetch.req_instances>0);
+    }
+    return (cpu_work_fetch.req_secs>0 || cpu_work_fetch.req_instances>0);
+}
+
+bool has_app_version_needing_work(APP* app) {
+    for (unsigned int i=0; i<gstate.app_versions.size(); i++) {
+        APP_VERSION* avp = gstate.app_versions[i];
+        if (avp->app != app) continue;
+        if (app_version_needs_work(avp)) return true;
+    }
+    return false;
+}
+
+// choose the highest-flops version of this app that needs work (NULL if none)
+//
+APP_VERSION* choose_app_version(APP* app) {
+    APP_VERSION* best_avp = NULL;
+    for (unsigned int i=0; i<gstate.app_versions.size(); i++) {
+        APP_VERSION* avp = gstate.app_versions[i];
+        if (avp->app != app) continue;
+        if (!app_version_needs_work(avp)) continue;
+        if (!best_avp) {
+            best_avp = avp;
+        } else if (avp->flops > best_avp->flops) {
+            best_avp = avp;
+        }
+    }
+    return best_avp;
+}
+
 // generate a job; pick a random app for this project,
 // and pick a FLOP count from its distribution
 //
-void CLIENT_STATE::make_job(PROJECT* p, WORKUNIT* wup, RESULT* rp) {
-    APP* ap1, *ap=0;
-    double net_fpops = host_info.p_fpops;
-    double x = drand();
-    unsigned int i;
-
-    for (i=0; i<apps.size();i++) {
-        ap1 = apps[i];
-        if (ap1->project != p) continue;
-        x -= ap1->weight;
-        if (x <= 0) {
-            ap = ap1;
-            break;
-        }
-    }
-    if (!ap) {
-        printf("ERROR - NO APP\n");
-        exit(1);
-    }
+void make_job(
+    PROJECT* p, WORKUNIT* wup, RESULT* rp, vector<APP*>app_list
+) {
+    APP* app = choose_app(app_list);
+    APP_VERSION* avp = choose_app_version(app);
     rp->clear();
-    rp->avp = 0;
-    for (i=0; i<gstate.app_versions.size(); i++) {
-        APP_VERSION* avp = gstate.app_versions[i];
-        if (avp->app == ap) {
-            rp->avp = avp;
-            break;
-        }
-    }
+    rp->avp = avp;
     if (!rp->avp) {
         printf("ERROR - NO APP VERSION\n");
         exit(1);
@@ -127,23 +160,23 @@ void CLIENT_STATE::make_job(PROJECT* p, WORKUNIT* wup, RESULT* rp) {
     rp->wup = wup;
     sprintf(rp->name, "%s_%d", p->project_name, p->result_index++);
     wup->project = p;
-    wup->rsc_fpops_est = ap->fpops_est;
-    double ops = ap->fpops.sample();
+    wup->rsc_fpops_est = app->fpops_est;
+    double ops = app->fpops.sample();
     if (ops < 0) ops = 0;
-    rp->final_cpu_time = ops/net_fpops;
-    rp->report_deadline = now + ap->latency_bound;
+    rp->final_cpu_time = ops/avp->flops;
+    rp->report_deadline = gstate.now + app->latency_bound;
 }
 
 // process ready-to-report results
 //
-void CLIENT_STATE::handle_completed_results() {
+void CLIENT_STATE::handle_completed_results(PROJECT* p) {
     char buf[256];
     vector<RESULT*>::iterator result_iter;
 
     result_iter = results.begin();
     while (result_iter != results.end()) {
         RESULT* rp = *result_iter;
-        if (rp->ready_to_report) {
+        if (rp->project == p && rp->ready_to_report) {
             sprintf(buf, "result %s reported; %s<br>",
                 rp->name,
                 (gstate.now > rp->report_deadline)?
@@ -185,6 +218,31 @@ void CLIENT_STATE::get_workload(vector<IP_RESULT>& ip_results) {
     init_ip_results(work_buf_min(), ncpus, ip_results);
 }
 
+void get_apps_needing_work(PROJECT* p, vector<APP*>& apps) {
+    apps.clear();
+    for (unsigned int i=0; i<gstate.apps.size(); i++) {
+        APP* app = gstate.apps[i];
+        if (app->project != p) continue;
+        if (!has_app_version_needing_work(app)) continue;
+        apps.push_back(app);
+    }
+}
+
+void decrement_request_rsc(
+    RSC_WORK_FETCH& rwf, double ninstances, double est_runtime
+) {
+    rwf.req_secs -= est_runtime * ninstances;
+    rwf.req_instances -= ninstances;
+}
+
+void decrement_request(RESULT* rp) {
+    APP_VERSION* avp = rp->avp;
+    double est_runtime = rp->wup->rsc_fpops_est/avp->flops;
+    decrement_request_rsc(cpu_work_fetch, avp->avg_ncpus, est_runtime);
+    decrement_request_rsc(cuda_work_fetch, avp->ncudas, est_runtime);
+    decrement_request_rsc(ati_work_fetch, avp->natis, est_runtime);
+}
+
 // simulate trying to do an RPC;
 // return true if we actually did one
 //
@@ -193,6 +251,7 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
     static double last_time=-1e9;
     vector<IP_RESULT> ip_results;
     int infeasible_count = 0;
+    vector<RESULT*> new_results;
 
     double diff = now - last_time;
     if (diff && diff < connection_interval) {
@@ -204,25 +263,32 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
     }
     last_time = now;
 
-    work_fetch.request_string(buf);
+    // save request params for WORK_FETCH::handle_reply
+    double save_cpu_req_secs = cpu_work_fetch.req_secs;
+    host_info.coprocs.cuda.req_secs = cuda_work_fetch.req_secs;
+    host_info.coprocs.ati.req_secs = ati_work_fetch.req_secs;
+
+
+    work_fetch.request_string(buf2);
     sprintf(buf, "RPC to %s: %s<br>", p->project_name, buf2);
     html_msg += buf;
 
     msg_printf(0, MSG_INFO, buf);
 
-    handle_completed_results();
+    handle_completed_results(p);
 
     if (server_uses_workload) {
         get_workload(ip_results);
     }
 
     bool sent_something = false;
-    double work_left = cpu_work_fetch.req_secs;
-    double instances_needed = cpu_work_fetch.req_instances;
-    while (work_left > 0 || instances_needed>0) {
+    while (1) {
+        vector<APP*> apps;
+        get_apps_needing_work(p, apps);
+        if (apps.empty()) break;
         RESULT* rp = new RESULT;
         WORKUNIT* wup = new WORKUNIT;
-        make_job(p, wup, rp);
+        make_job(p, wup, rp, apps);
 
         if (server_uses_workload) {
             IP_RESULT c(rp->name, rp->report_deadline, rp->final_cpu_time);
@@ -241,17 +307,18 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
         sent_something = true;
         rp->set_state(RESULT_FILES_DOWNLOADED, "simulate_rpc");
         results.push_back(rp);
+        new_results.push_back(rp);
         sprintf(buf, "got job %s: CPU time %.2f, deadline %s<br>",
             rp->name, rp->final_cpu_time, time_to_string(rp->report_deadline)
         );
         html_msg += buf;
-        work_left -= p->duration_correction_factor*wup->rsc_fpops_est/host_info.p_fpops;
-        instances_needed -= 1;
+        decrement_request(rp);
     }
 
-    if (cpu_work_fetch.req_secs > 0 && !sent_something) {
-        p->backoff();
-    }
+
+    SCHEDULER_REPLY sr;
+    cpu_work_fetch.req_secs = save_cpu_req_secs;
+    work_fetch.handle_reply(p, &sr, new_results);
     p->nrpc_failures = 0;
     if (sent_something) {
         request_schedule_cpus("simulate_rpc");
@@ -260,6 +327,13 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
     return true;
 }
 
+SCHEDULER_REPLY::SCHEDULER_REPLY() {
+    cpu_backoff = 0;
+    cuda_backoff = 0;
+    ati_backoff = 0;
+}
+SCHEDULER_REPLY::~SCHEDULER_REPLY() {}
+
 void PROJECT::backoff() {
     nrpc_failures++;
     double backoff = calculate_exponential_backoff(
@@ -619,14 +693,16 @@ void CLIENT_STATE::html_rec() {
         int n=0;
         for (unsigned int i=0; i<active_tasks.active_tasks.size(); i++) {
             ACTIVE_TASK* atp = active_tasks.active_tasks[i];
+            int np = atp->result->avp->avg_ncpus;
+            if (np < 1) np = 1;
             if (atp->task_state() == PROCESS_EXECUTING) {
                 PROJECT* p = atp->result->project;
-                fprintf(html_out, "<td bgcolor=%s>%s%s: %.2f</td>",
-                    colors[p->index],
+                fprintf(html_out, "<td colspan=%d bgcolor=%s>%s%s: %.2f</td>",
+                    np, colors[p->index],
                     atp->result->rr_sim_misses_deadline?"*":"",
                     atp->result->name, atp->cpu_time_left
                 );
-                n++;
+                n += np;
             }
         }
         while (n<ncpus) {
@@ -819,8 +895,12 @@ void CLIENT_STATE::do_client_simulation() {
     add_platform("client simulator");
     parse_state_file();
     read_global_prefs();
+    cull_projects();
+    int j=0;
     for (unsigned int i=0; i<projects.size(); i++) {
-        projects[i]->index = i;
+        if (!projects[i]->dont_request_more_work) {
+            projects[i]->index = j++;
+        }
     }
 
     gstate.now = 86400;
diff --git a/client/sim.h b/client/sim.h
index fa0d43647e..0bb583b0aa 100644
--- a/client/sim.h
+++ b/client/sim.h
@@ -47,6 +47,13 @@ struct PROJECT_RESULTS {
     double cpu_wasted;
     int nresults_met_deadline;
     int nresults_missed_deadline;
+
+    PROJECT_RESULTS() {
+        cpu_used = 0;
+        cpu_wasted = 0;
+        nresults_met_deadline = 0;
+        nresults_missed_deadline = 0;
+    }
 };
 
 struct NORMAL_DIST {
diff --git a/doc/sim_form.php b/doc/sim_form.php
index 38e862024a..1d767be569 100644
--- a/doc/sim_form.php
+++ b/doc/sim_form.php
@@ -6,102 +6,22 @@ function show_form() {
     echo "
     <form action=sim_form.php method=post>
 
-    <b>sim_projects.xml</b> (describes a set of projects)
+    <b>client_state.xml</b>
     <br>
-    <textarea name=projects rows=10 cols=80><projects>
-    <project>
-        <project_name>P1</project_name>
-        <resource_share>100</resource_share>
-        <app>
-            <latency_bound>15000</latency_bound>
-            <fpops_est>3600</fpops_est>
-            <fpops>
-                <mean>3600</mean>
-                <stdev>10</stdev>
-            </fpops>
-            <working_set>1e7</working_set>
-        </app>
-        <available>
-            <frac>.7</frac>
-            <lambda>1000</lambda>
-        </available>
-    </project>
-    <project>
-        <project_name>P2</project_name>
-        <resource_share>50</resource_share>
-        <app>
-            <latency_bound>10000</latency_bound>
-            <fpops_est>1800</fpops_est>
-            <fpops>
-                <mean>1800</mean>
-                <stdev>10</stdev>
-            </fpops>
-            <working_set>1e7</working_set>
-        </app>
-        <available>
-            <frac>.7</frac>
-            <lambda>1000</lambda>
-        </available>
-    </project>
-</projects></textarea>
+    <textarea name=client_state rows=10 cols=80>
+</textarea>
 
     <p>
-    <b>sim_host.xml</b>: (describes the simulated host)
+    <b>global_prefs.xml:</b> (the host's preferences)
     <br>
-    <textarea name=host rows=10 cols=80><host>
-    <p_fpops>1</p_fpops>
-    <m_nbytes>1e9</m_nbytes>
-    <connection_interval>600</connection_interval>
-    <p_ncpus>2</p_ncpus>
-    <available>
-        <frac>.8</frac>
-        <lambda>1000</lambda>
-    </available>
-</host></textarea>
-
-    <p>
-    <b>sim_prefs.xml:</b> (the host's preferences)
-    <br>
-    <textarea name=prefs rows=10 cols=80><global_preferences>
-    <source_project>http://isaac.ssl.berkeley.edu/alpha/</source_project>
-    <source_scheduler>isaac.ssl.berkeley.edu/alpha_cgi/cgi</source_scheduler>
-<mod_time>1170192285</mod_time>
-<run_if_user_active/>
-<idle_time_to_run>3</idle_time_to_run>
-<leave_apps_in_memory/>
-<cpu_scheduling_period_minutes>1</cpu_scheduling_period_minutes>
-<hangup_if_dialed/>
-<work_buf_min_days>0.1</work_buf_min_days>
-<work_buf_additional_days>0</work_buf_additional_days>
-<max_cpus>4</max_cpus>
-<cpu_usage_limit>100</cpu_usage_limit>
-<disk_interval>180</disk_interval>
-<disk_max_used_gb>100</disk_max_used_gb>
-<disk_max_used_pct>50</disk_max_used_pct>
-<disk_min_free_gb>2</disk_min_free_gb>
-<vm_max_used_pct>75</vm_max_used_pct>
-<ram_max_used_busy_pct>50</ram_max_used_busy_pct>
-<ram_max_used_idle_pct>90</ram_max_used_idle_pct>
-<max_bytes_sec_down>200000</max_bytes_sec_down>
-<max_bytes_sec_up>200000</max_bytes_sec_up>
-</global_preferences></textarea>
+    <textarea name=global_prefs rows=10 cols=80>
+</textarea>
 
     <p>
     <b>cc_config.xml:</b> (the client configuration options)
     <br>
-    <textarea name=cc_config rows=10 cols=80><cc_config>
-    <log_flags>
-        <mem_usage_debug>0</mem_usage_debug>
-        <cpu_sched_debug>1</cpu_sched_debug>
-        <cpu_sched>1</cpu_sched>
-        <rr_simulation>0</rr_simulation>
-        <benchmark_debug>1</benchmark_debug>
-        <task_debug>1</task_debug>
-        <work_fetch_debug>1</work_fetch_debug>
-        <app_msg_send>0</app_msg_send>
-        <unparsed_xml/>
-    </log_flags>
-</cc_config></textarea>
+    <textarea name=cc_config rows=10 cols=80>
+</textarea>
 
     <p>
     <b>
@@ -140,13 +60,16 @@ function show_form() {
 if ($_POST['submit']) {
     chdir("sim");
 
-    $x1 = file_put_contents("sim_projects.xml", $_POST['projects']);
-    $x2 = file_put_contents("sim_host.xml", $_POST['host']);
-    $x3 = file_put_contents("sim_prefs.xml", $_POST['prefs']);
-    $x4 = file_put_contents("cc_config.xml", $_POST['cc_config']);
-    if (!$x1 || !$x2 || !$x3 || !$x4) {
-        echo "Can't write files - check permissions\n";
-        exit();
+    // file_put_contents() returns the number of bytes written (0 for an empty
+    // field) or false on failure, so test for false explicitly.
+    if (file_put_contents("client_state.xml", $_POST['client_state']) === false) {
+        echo "Can't write client_state.xml - check permissions\n"; exit();
+    }
+    if (file_put_contents("global_prefs.xml", $_POST['global_prefs']) === false) {
+        echo "Can't write global_prefs.xml - check permissions\n"; exit();
+    }
+    if (file_put_contents("cc_config.xml", $_POST['cc_config']) === false) {
+        echo "Can't write cc_config.xml - check permissions\n"; exit();
     }
     $duration = $_POST['duration'];
 
diff --git a/sched/validator.cpp b/sched/validator.cpp
index 2dc835c16c..51e513eb3d 100644
--- a/sched/validator.cpp
+++ b/sched/validator.cpp
@@ -373,7 +373,8 @@ int handle_wu(
                         "[WU#%d %s] assign_credit_set() returned %d\n",
                         wu.id, wu.name, retval
                     );
-                    return retval;
+                    transition_time = DELAYED;
+                    goto leave;
                 }
             }
 
@@ -530,6 +531,7 @@ int handle_wu(
         }
     }
 
+leave:
     --log_messages;
 
     switch (transition_time) {