From 794214208f6279c43bc094a647440234e1fd4d09 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 28 Sep 2010 20:17:09 +0000 Subject: [PATCH] - validator: if credit calculation returns an error, wait 6 hours before retrying svn path=/trunk/boinc/; revision=22418 --- checkin_notes | 15 +++- client/client_state.h | 3 +- client/cs_statefile.cpp | 8 ++ client/sim.cpp | 176 +++++++++++++++++++++++++++++----------- client/sim.h | 7 ++ doc/sim_form.php | 111 ++++--------------------- sched/validator.cpp | 4 +- 7 files changed, 177 insertions(+), 147 deletions(-) diff --git a/checkin_notes b/checkin_notes index 73a855567e..455579a0e4 100644 --- a/checkin_notes +++ b/checkin_notes @@ -1,4 +1,3 @@ - David 3 Jan 2010 - client: ATI available RAM wasn't being detected correctly @@ -6848,6 +6847,19 @@ David 27 Sept 2010 cs_statefile.cpp scheduler_op.cpp time_stats.cpp +<<<<<<< .mine + +David 28 Sept 2010 + - validator: if credit calculation returns an error, + wait 6 hours before retrying + + sched/ + validator.cpp + client/ + sim.cpp,h + client_state.h + cs_statefile.cpp +======= Rom 28 Sept 2010 - MGR: Make the new BOINC skin the default skin. @@ -6871,3 +6883,4 @@ Rom 28 Sept 2010 / configure.ac version.h +>>>>>>> .r22417 diff --git a/client/client_state.h b/client/client_state.h index 3f71756919..72ddd4c299 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -467,8 +467,7 @@ struct CLIENT_STATE { double monotony(); void do_client_simulation(); - void make_job(PROJECT*, WORKUNIT*, RESULT*); - void handle_completed_results(); + void handle_completed_results(PROJECT*); void get_workload(vector&); void simulate(); bool simulate_rpc(PROJECT*); diff --git a/client/cs_statefile.cpp b/client/cs_statefile.cpp index 80d19abfa5..c4bed05040 100644 --- a/client/cs_statefile.cpp +++ b/client/cs_statefile.cpp @@ -388,7 +388,15 @@ int CLIENT_STATE::parse_state_file() { continue; } if (match_tag(buf, "")) { + FILE* f2 = fopen("temp", "w"); +#ifdef SIM + fprintf(f2, "FOOBAR\n"); + retval = host_info.parse(mf, false); +#else + fprintf(f2, "BLAH\n"); retval = host_info.parse(mf, true); +#endif + fclose(f2); if (retval) { msg_printf(NULL, MSG_INTERNAL_ERROR, "Can't parse host info in state file" diff --git a/client/sim.cpp b/client/sim.cpp index 30766b11fd..c2c96dbe5f 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -88,37 +88,70 @@ void PROJECT::update_dcf_stats(RESULT* rp) { return; } +APP* choose_app(vector& apps) { + double x = drand(); + double sum = 0; + unsigned int i; + + for (i=0; iweight; + } + for (i=0; iweight/sum; + if (x <= 0) { + return app; + } + } + return apps.back(); +} + +bool app_version_needs_work(APP_VERSION* avp) { + if (avp->ncudas) { + return (cuda_work_fetch.req_secs>0 || cuda_work_fetch.req_instances>0); + } + if (avp->natis) { + return (ati_work_fetch.req_secs>0 || ati_work_fetch.req_instances>0); + } + return (cpu_work_fetch.req_secs>0 || cpu_work_fetch.req_instances>0); +} + +bool has_app_version_needing_work(APP* app) { + for (unsigned int i=0; iapp != app) continue; + if (app_version_needs_work(avp)) return true; + } + return false; +} + +// choose a version for this app for which we need work +// +APP_VERSION* choose_app_version(APP* app) { + APP_VERSION* best_avp = NULL; + for (unsigned int i=0; iapp != app) continue; + if (!app_version_needs_work(avp)) continue; + if (!best_avp) { + best_avp = avp; + } else if (avp->flops > best_avp->flops) { + best_avp = avp; + } + } + return best_avp; +} + // generate a job; pick a random app for this project, // and pick a FLOP count from its distribution // -void CLIENT_STATE::make_job(PROJECT* p, WORKUNIT* wup, RESULT* rp) { - APP* ap1, *ap=0; - double net_fpops = host_info.p_fpops; - double x = drand(); - unsigned int i; - - for (i=0; iproject != p) continue; - x -= ap1->weight; - if (x <= 0) { - ap = ap1; - break; - } - } - if (!ap) { - printf("ERROR - NO APP\n"); - exit(1); - } +void make_job( + PROJECT* p, WORKUNIT* wup, RESULT* rp, vectorapp_list +) { + APP* app = choose_app(app_list); + APP_VERSION* avp = choose_app_version(app); rp->clear(); - rp->avp = 0; - for (i=0; iapp == ap) { - rp->avp = avp; - break; - } - } + rp->avp = avp; if (!rp->avp) { printf("ERROR - NO APP VERSION\n"); exit(1); @@ -127,23 +160,23 @@ void CLIENT_STATE::make_job(PROJECT* p, WORKUNIT* wup, RESULT* rp) { rp->wup = wup; sprintf(rp->name, "%s_%d", p->project_name, p->result_index++); wup->project = p; - wup->rsc_fpops_est = ap->fpops_est; - double ops = ap->fpops.sample(); + wup->rsc_fpops_est = app->fpops_est; + double ops = app->fpops.sample(); if (ops < 0) ops = 0; - rp->final_cpu_time = ops/net_fpops; - rp->report_deadline = now + ap->latency_bound; + rp->final_cpu_time = ops/avp->flops; + rp->report_deadline = gstate.now + app->latency_bound; } // process ready-to-report results // -void CLIENT_STATE::handle_completed_results() { +void CLIENT_STATE::handle_completed_results(PROJECT* p) { char buf[256]; vector::iterator result_iter; result_iter = results.begin(); while (result_iter != results.end()) { RESULT* rp = *result_iter; - if (rp->ready_to_report) { + if (rp->project == p && rp->ready_to_report) { sprintf(buf, "result %s reported; %s
", rp->name, (gstate.now > rp->report_deadline)? @@ -185,6 +218,31 @@ void CLIENT_STATE::get_workload(vector& ip_results) { init_ip_results(work_buf_min(), ncpus, ip_results); } +void get_apps_needing_work(PROJECT* p, vector& apps) { + apps.clear(); + for (unsigned int i=0; iproject != p) continue; + if (!has_app_version_needing_work(app)) continue; + apps.push_back(app); + } +} + +void decrement_request_rsc( + RSC_WORK_FETCH& rwf, double ninstances, double est_runtime +) { + rwf.req_secs -= est_runtime * ninstances; + rwf.req_instances -= ninstances; +} + +void decrement_request(RESULT* rp) { + APP_VERSION* avp = rp->avp; + double est_runtime = rp->wup->rsc_fpops_est/avp->flops; + decrement_request_rsc(cpu_work_fetch, avp->avg_ncpus, est_runtime); + decrement_request_rsc(cuda_work_fetch, avp->ncudas, est_runtime); + decrement_request_rsc(ati_work_fetch, avp->natis, est_runtime); +} + // simulate trying to do an RPC; // return true if we actually did one // @@ -193,6 +251,7 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) { static double last_time=-1e9; vector ip_results; int infeasible_count = 0; + vector new_results; double diff = now - last_time; if (diff && diff < connection_interval) { @@ -204,25 +263,32 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) { } last_time = now; - work_fetch.request_string(buf); + // save request params for WORK_FETCH::handle_reply + double save_cpu_req_secs = cpu_work_fetch.req_secs; + host_info.coprocs.cuda.req_secs = cuda_work_fetch.req_secs; + host_info.coprocs.ati.req_secs = ati_work_fetch.req_secs; + + + work_fetch.request_string(buf2); sprintf(buf, "RPC to %s: %s
", p->project_name, buf2); html_msg += buf; msg_printf(0, MSG_INFO, buf); - handle_completed_results(); + handle_completed_results(p); if (server_uses_workload) { get_workload(ip_results); } bool sent_something = false; - double work_left = cpu_work_fetch.req_secs; - double instances_needed = cpu_work_fetch.req_instances; - while (work_left > 0 || instances_needed>0) { + while (1) { + vector apps; + get_apps_needing_work(p, apps); + if (apps.empty()) break; RESULT* rp = new RESULT; WORKUNIT* wup = new WORKUNIT; - make_job(p, wup, rp); + make_job(p, wup, rp, apps); if (server_uses_workload) { IP_RESULT c(rp->name, rp->report_deadline, rp->final_cpu_time); @@ -241,17 +307,18 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) { sent_something = true; rp->set_state(RESULT_FILES_DOWNLOADED, "simulate_rpc"); results.push_back(rp); + new_results.push_back(rp); sprintf(buf, "got job %s: CPU time %.2f, deadline %s
", rp->name, rp->final_cpu_time, time_to_string(rp->report_deadline) ); html_msg += buf; - work_left -= p->duration_correction_factor*wup->rsc_fpops_est/host_info.p_fpops; - instances_needed -= 1; + decrement_request(rp); } - if (cpu_work_fetch.req_secs > 0 && !sent_something) { - p->backoff(); - } + + SCHEDULER_REPLY sr; + cpu_work_fetch.req_secs = save_cpu_req_secs; + work_fetch.handle_reply(p, &sr, new_results); p->nrpc_failures = 0; if (sent_something) { request_schedule_cpus("simulate_rpc"); @@ -260,6 +327,13 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) { return true; } +SCHEDULER_REPLY::SCHEDULER_REPLY() { + cpu_backoff = 0; + cuda_backoff = 0; + ati_backoff = 0; +} +SCHEDULER_REPLY::~SCHEDULER_REPLY() {} + void PROJECT::backoff() { nrpc_failures++; double backoff = calculate_exponential_backoff( @@ -619,14 +693,16 @@ void CLIENT_STATE::html_rec() { int n=0; for (unsigned int i=0; iresult->avp->avg_ncpus; + if (np < 1) np = 1; if (atp->task_state() == PROCESS_EXECUTING) { PROJECT* p = atp->result->project; - fprintf(html_out, "%s%s: %.2f", - colors[p->index], + fprintf(html_out, "%s%s: %.2f", + np, colors[p->index], atp->result->rr_sim_misses_deadline?"*":"", atp->result->name, atp->cpu_time_left ); - n++; + n += np; } } while (nindex = i; + if (!projects[i]->dont_request_more_work) { + projects[i]->index = j++; + } } gstate.now = 86400; diff --git a/client/sim.h b/client/sim.h index fa0d43647e..0bb583b0aa 100644 --- a/client/sim.h +++ b/client/sim.h @@ -47,6 +47,13 @@ struct PROJECT_RESULTS { double cpu_wasted; int nresults_met_deadline; int nresults_missed_deadline; + + PROJECT_RESULTS() { + cpu_used = 0; + cpu_wasted = 0; + nresults_met_deadline = 0; + nresults_missed_deadline = 0; + } }; struct NORMAL_DIST { diff --git a/doc/sim_form.php b/doc/sim_form.php index 38e862024a..1d767be569 100644 --- a/doc/sim_form.php +++ b/doc/sim_form.php @@ -6,102 +6,22 @@ function show_form() { echo "
- sim_projects.xml (describes a set of projects) + client_state.xml
- +

- sim_host.xml: (describes the simulated host) + global_prefs.xml: (the host's preferences)
- - -

- sim_prefs.xml: (the host's preferences) -
- +

cc_config.xml: (the client configuration options)
- +

@@ -140,13 +60,14 @@ function show_form() { if ($_POST['submit']) { chdir("sim"); - $x1 = file_put_contents("sim_projects.xml", $_POST['projects']); - $x2 = file_put_contents("sim_host.xml", $_POST['host']); - $x3 = file_put_contents("sim_prefs.xml", $_POST['prefs']); - $x4 = file_put_contents("cc_config.xml", $_POST['cc_config']); - if (!$x1 || !$x2 || !$x3 || !$x4) { - echo "Can't write files - check permissions\n"; - exit(); + if (!file_put_contents("client_state.xml", $_POST['client_state'])) { + echo "Can't write client_state.xml - check permissions\n"; exit(); + } + if (!file_put_contents("global_prefs.xml", $_POST['global_prefs'])) { + echo "Can't write global_prefs.xml - check permissions\n"; exit(); + } + if (!file_put_contents("cc_config.xml", $_POST['cc_config'])) { + echo "Can't write cc_config.xml - check permissions\n"; exit(); } $duration = $_POST['duration']; diff --git a/sched/validator.cpp b/sched/validator.cpp index 2dc835c16c..51e513eb3d 100644 --- a/sched/validator.cpp +++ b/sched/validator.cpp @@ -373,7 +373,8 @@ int handle_wu( "[WU#%d %s] assign_credit_set() returned %d\n", wu.id, wu.name, retval ); - return retval; + transition_time = DELAYED; + goto leave; } } @@ -530,6 +531,7 @@ int handle_wu( } } +leave: --log_messages; switch (transition_time) {