mirror of https://github.com/BOINC/boinc.git
- validator: if credit calculation returns an error,
wait 6 hours before retrying svn path=/trunk/boinc/; revision=22418
This commit is contained in:
parent
02da1b01c0
commit
794214208f
|
@ -1,4 +1,3 @@
|
|||
|
||||
David 3 Jan 2010
|
||||
- client: ATI available RAM wasn't being detected correctly
|
||||
|
||||
|
@ -6848,6 +6847,19 @@ David 27 Sept 2010
|
|||
cs_statefile.cpp
|
||||
scheduler_op.cpp
|
||||
time_stats.cpp
|
||||
<<<<<<< .mine
|
||||
|
||||
David 28 Sept 2010
|
||||
- validator: if credit calculation returns an error,
|
||||
wait 6 hours before retrying
|
||||
|
||||
sched/
|
||||
validator.cpp
|
||||
client/
|
||||
sim.cpp,h
|
||||
client_state.h
|
||||
cs_statefile.cpp
|
||||
=======
|
||||
|
||||
Rom 28 Sept 2010
|
||||
- MGR: Make the new BOINC skin the default skin.
|
||||
|
@ -6871,3 +6883,4 @@ Rom 28 Sept 2010
|
|||
/
|
||||
configure.ac
|
||||
version.h
|
||||
>>>>>>> .r22417
|
||||
|
|
|
@ -467,8 +467,7 @@ struct CLIENT_STATE {
|
|||
double monotony();
|
||||
|
||||
void do_client_simulation();
|
||||
void make_job(PROJECT*, WORKUNIT*, RESULT*);
|
||||
void handle_completed_results();
|
||||
void handle_completed_results(PROJECT*);
|
||||
void get_workload(vector<IP_RESULT>&);
|
||||
void simulate();
|
||||
bool simulate_rpc(PROJECT*);
|
||||
|
|
|
@ -388,7 +388,15 @@ int CLIENT_STATE::parse_state_file() {
|
|||
continue;
|
||||
}
|
||||
if (match_tag(buf, "<host_info>")) {
|
||||
FILE* f2 = fopen("temp", "w");
|
||||
#ifdef SIM
|
||||
fprintf(f2, "FOOBAR\n");
|
||||
retval = host_info.parse(mf, false);
|
||||
#else
|
||||
fprintf(f2, "BLAH\n");
|
||||
retval = host_info.parse(mf, true);
|
||||
#endif
|
||||
fclose(f2);
|
||||
if (retval) {
|
||||
msg_printf(NULL, MSG_INTERNAL_ERROR,
|
||||
"Can't parse host info in state file"
|
||||
|
|
176
client/sim.cpp
176
client/sim.cpp
|
@ -88,37 +88,70 @@ void PROJECT::update_dcf_stats(RESULT* rp) {
|
|||
return;
|
||||
}
|
||||
|
||||
// Pick one app at random from "apps",
// with probability proportional to each app's weight.
//
// Returns NULL if "apps" is empty.
// If the weights do not sum to a positive value,
// returns the last app in the list.
//
APP* choose_app(vector<APP*>& apps) {
    double x = drand();
    double sum = 0;
    unsigned int i;

    // back() on an empty vector is undefined behavior
    //
    if (apps.empty()) return NULL;

    for (i=0; i<apps.size(); i++) {
        sum += apps[i]->weight;
    }

    // avoid dividing by zero below;
    // with all-zero weights the division yields NaN,
    // so the loop never matched and the last app was returned anyway
    //
    if (sum <= 0) return apps.back();

    // walk the list, subtracting each app's normalized weight
    // until the random value is used up
    //
    for (i=0; i<apps.size(); i++) {
        APP* app = apps[i];
        x -= app->weight/sum;
        if (x <= 0) {
            return app;
        }
    }

    // reached only if roundoff (or x slightly above the total)
    // left a positive remainder
    //
    return apps.back();
}
|
||||
|
||||
// Return true if there is an outstanding work request
// (either seconds or instances) for the resource examined
// for this app version:
// the CUDA request if ncudas is nonzero,
// else the ATI request if natis is nonzero,
// else the CPU request.
//
bool app_version_needs_work(APP_VERSION* avp) {
    bool needed;
    if (avp->ncudas) {
        needed = cuda_work_fetch.req_secs>0 || cuda_work_fetch.req_instances>0;
    } else if (avp->natis) {
        needed = ati_work_fetch.req_secs>0 || ati_work_fetch.req_instances>0;
    } else {
        needed = cpu_work_fetch.req_secs>0 || cpu_work_fetch.req_instances>0;
    }
    return needed;
}
|
||||
|
||||
// Return true if at least one app version of the given app
// (among gstate.app_versions) needs work.
//
bool has_app_version_needing_work(APP* app) {
    unsigned int k = 0;
    while (k < gstate.app_versions.size()) {
        APP_VERSION* candidate = gstate.app_versions[k++];
        if (candidate->app == app && app_version_needs_work(candidate)) {
            return true;
        }
    }
    return false;
}
|
||||
|
||||
// choose a version for this app for which we need work
|
||||
//
|
||||
APP_VERSION* choose_app_version(APP* app) {
|
||||
APP_VERSION* best_avp = NULL;
|
||||
for (unsigned int i=0; i<gstate.app_versions.size(); i++) {
|
||||
APP_VERSION* avp = gstate.app_versions[i];
|
||||
if (avp->app != app) continue;
|
||||
if (!app_version_needs_work(avp)) continue;
|
||||
if (!best_avp) {
|
||||
best_avp = avp;
|
||||
} else if (avp->flops > best_avp->flops) {
|
||||
best_avp = avp;
|
||||
}
|
||||
}
|
||||
return best_avp;
|
||||
}
|
||||
|
||||
// generate a job; pick a random app for this project,
|
||||
// and pick a FLOP count from its distribution
|
||||
//
|
||||
void CLIENT_STATE::make_job(PROJECT* p, WORKUNIT* wup, RESULT* rp) {
|
||||
APP* ap1, *ap=0;
|
||||
double net_fpops = host_info.p_fpops;
|
||||
double x = drand();
|
||||
unsigned int i;
|
||||
|
||||
for (i=0; i<apps.size();i++) {
|
||||
ap1 = apps[i];
|
||||
if (ap1->project != p) continue;
|
||||
x -= ap1->weight;
|
||||
if (x <= 0) {
|
||||
ap = ap1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!ap) {
|
||||
printf("ERROR - NO APP\n");
|
||||
exit(1);
|
||||
}
|
||||
void make_job(
|
||||
PROJECT* p, WORKUNIT* wup, RESULT* rp, vector<APP*>app_list
|
||||
) {
|
||||
APP* app = choose_app(app_list);
|
||||
APP_VERSION* avp = choose_app_version(app);
|
||||
rp->clear();
|
||||
rp->avp = 0;
|
||||
for (i=0; i<gstate.app_versions.size(); i++) {
|
||||
APP_VERSION* avp = gstate.app_versions[i];
|
||||
if (avp->app == ap) {
|
||||
rp->avp = avp;
|
||||
break;
|
||||
}
|
||||
}
|
||||
rp->avp = avp;
|
||||
if (!rp->avp) {
|
||||
printf("ERROR - NO APP VERSION\n");
|
||||
exit(1);
|
||||
|
@ -127,23 +160,23 @@ void CLIENT_STATE::make_job(PROJECT* p, WORKUNIT* wup, RESULT* rp) {
|
|||
rp->wup = wup;
|
||||
sprintf(rp->name, "%s_%d", p->project_name, p->result_index++);
|
||||
wup->project = p;
|
||||
wup->rsc_fpops_est = ap->fpops_est;
|
||||
double ops = ap->fpops.sample();
|
||||
wup->rsc_fpops_est = app->fpops_est;
|
||||
double ops = app->fpops.sample();
|
||||
if (ops < 0) ops = 0;
|
||||
rp->final_cpu_time = ops/net_fpops;
|
||||
rp->report_deadline = now + ap->latency_bound;
|
||||
rp->final_cpu_time = ops/avp->flops;
|
||||
rp->report_deadline = gstate.now + app->latency_bound;
|
||||
}
|
||||
|
||||
// process ready-to-report results
|
||||
//
|
||||
void CLIENT_STATE::handle_completed_results() {
|
||||
void CLIENT_STATE::handle_completed_results(PROJECT* p) {
|
||||
char buf[256];
|
||||
vector<RESULT*>::iterator result_iter;
|
||||
|
||||
result_iter = results.begin();
|
||||
while (result_iter != results.end()) {
|
||||
RESULT* rp = *result_iter;
|
||||
if (rp->ready_to_report) {
|
||||
if (rp->project == p && rp->ready_to_report) {
|
||||
sprintf(buf, "result %s reported; %s<br>",
|
||||
rp->name,
|
||||
(gstate.now > rp->report_deadline)?
|
||||
|
@ -185,6 +218,31 @@ void CLIENT_STATE::get_workload(vector<IP_RESULT>& ip_results) {
|
|||
init_ip_results(work_buf_min(), ncpus, ip_results);
|
||||
}
|
||||
|
||||
// Replace the contents of "apps" with those apps of project p
// (from gstate.apps) that have at least one app version needing work.
//
void get_apps_needing_work(PROJECT* p, vector<APP*>& apps) {
    apps.clear();
    for (unsigned int k=0; k<gstate.apps.size(); k++) {
        APP* candidate = gstate.apps[k];
        if (candidate->project == p && has_app_version_needing_work(candidate)) {
            apps.push_back(candidate);
        }
    }
}
|
||||
|
||||
// Reduce a resource's outstanding work request:
// req_instances drops by ninstances,
// and req_secs drops by est_runtime for each of those instances.
//
void decrement_request_rsc(
    RSC_WORK_FETCH& rwf, double ninstances, double est_runtime
) {
    double secs_covered = est_runtime * ninstances;
    rwf.req_instances -= ninstances;
    rwf.req_secs -= secs_covered;
}
|
||||
|
||||
// Subtract a job from the CPU, CUDA and ATI work requests.
// The job's runtime is estimated as the workunit's
// rsc_fpops_est divided by its app version's flops;
// each resource is charged for the number of instances
// the app version uses (avg_ncpus, ncudas, natis).
//
void decrement_request(RESULT* rp) {
    APP_VERSION* version = rp->avp;
    double runtime = rp->wup->rsc_fpops_est/version->flops;

    decrement_request_rsc(cpu_work_fetch, version->avg_ncpus, runtime);
    decrement_request_rsc(cuda_work_fetch, version->ncudas, runtime);
    decrement_request_rsc(ati_work_fetch, version->natis, runtime);
}
|
||||
|
||||
// simulate trying to do an RPC;
|
||||
// return true if we actually did one
|
||||
//
|
||||
|
@ -193,6 +251,7 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
|
|||
static double last_time=-1e9;
|
||||
vector<IP_RESULT> ip_results;
|
||||
int infeasible_count = 0;
|
||||
vector<RESULT*> new_results;
|
||||
|
||||
double diff = now - last_time;
|
||||
if (diff && diff < connection_interval) {
|
||||
|
@ -204,25 +263,32 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
|
|||
}
|
||||
last_time = now;
|
||||
|
||||
work_fetch.request_string(buf);
|
||||
// save request params for WORK_FETCH::handle_reply
|
||||
double save_cpu_req_secs = cpu_work_fetch.req_secs;
|
||||
host_info.coprocs.cuda.req_secs = cuda_work_fetch.req_secs;
|
||||
host_info.coprocs.ati.req_secs = ati_work_fetch.req_secs;
|
||||
|
||||
|
||||
work_fetch.request_string(buf2);
|
||||
sprintf(buf, "RPC to %s: %s<br>", p->project_name, buf2);
|
||||
html_msg += buf;
|
||||
|
||||
msg_printf(0, MSG_INFO, buf);
|
||||
|
||||
handle_completed_results();
|
||||
handle_completed_results(p);
|
||||
|
||||
if (server_uses_workload) {
|
||||
get_workload(ip_results);
|
||||
}
|
||||
|
||||
bool sent_something = false;
|
||||
double work_left = cpu_work_fetch.req_secs;
|
||||
double instances_needed = cpu_work_fetch.req_instances;
|
||||
while (work_left > 0 || instances_needed>0) {
|
||||
while (1) {
|
||||
vector<APP*> apps;
|
||||
get_apps_needing_work(p, apps);
|
||||
if (apps.empty()) break;
|
||||
RESULT* rp = new RESULT;
|
||||
WORKUNIT* wup = new WORKUNIT;
|
||||
make_job(p, wup, rp);
|
||||
make_job(p, wup, rp, apps);
|
||||
|
||||
if (server_uses_workload) {
|
||||
IP_RESULT c(rp->name, rp->report_deadline, rp->final_cpu_time);
|
||||
|
@ -241,17 +307,18 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
|
|||
sent_something = true;
|
||||
rp->set_state(RESULT_FILES_DOWNLOADED, "simulate_rpc");
|
||||
results.push_back(rp);
|
||||
new_results.push_back(rp);
|
||||
sprintf(buf, "got job %s: CPU time %.2f, deadline %s<br>",
|
||||
rp->name, rp->final_cpu_time, time_to_string(rp->report_deadline)
|
||||
);
|
||||
html_msg += buf;
|
||||
work_left -= p->duration_correction_factor*wup->rsc_fpops_est/host_info.p_fpops;
|
||||
instances_needed -= 1;
|
||||
decrement_request(rp);
|
||||
}
|
||||
|
||||
if (cpu_work_fetch.req_secs > 0 && !sent_something) {
|
||||
p->backoff();
|
||||
}
|
||||
|
||||
SCHEDULER_REPLY sr;
|
||||
cpu_work_fetch.req_secs = save_cpu_req_secs;
|
||||
work_fetch.handle_reply(p, &sr, new_results);
|
||||
p->nrpc_failures = 0;
|
||||
if (sent_something) {
|
||||
request_schedule_cpus("simulate_rpc");
|
||||
|
@ -260,6 +327,13 @@ bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
|
|||
return true;
|
||||
}
|
||||
|
||||
// Start with the CPU, CUDA and ATI backoff fields all zero.
//
SCHEDULER_REPLY::SCHEDULER_REPLY() {
    cpu_backoff = cuda_backoff = ati_backoff = 0;
}
|
||||
// Destructor; the body is empty.
SCHEDULER_REPLY::~SCHEDULER_REPLY() {}
|
||||
|
||||
void PROJECT::backoff() {
|
||||
nrpc_failures++;
|
||||
double backoff = calculate_exponential_backoff(
|
||||
|
@ -619,14 +693,16 @@ void CLIENT_STATE::html_rec() {
|
|||
int n=0;
|
||||
for (unsigned int i=0; i<active_tasks.active_tasks.size(); i++) {
|
||||
ACTIVE_TASK* atp = active_tasks.active_tasks[i];
|
||||
int np = atp->result->avp->avg_ncpus;
|
||||
if (np < 1) np = 1;
|
||||
if (atp->task_state() == PROCESS_EXECUTING) {
|
||||
PROJECT* p = atp->result->project;
|
||||
fprintf(html_out, "<td bgcolor=%s>%s%s: %.2f</td>",
|
||||
colors[p->index],
|
||||
fprintf(html_out, "<td colspan=%d bgcolor=%s>%s%s: %.2f</td>",
|
||||
np, colors[p->index],
|
||||
atp->result->rr_sim_misses_deadline?"*":"",
|
||||
atp->result->name, atp->cpu_time_left
|
||||
);
|
||||
n++;
|
||||
n += np;
|
||||
}
|
||||
}
|
||||
while (n<ncpus) {
|
||||
|
@ -819,8 +895,12 @@ void CLIENT_STATE::do_client_simulation() {
|
|||
add_platform("client simulator");
|
||||
parse_state_file();
|
||||
read_global_prefs();
|
||||
cull_projects();
|
||||
int j=0;
|
||||
for (unsigned int i=0; i<projects.size(); i++) {
|
||||
projects[i]->index = i;
|
||||
if (!projects[i]->dont_request_more_work) {
|
||||
projects[i]->index = j++;
|
||||
}
|
||||
}
|
||||
|
||||
gstate.now = 86400;
|
||||
|
|
|
@ -47,6 +47,13 @@ struct PROJECT_RESULTS {
|
|||
double cpu_wasted;
|
||||
int nresults_met_deadline;
|
||||
int nresults_missed_deadline;
|
||||
|
||||
// Start with all CPU-time totals and result counts at zero.
//
PROJECT_RESULTS() {
    cpu_used = cpu_wasted = 0;
    nresults_met_deadline = nresults_missed_deadline = 0;
}
|
||||
};
|
||||
|
||||
struct NORMAL_DIST {
|
||||
|
|
111
doc/sim_form.php
111
doc/sim_form.php
|
@ -6,102 +6,22 @@ function show_form() {
|
|||
echo "
|
||||
<form action=sim_form.php method=post>
|
||||
|
||||
<b>sim_projects.xml</b> (describes a set of projects)
|
||||
<b>client_state.xml</b>
|
||||
<br>
|
||||
<textarea name=projects rows=10 cols=80><projects>
|
||||
<project>
|
||||
<project_name>P1</project_name>
|
||||
<resource_share>100</resource_share>
|
||||
<app>
|
||||
<latency_bound>15000</latency_bound>
|
||||
<fpops_est>3600</fpops_est>
|
||||
<fpops>
|
||||
<mean>3600</mean>
|
||||
<stdev>10</stdev>
|
||||
</fpops>
|
||||
<working_set>1e7</working_set>
|
||||
</app>
|
||||
<available>
|
||||
<frac>.7</frac>
|
||||
<lambda>1000</lambda>
|
||||
</available>
|
||||
</project>
|
||||
<project>
|
||||
<project_name>P2</project_name>
|
||||
<resource_share>50</resource_share>
|
||||
<app>
|
||||
<latency_bound>10000</latency_bound>
|
||||
<fpops_est>1800</fpops_est>
|
||||
<fpops>
|
||||
<mean>1800</mean>
|
||||
<stdev>10</stdev>
|
||||
</fpops>
|
||||
<working_set>1e7</working_set>
|
||||
</app>
|
||||
<available>
|
||||
<frac>.7</frac>
|
||||
<lambda>1000</lambda>
|
||||
</available>
|
||||
</project>
|
||||
</projects></textarea>
|
||||
<textarea name=client_state rows=10 cols=80>
|
||||
</textarea>
|
||||
|
||||
<p>
|
||||
<b>sim_host.xml</b>: (describes the simulated host)
|
||||
<b>global_prefs.xml:</b> (the host's preferences)
|
||||
<br>
|
||||
<textarea name=host rows=10 cols=80><host>
|
||||
<p_fpops>1</p_fpops>
|
||||
<m_nbytes>1e9</m_nbytes>
|
||||
<connection_interval>600</connection_interval>
|
||||
<p_ncpus>2</p_ncpus>
|
||||
<available>
|
||||
<frac>.8</frac>
|
||||
<lambda>1000</lambda>
|
||||
</available>
|
||||
</host></textarea>
|
||||
|
||||
<p>
|
||||
<b>sim_prefs.xml:</b> (the host's preferences)
|
||||
<br>
|
||||
<textarea name=prefs rows=10 cols=80><global_preferences>
|
||||
<source_project>http://isaac.ssl.berkeley.edu/alpha/</source_project>
|
||||
<source_scheduler>isaac.ssl.berkeley.edu/alpha_cgi/cgi</source_scheduler>
|
||||
<mod_time>1170192285</mod_time>
|
||||
<run_if_user_active/>
|
||||
<idle_time_to_run>3</idle_time_to_run>
|
||||
<leave_apps_in_memory/>
|
||||
<cpu_scheduling_period_minutes>1</cpu_scheduling_period_minutes>
|
||||
<hangup_if_dialed/>
|
||||
<work_buf_min_days>0.1</work_buf_min_days>
|
||||
<work_buf_additional_days>0</work_buf_additional_days>
|
||||
<max_cpus>4</max_cpus>
|
||||
<cpu_usage_limit>100</cpu_usage_limit>
|
||||
<disk_interval>180</disk_interval>
|
||||
<disk_max_used_gb>100</disk_max_used_gb>
|
||||
<disk_max_used_pct>50</disk_max_used_pct>
|
||||
<disk_min_free_gb>2</disk_min_free_gb>
|
||||
<vm_max_used_pct>75</vm_max_used_pct>
|
||||
<ram_max_used_busy_pct>50</ram_max_used_busy_pct>
|
||||
<ram_max_used_idle_pct>90</ram_max_used_idle_pct>
|
||||
<max_bytes_sec_down>200000</max_bytes_sec_down>
|
||||
<max_bytes_sec_up>200000</max_bytes_sec_up>
|
||||
</global_preferences></textarea>
|
||||
<textarea name=global_prefs rows=10 cols=80>
|
||||
</textarea>
|
||||
|
||||
<p>
|
||||
<b>cc_config.xml:</b> (the client configuration options)
|
||||
<br>
|
||||
<textarea name=cc_config rows=10 cols=80><cc_config>
|
||||
<log_flags>
|
||||
<mem_usage_debug>0</mem_usage_debug>
|
||||
<cpu_sched_debug>1</cpu_sched_debug>
|
||||
<cpu_sched>1</cpu_sched>
|
||||
<rr_simulation>0</rr_simulation>
|
||||
<benchmark_debug>1</benchmark_debug>
|
||||
<task_debug>1</task_debug>
|
||||
<work_fetch_debug>1</work_fetch_debug>
|
||||
<app_msg_send>0</app_msg_send>
|
||||
<unparsed_xml/>
|
||||
</log_flags>
|
||||
</cc_config></textarea>
|
||||
<textarea name=cc_config rows=10 cols=80>
|
||||
</textarea>
|
||||
|
||||
<p>
|
||||
<b>
|
||||
|
@ -140,13 +60,14 @@ function show_form() {
|
|||
if ($_POST['submit']) {
|
||||
chdir("sim");
|
||||
|
||||
$x1 = file_put_contents("sim_projects.xml", $_POST['projects']);
|
||||
$x2 = file_put_contents("sim_host.xml", $_POST['host']);
|
||||
$x3 = file_put_contents("sim_prefs.xml", $_POST['prefs']);
|
||||
$x4 = file_put_contents("cc_config.xml", $_POST['cc_config']);
|
||||
if (!$x1 || !$x2 || !$x3 || !$x4) {
|
||||
echo "Can't write files - check permissions\n";
|
||||
exit();
|
||||
if (!file_put_contents("client_state.xml", $_POST['client_state'])) {
|
||||
echo "Can't write client_state.xml - check permissions\n"; exit();
|
||||
}
|
||||
if (!file_put_contents("global_prefs.xml", $_POST['global_prefs'])) {
|
||||
echo "Can't write global_prefs.xml - check permissions\n"; exit();
|
||||
}
|
||||
if (!file_put_contents("cc_config.xml", $_POST['cc_config'])) {
|
||||
echo "Can't write cc_config.xml - check permissions\n"; exit();
|
||||
}
|
||||
$duration = $_POST['duration'];
|
||||
|
||||
|
|
|
@ -373,7 +373,8 @@ int handle_wu(
|
|||
"[WU#%d %s] assign_credit_set() returned %d\n",
|
||||
wu.id, wu.name, retval
|
||||
);
|
||||
return retval;
|
||||
transition_time = DELAYED;
|
||||
goto leave;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -530,6 +531,7 @@ int handle_wu(
|
|||
}
|
||||
}
|
||||
|
||||
leave:
|
||||
--log_messages;
|
||||
|
||||
switch (transition_time) {
|
||||
|
|
Loading…
Reference in New Issue