mirror of https://github.com/BOINC/boinc.git
- client: abort jobs that are unstarted and past deadline
- client: abort runaway jobs based on elapsed time instead of CPU time. Specifically, abort jobs for which elapsed time > WU.rsc_fpops_bound / app_version.flops. This policy works for 1) GPU jobs (which may use little CPU time) and 2) jobs that run but, because of bugs, use little CPU time (e.g., because they're sleeping), whereas the old policy didn't. svn path=/trunk/boinc/; revision=17399
This commit is contained in:
parent
41fe3e40bf
commit
feab684b3d
|
@ -2256,3 +2256,22 @@ David 25 Feb 2009
|
|||
cs_prefs.cpp
|
||||
cs_scheduler.cpp
|
||||
cpu_sched.cpp
|
||||
|
||||
David 25 Feb 2009
|
||||
- client: abort jobs that are unstarted and past deadline
|
||||
- client: abort runaway jobs based on elapsed time instead of CPU time.
|
||||
Specifically, abort jobs for which
|
||||
elapsed time > WU.rsc_fpops_bound / app_version.flops
|
||||
This policy works for
|
||||
1) GPU jobs (which may use little CPU time)
|
||||
2) jobs that run but because of bugs use little CPU time
|
||||
(e.g., because they're sleeping)
|
||||
whereas the old policy didn't.
|
||||
|
||||
client/
|
||||
client_state.cpp,h
|
||||
app.cpp,h
|
||||
app_control.cpp
|
||||
lib/
|
||||
error_numbers.h
|
||||
str_util.cpp
|
||||
|
|
|
@ -266,7 +266,7 @@ int ACTIVE_TASK::init(RESULT* rp) {
|
|||
result = rp;
|
||||
wup = rp->wup;
|
||||
app_version = rp->avp;
|
||||
max_cpu_time = rp->wup->rsc_fpops_bound/gstate.host_info.p_fpops;
|
||||
max_elapsed_time = rp->wup->rsc_fpops_bound/rp->avp->flops;
|
||||
max_disk_usage = rp->wup->rsc_disk_bound;
|
||||
max_mem_usage = rp->wup->rsc_memory_bound;
|
||||
get_slot_dir(slot, slot_dir, sizeof(slot_dir));
|
||||
|
|
|
@ -109,8 +109,8 @@ public:
|
|||
/// (that way don't have to worry about top-level dirs
|
||||
/// being non-readable, etc).
|
||||
char slot_path[512];
|
||||
/// abort if total CPU exceeds this
|
||||
double max_cpu_time;
|
||||
/// abort if elapsed time exceeds this
|
||||
double max_elapsed_time;
|
||||
/// abort if disk usage (in+out+temp) exceeds this
|
||||
double max_disk_usage;
|
||||
/// abort if memory usage exceeds this
|
||||
|
|
|
@ -571,12 +571,12 @@ bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() {
|
|||
for (i=0; i<active_tasks.size(); i++) {
|
||||
atp = active_tasks[i];
|
||||
if (atp->task_state() != PROCESS_EXECUTING) continue;
|
||||
if (atp->current_cpu_time > atp->max_cpu_time) {
|
||||
if (atp->elapsed_time > atp->max_elapsed_time) {
|
||||
msg_printf(atp->result->project, MSG_INFO,
|
||||
"Aborting task %s: exceeded CPU time limit %f\n",
|
||||
atp->result->name, atp->max_cpu_time
|
||||
"Aborting task %s: exceeded elapsed time limit %f\n",
|
||||
atp->result->name, atp->max_elapsed_time
|
||||
);
|
||||
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum CPU time exceeded");
|
||||
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum elapsed time exceeded");
|
||||
did_anything = true;
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -919,12 +919,25 @@ int CLIENT_STATE::nresults_for_project(PROJECT* p) {
|
|||
return n;
|
||||
}
|
||||
|
||||
bool CLIENT_STATE::abort_unstarted_late_jobs() {
|
||||
if (now < 1235668593) return false; // skip if user reset system clock
|
||||
for (unsigned int i=0; i<results.size(); i++) {
|
||||
RESULT* rp = results[i];
|
||||
if (!rp->not_started()) continue;
|
||||
if (rp->report_deadline > now) continue;
|
||||
rp->abort_inactive(ERR_UNSTARTED_LATE);
|
||||
}
|
||||
}
|
||||
|
||||
bool CLIENT_STATE::garbage_collect() {
|
||||
bool action;
|
||||
static double last_time=0;
|
||||
if (gstate.now - last_time < GARBAGE_COLLECT_PERIOD) return false;
|
||||
last_time = gstate.now;
|
||||
|
||||
bool action = garbage_collect_always();
|
||||
action = abort_unstarted_late_jobs();
|
||||
if (action) return true;
|
||||
action = garbage_collect_always();
|
||||
if (action) return true;
|
||||
|
||||
// Detach projects that are marked for detach when done
|
||||
|
|
|
@ -240,6 +240,7 @@ private:
|
|||
int link_workunit(PROJECT*, WORKUNIT*);
|
||||
int link_result(PROJECT*, RESULT*);
|
||||
void print_summary();
|
||||
bool abort_unstarted_late_jobs();
|
||||
bool garbage_collect();
|
||||
bool garbage_collect_always();
|
||||
bool update_results();
|
||||
|
|
|
@ -187,8 +187,9 @@
|
|||
#define ERR_DB_CONN_LOST -230
|
||||
#define ERR_CRYPTO -231
|
||||
#define ERR_ABORTED_ON_EXIT -232
|
||||
#define ERR_UNSTARTED_LATE -233
|
||||
|
||||
// PLEASE: add a text description of your error to
|
||||
// the text description function boincerror() in str_util.C.
|
||||
// the text description function boincerror() in str_util.cpp.
|
||||
|
||||
#endif
|
||||
|
|
|
@ -736,6 +736,8 @@ const char* boincerror(int which_error) {
|
|||
case ERR_SYMLINK: return "symlink() failed";
|
||||
case ERR_DB_CONN_LOST: return "DB connection lost during enumeration";
|
||||
case ERR_CRYPTO: return "encryption error";
|
||||
case ERR_ABORTED_ON_EXIT: return "job was aborted on client exit";
|
||||
case ERR_UNSTARTED_LATE: return "job is unstarted and past deadline";
|
||||
case 404: return "HTTP file not found";
|
||||
case 407: return "HTTP proxy authentication failure";
|
||||
case 416: return "HTTP range request error";
|
||||
|
|
Loading…
Reference in New Issue