- client: abort jobs that are unstarted and past deadline

- client: abort runaway jobs based on elapsed time instead of CPU time.
    Specifically, abort jobs for which
    elapsed time > WU.rsc_fpops_bound / app_version.flops
    This policy works for
    1) GPU jobs (which may use little CPU time)
    2) jobs that run but because of bugs use little CPU time
        (e.g., because they're sleeping)
    whereas the old policy didn't.

svn path=/trunk/boinc/; revision=17399
This commit is contained in:
David Anderson 2009-02-26 17:51:38 +00:00
parent 41fe3e40bf
commit feab684b3d
8 changed files with 45 additions and 9 deletions

View File

@ -2256,3 +2256,22 @@ David 25 Feb 2009
cs_prefs.cpp
cs_scheduler.cpp
cpu_sched.cpp
David 25 Feb 2009
- client: abort jobs that are unstarted and past deadline
- client: abort runaway jobs based on elapsed time instead of CPU time.
Specifically, abort jobs for which
elapsed time > WU.rsc_fpops_bound / app_version.flops
This policy works for
1) GPU jobs (which may use little CPU time)
2) jobs that run but because of bugs use little CPU time
(e.g., because they're sleeping)
whereas the old policy didn't.
client/
client_state.cpp,h
app.cpp,h
app_control.cpp
lib/
error_numbers.h
str_util.cpp

View File

@ -266,7 +266,7 @@ int ACTIVE_TASK::init(RESULT* rp) {
result = rp;
wup = rp->wup;
app_version = rp->avp;
max_cpu_time = rp->wup->rsc_fpops_bound/gstate.host_info.p_fpops;
max_elapsed_time = rp->wup->rsc_fpops_bound/rp->avp->flops;
max_disk_usage = rp->wup->rsc_disk_bound;
max_mem_usage = rp->wup->rsc_memory_bound;
get_slot_dir(slot, slot_dir, sizeof(slot_dir));

View File

@ -109,8 +109,8 @@ public:
/// (that way don't have to worry about top-level dirs
/// being non-readable, etc).
char slot_path[512];
/// abort if total CPU exceeds this
double max_cpu_time;
/// abort if elapsed time exceeds this
double max_elapsed_time;
/// abort if disk usage (in+out+temp) exceeds this
double max_disk_usage;
/// abort if memory usage exceeds this

View File

@ -571,12 +571,12 @@ bool ACTIVE_TASK_SET::check_rsc_limits_exceeded() {
for (i=0; i<active_tasks.size(); i++) {
atp = active_tasks[i];
if (atp->task_state() != PROCESS_EXECUTING) continue;
if (atp->current_cpu_time > atp->max_cpu_time) {
if (atp->elapsed_time > atp->max_elapsed_time) {
msg_printf(atp->result->project, MSG_INFO,
"Aborting task %s: exceeded CPU time limit %f\n",
atp->result->name, atp->max_cpu_time
"Aborting task %s: exceeded elapsed time limit %f\n",
atp->result->name, atp->max_elapsed_time
);
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum CPU time exceeded");
atp->abort_task(ERR_RSC_LIMIT_EXCEEDED, "Maximum elapsed time exceeded");
did_anything = true;
continue;
}

View File

@ -919,12 +919,25 @@ int CLIENT_STATE::nresults_for_project(PROJECT* p) {
return n;
}
/// Abort every result that has not been started and whose report deadline
/// has already passed (each such result is aborted with ERR_UNSTARTED_LATE).
///
/// @return true if at least one result was aborted by this call,
///         false otherwise (including when the clock sanity check fails).
///
bool CLIENT_STATE::abort_unstarted_late_jobs() {
    // Previously this function fell off the end without returning a value,
    // which is undefined behavior for a function declared to return bool.
    // Track whether anything was aborted so the caller gets a real answer.
    bool action = false;

    // Skip if user reset system clock.
    // The constant is a Unix timestamp from late February 2009; a "now"
    // earlier than that is treated as an untrustworthy clock, so no
    // deadline comparisons are made.
    if (now < 1235668593) return false;

    for (unsigned int i=0; i<results.size(); i++) {
        RESULT* rp = results[i];
        // only jobs that have not been started are candidates
        if (!rp->not_started()) continue;
        // deadline still in the future: leave the job alone
        if (rp->report_deadline > now) continue;
        rp->abort_inactive(ERR_UNSTARTED_LATE);
        action = true;
    }
    return action;
}
bool CLIENT_STATE::garbage_collect() {
bool action;
static double last_time=0;
if (gstate.now - last_time < GARBAGE_COLLECT_PERIOD) return false;
last_time = gstate.now;
bool action = garbage_collect_always();
action = abort_unstarted_late_jobs();
if (action) return true;
action = garbage_collect_always();
if (action) return true;
// Detach projects that are marked for detach when done

View File

@ -240,6 +240,7 @@ private:
int link_workunit(PROJECT*, WORKUNIT*);
int link_result(PROJECT*, RESULT*);
void print_summary();
bool abort_unstarted_late_jobs();
bool garbage_collect();
bool garbage_collect_always();
bool update_results();

View File

@ -187,8 +187,9 @@
#define ERR_DB_CONN_LOST -230
#define ERR_CRYPTO -231
#define ERR_ABORTED_ON_EXIT -232
#define ERR_UNSTARTED_LATE -233
// PLEASE: add a text description of your error to
// the text description function boincerror() in str_util.C.
// the text description function boincerror() in str_util.cpp.
#endif

View File

@ -736,6 +736,8 @@ const char* boincerror(int which_error) {
case ERR_SYMLINK: return "symlink() failed";
case ERR_DB_CONN_LOST: return "DB connection lost during enumeration";
case ERR_CRYPTO: return "encryption error";
case ERR_ABORTED_ON_EXIT: return "job was aborted on client exit";
case ERR_UNSTARTED_LATE: return "job is unstarted and past deadline";
case 404: return "HTTP file not found";
case 407: return "HTTP proxy authentication failure";
case 416: return "HTTP range request error";