- scheduler: fix a problem with job resend.

When we first send a job, we pick an app version,
    then call wu_is_infeasible_fast()
    to see if the host is able to run the job with that app version.
    In addition to checking disk space etc.,
    this calls wu_is_infeasible_custom() to do project-specific checks
    (e.g. for SETI@home: don't use GPUs for VLAR jobs).

    However, when we resend a job, we pick an app version
    (possibly different from the original one)
    and send the job without any checking.
    So, for example, we might send a VLAR job to a GPU,
    or send a job to a host with insufficient disk space
    (because free space has changed since original send).

    Solution: call wu_is_infeasible_fast() before resending a job,
    and if it returns true, mark the job as done and don't resend it.


svn path=/trunk/boinc/; revision=23098
This commit is contained in:
David Anderson 2011-02-24 19:30:43 +00:00
parent 00f7a7778b
commit 3b05dc6203
5 changed files with 36 additions and 6 deletions

View File

@ -1087,3 +1087,26 @@ David 23 Feb 2011
cs_notice.cpp
pers_file_xfer.cpp
work_fetch.cpp
David 24 Feb 2011
- scheduler: fix a problem with job resend.
When we first send a job, we pick an app version,
then call wu_is_infeasible_fast()
to see if the host is able to run the job with that app version.
In addition to checking disk space etc.,
this calls wu_is_infeasible_custom() to do project-specific checks
(e.g. for SETI@home: don't use GPUs for VLAR jobs).
However, when we resend a job, we pick an app version
(possibly different from the original one)
and send the job without any checking.
So, for example, we might send a VLAR job to a GPU,
or send a job to a host with insufficient disk space
(because free space has changed since original send).
Solution: call wu_is_infeasible_fast() before resending a job,
and if it returns true, mark the job as done and don't resend it.
sched/
sched_send.cpp
sched_resend.cpp

View File

@ -826,6 +826,7 @@ static void promote_once_ran_edf() {
);
}
rp->rr_sim_misses_deadline = true;
continue;
}
}
if (gstate.now - atp->last_deadline_miss_time < gstate.global_prefs.cpu_scheduling_period()) {
@ -833,7 +834,7 @@ static void promote_once_ran_edf() {
RESULT* rp = atp->result;
PROJECT* p = rp->project;
msg_printf(p, MSG_INFO,
"[cpu_sched] trashing prevention: mark %s as deadline miss",
"[cpu_sched] thrashing prevention: mark %s as deadline miss",
rp->name
);
}

View File

@ -20,9 +20,10 @@ require_once("../inc/boinc_db.inc");
require_once("../inc/util.inc");
require_once("../inc/host.inc");
check_get_args(array("hostid"));
check_get_args(array("hostid", "ttok", "tnow"));
$user = get_logged_in_user();
check_tokens($user->authenticator);
$hostid = get_int("hostid");
$host = BoincHost::lookup_id($hostid);

View File

@ -148,9 +148,9 @@ bool resend_lost_work() {
continue;
}
APP* app = ssp->lookup_app(wu.appid);
bavp = get_app_version(wu, false, false);
if (!bavp) {
APP* app = ssp->lookup_app(wu.appid);
log_messages.printf(MSG_CRITICAL,
"[HOST#%d] can't resend [RESULT#%d]: no app version for %s\n",
g_reply->host.id, result.id, app->name
@ -166,9 +166,13 @@ bool resend_lost_work() {
// so that the transitioner does 'the right thing'.
//
if (
wu.error_mask ||
wu.canonical_resultid ||
possibly_give_result_new_deadline(result, wu, *bavp)
wu.error_mask
|| wu.canonical_resultid
|| wu_is_infeasible_fast(
wu, result.server_state, result.priority, result.report_deadline,
*app, *bavp
)
|| possibly_give_result_new_deadline(result, wu, *bavp)
) {
if (config.debug_resend) {
log_messages.printf(MSG_NORMAL,

View File

@ -577,6 +577,7 @@ static inline void update_estimated_delay(BEST_APP_VERSION& bav, double dt) {
// Actually, return two: optimistic (lower) and pessimistic (higher).
// If the deadline check with the optimistic bound fails,
// try the pessimistic bound.
// TODO: clean up this mess
//
static void get_delay_bound_range(
WORKUNIT& wu,