diff --git a/checkin_notes b/checkin_notes index 3007ee6317..064e99612d 100644 --- a/checkin_notes +++ b/checkin_notes @@ -6674,3 +6674,18 @@ David 7 Nov 2012 - a bunch of skin files had execute permissions (??). Clear them. clientgui/skins/Charity Engine/graphic/* + +David 7 Nov 2012 + - scheduler: when resend jobs: + - don't use devices for which work is not being requested + - obey wu_is_infeasible_custom() + (e.g. don't send SETI@home VLAR jobs to GPUs) + - scheduler: add log flag for slot-level messages + - admin web: show and allow control of app.beta + + html/ops/ + manage_apps.php + sched/ + sched_array.cpp + sched_config.cpp,h + sched_resend.cpp diff --git a/html/ops/manage_apps.php b/html/ops/manage_apps.php index 33043b03ab..cab9a5f936 100644 --- a/html/ops/manage_apps.php +++ b/html/ops/manage_apps.php @@ -78,6 +78,13 @@ function do_updates() { if ($new_v != $old_v ) { $app->update("non_cpu_intensive=$new_v"); } + + $field = "beta_".$id; + $new_v = (post_str($field, true)=='on') ? 1 : 0; + $old_v = $app->beta; + if ($new_v != $old_v ) { + $app->update("beta=$new_v"); + } } // Adding a new application @@ -127,7 +134,8 @@ function show_form($updated) { "homogeneous redundancy type
details", "homogeneous app version?
details", "deprecated?", - "Non-CPU-intensive?" + "Non-CPU-intensive?", + "Beta?" ); $total_weight = mysql_query('SELECT SUM(weight) AS total_weight FROM app WHERE deprecated=0'); @@ -189,6 +197,13 @@ function show_form($updated) { "; + $field = "beta_".$id; + $v = ''; + if ($item->beta) $v = ' CHECKED '; + echo " + + "; + echo " "; } mysql_free_result($result); diff --git a/sched/sched_array.cpp b/sched/sched_array.cpp index 0fc3ddd320..c75068eb14 100644 --- a/sched/sched_array.cpp +++ b/sched/sched_array.cpp @@ -57,8 +57,10 @@ static bool quick_check( // if (g_wreq->beta_only) { if (!app->beta) { - if (config.debug_array) { - log_messages.printf(MSG_NORMAL, "[array] not beta\n"); + if (config.debug_array_detail) { + log_messages.printf(MSG_NORMAL, + "[array_detail] job is not from beta app; skipping\n" + ); } return false; } @@ -70,8 +72,10 @@ static bool quick_check( } } else { if (app->beta) { - if (config.debug_array) { - log_messages.printf(MSG_NORMAL, "[array] is beta\n"); + if (config.debug_array_detail) { + log_messages.printf(MSG_NORMAL, + "[array_detail] job is from beta app; skipping\n" + ); } return false; } @@ -83,13 +87,17 @@ static bool quick_check( // if (!app->beta) { if (g_wreq->reliable_only && (!wu_result.need_reliable)) { - if (config.debug_array) { - log_messages.printf(MSG_NORMAL, "[array] don't need reliable\n"); + if (config.debug_array_detail) { + log_messages.printf(MSG_NORMAL, + "[array_detail] job doesn't need reliable host; skipping\n" + ); } return false; } else if (!g_wreq->reliable_only && wu_result.need_reliable) { - if (config.debug_array) { - log_messages.printf(MSG_NORMAL, "[array] need reliable\n"); + if (config.debug_array_detail) { + log_messages.printf(MSG_NORMAL, + "[array_detail] job needs reliable host; skipping\n" + ); } return false; } @@ -99,8 +107,10 @@ static bool quick_check( // and the result is not infeasible // if (g_wreq->infeasible_only && (wu_result.infeasible_count==0)) { - if (config.debug_array) { - log_messages.printf(MSG_NORMAL, "[array] not infeasible\n"); + if (config.debug_array_detail) { + log_messages.printf(MSG_NORMAL, + "[array_detail] job is not infeasible; skipping\n" + ); } return false; } @@ -137,9 +147,9 @@ static bool quick_check( // bavp = get_app_version(wu, true, g_wreq->reliable_only); if (!bavp) { - if (config.debug_array) { + if (config.debug_array_detail) { log_messages.printf(MSG_NORMAL, - "[array] No app version\n" + "[array_detail] No app version for job; skipping\n" ); } return false; @@ -154,14 +164,12 @@ static bool quick_check( ) { if (app_not_selected(wu)) { g_wreq->no_allowed_apps_available = true; -#if 1 - if (config.debug_array) { + if (config.debug_array_detail) { log_messages.printf(MSG_NORMAL, - "[array] [USER#%d] [WU#%d] user doesn't want work for app %s\n", + "[array_detail] [USER#%d] [WU#%d] user doesn't want work for app %s\n", g_reply->user.id, wu.id, app->name ); } -#endif return false; } } @@ -183,8 +191,10 @@ static bool quick_check( ); } last_retval = retval; - if (config.debug_array) { - log_messages.printf(MSG_NORMAL, "[array] infeasible\n"); + if (config.debug_array_detail) { + log_messages.printf(MSG_NORMAL, + "[array_detail] is_infeasible_fast() failed; skipping\n" + ); } return false; } @@ -376,13 +386,11 @@ static bool scan_work_array() { WU_RESULT& wu_result = ssp->wu_results[i]; -#if 0 - if (config.debug_array) { + if (config.debug_array_detail) { log_messages.printf(MSG_NORMAL, - "[array] scanning slot %d\n", i + "[array_detail] scanning slot %d\n", i ); } -#endif if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) { continue; @@ -408,9 +416,9 @@ static bool scan_work_array() { // This may modify wu.rsc_fpops_est // if (!quick_check(wu_result, wu, bavp, app, last_retval)) { - if (config.debug_array) { + if (config.debug_array_detail) { log_messages.printf(MSG_NORMAL, - "[array] slot %d failed quick check\n", i + "[array_detail] slot %d failed quick check\n", i ); } continue; @@ -502,7 +510,7 @@ void send_work_old() { } else { if (config.debug_array) { log_messages.printf(MSG_NORMAL, - "[array] host has no reliable app versions; skipping\n" + "[array] host has no reliable app versions; skipping scan\n" ); } } diff --git a/sched/sched_config.cpp b/sched/sched_config.cpp index 94944300fa..2c54bdbb21 100644 --- a/sched/sched_config.cpp +++ b/sched/sched_config.cpp @@ -297,6 +297,7 @@ int SCHED_CONFIG::parse(FILE* f) { //////////// SCHEDULER LOG FLAGS ///////// if (xp.parse_bool("debug_array", debug_array)) continue; + if (xp.parse_bool("debug_array_detail", debug_array_detail)) continue; if (xp.parse_bool("debug_assignment", debug_assignment)) continue; if (xp.parse_bool("debug_credit", debug_credit)) continue; if (xp.parse_bool("debug_edf_sim_detail", debug_edf_sim_detail)) continue; diff --git a/sched/sched_config.h b/sched/sched_config.h index aae6506500..f836a21ca1 100644 --- a/sched/sched_config.h +++ b/sched/sched_config.h @@ -180,6 +180,7 @@ struct SCHED_CONFIG { // scheduler log flags // bool debug_array; // debug job-cache scheduling + bool debug_array_detail; // show slot-level info bool debug_assignment; bool debug_credit; bool debug_edf_sim_detail; // show details of EDF sim diff --git a/sched/sched_resend.cpp b/sched/sched_resend.cpp index 5980ae328f..ca2ebf155c 100644 --- a/sched/sched_resend.cpp +++ b/sched/sched_resend.cpp @@ -34,6 +34,7 @@ #include "sched_main.h" #include "sched_config.h" +#include "sched_customize.h" #include "sched_locality.h" #include "sched_msgs.h" #include "sched_send.h" @@ -140,50 +141,81 @@ bool resend_lost_work() { } DB_WORKUNIT wu; - bool cant_resend = false; + bool can_resend = true; retval = wu.lookup_id(result.workunitid); if (retval) { log_messages.printf(MSG_CRITICAL, - "[HOST#%d] WU not found for [RESULT#%d]\n", + "[HOST#%d] can't resend - WU not found for [RESULT#%d]\n", g_reply->host.id, result.id ); - cant_resend = true; - } else { + can_resend = false; + } + if (can_resend) { app = ssp->lookup_app(wu.appid); - bavp = get_app_version(wu, false, false); + bavp = get_app_version(wu, true, false); if (!bavp) { - log_messages.printf(MSG_CRITICAL, - "[HOST#%d] can't resend [RESULT#%d]: no app version for %s\n", - g_reply->host.id, result.id, app->name - ); - cant_resend = true; + if (config.debug_resend) { + log_messages.printf(MSG_NORMAL, + "[HOST#%d] can't resend [RESULT#%d]: no app version for %s\n", + g_reply->host.id, result.id, app->name + ); + } + can_resend = false; } } - - // If error occurred, - // or time is too close to the deadline, - // or we already have a canonical result, - // or WU error flag is set, - // then don't resend this result. - // Instead make it time out right away - // so that the transitioner does 'the right thing'. - // - if ( - cant_resend - || wu.error_mask - || wu.canonical_resultid - || wu_is_infeasible_fast( - wu, result.server_state, result.priority, result.report_deadline, - *app, *bavp - ) - || possibly_give_result_new_deadline(result, wu, *bavp) - ) { + if (can_resend && wu.error_mask) { if (config.debug_resend) { log_messages.printf(MSG_NORMAL, - "[resend] [HOST#%d][RESULT#%d] not needed or too close to deadline, expiring\n", - g_reply->host.id, result.id + "[resend] skipping [RESULT#%d]: WU error mask %d\n", + result.id, wu.error_mask ); } + can_resend = false; + } + if (can_resend && wu.canonical_resultid) { + if (config.debug_resend) { + log_messages.printf(MSG_NORMAL, + "[resend] skipping [RESULT#%d]: already have canonical result\n", + result.id + ); + } + can_resend = false; + } + if (can_resend && wu_is_infeasible_fast( + wu, result.server_state, result.priority, result.report_deadline, + *app, *bavp + )) { + if (config.debug_resend) { + log_messages.printf(MSG_NORMAL, + "[resend] skipping [RESULT#%d]: feasibility check failed\n", + result.id + ); + } + can_resend = false; + } + if (can_resend && possibly_give_result_new_deadline(result, wu, *bavp)) { + if (config.debug_resend) { + log_messages.printf(MSG_NORMAL, + "[resend] skipping [RESULT#%d]: deadline assignment failed\n", + result.id + ); + } + can_resend = false; + } + if (can_resend && wu_is_infeasible_custom(wu, *app, *bavp)) { + if (config.debug_resend) { + log_messages.printf(MSG_NORMAL, + "[resend] skipping [RESULT#%d]: custom feasibility check failed\n", + result.id + ); + } + can_resend = false; + } + + // If we can't resend this job for any of the above reasons, + // make it time out so that the transitioner does the right thing. + // + if (!can_resend) { result.report_deadline = time(0)-1; retval = result.mark_as_sent(result.server_state, config.report_grace_period); if (retval) {