From 56934f8fbe009aba1e434a50a88e5971957b32e8 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 12 Jun 2014 11:33:11 -0700 Subject: [PATCH] scheduler: clean up job dispatch logging There are now 3 flags for job dispatch logging: : info about work request, jobs sent, other high-level stuff : info about scans through job cache : info about individual jobs (e.g. reason for not sending) --- sched/sched_array.cpp | 76 +++++++++++++++++++++--------------------- sched/sched_check.cpp | 45 +++++++++++++------------ sched/sched_config.cpp | 4 +-- sched/sched_config.h | 7 ++-- sched/sched_nci.cpp | 10 +++--- sched/sched_score.cpp | 59 ++++++++++++++++++++++++++------ 6 files changed, 123 insertions(+), 78 deletions(-) diff --git a/sched/sched_array.cpp b/sched/sched_array.cpp index faedf2a657..621937a8df 100644 --- a/sched/sched_array.cpp +++ b/sched/sched_array.cpp @@ -61,24 +61,24 @@ static bool quick_check( // if (g_wreq->beta_only) { if (!app->beta) { - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] job is not from beta app; skipping\n" + "[send_job] job is not from beta app; skipping\n" ); } return false; } - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] beta work found: [RESULT#%u]\n", + "[send_job] [HOST#%d] beta work found: [RESULT#%u]\n", g_reply->host.id, wu_result.resultid ); } } else { if (app->beta) { - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] job is from beta app; skipping\n" + "[send_job] job is from beta app; skipping\n" ); } return false; @@ -91,16 +91,16 @@ static bool quick_check( // if (!app->beta) { if (g_wreq->reliable_only && (!wu_result.need_reliable)) { - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] job doesn't need reliable host; skipping\n" + "[send_job] job doesn't need reliable host; skipping\n" ); } return false; } else if (!g_wreq->reliable_only && wu_result.need_reliable) { - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] job needs reliable host; skipping\n" + "[send_job] job needs reliable host; skipping\n" ); } return false; @@ -111,9 +111,9 @@ static bool quick_check( // and the result is not infeasible // if (g_wreq->infeasible_only && (wu_result.infeasible_count==0)) { - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] job is not infeasible; skipping\n" + "[send_job] job is not infeasible; skipping\n" ); } return false; @@ -151,9 +151,9 @@ static bool quick_check( // bavp = get_app_version(wu, true, g_wreq->reliable_only); if (!bavp) { - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] No app version for job; skipping\n" + "[send_job] No app version for job; skipping\n" ); } return false; @@ -168,9 +168,9 @@ static bool quick_check( ) { if (app_not_selected(app->id)) { g_wreq->no_allowed_apps_available = true; - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] [USER#%d] [WU#%u] user doesn't want work for app %s\n", + "[send_job] [USER#%d] [WU#%u] user doesn't want work for app %s\n", g_reply->user.id, wu.id, app->name ); } @@ -188,16 +188,16 @@ static bool quick_check( *app, *bavp ); if (retval) { - if (retval != last_retval && config.debug_send) { + if (retval != last_retval && config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%u %s] WU is infeasible: %s\n", + "[send_job] [HOST#%d] [WU#%u %s] WU is infeasible: %s\n", g_reply->host.id, wu.id, wu.name, infeasible_string(retval) ); } last_retval = retval; - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] is_infeasible_fast() failed; skipping\n" + "[send_job] is_infeasible_fast() failed; skipping\n" ); } return false; @@ -242,9 +242,9 @@ static bool scan_work_array() { WU_RESULT& wu_result = ssp->wu_results[i]; - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] scanning slot %d\n", i + "[send_job] scanning slot %d\n", i ); } @@ -273,9 +273,9 @@ recheck: // This may modify wu.rsc_fpops_est // if (!quick_check(wu_result, wu, bavp, app, last_retval)) { - if (config.debug_array_detail) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[array_detail] slot %d failed quick check\n", i + "[send_job] slot %d failed quick check\n", i ); } continue; @@ -363,18 +363,18 @@ void send_work_old() { // if (g_wreq->has_reliable_version) { g_wreq->reliable_only = true; - if (config.debug_array) { + if (config.debug_send_scan) { log_messages.printf(MSG_NORMAL, - "[array] scanning for jobs that need reliable host\n" + "[send_scan] scanning for jobs that need reliable host\n" ); } if (scan_work_array()) return; g_wreq->reliable_only = false; g_wreq->best_app_versions.clear(); } else { - if (config.debug_array) { + if (config.debug_send_scan) { log_messages.printf(MSG_NORMAL, - "[array] host has no reliable app versions; skipping scan\n" + "[send_scan] host has no reliable app versions; skipping scan\n" ); } } @@ -385,9 +385,9 @@ void send_work_old() { // if (g_wreq->allow_beta_work) { g_wreq->beta_only = true; - if (config.debug_array) { + if (config.debug_send_scan) { log_messages.printf(MSG_NORMAL, - "[array] host will accept beta jobs. Scanning for them.\n" + "[send_scan] host will accept beta jobs. Scanning for them.\n" ); } if (scan_work_array()) return; @@ -397,9 +397,9 @@ void send_work_old() { // give next priority to results that were infeasible for some other host // g_wreq->infeasible_only = true; - if (config.debug_array) { + if (config.debug_send_scan) { log_messages.printf(MSG_NORMAL, - "[array] Scanning for jobs that were infeasible for another host.\n" + "[send_scan] Scanning for jobs that were infeasible for another host.\n" ); } if (scan_work_array()) return; @@ -409,9 +409,9 @@ void send_work_old() { // make a pass accepting only jobs for which the client has a file // if (ssp->locality_sched_lite) { - if (config.debug_array) { + if (config.debug_send_scan) { log_messages.printf(MSG_NORMAL, - "[array] Scanning for locality sched Lite jobs.\n" + "[send_scan] Scanning for locality sched Lite jobs.\n" ); } g_wreq->locality_sched_lite = true; @@ -421,9 +421,9 @@ void send_work_old() { // end of high-priority cases. Now do general scan. // - if (config.debug_array) { + if (config.debug_send_scan) { log_messages.printf(MSG_NORMAL, - "[array] Scanning: general case.\n" + "[send_scan] Scanning: general case.\n" ); } if (scan_work_array()) return; @@ -434,9 +434,9 @@ void send_work_old() { if (!g_wreq->njobs_sent && g_wreq->allow_non_preferred_apps ) { g_wreq->user_apps_only = false; preferred_app_message_index = g_wreq->no_work_messages.size(); - if (config.debug_array) { + if (config.debug_send_scan) { log_messages.printf(MSG_NORMAL, - "[array] scanning for jobs from non-preferred applications\n" + "[send_scan] scanning for jobs from non-preferred applications\n" ); } scan_work_array(); diff --git a/sched/sched_check.cpp b/sched/sched_check.cpp index bf0af60a1e..5e4deba503 100644 --- a/sched/sched_check.cpp +++ b/sched/sched_check.cpp @@ -71,9 +71,9 @@ static inline int check_memory(WORKUNIT& wu) { ); add_no_work_message(message); - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [WU#%u %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n", + "[send_job] [WU#%u %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n", wu.id, wu.name, wu.rsc_memory_bound/MEGA, g_reply->host.id, g_wreq->ram/MEGA, g_wreq->usable_ram/MEGA ); @@ -208,9 +208,10 @@ static inline int check_deadline( // (i.e. everyone gets one result, no matter how slow they are) // if (get_estimated_delay(bav) == 0 && !hard_app(app)) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] est delay 0, skipping deadline check\n" + "[send_job] [WU#%u] est delay 0, skipping deadline check\n", + wu.id ); } return 0; @@ -246,18 +247,18 @@ static inline int check_deadline( double est_report_delay = get_estimated_delay(bav) + ewd; double diff = est_report_delay - wu.delay_bound; if (diff > 0) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [WU#%u] deadline miss %d > %d\n", + "[send_job] [WU#%u] deadline miss %d > %d\n", wu.id, (int)est_report_delay, wu.delay_bound ); } g_reply->wreq.speed.set_insufficient(diff); return INFEASIBLE_CPU; } else { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [WU#%u] meets deadline: %.2f + %.2f < %d\n", + "[send_job] [WU#%u] meets deadline: %.2f + %.2f < %d\n", wu.id, get_estimated_delay(bav), ewd, wu.delay_bound ); } @@ -302,18 +303,18 @@ int wu_is_infeasible_fast( // if (app_hr_type(app)) { if (hr_unknown_class(g_reply->host, app_hr_type(app))) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%u %s] host is of unknown class in HR type %d\n", + "[send_job] [HOST#%d] [WU#%u %s] host is of unknown class in HR type %d\n", g_reply->host.id, wu.id, wu.name, app_hr_type(app) ); } return INFEASIBLE_HR; } if (already_sent_to_different_hr_class(wu, app)) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%u %s] failed quick HR check: WU is class %d, host is class %d\n", + "[send_job] [HOST#%d] [WU#%u %s] failed quick HR check: WU is class %d, host is class %d\n", g_reply->host.id, wu.id, wu.name, wu.hr_class, hr_class(g_request->host, app_hr_type(app)) ); } @@ -326,9 +327,9 @@ int wu_is_infeasible_fast( if (app.homogeneous_app_version) { int avid = wu.app_version_id; if (avid && bav.avp->id != avid) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%u %s] failed homogeneous app version check: %d %d\n", + "[send_job] [HOST#%d] [WU#%u %s] failed homogeneous app version check: %d %d\n", g_reply->host.id, wu.id, wu.name, avid, bav.avp->id ); } @@ -403,9 +404,9 @@ int slow_check( return 1; } else { if (n>0) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [USER#%d] already has %d result(s) for [WU#%u]\n", + "[send_job] [USER#%d] already has %d result(s) for [WU#%u]\n", g_reply->user.id, n, wu.id ); } @@ -427,9 +428,9 @@ int slow_check( return 1; } else { if (n>0) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] already has %d result(s) for [WU#%u]\n", + "[send_job] [HOST#%d] already has %d result(s) for [WU#%u]\n", g_reply->host.id, n, wu.id ); } @@ -464,9 +465,9 @@ int slow_check( if (app_hr_type(*app)) { wu.hr_class = vals[0]; if (already_sent_to_different_hr_class(wu, *app)) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%u %s] is assigned to different HR class\n", + "[send_job] [HOST#%d] [WU#%u %s] is assigned to different HR class\n", g_reply->host.id, wu.id, wu.name ); } @@ -482,9 +483,9 @@ int slow_check( int wu_avid = vals[1]; wu.app_version_id = wu_avid; if (wu_avid && wu_avid != bavp->avp->id) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] [HOST#%d] [WU#%u %s] is assigned to different app version\n", + "[send_job] [HOST#%d] [WU#%u %s] is assigned to different app version\n", g_reply->host.id, wu.id, wu.name ); } diff --git a/sched/sched_config.cpp b/sched/sched_config.cpp index 6a826f389b..15d1a522e2 100644 --- a/sched/sched_config.cpp +++ b/sched/sched_config.cpp @@ -299,8 +299,6 @@ int SCHED_CONFIG::parse(FILE* f) { //////////// SCHEDULER LOG FLAGS ///////// - if (xp.parse_bool("debug_array", debug_array)) continue; - if (xp.parse_bool("debug_array_detail", debug_array_detail)) continue; if (xp.parse_bool("debug_assignment", debug_assignment)) continue; if (xp.parse_bool("debug_client_files", debug_client_files)) continue; if (xp.parse_bool("debug_credit", debug_credit)) continue; @@ -316,6 +314,8 @@ int SCHED_CONFIG::parse(FILE* f) { if (xp.parse_bool("debug_request_headers", debug_request_headers)) continue; if (xp.parse_bool("debug_resend", debug_resend)) continue; if (xp.parse_bool("debug_send", debug_send)) continue; + if (xp.parse_bool("debug_send_job", debug_send_job)) continue; + if (xp.parse_bool("debug_send_scan", debug_send_scan)) continue; if (xp.parse_bool("debug_user_messages", debug_user_messages)) continue; if (xp.parse_bool("debug_vda", debug_vda)) continue; if (xp.parse_bool("debug_version_select", debug_version_select)) continue; diff --git a/sched/sched_config.h b/sched/sched_config.h index 63979a4f34..f851b86859 100644 --- a/sched/sched_config.h +++ b/sched/sched_config.h @@ -182,8 +182,6 @@ struct SCHED_CONFIG { // scheduler log flags // - bool debug_array; // debug job-cache scheduling - bool debug_array_detail; // show slot-level info bool debug_assignment; bool debug_credit; bool debug_edf_sim_detail; // show details of EDF sim @@ -199,6 +197,11 @@ struct SCHED_CONFIG { bool debug_request_headers; bool debug_resend; bool debug_send; + // job dispatch, high-level stuff, e.g. request params and jobs sent + bool debug_send_scan; + // job dispatch at the level of scans through array + bool debug_send_job; + // job dispatch: why individual jobs weren't sent (most verbose) bool debug_user_messages; bool debug_vda; bool debug_version_select; diff --git a/sched/sched_nci.cpp b/sched/sched_nci.cpp index 34e29f6b2c..e14d66aec0 100644 --- a/sched/sched_nci.cpp +++ b/sched/sched_nci.cpp @@ -34,9 +34,10 @@ static bool can_send_nci( ) { bavp = get_app_version(wu, true, false); if (!bavp) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] No app version for NCI job; skipping\n" + "[send_job] [WU#%u] No app version for NCI job; skipping\n", + wu.id ); } return false; @@ -48,9 +49,10 @@ static bool can_send_nci( *app, *bavp ); if (retval) { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] wu_is_infeasible_fast() failed for NCI job; skipping\n" + "[send_job] [WU#%u] wu_is_infeasible_fast() failed for NCI job; skipping\n", + wu.id ); } return false; diff --git a/sched/sched_score.cpp b/sched/sched_score.cpp index ce4668b908..46d0e92f78 100644 --- a/sched/sched_score.cpp +++ b/sched/sched_score.cpp @@ -67,9 +67,9 @@ bool JOB::get_score(WU_RESULT& wu_result) { if (g_wreq->allow_beta_work) { score += 1; } else { - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] can't send job %d for beta app to non-beta user\n", + "[send_job] can't send job %d for beta app to non-beta user\n", wu_result.workunit.id ); } @@ -81,6 +81,12 @@ bool JOB::get_score(WU_RESULT& wu_result) { if (g_wreq->allow_non_preferred_apps) { score -= 1; } else { + if (config.debug_send_job) { + log_messages.printf(MSG_NORMAL, + "[send_job] app not selected for job %d\n", + wu_result.workunit.id + ); + } return false; } } @@ -107,9 +113,9 @@ bool JOB::get_score(WU_RESULT& wu_result) { if (app->n_size_classes > 1) { double effective_speed = bavp->host_usage.projected_flops * available_frac(*bavp); int target_size = get_size_class(*app, effective_speed); - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send] size: host %d job %d speed %f\n", + "[send_job] size: host %d job %d speed %f\n", target_size, wu_result.workunit.size_class, effective_speed ); } @@ -121,9 +127,9 @@ bool JOB::get_score(WU_RESULT& wu_result) { score -= 1; } } - if (config.debug_send) { + if (config.debug_send_job) { log_messages.printf(MSG_NORMAL, - "[send]: score %f for result %d\n", score, wu_result.resultid + "[send_job]: score %f for result %d\n", score, wu_result.resultid ); } @@ -160,9 +166,9 @@ static void restore_others(int rt) { void send_work_score_type(int rt) { vector jobs; - if (config.debug_send) { + if (config.debug_send_scan) { log_messages.printf(MSG_NORMAL, - "[send] scanning for %s jobs\n", proc_type_name(rt) + "[send_scan] scanning for %s jobs\n", proc_type_name(rt) ); } @@ -171,6 +177,11 @@ void send_work_score_type(int rt) { int nscan = config.mm_max_slots; if (!nscan) nscan = ssp->max_wu_results; int rnd_off = rand() % ssp->max_wu_results; + if (config.debug_send_scan) { + log_messages.printf(MSG_NORMAL, + "[send_scan] scanning %d slots starting at %d\n", nscan, rnd_off + ); + } for (int j=0; jmax_wu_results; WU_RESULT& wu_result = ssp->wu_results[i]; @@ -180,15 +191,43 @@ void send_work_score_type(int rt) { WORKUNIT wu = wu_result.workunit; JOB job; job.app = ssp->lookup_app(wu.appid); - if (job.app->non_cpu_intensive) continue; + if (job.app->non_cpu_intensive) { + if (config.debug_send_job) { + log_messages.printf(MSG_NORMAL, + "[send_job] [RESULT#%u] app is non compute intensive\n", + wu_result.resultid + ); + } + continue; + } job.bavp = get_app_version(wu, true, false); - if (!job.bavp) continue; + if (!job.bavp) { + if (config.debug_send_job) { + log_messages.printf(MSG_NORMAL, + "[send_job] [RESULT#%u] no app version available\n", + wu_result.resultid + ); + } + continue; + } job.index = i; job.result_id = wu_result.resultid; if (!job.get_score(wu_result)) { + if (config.debug_send_job) { + log_messages.printf(MSG_NORMAL, + "[send_job] [RESULT#%u] get_score() returned false\n", + wu_result.resultid + ); + } continue; } + if (config.debug_send_job) { + log_messages.printf(MSG_NORMAL, + "[send_job] [RESULT#%u] score: %f\n", + wu_result.resultid, job.score + ); + } jobs.push_back(job); }