scheduler: clean up job dispatch logging

There are now 3 flags for job dispatch logging:
<debug_send/>: info about work request, jobs sent, other high-level stuff
<debug_send_scan/>: info about scans through job cache
<debug_send_job/>: info about individual jobs (e.g. reason for not sending)
This commit is contained in:
David Anderson 2014-06-12 11:33:11 -07:00
parent 73d6012003
commit 56934f8fbe
6 changed files with 123 additions and 78 deletions

View File

@ -61,24 +61,24 @@ static bool quick_check(
//
if (g_wreq->beta_only) {
if (!app->beta) {
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] job is not from beta app; skipping\n"
"[send_job] job is not from beta app; skipping\n"
);
}
return false;
}
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [HOST#%d] beta work found: [RESULT#%u]\n",
"[send_job] [HOST#%d] beta work found: [RESULT#%u]\n",
g_reply->host.id, wu_result.resultid
);
}
} else {
if (app->beta) {
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] job is from beta app; skipping\n"
"[send_job] job is from beta app; skipping\n"
);
}
return false;
@ -91,16 +91,16 @@ static bool quick_check(
//
if (!app->beta) {
if (g_wreq->reliable_only && (!wu_result.need_reliable)) {
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] job doesn't need reliable host; skipping\n"
"[send_job] job doesn't need reliable host; skipping\n"
);
}
return false;
} else if (!g_wreq->reliable_only && wu_result.need_reliable) {
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] job needs reliable host; skipping\n"
"[send_job] job needs reliable host; skipping\n"
);
}
return false;
@ -111,9 +111,9 @@ static bool quick_check(
// and the result is not infeasible
//
if (g_wreq->infeasible_only && (wu_result.infeasible_count==0)) {
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] job is not infeasible; skipping\n"
"[send_job] job is not infeasible; skipping\n"
);
}
return false;
@ -151,9 +151,9 @@ static bool quick_check(
//
bavp = get_app_version(wu, true, g_wreq->reliable_only);
if (!bavp) {
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] No app version for job; skipping\n"
"[send_job] No app version for job; skipping\n"
);
}
return false;
@ -168,9 +168,9 @@ static bool quick_check(
) {
if (app_not_selected(app->id)) {
g_wreq->no_allowed_apps_available = true;
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] [USER#%d] [WU#%u] user doesn't want work for app %s\n",
"[send_job] [USER#%d] [WU#%u] user doesn't want work for app %s\n",
g_reply->user.id, wu.id, app->name
);
}
@ -188,16 +188,16 @@ static bool quick_check(
*app, *bavp
);
if (retval) {
if (retval != last_retval && config.debug_send) {
if (retval != last_retval && config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [HOST#%d] [WU#%u %s] WU is infeasible: %s\n",
"[send_job] [HOST#%d] [WU#%u %s] WU is infeasible: %s\n",
g_reply->host.id, wu.id, wu.name, infeasible_string(retval)
);
}
last_retval = retval;
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] is_infeasible_fast() failed; skipping\n"
"[send_job] is_infeasible_fast() failed; skipping\n"
);
}
return false;
@ -242,9 +242,9 @@ static bool scan_work_array() {
WU_RESULT& wu_result = ssp->wu_results[i];
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] scanning slot %d\n", i
"[send_job] scanning slot %d\n", i
);
}
@ -273,9 +273,9 @@ recheck:
// This may modify wu.rsc_fpops_est
//
if (!quick_check(wu_result, wu, bavp, app, last_retval)) {
if (config.debug_array_detail) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[array_detail] slot %d failed quick check\n", i
"[send_job] slot %d failed quick check\n", i
);
}
continue;
@ -363,18 +363,18 @@ void send_work_old() {
//
if (g_wreq->has_reliable_version) {
g_wreq->reliable_only = true;
if (config.debug_array) {
if (config.debug_send_scan) {
log_messages.printf(MSG_NORMAL,
"[array] scanning for jobs that need reliable host\n"
"[send_scan] scanning for jobs that need reliable host\n"
);
}
if (scan_work_array()) return;
g_wreq->reliable_only = false;
g_wreq->best_app_versions.clear();
} else {
if (config.debug_array) {
if (config.debug_send_scan) {
log_messages.printf(MSG_NORMAL,
"[array] host has no reliable app versions; skipping scan\n"
"[send_scan] host has no reliable app versions; skipping scan\n"
);
}
}
@ -385,9 +385,9 @@ void send_work_old() {
//
if (g_wreq->allow_beta_work) {
g_wreq->beta_only = true;
if (config.debug_array) {
if (config.debug_send_scan) {
log_messages.printf(MSG_NORMAL,
"[array] host will accept beta jobs. Scanning for them.\n"
"[send_scan] host will accept beta jobs. Scanning for them.\n"
);
}
if (scan_work_array()) return;
@ -397,9 +397,9 @@ void send_work_old() {
// give next priority to results that were infeasible for some other host
//
g_wreq->infeasible_only = true;
if (config.debug_array) {
if (config.debug_send_scan) {
log_messages.printf(MSG_NORMAL,
"[array] Scanning for jobs that were infeasible for another host.\n"
"[send_scan] Scanning for jobs that were infeasible for another host.\n"
);
}
if (scan_work_array()) return;
@ -409,9 +409,9 @@ void send_work_old() {
// make a pass accepting only jobs for which the client has a file
//
if (ssp->locality_sched_lite) {
if (config.debug_array) {
if (config.debug_send_scan) {
log_messages.printf(MSG_NORMAL,
"[array] Scanning for locality sched Lite jobs.\n"
"[send_scan] Scanning for locality sched Lite jobs.\n"
);
}
g_wreq->locality_sched_lite = true;
@ -421,9 +421,9 @@ void send_work_old() {
// end of high-priority cases. Now do general scan.
//
if (config.debug_array) {
if (config.debug_send_scan) {
log_messages.printf(MSG_NORMAL,
"[array] Scanning: general case.\n"
"[send_scan] Scanning: general case.\n"
);
}
if (scan_work_array()) return;
@ -434,9 +434,9 @@ void send_work_old() {
if (!g_wreq->njobs_sent && g_wreq->allow_non_preferred_apps ) {
g_wreq->user_apps_only = false;
preferred_app_message_index = g_wreq->no_work_messages.size();
if (config.debug_array) {
if (config.debug_send_scan) {
log_messages.printf(MSG_NORMAL,
"[array] scanning for jobs from non-preferred applications\n"
"[send_scan] scanning for jobs from non-preferred applications\n"
);
}
scan_work_array();

View File

@ -71,9 +71,9 @@ static inline int check_memory(WORKUNIT& wu) {
);
add_no_work_message(message);
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [WU#%u %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n",
"[send_job] [WU#%u %s] needs %0.2fMB RAM; [HOST#%d] has %0.2fMB, %0.2fMB usable\n",
wu.id, wu.name, wu.rsc_memory_bound/MEGA,
g_reply->host.id, g_wreq->ram/MEGA, g_wreq->usable_ram/MEGA
);
@ -208,9 +208,10 @@ static inline int check_deadline(
// (i.e. everyone gets one result, no matter how slow they are)
//
if (get_estimated_delay(bav) == 0 && !hard_app(app)) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] est delay 0, skipping deadline check\n"
"[send_job] [WU#%u] est delay 0, skipping deadline check\n",
wu.id
);
}
return 0;
@ -246,18 +247,18 @@ static inline int check_deadline(
double est_report_delay = get_estimated_delay(bav) + ewd;
double diff = est_report_delay - wu.delay_bound;
if (diff > 0) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [WU#%u] deadline miss %d > %d\n",
"[send_job] [WU#%u] deadline miss %d > %d\n",
wu.id, (int)est_report_delay, wu.delay_bound
);
}
g_reply->wreq.speed.set_insufficient(diff);
return INFEASIBLE_CPU;
} else {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [WU#%u] meets deadline: %.2f + %.2f < %d\n",
"[send_job] [WU#%u] meets deadline: %.2f + %.2f < %d\n",
wu.id, get_estimated_delay(bav), ewd, wu.delay_bound
);
}
@ -302,18 +303,18 @@ int wu_is_infeasible_fast(
//
if (app_hr_type(app)) {
if (hr_unknown_class(g_reply->host, app_hr_type(app))) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [HOST#%d] [WU#%u %s] host is of unknown class in HR type %d\n",
"[send_job] [HOST#%d] [WU#%u %s] host is of unknown class in HR type %d\n",
g_reply->host.id, wu.id, wu.name, app_hr_type(app)
);
}
return INFEASIBLE_HR;
}
if (already_sent_to_different_hr_class(wu, app)) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [HOST#%d] [WU#%u %s] failed quick HR check: WU is class %d, host is class %d\n",
"[send_job] [HOST#%d] [WU#%u %s] failed quick HR check: WU is class %d, host is class %d\n",
g_reply->host.id, wu.id, wu.name, wu.hr_class, hr_class(g_request->host, app_hr_type(app))
);
}
@ -326,9 +327,9 @@ int wu_is_infeasible_fast(
if (app.homogeneous_app_version) {
int avid = wu.app_version_id;
if (avid && bav.avp->id != avid) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [HOST#%d] [WU#%u %s] failed homogeneous app version check: %d %d\n",
"[send_job] [HOST#%d] [WU#%u %s] failed homogeneous app version check: %d %d\n",
g_reply->host.id, wu.id, wu.name, avid, bav.avp->id
);
}
@ -403,9 +404,9 @@ int slow_check(
return 1;
} else {
if (n>0) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [USER#%d] already has %d result(s) for [WU#%u]\n",
"[send_job] [USER#%d] already has %d result(s) for [WU#%u]\n",
g_reply->user.id, n, wu.id
);
}
@ -427,9 +428,9 @@ int slow_check(
return 1;
} else {
if (n>0) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [HOST#%d] already has %d result(s) for [WU#%u]\n",
"[send_job] [HOST#%d] already has %d result(s) for [WU#%u]\n",
g_reply->host.id, n, wu.id
);
}
@ -464,9 +465,9 @@ int slow_check(
if (app_hr_type(*app)) {
wu.hr_class = vals[0];
if (already_sent_to_different_hr_class(wu, *app)) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [HOST#%d] [WU#%u %s] is assigned to different HR class\n",
"[send_job] [HOST#%d] [WU#%u %s] is assigned to different HR class\n",
g_reply->host.id, wu.id, wu.name
);
}
@ -482,9 +483,9 @@ int slow_check(
int wu_avid = vals[1];
wu.app_version_id = wu_avid;
if (wu_avid && wu_avid != bavp->avp->id) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] [HOST#%d] [WU#%u %s] is assigned to different app version\n",
"[send_job] [HOST#%d] [WU#%u %s] is assigned to different app version\n",
g_reply->host.id, wu.id, wu.name
);
}

View File

@ -299,8 +299,6 @@ int SCHED_CONFIG::parse(FILE* f) {
//////////// SCHEDULER LOG FLAGS /////////
if (xp.parse_bool("debug_array", debug_array)) continue;
if (xp.parse_bool("debug_array_detail", debug_array_detail)) continue;
if (xp.parse_bool("debug_assignment", debug_assignment)) continue;
if (xp.parse_bool("debug_client_files", debug_client_files)) continue;
if (xp.parse_bool("debug_credit", debug_credit)) continue;
@ -316,6 +314,8 @@ int SCHED_CONFIG::parse(FILE* f) {
if (xp.parse_bool("debug_request_headers", debug_request_headers)) continue;
if (xp.parse_bool("debug_resend", debug_resend)) continue;
if (xp.parse_bool("debug_send", debug_send)) continue;
if (xp.parse_bool("debug_send_job", debug_send_job)) continue;
if (xp.parse_bool("debug_send_scan", debug_send_scan)) continue;
if (xp.parse_bool("debug_user_messages", debug_user_messages)) continue;
if (xp.parse_bool("debug_vda", debug_vda)) continue;
if (xp.parse_bool("debug_version_select", debug_version_select)) continue;

View File

@ -182,8 +182,6 @@ struct SCHED_CONFIG {
// scheduler log flags
//
bool debug_array; // debug job-cache scheduling
bool debug_array_detail; // show slot-level info
bool debug_assignment;
bool debug_credit;
bool debug_edf_sim_detail; // show details of EDF sim
@ -199,6 +197,11 @@ struct SCHED_CONFIG {
bool debug_request_headers;
bool debug_resend;
bool debug_send;
// job dispatch, high-level stuff, e.g. request params and jobs sent
bool debug_send_scan;
// job dispatch at the level of scans through array
bool debug_send_job;
// job dispatch: why individual jobs weren't sent (most verbose)
bool debug_user_messages;
bool debug_vda;
bool debug_version_select;

View File

@ -34,9 +34,10 @@ static bool can_send_nci(
) {
bavp = get_app_version(wu, true, false);
if (!bavp) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] No app version for NCI job; skipping\n"
"[send_job] [WU#%u] No app version for NCI job; skipping\n",
wu.id
);
}
return false;
@ -48,9 +49,10 @@ static bool can_send_nci(
*app, *bavp
);
if (retval) {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] wu_is_infeasible_fast() failed for NCI job; skipping\n"
"[send_job] [WU#%u] wu_is_infeasible_fast() failed for NCI job; skipping\n",
wu.id
);
}
return false;

View File

@ -67,9 +67,9 @@ bool JOB::get_score(WU_RESULT& wu_result) {
if (g_wreq->allow_beta_work) {
score += 1;
} else {
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] can't send job %d for beta app to non-beta user\n",
"[send_job] can't send job %d for beta app to non-beta user\n",
wu_result.workunit.id
);
}
@ -81,6 +81,12 @@ bool JOB::get_score(WU_RESULT& wu_result) {
if (g_wreq->allow_non_preferred_apps) {
score -= 1;
} else {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send_job] app not selected for job %d\n",
wu_result.workunit.id
);
}
return false;
}
}
@ -107,9 +113,9 @@ bool JOB::get_score(WU_RESULT& wu_result) {
if (app->n_size_classes > 1) {
double effective_speed = bavp->host_usage.projected_flops * available_frac(*bavp);
int target_size = get_size_class(*app, effective_speed);
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send] size: host %d job %d speed %f\n",
"[send_job] size: host %d job %d speed %f\n",
target_size, wu_result.workunit.size_class, effective_speed
);
}
@ -121,9 +127,9 @@ bool JOB::get_score(WU_RESULT& wu_result) {
score -= 1;
}
}
if (config.debug_send) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send]: score %f for result %d\n", score, wu_result.resultid
"[send_job]: score %f for result %d\n", score, wu_result.resultid
);
}
@ -160,9 +166,9 @@ static void restore_others(int rt) {
void send_work_score_type(int rt) {
vector<JOB> jobs;
if (config.debug_send) {
if (config.debug_send_scan) {
log_messages.printf(MSG_NORMAL,
"[send] scanning for %s jobs\n", proc_type_name(rt)
"[send_scan] scanning for %s jobs\n", proc_type_name(rt)
);
}
@ -171,6 +177,11 @@ void send_work_score_type(int rt) {
int nscan = config.mm_max_slots;
if (!nscan) nscan = ssp->max_wu_results;
int rnd_off = rand() % ssp->max_wu_results;
if (config.debug_send_scan) {
log_messages.printf(MSG_NORMAL,
"[send_scan] scanning %d slots starting at %d\n", nscan, rnd_off
);
}
for (int j=0; j<nscan; j++) {
int i = (j+rnd_off) % ssp->max_wu_results;
WU_RESULT& wu_result = ssp->wu_results[i];
@ -180,15 +191,43 @@ void send_work_score_type(int rt) {
WORKUNIT wu = wu_result.workunit;
JOB job;
job.app = ssp->lookup_app(wu.appid);
if (job.app->non_cpu_intensive) continue;
if (job.app->non_cpu_intensive) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send_job] [RESULT#%u] app is non compute intensive\n",
wu_result.resultid
);
}
continue;
}
job.bavp = get_app_version(wu, true, false);
if (!job.bavp) continue;
if (!job.bavp) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send_job] [RESULT#%u] no app version available\n",
wu_result.resultid
);
}
continue;
}
job.index = i;
job.result_id = wu_result.resultid;
if (!job.get_score(wu_result)) {
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send_job] [RESULT#%u] get_score() returned false\n",
wu_result.resultid
);
}
continue;
}
if (config.debug_send_job) {
log_messages.printf(MSG_NORMAL,
"[send_job] [RESULT#%u] score: %f\n",
wu_result.resultid, job.score
);
}
jobs.push_back(job);
}