From a64cb793f12fe55e78b369a566781aaf60ee3561 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Wed, 12 Dec 2012 11:45:49 -0800 Subject: [PATCH] - scheduler: attempted performance enhancement. Old: each scheduler process holds a semaphore while scanning the shared-mem job array. On machines with many CPUs there seems to be contention for this semaphore, causing slow scheduler response and possibly connection failures. New: Don't hold the semaphore while scanning the array. Instead, if we find a job that passes quick_check(), acquire the semaphore and recheck that the job is present in the array and passes quick_check(). - client: show messages if app_config.xml has unrecognized tags --- checkin_notes | 18 ++++++++++++++++++ client/app_config.cpp | 26 ++++++++++++++++++++------ client/app_config.h | 6 +++--- sched/sched_array.cpp | 20 +++++++++++++++++--- 4 files changed, 58 insertions(+), 12 deletions(-) diff --git a/checkin_notes b/checkin_notes index d79e23df62..31fb3b7248 100644 --- a/checkin_notes +++ b/checkin_notes @@ -7645,3 +7645,21 @@ David 12 Dec 2012 - client: show Intel GPU memory size as integer lib/ coproc.cpp + +David 12 Dec 2012 + - scheduler: attempted performance enhancement. + Old: each scheduler process holds a semaphore + while scanning the shared-mem job array. + On machines with many CPUs + there seems to be contention for this semaphore, + causing slow scheduler response and possibly connection failures. + New: Don't hold the semaphore while scanning the array. + Instead, if we find a job that passes quick_check(), + acquire the semaphore and recheck that the job is present in the array + and passes quick_check(). 
+ - client: show messages if app_config.xml has unrecognized tags + + sched/ + sched_array.cpp + client/ + app_config.cpp,h diff --git a/client/app_config.cpp b/client/app_config.cpp index 498c559a6b..56c223ee01 100644 --- a/client/app_config.cpp +++ b/client/app_config.cpp @@ -27,7 +27,7 @@ bool have_max_concurrent = false; -int APP_CONFIG::parse(XML_PARSER& xp) { +int APP_CONFIG::parse(XML_PARSER& xp, PROJECT* p) { memset(this, 0, sizeof(APP_CONFIG)); while (!xp.get_tag()) { @@ -45,32 +45,46 @@ int APP_CONFIG::parse(XML_PARSER& xp) { } continue; } + if (log_flags.unparsed_xml) { + msg_printf(p, MSG_INFO, + "Unparsed line in app_info.xml: %s", + xp.parsed_tag + ); + } + xp.skip_unexpected(log_flags.unparsed_xml, "APP_CONFIG::parse"); } return ERR_XML_PARSE; } -int APP_CONFIGS::parse(XML_PARSER& xp) { +int APP_CONFIGS::parse(XML_PARSER& xp, PROJECT* p) { app_configs.clear(); if (!xp.parse_start("app_config")) return ERR_XML_PARSE; while (!xp.get_tag()) { if (xp.match_tag("/app_config")) return 0; if (xp.match_tag("app")) { APP_CONFIG ac; - int retval = ac.parse(xp); + int retval = ac.parse(xp, p); if (!retval) { app_configs.push_back(ac); } continue; } + if (log_flags.unparsed_xml) { + msg_printf(p, MSG_INFO, + "Unparsed line in app_info.xml: %s", + xp.parsed_tag + ); + } + xp.skip_unexpected(log_flags.unparsed_xml, "APP_CONFIGS::parse"); } return ERR_XML_PARSE; } -int APP_CONFIGS::parse_file(FILE* f) { +int APP_CONFIGS::parse_file(FILE* f, PROJECT* p) { MIOFILE mf; XML_PARSER xp(&mf); mf.init_file(f); - int retval = parse(xp); + int retval = parse(xp, p); return retval; } @@ -115,7 +129,7 @@ void check_app_config() { msg_printf(p, MSG_INFO, "Found %s", APP_CONFIG_FILE_NAME ); - int retval = p->app_configs.parse_file(f); + int retval = p->app_configs.parse_file(f, p); if (!retval) { p->app_configs.config_app_versions(p); } diff --git a/client/app_config.h b/client/app_config.h index e08ff751f9..e3e110d798 100644 --- a/client/app_config.h +++ b/client/app_config.h 
@@ -33,14 +33,14 @@ struct APP_CONFIG { double gpu_gpu_usage; double gpu_cpu_usage; - int parse(XML_PARSER&); + int parse(XML_PARSER&, PROJECT*); }; struct APP_CONFIGS { std::vector app_configs; - int parse(XML_PARSER&); - int parse_file(FILE*); + int parse(XML_PARSER&, PROJECT*); + int parse_file(FILE*, PROJECT*); void config_app_versions(PROJECT*); }; diff --git a/sched/sched_array.cpp b/sched/sched_array.cpp index c75068eb14..60710ed644 100644 --- a/sched/sched_array.cpp +++ b/sched/sched_array.cpp @@ -378,7 +378,12 @@ static bool scan_work_array() { bool no_more_needed = false; SCHED_DB_RESULT result; - lock_sema(); + // To minimize the amount of time we lock the array, + // we initially scan without holding the lock. + // If we find a job that passes quick_check(), + // we acquire the lock and then check the job again. + // + bool sema_locked = false; rnd_off = rand() % ssp->max_wu_results; for (j=0; jmax_wu_results; j++) { @@ -392,6 +397,7 @@ static bool scan_work_array() { ); } +recheck: if (wu_result.state != WR_STATE_PRESENT && wu_result.state != g_pid) { continue; } @@ -424,6 +430,12 @@ static bool scan_work_array() { continue; } + if (!sema_locked) { + lock_sema(); + sema_locked = true; + goto recheck; + } + // mark wu_result as checked out and release semaphore. // from here on in this loop, don't continue on failure; // instead, goto dont_send (so that we reacquire semaphore) @@ -434,6 +446,7 @@ static bool scan_work_array() { wu_result.state = g_pid; unlock_sema(); + sema_locked = false; switch (slow_check(wu_result, app, bavp)) { case 1: @@ -476,13 +489,14 @@ static bool scan_work_array() { } break; } - lock_sema(); if (!work_needed(false)) { no_more_needed = true; break; } } - unlock_sema(); + if (sema_locked) { + unlock_sema(); + } return no_more_needed; }