From 6c76ddd45c723860f2f0975b70a142b77eb01a74 Mon Sep 17 00:00:00 2001 From: Eric J Korpela Date: Sun, 19 May 2013 11:08:36 -0700 Subject: [PATCH] - SCHED: Fixed problem that prevented proper driver version checking in cuda and nvidia plan classes in plan_class_spec.xml - SCHED: Scheduler was not using properly estimated performance when assigning work. It was using theoretical performance to choose version and actual performance to determine how long it would take. I've changed that to start with theoretical performance and converge to actual performance as host_app_version pfc_n increases. - SCHED: Added some additional app version selection debugging output. --- sched/plan_class_spec.cpp | 5 +++++ sched/sched_config.cpp | 2 +- sched/sched_version.cpp | 20 +++++++++++++++++++- 3 files changed, 25 insertions(+), 2 deletions(-) diff --git a/sched/plan_class_spec.cpp b/sched/plan_class_spec.cpp index b76bd029fe..16793fcac2 100644 --- a/sched/plan_class_spec.cpp +++ b/sched/plan_class_spec.cpp @@ -369,6 +369,9 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) { } return false; } + + // in analogy to ATI/AMD + driver_version=cp.display_driver_version; if (min_gpu_ram_mb) { gpu_requirements[PROC_TYPE_NVIDIA_GPU].update(0, min_gpu_ram_mb * MEGA); @@ -603,6 +606,7 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) { } hu.max_ncpus = hu.avg_ncpus; +#if 0 if (config.debug_version_select) { log_messages.printf(MSG_NORMAL, "[version] plan_class_spec: host_flops: %e, \tscale: %.2f, \tprojected_flops: %e, \tpeak_flops: %e\n", @@ -610,6 +614,7 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) { hu.peak_flops ); } +#endif return true; diff --git a/sched/sched_config.cpp b/sched/sched_config.cpp index 224b18028b..e7c1e6851a 100644 --- a/sched/sched_config.cpp +++ b/sched/sched_config.cpp @@ -91,7 +91,7 @@ int SCHED_CONFIG::parse(FILE* f) { strcpy(httpd_user, "apache"); max_ncpus = MAX_NCPUS; scheduler_log_buffer = 
32768; - version_select_random_factor = .1; + version_select_random_factor = 1.; if (!xp.parse_start("boinc")) return ERR_XML_PARSE; if (!xp.parse_start("config")) return ERR_XML_PARSE; diff --git a/sched/sched_version.cpp b/sched/sched_version.cpp index 62d6af9551..6ca4b4b0a8 100644 --- a/sched/sched_version.cpp +++ b/sched/sched_version.cpp @@ -796,11 +796,22 @@ BEST_APP_VERSION* get_app_version( double r = 1; long n=1; if (havp) { + // slowly move from raw calc to measured performance as number + // of results increases n=std::max((long)havp->pfc.n,(long)n); - } + double old_projected_flops=host_usage.projected_flops; + estimate_flops(host_usage, av); + host_usage.projected_flops=(host_usage.projected_flops*(n-1)+old_projected_flops)/n; + } if (config.version_select_random_factor) { r += config.version_select_random_factor*rand_normal()/n; } + if (config.debug_version_select && bavp && bavp->avp) { + log_messages.printf(MSG_NORMAL, + "[version] Comparing AV#%d (%.2f GFLOP) against AV#%d (%.2f GFLOP)\n", + av.id,host_usage.projected_flops/1e+9,bavp->avp->id,bavp->host_usage.projected_flops/1e+9 + ); + } if (r*host_usage.projected_flops > bavp->host_usage.projected_flops) { if (config.debug_version_select && (host_usage.projected_flops <= bavp->host_usage.projected_flops)) { log_messages.printf(MSG_NORMAL, @@ -813,6 +824,13 @@ BEST_APP_VERSION* get_app_version( bavp->avp = &av; bavp->reliable = app_version_is_reliable(av.id); bavp->trusted = app_version_is_trusted(av.id); + if (config.debug_version_select) { + log_messages.printf(MSG_NORMAL, + "[version] Best app version is now AV%d (%.2f GFLOP)\n", + bavp->avp->id, bavp->host_usage.projected_flops/1e+9 + ); + } + } } // loop over app versions