- SCHED: Fixed problem that prevented proper driver version checking in cuda and

nvidia plan classes in plan_class_spec.xml - SCHED: Scheduler was not using properly estimated performance when assigning work. It was using theoretical performance to choose version and actual performance to determine how long it would take. I've changed that to start with theoretical performance and converge to actual performance as host_app_version pfc_n increases. - SCHED: Added some additional app version selection debugging output.
2013-05-19 11:08:36 -07:00 · 2013-05-19 11:08:36 -07:00 · 6c76ddd45c
parent 7ce699e6c3
commit 6c76ddd45c
3 changed files with 25 additions and 2 deletions
--- a/sched/plan_class_spec.cpp
+++ b/sched/plan_class_spec.cpp
@ -369,6 +369,9 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) {
            }
            return false;
        }
+        
+        // in analogy to ATI/AMD 
+        driver_version=cp.display_driver_version;

        if (min_gpu_ram_mb) {
            gpu_requirements[PROC_TYPE_NVIDIA_GPU].update(0, min_gpu_ram_mb * MEGA);
@ -603,6 +606,7 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) {
    }
    hu.max_ncpus = hu.avg_ncpus;

+#if 0
    if (config.debug_version_select) {
        log_messages.printf(MSG_NORMAL,
            "[version] plan_class_spec: host_flops: %e, \tscale: %.2f, \tprojected_flops: %e, \tpeak_flops: %e\n",
@ -610,6 +614,7 @@ bool PLAN_CLASS_SPEC::check(SCHEDULER_REQUEST& sreq, HOST_USAGE& hu) {
            hu.peak_flops
        );
    }
+#endif

    return true;

--- a/sched/sched_config.cpp
+++ b/sched/sched_config.cpp
@ -91,7 +91,7 @@ int SCHED_CONFIG::parse(FILE* f) {
    strcpy(httpd_user, "apache");
    max_ncpus = MAX_NCPUS;
    scheduler_log_buffer = 32768;
-    version_select_random_factor = .1;
+    version_select_random_factor = 1.;

    if (!xp.parse_start("boinc")) return ERR_XML_PARSE;
    if (!xp.parse_start("config")) return ERR_XML_PARSE;
--- a/sched/sched_version.cpp
+++ b/sched/sched_version.cpp
@ -796,11 +796,22 @@ BEST_APP_VERSION* get_app_version(
            double r = 1;
            long n=1;
            if (havp) {
+                // slowly move from raw calc to measured performance as number
+                // of results increases
                n=std::max((long)havp->pfc.n,(long)n);
-            } 
+                double old_projected_flops=host_usage.projected_flops;
+                estimate_flops(host_usage, av);
+                host_usage.projected_flops=(host_usage.projected_flops*(n-1)+old_projected_flops)/n;
+            }
            if (config.version_select_random_factor) {
                r += config.version_select_random_factor*rand_normal()/n;
            }
+            if (config.debug_version_select  && bavp && bavp->avp) {
+                log_messages.printf(MSG_NORMAL,
+                    "[version] Comparing AV#%d (%.2f GFLOP) against AV#%d (%.2f GFLOP)\n",
+                    av.id,host_usage.projected_flops/1e+9,bavp->avp->id,bavp->host_usage.projected_flops/1e+9
+                );
+            }
            if (r*host_usage.projected_flops > bavp->host_usage.projected_flops) {
                if (config.debug_version_select && (host_usage.projected_flops <= bavp->host_usage.projected_flops)) {
                      log_messages.printf(MSG_NORMAL,
@ -813,6 +824,13 @@ BEST_APP_VERSION* get_app_version(
                bavp->avp = &av;
                bavp->reliable = app_version_is_reliable(av.id);
                bavp->trusted = app_version_is_trusted(av.id);
+                if (config.debug_version_select) {
+                      log_messages.printf(MSG_NORMAL,
+                          "[version] Best app version is now AV%d (%.2f GFLOP)\n",
+                          bavp->avp->id, bavp->host_usage.projected_flops/1e+9
+                    );
+                }
+
            }
        }   // loop over app versions