scheduler: add <rte_no_stats> config flag to estimate job runtime without stats

The scheduler estimates job runtime based on statistics
of past jobs for the same (host, app version) pair.
This doesn't work well if the distribution of runtimes is very wide,
as may be the case for universal apps.

If this flag is set, runtime estimation is based solely on
the host's CPU/GPU peak FLOPS and the job's FLOPs estimate.
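
For reference, the flag is enabled in the project's config.xml;
a minimal sketch of the relevant fragment (assuming the usual
<config> section; other settings omitted):

    <config>
        ...
        <rte_no_stats>1</rte_no_stats>
    </config>
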
Author: David Anderson
Date:   2017-10-31 12:14:50 -07:00
Parent: 28cc28b4a0
Commit: 306605e6d8
4 changed files with 47 additions and 19 deletions


@@ -299,6 +299,7 @@ int SCHED_CONFIG::parse(FILE* f) {
        if (xp.parse_double("maintenance_delay", maintenance_delay)) continue;
        if (xp.parse_bool("credit_by_app", credit_by_app)) continue;
        if (xp.parse_bool("keyword_sched", keyword_sched)) continue;
        if (xp.parse_bool("rte_no_stats", rte_no_stats)) continue;
        //////////// SCHEDULER LOG FLAGS /////////


@@ -182,6 +182,8 @@ struct SCHED_CONFIG {
        // store per-app credit info in credit_user and credit_team
    bool keyword_sched;
        // score jobs based on keywords
    bool rte_no_stats;
        // don't use statistics in job runtime estimation
    // time intervals
    double maintenance_delay;


@@ -344,9 +344,32 @@ void estimate_flops_anon_platform() {
// This prevents jobs from aborting with "time limit exceeded"
// even if the estimate supplied by the plan class function is way off
//
#define RTE_HAV_STATS 1
#define RTE_AV_STATS 2
#define RTE_NO_STATS 3
void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) {
    DB_HOST_APP_VERSION* havp = gavid_to_havp(av.id);
    if (havp && havp->et.n > MIN_HOST_SAMPLES) {
    int mode;
    DB_HOST_APP_VERSION* havp = NULL;
    if (config.rte_no_stats) {
        mode = RTE_NO_STATS;
    } else {
        havp = gavid_to_havp(av.id);
        if (havp && havp->et.n > MIN_HOST_SAMPLES) {
            mode = RTE_HAV_STATS;
        } else {
            if (av.pfc.n > MIN_VERSION_SAMPLES) {
                mode = RTE_AV_STATS;
            } else {
                mode = RTE_NO_STATS;
            }
        }
    }
    switch (mode) {
    case RTE_HAV_STATS:
        double new_flops;
        if (config.estimate_flops_from_hav_pfc) {
            new_flops = hu.peak_flops / (havp->pfc.get_avg()+1e-18);
@@ -387,24 +410,25 @@ void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) {
                1e-9/havp->et.get_avg()
            );
        }
    } else {
        if (av.pfc.n > MIN_VERSION_SAMPLES) {
            hu.projected_flops = hu.peak_flops/av.pfc.get_avg();
            if (config.debug_version_select) {
                log_messages.printf(MSG_NORMAL,
                    "[version] [AV#%lu] (%s) adjusting projected flops based on PFC avg: %.2fG\n",
                    av.id, av.plan_class, hu.projected_flops/1e9
                );
            }
        } else {
            hu.projected_flops = g_reply->host.p_fpops * (hu.avg_ncpus + GPU_CPU_RATIO*hu.gpu_usage);
            if (config.debug_version_select) {
                log_messages.printf(MSG_NORMAL,
                    "[version] [AV#%lu] (%s) using conservative projected flops: %.2fG\n",
                    av.id, av.plan_class, hu.projected_flops/1e9
                );
            }
        break;
    case RTE_AV_STATS:
        hu.projected_flops = hu.peak_flops/av.pfc.get_avg();
        if (config.debug_version_select) {
            log_messages.printf(MSG_NORMAL,
                "[version] [AV#%lu] (%s) adjusting projected flops based on PFC avg: %.2fG\n",
                av.id, av.plan_class, hu.projected_flops/1e9
            );
        }
        break;
    case RTE_NO_STATS:
        hu.projected_flops = g_reply->host.p_fpops * (hu.avg_ncpus + GPU_CPU_RATIO*hu.gpu_usage);
        if (config.debug_version_select) {
            log_messages.printf(MSG_NORMAL,
                "[version] [AV#%lu] (%s) using conservative projected flops: %.2fG\n",
                av.id, av.plan_class, hu.projected_flops/1e9
            );
        }
        break;
    }
}


@@ -46,3 +46,4 @@
#endif /* #if (defined(_WIN32) || defined(__APPLE__)) */
#endif /* #ifndef BOINC_VERSION_H */
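
To illustrate the no-stats path added above: projected FLOPS are computed
only from the host's peak speed and the app version's device usage, and the
job's runtime estimate is then its FLOPs estimate divided by that rate.
A self-contained sketch with made-up numbers (the variable names, the
GPU_CPU_RATIO value, and the 10 GFLOPS / 3600 GFLOP figures are
illustrative, not taken from the scheduler):

    #include <cstdio>

    int main() {
        // Host and app-version parameters (example values only).
        double p_fpops = 10e9;            // host benchmark FLOPS
        double avg_ncpus = 1.0;           // CPUs used by the app version
        double gpu_usage = 0.0;           // GPU instances used (none here)
        const double GPU_CPU_RATIO = 64;  // assumed GPU-vs-CPU weighting

        // RTE_NO_STATS: projected FLOPS from peak speed and usage alone.
        double projected_flops = p_fpops * (avg_ncpus + GPU_CPU_RATIO*gpu_usage);

        // Runtime estimate: job's FLOPs estimate divided by projected FLOPS.
        double rsc_fpops_est = 3600e9;    // job's estimated FLOPs
        double est_seconds = rsc_fpops_est / projected_flops;

        printf("projected %.2f GFLOPS, estimated runtime %.0f s\n",
            projected_flops/1e9, est_seconds);
        return 0;
    }

With these numbers the job is estimated at roughly 360 seconds; with
<rte_no_stats> set, that estimate no longer shifts as host statistics
accumulate.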