From 330a25893f8f857663b8a7d71ebd8c0a922e08ed Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 4 Apr 2013 10:25:56 -0700 Subject: [PATCH] - client emulator: parse max_concurrent in client_state.xml. This gives you a way to simulate the effects of app_config.xml - client: piggyback requests for resources even if we're backed off from them - client: change resource backoff logic Old: if we requested work and didn't get any, back off from resources for which we requested work New: for each resource type T: if we requested work for T and didn't get any, back off from T Also, don't back off if we're already backed off (i.e. if this is a piggyback request) Also, only back off if the RPC was due to an automatic and potentially rapid source (namely: work fetch, result report, trickle up) - client: fix small work fetch bug --- client/client_types.cpp | 4 ++ client/client_types.h | 1 + client/sim.cpp | 7 +++- client/work_fetch.cpp | 85 +++++++++++++++++++++++------------------ client/work_fetch.h | 1 + 5 files changed, 60 insertions(+), 38 deletions(-) diff --git a/client/client_types.cpp b/client/client_types.cpp index 7ed53bd958..4014422be1 100644 --- a/client/client_types.cpp +++ b/client/client_types.cpp @@ -143,6 +143,10 @@ int APP::parse(XML_PARSER& xp) { fpops.parse(xp, "/fpops"); continue; } + if (xp.parse_int("max_concurrent", max_concurrent)) { + if (max_concurrent) have_max_concurrent = true; + continue; + } if (xp.match_tag("checkpoint_period")) { checkpoint_period.parse(xp, "/checkpoint_period"); continue; diff --git a/client/client_types.h b/client/client_types.h index d889ac4fa0..c507972b3a 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -247,6 +247,7 @@ struct APP { int max_concurrent; // Limit on # of concurrent jobs of this app; 0 if none // Specified in app_config.xml + // Can also specify in client_state.xml (for client emulator) int n_concurrent; // temp during job scheduling, to enforce max_concurrent int non_excluded_instances[MAX_RSC]; 
diff --git a/client/sim.cpp b/client/sim.cpp index 126b5eb81e..6b1e1b8de6 100644 --- a/client/sim.cpp +++ b/client/sim.cpp @@ -1168,12 +1168,17 @@ void show_app(APP* app) { fprintf(summary_file, " app %s\n" " job params: fpops_est %.0fG fpops mean %.0fG std_dev %.0fG\n" - " latency %.2f weight %.2f\n", + " latency %.2f weight %.2f", app->name, app->fpops_est/1e9, app->fpops.mean/1e9, app->fpops.std_dev/1e9, app->latency_bound, app->weight ); + if (app->max_concurrent) { + fprintf(summary_file, " max_concurrent %d\n", app->max_concurrent); + } else { + fprintf(summary_file, "\n"); + } for (unsigned int i=0; iapp != app) continue; diff --git a/client/work_fetch.cpp b/client/work_fetch.cpp index fcc6338cc9..4c8f72803c 100644 --- a/client/work_fetch.cpp +++ b/client/work_fetch.cpp @@ -41,7 +41,7 @@ using std::vector; RSC_WORK_FETCH rsc_work_fetch[MAX_RSC]; WORK_FETCH work_fetch; -inline bool dont_fetch(PROJECT* p, int rsc_type) { +static inline bool dont_fetch(PROJECT* p, int rsc_type) { if (p->no_rsc_pref[rsc_type]) return true; if (p->no_rsc_config[rsc_type]) return true; if (p->no_rsc_apps[rsc_type]) return true; @@ -187,7 +187,6 @@ static bool wacky_dcf(PROJECT* p) { // don't request anything if project is backed off. 
// void RSC_WORK_FETCH::set_request(PROJECT* p) { - if (dont_fetch(p, rsc_type)) return; // if backup project, fetch 1 job per idle instance // @@ -202,8 +201,6 @@ void RSC_WORK_FETCH::set_request(PROJECT* p) { return; } RSC_PROJECT_WORK_FETCH& w = project_state(p); - if (!w.may_have_work) return; - if (w.anon_skip) return; double non_excl_inst = ninstances - w.ncoprocs_excluded; if (shortfall) { if (wacky_dcf(p)) { @@ -476,7 +473,6 @@ void WORK_FETCH::piggyback_work_request(PROJECT* p) { bool check_higher_priority_projects = true; if (p->sched_rpc_pending && config.fetch_on_update) { check_higher_priority_projects = false; - return; } setup(); @@ -508,7 +504,7 @@ void WORK_FETCH::piggyback_work_request(PROJECT* p) { DEBUG(msg_printf(p, MSG_INFO, "piggyback: %s can't fetch work", p2->project_name);) continue; } - if (rwf.can_fetch(p2)) { + if (rwf.can_fetch(p2) && !rwf.backed_off(p2)) { DEBUG(msg_printf(p, MSG_INFO, "piggyback: better proj %s", p2->project_name);) break; } @@ -544,6 +540,16 @@ static bool higher_priority(PROJECT *p1, PROJECT *p2) { return (p1->sched_priority > p2->sched_priority); } +// check resource-level backoff +// +bool RSC_WORK_FETCH::backed_off(PROJECT* p) { + if (project_state(p).backoff_time > gstate.now) { + DEBUG(msg_printf(p, MSG_INFO, "skip: backoff");) + return true; + } + return false; +} + // a variety of checks for whether we should ask this project // for work of this type // @@ -556,12 +562,7 @@ bool RSC_WORK_FETCH::can_fetch(PROJECT *p) { return false; } - // check resource-level backoff - // - if (project_state(p).backoff_time > gstate.now) { - DEBUG(msg_printf(p, MSG_INFO, "skip: backoff");) - return false; - } + RSC_PROJECT_WORK_FETCH& rpwf = project_state(p); // if project has zero resource share, // only fetch work if a device is idle @@ -585,16 +586,15 @@ bool RSC_WORK_FETCH::can_fetch(PROJECT *p) { // TODO: THIS IS FAIRLY CRUDE. Making it smarter would require // computing shortfall etc. 
on a per-project basis // - int nexcl = p->rsc_pwf[rsc_type].ncoprocs_excluded; + int nexcl = rpwf.ncoprocs_excluded; if (rsc_type && nexcl) { int n_not_excluded = ninstances - nexcl; - if (p->rsc_pwf[rsc_type].queue_est > (gstate.work_buf_min() * n_not_excluded)/ninstances) { + if (rpwf.queue_est > (gstate.work_buf_min() * n_not_excluded)/ninstances) { DEBUG(msg_printf(p, MSG_INFO, "skip: too much work");) return false; } } - RSC_PROJECT_WORK_FETCH& rpwf = project_state(p); if (rpwf.anon_skip) { DEBUG(msg_printf(p, MSG_INFO, "skip: anon");) return false; @@ -690,7 +690,7 @@ PROJECT* WORK_FETCH::choose_project() { for (int i=0; i new_results ) { - bool got_rsc[MAX_RSC]; + bool got_work[MAX_RSC]; + bool requested_work[MAX_RSC]; for (int i=0; i 0); + } + for (unsigned int i=0; iavp->gpu_usage.rsc_type] = true; } - // if didn't get any jobs, back off on requested resource types - // - if (!new_results.size()) { - // but not if RPC was requested by project + for (int i=0; isched_rpc_pending != RPC_REASON_PROJECT_REQ) { - for (int i=0; irsc_pwf[i].backoff_time < gstate.now) { + switch (p->sched_rpc_pending) { + case RPC_REASON_RESULTS_DUE: + case RPC_REASON_NEED_WORK: + case RPC_REASON_TRICKLE_UP: p->rsc_pwf[i].resource_backoff(p, rsc_name(i)); } } } - return; - } - - // if we did get jobs, clear backoff on resource types - // - for (unsigned int i=0; iavp->gpu_usage.rsc_type] = true; - } - for (int i=0; irsc_pwf[i].clear_backoff(); + // if we did get jobs, clear backoff + // + if (got_work[i]) { + p->rsc_pwf[i].clear_backoff(); + } } } diff --git a/client/work_fetch.h b/client/work_fetch.h index 19840f257a..a8ba32e327 100644 --- a/client/work_fetch.h +++ b/client/work_fetch.h @@ -253,6 +253,7 @@ struct RSC_WORK_FETCH { void set_request_excluded(PROJECT*); bool may_have_work(PROJECT*); bool can_fetch(PROJECT*); + bool backed_off(PROJECT*); bool uses_starved_excluded_instances(PROJECT*); RSC_WORK_FETCH() { rsc_type = 0;