From 01f48513233163ed102462109ac7def9f98f1109 Mon Sep 17 00:00:00 2001
From: David Anderson <davea@ssl.berkeley.edu>
Date: Mon, 14 May 2007 15:21:38 +0000
Subject: [PATCH] - scheduler: add max_wus_in_progress option.     Limits total
 # of in-progress results per host     (independently of #CPUs)

sched/
    main.h
    sched_config.C,h
    sched_resend.C
    sched_send.C
    server_types.h


svn path=/trunk/boinc/; revision=12661
---
 checkin_notes        | 11 +++++++++++
 sched/main.h         |  3 +++
 sched/sched_config.C |  1 +
 sched/sched_config.h |  1 +
 sched/sched_resend.C |  9 +++++++++
 sched/sched_send.C   | 28 ++++++++++++++++++++++++++++
 sched/server_types.h |  2 ++
 7 files changed, 55 insertions(+)

diff --git a/checkin_notes b/checkin_notes
index de6a147b0b..68f3e76f6a 100755
--- a/checkin_notes
+++ b/checkin_notes
@@ -4888,3 +4888,14 @@ Rom    12 May 2007
         boinc_ss.rc
     clientgui/
         BOINCGUIApp.rc
+
+David  14 May 2007
+    - scheduler: add max_wus_in_progress option.
+        Limits total # of in-progress results per host
+        (independently of #CPUs)
+
+    sched/
+        sched_config.C,h
+        sched_resend.C
+        sched_send.C
+        server_types.h
diff --git a/sched/main.h b/sched/main.h
index 3b6b823aef..4f14ddf093 100644
--- a/sched/main.h
+++ b/sched/main.h
@@ -52,6 +52,9 @@
     // client asked for work but we didn't send any,
     // because of a reason not easily changed
     // (like wrong kind of computer)
+#define DELAY_NO_WORK_CACHE         (20*60)
+    // client asked for work but we didn't send any,
+    // because the host is at its max_wus_in_progress limit.
 
 extern SCHED_CONFIG config;
 extern GUI_URLS gui_urls;
diff --git a/sched/sched_config.C b/sched/sched_config.C
index 3a51e4074d..92e5928a9f 100644
--- a/sched/sched_config.C
+++ b/sched/sched_config.C
@@ -92,6 +92,7 @@ int SCHED_CONFIG::parse(FILE* f) {
         else if (xp.parse_bool(tag, "ignore_delay_bound", ignore_delay_bound)) continue;
         else if (xp.parse_int(tag, "min_sendwork_interval", min_sendwork_interval)) continue;
         else if (xp.parse_int(tag, "max_wus_to_send", max_wus_to_send)) continue;
+        else if (xp.parse_int(tag, "max_wus_in_progress", max_wus_in_progress)) continue;
         else if (xp.parse_int(tag, "daily_result_quota", daily_result_quota)) continue;
         else if (xp.parse_int(tag, "uldl_dir_fanout", uldl_dir_fanout)) continue;
         else if (xp.parse_int(tag, "locality_scheduling_wait_period", locality_scheduling_wait_period)) continue;
diff --git a/sched/sched_config.h b/sched/sched_config.h
index a63bbe439c..c3ed57c4f8 100644
--- a/sched/sched_config.h
+++ b/sched/sched_config.h
@@ -47,6 +47,7 @@ public:
     bool msg_to_host;
     int min_sendwork_interval;
     int max_wus_to_send;
+    int max_wus_in_progress;
     bool non_cpu_intensive;
     bool verify_files_on_app_start;
     bool homogeneous_redundancy;
diff --git a/sched/sched_resend.C b/sched/sched_resend.C
index 2267bd9d29..c52ee35336 100644
--- a/sched/sched_resend.C
+++ b/sched/sched_resend.C
@@ -99,6 +99,7 @@ bool resend_lost_work(
     bool did_any = false;
     int num_to_resend=0;
     int num_resent=0;
+    int num_on_host=0;
     APP* app;
     APP_VERSION* avp;
     int retval;
@@ -118,6 +119,7 @@ bool resend_lost_work(
     );
     while (!result.enumerate(buf)) {
         bool found = false;
+        num_on_host++;
         for (i=0; i<sreq.other_results.size(); i++) {
             OTHER_RESULT& orp = sreq.other_results[i];
             if (!strcmp(orp.name.c_str(), result.name)) {
@@ -221,6 +223,13 @@ bool resend_lost_work(
             did_any = true;
         }
     }
+
+    reply.wreq.nresults_on_host = num_on_host;
+    log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
+        "[HOST#%d] %d results in progress, set for later checking\n",
+        reply.host.id, num_on_host
+    );
+
     if (num_to_resend) {
         log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
             "[HOST#%d] %d lost results, resent %d\n", reply.host.id, num_to_resend, num_resent 
diff --git a/sched/sched_send.C b/sched/sched_send.C
index a52241b8e0..2ff95fb9a9 100644
--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@@ -677,6 +677,19 @@ bool SCHEDULER_REPLY::work_needed(bool locality_sched) {
             return false;
         }
     }
+
+    if (config.max_wus_in_progress) {
+        int limit = config.max_wus_in_progress;
+        if (wreq.nresults_on_host >= limit) {
+            log_messages.printf(
+                SCHED_MSG_LOG::MSG_DEBUG,
+                "cache limit exceeded; %d > %d\n",
+                wreq.nresults_on_host, config.max_wus_in_progress
+            );
+            wreq.cache_size_exceeded=true;
+            return false;
+        }
+    }
     return true;
 }
 
@@ -818,6 +831,7 @@ int add_result_to_reply(
     reply.wreq.seconds_to_fill -= wu_seconds_filled;
     request.estimated_delay += wu_seconds_filled/reply.host.p_ncpus;
     reply.wreq.nresults++;
+    reply.wreq.nresults_on_host++;
     if (!resent_result) reply.host.nresults_today++;
 
     // add this result to workload for simulation
@@ -1021,6 +1035,20 @@ int send_work(
                           (int)(3600*(double)rand()/(double)RAND_MAX);
             reply.set_delay(delay_time);
         }
+        if (reply.wreq.cache_size_exceeded) {
+            char helpful[256];
+            sprintf(helpful, "(reached per-host limit of %d tasks)",
+                config.max_wus_in_progress
+            );
+            USER_MESSAGE um(helpful, "high");
+            reply.insert_message(um);
+            reply.set_delay(DELAY_NO_WORK_CACHE);
+            log_messages.printf(
+                SCHED_MSG_LOG::MSG_NORMAL,
+                "host %d already has %d result(s) on cache\n",
+                reply.host.id, reply.wreq.nresults_on_host
+            );
+        }        
     }
 
     return 0;
diff --git a/sched/server_types.h b/sched/server_types.h
index 92ab61ab02..65bcb41272 100644
--- a/sched/server_types.h
+++ b/sched/server_types.h
@@ -70,6 +70,8 @@ struct WORK_REQ {
     bool outdated_core;
     bool daily_result_quota_exceeded;
     int  daily_result_quota; // for this machine: number of cpus * daily_quota/cpu
+    bool cache_size_exceeded;
+    int nresults_on_host;
     void update_for_result(double seconds_filled);
 };