- scheduler: add max_wus_in_progress option.

Limits the total number of in-progress results per host (independently of the number of CPUs). Files: sched/ sched_config.C,h sched_resend.C sched_send.C server_types.h. svn path=/trunk/boinc/; revision=12661
2007-05-14 15:21:38 +00:00 · 2007-05-14 15:21:38 +00:00 · 01f4851323
parent 5f7ed5b8fc
commit 01f4851323
7 changed files with 55 additions and 0 deletions
--- a/11
+++ b/11
@ -4888,3 +4888,14 @@ Rom    12 May 2007
        boinc_ss.rc
    clientgui/
        BOINCGUIApp.rc
+
+David  14 May 2007
+    - scheduler: add max_wus_in_progress option.
+        Limits total # of in-progress results per host
+        (independently of #CPUs)
+
+    sched/
+        sched_config.C,h
+        sched_resend.C
+        sched_send.C
+        server_types.h
--- a/sched/main.h
+++ b/sched/main.h
@ -52,6 +52,9 @@
    // client asked for work but we didn't send any,
    // because of a reason not easily changed
    // (like wrong kind of computer)
+#define DELAY_NO_WORK_CACHE         (20*60)
+    // client asked for work but we didn't send any,
+    // because the host already has too many results in progress (cached).

 extern SCHED_CONFIG config;
 extern GUI_URLS gui_urls;
--- a/sched/sched_config.C
+++ b/sched/sched_config.C
@ -92,6 +92,7 @@ int SCHED_CONFIG::parse(FILE* f) {
        else if (xp.parse_bool(tag, "ignore_delay_bound", ignore_delay_bound)) continue;
        else if (xp.parse_int(tag, "min_sendwork_interval", min_sendwork_interval)) continue;
        else if (xp.parse_int(tag, "max_wus_to_send", max_wus_to_send)) continue;
+        else if (xp.parse_int(tag, "max_wus_in_progress", max_wus_in_progress)) continue;
        else if (xp.parse_int(tag, "daily_result_quota", daily_result_quota)) continue;
        else if (xp.parse_int(tag, "uldl_dir_fanout", uldl_dir_fanout)) continue;
        else if (xp.parse_int(tag, "locality_scheduling_wait_period", locality_scheduling_wait_period)) continue;
--- a/sched/sched_config.h
+++ b/sched/sched_config.h
@ -47,6 +47,7 @@ public:
    bool msg_to_host;
    int min_sendwork_interval;
    int max_wus_to_send;
+    int max_wus_in_progress;
    bool non_cpu_intensive;
    bool verify_files_on_app_start;
    bool homogeneous_redundancy;
--- a/sched/sched_resend.C
+++ b/sched/sched_resend.C
@ -99,6 +99,7 @@ bool resend_lost_work(
    bool did_any = false;
    int num_to_resend=0;
    int num_resent=0;
+    int num_on_host=0;
    APP* app;
    APP_VERSION* avp;
    int retval;
@ -118,6 +119,7 @@ bool resend_lost_work(
    );
    while (!result.enumerate(buf)) {
        bool found = false;
+        num_on_host++;
        for (i=0; i<sreq.other_results.size(); i++) {
            OTHER_RESULT& orp = sreq.other_results[i];
            if (!strcmp(orp.name.c_str(), result.name)) {
@ -221,6 +223,13 @@ bool resend_lost_work(
            did_any = true;
        }
    }
+
+    reply.wreq.nresults_on_host = num_on_host;
+    log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
+        "[HOST#%d] %d results in progress, set for later checking\n",
+        reply.host.id, num_on_host
+    );
+
    if (num_to_resend) {
        log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
            "[HOST#%d] %d lost results, resent %d\n", reply.host.id, num_to_resend, num_resent 
--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@ -677,6 +677,19 @@ bool SCHEDULER_REPLY::work_needed(bool locality_sched) {
            return false;
        }
    }
+
+    if (config.max_wus_in_progress) {
+        int limit = config.max_wus_in_progress;
+        if (wreq.nresults_on_host >= limit) {
+            log_messages.printf(
+                SCHED_MSG_LOG::MSG_DEBUG,
+                "cache limit exceeded; %d > %d\n",
+                wreq.nresults_on_host, config.max_wus_in_progress
+            );
+            wreq.cache_size_exceeded=true;
+            return false;
+        }
+    }
    return true;
 }

@ -818,6 +831,7 @@ int add_result_to_reply(
    reply.wreq.seconds_to_fill -= wu_seconds_filled;
    request.estimated_delay += wu_seconds_filled/reply.host.p_ncpus;
    reply.wreq.nresults++;
+    reply.wreq.nresults_on_host++;
    if (!resent_result) reply.host.nresults_today++;

    // add this result to workload for simulation
@ -1021,6 +1035,20 @@ int send_work(
                          (int)(3600*(double)rand()/(double)RAND_MAX);
            reply.set_delay(delay_time);
        }
+        if (reply.wreq.cache_size_exceeded) {
+            char helpful[256];
+            sprintf(helpful, "(reached per-host limit of %d tasks)",
+                config.max_wus_in_progress
+            );
+            USER_MESSAGE um(helpful, "high");
+            reply.insert_message(um);
+            reply.set_delay(DELAY_NO_WORK_CACHE);
+            log_messages.printf(
+                SCHED_MSG_LOG::MSG_NORMAL,
+                "host %d already has %d result(s) on cache\n",
+                reply.host.id, reply.wreq.nresults_on_host
+            );
+        }        
    }

    return 0;
--- a/sched/server_types.h
+++ b/sched/server_types.h
@ -70,6 +70,8 @@ struct WORK_REQ {
    bool outdated_core;
    bool daily_result_quota_exceeded;
    int  daily_result_quota; // for this machine: number of cpus * daily_quota/cpu
+    bool cache_size_exceeded;
+    int nresults_on_host;
    void update_for_result(double seconds_filled);
 };