- scheduler:

    - if WU is infeasible, print message instead of number
    - remove useless messages
    - remove EDF simulations printfs
    - don't update nresults_on_host in resend_lost_work()
      (it's already done in add_result_to_reply())

svn path=/trunk/boinc/; revision=14336
2007-11-30 23:02:55 +00:00 · 2007-11-30 23:02:55 +00:00 · df8cbdb294
parent 6b2c7f38a2
commit df8cbdb294
7 changed files with 159 additions and 139 deletions
--- a/15
+++ b/15
@ -11964,3 +11964,18 @@ Rom    30 Nov 2007
        boinctray.vcproj
    win_build/installerv2/
        BOINC.ism
+
+David  30 Nov 2007
+    - scheduler:
+        - if WU is infeasible, print message instead of number
+        - remove useless messages
+        - remove EDF simulations printfs
+        - don't update nresults_on_host in resend_lost_work()
+            (it's already done in add_result_to_reply())
+
+    sched/
+        edf_sim.C
+        handle_request.C
+        sched_array.C
+        sched_resend.C
+        sched_send.C,h
--- a/sched/edf_sim.C
+++ b/sched/edf_sim.C
@ -25,16 +25,6 @@
 using std::vector;

 //#define TEST
-//#define DEBUG
-#ifdef DEBUG
-#define INFO0
-#define INFO1
-#define INFO2
-#else
-#define INFO0 //
-#define INFO1 //
-#define INFO2 //
-#endif

 // 0 shows initial workload and candidate decisions
 // 1 shows function calls results of sim: make/miss deadline
@ -53,7 +43,9 @@ void mark_edf_misses (int ncpus, vector<IP_RESULT>& ip_results){
    double booked_to[128];
    int j;

-    INFO1 fprintf(stderr, "mark_edf_misses\n");
+#ifdef DEBUG
+    fprintf(stderr, "mark_edf_misses\n");
+#endif

    // keeps track of when each cpu is next free
    //
@ -82,20 +74,26 @@ void mark_edf_misses (int ncpus, vector<IP_RESULT>& ip_results){
        }
      
        booked_to[lowest_booked_cpu] += r.cpu_time_remaining;
-        INFO2 fprintf(stderr, "  running %s on cpu %d; finishes at %f\n",
+#ifdef DEBUG
+        fprintf(stderr, "  running %s on cpu %d; finishes at %f\n",
            r.name, lowest_booked_cpu, booked_to[lowest_booked_cpu]
        );
+#endif
        if (booked_to[lowest_booked_cpu] > r.computation_deadline) {
 	        r.misses_deadline = true;
 	        r.estimated_completion_time = booked_to[lowest_booked_cpu];
-	        INFO1 fprintf(stderr, "  %s misses_deadline; est completion %f\n",
+#ifdef DEBUG
+	        fprintf(stderr, "  %s misses_deadline; est completion %f\n",
                r.name, booked_to[lowest_booked_cpu]
            );
+#endif
        } else {
 	        r.misses_deadline = false;
-	        INFO1 fprintf(stderr, "  %s makes deadline; est completion %f\n",
+#ifdef DEBUG
+	        fprintf(stderr, "  %s makes deadline; est completion %f\n",
                r.name, booked_to[lowest_booked_cpu]
            );
+#endif
 	        // if result doesn't miss its deadline,
            // then the estimated_completion_time is of no use
        }
@ -112,13 +110,17 @@ void init_ip_results(
 ){
    unsigned int i;

-    INFO0 fprintf(stderr, "init_ip_results; work_buf_min %f ncpus %d:\n", work_buf_min, ncpus);
+#ifdef DEBUG
+    fprintf(stderr, "init_ip_results; work_buf_min %f ncpus %d:\n", work_buf_min, ncpus);
+#endif
    for (i=0; i<ip_results.size(); i++) {
        IP_RESULT& r = ip_results[i];
        r.computation_deadline = r.report_deadline - work_buf_min;
-        INFO0 fprintf(stderr, "    %s: deadline %.2f cpu %.2f\n",
+#ifdef DEBUG
+        fprintf(stderr, "    %s: deadline %.2f cpu %.2f\n",
            r.name, r.computation_deadline, r.cpu_time_remaining
        );
+#endif
    }

    // run edf simulation to determine whether any results miss their deadline
@ -189,10 +191,12 @@ bool check_candidate (
    double booked_to[128];     // keeps track of when each cpu is free
    int j;

-    INFO0  fprintf(stderr, "check_candidate %s: dl %f cpu %f\n",
+#ifdef DEBUG
+    fprintf(stderr, "check_candidate %s: dl %f cpu %f\n",
        candidate.name, candidate.computation_deadline,
        candidate.cpu_time_remaining
    );
+#endif

    for (j=0; j<ncpus; j++) {
        booked_to[j] = 0;
@ -219,9 +223,11 @@ bool check_candidate (
            }
        }
        booked_to[lowest_booked_cpu] += r.cpu_time_remaining;
-        INFO2 fprintf(stderr, "  running %s on cpu %d; finishes at %f\n",
+#ifdef DEBUG
+        fprintf(stderr, "  running %s on cpu %d; finishes at %f\n",
            r.name, lowest_booked_cpu, booked_to[lowest_booked_cpu]
        );
+#endif

        // return false if completion time if > computation_deadline AND
        // result would not have missed deadline to begin with
@ -229,9 +235,11 @@ bool check_candidate (
        if (booked_to[lowest_booked_cpu] > r.computation_deadline
            && !r.misses_deadline
        ) {
-	        INFO0  fprintf(stderr, "  cand. fails; %s now misses deadline: %f\n",
+#ifdef DEBUG
+	        fprintf(stderr, "  cand. fails; %s now misses deadline: %f\n",
                r.name, booked_to[lowest_booked_cpu]
            );
+#endif
            return false;
        }
        // check a late result (i.e., one that would have missed its
@ -240,11 +248,15 @@ bool check_candidate (
        if (r.misses_deadline 
            && booked_to[lowest_booked_cpu] > r.estimated_completion_time
        ){
-            INFO1  fprintf(stderr, "  cand. fails; late result %s to be returned even later\n", r.name);
+#ifdef DEBUG
+            fprintf(stderr, "  cand. fails; late result %s to be returned even later\n", r.name);
+#endif
            return false;
        }
    }
-    INFO1  fprintf(stderr, "  cand. succeeds\n");
+#ifdef DEBUG
+    fprintf(stderr, "  cand. succeeds\n");
+#endif
    return true;
 }

--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@ -530,7 +530,7 @@ static int update_host_record(HOST& initial_host, HOST& xhost, USER& user) {
    return 0;
 }

-// Figure out which of the results the user currently has
+// Figure out which of the results the host currently has
 // should be aborted outright, or aborted if not started yet
 //
 int send_result_abort(
@ -556,10 +556,6 @@ int send_result_abort(
        result_names.append("'");
        result_names.append(orp.name);
        result_names.append("'");
-        log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
-            "Result is on [HOST#%d]: %s\n",
-            reply.host.id, orp.name.c_str()
-        );
    }

    // query the db for the results and set the appropriate flag
@ -1426,14 +1422,15 @@ void process_request(
    handle_results(sreq, reply);

    reply.wreq.nresults_on_host = sreq.other_results.size();
-    if (config.resend_lost_results && sreq.have_other_results_list) {
-        if (resend_lost_work(sreq, reply, platforms, ss)) {
-            ok_to_send_work = false;
+    if (sreq.have_other_results_list) {
+        if (config.resend_lost_results) {
+            if (resend_lost_work(sreq, reply, platforms, ss)) {
+                ok_to_send_work = false;
+            }
+        }
+        if (config.send_result_abort) {
+            send_result_abort(sreq, reply, ss);
        }
-    }
-
-    if (config.send_result_abort && sreq.have_other_results_list) {
-        send_result_abort(sreq, reply, ss);
    }
    
    // if last RPC was within config.min_sendwork_interval, don't send work
--- a/sched/sched_array.C
+++ b/sched/sched_array.C
@ -116,8 +116,8 @@ void scan_work_array(
        retval = wu_is_infeasible(wu, sreq, reply, *app);
        if (retval) {
               log_messages.printf(
-                   SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible: %d\n",
-                   reply.host.id, wu.id, wu.name, retval
+                   SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible: %s\n",
+                   reply.host.id, wu.id, wu.name, infeasible_string(retval)
               );
            continue;
        }
--- a/sched/sched_resend.C
+++ b/sched/sched_resend.C
@ -54,19 +54,19 @@ static int possibly_give_result_new_deadline(
    DB_RESULT& result, WORKUNIT& wu, SCHEDULER_REPLY& reply
 ) {
    const double resend_frac = 0.5;  // range [0, 1)
-    int result_sent_time = time(0);
-    int result_report_deadline = result_sent_time + (int)(resend_frac*(result.report_deadline - result.sent_time));
+    int now = time(0);
+    int result_report_deadline = now + (int)(resend_frac*(result.report_deadline - result.sent_time));

    if (result_report_deadline < result.report_deadline) {
        result_report_deadline = result.report_deadline;
    }
-    if (result_report_deadline > result_sent_time + wu.delay_bound) {
-        result_report_deadline = result_sent_time + wu.delay_bound;
+    if (result_report_deadline > now + wu.delay_bound) {
+        result_report_deadline = now + wu.delay_bound;
    }

    // If infeasible, return without modifying result
    //
-    if (estimate_cpu_duration(wu, reply) > result_report_deadline-result_sent_time) {
+    if (estimate_cpu_duration(wu, reply) > result_report_deadline-now) {
        log_messages.printf(
            SCHED_MSG_LOG::MSG_DEBUG,
            "[RESULT#%d] [HOST#%d] not resending lost result: can't complete in time\n",
@ -83,11 +83,17 @@ static int possibly_give_result_new_deadline(
        result.id, reply.host.id,
        result_report_deadline==result.report_deadline?"NO update to":"Updated"
    );
-    result.sent_time = result_sent_time;
+    result.sent_time = now;
    result.report_deadline = result_report_deadline;
    return 0;
 }

+// resend any jobs that:
+// 1) we already sent to this host;
+// 2) are still in progress (i.e. haven't timed out) and
+// 3) aren't present on the host
+// Return true if there were any such jobs
+//
 bool resend_lost_work(
    SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply,
    PLATFORM_LIST& platforms, SCHED_SHMEM& ss
@ -96,116 +102,99 @@ bool resend_lost_work(
    std::vector<DB_RESULT>results;
    unsigned int i;
    char buf[256];
+    char warning_msg[256];
    bool did_any = false;
-    int num_to_resend=0;
+    int num_eligible_to_resend=0;
    int num_resent=0;
-    int num_on_host=0;
    APP* app;
    APP_VERSION* avp;
    int retval;

-    // print list of results on host
-    //
-    for (i=0; i<sreq.other_results.size(); i++) {
-        OTHER_RESULT& orp=sreq.other_results[i];
-        log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
-            "Result is on [HOST#%d]: %s\n",
-            reply.host.id, orp.name.c_str()
-        );
-    }
-
    sprintf(buf, " where hostid=%d and server_state=%d ",
        reply.host.id, RESULT_SERVER_STATE_IN_PROGRESS
    );
    while (!result.enumerate(buf)) {
        bool found = false;
-        num_on_host++;
        for (i=0; i<sreq.other_results.size(); i++) {
            OTHER_RESULT& orp = sreq.other_results[i];
            if (!strcmp(orp.name.c_str(), result.name)) {
                found = true;
+                break;
            }
        }
-        if (!found) {
-            num_to_resend++;
-            log_messages.printf(
-                SCHED_MSG_LOG::MSG_DEBUG,
-                "[HOST#%d] found lost [RESULT#%d]: %s\n",
-                reply.host.id, result.id, result.name
+        if (found) continue;
+
+        num_eligible_to_resend++;
+        log_messages.printf(
+            SCHED_MSG_LOG::MSG_DEBUG,
+            "[HOST#%d] found lost [RESULT#%d]: %s\n",
+            reply.host.id, result.id, result.name
+        );
+
+        DB_WORKUNIT wu;
+        retval = wu.lookup_id(result.workunitid);
+        if (retval) {
+            log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
+                "[HOST#%d] WU not found for [RESULT#%d]\n",
+                reply.host.id, result.id
            );
+            continue;
+        }

-            DB_WORKUNIT wu;
-            retval = wu.lookup_id(result.workunitid);
-            if (retval) {
-                log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
-                    "[HOST#%d] WU not found for [RESULT#%d]\n",
-                    reply.host.id, result.id
-                );
-                continue;
-            }
+        reply.wreq.core_client_version =
+            sreq.core_client_major_version*100 + sreq.core_client_minor_version;

-            reply.wreq.core_client_version =
-                sreq.core_client_major_version*100 + sreq.core_client_minor_version;
-
-            retval = get_app_version(
-                wu, app, avp, sreq, reply, platforms, ss
+        retval = get_app_version(
+            wu, app, avp, sreq, reply, platforms, ss
+        );
+        if (retval) {
+            log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
+                "[HOST#%d] no app version [RESULT#%d]\n",
+                reply.host.id, result.id
            );
+            continue;
+        }
+
+        // If time is too close to the deadline,
+        // or we already have a canonical result,
+        // or WU error flag is set,
+        // then don't bother to resend this result.
+        // Instead make it time out right away
+        // so that the transitioner does 'the right thing'.
+        //
+        if (
+            wu.error_mask ||
+            wu.canonical_resultid ||
+            possibly_give_result_new_deadline(result, wu, reply)
+        ) {
+            log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
+                "[HOST#%d][RESULT#%d] not needed or too close to deadline, expiring\n",
+                reply.host.id, result.id
+            );
+            result.report_deadline = time(0)-1;
+            retval = result.mark_as_sent(result.server_state);
            if (retval) {
-                log_messages.printf( SCHED_MSG_LOG::MSG_CRITICAL,
-                    "[HOST#%d] no app version [RESULT#%d]\n",
-                    reply.host.id, result.id
-                );
-                continue;
-            }
-
-            // If time is too close to the deadline,
-            // or we already have a canonical result,
-            // or WU error flag is set,
-            // then don't bother to resend this result.
-            // Instead make it time out right away
-            // so that the transitioner does 'the right thing'.
-            //
-            char warning_msg[256];
-            if (
-                wu.error_mask ||
-                wu.canonical_resultid ||
-                possibly_give_result_new_deadline(result, wu, reply)
-            ) {
-                result.report_deadline = time(0);
-                retval = result.mark_as_sent(result.server_state);
-                if (retval==ERR_DB_NOT_FOUND) {
-                    log_messages.printf(
-                        SCHED_MSG_LOG::MSG_CRITICAL,
-                        "[RESULT#%d] [HOST#%d]: CAN'T SEND, already sent to another host\n",
-                        result.id, reply.host.id
-                    );
-                } else if (retval) {
-                    log_messages.printf(
-                        SCHED_MSG_LOG::MSG_CRITICAL,
-                        "resend_lost_result: can't update result deadline: %d\n", retval
-                    );
-                }
-                if (retval) continue;
-
-                retval = update_wu_transition_time(wu, result.report_deadline);
-                if (retval) {
-                    log_messages.printf(
-                        SCHED_MSG_LOG::MSG_CRITICAL,
-                        "resend_lost_result: can't update WU transition time: %d\n", retval
-                    );
-                    continue;
-                }
                log_messages.printf(
-                    SCHED_MSG_LOG::MSG_DEBUG,
-                    "[HOST#%d][RESULT#%d] not needed or too close to deadline, expiring\n",
-                    reply.host.id, result.id
+                    SCHED_MSG_LOG::MSG_CRITICAL,
+                    "resend_lost_work: can't update result deadline: %d\n", retval
                );
-                sprintf(warning_msg, "Didn't resend lost result %s (expired)", result.name);
-                USER_MESSAGE um(warning_msg, "high");
-                reply.insert_message(um);
                continue;
            }

+            retval = update_wu_transition_time(wu, result.report_deadline);
+            if (retval) {
+                log_messages.printf(
+                    SCHED_MSG_LOG::MSG_CRITICAL,
+                    "resend_lost_result: can't update WU transition time: %d\n", retval
+                );
+                continue;
+            }
+            sprintf(warning_msg,
+                "Didn't resend lost result %s (expired)", result.name
+            );
+            USER_MESSAGE um(warning_msg, "high");
+            reply.insert_message(um);
+        } else {
            retval = add_result_to_reply(
                result, wu, sreq, reply, platforms, app, avp
            );
@ -224,15 +213,9 @@ bool resend_lost_work(
        }
    }

-    reply.wreq.nresults_on_host = num_on_host;
-    log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
-        "[HOST#%d] %d results in progress, set for later checking\n",
-        reply.host.id, num_on_host
-    );
-
-    if (num_to_resend) {
+    if (num_eligible_to_resend) {
        log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
-            "[HOST#%d] %d lost results, resent %d\n", reply.host.id, num_to_resend, num_resent 
+            "[HOST#%d] %d lost results, resent %d\n", reply.host.id, num_eligible_to_resend, num_resent 
        );
    }

--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@ -56,6 +56,19 @@ using namespace std;
 #define FCGI_ToFILE(x) (x)
 #endif

+const char* infeasible_string(int code) {
+    switch (code) {
+    case INFEASIBLE_MEM: return "Not enough memory";
+    case INFEASIBLE_DISK: return "Not enough disk";
+    case INFEASIBLE_CPU: return "CPU too slow";
+    case INFEASIBLE_APP_SETTING: return "App not selected";
+    case INFEASIBLE_WORKLOAD: return "Existing workload";
+    case INFEASIBLE_DUP: return "Already in reply";
+    case INFEASIBLE_HR: return "Homogeneous redundancy";
+    }
+    return "Unknown";
+}
+
 const int MIN_SECONDS_TO_SEND = 0;
 const int MAX_SECONDS_TO_SEND = (28*SECONDS_IN_DAY);
 const int MAX_CPUS = 8;
@ -784,6 +797,8 @@ int add_result_to_reply(

    int delay_bound = wu.delay_bound;
    if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
+        // We are sending this result for the first time
+        //
        // If the workunit needs reliable and is being sent to a reliable host,
        // then shorten the delay bound by the percent specified
        //
@ -793,18 +808,16 @@ int add_result_to_reply(
            }
        }

-        // We are sending this result for the first time
-        //
        result.report_deadline = result.sent_time + delay_bound;
        result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
    } else {
-        // Result was ALREADY sent to this host but never arrived.
-        // So we are resending it.
-        // result.report_deadline and time_sent
-        // have already been updated before this function was called.
+        // Result was already sent to this host but was lost,
+        // so we are resending it.
        //
        resent_result = true;
 
+        // TODO: explain the following
+        //
        if (result.report_deadline < result.sent_time) {
            result.report_deadline = result.sent_time + 10;
        }
--- a/sched/sched_send.h
+++ b/sched/sched_send.h
@ -66,4 +66,4 @@ extern int update_wu_transition_time(WORKUNIT wu, time_t x);

 extern void lock_sema();
 extern void unlock_sema();
-
+extern const char* infeasible_string(int);