*** empty log message ***

svn path=/trunk/boinc/; revision=2344
2003-09-21 22:33:43 +00:00 · 2003-09-21 22:33:43 +00:00 · dd4d6d2594
parent 1938fc43ee
commit dd4d6d2594
5 changed files with 131 additions and 69 deletions
--- a/19
+++ b/19
@ -6314,3 +6314,22 @@ David  Sept 21 2003
        feeder.C
        handle_request.C
        sched_shmem.h
+
+David  Sept 21 2003
+    - Partly debugged the above changes:
+    - The feeder's result enumeration should have no limit;
+        otherwise it may enumerate the same results over and over
+    - If the infeasible_count of a result a threshold (MAX_INFEASIBLE_COUNT)
+        the feeder marks it as couldnt_send.
+        This prevents a situation where results sit in the work array
+        forever, their infeasible_count approaching infinity
+    - Added show_shmem to sched/ Makefile, and fleshed it out a bit
+    - Show hostID correctly when get RPC from wrong major version
+
+    NOTE: still haven't exercised the case where a result is marked COULDNT_SEND
+
+    sched/
+        Makefile.am
+        feeder.C
+        handle_request.C
+        show_shmem.C
--- a/sched/Makefile.am
+++ b/sched/Makefile.am
@ -5,7 +5,7 @@ include $(top_srcdir)/Makefile.incl
 noinst_PROGRAMS =								\
 	cgi feeder show_shmem file_upload_handler				\
        validate_test validate_trivial make_work transitioner file_deleter	\
-        assimilator db_dump update_stats
+        assimilator db_dump update_stats show_shmem

 noinst_LIBRARIES = libsched.a

--- a/sched/feeder.C
+++ b/sched/feeder.C
@ -17,14 +17,45 @@
 // Contributor(s):
 //

+// -------------------------------
+//
 // feeder [-asynch] [-d debug_level]
+// -asynch      fork and run in a separate process
 //
 // Creates a shared memory segment containing DB info,
-// including results/workunits to send.
+// including the work array (results/workunits to send).
 // This means that the scheduler CGI program doesn't have to
 // access the DB to get this info.
 //
-// -asynch      fork and run in a separate process
+// Try to keep the work array filled.
+// This is a little tricky.
+// We use an enumerator.
+// The inner loop scans the wu_result table,
+// looking for empty slots and trying to fill them in.
+// When the enumerator reaches the end, it is restarted;
+// hopefully there will be some new workunits.
+// There are two complications:
+//  - An enumeration may return results already in the array.
+//    So, for each result, we scan the entire array to make sure
+//    it's not there already.  Can this be streamlined?
+//  - We must avoid excessive re-enumeration,
+//    especially when the number of results is less than the array size.
+//    Crude approach: if a "collision" (as above) occurred on
+//    a pass through the array, wait a long time (5 sec)
+//
+// Checking for infeasible results (i.e. can't sent to any host):
+// - the "infeasible_count" field of WU_RESULT keeps track of
+//   how many times the WU_RESULT was infeasible for a host
+// - the scheduler gives priority to results that have infeasible_count > 0
+// - If the infeasible_count of any result exceeds MAX_INFEASIBLE_COUNT,
+//   the feeder flags the result as OVER with outcome COULDNT_SEND,
+//   and flags the WU for the transitioner.
+// - the feeder tries to ensure that the number of WU_RESULTs
+//   with infeasible_count  > 0 doesn't exceed MAX_INFEASIBLE
+//   (compiled into feeder).
+//   If it does, then the feeder picks the WU_RESULT with
+//   the largest infeasible_count, marks if COULDNT_SEND as above,
+//   and repeats this until the infeasible count is low enough again

 // Trigger files:
 // The feeder program periodically checks for two trigger files:
@ -55,7 +86,8 @@
 #include "sched_shmem.h"
 #include "sched_util.h"

-#define RESULTS_PER_ENUM    100
+#define MAX_INFEASIBLE_COUNT    50
+
 #define REREAD_DB_FILENAME      "reread_db"
 #define LOCKFILE                "feeder.out"
 #define PIDFILE                 "feeder.pid"
@ -82,34 +114,46 @@ int check_reread_trigger() {
    return 0;
 }

-// Try to keep the work array filled.
-// This is a little tricky.
-// We use an enumerator.
-// The inner loop scans the wu_result table,
-// looking for empty slots and trying to fill them in.
-// When the enumerator reaches the end, it is restarted;
-// hopefully there will be some new workunits.
-// There are two complications:
-//  - An enumeration may return results already in the array.
-//    So, for each result, we scan the entire array to make sure
-//    it's not there already.  Can this be streamlined?
-//  - We must avoid excessive re-enumeration,
-//    especially when the number of results is less than the array size.
-//    Crude approach: if a "collision" (as above) occurred on
-//    a pass through the array, wait a long time (5 sec)
-//
-// Checking for infeasible results (i.e. can't sent to any host):
-// - the "infeasible_count" field of WU_RESULT keeps track of
-//   how many times the WU_RESULT was infeasible for a host
-// - the scheduler gives priority to results that have infeasible_count > 0
-// - the feeder tries to ensure that the number of WU_RESULTs
-//   with infeasible_count  > 0 doesn't exceed MAX_INFEASIBLE
-//   (compiled into feeder).
-//   If it does, then the feeder picks the WU_RESULT with
-//   the largest infeasible_count,
-//   flags the result as OVER with outcome COULDNT_SEND,
-//   flags the WU for the transitioner,
-//   and repeats this until the infeasible count is low enough again
+static int remove_infeasible(int i) {
+    int retval;
+    DB_RESULT result;
+    DB_WORKUNIT wu;
+
+    WU_RESULT& wu_result = ssp->wu_results[i];
+    wu_result.present = false;      // mark as absent
+    result = wu_result.result;
+    wu = wu_result.workunit;
+
+    log_messages.printf(
+        SchedMessages::NORMAL,
+        "[%s] declaring result as unsendable\n",
+        result.name
+    );
+
+    result.server_state = RESULT_SERVER_STATE_OVER;
+    result.outcome = RESULT_OUTCOME_COULDNT_SEND;
+    retval = result.update();
+    if (retval) {
+        log_messages.printf(
+            SchedMessages::CRITICAL,
+            "[%s]: can't update: %d\n",
+            result.name, retval
+        );
+        return retval;
+    }
+    wu.transition_time = time(0);
+    retval = wu.update();
+    if (retval) {
+        log_messages.printf(
+            SchedMessages::CRITICAL,
+            "[%s]: can't update: %d\n",
+            wu.name, retval
+        );
+        return retval;
+    }
+
+    return 0;
+}

 static void scan_work_array(
    DB_RESULT& result, char* clause,
@ -123,7 +167,9 @@ static void scan_work_array(
    for (i=0; i<ssp->nwu_results; i++) {
        WU_RESULT& wu_result = ssp->wu_results[i];
        if (wu_result.present) {
-            if (wu_result.infeasible_count > 0) {
+            if (wu_result.infeasible_count > MAX_INFEASIBLE_COUNT) {
+                remove_infeasible(i);
+            } else if (wu_result.infeasible_count > 0) {
                ninfeasible++;
            }
        } else {
@ -135,7 +181,9 @@ try_again:
                // there's no point in doing it again.
                //
                if (restarted_enum) {
-                    log_messages.printf(SchedMessages::DEBUG, "already restarted enum on this array scan\n");
+                    log_messages.printf(SchedMessages::DEBUG,
+                        "already restarted enum on this array scan\n"
+                    );
                    break;
                }

@ -143,9 +191,13 @@ try_again:
                //
                restarted_enum = true;
                retval = result.enumerate(clause);
-                log_messages.printf(SchedMessages::DEBUG, "restarting enumeration\n");
+                log_messages.printf(SchedMessages::DEBUG,
+                    "restarting enumeration\n"
+                );
                if (retval) {
-                    log_messages.printf(SchedMessages::DEBUG, "enumeration restart returned nothing\n");
+                    log_messages.printf(SchedMessages::DEBUG,
+                        "enumeration restart returned nothing\n"
+                    );
                    no_wus = true;
                    break;
                }
@ -205,10 +257,8 @@ try_again:
    }
 }

-int remove_most_infeasible() {
-    int i, max, imax=-1, retval;
-    DB_RESULT result;
-    DB_WORKUNIT wu;
+static int remove_most_infeasible() {
+    int i, max, imax=-1;

    max = 0;
    for (i=0; i<ssp->nwu_results; i++) {
@ -220,26 +270,7 @@ int remove_most_infeasible() {
    }
    if (max == 0) return -1;        // nothing is infeasible

-    WU_RESULT& wu_result = ssp->wu_results[imax];
-    wu_result.present = false;      // mark as absent
-    result = wu_result.result;
-    wu = wu_result.workunit;
-
-    log_messages.printf(
-        SchedMessages::NORMAL,
-        "[%s] declaring result as unsendable\n",
-        result.name
-    );
-
-    result.server_state = RESULT_SERVER_STATE_OVER;
-    result.outcome = RESULT_OUTCOME_COULDNT_SEND;
-    retval = result.update();
-    if (retval) return retval;
-    wu.transition_time = time(0);
-    retval = wu.update();
-    if (retval) return retval;
-
-    return 0;
+    return remove_infeasible(imax);
 }

 void feeder_loop() {
@ -248,8 +279,8 @@ void feeder_loop() {
    bool no_wus;
    char clause[256];

-    sprintf(clause, "where server_state=%d order by random limit %d",
-        RESULT_SERVER_STATE_UNSENT, RESULTS_PER_ENUM
+    sprintf(clause, "where server_state=%d order by random",
+        RESULT_SERVER_STATE_UNSENT
    );

    while (1) {
--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@ -691,10 +691,14 @@ static void scan_work_array(
        }

        result = wu_result.result;
-        wu_result.present = false;

        retval = add_wu_to_reply(wu, reply, platform, ss);
-        if (retval) continue;
+        if (retval) {
+            wu_result.infeasible_count++;
+            continue;
+        }
+
+        wu_result.present = false;

        log_messages.printf(
            SchedMessages::NORMAL,
@ -849,8 +853,8 @@ bool wrong_major_version(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
        strcpy(reply.message_priority, "low");
        log_messages.printf(
            SchedMessages::NORMAL,
-            "[HOST#%d] Wrong major version from user: wanted %d, got %d\n",
-            reply.host.id,
+            "[HOST#%d] [auth %s] Wrong major version from user: wanted %d, got %d\n",
+            sreq.hostid, sreq.authenticator,
            MAJOR_VERSION, sreq.core_client_major_version
        );
        return true;
@ -858,8 +862,7 @@ bool wrong_major_version(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
    return false;
 }

-inline static const char* get_remote_addr()
-{
+inline static const char* get_remote_addr() {
    const char * r = getenv("REMOTE_ADDR");
    return r ? r : "?.?.?.?";
 }
--- a/sched/show_shmem.C
+++ b/sched/show_shmem.C
@ -46,7 +46,16 @@ int main() {
    ssp = (SCHED_SHMEM*)p;
    retval = ssp->verify();
    printf("ready: %d\n", ssp->ready);
+    printf("nwu_results: %d\n", ssp->nwu_results);
+    printf("max_wu_results: %d\n", ssp->max_wu_results);
    for (i=0; i<ssp->max_wu_results; i++) {
-        printf("%d. %s\n", i, ssp->wu_results[i].present?"present":"absent");
+        WU_RESULT& wu_result = ssp->wu_results[i];
+        if (wu_result.present) {
+            printf("%d: present; infeasible_count %d; result %s\n",
+                i, wu_result.infeasible_count, wu_result.result.name
+            );
+        } else {
+            printf("%d: absent\n", i);
+        }
    }
 }