*** empty log message ***

svn path=/trunk/boinc/; revision=4172
2004-09-09 21:52:20 +00:00 · 2004-09-09 21:52:20 +00:00 · 860ef1a1e6
parent d6c2901e26
commit 860ef1a1e6
4 changed files with 170 additions and 130 deletions
--- a/98
+++ b/98
@ -17131,58 +17131,86 @@ Rom    8 Sep 2004
        file_names.C

 Jeff  09 Sep 2004
-    - Change to the validator to handle the case where one or more of the result files
-      for a WU's result set are missing.  Before this change this state would result
-      in this WU and it's results never getting looked at again.  No science, no
-      credit.
+    - Change the validator to handle the case where one or more
+      of the result files for a WU's result set can't be read.
+      Currently, this case results in this WU and its results
+      never getting looked at again - no science, no credit.

-      This is coordinated seti_boinc / boinc change.  This note is repeated in
-      the seti_boinc checkin_notes.
+      We distinguish 2 cases:
+      1) transient failure, e.g. an NFS mount is not working.
+         (can't read directory)
+         In this case check_set() returns a special code to the validator,
+         which arranges for the WU to get transitioned again in a few hours.
+      2) permanent failure (file doesn't exist)
+         The result's validate_state is set to ERROR (new state),
+         which is generally equivalent to INVALID
+    - add new WU arg to check_set()
+        (so it can access min_quorum)
+
+      This is a coordinated seti_boinc / boinc change.
+      This note is repeated in the seti_boinc checkin_notes.

      seti_boinc changes:
-        - get_result_file(), if it cannot read the result file, tries to read the
-          directory where the result file should be.  If it can read the directory,
-          it returns ERR_FOPEN, as before.  If it cannot read the directory, it
-          resturns ERR_OPENDIR.   ERR_OPENDIR in this context signals a possibly
-          transient problem.  Upon a successful result read, it sets sah_result.have_result
+        - get_result_file(), if it cannot read the result file,
+          tries to read the directory where the result file should be.
+          If it can read the directory, it returns ERR_FOPEN, as before.
+          If it cannot read the directory, it returns ERR_OPENDIR.
+          ERR_OPENDIR in this context signals a possibly transient problem.
+          Upon a successful result read, it sets sah_result.have_result
          (a new data member as of this change) to true.
        - check_set(), upon return from get_result_file() :
-                - sets result.validate_state to VALIDATE_STATE_ERROR (a new state
-                  as of this change) and retval to zero if there is a nonzero
-                  retval from check_set() and it is not ERR_OPENDIR.  It then
-                  continues.
-                - otherwise just continues.
+            - sets result.validate_state to VALIDATE_STATE_ERROR (a new state)
+              and retval to zero if there is a nonzero retval from get_result_file()
+              and it is not ERR_OPENDIR.  It then continues.
+            - otherwise just continues.
          check_set() then determines if any IO errors brought the result count
          below wu.min_quorum.  If not, it continues.  Otherwise it returns.
-          In all subsequent logic any results for which sah_result.have_result == false
-          are skipped.  Note that the result and the sah_result vectors are associated.
+          In all subsequent logic any results for which
+          sah_result.have_result == false are skipped.
+          Note that the result and the sah_result vectors are associated.

        validate/
-                sah_result.h
-                sah_validate.cpp
-                sah_boinc_db.cpp
+            sah_result.h
+            sah_validate.cpp
+            sah_boinc_db.cpp

        boinc changes:
          - new validate state VALIDATE_STATE_ERROR.
-          - in the enumeration for check_set(), we now include a clause to check for
-            VALIDATE_STATE_INIT in order to ignore results set to VALIDATE_STATE_ERROR.
+          - in the enumeration for check_set(),
+            we now include a clause to check for VALIDATE_STATE_INIT
+            in order to ignore results set to VALIDATE_STATE_ERROR.
            This enumeration query now matches that for check_pair().
-          Note that the only possible retvals from check_set() are now zero or
-          ERR_OPENDIR.
-          Note that logic to retry possibly transient errors still needs to be done.
-          Also left to be done is a transitioner change to subtract the number of
-          results in state VALIDATE_STATE_ERROR from the count of active results.
-          The latter change is needed to stimulate the production of additional results
-          for the affected WU.
+          Note that the only possible retvals from check_set()
+          are now zero or ERR_OPENDIR.
+          Note that logic to retry possibly transient errors still needs
+          to be done.
+          Also left to be done is a transitioner change to subtract
+          the number of results in state VALIDATE_STATE_ERROR
+          from the count of active results.
+          The latter change is needed to stimulate the production
+          of additional results for the affected WU.

        sched/
-                validator.C
+            validator.C
        db/
-                boinc_db.h
+            boinc_db.h

-Rom    8 Sep 2004 ( for Sebastian Masch )
-    - Don't cause the core client to rewrite the state file for every block transfered
-      during a network operation.
+
+Rom    9 Sep 2004 ( for Sebastian Masch )
+    - Don't cause the core client to rewrite the state file
+      for every block transferred during a network operation.

    client/
        pers_file_xfer.C
+
+David  9 Sept 2004
+    - Further refinements of validation to deal with
+       recoverable/nonrecoverable file read errors.
+       Revised the signatures of check_set() and check_pair(),
+       and revised the validator accordingly.
+
+    db/
+        boinc_db.h
+    sched/
+        sample_trivial_validator.C
+        validator.C
--- a/db/boinc_db.h
+++ b/db/boinc_db.h
@ -359,10 +359,14 @@ struct WORKUNIT {
 #define RESULT_OUTCOME_SUCCESS          1
 #define RESULT_OUTCOME_COULDNT_SEND     2
 #define RESULT_OUTCOME_CLIENT_ERROR     3
+    // an error happened on the client
 #define RESULT_OUTCOME_NO_REPLY         4
 #define RESULT_OUTCOME_DIDNT_NEED       5
    // we created the result but didn't need to send it because we already
    // got a quorum
+#define RESULT_OUTCOME_VALIDATE_ERROR   6
+    // The outcome was initially SUCCESS, but the validator
+    // had a permanent error reading a result file

 #define VALIDATE_STATE_INIT         0
 #define VALIDATE_STATE_VALID        1
@ -370,8 +374,6 @@ struct WORKUNIT {
 #define VALIDATE_STATE_NO_CHECK     3
    // WU had error, so we'll never get around to validating its results
    // This lets us avoid showing the claimed credit as "pending"
-#define VALIDATE_STATE_ERROR        4
-    // Validation encountered an internal error (eg, file IO error).

 struct RESULT {
    int id;
--- a/sched/sample_trivial_validator.C
+++ b/sched/sample_trivial_validator.C
@ -32,10 +32,10 @@ int init_result_trivial(RESULT const& result, void*& data) {

 int check_pair_initialized_trivial(
    RESULT const& r1, void* /*data1*/,
-    RESULT const& r2, void* /*data2*/,
-    bool& match
+    RESULT const& r2, void* /*data2*/
 ) {
-    match = (r1.cpu_time >= MIN_CPU_TIME && r2.cpu_time >= MIN_CPU_TIME);
+    bool match = (r1.cpu_time >= MIN_CPU_TIME && r2.cpu_time >= MIN_CPU_TIME);
+    r1.validate_state = match?VALIDATE_STATE_VALID:VALIDATE_STATE_INVALID;
    return 0;
 }

@ -43,7 +43,8 @@ int cleanup_result_trivial(RESULT const&, void*) {
    return 0;
 }

-int check_set(vector<RESULT>& results, int& canonicalid, double& credit) {
+int check_set(vector<RESULT>& results, int& canonicalid, double& credit, bool& retry) {
+    retry = false;
    return generic_check_set_majority(
        results, canonicalid, credit,
        init_result_trivial,
@ -55,8 +56,7 @@ int check_set(vector<RESULT>& results, int& canonicalid, double& credit) {
 int check_pair(RESULT const& r1, RESULT const& r2, bool& match) {
    return check_pair_initialized_trivial(
        r1, NULL,
-        r2, NULL,
-        match
+        r2, NULL
    );
 }

--- a/sched/validator.C
+++ b/sched/validator.C
@ -18,26 +18,15 @@
 //

 //
-// validate - check and validate new results, and grant credit
+// validator - check and validate new results, and grant credit
 //  -app appname
 //  [-d debug_level]
 //  [-one_pass]     // make one pass through WU table, then exit
 //  [-asynch]       // fork, run in separate process
 //
 // This program must be linked with two project-specific functions:
-//
-// int check_set(vector<RESULT>, int& canonical, double& credit)
-//    Compare a set of results.
-//    If a canonical result is found, return its ID,
-//    and set the "validate_state" field of all the results
-//    according to whether they match the canonical result.
-//    Also return the "canonical credit" (e.g. the average or median)
-//
-// int pair_check(RESULT& new_result, RESULT& canonical, bool& valid);
-//    return valid=true iff the new result matches the canonical one
-//
-// Both functions return nonzero if an error occurred,
-// in which case other outputs are undefined
+// check_set() and check_pair().
+// See doc/validate.php for a description.

 using namespace std;

@ -47,6 +36,7 @@ using namespace std;

 #include "boinc_db.h"
 #include "util.h"
+#include "error_numbers.h"
 #include "sched_config.h"
 #include "sched_util.h"
 #include "sched_msgs.h"
@ -54,8 +44,13 @@ using namespace std;
 #define LOCKFILE "validate.out"
 #define PIDFILE  "validate.pid"

-extern int check_set(vector<RESULT>&, DB_WORKUNIT& wu, int& canonical, double& credit);
-extern int check_pair(RESULT const&, RESULT const&, bool&);
+extern int check_set(
+    vector<RESULT>&, DB_WORKUNIT& wu, int& canonical, double& credit,
+    bool& retry
+);
+extern int check_pair(
+    RESULT const& new_result, RESULT const& canonical_result, bool& retry
+);

 SCHED_CONFIG config;
 char app_name[256];
@ -137,7 +132,9 @@ int grant_credit(DB_RESULT& result, double credit) {

 void handle_wu(DB_WORKUNIT& wu) {
    DB_RESULT result, canonical_result;
-    bool match, update_result, need_transition = false;
+    bool update_result, retry;
+    bool canonical_result_missing = false;
+    bool need_immediate_transition = false, need_delayed_transition = false;
    int retval, canonicalid = 0;
    double credit;
    unsigned int i;
@ -155,14 +152,20 @@ void handle_wu(DB_WORKUNIT& wu) {
        // Get unchecked results and see if they match the canonical result
        //
        retval = canonical_result.lookup_id(wu.canonical_resultid);
-        if (retval) {
+        if (retval == ERR_DB_NOT_FOUND) {
            log_messages.printf(
                SCHED_MSG_LOG::CRITICAL,
-                "[WU#%d %s] Can't read canonical result; marking as validated: %d\n",
+                "[WU#%d %s] Canonical result not in DB %d",
                wu.id, wu.name, retval
            );
-            // Mark this WU as validated, otherwise we'll keep checking it
-            goto mark_validated;
+            canonical_result_missing = true;
+        } else if (retval) {
+            log_messages.printf(
+                SCHED_MSG_LOG::CRITICAL,
+                "[WU#%d %s] Can't read canonical result %d; exiting",
+                wu.id, wu.name, retval
+            );
+            exit(retval);
        }

        // scan this WU's results, and check the unchecked ones
@ -173,63 +176,56 @@ void handle_wu(DB_WORKUNIT& wu) {
            wu.id, VALIDATE_STATE_INIT, RESULT_SERVER_STATE_OVER, RESULT_OUTCOME_SUCCESS
        );
        while (!result.enumerate(buf)) {
-            need_transition = true;
+            need_immediate_transition = true;

-            // it's possible that we've deleted canonical result outputs
-            //
-            if (canonical_result.file_delete_state == FILE_DELETE_DONE) {
-                log_messages.printf(
-                    SCHED_MSG_LOG::DEBUG,
-                    "[WU#%d]: Canonical result (%d) has been deleted\n",
-                    wu.id, canonical_result.id
-                );
-                match = false;
-                retval = 0;
-            } else {
-                retval = check_pair(result, canonical_result, match);
-            }
+            retval = check_pair(
+                result, canonical_result, retry
+            );
            if (retval) {
                log_messages.printf(
                    SCHED_MSG_LOG::DEBUG,
                    "[RESULT#%d %s]: pair_check() failed for result: %d\n",
                    result.id, result.name, retval
                );
-                continue;
-            } else {
-                if (match) {
-                    result.validate_state = VALIDATE_STATE_VALID;
-                    result.granted_credit = wu.canonical_credit;
-                    log_messages.printf(
-                        SCHED_MSG_LOG::NORMAL,
-                        "[RESULT#%d %s] pair_check() matched: setting result to valid; credit %f\n",
-                        result.id, result.name, result.granted_credit
-                    );
-                } else {
-                    result.validate_state = VALIDATE_STATE_INVALID;
-                    log_messages.printf(
-                        SCHED_MSG_LOG::NORMAL,
-                        "[RESULT#%d %s] pair_check() didn't match: setting result to invalid\n",
-                        result.id, result.name
-                    );
-                }
+                exit(retval);
            }
-            retval = result.update();
-            if (retval) {
-                log_messages.printf(
-                    SCHED_MSG_LOG::CRITICAL,
-                    "[RESULT#%d %s] Can't update result: %d\n",
-                    result.id, result.name, retval
-                );
-                continue;
-            }
-            retval = grant_credit(result, result.granted_credit);
-            if (retval) {
+            if (retry) need_delayed_transition = true;
+            update_result = false;
+            switch (result.validate_state) {
+            case VALIDATE_STATE_VALID:
+                update_result = true;
+                result.granted_credit = wu.canonical_credit;
                log_messages.printf(
                    SCHED_MSG_LOG::NORMAL,
-                    "[RESULT#%d %s] Can't grant credit: %d\n",
-                    result.id, result.name, retval
+                    "[RESULT#%d %s] pair_check() matched: setting result to valid; credit %f\n",
+                    result.id, result.name, result.granted_credit
                );
-                continue;
+                retval = grant_credit(result, result.granted_credit);
+                if (retval) {
+                    log_messages.printf(
+                        SCHED_MSG_LOG::NORMAL,
+                        "[RESULT#%d %s] Can't grant credit: %d\n",
+                        result.id, result.name, retval
+                    );
+                }
+                break;
+            case VALIDATE_STATE_INVALID:
+                update_result = true;
+                log_messages.printf(
+                    SCHED_MSG_LOG::NORMAL,
+                    "[RESULT#%d %s] pair_check() didn't match: setting result to invalid\n",
+                    result.id, result.name
+                );
+            }
+            if (update_result) {
+                retval = result.update();
+                if (retval) {
+                    log_messages.printf(
+                        SCHED_MSG_LOG::CRITICAL,
+                        "[RESULT#%d %s] Can't update result: %d\n",
+                        result.id, result.name, retval
+                    );
+                }
            }
        }
    } else {
@ -266,9 +262,18 @@ void handle_wu(DB_WORKUNIT& wu) {
                SCHED_MSG_LOG::DEBUG,
                "[WU#%d %s] Enough for quorum, checking set.\n", wu.id, wu.name
            );
-            retval = check_set(results, wu, canonicalid, credit);
-            if (!retval && canonicalid) {
-                need_transition = true;
+            retval = check_set(results, wu, canonicalid, credit, retry);
+            if (retval) {
+                log_messages.printf(
+                    SCHED_MSG_LOG::CRITICAL,
+                    "[WU#%d %s] check_set returned %d, exiting",
+                    wu.id, wu.name, retval
+                );
+                exit(retval);
+            }
+            if (retry) need_delayed_transition = true;
+            if (canonicalid) {
+                need_immediate_transition = true;
                log_messages.printf(
                    SCHED_MSG_LOG::DEBUG,
                    "[WU#%d %s] Found a canonical result: id=%d\n",
@ -279,12 +284,15 @@ void handle_wu(DB_WORKUNIT& wu) {
                wu.assimilate_state = ASSIMILATE_READY;
                for (i=0; i<results.size(); i++) {
                    result = results[i];
-                    update_result = false;
+
+                    // skip results that had file-read errors
+                    //
+                    if (result.outcome != RESULT_OUTCOME_SUCCESS) continue;
+                    if (result.validate_state == VALIDATE_STATE_INIT) continue;

                    // grant credit for valid results
                    //
                    if (result.validate_state == VALIDATE_STATE_VALID) {
-                        update_result = true;
                        retval = grant_credit(result, credit);
                        if (retval) {
                            log_messages.printf(
@ -301,19 +309,19 @@ void handle_wu(DB_WORKUNIT& wu) {
                        );
                    }

-                    if (update_result) {
-                        retval = result.update();
-                        if (retval) {
-                            log_messages.printf(
-                                SCHED_MSG_LOG::CRITICAL,
-                                "[RESULT#%d %s] result.update() failed: %d\n",
-                                result.id, result.name, retval
-                            );
-                        }
+                    retval = result.update();
+                    if (retval) {
+                        log_messages.printf(
+                            SCHED_MSG_LOG::CRITICAL,
+                            "[RESULT#%d %s] result.update() failed: %d\n",
+                            result.id, result.name, retval
+                        );
                    }
                }

-                // don't send any unsent results
+                // If found a canonical result, don't send any unsent results
+                // TODO: could do this in a single SQL statement
+                //
                sprintf(buf, "where workunitid=%d and server_state=%d",
                    wu.id, RESULT_SERVER_STATE_UNSENT
                );
@ -334,7 +342,7 @@ void handle_wu(DB_WORKUNIT& wu) {
                //
                if ((int)results.size() > wu.max_success_results) {
                    wu.error_mask |= WU_ERROR_TOO_MANY_SUCCESS_RESULTS;
-                    need_transition = true;
+                    need_immediate_transition = true;
                }
            }
        }
@ -342,13 +350,15 @@ void handle_wu(DB_WORKUNIT& wu) {

    --log_messages;

-mark_validated:
-
-    if (need_transition) {
+    if (need_immediate_transition) {
        wu.transition_time = time(0);
    }
+    if (need_delayed_transition) {
+        int x = time(0) + 6*3600;
+        if (x < wu.transition_time) wu.transition_time = x;
+    }

-    // we've checked all results for this WU, so turn off flag
+    // clear WU.need_validate
    //
    wu.need_validate = 0;
    retval = wu.update();