diff --git a/checkin_notes b/checkin_notes index d3246799e5..e6c02596ab 100755 --- a/checkin_notes +++ b/checkin_notes @@ -17131,58 +17131,86 @@ Rom 8 Sep 2004 file_names.C Jeff 09 Sep 2004 - - Change to the validator to handle the case where one or more of the result files - for a WU's result set are missing. Before this change this state would result - in this WU and it's results never getting looked at again. No science, no - credit. + - Change the validator to handle the case where one or more + of the result files for a WU's result set can't be read. + Currently, this case results in this WU and its results + never getting looked at again - no science, no credit. - This is coordinated seti_boinc / boinc change. This note is repeated in - the seti_boinc checkin_notes. + We distinguish 2 cases: + 1) transient failure, e.g. an NFS mount is not working. + (can't read directory) + In this case check_set() returns a special code to the validator, + which arranges for the WU to get transitioned again in a few hours. + 2) permanent failure (file doesn't exist) + The result's validate_state is set to ERROR (new state), + which is generally equivalent to INVALID + - add new WU arg to check_set() + (so it can access min_quorum) + + This is a coordinated seti_boinc / boinc change. + This note is repeated in the seti_boinc checkin_notes. seti_boinc changes: - - get_result_file(), if it cannot read the result file, tries to read the - directory where the result file should be. If it can read the directory, - it returns ERR_FOPEN, as before. If it cannot read the directory, it - resturns ERR_OPENDIR. ERR_OPENDIR in this context signals a possibly - transient problem. Upon a successful result read, it sets sah_result.have_result + - get_result_file(), if it cannot read the result file, + tries to read the directory where the result file should be. + If it can read the directory, it returns ERR_FOPEN, as before. + If it cannot read the directory, it returns ERR_OPENDIR. 
+ ERR_OPENDIR in this context signals a possibly transient problem. + Upon a successful result read, it sets sah_result.have_result (a new data member as of this change) to true. - check_set(), upon return from get_result_file() : - - sets result.validate_state to VALIDATE_STATE_ERROR (a new state - as of this change) and retval to zero if there is a nonzero - retval from check_set() and it is not ERR_OPENDIR. It then - continues. - - otherwise just continues. + - sets result.validate_state to VALIDATE_STATE_ERROR (a new state) + and retval to zero if there is a nonzero retval from check_set() + and it is not ERR_OPENDIR. It then continues. + - otherwise just continues. check_set() then determines if any IO errors brought the result count below wu.min_quorum. If not, it continues. Otherwise it returns. - In all subsequent logic any results for which sah_result.have_result == false - are skipped. Note that the result and the sah_result vectors are associated. + In all subsequent logic any results for which + sah_result.have_result == false are skipped. + Note that the result and the sah_result vectors are associated. validate/ - sah_result.h - sah_validate.cpp - sah_boinc_db.cpp + sah_result.h + sah_validate.cpp + sah_boinc_db.cpp boinc changes: - new validate state VALIDATE_STATE_ERROR. - - in the enumeration for check_set(), we now include a clause to check for - VALIDATE_STATE_INIT in order to ignore results set to VALIDATE_STATE_ERROR. + - in the enumeration for check_set(), + we now include a clause to check for VALIDATE_STATE_INIT + in order to ignore results set to VALIDATE_STATE_ERROR. This enumeration query now matches that for check_pair(). - Note that the only possible retvals from check_set() are now zero or - ERR_OPENDIR. - Note that logic to retry possibly transient errors still needs to be done. - Also left to be done is a transitioner change to subtract the number of - results in state VALIDATE_STATE_ERROR from the count of active results. 
- The latter change is needed to stimulate the production of additional results - for the affected WU. + Note that the only possible retvals from check_set() + are now zero or ERR_OPENDIR. + Note that logic to retry possibly transient errors still needs + to be done. + Also left to be done is a transitioner change to subtract + the number of results in state VALIDATE_STATE_ERROR + from the count of active results. + The latter change is needed to stimulate the production + of additional results for the affected WU. sched/ - validator.C + validator.C db/ - boinc_db.h + boinc_db.h -Rom 8 Sep 2004 ( for Sebastian Masch ) - - Don't cause the core client to rewrite the state file for every block transfered - during a network operation. + +Rom 9 Sep 2004 ( for Sebastian Masch ) + - Don't cause the core client to rewrite the state file + for every block transferred during a network operation. client/ pers_file_xfer.C + +David 9 Sept 2004 + - Further refinements of validation to deal with + recoverable/nonrecoverable file read errors. + Revised the signatures of check_set() and check_pair(), + and revised the validator accordingly. 
+ + db/ + boinc_db.h + sched/ + sample_trivial_validator.C + validator.C diff --git a/db/boinc_db.h b/db/boinc_db.h index 08874af48f..9418849e25 100755 --- a/db/boinc_db.h +++ b/db/boinc_db.h @@ -359,10 +359,14 @@ struct WORKUNIT { #define RESULT_OUTCOME_SUCCESS 1 #define RESULT_OUTCOME_COULDNT_SEND 2 #define RESULT_OUTCOME_CLIENT_ERROR 3 + // an error happened on the client #define RESULT_OUTCOME_NO_REPLY 4 #define RESULT_OUTCOME_DIDNT_NEED 5 // we created the result but didn't need to send it because we already // got a quorum +#define RESULT_OUTCOME_VALIDATE_ERROR 6 + // The outcome was initially SUCCESS, but the validator + // had a permanent error reading a result file #define VALIDATE_STATE_INIT 0 #define VALIDATE_STATE_VALID 1 @@ -370,8 +374,6 @@ struct WORKUNIT { #define VALIDATE_STATE_NO_CHECK 3 // WU had error, so we'll never get around to validating its results // This lets us avoid showing the claimed credit as "pending" -#define VALIDATE_STATE_ERROR 4 - // Validation encountered an internal error (eg, file IO error). 
struct RESULT { int id; diff --git a/sched/sample_trivial_validator.C b/sched/sample_trivial_validator.C index d78076f6d3..f62f0aa8eb 100644 --- a/sched/sample_trivial_validator.C +++ b/sched/sample_trivial_validator.C @@ -32,10 +32,10 @@ int init_result_trivial(RESULT const& result, void*& data) { int check_pair_initialized_trivial( RESULT const& r1, void* /*data1*/, - RESULT const& r2, void* /*data2*/, - bool& match + RESULT const& r2, void* /*data2*/ ) { - match = (r1.cpu_time >= MIN_CPU_TIME && r2.cpu_time >= MIN_CPU_TIME); + bool match = (r1.cpu_time >= MIN_CPU_TIME && r2.cpu_time >= MIN_CPU_TIME); + r1.validate_state = match?VALIDATE_STATE_VALID:VALIDATE_STATE_INVALID; return 0; } @@ -43,7 +43,8 @@ int cleanup_result_trivial(RESULT const&, void*) { return 0; } -int check_set(vector& results, int& canonicalid, double& credit) { +int check_set(vector& results, int& canonicalid, double& credit, bool& retry) { + retry = false; return generic_check_set_majority( results, canonicalid, credit, init_result_trivial, @@ -55,8 +56,7 @@ int check_set(vector& results, int& canonicalid, double& credit) { int check_pair(RESULT const& r1, RESULT const& r2, bool& match) { return check_pair_initialized_trivial( r1, NULL, - r2, NULL, - match + r2, NULL ); } diff --git a/sched/validator.C b/sched/validator.C index 103ad4c66c..2dafadc87f 100644 --- a/sched/validator.C +++ b/sched/validator.C @@ -18,26 +18,15 @@ // // -// validate - check and validate new results, and grant credit +// validator - check and validate new results, and grant credit // -app appname // [-d debug_level] // [-one_pass] // make one pass through WU table, then exit // [-asynch] // fork, run in separate process // // This program must be linked with two project-specific functions: -// -// int check_set(vector, int& canonical, double& credit) -// Compare a set of results. 
-// If a canonical result is found, return its ID, -// and set the "validate_state" field of all the results -// according to whether they match the canonical result. -// Also return the "canonical credit" (e.g. the average or median) -// -// int pair_check(RESULT& new_result, RESULT& canonical, bool& valid); -// return valid=true iff the new result matches the canonical one -// -// Both functions return nonzero if an error occurred, -// in which case other outputs are undefined +// check_set() and check_pair(). +// See doc/validate.php for a description. using namespace std; @@ -47,6 +36,7 @@ using namespace std; #include "boinc_db.h" #include "util.h" +#include "error_numbers.h" #include "sched_config.h" #include "sched_util.h" #include "sched_msgs.h" @@ -54,8 +44,13 @@ using namespace std; #define LOCKFILE "validate.out" #define PIDFILE "validate.pid" -extern int check_set(vector&, DB_WORKUNIT& wu, int& canonical, double& credit); -extern int check_pair(RESULT const&, RESULT const&, bool&); +extern int check_set( + vector&, DB_WORKUNIT& wu, int& canonical, double& credit, + bool& retry +); +extern int check_pair( + RESULT const& new_result, RESULT const& canonical_result, bool& retry +); SCHED_CONFIG config; char app_name[256]; @@ -137,7 +132,9 @@ int grant_credit(DB_RESULT& result, double credit) { void handle_wu(DB_WORKUNIT& wu) { DB_RESULT result, canonical_result; - bool match, update_result, need_transition = false; + bool update_result, retry; + bool canonical_result_missing = false; + bool need_immediate_transition = false, need_delayed_transition = false; int retval, canonicalid = 0; double credit; unsigned int i; @@ -155,14 +152,20 @@ void handle_wu(DB_WORKUNIT& wu) { // Get unchecked results and see if they match the canonical result // retval = canonical_result.lookup_id(wu.canonical_resultid); - if (retval) { + if (retval == ERR_DB_NOT_FOUND) { log_messages.printf( SCHED_MSG_LOG::CRITICAL, - "[WU#%d %s] Can't read canonical result; marking as 
validated: %d\n", + "[WU#%d %s] Canonical result not in DB %d", wu.id, wu.name, retval ); - // Mark this WU as validated, otherwise we'll keep checking it - goto mark_validated; + canonical_result_missing = true; + } else if (retval) { + log_messages.printf( + SCHED_MSG_LOG::CRITICAL, + "[WU#%d %s] Can't read canonical result %d; exiting", + wu.id, wu.name, retval + ); + exit(retval); } // scan this WU's results, and check the unchecked ones @@ -173,63 +176,56 @@ void handle_wu(DB_WORKUNIT& wu) { wu.id, VALIDATE_STATE_INIT, RESULT_SERVER_STATE_OVER, RESULT_OUTCOME_SUCCESS ); while (!result.enumerate(buf)) { - need_transition = true; + need_immediate_transition = true; - // it's possible that we've deleted canonical result outputs - // - if (canonical_result.file_delete_state == FILE_DELETE_DONE) { - log_messages.printf( - SCHED_MSG_LOG::DEBUG, - "[WU#%d]: Canonical result (%d) has been deleted\n", - wu.id, canonical_result.id - ); - match = false; - retval = 0; - } else { - retval = check_pair(result, canonical_result, match); - } + retval = check_pair( + result, canonical_result, retry + ); if (retval) { log_messages.printf( SCHED_MSG_LOG::DEBUG, "[RESULT#%d %s]: pair_check() failed for result: %d\n", result.id, result.name, retval ); - continue; - } else { - if (match) { - result.validate_state = VALIDATE_STATE_VALID; - result.granted_credit = wu.canonical_credit; - log_messages.printf( - SCHED_MSG_LOG::NORMAL, - "[RESULT#%d %s] pair_check() matched: setting result to valid; credit %f\n", - result.id, result.name, result.granted_credit - ); - } else { - result.validate_state = VALIDATE_STATE_INVALID; - log_messages.printf( - SCHED_MSG_LOG::NORMAL, - "[RESULT#%d %s] pair_check() didn't match: setting result to invalid\n", - result.id, result.name - ); - } + exit(retval); } - retval = result.update(); - if (retval) { - log_messages.printf( - SCHED_MSG_LOG::CRITICAL, - "[RESULT#%d %s] Can't update result: %d\n", - result.id, result.name, retval - ); - continue; - } 
- retval = grant_credit(result, result.granted_credit); - if (retval) { + if (retry) need_delayed_transition = true; + update_result = false; + switch (result.validate_state) { + case VALIDATE_STATE_VALID: + update_result = true; + result.granted_credit = wu.canonical_credit; log_messages.printf( SCHED_MSG_LOG::NORMAL, - "[RESULT#%d %s] Can't grant credit: %d\n", - result.id, result.name, retval + "[RESULT#%d %s] pair_check() matched: setting result to valid; credit %f\n", + result.id, result.name, result.granted_credit ); - continue; + retval = grant_credit(result, result.granted_credit); + if (retval) { + log_messages.printf( + SCHED_MSG_LOG::NORMAL, + "[RESULT#%d %s] Can't grant credit: %d\n", + result.id, result.name, retval + ); + } + break; + case VALIDATE_STATE_INVALID: + update_result = true; + log_messages.printf( + SCHED_MSG_LOG::NORMAL, + "[RESULT#%d %s] pair_check() didn't match: setting result to invalid\n", + result.id, result.name + ); + } + if (update_result) { + retval = result.update(); + if (retval) { + log_messages.printf( + SCHED_MSG_LOG::CRITICAL, + "[RESULT#%d %s] Can't update result: %d\n", + result.id, result.name, retval + ); + } } } } else { @@ -266,9 +262,18 @@ void handle_wu(DB_WORKUNIT& wu) { SCHED_MSG_LOG::DEBUG, "[WU#%d %s] Enough for quorum, checking set.\n", wu.id, wu.name ); - retval = check_set(results, wu, canonicalid, credit); - if (!retval && canonicalid) { - need_transition = true; + retval = check_set(results, wu, canonicalid, credit, retry); + if (retval) { + log_messages.printf( + SCHED_MSG_LOG::CRITICAL, + "[WU#%d %s] check_set returned %d, exiting", + wu.id, wu.name, retval + ); + exit(retval); + } + if (retry) need_delayed_transition = true; + if (canonicalid) { + need_immediate_transition = true; log_messages.printf( SCHED_MSG_LOG::DEBUG, "[WU#%d %s] Found a canonical result: id=%d\n", @@ -279,12 +284,15 @@ void handle_wu(DB_WORKUNIT& wu) { wu.assimilate_state = ASSIMILATE_READY; for (i=0; i wu.max_success_results) { 
wu.error_mask |= WU_ERROR_TOO_MANY_SUCCESS_RESULTS; - need_transition = true; + need_immediate_transition = true; } } } @@ -342,13 +350,15 @@ void handle_wu(DB_WORKUNIT& wu) { --log_messages; -mark_validated: - - if (need_transition) { + if (need_immediate_transition) { wu.transition_time = time(0); } + if (need_delayed_transition) { + int x = time(0) + 6*3600; + if (x < wu.transition_time) wu.transition_time = x; + } - // we've checked all results for this WU, so turn off flag + // clear WU.need_validate // wu.need_validate = 0; retval = wu.update();