validator: retry if transient failure

check_set() wasn't returning "retry" properly in the case where
one of the calls to init_result() returns ERR_OPEN_DIR
(treated as a transient failure, since it can be caused by a failed NFS mount).
This commit is contained in:
David Anderson 2013-05-20 13:01:10 -07:00
parent 7472e29495
commit 9049737d1f
3 changed files with 33 additions and 13 deletions

View File

@ -859,11 +859,19 @@ double vec_min(vector<double>& v) {
}
// Called by validator when canonical result has been selected.
// Compute credit for valid instances.
// This is called exactly once for each valid result.
// For each valid result in the list:
// - calculate a peak FLOP count (PFC) and a "mode" that indicates
// our confidence in the PFC
// - update the statistics of PFC in host_app_version and app_version
// - Compute a credit value based on a weighted average of
// the PFCs of valid results
// (this value can be used or ignored by the caller)
//
// This must be called exactly once for each valid result.
//
int assign_credit_set(
WORKUNIT &wu, vector<RESULT>& results,
WORKUNIT &wu,
vector<RESULT>& results,
DB_APP &app,
vector<DB_APP_VERSION>& app_versions,
vector<DB_HOST_APP_VERSION>& host_app_versions,
@ -942,8 +950,7 @@ int assign_credit_set(
if (approx.size()) {
x = pegged_average(approx, wu_estimated_pfc(wu, app));
} else {
// there were only PFC_MODE_INVALID results, so
// we guess
// there were only PFC_MODE_INVALID results, so we guess
x = wu_estimated_pfc(wu, app);
}
break;

View File

@ -37,16 +37,27 @@
using std::vector;
// Given a set of results, check for a canonical result,
// Given a set of results:
// 1) call init_result() for each one;
// this detects results with bad or missing output files
// 2) if # of good results is >= wu.min_quorum,
// check for a canonical result,
// i.e. a set of at least min_quorum/2+1 results
// that are equivalent according to check_pair().
//
// invariants:
// results.size() >= wu.min_quorum
// input invariants:
// for each result:
// result.outcome == SUCCESS
// result.validate_state == INIT
//
// Outputs:
// canonicalid: the ID of canonical result, if any
// result.outcome, result.validate_state
// modified; caller must update DB
// retry: set to true if some result had a transient failure
// (e.g. there was a broken NFS mount).
// The caller should call this again after a while.
//
int check_set(
vector<RESULT>& results, WORKUNIT& wu,
int& canonicalid, double&, bool& retry
@ -75,6 +86,7 @@ int check_set(
"check_set: init_result([RESULT#%d %s]) transient failure\n",
results[i].id, results[i].name
);
retry = true;
had_error[i] = true;
} else if (retval) {
log_messages.printf(MSG_CRITICAL,

View File

@ -103,7 +103,8 @@ bool no_credit = false;
WORKUNIT* g_wup;
vector<DB_APP_VERSION> app_versions;
// cache of app_versions; used by v2 credit system
// cache of app_versions; the PFC statistics of these are
// updated in memory, and periodically flushed to the DB
bool is_unreplicated(WORKUNIT& wu) {
return (wu.target_nresults == 1 && app.target_nresults > 1);