mirror of https://github.com/BOINC/boinc.git
validator: retry if transient failure
check_set() wasn't returning "retry" properly in the case where one of the calls to init_result() returns ERR_OPEN_DIR (treated as a transient failure, since it can be caused by a failed NFS mount)
This commit is contained in:
parent
7472e29495
commit
9049737d1f
|
@ -859,11 +859,19 @@ double vec_min(vector<double>& v) {
|
|||
}
|
||||
|
||||
// Called by validator when canonical result has been selected.
|
||||
// Compute credit for valid instances.
|
||||
// This is called exactly once for each valid result.
|
||||
// For each valid result in the list:
|
||||
// - calculate a peak FLOP count (PFC) and a "mode" that indicates
|
||||
// our confidence in the PFC
|
||||
// - update the statistics of PFC in host_app_version and app_version
|
||||
// - Compute a credit value based on a weighted average of
|
||||
// the PFCs of valid results
|
||||
// (this value can be used or ignored by the caller)
|
||||
//
|
||||
// This must be called exactly once for each valid result.
|
||||
//
|
||||
int assign_credit_set(
|
||||
WORKUNIT &wu, vector<RESULT>& results,
|
||||
WORKUNIT &wu,
|
||||
vector<RESULT>& results,
|
||||
DB_APP &app,
|
||||
vector<DB_APP_VERSION>& app_versions,
|
||||
vector<DB_HOST_APP_VERSION>& host_app_versions,
|
||||
|
@ -940,15 +948,14 @@ int assign_credit_set(
|
|||
// fall through
|
||||
case 0:
|
||||
if (approx.size()) {
|
||||
x = pegged_average(approx,wu_estimated_pfc(wu, app));
|
||||
x = pegged_average(approx, wu_estimated_pfc(wu, app));
|
||||
} else {
|
||||
// there were only PFC_MODE_INVALID results, so
|
||||
// we guess
|
||||
// there were only PFC_MODE_INVALID results, so we guess
|
||||
x = wu_estimated_pfc(wu, app);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
x = pegged_average(normal,wu_estimated_pfc(wu, app));
|
||||
x = pegged_average(normal, wu_estimated_pfc(wu, app));
|
||||
break;
|
||||
}
|
||||
|
||||
|
|
|
@ -37,16 +37,27 @@
|
|||
|
||||
using std::vector;
|
||||
|
||||
// Given a set of results, check for a canonical result,
|
||||
// i.e. a set of at least min_quorum/2+1 results for which
|
||||
// that are equivalent according to check_pair().
|
||||
// Given a set of results:
|
||||
// 1) call init_result() for each one;
|
||||
// this detects results with bad or missing output files
|
||||
// 2) if # of good results is >= wu.min_quorum,
|
||||
// check for a canonical result,
|
||||
// i.e. a set of at least min_quorum/2+1 results
|
||||
// that are equivalent according to check_pair().
|
||||
//
|
||||
// invariants:
|
||||
// results.size() >= wu.min_quorum
|
||||
// input invariants:
|
||||
// for each result:
|
||||
// result.outcome == SUCCESS
|
||||
// result.validate_state == INIT
|
||||
//
|
||||
// Outputs:
|
||||
// canonicalid: the ID of canonical result, if any
|
||||
// result.outcome, result.validate_state
|
||||
// modified; caller must update DB
|
||||
// retry: set to true if some result had a transient failure
|
||||
// (i.e. there was a broken NFS mount).
|
||||
// Should call this again after a while.
|
||||
//
|
||||
int check_set(
|
||||
vector<RESULT>& results, WORKUNIT& wu,
|
||||
int& canonicalid, double&, bool& retry
|
||||
|
@ -75,6 +86,7 @@ int check_set(
|
|||
"check_set: init_result([RESULT#%d %s]) transient failure\n",
|
||||
results[i].id, results[i].name
|
||||
);
|
||||
retry = true;
|
||||
had_error[i] = true;
|
||||
} else if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
|
|
|
@ -103,7 +103,8 @@ bool no_credit = false;
|
|||
|
||||
WORKUNIT* g_wup;
|
||||
vector<DB_APP_VERSION> app_versions;
|
||||
// cache of app_versions; used by v2 credit system
|
||||
// cache of app_versions; the PFC statistics of these are
|
||||
// updated in memory, and periodically flushed to the DB
|
||||
|
||||
bool is_unreplicated(WORKUNIT& wu) {
|
||||
return (wu.target_nresults == 1 && app.target_nresults > 1);
|
||||
|
|
Loading…
Reference in New Issue