mirror of https://github.com/BOINC/boinc.git
validator: retry if transient failure
check_set() wasn't returning "retry" properly in the case where one of the calls to init_result() returns ERR_OPEN_DIR (treated as a transient failure, since it can be caused by a failed NFS mount).
This commit is contained in:
parent
7472e29495
commit
9049737d1f
|
@ -859,11 +859,19 @@ double vec_min(vector<double>& v) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Called by validator when canonical result has been selected.
|
// Called by validator when canonical result has been selected.
|
||||||
// Compute credit for valid instances.
|
// For each valid result in the list:
|
||||||
// This is called exactly once for each valid result.
|
// - calculate a peak FLOP count (PFC) and a "mode" that indicates
|
||||||
|
// our confidence in the PFC
|
||||||
|
// - update the statistics of PFC in host_app_version and app_version
|
||||||
|
// - Compute a credit value based on a weighted average of
|
||||||
|
// the PFCs of valid results
|
||||||
|
// (this value can be used or ignored by the caller)
|
||||||
|
//
|
||||||
|
// This must be called exactly once for each valid result.
|
||||||
//
|
//
|
||||||
int assign_credit_set(
|
int assign_credit_set(
|
||||||
WORKUNIT &wu, vector<RESULT>& results,
|
WORKUNIT &wu,
|
||||||
|
vector<RESULT>& results,
|
||||||
DB_APP &app,
|
DB_APP &app,
|
||||||
vector<DB_APP_VERSION>& app_versions,
|
vector<DB_APP_VERSION>& app_versions,
|
||||||
vector<DB_HOST_APP_VERSION>& host_app_versions,
|
vector<DB_HOST_APP_VERSION>& host_app_versions,
|
||||||
|
@ -940,15 +948,14 @@ int assign_credit_set(
|
||||||
// fall through
|
// fall through
|
||||||
case 0:
|
case 0:
|
||||||
if (approx.size()) {
|
if (approx.size()) {
|
||||||
x = pegged_average(approx,wu_estimated_pfc(wu, app));
|
x = pegged_average(approx, wu_estimated_pfc(wu, app));
|
||||||
} else {
|
} else {
|
||||||
// there were only PFC_MODE_INVALID results, so
|
// there were only PFC_MODE_INVALID results, so we guess
|
||||||
// we guess
|
|
||||||
x = wu_estimated_pfc(wu, app);
|
x = wu_estimated_pfc(wu, app);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
x = pegged_average(normal,wu_estimated_pfc(wu, app));
|
x = pegged_average(normal, wu_estimated_pfc(wu, app));
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -37,16 +37,27 @@
|
||||||
|
|
||||||
using std::vector;
|
using std::vector;
|
||||||
|
|
||||||
// Given a set of results, check for a canonical result,
|
// Given a set of results:
|
||||||
// i.e. a set of at least min_quorum/2+1 results for which
|
// 1) call init_result() for each one;
|
||||||
// that are equivalent according to check_pair().
|
// this detects results with bad or missing output files
|
||||||
|
// 2) if # of good results is >= wu.min_quorum,
|
||||||
|
// check for a canonical result,
|
||||||
|
// i.e. a set of at least min_quorum/2+1 results
|
||||||
|
// that are equivalent according to check_pair().
|
||||||
//
|
//
|
||||||
// invariants:
|
// input invariants:
|
||||||
// results.size() >= wu.min_quorum
|
|
||||||
// for each result:
|
// for each result:
|
||||||
// result.outcome == SUCCESS
|
// result.outcome == SUCCESS
|
||||||
// result.validate_state == INIT
|
// result.validate_state == INIT
|
||||||
//
|
//
|
||||||
|
// Outputs:
|
||||||
|
// canonicalid: the ID of canonical result, if any
|
||||||
|
// result.outcome, result.validate_state
|
||||||
|
// modified; caller must update DB
|
||||||
|
// retry: set to true if some result had a transient failure
|
||||||
|
// (i.e. there was a broken NFS mount).
|
||||||
|
// Should call this again after a while.
|
||||||
|
//
|
||||||
int check_set(
|
int check_set(
|
||||||
vector<RESULT>& results, WORKUNIT& wu,
|
vector<RESULT>& results, WORKUNIT& wu,
|
||||||
int& canonicalid, double&, bool& retry
|
int& canonicalid, double&, bool& retry
|
||||||
|
@ -75,6 +86,7 @@ int check_set(
|
||||||
"check_set: init_result([RESULT#%d %s]) transient failure\n",
|
"check_set: init_result([RESULT#%d %s]) transient failure\n",
|
||||||
results[i].id, results[i].name
|
results[i].id, results[i].name
|
||||||
);
|
);
|
||||||
|
retry = true;
|
||||||
had_error[i] = true;
|
had_error[i] = true;
|
||||||
} else if (retval) {
|
} else if (retval) {
|
||||||
log_messages.printf(MSG_CRITICAL,
|
log_messages.printf(MSG_CRITICAL,
|
||||||
|
|
|
@ -103,7 +103,8 @@ bool no_credit = false;
|
||||||
|
|
||||||
WORKUNIT* g_wup;
|
WORKUNIT* g_wup;
|
||||||
vector<DB_APP_VERSION> app_versions;
|
vector<DB_APP_VERSION> app_versions;
|
||||||
// cache of app_versions; used by v2 credit system
|
// cache of app_versions; the PFC statistics of these are
|
||||||
|
// updated in memory, and periodically flushed to the DB
|
||||||
|
|
||||||
bool is_unreplicated(WORKUNIT& wu) {
|
bool is_unreplicated(WORKUNIT& wu) {
|
||||||
return (wu.target_nresults == 1 && app.target_nresults > 1);
|
return (wu.target_nresults == 1 && app.target_nresults > 1);
|
||||||
|
|
Loading…
Reference in New Issue