From 9049737d1f66f5e9ef77d741092a827435bd3d2b Mon Sep 17 00:00:00 2001 From: David Anderson Date: Mon, 20 May 2013 13:01:10 -0700 Subject: [PATCH] validator: retry if transient failure check_set() wasn't returning "retry" properly in the case where one of the calls to init_result() returns ERR_OPEN_DIR (treated as a transient failure, since it can be caused by a failed NFS mount) --- sched/credit.cpp | 21 ++++++++++++++------- sched/validate_util2.cpp | 22 +++++++++++++++++----- sched/validator.cpp | 3 ++- 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/sched/credit.cpp b/sched/credit.cpp index 7199a3374d..4045baf629 100644 --- a/sched/credit.cpp +++ b/sched/credit.cpp @@ -859,11 +859,19 @@ double vec_min(vector& v) { } // Called by validator when canonical result has been selected. -// Compute credit for valid instances. -// This is called exactly once for each valid result. +// For each valid result in the list: +// - calculate a peak FLOP count (PFC) and a "mode" that indicates +// our confidence in the PFC +// - update the statistics of PFC in host_app_version and app_version +// - Compute a credit value based on a weighted average of +// the PFCs of valid results +// (this value can be used or ignored by the caller) +// +// This must be called exactly once for each valid result.
// int assign_credit_set( - WORKUNIT &wu, vector& results, + WORKUNIT &wu, + vector& results, DB_APP &app, vector& app_versions, vector& host_app_versions, @@ -940,15 +948,14 @@ int assign_credit_set( // fall through case 0: if (approx.size()) { - x = pegged_average(approx,wu_estimated_pfc(wu, app)); + x = pegged_average(approx, wu_estimated_pfc(wu, app)); } else { - // there were only PFC_MODE_INVALID results, so - // we guess + // there were only PFC_MODE_INVALID results, so we guess x = wu_estimated_pfc(wu, app); } break; default: - x = pegged_average(normal,wu_estimated_pfc(wu, app)); + x = pegged_average(normal, wu_estimated_pfc(wu, app)); break; } diff --git a/sched/validate_util2.cpp b/sched/validate_util2.cpp index 8558b8ad69..a9e54c0a49 100644 --- a/sched/validate_util2.cpp +++ b/sched/validate_util2.cpp @@ -37,16 +37,27 @@ using std::vector; -// Given a set of results, check for a canonical result, -// i.e. a set of at least min_quorum/2+1 results for which -// that are equivalent according to check_pair(). +// Given a set of results: +// 1) call init_result() for each one; +// this detects results with bad or missing output files +// 2) if # of good results is >= wu.min_quorum, +// check for a canonical result, +// i.e. a set of at least min_quorum/2+1 results +// that are equivalent according to check_pair(). // -// invariants: -// results.size() >= wu.min_quorum +// input invariants: // for each result: // result.outcome == SUCCESS // result.validate_state == INIT // +// Outputs: +// canonicalid: the ID of the canonical result, if any +// result.outcome, result.validate_state +// modified; caller must update DB +// retry: set to true if some result had a transient failure +// (e.g. there was a broken NFS mount). +// Should call this again after a while.
+// int check_set( vector& results, WORKUNIT& wu, int& canonicalid, double&, bool& retry @@ -75,6 +86,7 @@ int check_set( "check_set: init_result([RESULT#%d %s]) transient failure\n", results[i].id, results[i].name ); + retry = true; had_error[i] = true; } else if (retval) { log_messages.printf(MSG_CRITICAL, diff --git a/sched/validator.cpp b/sched/validator.cpp index 81b39c2581..0d6e7e8216 100644 --- a/sched/validator.cpp +++ b/sched/validator.cpp @@ -103,7 +103,8 @@ bool no_credit = false; WORKUNIT* g_wup; vector app_versions; - // cache of app_versions; used by v2 credit system + // cache of app_versions; the PFC statistics of these are + // updated in memory, and periodically flushed to the DB bool is_unreplicated(WORKUNIT& wu) { return (wu.target_nresults == 1 && app.target_nresults > 1);