validator: retry if transient failure

check_set() wasn't returning "retry" properly in the case where one of the calls to init_result() returns ERR_OPEN_DIR (treated as a transient failure, since it can be caused by a failed NFS mount).
2013-05-20 13:01:10 -07:00 · 2013-05-20 13:01:10 -07:00 · 9049737d1f
parent 7472e29495
commit 9049737d1f
3 changed files with 33 additions and 13 deletions
--- a/sched/credit.cpp
+++ b/sched/credit.cpp
@ -859,11 +859,19 @@ double vec_min(vector<double>& v) {
 }
 // Called by validator when canonical result has been selected.
-// Compute credit for valid instances.
+// For each valid result in the list:
-// This is called exactly once for each valid result.
+// - calculate a peak FLOP count (PFC) and a "mode" that indicates
 //   our confidence in the PFC
 // - update the statistics of PFC in host_app_version and app_version
 // - Compute a credit value based on a weighted average of
 //   the PFCs of valid results
 //   (this value can be used or ignored by the caller)
 //
 // This must be called exactly once for each valid result.
 //
 int assign_credit_set(
-    WORKUNIT &wu, vector<RESULT>& results,
+    WORKUNIT &wu,
    vector<RESULT>& results,
    DB_APP &app,
    vector<DB_APP_VERSION>& app_versions,
    vector<DB_HOST_APP_VERSION>& host_app_versions,
@ -940,15 +948,14 @@ int assign_credit_set(
        // fall through
    case 0:
        if (approx.size()) {
-            x = pegged_average(approx,wu_estimated_pfc(wu, app));
+            x = pegged_average(approx, wu_estimated_pfc(wu, app));
        } else {
-            // there were only PFC_MODE_INVALID results, so
+            // there were only PFC_MODE_INVALID results, so we guess
            // we guess
            x = wu_estimated_pfc(wu, app);
        }
        break;
    default:
-        x = pegged_average(normal,wu_estimated_pfc(wu, app));
+        x = pegged_average(normal, wu_estimated_pfc(wu, app));
        break;
    }
--- a/sched/validate_util2.cpp
+++ b/sched/validate_util2.cpp
@ -37,16 +37,27 @@
 using std::vector;
-// Given a set of results, check for a canonical result,
+// Given a set of results:
-// i.e. a set of at least min_quorum/2+1 results for which
+// 1) call init_result() for each one;
-// that are equivalent according to check_pair().
+//    this detects results with bad or missing output files
 // 2) if # of good results is >= wu.min_quorum,
 //    check for a canonical result,
 //    i.e. a set of at least min_quorum/2+1 results
 //    that are equivalent according to check_pair().
 //
-// invariants:
+// input invariants:
 // results.size() >= wu.min_quorum
 // for each result:
 //   result.outcome == SUCCESS
 //   result.validate_state == INIT
 //
 // Outputs:
 // canonicalid: the ID of canonical result, if any
 // result.outcome, result.validate_state
 //    modified; caller must update DB
 // retry: set to true if some result had a transient failure
 //    (e.g. there was a broken NFS mount).
 //    The caller should call check_set() again after a while.
 //
 int check_set(
    vector<RESULT>& results, WORKUNIT& wu,
    int& canonicalid, double&, bool& retry
@ -75,6 +86,7 @@ int check_set(
                "check_set: init_result([RESULT#%d %s]) transient failure\n",
                results[i].id, results[i].name
            );
            retry = true;
            had_error[i] = true;
        } else if (retval) {
            log_messages.printf(MSG_CRITICAL,
--- a/sched/validator.cpp
+++ b/sched/validator.cpp
@ -103,7 +103,8 @@ bool no_credit = false;
 WORKUNIT* g_wup;
 vector<DB_APP_VERSION> app_versions;
-    // cache of app_versions; used by v2 credit system
+    // cache of app_versions; the PFC statistics of these are
    // updated in memory, and periodically flushed to the DB
 bool is_unreplicated(WORKUNIT& wu) {
    return (wu.target_nresults == 1 && app.target_nresults > 1);