validator: retry if transient failure

check_set() wasn't setting "retry" properly in the case where one of the calls to init_result() returned ERR_OPEN_DIR (treated as a transient failure, since it can be caused by a failed NFS mount).
2013-05-20 13:01:10 -07:00 · 2013-05-20 13:01:10 -07:00 · 9049737d1f
parent 7472e29495
commit 9049737d1f
3 changed files with 33 additions and 13 deletions
--- a/sched/credit.cpp
+++ b/sched/credit.cpp
@ -859,11 +859,19 @@ double vec_min(vector<double>& v) {
 }

 // Called by validator when canonical result has been selected.
-// Compute credit for valid instances.
-// This is called exactly once for each valid result.
+// For each valid result in the list:
+// - calculate a peak FLOP count (PFC) and a "mode" that indicates
+//   our confidence in the PFC
+// - update the statistics of PFC in host_app_version and app_version
+// - Compute a credit value based on a weighted average of
+//   the PFCs of valid results
+//   (this value can be used or ignored by the caller)
+//
+// This must be called exactly once for each valid result.
 //
 int assign_credit_set(
-    WORKUNIT &wu, vector<RESULT>& results,
+    WORKUNIT &wu,
+    vector<RESULT>& results,
    DB_APP &app,
    vector<DB_APP_VERSION>& app_versions,
    vector<DB_HOST_APP_VERSION>& host_app_versions,
@ -942,8 +950,7 @@ int assign_credit_set(
        if (approx.size()) {
            x = pegged_average(approx, wu_estimated_pfc(wu, app));
        } else {
-            // there were only PFC_MODE_INVALID results, so
-            // we guess
+            // there were only PFC_MODE_INVALID results, so we guess
            x = wu_estimated_pfc(wu, app);
        }
        break;
--- a/sched/validate_util2.cpp
+++ b/sched/validate_util2.cpp
@ -37,16 +37,27 @@

 using std::vector;

-// Given a set of results, check for a canonical result,
+// Given a set of results:
+// 1) call init_result() for each one;
+//    this detects results with bad or missing output files
+// 2) if # of good results is >= wu.min_quorum,
+//    check for a canonical result,
 //    i.e. a set of at least min_quorum/2+1 results for which
 //    that are equivalent according to check_pair().
 //
-// invariants:
-// results.size() >= wu.min_quorum
+// input invariants:
 // for each result:
 //   result.outcome == SUCCESS
 //   result.validate_state == INIT
 //
+// Outputs:
+// canonicalid: the ID of canonical result, if any
+// result.outcome, result.validate_state
+//    modified; caller must update DB
+// retry: set to true if some result had a transient failure
+//    (e.g. init_result() failed because of a broken NFS mount).
+//    The caller should call check_set() again after a while.
+//
 int check_set(
    vector<RESULT>& results, WORKUNIT& wu,
    int& canonicalid, double&, bool& retry
@ -75,6 +86,7 @@ int check_set(
                "check_set: init_result([RESULT#%d %s]) transient failure\n",
                results[i].id, results[i].name
            );
+            retry = true;
            had_error[i] = true;
        } else if (retval) {
            log_messages.printf(MSG_CRITICAL,
--- a/sched/validator.cpp
+++ b/sched/validator.cpp
@ -103,7 +103,8 @@ bool no_credit = false;

 WORKUNIT* g_wup;
 vector<DB_APP_VERSION> app_versions;
-    // cache of app_versions; used by v2 credit system
+    // cache of app_versions; the PFC statistics of these are
+    // updated in memory, and periodically flushed to the DB

 bool is_unreplicated(WORKUNIT& wu) {
    return (wu.target_nresults == 1 && app.target_nresults > 1);