From 9049737d1f66f5e9ef77d741092a827435bd3d2b Mon Sep 17 00:00:00 2001
From: David Anderson <davea@ssl.berkeley.edu>
Date: Mon, 20 May 2013 13:01:10 -0700
Subject: [PATCH] validator: retry if transient failure

check_set() wasn't returning "retry" properly in the case where
one of the calls to init_result() returns ERR_OPEN_DIR
(treated as a transient failure, since it can be caused by a failed NFS mount)
---
 sched/credit.cpp         | 21 ++++++++++++++-------
 sched/validate_util2.cpp | 22 +++++++++++++++++-----
 sched/validator.cpp      |  3 ++-
 3 files changed, 33 insertions(+), 13 deletions(-)
diff --git a/sched/credit.cpp b/sched/credit.cpp
index 7199a3374d..4045baf629 100644
--- a/sched/credit.cpp
+++ b/sched/credit.cpp
@@ -859,11 +859,19 @@ double vec_min(vector<double>& v) {
 }
 
 // Called by validator when canonical result has been selected.
-// Compute credit for valid instances.
-// This is called exactly once for each valid result.
+// For each valid result in the list:
+// - calculate a peak FLOP count (PFC) and a "mode" that indicates
+//   our confidence in the PFC
+// - update the statistics of PFC in host_app_version and app_version
+// - Compute a credit value based on a weighted average of
+//   the PFCs of valid results
+//   (this value can be used or ignored by the caller)
+//
+// This must be called exactly once for each valid result.
 //
 int assign_credit_set(
-    WORKUNIT &wu, vector<RESULT>& results,
+    WORKUNIT &wu,
+    vector<RESULT>& results,
     DB_APP &app,
     vector<DB_APP_VERSION>& app_versions,
     vector<DB_HOST_APP_VERSION>& host_app_versions,
@@ -940,15 +948,14 @@ int assign_credit_set(
         // fall through
     case 0:
         if (approx.size()) {
-            x = pegged_average(approx,wu_estimated_pfc(wu, app));
+            x = pegged_average(approx, wu_estimated_pfc(wu, app));
         } else {
-            // there were only PFC_MODE_INVALID results, so
-            // we guess
+            // there were only PFC_MODE_INVALID results, so we guess
             x = wu_estimated_pfc(wu, app);
         }
         break;
     default:
-        x = pegged_average(normal,wu_estimated_pfc(wu, app));
+        x = pegged_average(normal, wu_estimated_pfc(wu, app));
         break;
     }
 
diff --git a/sched/validate_util2.cpp b/sched/validate_util2.cpp
index 8558b8ad69..a9e54c0a49 100644
--- a/sched/validate_util2.cpp
+++ b/sched/validate_util2.cpp
@@ -37,16 +37,27 @@
 
 using std::vector;
 
-// Given a set of results, check for a canonical result,
-// i.e. a set of at least min_quorum/2+1 results for which
-// that are equivalent according to check_pair().
+// Given a set of results:
+// 1) call init_result() for each one;
+//    this detects results with bad or missing output files
+// 2) if # of good results is >= wu.min_quorum,
+//    check for a canonical result,
+//    i.e. a set of at least min_quorum/2+1 results
+//    that are equivalent according to check_pair().
 //
-// invariants:
-// results.size() >= wu.min_quorum
+// input invariants:
 // for each result:
 //   result.outcome == SUCCESS
 //   result.validate_state == INIT
 //
+// Outputs:
+// canonicalid: the ID of canonical result, if any
+// result.outcome, result.validate_state
+//    modified; caller must update DB
+// retry: set to true if some result had a transient failure
+//    (e.g. there was a broken NFS mount).
+//    The caller should call this function again after a while.
+//
 int check_set(
     vector<RESULT>& results, WORKUNIT& wu,
     int& canonicalid, double&, bool& retry
@@ -75,6 +86,7 @@ int check_set(
                 "check_set: init_result([RESULT#%d %s]) transient failure\n",
                 results[i].id, results[i].name
             );
+            retry = true;
             had_error[i] = true;
         } else if (retval) {
             log_messages.printf(MSG_CRITICAL,
diff --git a/sched/validator.cpp b/sched/validator.cpp
index 81b39c2581..0d6e7e8216 100644
--- a/sched/validator.cpp
+++ b/sched/validator.cpp
@@ -103,7 +103,8 @@ bool no_credit = false;
 
 WORKUNIT* g_wup;
 vector<DB_APP_VERSION> app_versions;
-    // cache of app_versions; used by v2 credit system
+    // cache of app_versions; the PFC statistics of these are
+    // updated in memory, and periodically flushed to the DB
 
 bool is_unreplicated(WORKUNIT& wu) {
     return (wu.target_nresults == 1 && app.target_nresults > 1);