diff --git a/checkin_notes b/checkin_notes
index 7d7636b07e..329b7df3d4 100755
--- a/checkin_notes
+++ b/checkin_notes
@@ -21068,3 +21068,24 @@ David 10 Dec 2004
and with the shared library in the non-monolithic case)
graphics_lib.C,h (new)
(link this with the main program in the non-monolithic case)
+
+David 10 Dec 2004
+ - Fixed a bug in the validator that could cause WUs to
+ get stuck in a state where they have no canonical result,
+ no results in progress, and no transition scheduled.
+
+ 1) After calling check_set(), count the number "nsuccess_results"
+ of results that still have outcome == SUCCESS
+ (this may have changed, since check_set() can change
+ the outcome to VALIDATE_ERROR).
+
+ 2) If check_set() fails to find a canonical result,
+ always schedule an immediate transition.
+
+ 3) If nsuccess_results >= target_nresults,
+ set target_nresults to nsuccess_results+1.
+ This ensures that the transitioner will make a new result,
+ even in pathological cases (e.g. nsuccess_results > target_nresults).
+
+ sched/
+ transitioner.C, validator.C
diff --git a/doc/validate.php b/doc/validate.php
index fe47b6a1eb..e257d900b4 100644
--- a/doc/validate.php
+++ b/doc/validate.php
@@ -32,17 +32,18 @@ If, when an output file for a result has a nonrecoverable error
then it must set the result's outcome (in memory, not database)
to VALIDATE_ERROR.
Note: the function try_fopen() (in lib/util.C) can be used
-to detect recoverable/nonrecoverable errors.
+to distinguish recoverable and nonrecoverable file-open errors.
If a canonical result is found, check_set() must set the
-validate_state field of each non-ERROR result to either VALID or INVALID.
+validate_state field of each non-ERROR result
+(in memory, not database) to either VALID or INVALID.
If a recoverable error occurs while reading output files
(e.g. a directory wasn't visible due to NFS mount failure)
then check_set() should return retry=true.
This tells the validator to arrange for this WU to be
-examined again in a few hours.
+processed again in a few hours.
check_set() should return nonzero if a major error occurs.
This tells the validator to write an error message and exit.
diff --git a/sched/validator.C b/sched/validator.C
index 5f2685abd1..433364c3b7 100644
--- a/sched/validator.C
+++ b/sched/validator.C
@@ -272,6 +272,7 @@ void handle_wu(
}
} else {
vector results;
+ int nsuccess_results;
// Here if WU doesn't have a canonical result yet.
// Try to get one
@@ -283,13 +284,12 @@ void handle_wu(
);
++log_messages;
- // make a vector of only successful, unvalidated results
+ // make a vector of only successful results
//
for (i=0; i wu.max_success_results) {
+ if (nsuccess_results > wu.max_success_results) {
wu.error_mask |= WU_ERROR_TOO_MANY_SUCCESS_RESULTS;
- need_immediate_transition = true;
}
- // if #success results is target_nresults, bump it up
+
+ // if #success results == target_nresults,
+ // we need more results, so bump target_nresults
+ // NOTE: nsuccess_results should never be > target_nresults,
+ // but accommodate that if it should happen
//
- if ((int)results.size() == wu.target_nresults) {
- wu.target_nresults++;
- need_immediate_transition = true;
+ if (nsuccess_results >= wu.target_nresults) {
+ wu.target_nresults = nsuccess_results+1;
}
}
}