*** empty log message ***

svn path=/trunk/boinc/; revision=4172
This commit is contained in:
David Anderson 2004-09-09 21:52:20 +00:00
parent d6c2901e26
commit 860ef1a1e6
4 changed files with 170 additions and 130 deletions

View File

@ -17131,58 +17131,86 @@ Rom 8 Sep 2004
file_names.C
Jeff 09 Sep 2004
- Change to the validator to handle the case where one or more of the result files
for a WU's result set are missing. Before this change this state would result
in this WU and its results never getting looked at again. No science, no
credit.
- Change the validator to handle the case where one or more
of the result files for a WU's result set can't be read.
Currently, this case results in this WU and its results
never getting looked at again - no science, no credit.
This is a coordinated seti_boinc / boinc change. This note is repeated in
the seti_boinc checkin_notes.
We distinguish 2 cases:
1) transient failure, e.g. an NFS mount is not working.
(can't read directory)
In this case check_set() returns a special code to the validator,
which arranges for the WU to get transitioned again in a few hours.
2) permanent failure (file doesn't exist)
The result's validate_state is set to ERROR (new state),
which is generally equivalent to INVALID
- add new WU arg to check_set()
(so it can access min_quorum)
This is a coordinated seti_boinc / boinc change.
This note is repeated in the seti_boinc checkin_notes.
seti_boinc changes:
- get_result_file(), if it cannot read the result file, tries to read the
directory where the result file should be. If it can read the directory,
it returns ERR_FOPEN, as before. If it cannot read the directory, it
returns ERR_OPENDIR. ERR_OPENDIR in this context signals a possibly
transient problem. Upon a successful result read, it sets sah_result.have_result
- get_result_file(), if it cannot read the result file,
tries to read the directory where the result file should be.
If it can read the directory, it returns ERR_FOPEN, as before.
If it cannot read the directory, it returns ERR_OPENDIR.
ERR_OPENDIR in this context signals a possibly transient problem.
Upon a successful result read, it sets sah_result.have_result
(a new data member as of this change) to true.
- check_set(), upon return from get_result_file() :
- sets result.validate_state to VALIDATE_STATE_ERROR (a new state
as of this change) and retval to zero if there is a nonzero
retval from check_set() and it is not ERR_OPENDIR. It then
continues.
- otherwise just continues.
- sets result.validate_state to VALIDATE_STATE_ERROR (a new state)
and retval to zero if there is a nonzero retval from check_set()
and it is not ERR_OPENDIR. It then continues.
- otherwise just continues.
check_set() then determines if any IO errors brought the result count
below wu.min_quorum. If not, it continues. Otherwise it returns.
In all subsequent logic any results for which sah_result.have_result == false
are skipped. Note that the result and the sah_result vectors are associated.
In all subsequent logic any results for which
sah_result.have_result == false are skipped.
Note that the result and the sah_result vectors are associated.
validate/
sah_result.h
sah_validate.cpp
sah_boinc_db.cpp
sah_result.h
sah_validate.cpp
sah_boinc_db.cpp
boinc changes:
- new validate state VALIDATE_STATE_ERROR.
- in the enumeration for check_set(), we now include a clause to check for
VALIDATE_STATE_INIT in order to ignore results set to VALIDATE_STATE_ERROR.
- in the enumeration for check_set(),
we now include a clause to check for VALIDATE_STATE_INIT
in order to ignore results set to VALIDATE_STATE_ERROR.
This enumeration query now matches that for check_pair().
Note that the only possible retvals from check_set() are now zero or
ERR_OPENDIR.
Note that logic to retry possibly transient errors still needs to be done.
Also left to be done is a transitioner change to subtract the number of
results in state VALIDATE_STATE_ERROR from the count of active results.
The latter change is needed to stimulate the production of additional results
for the affected WU.
Note that the only possible retvals from check_set()
are now zero or ERR_OPENDIR.
Note that logic to retry possibly transient errors still needs
to be done.
Also left to be done is a transitioner change to subtract
the number of results in state VALIDATE_STATE_ERROR
from the count of active results.
The latter change is needed to stimulate the production
of additional results for the affected WU.
sched/
validator.C
validator.C
db/
boinc_db.h
boinc_db.h
Rom 8 Sep 2004 ( for Sebastian Masch )
- Don't cause the core client to rewrite the state file for every block transferred
during a network operation.
Rom 9 Sep 2004 ( for Sebastian Masch )
- Don't cause the core client to rewrite the state file
for every block transferred during a network operation.
client/
pers_file_xfer.C
David 9 Sept 2004
- Further refinements of validation to deal with
recoverable/nonrecoverable file read errors.
Revised the signatures of check_set() and check_pair(),
and revised the validator accordingly.
db/
boinc_db.h
sched/
sample_trivial_validator.C
validator.C

View File

@ -359,10 +359,14 @@ struct WORKUNIT {
#define RESULT_OUTCOME_SUCCESS 1
#define RESULT_OUTCOME_COULDNT_SEND 2
#define RESULT_OUTCOME_CLIENT_ERROR 3
// an error happened on the client
#define RESULT_OUTCOME_NO_REPLY 4
#define RESULT_OUTCOME_DIDNT_NEED 5
// we created the result but didn't need to send it because we already
// got a quorum
#define RESULT_OUTCOME_VALIDATE_ERROR 6
// The outcome was initially SUCCESS, but the validator
// had a permanent error reading a result file
#define VALIDATE_STATE_INIT 0
#define VALIDATE_STATE_VALID 1
@ -370,8 +374,6 @@ struct WORKUNIT {
#define VALIDATE_STATE_NO_CHECK 3
// WU had error, so we'll never get around to validating its results
// This lets us avoid showing the claimed credit as "pending"
#define VALIDATE_STATE_ERROR 4
// Validation encountered an internal error (eg, file IO error).
struct RESULT {
int id;

View File

@ -32,10 +32,10 @@ int init_result_trivial(RESULT const& result, void*& data) {
int check_pair_initialized_trivial(
RESULT const& r1, void* /*data1*/,
RESULT const& r2, void* /*data2*/,
bool& match
RESULT const& r2, void* /*data2*/
) {
match = (r1.cpu_time >= MIN_CPU_TIME && r2.cpu_time >= MIN_CPU_TIME);
bool match = (r1.cpu_time >= MIN_CPU_TIME && r2.cpu_time >= MIN_CPU_TIME);
r1.validate_state = match?VALIDATE_STATE_VALID:VALIDATE_STATE_INVALID;
return 0;
}
@ -43,7 +43,8 @@ int cleanup_result_trivial(RESULT const&, void*) {
return 0;
}
int check_set(vector<RESULT>& results, int& canonicalid, double& credit) {
int check_set(vector<RESULT>& results, int& canonicalid, double& credit, bool& retry) {
retry = false;
return generic_check_set_majority(
results, canonicalid, credit,
init_result_trivial,
@ -55,8 +56,7 @@ int check_set(vector<RESULT>& results, int& canonicalid, double& credit) {
int check_pair(RESULT const& r1, RESULT const& r2, bool& match) {
return check_pair_initialized_trivial(
r1, NULL,
r2, NULL,
match
r2, NULL
);
}

View File

@ -18,26 +18,15 @@
//
//
// validate - check and validate new results, and grant credit
// validator - check and validate new results, and grant credit
// -app appname
// [-d debug_level]
// [-one_pass] // make one pass through WU table, then exit
// [-asynch] // fork, run in separate process
//
// This program must be linked with two project-specific functions:
//
// int check_set(vector<RESULT>, int& canonical, double& credit)
// Compare a set of results.
// If a canonical result is found, return its ID,
// and set the "validate_state" field of all the results
// according to whether they match the canonical result.
// Also return the "canonical credit" (e.g. the average or median)
//
// int pair_check(RESULT& new_result, RESULT& canonical, bool& valid);
// return valid=true iff the new result matches the canonical one
//
// Both functions return nonzero if an error occurred,
// in which case other outputs are undefined
// check_set() and check_pair().
// See doc/validate.php for a description.
using namespace std;
@ -47,6 +36,7 @@ using namespace std;
#include "boinc_db.h"
#include "util.h"
#include "error_numbers.h"
#include "sched_config.h"
#include "sched_util.h"
#include "sched_msgs.h"
@ -54,8 +44,13 @@ using namespace std;
#define LOCKFILE "validate.out"
#define PIDFILE "validate.pid"
extern int check_set(vector<RESULT>&, DB_WORKUNIT& wu, int& canonical, double& credit);
extern int check_pair(RESULT const&, RESULT const&, bool&);
extern int check_set(
vector<RESULT>&, DB_WORKUNIT& wu, int& canonical, double& credit,
bool& retry
);
extern int check_pair(
RESULT const& new_result, RESULT const& canonical_result, bool& retry
);
SCHED_CONFIG config;
char app_name[256];
@ -137,7 +132,9 @@ int grant_credit(DB_RESULT& result, double credit) {
void handle_wu(DB_WORKUNIT& wu) {
DB_RESULT result, canonical_result;
bool match, update_result, need_transition = false;
bool update_result, retry;
bool canonical_result_missing = false;
bool need_immediate_transition = false, need_delayed_transition = false;
int retval, canonicalid = 0;
double credit;
unsigned int i;
@ -155,14 +152,20 @@ void handle_wu(DB_WORKUNIT& wu) {
// Get unchecked results and see if they match the canonical result
//
retval = canonical_result.lookup_id(wu.canonical_resultid);
if (retval) {
if (retval == ERR_DB_NOT_FOUND) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
"[WU#%d %s] Can't read canonical result; marking as validated: %d\n",
"[WU#%d %s] Canonical result not in DB %d",
wu.id, wu.name, retval
);
// Mark this WU as validated, otherwise we'll keep checking it
goto mark_validated;
canonical_result_missing = true;
} else if (retval) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
"[WU#%d %s] Can't read canonical result %d; exiting",
wu.id, wu.name, retval
);
exit(retval);
}
// scan this WU's results, and check the unchecked ones
@ -173,63 +176,56 @@ void handle_wu(DB_WORKUNIT& wu) {
wu.id, VALIDATE_STATE_INIT, RESULT_SERVER_STATE_OVER, RESULT_OUTCOME_SUCCESS
);
while (!result.enumerate(buf)) {
need_transition = true;
need_immediate_transition = true;
// it's possible that we've deleted canonical result outputs
//
if (canonical_result.file_delete_state == FILE_DELETE_DONE) {
log_messages.printf(
SCHED_MSG_LOG::DEBUG,
"[WU#%d]: Canonical result (%d) has been deleted\n",
wu.id, canonical_result.id
);
match = false;
retval = 0;
} else {
retval = check_pair(result, canonical_result, match);
}
retval = check_pair(
result, canonical_result, retry
);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::DEBUG,
"[RESULT#%d %s]: pair_check() failed for result: %d\n",
result.id, result.name, retval
);
continue;
} else {
if (match) {
result.validate_state = VALIDATE_STATE_VALID;
result.granted_credit = wu.canonical_credit;
log_messages.printf(
SCHED_MSG_LOG::NORMAL,
"[RESULT#%d %s] pair_check() matched: setting result to valid; credit %f\n",
result.id, result.name, result.granted_credit
);
} else {
result.validate_state = VALIDATE_STATE_INVALID;
log_messages.printf(
SCHED_MSG_LOG::NORMAL,
"[RESULT#%d %s] pair_check() didn't match: setting result to invalid\n",
result.id, result.name
);
}
exit(retval);
}
retval = result.update();
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
"[RESULT#%d %s] Can't update result: %d\n",
result.id, result.name, retval
);
continue;
}
retval = grant_credit(result, result.granted_credit);
if (retval) {
if (retry) need_delayed_transition = true;
update_result = false;
switch (result.validate_state) {
case VALIDATE_STATE_VALID:
update_result = true;
result.granted_credit = wu.canonical_credit;
log_messages.printf(
SCHED_MSG_LOG::NORMAL,
"[RESULT#%d %s] Can't grant credit: %d\n",
result.id, result.name, retval
"[RESULT#%d %s] pair_check() matched: setting result to valid; credit %f\n",
result.id, result.name, result.granted_credit
);
continue;
retval = grant_credit(result, result.granted_credit);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::NORMAL,
"[RESULT#%d %s] Can't grant credit: %d\n",
result.id, result.name, retval
);
}
break;
case VALIDATE_STATE_INVALID:
update_result = true;
log_messages.printf(
SCHED_MSG_LOG::NORMAL,
"[RESULT#%d %s] pair_check() didn't match: setting result to invalid\n",
result.id, result.name
);
}
if (update_result) {
retval = result.update();
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
"[RESULT#%d %s] Can't update result: %d\n",
result.id, result.name, retval
);
}
}
}
} else {
@ -266,9 +262,18 @@ void handle_wu(DB_WORKUNIT& wu) {
SCHED_MSG_LOG::DEBUG,
"[WU#%d %s] Enough for quorum, checking set.\n", wu.id, wu.name
);
retval = check_set(results, wu, canonicalid, credit);
if (!retval && canonicalid) {
need_transition = true;
retval = check_set(results, wu, canonicalid, credit, retry);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
"[WU#%d %s] check_set returned %d, exiting",
wu.id, wu.name, retval
);
exit(retval);
}
if (retry) need_delayed_transition = true;
if (canonicalid) {
need_immediate_transition = true;
log_messages.printf(
SCHED_MSG_LOG::DEBUG,
"[WU#%d %s] Found a canonical result: id=%d\n",
@ -279,12 +284,15 @@ void handle_wu(DB_WORKUNIT& wu) {
wu.assimilate_state = ASSIMILATE_READY;
for (i=0; i<results.size(); i++) {
result = results[i];
update_result = false;
// skip results that had file-read errors
//
if (result.outcome != RESULT_OUTCOME_SUCCESS) continue;
if (result.validate_state == VALIDATE_STATE_INIT) continue;
// grant credit for valid results
//
if (result.validate_state == VALIDATE_STATE_VALID) {
update_result = true;
retval = grant_credit(result, credit);
if (retval) {
log_messages.printf(
@ -301,19 +309,19 @@ void handle_wu(DB_WORKUNIT& wu) {
);
}
if (update_result) {
retval = result.update();
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
"[RESULT#%d %s] result.update() failed: %d\n",
result.id, result.name, retval
);
}
retval = result.update();
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
"[RESULT#%d %s] result.update() failed: %d\n",
result.id, result.name, retval
);
}
}
// don't send any unsent results
// If found a canonical result, don't send any unsent results
// TODO: could do this in a single SQL statement
//
sprintf(buf, "where workunitid=%d and server_state=%d",
wu.id, RESULT_SERVER_STATE_UNSENT
);
@ -334,7 +342,7 @@ void handle_wu(DB_WORKUNIT& wu) {
//
if ((int)results.size() > wu.max_success_results) {
wu.error_mask |= WU_ERROR_TOO_MANY_SUCCESS_RESULTS;
need_transition = true;
need_immediate_transition = true;
}
}
}
@ -342,13 +350,15 @@ void handle_wu(DB_WORKUNIT& wu) {
--log_messages;
mark_validated:
if (need_transition) {
if (need_immediate_transition) {
wu.transition_time = time(0);
}
if (need_delayed_transition) {
int x = time(0) + 6*3600;
if (x < wu.transition_time) wu.transition_time = x;
}
// we've checked all results for this WU, so turn off flag
// clear WU.need_validate
//
wu.need_validate = 0;
retval = wu.update();