file deletion and state

svn path=/trunk/boinc/; revision=823
This commit is contained in:
David Anderson 2003-01-23 08:07:48 +00:00
parent d82670df96
commit 46f84c5bbf
29 changed files with 985 additions and 644 deletions

View File

@ -2981,3 +2981,51 @@ David Jan 15 2003
test.inc
tools/
backend_lib.C
David Jan 22 2003
- Changed some of the state fields of workunit and result,
to accommodate new policies about when to delete files
See doc/backend_* for details
- Split off dummy assimilation function from the main program.
- Added new stripchart scripts for logging DB counts
and directory sizes
NOTE: it looks like shared memory segment is at same address
in feeder and cgi. So it's OK to use pointers in it.
db/
db.h
db_mysql.C
schema.sql
doc/
backend.html (removed)
backend_functions.html (new)
backend_programs.html (new)
backend_state.html (new)
backend_work_sequence.html (new)
create_project.html
garbage.html (removed)
retry.html (removed)
work.html
work_states.html (removed)
html_ops/
db.inc
db.php
sched/
Makefile.in
assimilate_handler.C,h (new)
assimilator.C
handle_request.C
result_retry.C
validate.C
stripchart/samples/
datafiles
db_count (new)
db_looper (new)
dir_size (new)
test/
test.inc
test_loop.php
tools/
backend_lib.C
create_work.C

67
db/db.h
View File

@ -183,11 +183,6 @@ struct HOST {
int parse_net_stats(FILE*);
};
// values for main_state
#define WU_MAIN_STATE_INIT 0
#define WU_MAIN_STATE_DONE 1
#define WU_MAIN_STATE_ERROR 2
// values for file_delete state
#define FILE_DELETE_INIT 0
#define FILE_DELETE_READY 1
@ -198,13 +193,15 @@ struct HOST {
#define ASSIMILATE_READY 1
#define ASSIMILATE_DONE 2
// values for error
#define SEND_FAIL 1
// failed to send results for this WU
#define TOO_MANY_ERRORS 2
// too many errors; may have bug
#define TOO_MANY_DONE 3
// too many results without consensus; may be nondeterministic
// NOTE: there is no overall state for a WU
// (like done/not done)
// There's just a bunch of independent substates
// (file delete, assimilate, and states of results, error flags)
// bit fields of error_mask
#define WU_ERROR_COULDNT_SEND_RESULT 1
#define WU_ERROR_TOO_MANY_ERROR_RESULTS 2
#define WU_ERROR_TOO_MANY_RESULTS 4
struct WORKUNIT {
int id;
@ -218,13 +215,14 @@ struct WORKUNIT {
double rsc_memory; // estimated size of RAM working set (bytes)
double rsc_disk; // estimated amount of disk needed (bytes)
bool need_validate; // this WU has at least 1 result in
// VALIDATE_STATE_NEED_CHECK state
// validate state = NEED_CHECK
int canonical_resultid; // ID of canonical result, or zero
double canonical_credit; // credit that all correct results get
double retry_check_time; // when to check for result retry
int delay_bound; // determines result deadline, retry check time
int main_state; // see values above
int error;
unsigned int timeout_check_time; // when to check for timeouts
// zero if no need to check
int delay_bound; // determines result deadline,
// timeout check time
int error_mask; // bitmask of errors (see above)
int file_delete_state;
int assimilate_state;
int workseq_next; // if part of a sequence, the next WU
@ -235,30 +233,39 @@ struct WORKUNIT {
#define RESULT_SERVER_STATE_INACTIVE 1
#define RESULT_SERVER_STATE_UNSENT 2
#define RESULT_SERVER_STATE_IN_PROGRESS 3
#define RESULT_SERVER_STATE_DONE 4
#define RESULT_SERVER_STATE_TIMEOUT 5
#define RESULT_SERVER_STATE_ERROR 6
#define RESULT_SERVER_STATE_UNSENT_SEQ 7
#define RESULT_SERVER_STATE_UNSENT_SEQ 3
// unsent, part of a work sequence
#define RESULT_SERVER_STATE_IN_PROGRESS 4
#define RESULT_SERVER_STATE_OVER 5
// we received a reply, timed out, or decided not to send.
// Note: we could get a reply even after timing out.
#define VALIDATE_STATE_INITIAL 0
#define VALIDATE_STATE_NEED_CHECK 1
#define VALIDATE_STATE_VALID 2
#define VALIDATE_STATE_INVALID 3
#define RESULT_OUTCOME_INIT 0
#define RESULT_OUTCOME_SUCCESS 1
#define RESULT_OUTCOME_COULDNT_SEND 2
#define RESULT_OUTCOME_CLIENT_ERROR 3
#define RESULT_OUTCOME_NO_REPLY 4
#define RESULT_OUTCOME_DIDNT_NEED 5
#define VALIDATE_STATE_INIT 0
#define VALIDATE_STATE_VALID 1
#define VALIDATE_STATE_INVALID 2
struct RESULT {
int id;
unsigned int create_time;
int workunitid;
int server_state; // server state (see above)
int server_state; // see above
int outcome; // see above; defined if server state OVER
int client_state; // phase when client error happened
// (download, compute, upload)
// Defined if outcome is CLIENT_ERROR
// and error details are in stderr_out
int hostid; // host processing this result
unsigned int report_deadline; // deadline for receiving result
unsigned int sent_time; // when result was sent to host
unsigned int received_time; // when result was received from host
char name[256];
int client_state; // records phase when error happened
// (download, compute, upload)
double cpu_time; // CPU time used to complete result
char xml_doc_in[MAX_BLOB_SIZE]; // descriptions of output files
char xml_doc_out[MAX_BLOB_SIZE]; // MD5s of output files
@ -333,7 +340,7 @@ extern int db_workunit(int id, WORKUNIT&);
extern int db_workunit_update(WORKUNIT& p);
extern int db_workunit_lookup_name(WORKUNIT&);
extern int db_workunit_enum_app_need_validate(WORKUNIT&);
extern int db_workunit_enum_retry_check_time(WORKUNIT&);
extern int db_workunit_enum_timeout_check_time(WORKUNIT&);
extern int db_workunit_enum_file_delete_state(WORKUNIT&);
extern int db_workunit_enum_app_assimilate_state(WORKUNIT&);

View File

@ -228,31 +228,33 @@ void BOINC_MYSQL_DB::struct_to_str(void* vp, char* q, int type) {
"rsc_fpops=%f, rsc_iops=%f, rsc_memory=%f, rsc_disk=%f, "
"need_validate=%d, "
"canonical_resultid=%d, canonical_credit=%f, "
"retry_check_time=%f, delay_bound=%d, main_state=%d, "
"error=%d, file_delete_state=%d, assimilate_state=%d, "
"timeout_check_time=%d, delay_bound=%d, "
"error_mask=%d, file_delete_state=%d, assimilate_state=%d, "
"workseq_next=%d",
wup->id, wup->create_time, wup->appid,
wup->name, wup->xml_doc, wup->batch,
wup->rsc_fpops, wup->rsc_iops, wup->rsc_memory, wup->rsc_disk,
wup->need_validate,
wup->canonical_resultid, wup->canonical_credit,
wup->retry_check_time, wup->delay_bound, wup->main_state,
wup->error, wup->file_delete_state, wup->assimilate_state,
wup->timeout_check_time, wup->delay_bound,
wup->error_mask, wup->file_delete_state, wup->assimilate_state,
wup->workseq_next
);
break;
case TYPE_RESULT:
rp = (RESULT*)vp;
sprintf(q,
"id=%d, create_time=%d, workunitid=%d, server_state=%d, "
"id=%d, create_time=%d, workunitid=%d, "
"server_state=%d, outcome=%d, client_state=%d, "
"hostid=%d, report_deadline=%d, sent_time=%d, received_time=%d, "
"name='%s', client_state=%d, cpu_time=%f, "
"name='%s', cpu_time=%f, "
"xml_doc_in='%s', xml_doc_out='%s', stderr_out='%s', "
"batch=%d, file_delete_state=%d, validate_state=%d, "
"claimed_credit=%f, granted_credit=%f",
rp->id, rp->create_time, rp->workunitid, rp->server_state,
rp->id, rp->create_time, rp->workunitid,
rp->server_state, rp->outcome, rp->client_state,
rp->hostid, rp->report_deadline, rp->sent_time, rp->received_time,
rp->name, rp->client_state, rp->cpu_time,
rp->name, rp->cpu_time,
rp->xml_doc_in, rp->xml_doc_out, rp->stderr_out,
rp->batch, rp->file_delete_state, rp->validate_state,
rp->claimed_credit, rp->granted_credit
@ -411,10 +413,9 @@ void BOINC_MYSQL_DB::row_to_struct(MYSQL_ROW& r, void* vp, int type) {
wup->need_validate = atoi(r[i++]);
wup->canonical_resultid = atoi(r[i++]);
wup->canonical_credit = atof(r[i++]);
wup->retry_check_time = atof(r[i++]);
wup->timeout_check_time = atoi(r[i++]);
wup->delay_bound = atoi(r[i++]);
wup->main_state = atoi(r[i++]);
wup->error = atoi(r[i++]);
wup->error_mask = atoi(r[i++]);
wup->file_delete_state = atoi(r[i++]);
wup->assimilate_state = atoi(r[i++]);
wup->workseq_next = atoi(r[i++]);
@ -426,12 +427,13 @@ void BOINC_MYSQL_DB::row_to_struct(MYSQL_ROW& r, void* vp, int type) {
rp->create_time = atoi(r[i++]);
rp->workunitid = atoi(r[i++]);
rp->server_state = atoi(r[i++]);
rp->outcome = atoi(r[i++]);
rp->client_state = atoi(r[i++]);
rp->hostid = atoi(r[i++]);
rp->report_deadline = atoi(r[i++]);
rp->sent_time = atoi(r[i++]);
rp->received_time = atoi(r[i++]);
strcpy2(rp->name, r[i++]);
rp->client_state = atoi(r[i++]);
rp->cpu_time = atof(r[i++]);
strcpy2(rp->xml_doc_in, r[i++]);
strcpy2(rp->xml_doc_out, r[i++]);
@ -655,14 +657,14 @@ int db_workunit_enum_app_assimilate_state(WORKUNIT& p) {
return boinc_db.db_enum(e, &p, TYPE_WORKUNIT, buf);
}
int db_workunit_enum_retry_check_time(WORKUNIT& p) {
int db_workunit_enum_timeout_check_time(WORKUNIT& p) {
static ENUM e;
char buf[256];
if (!e.active) {
sprintf(buf,
"where appid=%d and retry_check_time > 0 and retry_check_time < %f",
p.appid, p.retry_check_time
"where appid=%d and timeout_check_time > 0 and timeout_check_time < %d",
p.appid, p.timeout_check_time
);
}
return boinc_db.db_enum(e, &p, TYPE_WORKUNIT, buf);

View File

@ -135,10 +135,9 @@ create table workunit (
need_validate smallint not null,
canonical_resultid integer not null,
canonical_credit double not null,
retry_check_time double not null,
timeout_check_time integer not null,
delay_bound integer not null,
main_state integer not null,
error integer not null,
error_mask integer not null,
file_delete_state integer not null,
assimilate_state integer not null,
workseq_next integer not null,
@ -150,12 +149,13 @@ create table result (
create_time integer not null,
workunitid integer not null,
server_state integer not null,
outcome integer not null,
client_state integer not null,
hostid integer not null,
report_deadline integer not null,
sent_time integer not null,
received_time integer not null,
name varchar(254) not null,
client_state integer not null,
cpu_time double not null,
xml_doc_in blob,
xml_doc_out blob,

View File

@ -1,99 +0,0 @@
<title>The components of a project back end</title>
<h2>The components of a project back end</h2>
<p>
The <b>back end</b> of a BOINC project consists
of a number of components that together are responsible
for generating work and handling results.
Some of these components are supplied by BOINC;
other parts are project- or application-specific:
<br>
<img vspace=10 src=backend.png>
<br>
<p>
<table border=1 cellpadding=8>
<tr>
<th>Component</th>
<th>BOINC-supplied part</th>
<th>project-supplied part</th>
</tr>
<tr>
<td valign=top>
<b>Work generator</b>: generates work units, work sequences, results,
and the corresponding input files.
</td>
<td valign=top>
Functions and programs that handle the details of
creating workunit, work sequence, and result database records.
</td>
<td valign=top>
Programs or scripts that generate input files,
install them on data servers,
and call the BOINC functions.
</td></tr>
<tr>
<td valign=top><b>Result retry generator</b>:
generates additional results if some of those initially sent are lost,
not completed, or completed erroneously.</td>
<td valign=top>A program, <b>result_retry</b>, that does the job.</td>
<td valign=top>Some parameters used by result_retry.</td>
</tr>
<tr>
<td valign=top><b>Result validation and accounting</b>:
compare redundant results; select a <b>canonical result</b>
representing the correct output,
and a <b>canonical credit</b> granted to users and hosts
that return the correct output.</td>
<td valign=top>A program, <b>validate</b>, that contains the
basic logic for validation.</td>
<td valign=top>An application-specific function, linked with <b>validate</b>,
that compares sets of redundant results.</td>
</tr>
<tr>
<td valign=top><b>Assimilator</b>: parse output files of
completed results, and take appropriate action
(record results in a database, and/or generate more work)</td>
<td valign=top>
A main program that enumerates unassimilated workunits
and updates the database.
</td>
<td valign=top>
A function that assimilates a workunit,
either by processing its canonical result
or handling an error return.
</td>
</tr>
<tr>
<td valign=top><b>File deleter</b>: delete input and output files
when they are no longer needed.</td>
<td valign=top>A program, <b>file_deleter</b>, that does the job.</td>
<td valign=top>None.</td>
</tr>
</table>
<p>
Projects that use work sequences will need two additional processes:
<p>
<table border=1 cellpadding=8>
<tr>
<th>Component</th>
<th>BOINC-supplied part</th>
<th>project-supplied part</th>
</tr>
<tr>
<td valign=top><b>Work sequence relocater</b>:
detects work sequences whose hosts have failed,
and relocates them to other hosts.</td>
<td valign=top>A program <b>seq_relocate</b></td>
<td valign=top>Some parameters used by seq_relocate</td>
</tr>
<tr>
<td valign=top><b>Work sequence validation and accounting</b>:
Similar to result validation, but for work sequences.
</td>
<td valign=top>A program <b>seq_validate</b></td>
<td valign=top>An application-specific function,
linked with <b>seq_validate</b>, that compares sets of redundant results.</td>
</tr>
</table>

View File

@ -0,0 +1,44 @@
<title>Back end functions</title>
<h2>Back end functions</h2>
<p>
The general functions of a project back end include:
<ul>
<li> <b>Generate work</b>.
<p>
<li> <b>Reissue results</b>.
Hosts may fail to return results for various reasons;
such results are "lost".
Lost and erroneous results may prevent
finding a canonical result for a workunit.
The "result reissue" mechanism generates additional
results as needed to find a canonical result.
<p>
<li> <b>Select canonical results</b>.
Communication from the core client can easily be altered or forged.
Output files may be wrong.
This can result from tampering or hardware failures.
This problem can be addressed
by <b>redundant computing</b>.
In this approach, each workunit is processed at least twice.
The project back end waits until a minimum number of results have been returned,
then compares the results and decides which are considered correct.
The notion of equality of results,
and the policy for deciding which are correct, are project-specific.
<p>
<li> <b>Grant credit</b>.
Some users will attempt to get undeserved credit
by falsifying their CPU metrics or CPU times.
The back end
finds the minimum reported credit for the correct results of a given workunit,
and assigns this amount of credit to all the correct results.
This ensures that as long as a reasonable majority of participants
don't falsify credit, almost all credit accounting will be correct.
<p>
<li> <b>Assimilate results</b>.
<p>
<li> <b>Delete files</b>.
<p>
<li> <b>Detect "problem" workunits</b>.
</ul>

207
doc/backend_programs.html Normal file
View File

@ -0,0 +1,207 @@
<title>Back end programs</title>
<body bgcolor=ffffff>
<h2>Back end programs</h2>
<p>
A project back end is implemented as a set of programs.
Some parts of these programs are supplied by BOINC;
other parts are project- or application-specific:
<br>
<img vspace=10 src=backend.png>
<br>
<p>
<table border=1 cellpadding=8>
<tr>
<th>Component</th>
<th>BOINC-supplied part</th>
<th>project-supplied part</th>
</tr>
<tr>
<td valign=top>
<b>Work generator</b>: generates work units, results,
and the corresponding input files.
</td>
<td valign=top>
Functions and programs that handle the details of
creating workunit and result database records.
</td>
<td valign=top>
Programs or scripts that generate input files,
install them on data servers, and call the BOINC functions.
</td></tr>
<tr>
<td valign=top><b>Timeout check</b>:
Checks for various timeout conditions,
such as result timeout.
Reissues results for workunits as needed.
</td>
<td valign=top>A program <b>timeout_check</b>.</td>
<td valign=top>Some parameters used by timeout_check.</td>
</tr>
<tr>
<td valign=top><b>Result validation and accounting</b>:
compare redundant results; select a <b>canonical result</b>
representing the correct output,
and a <b>canonical credit</b> granted to users and hosts
that return the correct output.</td>
<td valign=top>A program, <b>validate</b>, that contains the
basic logic for validation.</td>
<td valign=top>An application-specific function, linked with <b>validate</b>,
that compares sets of redundant results.</td>
</tr>
<tr>
<td valign=top><b>Assimilator</b>:
handles workunits that are "completed":
that is, which have a canonical result or for which
an error condition has occurred.
Handling a successfully completed result might involve
recording results in a database and perhaps generating more work.</td>
<td valign=top>
A main program that enumerates unassimilated workunits,
calls a project-supplied "handler" function,
and updates the database.
</td>
<td valign=top>
A handler function that assimilates a workunit,
either by processing its canonical result
or handling an error return.
</td>
</tr>
<tr>
<td valign=top><b>File deleter</b>: delete input and output files
when they are no longer needed.</td>
<td valign=top>A program <b>file_deleter</b>.</td>
<td valign=top>None.</td>
</tr>
</table>
<h3>Timeout checker</h3>
<p>
The timeout checker is passed the following parameters:
max_errors
give up on a workunit if it gets this many error results (i.e., there must be a bug in the application).
max_results
give up on a workunit if it gets this many
non-error results without finding a canonical result
redundancy
try to get at least this many non-error results.
application
which application to handle
Use crontab to run timeout_check continuously.
<pre>
for each WU with timeout_check_time < now
for each result of WU
if result.server_state=IN_PROGRESS and now > result.report_deadline
result.server_state = OVER
result.outcome = NO_REPLY
if any result has outcome COULDNT_SEND
wu.error_mask |= COULDNT_SEND
got_error = true
if too many error results
wu.error_mask |= TOO_MANY_ERROR_RESULTS
got_error = true
if too many results
wu.error_mask |= TOO_MANY_RESULTS
got_error = true
else
generate new results as needed
if got_error
for all results server_state UNSENT
result.server_state = OVER
result.outcome = DIDNT_NEED
if wu.assimilate_state == INIT
wu.assimilate_state = READY
if all results are OVER and wu.assimilate_state = DONE
wu.file_delete_state = READY
wu.timeout_check_time = 0
else
wu.timeout_check_time = now + delay_bound
</pre>
<h3>Validater</h3>
<p>
BOINC supplies a utility program <b>validate</b>
to perform validation and credit-granting.
This program must be linked with two project-specific functions:
<pre>
int check_set(vector&lt;RESULT&gt; results, int& canonicalid, double& credit);
int check_pair(RESULT& r1, RESULT& r2, bool& match);
</pre>
<b>check_set()</b> takes a set of results.
If there is sufficient agreement,
it selects one of them as the "canonical" result
(returning its ID) and also decides what credit should
be granted for correct results for this workunit.
<p>
<b>check_pair()</b> compares two results and returns match=true
if they agree.
<p>
The file <b>validate_test.C</b> contains an example
implementation of check_set() and check_pair().
<pre>
for each WU with need_validate = true
if already have canonical result
for each result with validate_state = INIT and outcome = SUCCESS
if matches canonical, grant credit
set result.validate_state to VALID or INVALID
else
build set of results with outcome = SUCCESS
if find canonical result
wu.assimilate_state = READY
for all results server_state = UNSENT
result.server_state = OVER
result.outcome = DIDNT_NEED
</pre>
<h3>scheduler</h3>
<pre>
- when send a result
result.server_state = IN_PROGRESS
result.report_deadline = now + wu.delay_bound
??? should do lookup before updating? shmem may be stale
doesn't matter; can't be stale
- when receive a result
switch result.server_state
client_state = (from reply msg)
case IN_PROGRESS:
result.server_state = OVER
case OVER:
result.file_delete_state = READY;
if client_state is DONE
result.outcome = SUCCESS
wu.need_validate = true
else
result.outcome = CLIENT_ERROR
result.validate_state = INVALID
</pre>
<h3>Assimilator</h3>
<pre>
for each WU with assimilate_state = READY
call project-specific handler function
NOTE: canonical_resultid and error_mask are not mutually exclusive
if all results are OVER with outcomes SUCCESS or CLIENT_ERROR
set result.file_delete = READY for all results
else
for each non-canonical result
if state is OVER and outcome is SUCCESS or CLIENT_ERROR
set result.file_delete = READY
wu.assimilate_state = DONE
if all results are OVER
wu.file_delete_state = READY
</pre>

220
doc/backend_state.html Normal file
View File

@ -0,0 +1,220 @@
<title>Back end state transitions</title>
<h2>Back end state transitions</h2>
<p>
The processing of workunits and results involves
several independent activities.
To keep track of these activities,
workunit and result database records have several "state" fields,
and their processing can be viewed as the combination
of several finite-state machines.
<p>
A workunit has the following state fields:
<ul>
<li>
<b>delay_bound</b>.
An upper bound for the interval between sending this WU to a host
and getting the result.
Should be several times the execution time on an average host.
If it's exceeded, the server "gives up" on the result
and may delete its input files.
If the result is returned later,
it will still be validated and credited.
<li>
<b>canonical_resultid</b>.
<li>
<b>timeout_check_time</b>.
<li>
<b>file_delete_state</b>:
Initially INIT.
When all results have server_state = OVER
and the workunit's assimilate_state is DONE,
it transitions to READY,
indicating that input files can be deleted.
When file deletion is completed (by file_deleter)
it transitions to DONE.
<li>
<b>assimilate_state</b>:
Initially INIT.
When a canonical result is found or an error is recorded in error_mask,
it transitions to READY,
indicating that the workunit can be assimilated.
When assimilation is completed (by assimilator)
it transitions to DONE.
<li>
<b>need_validate</b>:
A boolean, true whenever
the workunit has a result whose validate state is NEED_CHECK.
The validate program sets it back to false.
<li>
<b>error_mask</b>.
bit mask for error conditions
</ul>
Invariants:
<ul>
<li> eventually either canonical_resultid or error_mask is set
<li> eventually timeout_check_time=0
<li> WUs are eventually assimilated
<li> input files are eventually deleted,
but only when all results have state=OVER
(since may need to validate results that arrive after assimilation)
and wu.assimilate_state = DONE
(since project may want to do something with WU in error case)
</ul>
<p>
A result has the following state fields:
<ul>
<li> <b>report_deadline</b>:
give up on result (and possibly delete input files)
if we don't get a reply by this time.
Assignment: when send result; now + WU.delay_bound
<li> <b>server_state</b>:
UNSENT, IN_PROGRESS, OVER.
Initially UNSENT.
Becomes IN_PROGRESS when the result has been sent to a client.
Becomes OVER if we get a host reply,
or the result times out, or we decide not to send it.
<li> <b>outcome</b>:
SUCCESS, COULDNT_SEND, CLIENT_ERROR, NO_REPLY, DIDNT_NEED.
Defined if server_state = OVER.
<li>
<b>client_state</b>:
Records the client state (upload, process, or download)
where an error occurred.
Defined if outcome is CLIENT_ERROR.
<li>
<b>file_delete_state</b>:
INIT, READY, DONE.
<li>
<b>validate_state</b>:
INIT, VALID, INVALID.
When a canonical result has been found for the workunit,
becomes either VALID or INVALID.
</ul>
Invariants:
<ul>
<li> results eventually have server_state = OVER.
<li> output files are eventually deleted.
Non-canonical results can be deleted as soon as the WU is assimilated.
Canonical results can be deleted only when all results have server_state=OVER.
If a result reply arrives after its timeout,
the output files can be immediately deleted.
How do we delete output files that arrive REALLY late?
(e.g. uploaded after all results have timed out, and never reported)?
Let X = create time of oldest unassimilated WU.
Any output files created before X can be deleted.
</ul>
<h3>A note on scheduling</h3>
<p>
- when is it feasible to send a result to a host?
Request msg should include X = amount of work currently queued.
TODO: include % time active in calculation??
Decision for each WU:
is X + time for WUs sent so far < delay_bound?
- When is a result declared "unsendable"?
Not a good idea to do on the basis of time;
do it only if a result is flushed from FIFO (see below)
<h3>State transitions</h3>
<pre>
fields of "result" table:
server_state
UNSENT
(on creation)
IN_PROGRESS
from UNSENT
scheduler: when send
OVER
from IN_PROGRESS
scheduler: get reply from host
timeout_check: now > report_deadline
from UNSENT
validate: got canonical result for this WU and server_state=UNSENT
timeout_check: WU has error
file_delete_state
INIT
(on creation)
READY
from INIT:
scheduler: got reply and server_state = OVER
timeout_check: all results are OVER
assimilator: all results are OVER or result is not canonical
from DONE:
scheduler: got reply and server_state = OVER
DONE
from READY
file_deleter: tried to delete files
validate_state
INIT
VALID
from INIT:
validate: outcome = SUCCESS and matched canonical result
INVALID
from INIT:
scheduler: got reply, client error
validate: didn't match canonical result
-------------
fields of "workunit" table
need_validate
FALSE
(on creation)
from TRUE:
validate: done checking
TRUE
from FALSE:
scheduler: got reply w/ client_state = DONE (i.e. no error)
file_delete_state
INIT
(on creation)
READY
timeout_check: all results have server_state=OVER
and wu.assimilate_state = DONE
assimilate:
all results have server_state = OVER
(and wu.assimilate_state = DONE)
DONE
assimilate_state
INIT
(on creation)
READY
from INIT:
timeout_check: WU has error
validate: found canonical result
DONE
from READY:
assimilator: done
error_mask
COULDNT_SEND
timeout_check: some result has outcome COULDNT_SEND
TOO_MANY_ERROR_RESULTS
timeout_check: too many error results
TOO_MANY_RESULTS
timeout_check: too many results
timeout_check_time:
nonzero
(on creation)
zero
timeout_check: all results are OVER and assimilate_state = DONE
</pre>

View File

@ -0,0 +1,27 @@
<title>Back ends and work sequences</title>
<h2>Back ends and work sequences</h2>
<p>
Projects that use work sequences will need two additional processes:
<p>
<table border=1 cellpadding=8>
<tr>
<th>Component</th>
<th>BOINC-supplied part</th>
<th>project-supplied part</th>
</tr>
<tr>
<td valign=top><b>Work sequence relocater</b>:
detects work sequences whose hosts have failed,
and relocates them to other hosts.</td>
<td valign=top>A program <b>seq_relocate</b></td>
<td valign=top>Some parameters used by seq_relocate</td>
</tr>
<tr>
<td valign=top><b>Work sequence validation and accounting</b>:
Similar to result validation, but for work sequences.
</td>
<td valign=top>A program <b>seq_validate</b></td>
<td valign=top>An application-specific function,
linked with <b>seq_validate</b>, that compares sets of redundant results.</td>
</tr>
</table>

View File

@ -68,13 +68,11 @@ Developing and operating a BOINC project back end
How to feed work into a BOINC system and collect the results.
</font>
<ul>
<li> <a href=backend.html>The components of a project back end</a>
<li> <a href=work_states.html>Workunit and result states</a>
<li> <a href=backend_functions.html>Back end functions</a>
<li> <a href=backend_state.html>Back end state transitions</a>
<li> <a href=backend_programs.html>Back end programs</a>
<li> <a href=tools_work.html>Generating work</a>
<li> <a href=retry.html>Generating result retries</a>
<li> <a href=validation.html>Result validation and accounting</a>
<li> <a href=result_process.html>Assimilation</a>
<li> <a href=garbage.html>File deletion</a>
<li> <a href=backend_work_sequence.html>Back ends and work sequences</a>
<li> <a href=back_end.html>Back end examples</a>
</ul>

View File

@ -1,2 +0,0 @@
<title>Garbage collection</title>
<h2>Garbage collection</h2>

View File

@ -1,66 +0,0 @@
<title>Generating result retries</title>
<h2>Generating result retries</h2>
<p>
Hosts may fail to process and return results for various reasons;
such results are said to be <b>lost</b>.
A combination of lost and erroneous results may prevent
finding a canonical result for a workunit.
The <b>result retry</b> mechanism generates additional
results as needed to find a canonical result.
<p>
The result retry mechanism has the following project-supplied parameters:
<ul>
<li> D<sub>WU</sub>: the expected delay (in seconds) between
creating a WU and getting a canonical result.
<li> D<sub>result</sub>: the expected delay (in seconds) between
creating a result and getting a confirmation.
<li> N<sub>Error</sub>: give up on a workunit if it gets this many error results
(i.e., there must be a bug in the application).
<li> N<sub>det</sub>: give up on a workunit if it gets this many
non-error results without finding a canonical result
(i.e., the algorithm must be nondeterministic).
<li> N<sub>redundancy</sub>: try to get at least this many non-error results.
</ul>
<p>
Each workunit has a <b>retry check time</b>.
This is initially set to now + D<sub>WU</sub>,
and is set to zero if a canonical result is found for the WU.
<p>
Each result has a <b>deadline</b>,
a time by which a confirmation is expected for the result.
This is initially set to now + D<sub>result</sub>.
<p>
Retry generation is handled by the program <b>result_retry</b>, invoked as
<pre>
result_retry -appname name
</pre>
This program continually checks for workunits past their check time
and without pending validation.
For each such workunit, the program does the following:
<ul>
<li> If any result is not sent, generate an error message,
and give up on the WU (i.e., set its check time to zero).
This condition indicates that either
1) the resource requirements of the WU are too much for
any host;
2) there are insufficient hosts to handle the rate of work generation; or
3) scheduling servers have been out of service.
<li> If at least N<sub>error</sub> results have an error,
generate an error message and give up on the WU.
<li> If at least N<sub>det</sub> results are done,
generate an error message and give up on the WU.
<li> Generate N<sub>redundancy</sub> - n new results for the WU,
where n is the number of results that are done.
The deadline of these results is now + D<sub>result</sub>.
<li> Set the check time of the WU to now + D<sub>WU</sub>
</ul>
<p>
Use crontab to run <b>result_retry</b> continuously.

View File

@ -1,77 +0,0 @@
<title>Validation of results and credit</title>
<body bgcolor=ffffff>
<h2>Validation of results and credit</h2>
<p>
Communication from the core client can easily be altered or forged.
Some users will attempt to get undeserved credit
by falsifying their CPU metrics or CPU times.
Output files may be wrong.
This can result from tampering or hardware failures.
<p>
Both problems - credit-cheating and wrong results - can be addressed
by <b>redundant computing</b> and <b>result validation</b>.
In this approach, each workunit is processed at least twice.
The project back end waits until a minimum number of results have been returned,
then compares the results and decides which are considered correct.
The notion of equality of results,
and the policy for deciding which are correct,
are project-specific.
<p>
The back end then marks correct results as "validated",
finds the minimum reported credit for the correct results of a given workunit,
and assigns this amount of credit to all the correct results.
This ensures that as long as a reasonable majority of participants
don't falsify credit, almost all credit accounting will be correct.
<h3>The validation program</h3>
BOINC supplies a utility program <b>validate</b>
to perform validation and credit-granting.
This program must be linked with two project-specific functions:
<pre>
int check_set(vector&lt;RESULT&gt; results, int& canonicalid, double& credit);
int check_pair(RESULT& r1, RESULT& r2, bool& match);
</pre>
<b>check_set()</b> takes a set of results.
If there is sufficient agreement,
it selects one of them as the "canonical" result
(returning its ID) and also decides what credit should
be granted for correct results for this workunit.
<p>
<b>check_pair()</b> compares two results and returns match=true
if they agree.
<p>
The file <b>validate_test.C</b> contains an example
implementation of check_set() and check_pair().
<hr>
<h3>Implementation</h3>
The following database fields are used:
<p>
<b>WORKUNIT</b>
<dt> bool need_validate
<dd>
true iff this workunit has one or more results in state DONE
and validate_state UNCHECKED
<dt>
int canonical_resultid
<dd>
nonzero if a conclusive check has been done for this WU;
indicates the canonical result
<p>
<b>RESULT</b>
<dt>
int state
<dd> INACTIVE, ..., DONE, ERROR
<dt>
int validate_state
<dd>
NEED_CHECK,
VALID,
INVALID

View File

@ -2,6 +2,9 @@
<body bgcolor=ffffff>
<h2>Workunits</h2>
<p>
TODO: don't separate into XML/other parts;
describe XML format only in the tools doc.
<p>
A <b>workunit</b> describes a computation to be performed.
Workunits are maintained in the <b>workunit</b> table in the BOINC DB.
The attributes of a workunit include:
@ -13,13 +16,6 @@ The attributes of a workunit include:
(see below).
<li> The estimated resource requirements of the work unit
(computation, memory, disk space).
<li> The maximum processing
(measured in <a href=credit.html>Cobblestones</a>)
and maximum disk space to be used for the computation.
An instance of the computation that exceeds either of these bounds
will be aborted.
This mechanism is used to prevent an infinite-loop bug from
indefinitely incapacitating a host.
<li> A <b>delay bound</b>: upper bound on how long (in real time)
a result associated with this work unit should take to complete.
This determines which hosts the workunit can be sent to,
@ -49,8 +45,12 @@ The components are:
<td>A list of environment variables in the form
name=value&name=value&name=value.
</td></tr>
<tr><td>&lt;max_processing></td>
<td>Maximum processing (in Cobblestones).
<tr><td valign=top>&lt;max_processing></td>
<td>Maximum processing
(measured in <a href=credit.html>Cobblestones</a>).
An instance of the computation that exceeds this bound will be aborted.
This mechanism prevents an infinite-loop bug from
indefinitely incapacitating a host.
The default is determined by the client; typically it is 1.
</td></tr>
<tr><td>&lt;max_disk></td>

View File

@ -1,80 +0,0 @@
<title>Workunit and result states</title>
<h2>Workunit and result states</h2>
<p>
The processing of workunits and results involves
several independent activities.
To keep track of these activities,
workunit and result database records have several "state" fields,
and their processing can be viewed as the combination
of several finite-state machines.
<p>
A workunit has the following state fields:
<ul>
<li>
<b>main_state</b>:
Initially INIT.
If a canonical result is found (by validate)
it transitions to DONE.
If it's decided that no canonical result can be found
(by result_retry)
it transitions to ERROR.
<li>
<b>file_delete_state</b>:
Initially INIT.
When the main state transitions to either DONE or ERROR,
it transitions to READY,
indicating that input files can be deleted.
When file deletion is completed (by file_deleter)
it transitions to DONE.
<li>
<b>assimilate_state</b>:
Initially INIT.
When the main state transitions to either DONE or ERROR,
it transitions to READY,
indicating that the workunit can be assimilated.
When assimilation is completed (by assimilator)
it transitions to DONE.
<li>
<b>need_validate</b>:
A boolean, true whenever
the workunit has a result whose validate state is NEED_CHECK.
The validate program sets it back to false.
</ul>
<p>
A result has the following state fields:
<ul>
<li>
<b>server_state</b>:
Initially UNSENT.
Becomes IN_PROGRESS when the result has been sent to a client.
Becomes DONE or ERROR if the client returns the result,
or TIMEOUT if result_retry gives up on the result.
<li>
<b>client_state</b>:
Records the client state (upload, process, or download)
where an error occurred.
Not relevant here.
<li>
<b>file_delete_state</b>:
Initially INIT.
When the parent workunit becomes ERROR, becomes READY,
indicating that the output files can be deleted.
When the parent workunit becomes DONE, becomes READY
except for the canonical result, which remains INIT.
When the output files have been deleted (by file_deleter) becomes DONE.
<li>
<b>validate_state</b>:
Initially INITIAL.
When the result is returned, becomes NEED_CHECK.
When a canonical result has been found for the workunit,
becomes either VALID or INVALID.
</ul>

View File

@ -112,22 +112,13 @@ function show_host($host) {
}
function main_state_str($s) {
switch($s) {
case 0: return "Initial";
case 1: return "Done";
case 2: return "Error";
}
}
function error_str($s) {
switch($s) {
case 0: return "No error";
case 1: return "Send failed";
case 2: return "Too many errors (may have bug)";
case 3: return "Too many done (may be nondeterministic)";
}
return "unknown";
// Render a workunit error mask as human-readable HTML text.
// Each of the low three bits that is set contributes one message,
// in bit order; the returned string always ends with "<br>".
// Bits other than 1, 2 and 4 are ignored.
function wu_error_mask_str($s) {
    $text = "";
    if ($s & 1) {
        $text .= "Couldn't send result; ";
    }
    if ($s & 2) {
        $text .= "Too many errors (may have bug); ";
    }
    if ($s & 4) {
        $text .= "Too many results (may be nondeterministic)";
    }
    return $text."<br>";
}
function assimilate_state_str($s) {
@ -162,10 +153,9 @@ function show_workunit($wu,$show_xml_doc) {
row("Need validate?", $wu->need_validate?"yes":"no");
row("Canonical resultid", $wu->canonical_resultid);
row("Canonical credit", $wu->canonical_credit);
row("Retry check time", time_str($wu->retry_check_time));
row("Timeout check time", time_str($wu->timeout_check_time));
row("Delay bound", $wu->delay_bound);
row("Main state", main_state_str($wu->main_state));
row("Error", error_str($wu->error));
row("Error mask", wu_error_mask_str($wu->error_mask));
row("File delete state", file_delete_state_str($wu->file_delete_state));
row("Assimilation state", assimilate_state_str($wu->assimilate_state));
row("","<a href=db.php?show=result&wu_id=$wu->id>Show associated results</a>");
@ -173,19 +163,28 @@ function show_workunit($wu,$show_xml_doc) {
echo "<p>";
}
function res_server_state_string($s) {
function result_server_state_string($s) {
switch($s) {
case 1: return "Inactive";
case 2: return "Unsent";
case 3: return "In Progress";
case 4: return "Done";
case 5: return "Timeout";
case 6: return "Error";
case 4: return "In Progress";
case 5: return "Over";
}
return "unknown";
}
function result_client_state_str($s) {
// Map a result's numeric outcome code to a display string.
// Codes 1..5 have names; anything else yields "unknown".
// Comparisons are loose (==), the same as a switch statement.
function result_outcome_string($s) {
    if ($s == 1) return "Success";
    if ($s == 2) return "Couldn't send";
    if ($s == 3) return "Client error";
    if ($s == 4) return "No reply";
    if ($s == 5) return "Didn't need";
    return "unknown";
}
function result_client_state_string($s) {
switch($s) {
case 1: return "Downloading";
case 2: return "Computing";
@ -197,9 +196,8 @@ function result_client_state_str($s) {
function validate_state_str($s) {
switch($s) {
case 0: return "Initial";
case 1: return "Need check";
case 2: return "Valid";
case 3: return "Invalid";
case 1: return "Valid";
case 2: return "Invalid";
}
return "unknown";
}
@ -213,10 +211,11 @@ function show_result($result,$show_xml_docs,$show_stderr,$show_times) {
}
row("Name", $result->name);
row("Workunit", "<a href=db.php?show=workunit&id=$result->workunitid>" . wu_name_by_id($result->workunitid) . "</a>" );
row("Server state", res_server_state_string($result->server_state));
row("Deadline", time_str($result->report_deadline));
row("Server state", result_server_state_string($result->server_state));
row("Outcome", result_outcome_string($result->outcome));
row("Client state", result_client_state_string($result->client_state));
row("Host ID", "<a href=db.php?show=host&id=$result->hostid>" . host_name_by_id($result->hostid) . "</a>");
row("Client state", result_client_state_str($result->client_state));
row("Report deadline", time_str($result->report_deadline));
row("CPU time", $result->cpu_time);
if ($show_xml_docs) {
row("XML doc in", "<pre>".htmlspecialchars($result->xml_doc_in)."</pre>");

View File

@ -66,7 +66,7 @@
if (strlen($result_state) && $result_state != 0) {
$query = append_sql_query( $query, "server_state = $result_state", $first );
$english_query = append_sql_query( $english_query, "server state is ".res_server_state_string($result_state), $first );
$english_query = append_sql_query( $english_query, "server state is ".result_server_state_string($result_state), $first );
$rstate = $result_state;
$first = 0;
} else {
@ -172,7 +172,7 @@
. "<option value=\"0\"" . ($rstate == 0 ? "selected" : "") . "> All\n"
);
for( $i=1;$i<=6;$i++ ) {
printf( "<option value=\"$i\"" . ($rstate == $i ? "selected" : "") . ">" . res_server_state_string($i) . "\n" );
printf( "<option value=\"$i\"" . ($rstate == $i ? "selected" : "") . ">" . result_server_state_string($i) . "\n" );
}
printf( "</select>\n<p>\n" );
print_text_field( "Result in batch number:", "batch", $batch );

View File

@ -77,6 +77,7 @@ FILE_DELETER_OBJS = \
ASSIMILATOR_OBJS = \
assimilator.o \
assimilate_handler.o \
config.o \
../db/db_mysql.o \
../db/mysql_util.o \

View File

@ -0,0 +1,24 @@
#include <stdio.h>
#include "db.h"
#include "assimilate_handler.h"
// Dummy assimilation handler: prints a summary of the workunit to stdout.
//   wu               - the workunit being assimilated
//   results          - results of the workunit (not used by this dummy version)
//   canonical_result - its xml_doc_out is printed only when
//                      wu.canonical_resultid is nonzero
//
void assimilate_handler(
    WORKUNIT& wu, vector<RESULT>& results, RESULT& canonical_result
) {
    printf("assimilating WU %s\n", wu.name);

    if (!wu.canonical_resultid) {
        printf("no canonical result\n");
    } else {
        printf("canonical result:\n%s", canonical_result.xml_doc_out);
    }

    // error_mask is tested bit by bit; several conditions may be reported
    //
    if ((wu.error_mask & WU_ERROR_COULDNT_SEND_RESULT) != 0) {
        printf("Error: couldn't send a result\n");
    }
    if ((wu.error_mask & WU_ERROR_TOO_MANY_ERROR_RESULTS) != 0) {
        printf("Error: too many error results\n");
    }
    if ((wu.error_mask & WU_ERROR_TOO_MANY_RESULTS) != 0) {
        printf("Error: too many total results\n");
    }
}

View File

@ -0,0 +1,4 @@
#include <vector>
#include "db.h"
extern void assimilate_handler(WORKUNIT&, vector<RESULT>&, RESULT&);

View File

@ -21,10 +21,12 @@
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <vector>
#include "db.h"
#include "parse.h"
#include "config.h"
#include "assimilate_handler.h"
CONFIG config;
@ -35,48 +37,75 @@ void write_log(char* p) {
fprintf(stderr, "%s: %s", timestr, p);
}
// assimilate all WUs that need it
// return nonzero if did anything
//
bool do_pass(APP app) {
bool do_pass(APP& app) {
WORKUNIT wu;
RESULT result;
bool did_something = false;
int retval;
RESULT canonical_result, result;
vector<RESULT> results;
bool did_something = false, delete_inputs, delete_outputs;
char buf[MAX_BLOB_SIZE];
unsigned int i;
wu.appid = app.id;
wu.assimilate_state = ASSIMILATE_READY;
while (!db_workunit_enum_app_assimilate_state(wu)) {
did_something = true;
sprintf(buf, "Assimilating WU %s, assim state %d\n", wu.name, wu.assimilate_state);
sprintf(buf,
"Assimilating WU %s, assim state %d\n",
wu.name, wu.assimilate_state
);
write_log(buf);
switch(wu.main_state) {
case WU_MAIN_STATE_INIT:
write_log("ERROR; WU shouldn't be in init state\n");
break;
case WU_MAIN_STATE_DONE:
if (!wu.canonical_resultid) {
write_log("ERROR: canonical resultid zero\n");
break;
result.workunitid = wu.id;
while (!db_result_enum_wuid(result)) {
results.push_back(result);
if (result.id == wu.canonical_resultid) {
canonical_result = result;
}
retval = db_result(wu.canonical_resultid, result);
if (retval) {
write_log("can't get canonical result\n");
break;
}
sprintf(buf, "canonical result for WU %s:\n%s", wu.name, result.xml_doc_out);
write_log(buf);
result.file_delete_state = FILE_DELETE_READY;
db_result_update(result);
break;
case WU_MAIN_STATE_ERROR:
printf("WU %s had an error\n", wu.name);
break;
}
assimilate_handler(wu, results, canonical_result);
delete_outputs = true;
delete_inputs = true;
for (i=0; i<results.size(); i++) {
result = results[i];
if (result.server_state != RESULT_SERVER_STATE_OVER
|| (result.outcome != RESULT_OUTCOME_SUCCESS && result.outcome != RESULT_OUTCOME_CLIENT_ERROR)
) {
delete_outputs = false;
}
if (result.server_state != RESULT_SERVER_STATE_OVER) {
delete_inputs = false;
}
}
if (delete_outputs) {
for (i=0; i<results.size(); i++) {
result = results[i];
result.file_delete_state = FILE_DELETE_READY;
db_result_update(result);
}
} else {
for (i=0; i<results.size(); i++) {
result = results[i];
if (result.server_state == RESULT_SERVER_STATE_OVER
&& result.id != wu.canonical_resultid
&& (result.outcome == RESULT_OUTCOME_SUCCESS || result.outcome == RESULT_OUTCOME_CLIENT_ERROR)
) {
result.file_delete_state = FILE_DELETE_READY;
db_result_update(result);
}
}
}
wu.assimilate_state = ASSIMILATE_DONE;
if (delete_inputs) {
wu.file_delete_state = FILE_DELETE_READY;
}
db_workunit_update(wu);
}
return did_something;

View File

@ -332,12 +332,13 @@ int handle_results(
for (i=0; i<sreq.results.size(); i++) {
rp = &sreq.results[i];
// acknowledge the result even if we couldn't find it --
// don't want it to keep coming back
//
reply.result_acks.push_back(*rp);
sprintf(buf, "got ack for result %s\n", rp->name);
sprintf(buf, "got result %s\n", rp->name);
write_log(buf);
strncpy(result.name, rp->name, sizeof(result.name));
@ -347,7 +348,8 @@ int handle_results(
write_log(buf);
continue;
}
if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
if (result.server_state == RESULT_SERVER_STATE_UNSENT) {
sprintf(buf,
"got unexpected result for %s: server state is %d\n",
rp->name, result.server_state
@ -355,6 +357,12 @@ int handle_results(
write_log(buf);
continue;
}
if (result.server_state == RESULT_SERVER_STATE_OVER) {
result.file_delete_state = FILE_DELETE_READY;
}
if (result.server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
result.server_state = RESULT_SERVER_STATE_OVER;
}
if (result.hostid != sreq.hostid) {
sprintf(buf,
@ -365,22 +373,32 @@ int handle_results(
continue;
}
// TODO: handle error returns
//
result.hostid = reply.host.id;
result.received_time = time(0);
result.client_state = rp->client_state;
result.cpu_time = rp->cpu_time;
result.claimed_credit = result.cpu_time * host.credit_per_cpu_sec;
result.validate_state = VALIDATE_STATE_NEED_CHECK;
if (result.client_state != CLIENT_DONE) {
result.validate_state = VALIDATE_STATE_INVALID;
//so we won't try to validate this result anymore
result.server_state = RESULT_SERVER_STATE_ERROR;
result.server_state = RESULT_SERVER_STATE_OVER;
if (result.client_state == CLIENT_DONE) {
result.outcome = RESULT_OUTCOME_SUCCESS;
retval = db_workunit(result.workunitid, wu);
if (retval) {
sprintf(buf,
"can't find WU %d for result %d\n",
result.workunitid, result.id
);
write_log(buf);
} else {
wu.need_validate = 1;
retval = db_workunit_update(wu);
if (retval) {
write_log("Can't update WU\n");
}
}
} else {
result.server_state = RESULT_SERVER_STATE_DONE;
result.outcome = RESULT_OUTCOME_CLIENT_ERROR;
result.validate_state = VALIDATE_STATE_INVALID;
}
strncpy(result.stderr_out, rp->stderr_out, sizeof(result.stderr_out));
strncpy(result.xml_doc_out, rp->xml_doc_out, sizeof(result.xml_doc_out));
@ -390,20 +408,6 @@ int handle_results(
write_log(buf);
}
retval = db_workunit(result.workunitid, wu);
if (retval) {
sprintf(buf,
"can't find WU %d for result %d\n",
result.workunitid, result.id
);
write_log(buf);
} else {
wu.need_validate = 1;
retval = db_workunit_update(wu);
if (retval) {
write_log("Can't update WU\n");
}
}
}
return 0;
}
@ -487,6 +491,7 @@ int send_work(
result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
result.hostid = reply.host.id;
result.sent_time = time(0);
result.report_deadline = result.sent_time + wu.delay_bound;
db_result_update(result);
nresults++;

View File

@ -17,9 +17,12 @@
// Contributor(s):
//
// result_retry - create new results to make up for lost ones
// timeout_check - do various time-based tasks
// - time out results
// - create new results to make up for lost ones
// - check for WU error conditions
//
// result_retry
// timeout_check
// -app appname
// [ -nerror n ] if get this many errors, bail on WU
// [ -ndet n ] if get this many results w/o consensus, bail
@ -155,95 +158,92 @@ int assign_new_names(char* in) {
return 0;
}
bool do_pass(APP& app) {
WORKUNIT wu;
void handle_wu(WORKUNIT& wu) {
vector<RESULT> results;
RESULT result;
int nerrors, ndone, retval;
unsigned int i, n;
bool did_something = false;
char buf[256];
unsigned int now = time(0);
bool wu_error = false, all_over;
wu.retry_check_time = time(0);
wu.appid = app.id;
// loop over WUs that are due to be checked
// scan the results for the WU
//
while (!db_workunit_enum_retry_check_time(wu)) {
vector<RESULT> results;
result.workunitid = wu.id;
while (!db_result_enum_wuid(result)) {
results.push_back(result);
}
did_something = true;
// if this WU has a canonical result, we're done
// (this normally doesn't happen since the retry check time
// is zeroed when canonical result found, but just in case).
//
if (wu.canonical_resultid) {
wu.retry_check_time = 0;
goto update_wu;
nerrors = 0;
ndone = 0;
for (i=0; i<results.size(); i++) {
result = results[i];
switch (result.server_state) {
case RESULT_SERVER_STATE_IN_PROGRESS:
if (result.report_deadline < now) {
result.server_state = RESULT_SERVER_STATE_OVER;
result.outcome = RESULT_OUTCOME_NO_REPLY;
db_result_update(result);
}
break;
case RESULT_SERVER_STATE_OVER:
switch (result.outcome) {
case RESULT_OUTCOME_COULDNT_SEND:
sprintf(buf, "WU %s has couldn't-send result\n", wu.name);
write_log(buf);
wu.error_mask |= WU_ERROR_COULDNT_SEND_RESULT;
wu_error = true;
break;
case RESULT_OUTCOME_SUCCESS:
ndone++;
break;
default:
nerrors++;
break;
}
break;
}
}
// enumerate all the results for the WU
//
result.workunitid = wu.id;
while (!db_result_enum_wuid(result)) {
results.push_back(result);
}
// check for too many errors or too many results
//
if (nerrors > max_errors) {
sprintf(buf, "WU %s has too many errors\n", wu.name);
write_log(buf);
wu.error_mask |= WU_ERROR_TOO_MANY_ERROR_RESULTS;
wu_error = true;
}
if (ndone > max_done) {
sprintf(buf, "WU %s has too many answers\n", wu.name);
write_log(buf);
wu.error_mask |= WU_ERROR_TOO_MANY_RESULTS;
wu_error = true;
}
nerrors = 0;
ndone = 0;
// if this WU had an error, don't send any unsent results
//
if (wu_error) {
for (i=0; i<results.size(); i++) {
result = results[i];
// if any result is unsent, give up on the WU
//
if (result.server_state == RESULT_SERVER_STATE_UNSENT) {
sprintf(buf, "WU %s has unsent result\n", wu.name);
write_log(buf);
wu.main_state = WU_MAIN_STATE_ERROR;
wu.error = SEND_FAIL;
wu.file_delete_state = FILE_DELETE_READY;
wu.assimilate_state = ASSIMILATE_READY;
wu.retry_check_time = 0;
goto update_wu;
}
if (result.server_state == RESULT_SERVER_STATE_ERROR) {
nerrors++;
}
if (result.server_state == RESULT_SERVER_STATE_DONE) {
ndone++;
result.server_state = RESULT_SERVER_STATE_OVER;
result.outcome = RESULT_OUTCOME_DIDNT_NEED;
db_result_update(result);
}
}
// it too many errors or too many different results, bail
//
if (nerrors > max_errors) {
sprintf(buf, "WU %s has too many errors\n", wu.name);
write_log(buf);
wu.main_state = WU_MAIN_STATE_ERROR;
wu.error = TOO_MANY_ERRORS;
wu.file_delete_state = FILE_DELETE_READY;
if (wu.assimilate_state == ASSIMILATE_INIT) {
wu.assimilate_state = ASSIMILATE_READY;
wu.retry_check_time = 0;
goto update_wu;
}
if (ndone > max_done) {
sprintf(buf, "WU %s has too many answers\n", wu.name);
write_log(buf);
wu.main_state = WU_MAIN_STATE_ERROR;
wu.error = TOO_MANY_DONE;
wu.file_delete_state = FILE_DELETE_READY;
wu.assimilate_state = ASSIMILATE_READY;
wu.retry_check_time = 0;
goto update_wu;
}
// Generate new results if needed.
} else {
// If no error, generate new results if needed.
// Munge the XML of an existing result
// to create unique new output filenames.
//
if (nredundancy > ndone) {
n = nredundancy - ndone;
for (i=0; i<n; i++) {
for (i=0; i<n; i++) {
result = results[0];
make_unique_name(result.name);
initialize_result(result, wu);
@ -258,17 +258,46 @@ bool do_pass(APP& app) {
}
}
}
}
// update the WU's result retry check time
//
wu.retry_check_time = time(0) + wu.delay_bound;
update_wu:
retval = db_workunit_update(wu);
if (retval) {
sprintf(buf, "db_workunit_update %d\n", retval);
write_log(buf);
// see if all results are OVER and result is assimilated;
// if so we don't need to check this WU ever again.
//
all_over = true;
for (i=0; i<results.size(); i++) {
result = results[i];
if (result.server_state != RESULT_SERVER_STATE_OVER) {
all_over = false;
break;
}
}
if (all_over && wu.assimilate_state == ASSIMILATE_DONE) {
wu.file_delete_state = FILE_DELETE_READY;
wu.timeout_check_time = 0;
} else {
wu.timeout_check_time = now + wu.delay_bound;
}
retval = db_workunit_update(wu);
if (retval) {
sprintf(buf, "db_workunit_update %d\n", retval);
write_log(buf);
}
}
// Run handle_wu() on every workunit of this app that
// db_workunit_enum_timeout_check_time() yields, using the current time
// as the timeout_check_time passed to the enumeration.
// Returns true if at least one workunit was handled.
//
bool do_pass(APP& app) {
    WORKUNIT wu;
    wu.appid = app.id;
    wu.timeout_check_time = time(0);

    bool handled_any = false;
    for (;;) {
        // the enumeration returns nonzero when there are no more WUs
        if (db_workunit_enum_timeout_check_time(wu)) break;
        handled_any = true;
        handle_wu(wu);
    }
    return handled_any;
}

View File

@ -122,43 +122,37 @@ int grant_credit(RESULT& result, double credit) {
return 0;
}
// make one pass through the workunits with need_validate set.
// return true if there were any
//
bool do_validate_scan(APP& app, int min_quorum) {
WORKUNIT wu;
void handle_wu(WORKUNIT& wu) {
RESULT result, canonical_result;
bool found=false, match;
bool match, update_result;
int retval, canonicalid;
double credit;
unsigned int i;
char buf[256];
wu.appid = app.id;
while(!db_workunit_enum_app_need_validate(wu)) {
found = true;
if (wu.canonical_resultid) {
sprintf(buf,
"validating WU %s; already have canonical result\n", wu.name
);
write_log(buf);
if (wu.canonical_resultid) {
sprintf(buf,
"validating WU %s; already have canonical result\n", wu.name
);
write_log(buf);
// Here if WU already has a canonical result.
// Get unchecked results and see if they match the canonical result
//
retval = db_result(wu.canonical_resultid, canonical_result);
if (retval) {
write_log("can't read canonical result\n");
continue;
}
// Here if WU already has a canonical result.
// Get unchecked results and see if they match the canonical result
//
retval = db_result(wu.canonical_resultid, canonical_result);
if (retval) {
write_log("can't read canonical result\n");
return;
}
// scan this WU's results, and check any that need checking
//
result.workunitid = wu.id;
while (!db_result_enum_wuid(result)) {
if (result.validate_state != VALIDATE_STATE_NEED_CHECK) {
continue;
}
// scan this WU's results, and check the unchecked ones
//
result.workunitid = wu.id;
while (!db_result_enum_wuid(result)) {
if (result.validate_state == VALIDATE_STATE_INIT
&& result.server_state == RESULT_SERVER_STATE_OVER
&& result.outcome == RESULT_OUTCOME_SUCCESS
) {
retval = check_pair(result, canonical_result, match);
if (retval) {
sprintf(buf,
@ -188,55 +182,66 @@ bool do_validate_scan(APP& app, int min_quorum) {
continue;
}
}
} else {
// Here if WU doesn't have a canonical result yet.
// Try to get one
}
} else {
vector<RESULT> results;
sprintf(buf, "validating WU %s; no canonical result\n", wu.name);
write_log(buf);
// Here if WU doesn't have a canonical result yet.
// Try to get one
vector<RESULT> results;
result.workunitid = wu.id;
while (!db_result_enum_wuid(result)) {
if (result.server_state == RESULT_SERVER_STATE_DONE) {
results.push_back(result);
}
sprintf(buf, "validating WU %s; no canonical result\n", wu.name);
write_log(buf);
result.workunitid = wu.id;
while (!db_result_enum_wuid(result)) {
if (result.server_state == RESULT_SERVER_STATE_OVER
&& result.outcome == RESULT_OUTCOME_SUCCESS
) {
results.push_back(result);
}
sprintf(buf, "found %d results\n", results.size());
write_log(buf);
if (results.size() >= (unsigned int)min_quorum) {
retval = check_set(results, canonicalid, credit);
if (!retval && canonicalid) {
write_log("found a canonical result\n");
wu.canonical_resultid = canonicalid;
wu.canonical_credit = credit;
wu.main_state = WU_MAIN_STATE_DONE;
wu.file_delete_state = FILE_DELETE_READY;
wu.assimilate_state = ASSIMILATE_READY;
for (i=0; i<results.size(); i++) {
}
sprintf(buf, "found %d successful results\n", results.size());
write_log(buf);
if (results.size() >= (unsigned int)min_quorum) {
retval = check_set(results, canonicalid, credit);
if (!retval && canonicalid) {
write_log("found a canonical result\n");
wu.canonical_resultid = canonicalid;
wu.canonical_credit = credit;
wu.assimilate_state = ASSIMILATE_READY;
for (i=0; i<results.size(); i++) {
result = results[i];
update_result = false;
// if result is not canonical, arrange to delete
// its output files
//
if (results[i].id != canonicalid) {
results[i].file_delete_state = FILE_DELETE_READY;
// grant credit for valid results
//
if (result.validate_state == VALIDATE_STATE_VALID) {
update_result = true;
retval = grant_credit(result, credit);
if (retval) {
sprintf(buf,
"validate: grant_credit %d\n", retval
);
write_log(buf);
}
// grant credit for valid results
//
if (results[i].validate_state == VALIDATE_STATE_VALID) {
retval = grant_credit(results[i], credit);
if (retval) {
sprintf(buf,
"validate: grant_credit %d\n", retval
);
write_log(buf);
}
results[i].granted_credit = credit;
}
sprintf(buf, "updating result %d to %d; credit %f\n", results[i].id, results[i].validate_state, credit);
result.granted_credit = credit;
sprintf(buf,
"updating result %d to %d; credit %f\n",
result.id, result.validate_state, credit
);
write_log(buf);
retval = db_result_update(results[i]);
}
// don't send any unsent results
//
if (result.server_state == RESULT_SERVER_STATE_UNSENT) {
update_result = true;
result.server_state = RESULT_SERVER_STATE_OVER;
result.outcome = RESULT_OUTCOME_DIDNT_NEED;
}
if (update_result) {
retval = db_result_update(result);
if (retval) {
sprintf(buf,
"validate: db_result_update %d\n", retval
@ -247,15 +252,29 @@ bool do_validate_scan(APP& app, int min_quorum) {
}
}
}
}
// we've checked all results for this WU, so turn off flag
//
wu.need_validate = 0;
retval = db_workunit_update(wu);
if (retval) {
sprintf(buf, "db_workunit_update: %d\n", retval);
write_log(buf);
}
// we've checked all results for this WU, so turn off flag
//
wu.need_validate = 0;
retval = db_workunit_update(wu);
if (retval) {
sprintf(buf, "db_workunit_update: %d\n", retval);
write_log(buf);
}
}
// Make a single pass over the workunits of this app that
// db_workunit_enum_app_need_validate() yields,
// passing each one to handle_wu().
// Returns true if at least one such workunit was found.
// (min_quorum is not referenced in this function's body.)
//
bool do_validate_scan(APP& app, int min_quorum) {
    WORKUNIT wu;
    wu.appid = app.id;

    bool any_found = false;
    for (;;) {
        // the enumeration returns nonzero when there are no more WUs
        if (db_workunit_enum_app_need_validate(wu)) break;
        handle_wu(wu);
        any_found = true;
    }
    return any_found;
}

View File

@ -10,3 +10,6 @@
#
get_load_out:machine load:3
count_results_out:Number of results:3
assimilated_wus_out:Number of assimilated workunits:3
download_size_out:Bytes in download dir:3
upload_size_out:Bytes in upload dir:3

View File

@ -424,12 +424,17 @@ class Project {
PassThru("cp $source_dir/stripchart/stripchart $this->project_dir/cgi/");
PassThru("cp $source_dir/stripchart/stripchart.cnf $this->project_dir/cgi/");
PassThru("cp $source_dir/stripchart/samples/looper $this->project_dir/cgi/");
PassThru("cp $source_dir/stripchart/samples/db_looper $this->project_dir/cgi/");
PassThru("cp $source_dir/stripchart/samples/datafiles $this->project_dir/cgi/");
PassThru("cp $source_dir/stripchart/samples/get_load $this->project_dir/cgi/");
macro_substitute("BOINC_DB_NAME", $this->db_name, "$source_dir/stripchart/samples/count_results", "$this->project_dir/cgi/count_results");
make_executable("$this->project_dir/cgi/count_results");
PassThru("cp $source_dir/stripchart/samples/dir_size $this->project_dir/cgi/");
macro_substitute("BOINC_DB_NAME", $this->db_name, "$source_dir/stripchart/samples/db_count", "$this->project_dir/cgi/db_count");
make_executable("$this->project_dir/cgi/db_count");
PassThru("cd $this->project_dir/cgi; looper get_load 1 > get_load_out &");
PassThru("cd $this->project_dir/cgi; looper count_results 1 > count_results_out &");
PassThru("cd $this->project_dir/cgi; db_looper 'result' 1 > count_results_out &");
PassThru("cd $this->project_dir/cgi; db_looper 'workunit where assimilate_state=2' 1 > assimilated_wus_out &");
PassThru("cd $this->project_dir/cgi; looper 'dir_size ../download' 1 > download_size_out &");
PassThru("cd $this->project_dir/cgi; looper 'dir_size ../upload' 1 > upload_size_out &");
}
// this should stop the feeder and any other daemons

View File

@ -33,16 +33,9 @@
$project->start_feeder();
$project->start_make_work($work);
$project->start_validate($app, 3);
//$project->start_file_delete();
$project->start_file_delete();
$project->start_assimilator($app);
$project->start_stripchart();
$host->run();
//$project->stop();
//$result->state = RESULT_STATE_DONE;
//$result->stderr_out = "APP: upper_case: starting, argc 1";
//$result->exit_status = 0;
//$project->check_results(2, $result);
//$project->compare_file("uc_wu_0_0", "uc_correct_output");
//$project->compare_file("uc_wu_1_0", "uc_correct_output");
$project->stop();
?>

View File

@ -153,15 +153,16 @@ void initialize_result(RESULT& result, WORKUNIT& wu) {
result.workunitid = wu.id;
result.server_state = RESULT_SERVER_STATE_UNSENT;
result.hostid = 0;
result.report_deadline = time(0) + wu.delay_bound;
result.report_deadline = 0;
result.sent_time = 0;
result.received_time = 0;
result.client_state = 0;
result.cpu_time = 0;
strcpy(result.xml_doc_out, "");
strcpy(result.stderr_out, "");
result.outcome = RESULT_OUTCOME_INIT;
result.file_delete_state = ASSIMILATE_INIT;
result.validate_state = VALIDATE_STATE_INITIAL;
result.validate_state = VALIDATE_STATE_INIT;
result.claimed_credit = 0;
result.granted_credit = 0;
}

View File

@ -144,7 +144,7 @@ int main(int argc, char** argv) {
}
wu.appid = app.id;
wu.retry_check_time = time(0) + wu.delay_bound;
wu.timeout_check_time = time(0) + wu.delay_bound;
retval = read_key_file(keyfile, key);
if (retval) {