mirror of https://github.com/BOINC/boinc.git
file deletion and state
svn path=/trunk/boinc/; revision=823
This commit is contained in:
parent
d82670df96
commit
46f84c5bbf
|
@ -2981,3 +2981,51 @@ David Jan 15 2003
|
|||
test.inc
|
||||
tools/
|
||||
backend_lib.C
|
||||
|
||||
David Jan 22 2003
|
||||
- Changed some of the state fields of workunit and result,
|
||||
to accommodate new policies about when to delete files
|
||||
See doc/backend_* for details
|
||||
- Split off dummy assimilation function from the main program.
|
||||
- Added new stripchart scripts for logging DB counts
|
||||
and directory sizes
|
||||
|
||||
NOTE: it looks like shared memory segment is at same address
|
||||
in feeder and cgi. So it's OK to use pointers in it.
|
||||
|
||||
db/
|
||||
db.h
|
||||
db_mysql.C
|
||||
schema.sql
|
||||
doc/
|
||||
backend.html (removed)
|
||||
backend_functions.html (new)
|
||||
backend_programs.html (new)
|
||||
backend_state.html (new)
|
||||
backend_work_sequence.html (new)
|
||||
create_project.html
|
||||
garbage.html (removed)
|
||||
retry.html (removed)
|
||||
work.html
|
||||
work_states.html (removed)
|
||||
html_ops/
|
||||
db.inc
|
||||
db.php
|
||||
sched/
|
||||
Makefile.in
|
||||
assimilate_handler.C,h (new)
|
||||
assimilator.C
|
||||
handle_request.C
|
||||
result_retry.C
|
||||
validate.C
|
||||
stripchart/samples/
|
||||
datafiles
|
||||
db_count (new)
|
||||
db_looper (new)
|
||||
dir_size (new)
|
||||
test/
|
||||
test.inc
|
||||
test_loop.php
|
||||
tools/
|
||||
backend_lib.C
|
||||
create_work.C
|
||||
|
|
67
db/db.h
67
db/db.h
|
@ -183,11 +183,6 @@ struct HOST {
|
|||
int parse_net_stats(FILE*);
|
||||
};
|
||||
|
||||
// values for main_state
|
||||
#define WU_MAIN_STATE_INIT 0
|
||||
#define WU_MAIN_STATE_DONE 1
|
||||
#define WU_MAIN_STATE_ERROR 2
|
||||
|
||||
// values for file_delete state
|
||||
#define FILE_DELETE_INIT 0
|
||||
#define FILE_DELETE_READY 1
|
||||
|
@ -198,13 +193,15 @@ struct HOST {
|
|||
#define ASSIMILATE_READY 1
|
||||
#define ASSIMILATE_DONE 2
|
||||
|
||||
// values for error
|
||||
#define SEND_FAIL 1
|
||||
// failed to send results for this WU
|
||||
#define TOO_MANY_ERRORS 2
|
||||
// too many errors; may have bug
|
||||
#define TOO_MANY_DONE 3
|
||||
// too many results without consensus; may be nondeterministic
|
||||
// NOTE: there is no overall state for a WU
|
||||
// (like done/not done)
|
||||
// There's just a bunch of independent substates
|
||||
// (file delete, assimilate, and states of results, error flags)
|
||||
|
||||
// bit fields of error_mask
|
||||
#define WU_ERROR_COULDNT_SEND_RESULT 1
|
||||
#define WU_ERROR_TOO_MANY_ERROR_RESULTS 2
|
||||
#define WU_ERROR_TOO_MANY_RESULTS 4
|
||||
|
||||
struct WORKUNIT {
|
||||
int id;
|
||||
|
@ -218,13 +215,14 @@ struct WORKUNIT {
|
|||
double rsc_memory; // estimated size of RAM working set (bytes)
|
||||
double rsc_disk; // estimated amount of disk needed (bytes)
|
||||
bool need_validate; // this WU has at least 1 result in
|
||||
// VALIDATE_STATE_NEED_CHECK state
|
||||
// validate state = NEED_CHECK
|
||||
int canonical_resultid; // ID of canonical result, or zero
|
||||
double canonical_credit; // credit that all correct results get
|
||||
double retry_check_time; // when to check for result retry
|
||||
int delay_bound; // determines result deadline, retry check time
|
||||
int main_state; // see values above
|
||||
int error;
|
||||
unsigned int timeout_check_time; // when to check for timeouts
|
||||
// zero if no need to check
|
||||
int delay_bound; // determines result deadline,
|
||||
// timeout check time
|
||||
int error_mask; // bitmask of errors (see above)
|
||||
int file_delete_state;
|
||||
int assimilate_state;
|
||||
int workseq_next; // if part of a sequence, the next WU
|
||||
|
@ -235,30 +233,39 @@ struct WORKUNIT {
|
|||
|
||||
#define RESULT_SERVER_STATE_INACTIVE 1
|
||||
#define RESULT_SERVER_STATE_UNSENT 2
|
||||
#define RESULT_SERVER_STATE_IN_PROGRESS 3
|
||||
#define RESULT_SERVER_STATE_DONE 4
|
||||
#define RESULT_SERVER_STATE_TIMEOUT 5
|
||||
#define RESULT_SERVER_STATE_ERROR 6
|
||||
#define RESULT_SERVER_STATE_UNSENT_SEQ 7
|
||||
#define RESULT_SERVER_STATE_UNSENT_SEQ 3
|
||||
// unsent, part of a work sequence
|
||||
#define RESULT_SERVER_STATE_IN_PROGRESS 4
|
||||
#define RESULT_SERVER_STATE_OVER 5
|
||||
// we received a reply, timed out, or decided not to send.
|
||||
// Note: we could get a reply even after timing out.
|
||||
|
||||
#define VALIDATE_STATE_INITIAL 0
|
||||
#define VALIDATE_STATE_NEED_CHECK 1
|
||||
#define VALIDATE_STATE_VALID 2
|
||||
#define VALIDATE_STATE_INVALID 3
|
||||
#define RESULT_OUTCOME_INIT 0
|
||||
#define RESULT_OUTCOME_SUCCESS 1
|
||||
#define RESULT_OUTCOME_COULDNT_SEND 2
|
||||
#define RESULT_OUTCOME_CLIENT_ERROR 3
|
||||
#define RESULT_OUTCOME_NO_REPLY 4
|
||||
#define RESULT_OUTCOME_DIDNT_NEED 5
|
||||
|
||||
#define VALIDATE_STATE_INIT 0
|
||||
#define VALIDATE_STATE_VALID 1
|
||||
#define VALIDATE_STATE_INVALID 2
|
||||
|
||||
struct RESULT {
|
||||
int id;
|
||||
unsigned int create_time;
|
||||
int workunitid;
|
||||
int server_state; // server state (see above)
|
||||
int server_state; // see above
|
||||
int outcome; // see above; defined if server state OVER
|
||||
int client_state; // phase when client error happened
|
||||
// (download, compute, upload)
|
||||
// Defined if outcome is CLIENT_ERROR
|
||||
// and error details are in stderr_out
|
||||
int hostid; // host processing this result
|
||||
unsigned int report_deadline; // deadline for receiving result
|
||||
unsigned int sent_time; // when result was sent to host
|
||||
unsigned int received_time; // when result was received from host
|
||||
char name[256];
|
||||
int client_state; // records phase when error happened
|
||||
// (download, compute, upload)
|
||||
double cpu_time; // CPU time used to complete result
|
||||
char xml_doc_in[MAX_BLOB_SIZE]; // descriptions of output files
|
||||
char xml_doc_out[MAX_BLOB_SIZE]; // MD5s of output files
|
||||
|
@ -333,7 +340,7 @@ extern int db_workunit(int id, WORKUNIT&);
|
|||
extern int db_workunit_update(WORKUNIT& p);
|
||||
extern int db_workunit_lookup_name(WORKUNIT&);
|
||||
extern int db_workunit_enum_app_need_validate(WORKUNIT&);
|
||||
extern int db_workunit_enum_retry_check_time(WORKUNIT&);
|
||||
extern int db_workunit_enum_timeout_check_time(WORKUNIT&);
|
||||
extern int db_workunit_enum_file_delete_state(WORKUNIT&);
|
||||
extern int db_workunit_enum_app_assimilate_state(WORKUNIT&);
|
||||
|
||||
|
|
|
@ -228,31 +228,33 @@ void BOINC_MYSQL_DB::struct_to_str(void* vp, char* q, int type) {
|
|||
"rsc_fpops=%f, rsc_iops=%f, rsc_memory=%f, rsc_disk=%f, "
|
||||
"need_validate=%d, "
|
||||
"canonical_resultid=%d, canonical_credit=%f, "
|
||||
"retry_check_time=%f, delay_bound=%d, main_state=%d, "
|
||||
"error=%d, file_delete_state=%d, assimilate_state=%d, "
|
||||
"timeout_check_time=%d, delay_bound=%d, "
|
||||
"error_mask=%d, file_delete_state=%d, assimilate_state=%d, "
|
||||
"workseq_next=%d",
|
||||
wup->id, wup->create_time, wup->appid,
|
||||
wup->name, wup->xml_doc, wup->batch,
|
||||
wup->rsc_fpops, wup->rsc_iops, wup->rsc_memory, wup->rsc_disk,
|
||||
wup->need_validate,
|
||||
wup->canonical_resultid, wup->canonical_credit,
|
||||
wup->retry_check_time, wup->delay_bound, wup->main_state,
|
||||
wup->error, wup->file_delete_state, wup->assimilate_state,
|
||||
wup->timeout_check_time, wup->delay_bound,
|
||||
wup->error_mask, wup->file_delete_state, wup->assimilate_state,
|
||||
wup->workseq_next
|
||||
);
|
||||
break;
|
||||
case TYPE_RESULT:
|
||||
rp = (RESULT*)vp;
|
||||
sprintf(q,
|
||||
"id=%d, create_time=%d, workunitid=%d, server_state=%d, "
|
||||
"id=%d, create_time=%d, workunitid=%d, "
|
||||
"server_state=%d, outcome=%d, client_state=%d, "
|
||||
"hostid=%d, report_deadline=%d, sent_time=%d, received_time=%d, "
|
||||
"name='%s', client_state=%d, cpu_time=%f, "
|
||||
"name='%s', cpu_time=%f, "
|
||||
"xml_doc_in='%s', xml_doc_out='%s', stderr_out='%s', "
|
||||
"batch=%d, file_delete_state=%d, validate_state=%d, "
|
||||
"claimed_credit=%f, granted_credit=%f",
|
||||
rp->id, rp->create_time, rp->workunitid, rp->server_state,
|
||||
rp->id, rp->create_time, rp->workunitid,
|
||||
rp->server_state, rp->outcome, rp->client_state,
|
||||
rp->hostid, rp->report_deadline, rp->sent_time, rp->received_time,
|
||||
rp->name, rp->client_state, rp->cpu_time,
|
||||
rp->name, rp->cpu_time,
|
||||
rp->xml_doc_in, rp->xml_doc_out, rp->stderr_out,
|
||||
rp->batch, rp->file_delete_state, rp->validate_state,
|
||||
rp->claimed_credit, rp->granted_credit
|
||||
|
@ -411,10 +413,9 @@ void BOINC_MYSQL_DB::row_to_struct(MYSQL_ROW& r, void* vp, int type) {
|
|||
wup->need_validate = atoi(r[i++]);
|
||||
wup->canonical_resultid = atoi(r[i++]);
|
||||
wup->canonical_credit = atof(r[i++]);
|
||||
wup->retry_check_time = atof(r[i++]);
|
||||
wup->timeout_check_time = atoi(r[i++]);
|
||||
wup->delay_bound = atoi(r[i++]);
|
||||
wup->main_state = atoi(r[i++]);
|
||||
wup->error = atoi(r[i++]);
|
||||
wup->error_mask = atoi(r[i++]);
|
||||
wup->file_delete_state = atoi(r[i++]);
|
||||
wup->assimilate_state = atoi(r[i++]);
|
||||
wup->workseq_next = atoi(r[i++]);
|
||||
|
@ -426,12 +427,13 @@ void BOINC_MYSQL_DB::row_to_struct(MYSQL_ROW& r, void* vp, int type) {
|
|||
rp->create_time = atoi(r[i++]);
|
||||
rp->workunitid = atoi(r[i++]);
|
||||
rp->server_state = atoi(r[i++]);
|
||||
rp->outcome = atoi(r[i++]);
|
||||
rp->client_state = atoi(r[i++]);
|
||||
rp->hostid = atoi(r[i++]);
|
||||
rp->report_deadline = atoi(r[i++]);
|
||||
rp->sent_time = atoi(r[i++]);
|
||||
rp->received_time = atoi(r[i++]);
|
||||
strcpy2(rp->name, r[i++]);
|
||||
rp->client_state = atoi(r[i++]);
|
||||
rp->cpu_time = atof(r[i++]);
|
||||
strcpy2(rp->xml_doc_in, r[i++]);
|
||||
strcpy2(rp->xml_doc_out, r[i++]);
|
||||
|
@ -655,14 +657,14 @@ int db_workunit_enum_app_assimilate_state(WORKUNIT& p) {
|
|||
return boinc_db.db_enum(e, &p, TYPE_WORKUNIT, buf);
|
||||
}
|
||||
|
||||
int db_workunit_enum_retry_check_time(WORKUNIT& p) {
|
||||
int db_workunit_enum_timeout_check_time(WORKUNIT& p) {
|
||||
static ENUM e;
|
||||
char buf[256];
|
||||
|
||||
if (!e.active) {
|
||||
sprintf(buf,
|
||||
"where appid=%d and retry_check_time > 0 and retry_check_time < %f",
|
||||
p.appid, p.retry_check_time
|
||||
"where appid=%d and timeout_check_time > 0 and timeout_check_time < %d",
|
||||
p.appid, p.timeout_check_time
|
||||
);
|
||||
}
|
||||
return boinc_db.db_enum(e, &p, TYPE_WORKUNIT, buf);
|
||||
|
|
|
@ -135,10 +135,9 @@ create table workunit (
|
|||
need_validate smallint not null,
|
||||
canonical_resultid integer not null,
|
||||
canonical_credit double not null,
|
||||
retry_check_time double not null,
|
||||
timeout_check_time integer not null,
|
||||
delay_bound integer not null,
|
||||
main_state integer not null,
|
||||
error integer not null,
|
||||
error_mask integer not null,
|
||||
file_delete_state integer not null,
|
||||
assimilate_state integer not null,
|
||||
workseq_next integer not null,
|
||||
|
@ -150,12 +149,13 @@ create table result (
|
|||
create_time integer not null,
|
||||
workunitid integer not null,
|
||||
server_state integer not null,
|
||||
outcome integer not null,
|
||||
client_state integer not null,
|
||||
hostid integer not null,
|
||||
report_deadline integer not null,
|
||||
sent_time integer not null,
|
||||
received_time integer not null,
|
||||
name varchar(254) not null,
|
||||
client_state integer not null,
|
||||
cpu_time double not null,
|
||||
xml_doc_in blob,
|
||||
xml_doc_out blob,
|
||||
|
|
|
@ -1,99 +0,0 @@
|
|||
<title>The components of a project back end</title>
|
||||
<h2>The components of a project back end</h2>
|
||||
|
||||
<p>
|
||||
The <b>back end</b> of a BOINC project consists
|
||||
of a number of components that together are responsible
|
||||
for generating work and handling results.
|
||||
Some of these components are supplied by BOINC;
|
||||
other parts are project- or application-specific:
|
||||
|
||||
<br>
|
||||
<img vspace=10 src=backend.png>
|
||||
<br>
|
||||
|
||||
<p>
|
||||
<table border=1 cellpadding=8>
|
||||
<tr>
|
||||
<th>Component</th>
|
||||
<th>BOINC-supplied part</th>
|
||||
<th>project-supplied part</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top>
|
||||
<b>Work generator</b>: generates work units, work sequences, results,
|
||||
and the corresponding input files.
|
||||
</td>
|
||||
<td valign=top>
|
||||
Functions and programs that handle the details of
|
||||
creating workunit, work sequence, and result database records.
|
||||
</td>
|
||||
<td valign=top>
|
||||
Programs or scripts that generate input files,
|
||||
install them on data servers,
|
||||
and call the BOINC functions.
|
||||
</td></tr>
|
||||
<tr>
|
||||
<td valign=top><b>Result retry generator</b>:
|
||||
generates additional results if some of those initially sent are lost
|
||||
not completed, or completed erroneously.</td>
|
||||
<td valign=top>A program, <b>result_retry</b>, that does the job.</td>
|
||||
<td valign=top>Some parameters used by result_retry.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>Result validation and accounting</b>:
|
||||
compare redundant results; select a <b>canonical result</b>
|
||||
representing the correct output,
|
||||
and a <b>canonical credit</b> granted to users and hosts
|
||||
that return the correct output.</td>
|
||||
<td valign=top>A program, <b>validate</b>, that contains the
|
||||
basic logic for validation.</td>
|
||||
<td valign=top>An application-specific function, linked with <b>validate</b>,
|
||||
that compares sets of redundant results.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>Assimilator</b>: parse output files of
|
||||
completed results, and take appropriate action
|
||||
(record results in a database, and/or generate more work)</td>
|
||||
<td valign=top>
|
||||
A main program that enumerates unassimilated workunits
|
||||
and updates the database.
|
||||
</td>
|
||||
<td valign=top>
|
||||
A function that assimilates a workunit,
|
||||
either by processing its canonical result
|
||||
or handling an error return.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>File deleter</b>: delete input and output files
|
||||
when they are no longer needed.</td>
|
||||
<td valign=top>A program, <b>file_deleter</b>, that does the job.</td>
|
||||
<td valign=top>None.</td>
|
||||
</tr>
|
||||
</table>
|
||||
<p>
|
||||
Project that use work sequences will need two additional processes:
|
||||
<p>
|
||||
<table border=1 cellpadding=8>
|
||||
<tr>
|
||||
<th>Component</th>
|
||||
<th>BOINC-supplied part</th>
|
||||
<th>project-supplied part</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>Work sequence relocater</b>:
|
||||
detects work sequences whose hosts have failed,
|
||||
and relocates them to other hosts.</td>
|
||||
<td valign=top>A program <b>seq_relocate</b></td>
|
||||
<td valign=top>Some parameters used by seq_relocate</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>Work sequence validation and accounting</b>:
|
||||
Similar to result validation, but for work sequences.
|
||||
</td>
|
||||
<td valign=top>A program <b>seq_validate</b></td>
|
||||
<td valign=top>An application-specific function,
|
||||
linked with <b>seq_validate</b>, that compares sets of redundant results.</td>
|
||||
</tr>
|
||||
</table>
|
|
@ -0,0 +1,44 @@
|
|||
<title>Back end functions</title>
|
||||
<h2>Back end functions</h2>
|
||||
|
||||
<p>
|
||||
The general functions of a project back end include:
|
||||
<ul>
|
||||
<li> <b>Generate work</b>.
|
||||
<p>
|
||||
<li> <b>Reissue results</b>.
|
||||
Hosts may fail to return results for various reasons;
|
||||
such results are "lost".
|
||||
Lost and erroneous results may prevent
|
||||
finding a canonical result for a workunit.
|
||||
The "result reissue" mechanism generates additional
|
||||
results as needed to find a canonical result.
|
||||
|
||||
<p>
|
||||
<li> <b>Select canonical results</b>.
|
||||
Communication from the core client can easily be altered or forged.
|
||||
Output files may be wrong.
|
||||
This can result from tampering or hardware failures.
|
||||
This problem can be addressed
|
||||
by <b>redundant computing</b>.
|
||||
In this approach, each workunit is processed at least twice.
|
||||
The project back end waits until a minimum number of results have been returned, then compares the results and decides which are considered correct.
|
||||
The notion of equality of results,
|
||||
and the policy for deciding which are correct, are project-specific.
|
||||
<p>
|
||||
|
||||
<li> <b>Grant credit</b>.
|
||||
Some users will attempt to get undeserved credit
|
||||
by falsifying their CPU metrics or CPU times.
|
||||
The back end
|
||||
finds the minimum reported credit for the correct results of a given workunit,
|
||||
and assigns this amount of credit to all the correct results.
|
||||
This ensures that as long as a reasonable majority of participants
|
||||
don't falsify credit, almost all credit accounting will be correct.
|
||||
<p>
|
||||
<li> <b>Assimilate results</b>.
|
||||
<p>
|
||||
<li> <b>Delete files</b>.
|
||||
<p>
|
||||
<li> <b>Detect "problem" workunits</b>.
|
||||
</ul>
|
|
@ -0,0 +1,207 @@
|
|||
<title>Back end programs</title>
|
||||
<body bgcolor=ffffff>
|
||||
<h2>Back end programs</h2>
|
||||
|
||||
<p>
|
||||
A project back end is implemented as a set of programs.
|
||||
Some parts of these programs are supplied by BOINC;
|
||||
other parts are project- or application-specific:
|
||||
|
||||
<br>
|
||||
<img vspace=10 src=backend.png>
|
||||
<br>
|
||||
|
||||
<p>
|
||||
<table border=1 cellpadding=8>
|
||||
<tr>
|
||||
<th>Component</th>
|
||||
<th>BOINC-supplied part</th>
|
||||
<th>project-supplied part</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top>
|
||||
<b>Work generator</b>: generates work units, results,
|
||||
and the corresponding input files.
|
||||
</td>
|
||||
<td valign=top>
|
||||
Functions and programs that handle the details of
|
||||
creating workunit and result database records.
|
||||
</td>
|
||||
<td valign=top>
|
||||
Programs or scripts that generate input files,
|
||||
install them on data servers, and call the BOINC functions.
|
||||
</td></tr>
|
||||
<tr>
|
||||
<td valign=top><b>Timeout check</b>:
|
||||
Checks for various timeout conditions,
|
||||
such as result timeout.
|
||||
Reissues results for workunits as needed.
|
||||
</td>
|
||||
<td valign=top>A program <b>timeout_check</b>.</td>
|
||||
<td valign=top>Some parameters used by timeout_check.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>Result validation and accounting</b>:
|
||||
compare redundant results; select a <b>canonical result</b>
|
||||
representing the correct output,
|
||||
and a <b>canonical credit</b> granted to users and hosts
|
||||
that return the correct output.</td>
|
||||
<td valign=top>A program, <b>validate</b>, that contains the
|
||||
basic logic for validation.</td>
|
||||
<td valign=top>An application-specific function, linked with <b>validate</b>,
|
||||
that compares sets of redundant results.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>Assimilator</b>:
|
||||
handles workunits that are "completed":
|
||||
that is, which have a canonical result or for which
|
||||
an error condition has occurred.
|
||||
Handling a successfully completed result might involve
|
||||
recording results in a database and perhaps generating more work.</td>
|
||||
<td valign=top>
|
||||
A main program that enumerates unassimilated workunits,
|
||||
calls a project-supplied "handler" function,
|
||||
and updates the database.
|
||||
</td>
|
||||
<td valign=top>
|
||||
A handler function that assimilates a workunit,
|
||||
either by processing its canonical result
|
||||
or handling an error return.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>File deleter</b>: delete input and output files
|
||||
when they are no longer needed.</td>
|
||||
<td valign=top>A program <b>file_deleter</b>.</td>
|
||||
<td valign=top>None.</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
<h3>Timeout checker</h3>
|
||||
<p>
|
||||
The timeout checker is passed the following parameters:
|
||||
|
||||
max_errors
|
||||
give up on a workunit if it gets this many error results (i.e., there must be a bug in the application).
|
||||
|
||||
max_results
|
||||
give up on a workunit if it gets this many
|
||||
non-error results without finding a canonical result
|
||||
|
||||
redundancy
|
||||
try to get at least this many non-error results.
|
||||
|
||||
application
|
||||
which application to handle
|
||||
|
||||
use crontab to run timeout_checker continuously.
|
||||
|
||||
<pre>
|
||||
for each WU with timeout_check_time < now
|
||||
for each result of WU
|
||||
if result.server_state=IN_PROGRESS and now > result.report_deadline
|
||||
result.server_state = OVER
|
||||
result.outcome = NO_REPLY
|
||||
if any result has outcome COULDNT_SEND
|
||||
wu.error_mask |= COULDNT_SEND
|
||||
got_error = true
|
||||
if too many error results
|
||||
wu.error_mask |= TOO_MANY_ERROR_RESULTS
|
||||
got_error = true
|
||||
if too many results
|
||||
wu.error_mask |= TOO_MANY_RESULTS
|
||||
got_error = true
|
||||
else
|
||||
generate new results as needed
|
||||
|
||||
if got_error
|
||||
for all results server_state UNSENT
|
||||
result.server_state = OVER
|
||||
result.outcome = DIDNT_NEED
|
||||
if wu.assimilate_state == INIT
|
||||
wu.assimilate_state = READY
|
||||
|
||||
if all results are OVER and wu.assimilate_state = DONE
|
||||
wu.file_delete_state = READY
|
||||
wu.timeout_check_time = 0
|
||||
else
|
||||
wu.timeout_check_time = now + delay_bound
|
||||
|
||||
</pre>
|
||||
|
||||
<h3>Validater</h3>
|
||||
<p>
|
||||
BOINC supplies a utility program <b>validate</b>
|
||||
to perform validation and credit-granting.
|
||||
This program must be linked with two project-specific functions:
|
||||
<pre>
|
||||
int check_set(vector&lt;RESULT&gt; results, int& canonicalid, double& credit);
|
||||
int check_pair(RESULT& r1, RESULT& r2, bool& match);
|
||||
</pre>
|
||||
<b>check_set()</b> takes a set of results.
|
||||
If there is sufficient agreement,
|
||||
it selects one of them as the "canonical" result
|
||||
(returning its ID) and also decides what credit should
|
||||
be granted for correct results for this workunit.
|
||||
<p>
|
||||
<b>check_pair()</b> compares two results and returns match=true
|
||||
if they agree.
|
||||
|
||||
<p>
|
||||
The file <b>validate_test.C</b> contains an example
|
||||
implementation of check_set() and check_pair().
|
||||
|
||||
<pre>
|
||||
for each WU with need_validate = true
|
||||
if already have canonical result
|
||||
for each result with validate_state = INIT and outcome = SUCCESS
|
||||
if matches canonical, grant credit
|
||||
set result.validate_state to VALID or INVALID
|
||||
else
|
||||
build set of results with outcome = SUCCESS
|
||||
if find canonical result
|
||||
wu.assimilate_state = READY
|
||||
for all results server_state = UNSENT
|
||||
result.server_state = OVER
|
||||
result.outcome = DIDNT_NEED
|
||||
</pre>
|
||||
|
||||
<h3>scheduler</h3>
|
||||
<pre>
|
||||
- when send a result
|
||||
result.server_state = IN_PROGRESS
|
||||
result.report_deadline = now + wu.delay_bound
|
||||
??? should do lookup before updating? shmem may be stale
|
||||
doesn't matter; can't be stale
|
||||
- when receive a result
|
||||
switch result.server_state
|
||||
client_state = (from reply msg)
|
||||
case IN_PROGRESS:
|
||||
result.server_state = OVER
|
||||
case OVER:
|
||||
result.file_delete_state = READY;
|
||||
|
||||
if client_state is DONE
|
||||
result.outcome = SUCCESS
|
||||
wu.need_validate = true
|
||||
else
|
||||
result.outcome = CLIENT_ERROR
|
||||
result.validate_state = INVALID
|
||||
|
||||
</pre>
|
||||
|
||||
<h3>Assimilator</h3>
|
||||
<pre>
|
||||
for each WU with assimilate_state = READY
|
||||
call project-specific handler function
|
||||
NOTE: canonical_resultid and error_mask are not mutually exclusive
|
||||
if all results are OVER with outcomes SUCCESS or CLIENT_ERROR
|
||||
set result.file_delete = READY for all results
|
||||
else
|
||||
for each non-canonical result
|
||||
if state is OVER and outcome is SUCCESS or CLIENT_ERROR
|
||||
set result.file_delete = READY
|
||||
wu.assimilate_state = DONE
|
||||
if all results are OVER
|
||||
wu.file_delete_state = READY
|
||||
</pre>
|
|
@ -0,0 +1,220 @@
|
|||
<title>Back end state transitions</title>
|
||||
<h2>Back end state transitions</h2>
|
||||
|
||||
<p>
|
||||
The processing of workunits and results involves
|
||||
several independent activities.
|
||||
To keep track of these activities,
|
||||
workunit and result database records have several "state" fields,
|
||||
and their processing can be viewed as the combination
|
||||
of several finite-state machines.
|
||||
|
||||
<p>
|
||||
A workunit has the following state fields:
|
||||
<ul>
|
||||
<li>
|
||||
<b>delay_bound</b>.
|
||||
upper bound for the interval between sending this WU to a host
|
||||
and getting the result.
|
||||
Should be several times the execution time on an average host.
|
||||
If it's exceeded, the server "gives up" on the result
|
||||
and may delete its input files.
|
||||
If the result is returned later,
|
||||
it will still be validated and credited.
|
||||
|
||||
<li>
|
||||
<b>canonical_resultid</b>.
|
||||
<li>
|
||||
<b>timeout_check_time</b>.
|
||||
|
||||
<li>
|
||||
<b>file_delete_state</b>:
|
||||
Initially INIT.
|
||||
When the main state transitions to either DONE or ERROR,
|
||||
it transitions to READY,
|
||||
indicating that input files can be deleted.
|
||||
When file deletion is completed (by file_deleter)
|
||||
it transitions to DONE.
|
||||
|
||||
<li>
|
||||
<b>assimilate_state</b>:
|
||||
Initially INIT.
|
||||
When the main state transitions to either DONE or ERROR,
|
||||
it transitions to READY,
|
||||
indicating that the workunit can be assimilated.
|
||||
When assimilation is completed (by assimilator)
|
||||
it transitions to DONE.
|
||||
|
||||
<li>
|
||||
<b>need_validate</b>:
|
||||
A boolean, true whenever
|
||||
the workunit has a result whose validate state is NEED_CHECK.
|
||||
The validate program sets it back to false.
|
||||
<li>
|
||||
<b>error_mask</b>.
|
||||
bit mask for error conditions
|
||||
|
||||
</ul>
|
||||
Invariants:
|
||||
<ul>
|
||||
<li> eventually either canonical_resultid or error_mask is set
|
||||
<li> eventually timeout_check_time=0
|
||||
<li> WUs are eventually assimilated
|
||||
<li> input files are eventually deleted,
|
||||
but only when all results have state=OVER
|
||||
(since may need to validate results that arrive after assimilation)
|
||||
and wu.assimilate_state = DONE
|
||||
(since project may want to do something with WU in error case)
|
||||
|
||||
|
||||
</ul>
|
||||
|
||||
|
||||
<p>
|
||||
|
||||
A result has the following state fields:
|
||||
<ul>
|
||||
<li> <b>report_deadline</b>:
|
||||
give up on result (and possibly delete input files)
|
||||
if don't get reply by this time.
|
||||
Assignment: when send result; now + WU.delay_bound
|
||||
|
||||
<li> <b>server_state</b>:
|
||||
UNSENT, IN_PROGRESS, OVER.
|
||||
Initially UNSENT.
|
||||
Becomes IN_PROGRESS when the result has been sent to a client.
|
||||
Becomes OVER if we get a host reply,
|
||||
or the result times out, or we decide not to send it.
|
||||
<li> <b>outcome</b>:
|
||||
SUCCESS, COULDNT_SEND, CLIENT_ERROR, NO_REPLY, DIDNT_NEED.
|
||||
Defined if server_state = OVER.
|
||||
<li>
|
||||
<b>client_state</b>:
|
||||
Records the client state (upload, process, or download)
|
||||
where an error occurred.
|
||||
Defined if outcome is CLIENT_ERROR.
|
||||
<li>
|
||||
<b>file_delete_state</b>:
|
||||
INIT, READY, DONE.
|
||||
<li>
|
||||
<b>validate_state</b>:
|
||||
INIT, VALID, INVALID.
|
||||
When a canonical result has been found for the workunit,
|
||||
becomes either VALID or INVALID.
|
||||
</ul>
|
||||
|
||||
Invariants:
|
||||
<ul>
|
||||
<li> results eventually have server_state = OVER.
|
||||
<li> output files are eventually deleted.
|
||||
Non-canonical results can be deleted as soon as the WU is assimilated.
|
||||
Canonical results can be deleted only when all results have server_state=OVER.
|
||||
If a result reply arrives after its timeout,
|
||||
the output files can be immediately deleted.
|
||||
How do we delete output files that arrive REALLY late?
|
||||
(e.g. uploaded after all results have timed out, and never reported)?
|
||||
Let X = create time of oldest unassimilated WU.
|
||||
Any output files created before X can be deleted.
|
||||
</ul>
|
||||
<h3>A note on scheduling</h3>
|
||||
<p>
|
||||
- when is it feasible to send a result to a host?
|
||||
Request msg should include X = amount of work currently queued.
|
||||
TODO: include % time active in calculation??
|
||||
Decision for each WU:
|
||||
is X + time for WUs sent so far < delay_bound?
|
||||
- When is a result declared "unsendable"?
|
||||
Not a good idea to do on the basis of time;
|
||||
do it only if a result is flushed from FIFO (see below)
|
||||
|
||||
<h3>State transitions</h3>
|
||||
<pre>
|
||||
|
||||
fields of "result" table:
|
||||
|
||||
server_state
|
||||
UNSENT
|
||||
(on creation)
|
||||
IN_PROGRESS
|
||||
from UNSENT
|
||||
scheduler: when send
|
||||
OVER
|
||||
from IN_PROGRESS
|
||||
scheduler: get reply from host
|
||||
timeout_check: now > report_deadline
|
||||
from UNSENT
|
||||
validate: got canonical result for this WU and server_state=UNSENT
|
||||
timeout_check: WU has error
|
||||
|
||||
file_delete_state
|
||||
INIT
|
||||
(on creation)
|
||||
READY
|
||||
from INIT:
|
||||
scheduler: got reply and server_state = OVER
|
||||
timeout_check: all results are OVER
|
||||
assimilator: all results are OVER or result is not canonical
|
||||
from DONE:
|
||||
scheduler: got reply and server_state = OVER
|
||||
DONE
|
||||
from READY
|
||||
file_deleter: tried to delete files
|
||||
|
||||
validate_state
|
||||
INIT
|
||||
VALID
|
||||
from INIT:
|
||||
validate: outcome = SUCCESS and matched canonical result
|
||||
INVALID
|
||||
from INIT:
|
||||
scheduler: got reply, client error
|
||||
validate: didn't match canonical result
|
||||
|
||||
-------------
|
||||
fields of "workunit" table
|
||||
|
||||
need_validate
|
||||
FALSE
|
||||
(on creation)
|
||||
from TRUE:
|
||||
validate: done checking
|
||||
TRUE
|
||||
from FALSE:
|
||||
scheduler: got reply w/ client_state = DONE (i.e. no error)
|
||||
|
||||
file_delete_state
|
||||
INIT
|
||||
(on creation)
|
||||
READY
|
||||
timeout_check: all results have server_state=OVER
|
||||
and wu.assimilate_state = DONE
|
||||
assimilate:
|
||||
all results have server_state = OVER
|
||||
(and wu.assimilate_state = DONE)
|
||||
DONE
|
||||
|
||||
assimilate_state
|
||||
INIT
|
||||
(on creation)
|
||||
READY
|
||||
from INIT:
|
||||
timeout_check: WU has error
|
||||
validate: found canonical result
|
||||
DONE
|
||||
from READY:
|
||||
assimilator: done
|
||||
|
||||
error_mask
|
||||
COULDNT_SEND
|
||||
timeout_check: some result has outcome COULDNT_SEND
|
||||
TOO_MANY_ERROR_RESULTS
|
||||
timeout_check: too many error results
|
||||
TOO_MANY_RESULTS
|
||||
timeout_check: too many results
|
||||
|
||||
timeout_check_time:
|
||||
nonzero
|
||||
(on creation)
|
||||
zero
|
||||
timeout_check: all results are OVER and wu.assimilate_state = DONE
|
||||
</pre>
|
|
@ -0,0 +1,27 @@
|
|||
<title>Back ends and work sequences</title>
|
||||
<h2>Back ends and work sequences</h2>
|
||||
<p>
|
||||
Projects that use work sequences will need two additional processes:
|
||||
<p>
|
||||
<table border=1 cellpadding=8>
|
||||
<tr>
|
||||
<th>Component</th>
|
||||
<th>BOINC-supplied part</th>
|
||||
<th>project-supplied part</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>Work sequence relocater</b>:
|
||||
detects work sequences whose hosts have failed,
|
||||
and relocates them to other hosts.</td>
|
||||
<td valign=top>A program <b>seq_relocate</b></td>
|
||||
<td valign=top>Some parameters used by seq_relocate</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td valign=top><b>Work sequence validation and accounting</b>:
|
||||
Similar to result validation, but for work sequences.
|
||||
</td>
|
||||
<td valign=top>A program <b>seq_validate</b></td>
|
||||
<td valign=top>An application-specific function,
|
||||
linked with <b>seq_validate</b>, that compares sets of redundant results.</td>
|
||||
</tr>
|
||||
</table>
|
|
@ -68,13 +68,11 @@ Developing and operating a BOINC project back end
|
|||
How to feed work into a BOINC system and collect the results.
|
||||
</font>
|
||||
<ul>
|
||||
<li> <a href=backend.html>The components of a project back end</a>
|
||||
<li> <a href=work_states.html>Workunit and result states</a>
|
||||
<li> <a href=backend_functions.html>Back end functions</a>
|
||||
<li> <a href=backend_state.html>Back end state transitions</a>
|
||||
<li> <a href=backend_programs.html>Back end programs</a>
|
||||
<li> <a href=tools_work.html>Generating work</a>
|
||||
<li> <a href=retry.html>Generating result retries</a>
|
||||
<li> <a href=validation.html>Result validation and accounting</a>
|
||||
<li> <a href=result_process.html>Assimilation</a>
|
||||
<li> <a href=garbage.html>File deletion</a>
|
||||
<li> <a href=backend_work_sequence.html>Back ends and work sequences</a>
|
||||
<li> <a href=back_end.html>Back end examples</a>
|
||||
</ul>
|
||||
|
||||
|
|
|
@ -1,2 +0,0 @@
|
|||
<title>Garbage collection</title>
|
||||
<h2>Garbage collection</h2>
|
|
@ -1,66 +0,0 @@
|
|||
<title>Generating result retries</title>
|
||||
<h2>Generating result retries</h2>
|
||||
|
||||
<p>
|
||||
Hosts may fail to process and return results for various reasons;
|
||||
such results are said to be <b>lost</b>.
|
||||
A combination of lost and erroneous results may prevent
|
||||
finding a canonical result for a workunit.
|
||||
The <b>result retry</b> mechanism generates additional
|
||||
results as needed to find a canonical result.
|
||||
|
||||
<p>
|
||||
The result retry mechanism has the following project-supplied parameters:
|
||||
<ul>
|
||||
<li> D<sub>WU</sub>: the expected delay (in seconds) between
|
||||
creating a WU and getting a canonical result.
|
||||
<li> D<sub>result</sub>: the expected delay (in seconds) between
|
||||
creating a result and getting a confirmation.
|
||||
<li> N<sub>error</sub>: give up on a workunit if it gets this many error results
|
||||
(i.e., there must be a bug in the application).
|
||||
<li> N<sub>det</sub>: give up on a workunit if it gets this many
|
||||
non-error results without finding a canonical result
|
||||
(i.e., the algorithm must be nondeterministic).
|
||||
<li> N<sub>redundancy</sub>: try to get at least this many non-error results.
|
||||
</ul>
|
||||
|
||||
<p>
|
||||
Each workunit has a <b>retry check time</b>.
|
||||
This is initially set to now + D<sub>WU</sub>,
|
||||
and is set to zero if a canonical result is found for the WU.
|
||||
|
||||
<p>
|
||||
Each result has a <b>deadline</b>,
|
||||
a time by which a confirmation is expected for the result.
|
||||
This is initially set to now + D<sub>result</sub>.
|
||||
|
||||
<p>
|
||||
Retry generation is handled by the program <b>result_retry</b>, invoked as
|
||||
<pre>
|
||||
result_retry -appname name
|
||||
</pre>
|
||||
This program continually checks for workunits past their check time
|
||||
and without pending validation.
|
||||
For each such workunit, the program does the following:
|
||||
|
||||
<ul>
|
||||
<li> If any result is not sent, generate an error message,
|
||||
and give up on the WU (i.e., set its check time to zero).
|
||||
This condition indicates that either
|
||||
1) the resource requirements of the WU are too much for
|
||||
any host;
|
||||
2) there are insufficient hosts to handle the rate of work generation; or
|
||||
3) scheduling servers have been out of service.
|
||||
<li> If at least N<sub>error</sub> results have an error,
|
||||
generate an error message and give up on the WU.
|
||||
<li> If at least N<sub>det</sub> results are done,
|
||||
generate an error message and give up on the WU.
|
||||
<li> Generate N<sub>redundancy</sub> - n new results for the WU,
|
||||
where n is the number of results that are done.
|
||||
The deadline of these results is now + D<sub>result</sub>.
|
||||
<li> Set the check time of the WU to now + D<sub>WU</sub>
|
||||
|
||||
</ul>
|
||||
|
||||
<p>
|
||||
Use crontab to run <b>result_retry</b> continuously.
|
|
@ -1,77 +0,0 @@
|
|||
<title>Validation of results and credit</title>
|
||||
<body bgcolor=ffffff>
|
||||
<h2>Validation of results and credit</h2>
|
||||
<p>
|
||||
Communication from the core client can easily be altered or forged.
|
||||
Some users will attempt to get undeserved credit
|
||||
by falsifying their CPU metrics or CPU times.
|
||||
Output files may be wrong.
|
||||
This can result from tampering or hardware failures.
|
||||
<p>
|
||||
Both problems - credit-cheating and wrong results - can be addressed
|
||||
by <b>redundant computing</b> and <b>result validation</b>.
|
||||
In this approach, each workunit is processed at least twice.
|
||||
The project back end waits until a minimum number of results have been returned,
|
||||
then compares the results and decides which are considered correct.
|
||||
The notion of equality of results,
|
||||
and the policy for deciding which are correct,
|
||||
are project-specific.
|
||||
<p>
|
||||
The back end then marks correct results as "validated",
|
||||
finds the minimum reported credit for the correct results of a given workunit,
|
||||
and assigns this amount of credit to all the correct results.
|
||||
This ensures that as long as a reasonable majority of participants
|
||||
don't falsify credit, almost all credit accounting will be correct.
|
||||
|
||||
<h3>The validation program</h3>
|
||||
|
||||
BOINC supplies a utility program <b>validate</b>
|
||||
to perform validation and credit-granting.
|
||||
This program must be linked with two project-specific functions:
|
||||
<pre>
|
||||
int check_set(vector<RESULT> results, int& canonicalid, double& credit);
|
||||
int check_pair(RESULT& r1, RESULT& r2, bool& match);
|
||||
</pre>
|
||||
<b>check_set()</b> takes a set of results.
|
||||
If there is sufficient agreement,
|
||||
it selects one of them as the "canonical" result
|
||||
(returning its ID) and also decides what credit should
|
||||
be granted for correct results for this workunit.
|
||||
<p>
|
||||
<b>check_pair()</b> compares two results and returns match=true
|
||||
if they agree.
|
||||
|
||||
<p>
|
||||
The file <b>validate_test.C</b> contains an example
|
||||
implementation of check_set() and check_pair().
|
||||
|
||||
|
||||
<hr>
|
||||
<h3>Implementation</h3>
|
||||
|
||||
The following database fields are used:
|
||||
|
||||
<p>
|
||||
<b>WORKUNIT</b>
|
||||
<dt> bool need_validate
|
||||
<dd>
|
||||
true iff this workunit has one or more results in state DONE
|
||||
and validate_state UNCHECKED
|
||||
<dt>
|
||||
int canonical_resultid
|
||||
<dd>
|
||||
nonzero if a conclusive check has been done for this WU;
|
||||
indicates the canonical result
|
||||
|
||||
<p>
|
||||
<b>RESULT</b>
|
||||
<dt>
|
||||
int state
|
||||
<dd> INACTIVE, ..., DONE, ERROR
|
||||
<dt>
|
||||
int validate_state
|
||||
<dd>
|
||||
NEED_CHECK,
|
||||
VALID,
|
||||
INVALID
|
||||
|
|
@ -2,6 +2,9 @@
|
|||
<body bgcolor=ffffff>
|
||||
<h2>Workunits</h2>
|
||||
<p>
|
||||
TODO: don't separate into XML/other parts;
|
||||
describe XML format only in the tools doc.
|
||||
<p>
|
||||
A <b>workunit</b> describes a computation to be performed.
|
||||
Workunits are maintained in the <b>workunit</b> table in the BOINC DB.
|
||||
The attributes of a workunit include:
|
||||
|
@ -13,13 +16,6 @@ The attributes of a workunit include:
|
|||
(see below).
|
||||
<li> The estimated resource requirements of the work unit
|
||||
(computation, memory, disk space).
|
||||
<li> The maximum processing
|
||||
(measured in <a href=credit.html>Cobblestones</a>)
|
||||
and maximum disk space to be used for the computation.
|
||||
An instance of the computation that exceeds either of these bounds
|
||||
will be aborted.
|
||||
This mechanism is used to prevent an infinite-loop bug from
|
||||
indefinitely incapacitating a host.
|
||||
<li> A <b>delay bound</b>: upper bound on how long (in real time)
|
||||
a result associated with this work unit should take to complete.
|
||||
This determines which hosts the workunit can be sent to,
|
||||
|
@ -49,8 +45,12 @@ The components are:
|
|||
<td>A list of environment variables in the form
|
||||
name=value&name=value&name=value.
|
||||
</td></tr>
|
||||
<tr><td><max_processing></td>
|
||||
<td>Maximum processing (in Cobblestones).
|
||||
<tr><td valign=top><max_processing></td>
|
||||
<td>Maximum processing
|
||||
(measured in <a href=credit.html>Cobblestones</a>).
|
||||
An instance of the computation that exceeds this bound will be aborted.
|
||||
This mechanism prevents an infinite-loop bug from
|
||||
indefinitely incapacitating a host.
|
||||
The default is determined by the client; typically it is 1.
|
||||
</td></tr>
|
||||
<tr><td><max_disk></td>
|
||||
|
|
|
@ -1,80 +0,0 @@
|
|||
<title>Workunit and result states</title>
|
||||
<h2>Workunit and result states</h2>
|
||||
|
||||
<p>
|
||||
The processing of workunits and results involves
|
||||
several independent activities.
|
||||
To keep track of these activities,
|
||||
workunit and result database records have several "state" fields,
|
||||
and their processing can be viewed as the combination
|
||||
of several finite-state machines.
|
||||
|
||||
<p>
|
||||
A workunit has the following state fields:
|
||||
<ul>
|
||||
<li>
|
||||
<b>main_state</b>:
|
||||
Initially INIT.
|
||||
If a canonical result is found (by validate)
|
||||
it transitions to DONE.
|
||||
If it's decided that no canonical result can be found
|
||||
(by result_retry)
|
||||
it transitions to ERROR.
|
||||
|
||||
<li>
|
||||
<b>file_delete_state</b>:
|
||||
Initially INIT.
|
||||
When the main state transitions to either DONE or ERROR,
|
||||
it transitions to READY,
|
||||
indicating that input files can be deleted.
|
||||
When file deletion is completed (by file_deleter)
|
||||
it transitions to DONE.
|
||||
|
||||
<li>
|
||||
<b>assimilate_state</b>:
|
||||
Initially INIT.
|
||||
When the main state transitions to either DONE or ERROR,
|
||||
it transitions to READY,
|
||||
indicating that the workunit can be assimilated.
|
||||
When assimilation is completed (by assimilator)
|
||||
it transitions to DONE.
|
||||
|
||||
<li>
|
||||
<b>need_validate</b>:
|
||||
A boolean, true whenever
|
||||
the workunit has a result whose validate state is NEED_CHECK.
|
||||
The validate program sets it back to false.
|
||||
|
||||
</ul>
|
||||
|
||||
|
||||
<p>
|
||||
|
||||
A result has the following state fields:
|
||||
<ul>
|
||||
<li>
|
||||
<b>server_state</b>:
|
||||
Initially UNSENT.
|
||||
Becomes IN_PROGRESS when the result has been sent to a client.
|
||||
Becomes DONE or ERROR if the client returns the result,
|
||||
or TIMEOUT if result_retry gives up on the result.
|
||||
<li>
|
||||
<b>client_state</b>:
|
||||
Records the client state (upload, process, or download)
|
||||
where an error occurred.
|
||||
Not relevant here.
|
||||
<li>
|
||||
<b>file_delete_state</b>:
|
||||
Initially INIT.
|
||||
When the parent workunit becomes ERROR, becomes READY,
|
||||
indicating that the output files can be deleted.
|
||||
When the parent workunit becomes DONE, becomes READY
|
||||
except for the canonical result, which remains INIT.
|
||||
When the output files have been deleted (by file_deleter) becomes DONE.
|
||||
<li>
|
||||
<b>validate_state</b>:
|
||||
Initially INITIAL.
|
||||
When the result is returned, becomes NEED_CHECK.
|
||||
When a canonical result has been found for the workunit,
|
||||
becomes either VALID or INVALID.
|
||||
</ul>
|
|
@ -112,22 +112,13 @@ function show_host($host) {
|
|||
|
||||
}
|
||||
|
||||
function main_state_str($s) {
|
||||
switch($s) {
|
||||
case 0: return "Initial";
|
||||
case 1: return "Done";
|
||||
case 2: return "Error";
|
||||
}
|
||||
}
|
||||
|
||||
function error_str($s) {
|
||||
switch($s) {
|
||||
case 0: return "No error";
|
||||
case 1: return "Send failed";
|
||||
case 2: return "Too many errors (may have bug)";
|
||||
case 3: return "Too many done (may be nondeterministic)";
|
||||
}
|
||||
return "unknown";
|
||||
function wu_error_mask_str($s) {
|
||||
$x = "";
|
||||
if ($s & 1) $x = $x."Couldn't send result; ";
|
||||
if ($s & 2) $x = $x."Too many errors (may have bug); ";
|
||||
if ($s & 4) $x = $x."Too many results (may be nondeterministic)";
|
||||
$x = $x."<br>";
|
||||
return $x;
|
||||
}
|
||||
|
||||
function assimilate_state_str($s) {
|
||||
|
@ -162,10 +153,9 @@ function show_workunit($wu,$show_xml_doc) {
|
|||
row("Need validate?", $wu->need_validate?"yes":"no");
|
||||
row("Canonical resultid", $wu->canonical_resultid);
|
||||
row("Canonical credit", $wu->canonical_credit);
|
||||
row("Retry check time", time_str($wu->retry_check_time));
|
||||
row("Timeout check time", time_str($wu->timeout_check_time));
|
||||
row("Delay bound", $wu->delay_bound);
|
||||
row("Main state", main_state_str($wu->main_state));
|
||||
row("Error", error_str($wu->error));
|
||||
row("Error mask", wu_error_mask_str($wu->error_mask));
|
||||
row("File delete state", file_delete_state_str($wu->file_delete_state));
|
||||
row("Assimilation state", assimilate_state_str($wu->assimilate_state));
|
||||
row("","<a href=db.php?show=result&wu_id=$wu->id>Show associated results</a>");
|
||||
|
@ -173,19 +163,28 @@ function show_workunit($wu,$show_xml_doc) {
|
|||
echo "<p>";
|
||||
}
|
||||
|
||||
function res_server_state_string($s) {
|
||||
function result_server_state_string($s) {
|
||||
switch($s) {
|
||||
case 1: return "Inactive";
|
||||
case 2: return "Unsent";
|
||||
case 3: return "In Progress";
|
||||
case 4: return "Done";
|
||||
case 5: return "Timeout";
|
||||
case 6: return "Error";
|
||||
case 4: return "In Progress";
|
||||
case 5: return "Over";
|
||||
}
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
function result_client_state_str($s) {
|
||||
function result_outcome_string($s) {
|
||||
switch($s) {
|
||||
case 1: return "Success";
|
||||
case 2: return "Couldn't send";
|
||||
case 3: return "Client error";
|
||||
case 4: return "No reply";
|
||||
case 5: return "Didn't need";
|
||||
}
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
function result_client_state_string($s) {
|
||||
switch($s) {
|
||||
case 1: return "Downloading";
|
||||
case 2: return "Computing";
|
||||
|
@ -197,9 +196,8 @@ function result_client_state_str($s) {
|
|||
function validate_state_str($s) {
|
||||
switch($s) {
|
||||
case 0: return "Initial";
|
||||
case 1: return "Need check";
|
||||
case 2: return "Valid";
|
||||
case 3: return "Invalid";
|
||||
case 1: return "Valid";
|
||||
case 2: return "Invalid";
|
||||
}
|
||||
return "unknown";
|
||||
}
|
||||
|
@ -213,10 +211,11 @@ function show_result($result,$show_xml_docs,$show_stderr,$show_times) {
|
|||
}
|
||||
row("Name", $result->name);
|
||||
row("Workunit", "<a href=db.php?show=workunit&id=$result->workunitid>" . wu_name_by_id($result->workunitid) . "</a>" );
|
||||
row("Server state", res_server_state_string($result->server_state));
|
||||
row("Deadline", time_str($result->report_deadline));
|
||||
row("Server state", result_server_state_string($result->server_state));
|
||||
row("Outcome", result_outcome_string($result->outcome));
|
||||
row("Client state", result_client_state_string($result->client_state));
|
||||
row("Host ID", "<a href=db.php?show=host&id=$result->hostid>" . host_name_by_id($result->hostid) . "</a>");
|
||||
row("Client state", result_client_state_str($result->client_state));
|
||||
row("Report deadline", time_str($result->report_deadline));
|
||||
row("CPU time", $result->cpu_time);
|
||||
if ($show_xml_docs) {
|
||||
row("XML doc in", "<pre>".htmlspecialchars($result->xml_doc_in)."</pre>");
|
||||
|
|
|
@ -66,7 +66,7 @@
|
|||
|
||||
if (strlen($result_state) && $result_state != 0) {
|
||||
$query = append_sql_query( $query, "server_state = $result_state", $first );
|
||||
$english_query = append_sql_query( $english_query, "server state is ".res_server_state_string($result_state), $first );
|
||||
$english_query = append_sql_query( $english_query, "server state is ".result_server_state_string($result_state), $first );
|
||||
$rstate = $result_state;
|
||||
$first = 0;
|
||||
} else {
|
||||
|
@ -172,7 +172,7 @@
|
|||
. "<option value=\"0\"" . ($rstate == 0 ? "selected" : "") . "> All\n"
|
||||
);
|
||||
for( $i=1;$i<=6;$i++ ) {
|
||||
printf( "<option value=\"$i\"" . ($rstate == $i ? "selected" : "") . ">" . res_server_state_string($i) . "\n" );
|
||||
printf( "<option value=\"$i\"" . ($rstate == $i ? "selected" : "") . ">" . result_server_state_string($i) . "\n" );
|
||||
}
|
||||
printf( "</select>\n<p>\n" );
|
||||
print_text_field( "Result in batch number:", "batch", $batch );
|
||||
|
|
|
@ -77,6 +77,7 @@ FILE_DELETER_OBJS = \
|
|||
|
||||
ASSIMILATOR_OBJS = \
|
||||
assimilator.o \
|
||||
assimilate_handler.o \
|
||||
config.o \
|
||||
../db/db_mysql.o \
|
||||
../db/mysql_util.o \
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
#include <stdio.h>
|
||||
|
||||
#include "db.h"
|
||||
#include "assimilate_handler.h"
|
||||
|
||||
void assimilate_handler(
|
||||
WORKUNIT& wu, vector<RESULT>& results, RESULT& canonical_result
|
||||
) {
|
||||
printf("assimilating WU %s\n", wu.name);
|
||||
if (wu.canonical_resultid) {
|
||||
printf("canonical result:\n%s", canonical_result.xml_doc_out);
|
||||
} else {
|
||||
printf("no canonical result\n");
|
||||
}
|
||||
if (wu.error_mask&WU_ERROR_COULDNT_SEND_RESULT) {
|
||||
printf("Error: couldn't send a result\n");
|
||||
}
|
||||
if (wu.error_mask&WU_ERROR_TOO_MANY_ERROR_RESULTS) {
|
||||
printf("Error: too many error results\n");
|
||||
}
|
||||
if (wu.error_mask&WU_ERROR_TOO_MANY_RESULTS) {
|
||||
printf("Error: too many total results\n");
|
||||
}
|
||||
}
|
|
@ -0,0 +1,4 @@
|
|||
#include <vector>
|
||||
#include "db.h"
|
||||
|
||||
extern void assimilate_handler(WORKUNIT&, vector<RESULT>&, RESULT&);
|
|
@ -21,10 +21,12 @@
|
|||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#include <vector>
|
||||
|
||||
#include "db.h"
|
||||
#include "parse.h"
|
||||
#include "config.h"
|
||||
#include "assimilate_handler.h"
|
||||
|
||||
CONFIG config;
|
||||
|
||||
|
@ -35,48 +37,75 @@ void write_log(char* p) {
|
|||
fprintf(stderr, "%s: %s", timestr, p);
|
||||
}
|
||||
|
||||
// assimilate all WUs that need it
|
||||
// return nonzero if did anything
|
||||
//
|
||||
bool do_pass(APP app) {
|
||||
bool do_pass(APP& app) {
|
||||
WORKUNIT wu;
|
||||
RESULT result;
|
||||
bool did_something = false;
|
||||
int retval;
|
||||
RESULT canonical_result, result;
|
||||
vector<RESULT> results;
|
||||
bool did_something = false, delete_inputs, delete_outputs;
|
||||
char buf[MAX_BLOB_SIZE];
|
||||
unsigned int i;
|
||||
|
||||
wu.appid = app.id;
|
||||
wu.assimilate_state = ASSIMILATE_READY;
|
||||
while (!db_workunit_enum_app_assimilate_state(wu)) {
|
||||
did_something = true;
|
||||
|
||||
sprintf(buf, "Assimilating WU %s, assim state %d\n", wu.name, wu.assimilate_state);
|
||||
sprintf(buf,
|
||||
"Assimilating WU %s, assim state %d\n",
|
||||
wu.name, wu.assimilate_state
|
||||
);
|
||||
write_log(buf);
|
||||
|
||||
switch(wu.main_state) {
|
||||
case WU_MAIN_STATE_INIT:
|
||||
write_log("ERROR; WU shouldn't be in init state\n");
|
||||
break;
|
||||
case WU_MAIN_STATE_DONE:
|
||||
if (!wu.canonical_resultid) {
|
||||
write_log("ERROR: canonical resultid zero\n");
|
||||
break;
|
||||
result.workunitid = wu.id;
|
||||
while (!db_result_enum_wuid(result)) {
|
||||
results.push_back(result);
|
||||
if (result.id == wu.canonical_resultid) {
|
||||
canonical_result = result;
|
||||
}
|
||||
retval = db_result(wu.canonical_resultid, result);
|
||||
if (retval) {
|
||||
write_log("can't get canonical result\n");
|
||||
break;
|
||||
}
|
||||
sprintf(buf, "canonical result for WU %s:\n%s", wu.name, result.xml_doc_out);
|
||||
write_log(buf);
|
||||
|
||||
result.file_delete_state = FILE_DELETE_READY;
|
||||
db_result_update(result);
|
||||
break;
|
||||
case WU_MAIN_STATE_ERROR:
|
||||
printf("WU %s had an error\n", wu.name);
|
||||
break;
|
||||
}
|
||||
|
||||
assimilate_handler(wu, results, canonical_result);
|
||||
|
||||
delete_outputs = true;
|
||||
delete_inputs = true;
|
||||
for (i=0; i<results.size(); i++) {
|
||||
result = results[i];
|
||||
if (result.server_state != RESULT_SERVER_STATE_OVER
|
||||
|| (result.outcome != RESULT_OUTCOME_SUCCESS && result.outcome != RESULT_OUTCOME_CLIENT_ERROR)
|
||||
) {
|
||||
delete_outputs = false;
|
||||
}
|
||||
if (result.server_state != RESULT_SERVER_STATE_OVER) {
|
||||
delete_inputs = false;
|
||||
}
|
||||
}
|
||||
|
||||
if (delete_outputs) {
|
||||
for (i=0; i<results.size(); i++) {
|
||||
result = results[i];
|
||||
result.file_delete_state = FILE_DELETE_READY;
|
||||
db_result_update(result);
|
||||
}
|
||||
} else {
|
||||
for (i=0; i<results.size(); i++) {
|
||||
result = results[i];
|
||||
if (result.server_state == RESULT_SERVER_STATE_OVER
|
||||
&& result.id != wu.canonical_resultid
|
||||
&& (result.outcome == RESULT_OUTCOME_SUCCESS || result.outcome == RESULT_OUTCOME_CLIENT_ERROR)
|
||||
) {
|
||||
result.file_delete_state = FILE_DELETE_READY;
|
||||
db_result_update(result);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
wu.assimilate_state = ASSIMILATE_DONE;
|
||||
if (delete_inputs) {
|
||||
wu.file_delete_state = FILE_DELETE_READY;
|
||||
}
|
||||
db_workunit_update(wu);
|
||||
}
|
||||
return did_something;
|
||||
|
|
|
@ -332,12 +332,13 @@ int handle_results(
|
|||
|
||||
for (i=0; i<sreq.results.size(); i++) {
|
||||
rp = &sreq.results[i];
|
||||
|
||||
// acknowledge the result even if we couldn't find it --
|
||||
// don't want it to keep coming back
|
||||
//
|
||||
reply.result_acks.push_back(*rp);
|
||||
|
||||
sprintf(buf, "got ack for result %s\n", rp->name);
|
||||
sprintf(buf, "got result %s\n", rp->name);
|
||||
write_log(buf);
|
||||
|
||||
strncpy(result.name, rp->name, sizeof(result.name));
|
||||
|
@ -347,7 +348,8 @@ int handle_results(
|
|||
write_log(buf);
|
||||
continue;
|
||||
}
|
||||
if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
|
||||
|
||||
if (result.server_state == RESULT_SERVER_STATE_UNSENT) {
|
||||
sprintf(buf,
|
||||
"got unexpected result for %s: server state is %d\n",
|
||||
rp->name, result.server_state
|
||||
|
@ -355,6 +357,12 @@ int handle_results(
|
|||
write_log(buf);
|
||||
continue;
|
||||
}
|
||||
if (result.server_state == RESULT_SERVER_STATE_OVER) {
|
||||
result.file_delete_state = FILE_DELETE_READY;
|
||||
}
|
||||
if (result.server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
|
||||
result.server_state = RESULT_SERVER_STATE_OVER;
|
||||
}
|
||||
|
||||
if (result.hostid != sreq.hostid) {
|
||||
sprintf(buf,
|
||||
|
@ -365,22 +373,32 @@ int handle_results(
|
|||
continue;
|
||||
}
|
||||
|
||||
// TODO: handle error returns
|
||||
//
|
||||
result.hostid = reply.host.id;
|
||||
result.received_time = time(0);
|
||||
result.client_state = rp->client_state;
|
||||
result.cpu_time = rp->cpu_time;
|
||||
result.claimed_credit = result.cpu_time * host.credit_per_cpu_sec;
|
||||
result.validate_state = VALIDATE_STATE_NEED_CHECK;
|
||||
if (result.client_state != CLIENT_DONE) {
|
||||
result.validate_state = VALIDATE_STATE_INVALID;
|
||||
//so we won't try to validate this result anymore
|
||||
result.server_state = RESULT_SERVER_STATE_ERROR;
|
||||
result.server_state = RESULT_SERVER_STATE_OVER;
|
||||
if (result.client_state == CLIENT_DONE) {
|
||||
result.outcome = RESULT_OUTCOME_SUCCESS;
|
||||
retval = db_workunit(result.workunitid, wu);
|
||||
if (retval) {
|
||||
sprintf(buf,
|
||||
"can't find WU %d for result %d\n",
|
||||
result.workunitid, result.id
|
||||
);
|
||||
write_log(buf);
|
||||
} else {
|
||||
wu.need_validate = 1;
|
||||
retval = db_workunit_update(wu);
|
||||
if (retval) {
|
||||
write_log("Can't update WU\n");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
result.server_state = RESULT_SERVER_STATE_DONE;
|
||||
result.outcome = RESULT_OUTCOME_CLIENT_ERROR;
|
||||
result.validate_state = VALIDATE_STATE_INVALID;
|
||||
}
|
||||
|
||||
|
||||
strncpy(result.stderr_out, rp->stderr_out, sizeof(result.stderr_out));
|
||||
strncpy(result.xml_doc_out, rp->xml_doc_out, sizeof(result.xml_doc_out));
|
||||
|
@ -390,20 +408,6 @@ int handle_results(
|
|||
write_log(buf);
|
||||
}
|
||||
|
||||
retval = db_workunit(result.workunitid, wu);
|
||||
if (retval) {
|
||||
sprintf(buf,
|
||||
"can't find WU %d for result %d\n",
|
||||
result.workunitid, result.id
|
||||
);
|
||||
write_log(buf);
|
||||
} else {
|
||||
wu.need_validate = 1;
|
||||
retval = db_workunit_update(wu);
|
||||
if (retval) {
|
||||
write_log("Can't update WU\n");
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -487,6 +491,7 @@ int send_work(
|
|||
result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
|
||||
result.hostid = reply.host.id;
|
||||
result.sent_time = time(0);
|
||||
result.report_deadline = result.sent_time + wu.delay_bound;
|
||||
db_result_update(result);
|
||||
|
||||
nresults++;
|
||||
|
|
|
@ -17,9 +17,12 @@
|
|||
// Contributor(s):
|
||||
//
|
||||
|
||||
// result_retry - create new results to make up for lost ones
|
||||
// timeout_check - do various time-based tasks
|
||||
// - time out results
|
||||
// - create new results to make up for lost ones
|
||||
// - check for WU error conditions
|
||||
//
|
||||
// result_retry
|
||||
// timeout_check
|
||||
// -app appname
|
||||
// [ -nerror n ] if get this many errors, bail on WU
|
||||
// [ -ndet n ] if get this many results w/o consensus, bail
|
||||
|
@ -155,95 +158,92 @@ int assign_new_names(char* in) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
bool do_pass(APP& app) {
|
||||
WORKUNIT wu;
|
||||
void handle_wu(WORKUNIT& wu) {
|
||||
vector<RESULT> results;
|
||||
RESULT result;
|
||||
int nerrors, ndone, retval;
|
||||
unsigned int i, n;
|
||||
bool did_something = false;
|
||||
char buf[256];
|
||||
unsigned int now = time(0);
|
||||
bool wu_error = false, all_over;
|
||||
|
||||
wu.retry_check_time = time(0);
|
||||
wu.appid = app.id;
|
||||
|
||||
// loop over WUs that are due to be checked
|
||||
// scan the results for the WU
|
||||
//
|
||||
while (!db_workunit_enum_retry_check_time(wu)) {
|
||||
vector<RESULT> results;
|
||||
result.workunitid = wu.id;
|
||||
while (!db_result_enum_wuid(result)) {
|
||||
results.push_back(result);
|
||||
}
|
||||
|
||||
did_something = true;
|
||||
// if this WU has a canonical result, we're done
|
||||
// (this normally doesn't happen since the retry check time
|
||||
// is zeroed when canonical result found, but just in case).
|
||||
//
|
||||
if (wu.canonical_resultid) {
|
||||
wu.retry_check_time = 0;
|
||||
goto update_wu;
|
||||
nerrors = 0;
|
||||
ndone = 0;
|
||||
for (i=0; i<results.size(); i++) {
|
||||
result = results[i];
|
||||
|
||||
switch (result.server_state) {
|
||||
case RESULT_SERVER_STATE_IN_PROGRESS:
|
||||
if (result.report_deadline < now) {
|
||||
result.server_state = RESULT_SERVER_STATE_OVER;
|
||||
result.outcome = RESULT_OUTCOME_NO_REPLY;
|
||||
db_result_update(result);
|
||||
}
|
||||
break;
|
||||
case RESULT_SERVER_STATE_OVER:
|
||||
switch (result.outcome) {
|
||||
case RESULT_OUTCOME_COULDNT_SEND:
|
||||
sprintf(buf, "WU %s has couldn't-send result\n", wu.name);
|
||||
write_log(buf);
|
||||
wu.error_mask |= WU_ERROR_COULDNT_SEND_RESULT;
|
||||
wu_error = true;
|
||||
break;
|
||||
case RESULT_OUTCOME_SUCCESS:
|
||||
ndone++;
|
||||
break;
|
||||
default:
|
||||
nerrors++;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// enumerate all the results for the WU
|
||||
//
|
||||
result.workunitid = wu.id;
|
||||
while (!db_result_enum_wuid(result)) {
|
||||
results.push_back(result);
|
||||
}
|
||||
// check for too many errors or too many results
|
||||
//
|
||||
if (nerrors > max_errors) {
|
||||
sprintf(buf, "WU %s has too many errors\n", wu.name);
|
||||
write_log(buf);
|
||||
wu.error_mask |= WU_ERROR_TOO_MANY_ERROR_RESULTS;
|
||||
wu_error = true;
|
||||
}
|
||||
if (ndone > max_done) {
|
||||
sprintf(buf, "WU %s has too many answers\n", wu.name);
|
||||
write_log(buf);
|
||||
wu.error_mask |= WU_ERROR_TOO_MANY_RESULTS;
|
||||
wu_error = true;
|
||||
}
|
||||
|
||||
nerrors = 0;
|
||||
ndone = 0;
|
||||
// if this WU had an error, don't send any unsent results
|
||||
//
|
||||
if (wu_error) {
|
||||
for (i=0; i<results.size(); i++) {
|
||||
result = results[i];
|
||||
|
||||
// if any result is unsent, give up on the WU
|
||||
//
|
||||
if (result.server_state == RESULT_SERVER_STATE_UNSENT) {
|
||||
sprintf(buf, "WU %s has unsent result\n", wu.name);
|
||||
write_log(buf);
|
||||
wu.main_state = WU_MAIN_STATE_ERROR;
|
||||
wu.error = SEND_FAIL;
|
||||
wu.file_delete_state = FILE_DELETE_READY;
|
||||
wu.assimilate_state = ASSIMILATE_READY;
|
||||
wu.retry_check_time = 0;
|
||||
goto update_wu;
|
||||
}
|
||||
if (result.server_state == RESULT_SERVER_STATE_ERROR) {
|
||||
nerrors++;
|
||||
}
|
||||
if (result.server_state == RESULT_SERVER_STATE_DONE) {
|
||||
ndone++;
|
||||
result.server_state = RESULT_SERVER_STATE_OVER;
|
||||
result.outcome = RESULT_OUTCOME_DIDNT_NEED;
|
||||
db_result_update(result);
|
||||
}
|
||||
}
|
||||
|
||||
// if too many errors or too many different results, bail
|
||||
//
|
||||
if (nerrors > max_errors) {
|
||||
sprintf(buf, "WU %s has too many errors\n", wu.name);
|
||||
write_log(buf);
|
||||
wu.main_state = WU_MAIN_STATE_ERROR;
|
||||
wu.error = TOO_MANY_ERRORS;
|
||||
wu.file_delete_state = FILE_DELETE_READY;
|
||||
if (wu.assimilate_state == ASSIMILATE_INIT) {
|
||||
wu.assimilate_state = ASSIMILATE_READY;
|
||||
wu.retry_check_time = 0;
|
||||
goto update_wu;
|
||||
}
|
||||
if (ndone > max_done) {
|
||||
sprintf(buf, "WU %s has too many answers\n", wu.name);
|
||||
write_log(buf);
|
||||
wu.main_state = WU_MAIN_STATE_ERROR;
|
||||
wu.error = TOO_MANY_DONE;
|
||||
wu.file_delete_state = FILE_DELETE_READY;
|
||||
wu.assimilate_state = ASSIMILATE_READY;
|
||||
wu.retry_check_time = 0;
|
||||
goto update_wu;
|
||||
}
|
||||
|
||||
// Generate new results if needed.
|
||||
} else {
|
||||
// If no error, generate new results if needed.
|
||||
// Munge the XML of an existing result
|
||||
// to create unique new output filenames.
|
||||
//
|
||||
if (nredundancy > ndone) {
|
||||
n = nredundancy - ndone;
|
||||
|
||||
for (i=0; i<n; i++) {
|
||||
for (i=0; i<n; i++) {
|
||||
result = results[0];
|
||||
make_unique_name(result.name);
|
||||
initialize_result(result, wu);
|
||||
|
@ -258,17 +258,46 @@ bool do_pass(APP& app) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// update the WU's result retry check time
|
||||
//
|
||||
wu.retry_check_time = time(0) + wu.delay_bound;
|
||||
update_wu:
|
||||
retval = db_workunit_update(wu);
|
||||
if (retval) {
|
||||
sprintf(buf, "db_workunit_update %d\n", retval);
|
||||
write_log(buf);
|
||||
// see if all results are OVER and result is assimilated;
|
||||
// if so we don't need to check this WU ever again.
|
||||
//
|
||||
all_over = true;
|
||||
for (i=0; i<results.size(); i++) {
|
||||
result = results[i];
|
||||
if (result.server_state != RESULT_SERVER_STATE_OVER) {
|
||||
all_over = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (all_over && wu.assimilate_state == ASSIMILATE_DONE) {
|
||||
wu.file_delete_state = FILE_DELETE_READY;
|
||||
wu.timeout_check_time = 0;
|
||||
} else {
|
||||
wu.timeout_check_time = now + wu.delay_bound;
|
||||
}
|
||||
|
||||
retval = db_workunit_update(wu);
|
||||
if (retval) {
|
||||
sprintf(buf, "db_workunit_update %d\n", retval);
|
||||
write_log(buf);
|
||||
}
|
||||
}
|
||||
|
||||
// Make one pass over the workunits of the given app that the DB enumerator
// db_workunit_enum_timeout_check_time() returns, and run handle_wu() on each.
// The enumeration is keyed by wu.appid and wu.timeout_check_time, which are
// filled in below before the first call.
// NOTE(review): presumably the enumerator yields WUs whose timeout_check_time
// has already passed -- confirm against the DB layer.
// Returns true if at least one workunit was enumerated (and handled).
//
bool do_pass(APP& app) {
|
||||
WORKUNIT wu;
|
||||
bool did_something = false;
|
||||
|
||||
// loop over WUs that are due to be checked
|
||||
//
|
||||
// enumeration inputs: the current time and this app's id
wu.timeout_check_time = time(0);
|
||||
wu.appid = app.id;
|
||||
// the enumerator returns zero while it keeps producing workunits
while (!db_workunit_enum_timeout_check_time(wu)) {
|
||||
did_something = true;
|
||||
handle_wu(wu);
|
||||
}
|
||||
return did_something;
|
||||
}
|
||||
|
||||
|
|
179
sched/validate.C
179
sched/validate.C
|
@ -122,43 +122,37 @@ int grant_credit(RESULT& result, double credit) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
// make one pass through the workunits with need_validate set.
|
||||
// return true if there were any
|
||||
//
|
||||
bool do_validate_scan(APP& app, int min_quorum) {
|
||||
WORKUNIT wu;
|
||||
void handle_wu(WORKUNIT& wu) {
|
||||
RESULT result, canonical_result;
|
||||
bool found=false, match;
|
||||
bool match, update_result;
|
||||
int retval, canonicalid;
|
||||
double credit;
|
||||
unsigned int i;
|
||||
char buf[256];
|
||||
|
||||
wu.appid = app.id;
|
||||
while(!db_workunit_enum_app_need_validate(wu)) {
|
||||
found = true;
|
||||
if (wu.canonical_resultid) {
|
||||
sprintf(buf,
|
||||
"validating WU %s; already have canonical result\n", wu.name
|
||||
);
|
||||
write_log(buf);
|
||||
if (wu.canonical_resultid) {
|
||||
sprintf(buf,
|
||||
"validating WU %s; already have canonical result\n", wu.name
|
||||
);
|
||||
write_log(buf);
|
||||
|
||||
// Here if WU already has a canonical result.
|
||||
// Get unchecked results and see if they match the canonical result
|
||||
//
|
||||
retval = db_result(wu.canonical_resultid, canonical_result);
|
||||
if (retval) {
|
||||
write_log("can't read canonical result\n");
|
||||
continue;
|
||||
}
|
||||
// Here if WU already has a canonical result.
|
||||
// Get unchecked results and see if they match the canonical result
|
||||
//
|
||||
retval = db_result(wu.canonical_resultid, canonical_result);
|
||||
if (retval) {
|
||||
write_log("can't read canonical result\n");
|
||||
return;
|
||||
}
|
||||
|
||||
// scan this WU's results, and check any that need checking
|
||||
//
|
||||
result.workunitid = wu.id;
|
||||
while (!db_result_enum_wuid(result)) {
|
||||
if (result.validate_state != VALIDATE_STATE_NEED_CHECK) {
|
||||
continue;
|
||||
}
|
||||
// scan this WU's results, and check the unchecked ones
|
||||
//
|
||||
result.workunitid = wu.id;
|
||||
while (!db_result_enum_wuid(result)) {
|
||||
if (result.validate_state == VALIDATE_STATE_INIT
|
||||
&& result.server_state == RESULT_SERVER_STATE_OVER
|
||||
&& result.outcome == RESULT_OUTCOME_SUCCESS
|
||||
) {
|
||||
retval = check_pair(result, canonical_result, match);
|
||||
if (retval) {
|
||||
sprintf(buf,
|
||||
|
@ -188,55 +182,66 @@ bool do_validate_scan(APP& app, int min_quorum) {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Here if WU doesn't have a canonical result yet.
|
||||
// Try to get one
|
||||
}
|
||||
} else {
|
||||
vector<RESULT> results;
|
||||
|
||||
sprintf(buf, "validating WU %s; no canonical result\n", wu.name);
|
||||
write_log(buf);
|
||||
// Here if WU doesn't have a canonical result yet.
|
||||
// Try to get one
|
||||
|
||||
vector<RESULT> results;
|
||||
result.workunitid = wu.id;
|
||||
while (!db_result_enum_wuid(result)) {
|
||||
if (result.server_state == RESULT_SERVER_STATE_DONE) {
|
||||
results.push_back(result);
|
||||
}
|
||||
sprintf(buf, "validating WU %s; no canonical result\n", wu.name);
|
||||
write_log(buf);
|
||||
|
||||
result.workunitid = wu.id;
|
||||
while (!db_result_enum_wuid(result)) {
|
||||
if (result.server_state == RESULT_SERVER_STATE_OVER
|
||||
&& result.outcome == RESULT_OUTCOME_SUCCESS
|
||||
) {
|
||||
results.push_back(result);
|
||||
}
|
||||
sprintf(buf, "found %d results\n", results.size());
|
||||
write_log(buf);
|
||||
if (results.size() >= (unsigned int)min_quorum) {
|
||||
retval = check_set(results, canonicalid, credit);
|
||||
if (!retval && canonicalid) {
|
||||
write_log("found a canonical result\n");
|
||||
wu.canonical_resultid = canonicalid;
|
||||
wu.canonical_credit = credit;
|
||||
wu.main_state = WU_MAIN_STATE_DONE;
|
||||
wu.file_delete_state = FILE_DELETE_READY;
|
||||
wu.assimilate_state = ASSIMILATE_READY;
|
||||
for (i=0; i<results.size(); i++) {
|
||||
}
|
||||
sprintf(buf, "found %d successful results\n", results.size());
|
||||
write_log(buf);
|
||||
if (results.size() >= (unsigned int)min_quorum) {
|
||||
retval = check_set(results, canonicalid, credit);
|
||||
if (!retval && canonicalid) {
|
||||
write_log("found a canonical result\n");
|
||||
wu.canonical_resultid = canonicalid;
|
||||
wu.canonical_credit = credit;
|
||||
wu.assimilate_state = ASSIMILATE_READY;
|
||||
for (i=0; i<results.size(); i++) {
|
||||
result = results[i];
|
||||
update_result = false;
|
||||
|
||||
// if result is not canonical, arrange to delete
|
||||
// its output files
|
||||
//
|
||||
if (results[i].id != canonicalid) {
|
||||
results[i].file_delete_state = FILE_DELETE_READY;
|
||||
// grant credit for valid results
|
||||
//
|
||||
if (result.validate_state == VALIDATE_STATE_VALID) {
|
||||
update_result = true;
|
||||
retval = grant_credit(result, credit);
|
||||
if (retval) {
|
||||
sprintf(buf,
|
||||
"validate: grant_credit %d\n", retval
|
||||
);
|
||||
write_log(buf);
|
||||
}
|
||||
|
||||
// grant credit for valid results
|
||||
//
|
||||
if (results[i].validate_state == VALIDATE_STATE_VALID) {
|
||||
retval = grant_credit(results[i], credit);
|
||||
if (retval) {
|
||||
sprintf(buf,
|
||||
"validate: grant_credit %d\n", retval
|
||||
);
|
||||
write_log(buf);
|
||||
}
|
||||
results[i].granted_credit = credit;
|
||||
}
|
||||
sprintf(buf, "updating result %d to %d; credit %f\n", results[i].id, results[i].validate_state, credit);
|
||||
result.granted_credit = credit;
|
||||
sprintf(buf,
|
||||
"updating result %d to %d; credit %f\n",
|
||||
result.id, result.validate_state, credit
|
||||
);
|
||||
write_log(buf);
|
||||
retval = db_result_update(results[i]);
|
||||
}
|
||||
|
||||
// don't send any unsent results
|
||||
//
|
||||
if (result.server_state == RESULT_SERVER_STATE_UNSENT) {
|
||||
update_result = true;
|
||||
result.server_state = RESULT_SERVER_STATE_OVER;
|
||||
result.outcome = RESULT_OUTCOME_DIDNT_NEED;
|
||||
}
|
||||
|
||||
if (update_result) {
|
||||
retval = db_result_update(result);
|
||||
if (retval) {
|
||||
sprintf(buf,
|
||||
"validate: db_result_update %d\n", retval
|
||||
|
@ -247,15 +252,29 @@ bool do_validate_scan(APP& app, int min_quorum) {
|
|||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// we've checked all results for this WU, so turn off flag
|
||||
//
|
||||
wu.need_validate = 0;
|
||||
retval = db_workunit_update(wu);
|
||||
if (retval) {
|
||||
sprintf(buf, "db_workunit_update: %d\n", retval);
|
||||
write_log(buf);
|
||||
}
|
||||
// we've checked all results for this WU, so turn off flag
|
||||
//
|
||||
wu.need_validate = 0;
|
||||
retval = db_workunit_update(wu);
|
||||
if (retval) {
|
||||
sprintf(buf, "db_workunit_update: %d\n", retval);
|
||||
write_log(buf);
|
||||
}
|
||||
}
|
||||
|
||||
// make one pass through the workunits with need_validate set.
// Workunits are enumerated by app id (wu.appid = app.id) and each one
// is handed to handle_wu().
|
||||
// return true if there were any
// NOTE(review): min_quorum is not referenced anywhere in this body;
// presumably handle_wu() reads a quorum value from elsewhere -- confirm.
|
||||
//
|
||||
bool do_validate_scan(APP& app, int min_quorum) {
|
||||
WORKUNIT wu;
|
||||
bool found=false;
|
||||
|
||||
wu.appid = app.id;
|
||||
// the enumerator returns zero while it keeps producing workunits
while(!db_workunit_enum_app_need_validate(wu)) {
|
||||
handle_wu(wu);
|
||||
found = true;
|
||||
}
|
||||
return found;
|
||||
}
|
||||
|
|
|
@ -10,3 +10,6 @@
|
|||
#
|
||||
get_load_out:machine load:3
|
||||
count_results_out:Number of results:3
|
||||
assimilated_wus_out:Number of assimilated workunits:3
|
||||
download_size_out:Bytes in download dir:3
|
||||
upload_size_out:Bytes in upload dir:3
|
||||
|
|
|
@ -424,12 +424,17 @@ class Project {
|
|||
PassThru("cp $source_dir/stripchart/stripchart $this->project_dir/cgi/");
|
||||
PassThru("cp $source_dir/stripchart/stripchart.cnf $this->project_dir/cgi/");
|
||||
PassThru("cp $source_dir/stripchart/samples/looper $this->project_dir/cgi/");
|
||||
PassThru("cp $source_dir/stripchart/samples/db_looper $this->project_dir/cgi/");
|
||||
PassThru("cp $source_dir/stripchart/samples/datafiles $this->project_dir/cgi/");
|
||||
PassThru("cp $source_dir/stripchart/samples/get_load $this->project_dir/cgi/");
|
||||
macro_substitute("BOINC_DB_NAME", $this->db_name, "$source_dir/stripchart/samples/count_results", "$this->project_dir/cgi/count_results");
|
||||
make_executable("$this->project_dir/cgi/count_results");
|
||||
PassThru("cp $source_dir/stripchart/samples/dir_size $this->project_dir/cgi/");
|
||||
macro_substitute("BOINC_DB_NAME", $this->db_name, "$source_dir/stripchart/samples/db_count", "$this->project_dir/cgi/db_count");
|
||||
make_executable("$this->project_dir/cgi/db_count");
|
||||
PassThru("cd $this->project_dir/cgi; looper get_load 1 > get_load_out &");
|
||||
PassThru("cd $this->project_dir/cgi; looper count_results 1 > count_results_out &");
|
||||
PassThru("cd $this->project_dir/cgi; db_looper 'result' 1 > count_results_out &");
|
||||
PassThru("cd $this->project_dir/cgi; db_looper 'workunit where assimilate_state=2' 1 > assimilated_wus_out &");
|
||||
PassThru("cd $this->project_dir/cgi; looper 'dir_size ../download' 1 > download_size_out &");
|
||||
PassThru("cd $this->project_dir/cgi; looper 'dir_size ../upload' 1 > upload_size_out &");
|
||||
}
|
||||
|
||||
// this should stop the feeder and any other daemons
|
||||
|
|
|
@ -33,16 +33,9 @@
|
|||
$project->start_feeder();
|
||||
$project->start_make_work($work);
|
||||
$project->start_validate($app, 3);
|
||||
//$project->start_file_delete();
|
||||
$project->start_file_delete();
|
||||
$project->start_assimilator($app);
|
||||
$project->start_stripchart();
|
||||
$host->run();
|
||||
//$project->stop();
|
||||
|
||||
//$result->state = RESULT_STATE_DONE;
|
||||
//$result->stderr_out = "APP: upper_case: starting, argc 1";
|
||||
//$result->exit_status = 0;
|
||||
//$project->check_results(2, $result);
|
||||
//$project->compare_file("uc_wu_0_0", "uc_correct_output");
|
||||
//$project->compare_file("uc_wu_1_0", "uc_correct_output");
|
||||
$project->stop();
|
||||
?>
|
||||
|
|
|
@ -153,15 +153,16 @@ void initialize_result(RESULT& result, WORKUNIT& wu) {
|
|||
result.workunitid = wu.id;
|
||||
result.server_state = RESULT_SERVER_STATE_UNSENT;
|
||||
result.hostid = 0;
|
||||
result.report_deadline = time(0) + wu.delay_bound;
|
||||
result.report_deadline = 0;
|
||||
result.sent_time = 0;
|
||||
result.received_time = 0;
|
||||
result.client_state = 0;
|
||||
result.cpu_time = 0;
|
||||
strcpy(result.xml_doc_out, "");
|
||||
strcpy(result.stderr_out, "");
|
||||
result.outcome = RESULT_OUTCOME_INIT;
|
||||
result.file_delete_state = ASSIMILATE_INIT;
|
||||
result.validate_state = VALIDATE_STATE_INITIAL;
|
||||
result.validate_state = VALIDATE_STATE_INIT;
|
||||
result.claimed_credit = 0;
|
||||
result.granted_credit = 0;
|
||||
}
|
||||
|
|
|
@ -144,7 +144,7 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
|
||||
wu.appid = app.id;
|
||||
wu.retry_check_time = time(0) + wu.delay_bound;
|
||||
wu.timeout_check_time = time(0) + wu.delay_bound;
|
||||
|
||||
retval = read_key_file(keyfile, key);
|
||||
if (retval) {
|
||||
|
|
Loading…
Reference in New Issue