From 46f84c5bbf47529c8882cb17473218b23c0ba78e Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 23 Jan 2003 08:07:48 +0000 Subject: [PATCH] file deletion and state svn path=/trunk/boinc/; revision=823 --- checkin_notes | 48 +++++++ db/db.h | 67 +++++----- db/db_mysql.C | 32 ++--- db/schema.sql | 8 +- doc/backend.html | 99 --------------- doc/backend_functions.html | 44 +++++++ doc/backend_programs.html | 207 +++++++++++++++++++++++++++++++ doc/backend_state.html | 220 +++++++++++++++++++++++++++++++++ doc/backend_work_sequence.html | 27 ++++ doc/create_project.html | 10 +- doc/garbage.html | 2 - doc/retry.html | 66 ---------- doc/validation.html | 77 ------------ doc/work.html | 18 +-- doc/work_states.html | 80 ------------ html/ops/db.inc | 61 +++++---- html/ops/db.php | 4 +- sched/Makefile.in | 1 + sched/assimilate_handler.C | 24 ++++ sched/assimilate_handler.h | 4 + sched/assimilator.C | 83 +++++++++---- sched/handle_request.C | 55 +++++---- sched/result_retry.C | 181 +++++++++++++++------------ sched/validate.C | 179 +++++++++++++++------------ stripchart/samples/datafiles | 3 + test/test.inc | 11 +- test/test_loop.php | 11 +- tools/backend_lib.C | 5 +- tools/create_work.C | 2 +- 29 files changed, 985 insertions(+), 644 deletions(-) delete mode 100644 doc/backend.html create mode 100644 doc/backend_functions.html create mode 100644 doc/backend_programs.html create mode 100644 doc/backend_state.html create mode 100644 doc/backend_work_sequence.html delete mode 100644 doc/garbage.html delete mode 100644 doc/retry.html delete mode 100644 doc/validation.html delete mode 100644 doc/work_states.html create mode 100644 sched/assimilate_handler.C create mode 100644 sched/assimilate_handler.h diff --git a/checkin_notes b/checkin_notes index adbca00a1b..0060380c98 100755 --- a/checkin_notes +++ b/checkin_notes @@ -2981,3 +2981,51 @@ David Jan 15 2003 test.inc tools/ backend_lib.C + +David Jan 22 2003 + - Changed some of the state fields of workunit and result, + 
to accommodate new policies about when to delete files + See doc/backend_* for details + - Split off dummy assimilation function from the main program. + - Added new stripchart scripts for logging DB counts + and directory sizes + + NOTE: it looks like shared memory segment is at same address + in feeder and cgi. So it's OK to use pointers in it. + + db/ + db.h + db_mysql.C + schema.sql + doc/ + backend.html (removed) + backend_functions.html (new) + backend_programs.html (new) + backend_state.html (new) + backend_work_sequence.html (new) + create_project.html + garbage.html (removed) + retry.html (removed) + work.html + work_states.html (removed) + html_ops/ + db.inc + db.php + sched/ + Makefile.in + assimilate_handler.C,h (new) + assimilator.C + handle_request.C + result_retry.C + validate.C + strichart/samples/ + datafiles + db_count (new) + db_looper (new) + dir_size (new) + test/ + test.inc + test_loop.php + tools/ + backend_lib.C + create_work.C diff --git a/db/db.h b/db/db.h index 37f9135c12..6f132c4b2f 100644 --- a/db/db.h +++ b/db/db.h @@ -183,11 +183,6 @@ struct HOST { int parse_net_stats(FILE*); }; -// values for main_state -#define WU_MAIN_STATE_INIT 0 -#define WU_MAIN_STATE_DONE 1 -#define WU_MAIN_STATE_ERROR 2 - // values for file_delete state #define FILE_DELETE_INIT 0 #define FILE_DELETE_READY 1 @@ -198,13 +193,15 @@ struct HOST { #define ASSIMILATE_READY 1 #define ASSIMILATE_DONE 2 -// values for error -#define SEND_FAIL 1 - // failed to send results for this WU -#define TOO_MANY_ERRORS 2 - // too many errors; may have bug -#define TOO_MANY_DONE 3 - // too many results without consensus; may be nondeterministic +// NOTE: there is no overall state for a WU +// (like done/not done) +// There's just a bunch of independent substates +// (file delete, assimilate, and states of results, error flags) + +// bit fields of error_mask +#define WU_ERROR_COULDNT_SEND_RESULT 1 +#define WU_ERROR_TOO_MANY_ERROR_RESULTS 2 +#define WU_ERROR_TOO_MANY_RESULTS 4 struct 
WORKUNIT { int id; @@ -218,13 +215,14 @@ struct WORKUNIT { double rsc_memory; // estimated size of RAM working set (bytes) double rsc_disk; // estimated amount of disk needed (bytes) bool need_validate; // this WU has at least 1 result in - // VALIDATE_STATE_NEED_CHECK state + // validate state = NEED_CHECK int canonical_resultid; // ID of canonical result, or zero double canonical_credit; // credit that all correct results get - double retry_check_time; // when to check for result retry - int delay_bound; // determines result deadline, retry check time - int main_state; // see values above - int error; + unsigned int timeout_check_time; // when to check for timeouts + // zero if no need to check + int delay_bound; // determines result deadline, + // timeout check time + int error_mask; // bitmask of errors (see above) int file_delete_state; int assimilate_state; int workseq_next; // if part of a sequence, the next WU @@ -235,30 +233,39 @@ struct WORKUNIT { #define RESULT_SERVER_STATE_INACTIVE 1 #define RESULT_SERVER_STATE_UNSENT 2 -#define RESULT_SERVER_STATE_IN_PROGRESS 3 -#define RESULT_SERVER_STATE_DONE 4 -#define RESULT_SERVER_STATE_TIMEOUT 5 -#define RESULT_SERVER_STATE_ERROR 6 -#define RESULT_SERVER_STATE_UNSENT_SEQ 7 +#define RESULT_SERVER_STATE_UNSENT_SEQ 3 // unsent, part of a work sequence +#define RESULT_SERVER_STATE_IN_PROGRESS 4 +#define RESULT_SERVER_STATE_OVER 5 + // we received a reply, timed out, or decided not to send. + // Note: we could get a reply even after timing out. 
-#define VALIDATE_STATE_INITIAL 0 -#define VALIDATE_STATE_NEED_CHECK 1 -#define VALIDATE_STATE_VALID 2 -#define VALIDATE_STATE_INVALID 3 +#define RESULT_OUTCOME_INIT 0 +#define RESULT_OUTCOME_SUCCESS 1 +#define RESULT_OUTCOME_COULDNT_SEND 2 +#define RESULT_OUTCOME_CLIENT_ERROR 3 +#define RESULT_OUTCOME_NO_REPLY 4 +#define RESULT_OUTCOME_DIDNT_NEED 5 + +#define VALIDATE_STATE_INIT 0 +#define VALIDATE_STATE_VALID 1 +#define VALIDATE_STATE_INVALID 2 struct RESULT { int id; unsigned int create_time; int workunitid; - int server_state; // server state (see above) + int server_state; // see above + int outcome; // see above; defined if server state OVER + int client_state; // phase when client error happened + // (download, compute, upload) + // Defined if outcome is CLIENT_ERROR + // and error details are in stderr_out int hostid; // host processing this result unsigned int report_deadline; // deadline for receiving result unsigned int sent_time; // when result was sent to host unsigned int received_time; // when result was received from host char name[256]; - int client_state; // records phase when error happened - // (download, compute, upload) double cpu_time; // CPU time used to complete result char xml_doc_in[MAX_BLOB_SIZE]; // descriptions of output files char xml_doc_out[MAX_BLOB_SIZE]; // MD5s of output files @@ -333,7 +340,7 @@ extern int db_workunit(int id, WORKUNIT&); extern int db_workunit_update(WORKUNIT& p); extern int db_workunit_lookup_name(WORKUNIT&); extern int db_workunit_enum_app_need_validate(WORKUNIT&); -extern int db_workunit_enum_retry_check_time(WORKUNIT&); +extern int db_workunit_enum_timeout_check_time(WORKUNIT&); extern int db_workunit_enum_file_delete_state(WORKUNIT&); extern int db_workunit_enum_app_assimilate_state(WORKUNIT&); diff --git a/db/db_mysql.C b/db/db_mysql.C index 22e1123952..5322dd0736 100644 --- a/db/db_mysql.C +++ b/db/db_mysql.C @@ -228,31 +228,33 @@ void BOINC_MYSQL_DB::struct_to_str(void* vp, char* q, int type) { 
"rsc_fpops=%f, rsc_iops=%f, rsc_memory=%f, rsc_disk=%f, " "need_validate=%d, " "canonical_resultid=%d, canonical_credit=%f, " - "retry_check_time=%f, delay_bound=%d, main_state=%d, " - "error=%d, file_delete_state=%d, assimilate_state=%d, " + "timeout_check_time=%d, delay_bound=%d, " + "error_mask=%d, file_delete_state=%d, assimilate_state=%d, " "workseq_next=%d", wup->id, wup->create_time, wup->appid, wup->name, wup->xml_doc, wup->batch, wup->rsc_fpops, wup->rsc_iops, wup->rsc_memory, wup->rsc_disk, wup->need_validate, wup->canonical_resultid, wup->canonical_credit, - wup->retry_check_time, wup->delay_bound, wup->main_state, - wup->error, wup->file_delete_state, wup->assimilate_state, + wup->timeout_check_time, wup->delay_bound, + wup->error_mask, wup->file_delete_state, wup->assimilate_state, wup->workseq_next ); break; case TYPE_RESULT: rp = (RESULT*)vp; sprintf(q, - "id=%d, create_time=%d, workunitid=%d, server_state=%d, " + "id=%d, create_time=%d, workunitid=%d, " + "server_state=%d, outcome=%d, client_state=%d, " "hostid=%d, report_deadline=%d, sent_time=%d, received_time=%d, " - "name='%s', client_state=%d, cpu_time=%f, " + "name='%s', cpu_time=%f, " "xml_doc_in='%s', xml_doc_out='%s', stderr_out='%s', " "batch=%d, file_delete_state=%d, validate_state=%d, " "claimed_credit=%f, granted_credit=%f", - rp->id, rp->create_time, rp->workunitid, rp->server_state, + rp->id, rp->create_time, rp->workunitid, + rp->server_state, rp->outcome, rp->client_state, rp->hostid, rp->report_deadline, rp->sent_time, rp->received_time, - rp->name, rp->client_state, rp->cpu_time, + rp->name, rp->cpu_time, rp->xml_doc_in, rp->xml_doc_out, rp->stderr_out, rp->batch, rp->file_delete_state, rp->validate_state, rp->claimed_credit, rp->granted_credit @@ -411,10 +413,9 @@ void BOINC_MYSQL_DB::row_to_struct(MYSQL_ROW& r, void* vp, int type) { wup->need_validate = atoi(r[i++]); wup->canonical_resultid = atoi(r[i++]); wup->canonical_credit = atof(r[i++]); - wup->retry_check_time = 
atof(r[i++]); + wup->timeout_check_time = atoi(r[i++]); wup->delay_bound = atoi(r[i++]); - wup->main_state = atoi(r[i++]); - wup->error = atoi(r[i++]); + wup->error_mask = atoi(r[i++]); wup->file_delete_state = atoi(r[i++]); wup->assimilate_state = atoi(r[i++]); wup->workseq_next = atoi(r[i++]); @@ -426,12 +427,13 @@ void BOINC_MYSQL_DB::row_to_struct(MYSQL_ROW& r, void* vp, int type) { rp->create_time = atoi(r[i++]); rp->workunitid = atoi(r[i++]); rp->server_state = atoi(r[i++]); + rp->outcome = atoi(r[i++]); + rp->client_state = atoi(r[i++]); rp->hostid = atoi(r[i++]); rp->report_deadline = atoi(r[i++]); rp->sent_time = atoi(r[i++]); rp->received_time = atoi(r[i++]); strcpy2(rp->name, r[i++]); - rp->client_state = atoi(r[i++]); rp->cpu_time = atof(r[i++]); strcpy2(rp->xml_doc_in, r[i++]); strcpy2(rp->xml_doc_out, r[i++]); @@ -655,14 +657,14 @@ int db_workunit_enum_app_assimilate_state(WORKUNIT& p) { return boinc_db.db_enum(e, &p, TYPE_WORKUNIT, buf); } -int db_workunit_enum_retry_check_time(WORKUNIT& p) { +int db_workunit_enum_timeout_check_time(WORKUNIT& p) { static ENUM e; char buf[256]; if (!e.active) { sprintf(buf, - "where appid=%d and retry_check_time > 0 and retry_check_time < %f", - p.appid, p.retry_check_time + "where appid=%d and timeout_check_time > 0 and timeout_check_time < %d", + p.appid, p.timeout_check_time ); } return boinc_db.db_enum(e, &p, TYPE_WORKUNIT, buf); diff --git a/db/schema.sql b/db/schema.sql index 8f21198e35..5b6f5a7812 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -135,10 +135,9 @@ create table workunit ( need_validate smallint not null, canonical_resultid integer not null, canonical_credit double not null, - retry_check_time double not null, + timeout_check_time integer not null, delay_bound integer not null, - main_state integer not null, - error integer not null, + error_mask integer not null, file_delete_state integer not null, assimilate_state integer not null, workseq_next integer not null, @@ -150,12 +149,13 @@ create 
table result ( create_time integer not null, workunitid integer not null, server_state integer not null, + outcome integer not null, + client_state integer not null, hostid integer not null, report_deadline integer not null, sent_time integer not null, received_time integer not null, name varchar(254) not null, - client_state integer not null, cpu_time double not null, xml_doc_in blob, xml_doc_out blob, diff --git a/doc/backend.html b/doc/backend.html deleted file mode 100644 index 99a89188b4..0000000000 --- a/doc/backend.html +++ /dev/null @@ -1,99 +0,0 @@ -The components of a project back end -

The components of a project back end

- -

-The back end of a BOINC project consists -of a number of components that together are responsible -for generating work and handling results. -Some of these components are supplied by BOINC; -other parts are project- or application-specific: - -
- -
- -

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
ComponentBOINC-supplied partproject-supplied part
-Work generator: generates work units, work sequences, results, -and the corresponding input files. - -Functions and programs that handle the details of -creating workunit, work sequence, and result database records. - -Programs or scripts that generate input files, -install them on data servers, -and call the BOINC functions. -
Result retry generator: -generates additional results if some of those initially sent are lost -not completed, or completed erroneously.A program, result_retry, that does the job.Some parameters used by result_retry.
Result validation and accounting: -compare redundant results; select a canonical result -representing the correct output, -and a canonical credit granted to users and hosts -that return the correct output.A program, validate, that contains the -basic logic for validation.An application-specific function, linked with validate, -that compares sets of redundant results.
Assimilator: parse output files of -completed results, and take appropriate action -(record results in a database, and/or generate more work) -A main program that enumerates unassimilated workunits -and updates the database. - -A function that assimilates a workunit, -either by processing its canonical result -or handling an error return. -
File deleter: delete input and output files -when they are no longer needed.A program, file_deleter, that does the job.None.
-

-Project that use work sequences will need two additional processes: -

- - - - - - - - - - - - - - - - -
ComponentBOINC-supplied partproject-supplied part
Work sequence relocater: -detects work sequences whose hosts have failed, -and relocates them to other hosts.A program seq_relocateSome parameters used by seq_relocate
Work sequence validation and accounting: -Similar to result validation, but for work sequences. -A program seq_validateAn application-specific function, -linked with seq_validate, that compares sets of redundant results.
diff --git a/doc/backend_functions.html b/doc/backend_functions.html new file mode 100644 index 0000000000..2a6eee1a35 --- /dev/null +++ b/doc/backend_functions.html @@ -0,0 +1,44 @@ +Back end functions +

Back end functions

+ +

+The general functions of a project back end include: +

diff --git a/doc/backend_programs.html b/doc/backend_programs.html new file mode 100644 index 0000000000..455635b802 --- /dev/null +++ b/doc/backend_programs.html @@ -0,0 +1,207 @@ +Back end programs + +

Back end programs

+ +

+A project back end is implemented as a set of programs. +Some parts of these programs are supplied by BOINC; +other parts are project- or application-specific: + +
+ +
+ +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ComponentBOINC-supplied partproject-supplied part
+Work generator: generates work units, results, +and the corresponding input files. + +Functions and programs that handle the details of +creating workunit and result database records. + +Programs or scripts that generate input files, +install them on data servers, and call the BOINC functions. +
Timeout check: +Checks for various timeout conditions, +such as result timeout. +Reissues results for workunits as needed. +A program timeout_check.Some parameters used by timeout_check.
Result validation and accounting: +compare redundant results; select a canonical result +representing the correct output, +and a canonical credit granted to users and hosts +that return the correct output.A program, validate, that contains the +basic logic for validation.An application-specific function, linked with validate, +that compares sets of redundant results.
Assimilator: +handles workunits that are "completed": +that is, which have a canonical result or for which +an error condition has occurred. +Handling a successfully completed result might involve +recording results in a database and perhaps generating more work. +A main program that enumerates unassimilated workunits, +calls a project-supplied "handler" function, +and updates the database. + +A handler function that assimilates a workunit, +either by processing its canonical result +or handling an error return. +
File deleter: delete input and output files +when they are no longer needed.A program file_deleter.None.
+ +

Timeout checker

+

+The timeout checker is passed the following parameters: + +max_errors +give up on a workunit if it gets this many error results (i.e., there must be a bug in the application). + +max_results +give up on a workunit if it gets this many +non-error results without finding a canonical result + +redundancy +try to get at least this many non-error results. + +application +which application to handle + +Use crontab to run timeout_check continuously. + +

+    for each WU with timeout_check_time < now
+        for each result of WU
+            if result.server_state=IN_PROGRESS and now > result.report_deadline
+                result.server_state = OVER
+                result.outcome = NO_REPLY
+        if any result has outcome COULDNT_SEND
+            wu.error_mask |= COULDNT_SEND
+            got_error = true
+        if too many error results
+            wu.error_mask |= TOO_MANY_ERROR_RESULTS
+            got_error = true
+        if too many results
+            wu.error_mask |= TOO_MANY_RESULTS
+            got_error = true
+        else
+            generate new results as needed
+
+        if got_error
+            for all results server_state UNSENT
+                result.server_state = OVER
+                result.outcome = DIDNT_NEED
+            if wu.assimilate_state == INIT
+                wu.assimilate_state = READY
+
+        if all results are OVER and wu.assimilate_state = DONE
+            wu.file_delete_state = READY
+            wu.timeout_check_time = 0
+        else
+            wu.timeout_check_time = now + delay_bound
+
+
+ +

Validater

+

+BOINC supplies a utility program validate +to perform validation and credit-granting. +This program must be linked with two project-specific functions: +

+int check_set(vector<RESULT> results, int& canonicalid, double& credit);
+int check_pair(RESULT& r1, RESULT& r2, bool& match);
+
+check_set() takes a set of results. +If there is sufficient agreement, +it selects one of them as the "canonical" result +(returning its ID) and also decides what credit should +be granted for correct results for this workunit. +

+check_pair() compares two results and returns match=true +if they agree. + +

+The file validate_test.C contains an example +implementation of check_set() and check_pair(). + +

+    for each WU with need_validate = true
+        if already have canonical result
+            for each result with validate_state = INIT and outcome = SUCCESS
+                if matches canonical, grant credit
+                set result.validate_state to VALID or INVALID
+        else
+            build set of results with outcome = SUCCESS
+            if find canonical result
+                wu.assimilate_state = READY
+                for all results server_state = UNSENT
+                    result.server_state = OVER
+                    result.outcome = DIDNT_NEED
+
+ +

scheduler

+
+    - when send a result
+        result.server_state = IN_PROGRESS
+        result.report_deadline = now + wu.delay_bound
+        ??? should do lookup before updating?  shmem may be stale
+            doesn't matter; can't be stale
+    - when receive a result
+        switch result.server_state
+        client_state = (from reply msg)
+        case IN_PROGRESS:
+            result.server_state = OVER
+        case OVER:
+            result.file_delete_state = READY;
+
+        if client_state is DONE
+            result.outcome = SUCCESS
+            wu.need_validate = true
+        else
+            result.outcome = CLIENT_ERROR
+            result.validate_state = INVALID
+
+
+ +

Assimilator

+
+    for each WU with assimilate_state = READY
+        call project-specific handler function
+            NOTE: canonical_resultid and error_mask are not mutually exclusive
+        if all results are OVER with outcomes SUCCESS or CLIENT_ERROR
+            set result.file_delete_state = READY for all results
+        else
+            for each non-canonical result
+                if state is OVER and outcome is SUCCESS or CLIENT_ERROR
+                    set result.file_delete_state = READY
+        wu.assimilate_state = DONE
+        if all results are OVER
+            wu.file_delete_state = READY
+
diff --git a/doc/backend_state.html b/doc/backend_state.html new file mode 100644 index 0000000000..d67324bf84 --- /dev/null +++ b/doc/backend_state.html @@ -0,0 +1,220 @@ +Back end state transitions +

Back end state transitions

+ +

+The processing of workunits and results involves +several independent activities. +To keep track of these activities, +workunit and result database records have several "state" fields, +and their processing can be viewed as the combination +of several finite-state machines. + +

+A workunit has the following state fields: +

+Invariants: + + + +

+ +A result has the following state fields: +

+ +Invariants: + +

A note on scheduling

+

+ - when is it feasible to send a result to a host? + Request msg should include X = amount of work currently queued. + TODO: include % time active in calculation?? + Decision for each WU: + is X + time for WUs sent so far < delay_bound? + - When is a result declared "unsendable"? + Not a good idea to do on the basis of time; + do it only if a result is flushed from FIFO (see below) + +

State transitions

+
+
+fields of "result" table:
+
+server_state
+    UNSENT
+        (on creation)
+    IN_PROGRESS
+        from UNSENT
+            scheduler: when send
+    OVER
+        from IN_PROGRESS
+            scheduler: get reply from host
+            timeout_check: now > report_deadline
+        from UNSENT
+            validate: got canonical result for this WU and server_state=UNSENT
+            timeout_check: WU has error
+
+file_delete_state
+    INIT
+        (on creation)
+    READY
+        from INIT:
+            scheduler: got reply and server_state = OVER
+            timeout_check: all results are OVER
+            assimilator: all results are OVER or result is not canonical
+        from DONE:
+            scheduler: got reply and server_state = OVER
+    DONE
+        from READY
+            file_deleter: tried to delete files
+
+validate_state
+    INIT
+    VALID
+        from INIT:
+            validate: outcome = SUCCESS and matched canonical result
+    INVALID
+        from INIT:
+            scheduler: got reply, client error
+            validate: didn't match canonical result
+
+-------------
+fields of "workunit" table
+
+need_validate
+    FALSE
+        (on creation)
+        from TRUE:
+            validate: done checking
+    TRUE
+        from FALSE:
+            scheduler: got reply w/ client_state = DONE (i.e. no error)
+
+file_delete_state
+    INIT
+        (on creation)
+    READY
+        timeout_check: all results have server_state=OVER
+            and wu.assimilate_state = DONE
+        assimilate:
+            all results have server_state = OVER
+            (and wu.assimilate_state = DONE)
+    DONE
+
+assimilate_state
+    INIT
+        (on creation)
+    READY
+        from INIT:
+            timeout_check: WU has error
+            validate: found canonical result
+    DONE
+        from READY:
+            assimilator: done
+
+error_mask
+    COULDNT_SEND
+        timeout_check: some result has outcome COULDNT_SEND
+    TOO_MANY_ERROR_RESULTS
+        timeout_check: too many error results
+    TOO_MANY_RESULTS
+        timeout_check: too many results
+
+timeout_check_time:
+    nonzero
+        (on creation)
+    zero
+        timeout_check: all results are OVER and assimilate_state = DONE
+
diff --git a/doc/backend_work_sequence.html b/doc/backend_work_sequence.html new file mode 100644 index 0000000000..b1de167a75 --- /dev/null +++ b/doc/backend_work_sequence.html @@ -0,0 +1,27 @@ +Back ends and work sequences +

Back ends and work sequences

+

+Projects that use work sequences will need two additional processes: +

+ + + + + + + + + + + + + + + + +
ComponentBOINC-supplied partproject-supplied part
Work sequence relocater: +detects work sequences whose hosts have failed, +and relocates them to other hosts.A program seq_relocateSome parameters used by seq_relocate
Work sequence validation and accounting: +Similar to result validation, but for work sequences. +A program seq_validateAn application-specific function, +linked with seq_validate, that compares sets of redundant results.
diff --git a/doc/create_project.html b/doc/create_project.html index 5f767d341e..54bdcb93f6 100644 --- a/doc/create_project.html +++ b/doc/create_project.html @@ -68,13 +68,11 @@ Developing and operating a BOINC project back end How to feed work into a BOINC system and collect the results.

diff --git a/doc/garbage.html b/doc/garbage.html deleted file mode 100644 index 0129a4ee05..0000000000 --- a/doc/garbage.html +++ /dev/null @@ -1,2 +0,0 @@ -Garbage collection -

Garbage collection

diff --git a/doc/retry.html b/doc/retry.html deleted file mode 100644 index 5329c2a2cf..0000000000 --- a/doc/retry.html +++ /dev/null @@ -1,66 +0,0 @@ -Generating result retries -

Generating result retries

- -

-Hosts may fail to process and return results for various reasons; -such results are said to be lost. -A combination of lost and erroneous results may prevent -finding canonical result for a workunit. -The result retry mechanism generates additional -results as needed to find a canonical result. - -

-The result retry mechanism has the following project-supplied parameters: -

- -

-Each workunit has a retry check time. -This is initially set to now + DWU, -and is set to zero if a canonical result is found for the WU. - -

-Each result has a deadline, -a time by which a confirmation is expected for the result. -This is initially set to now + Dresult, - -

-Retry generation is handled by the program result_retry, invoked as -

-result_retry -appname name
-
-This program continually checks for workunits past their check time -and without pending validation. -For each such workunit, the program does the following: - - - -

-Use crontab to run result_retry continuously. diff --git a/doc/validation.html b/doc/validation.html deleted file mode 100644 index 256ab6b323..0000000000 --- a/doc/validation.html +++ /dev/null @@ -1,77 +0,0 @@ -Validation of results and credit - -

Validation of results and credit

-

-Communication from the core client can easily be altered or forged. -Some users will attempt to get undeserved credit -by falsifying their CPU metrics or CPU times. -Output files may be wrong. -This can result from tampering or hardware failures. -

-Both problems - credit-cheating and wrong results - can be addressed -by redundant computing and result validation. -In this approach, each workunit is processed at least twice. -The project back end waits until a minimum number of results have been returned, -then compares the results and decides which are considered correct. -The notion of equality of results, -and the policy for deciding which are correct, -are project-specific. -

-The back end then marks correct results as "validated", -finds the minimum reported credit for the correct results of a given workunit, -and assigns this amount of credit to all the correct results. -This ensures that as long as a reasonable majority of participants -don't falsify credit, almost all credit accounting will be correct. - -

The validation program

- -BOINC supplies a utility program validate -to perform validation and credit-granting. -This program must be linked with two project-specific functions: -
-int check_set(vector results, int& canonicalid, double& credit);
-int check_pair(RESULT& r1, RESULT& r2, bool& match);
-
-check_set() takes a set of results. -If there is sufficient agreement, -it selects one of them as the "canonical" result -(returning its ID) and also decides what credit should -be granted for correct results for this workunit. -

-check_pair() compares two results and returns match=true -if they agree. - -

-The file validate_test.C contains an example -implementation of check_set() and check_pair(). - - -


-

Implementation

- -The following database fields are used: - -

-WORKUNIT -

bool need_validate -
- true iff this workunit has one or more results in state DONE - and validate_state NEED_CHECK -
- int canonical_resultid -
- nonzero if a conclusive check has been done for this WU; - indicates the canonical result - -

-RESULT -

- int state -
INACTIVE, ..., DONE, ERROR -
- int validate_state -
- NEED_CHECK, - VALID, - INVALID - diff --git a/doc/work.html b/doc/work.html index cc85171cc3..0324e935c8 100644 --- a/doc/work.html +++ b/doc/work.html @@ -2,6 +2,9 @@

Workunits

+TODO: don't separate into XML/other parts; +describe XML format only in the tools doc. +

A workunit describes a computation to be performed. Workunits are maintained in the workunit table in the BOINC DB. The attributes of a workunit include: @@ -13,13 +16,6 @@ The attributes of a workunit include: (see below).

  • The estimated resource requirements of the work unit (computation, memory, disk space). -
  • The maximum processing -(measured in Cobblestones) -and maximum disk space to be used for the computation. -An instance of the computation that exceeds either of these bounds -will be aborted. -This mechanism is used to prevent an infinite-loop bug from -indefinitely incapacitating a host.
  • A delay bound: upper bound on how long (in real time) a result associated with this work unit should take to complete. This determines which hosts the workunit can be sent to, @@ -49,8 +45,12 @@ The components are: A list of environment variables in the form name=value&name=value&name=value. -<max_processing> -Maximum processing (in Cobblestones). +<max_processing> +Maximum processing +(measured in Cobblestones). +An instance of the computation that exceeds this bound will be aborted. +This mechanism prevents an infinite-loop bug from +indefinitely incapacitating a host. The default is determined by the client; typically it is 1. <max_disk> diff --git a/doc/work_states.html b/doc/work_states.html deleted file mode 100644 index 9bf7dcaa21..0000000000 --- a/doc/work_states.html +++ /dev/null @@ -1,80 +0,0 @@ -Workunit and result states -

    Workunit and result states

    - -

    -The processing of workunits and results involves -several independent activities. -To keep track of these activities, -workunit and result database records have several "state" fields, -and their processing can be viewed as the combination -of several finite-state machines. - -

    -A workunit has the following state fields: -

      -
    • -main_state: -Initially INIT. -If a canonical result is found (by validate) -it transitions to DONE. -If it's decided that no canonical result can be found -(by result_retry) -it transitions to ERROR. - -
    • -file_delete_state: -Initially INIT. -When the main state transitions to either DONE or ERROR, -it transitions to READY, -indicating that input files can be deleted. -When file deletion is completed (by file_deleter) -it transitions to DONE. - -
    • -assimilate_state: -Initially INIT. -When the main state transitions to either DONE or ERROR, -it transitions to READY, -indicating that the workunit can be assimilated. -When assimilation is completed (by assimilator) -it transitions to DONE. -
    • -need_validate: -A boolean, true whenever -the workunit has a result whose validate state is NEED_CHECK. -The validate program sets it back to false. - -
    - - -

    - -A result has the following state fields: -

      -
    • -server_state: -Initially UNSENT. -Becomes IN_PROGRESS when the result has been sent to a client. -Becomes DONE or ERROR if the client returns the result, -or TIMEOUT if result_retry gives up on the result. -
    • -client_state: -Records the client state (upload, process, or download) -where an error occurred. -Not relevant here. -
    • -file_delete_state: -Initially INIT. -When the parent workunit becomes ERROR, becomes READY, -indicating that the output files can be deleted. -When the parent workunit becomes DONE, becomes READY -except for the canonical result, which remains INIT. -When the output files have been deleted (by file_deleter) becomes DONE. -
    • -validate_state: -Initially INITIAL. -When the result is returned, becomes NEED_CHECK. -When a canonical result has been found for the workunit, -becomes either VALID or INVALID. -
    diff --git a/html/ops/db.inc b/html/ops/db.inc index 356346ee3f..7b14c662f0 100644 --- a/html/ops/db.inc +++ b/html/ops/db.inc @@ -112,22 +112,13 @@ function show_host($host) { } -function main_state_str($s) { - switch($s) { - case 0: return "Initial"; - case 1: return "Done"; - case 2: return "Error"; - } -} - -function error_str($s) { - switch($s) { - case 0: return "No error"; - case 1: return "Send failed"; - case 2: return "Too many errors (may have bug)"; - case 3: return "Too many done (may be nondeterministic)"; - } - return "unknown"; +function wu_error_mask_str($s) { + $x = ""; + if ($s & 1) $x = $x."Couldn't send result; "; + if ($s & 2) $x = $x."Too many errors (may have bug); "; + if ($s & 4) $x = $x."Too many results (may be nondeterministic)"; + $x = $x."
    "; + return $x; } function assimilate_state_str($s) { @@ -162,10 +153,9 @@ function show_workunit($wu,$show_xml_doc) { row("Need validate?", $wu->need_validate?"yes":"no"); row("Canonical resultid", $wu->canonical_resultid); row("Canonical credit", $wu->canonical_credit); - row("Retry check time", time_str($wu->retry_check_time)); + row("Timeout check time", time_str($wu->timeout_check_time)); row("Delay bound", $wu->delay_bound); - row("Main state", main_state_str($wu->main_state)); - row("Error", error_str($wu->error)); + row("Error mask", wu_error_mask_str($wu->error_mask)); row("File delete state", file_delete_state_str($wu->file_delete_state)); row("Assimilation state", assimilate_state_str($wu->assimilate_state)); row("","id>Show associated results"); @@ -173,19 +163,28 @@ function show_workunit($wu,$show_xml_doc) { echo "

    "; } -function res_server_state_string($s) { +function result_server_state_string($s) { switch($s) { case 1: return "Inactive"; case 2: return "Unsent"; - case 3: return "In Progress"; - case 4: return "Done"; - case 5: return "Timeout"; - case 6: return "Error"; + case 4: return "In Progress"; + case 5: return "Over"; } return "unknown"; } -function result_client_state_str($s) { +function result_outcome_string($s) { + switch($s) { + case 1: return "Success"; + case 2: return "Couldn't send"; + case 3: return "Client error"; + case 4: return "No reply"; + case 5: return "Didn't need"; + } + return "unknown"; +} + +function result_client_state_string($s) { switch($s) { case 1: return "Downloading"; case 2: return "Computing"; @@ -197,9 +196,8 @@ function result_client_state_str($s) { function validate_state_str($s) { switch($s) { case 0: return "Initial"; - case 1: return "Need check"; - case 2: return "Valid"; - case 3: return "Invalid"; + case 1: return "Valid"; + case 2: return "Invalid"; } return "unknown"; } @@ -213,10 +211,11 @@ function show_result($result,$show_xml_docs,$show_stderr,$show_times) { } row("Name", $result->name); row("Workunit", "workunitid>" . wu_name_by_id($result->workunitid) . "" ); - row("Server state", res_server_state_string($result->server_state)); - row("Deadline", time_str($result->report_deadline)); + row("Server state", result_server_state_string($result->server_state)); + row("Outcome", result_outcome_string($result->outcome)); + row("Client state", result_client_state_string($result->client_state)); row("Host ID", "hostid>" . host_name_by_id($result->hostid) . ""); - row("Client state", result_client_state_str($result->client_state)); + row("Report deadline", time_str($result->report_deadline)); row("CPU time", $result->cpu_time); if ($show_xml_docs) { row("XML doc in", "

    ".htmlspecialchars($result->xml_doc_in)."
    "); diff --git a/html/ops/db.php b/html/ops/db.php index be0d158506..2725ed83ca 100644 --- a/html/ops/db.php +++ b/html/ops/db.php @@ -66,7 +66,7 @@ if (strlen($result_state) && $result_state != 0) { $query = append_sql_query( $query, "server_state = $result_state", $first ); - $english_query = append_sql_query( $english_query, "server state is ".res_server_state_string($result_state), $first ); + $english_query = append_sql_query( $english_query, "server state is ".result_server_state_string($result_state), $first ); $rstate = $result_state; $first = 0; } else { @@ -172,7 +172,7 @@ . "