file deletion and state

svn path=/trunk/boinc/; revision=823
2003-01-23 08:07:48 +00:00 · 2003-01-23 08:07:48 +00:00 · 46f84c5bbf
parent d82670df96
commit 46f84c5bbf
29 changed files with 985 additions and 644 deletions
--- a/48
+++ b/48
@ -2981,3 +2981,51 @@ David  Jan 15 2003
        test.inc
    tools/
        backend_lib.C
+
+David  Jan 22 2003
+    - Changed some of the state fields of workunit and result,
+        to accommodate new policies about when to delete files
+        See doc/backend_* for details
+    - Split off dummy assimilation function from the main program.
+    - Added new stripchart scripts for logging DB counts
+        and directory sizes
+
+    NOTE: it looks like shared memory segment is at same address
+    in feeder and cgi.  So it's OK to use pointers in it.
+
+    db/
+        db.h
+        db_mysql.C
+        schema.sql
+    doc/
+        backend.html (removed)
+        backend_functions.html (new)
+        backend_programs.html (new)
+        backend_state.html (new)
+        backend_work_sequence.html (new)
+        create_project.html
+        garbage.html (removed)
+        retry.html (removed)
+        work.html
+        work_states.html (removed)
+    html_ops/
+        db.inc
+        db.php
+    sched/
+        Makefile.in
+        assimilate_handler.C,h (new)
+        assimilator.C
+        handle_request.C
+        result_retry.C
+        validate.C
+    stripchart/samples/
+        datafiles
+        db_count (new)
+        db_looper (new)
+        dir_size (new)
+    test/
+        test.inc
+        test_loop.php
+    tools/
+        backend_lib.C
+        create_work.C
--- a/db/db.h
+++ b/db/db.h
@ -183,11 +183,6 @@ struct HOST {
    int parse_net_stats(FILE*);
 };

-// values for main_state
-#define WU_MAIN_STATE_INIT      0
-#define WU_MAIN_STATE_DONE      1
-#define WU_MAIN_STATE_ERROR     2
-
 // values for file_delete state
 #define FILE_DELETE_INIT        0
 #define FILE_DELETE_READY       1
@ -198,13 +193,15 @@ struct HOST {
 #define ASSIMILATE_READY        1
 #define ASSIMILATE_DONE         2

-// values for error
-#define SEND_FAIL               1
-    // failed to send results for this WU
-#define TOO_MANY_ERRORS         2
-    // too many errors; may have bug
-#define TOO_MANY_DONE           3
-    // too many results without consensus; may be nondeterministic
+// NOTE: there is no overall state for a WU
+// (like done/not done)
+// There's just a bunch of independent substates
+// (file delete, assimilate, and states of results, error flags)
+
+// bit fields of error_mask
+#define WU_ERROR_COULDNT_SEND_RESULT        1
+#define WU_ERROR_TOO_MANY_ERROR_RESULTS     2
+#define WU_ERROR_TOO_MANY_RESULTS            4

 struct WORKUNIT {
    int id;
@ -218,13 +215,14 @@ struct WORKUNIT {
    double rsc_memory;          // estimated size of RAM working set (bytes)
    double rsc_disk;            // estimated amount of disk needed (bytes)
    bool need_validate;         // this WU has at least 1 result in
-                                // VALIDATE_STATE_NEED_CHECK state
+                                // validate state = NEED_CHECK
    int canonical_resultid;     // ID of canonical result, or zero
    double canonical_credit;    // credit that all correct results get
-    double retry_check_time;    // when to check for result retry
-    int delay_bound;            // determines result deadline, retry check time
-    int main_state;             // see values above
-    int error;
+    unsigned int timeout_check_time;  // when to check for timeouts
+                                // zero if no need to check
+    int delay_bound;            // determines result deadline,
+                                // timeout check time
+    int error_mask;             // bitmask of errors (see above)
    int file_delete_state;
    int assimilate_state;
    int workseq_next;           // if part of a sequence, the next WU
@ -235,30 +233,39 @@ struct WORKUNIT {

 #define RESULT_SERVER_STATE_INACTIVE       1
 #define RESULT_SERVER_STATE_UNSENT         2
-#define RESULT_SERVER_STATE_IN_PROGRESS    3
-#define RESULT_SERVER_STATE_DONE           4
-#define RESULT_SERVER_STATE_TIMEOUT        5
-#define RESULT_SERVER_STATE_ERROR          6
-#define RESULT_SERVER_STATE_UNSENT_SEQ     7
+#define RESULT_SERVER_STATE_UNSENT_SEQ     3
    // unsent, part of a work sequence
+#define RESULT_SERVER_STATE_IN_PROGRESS    4
+#define RESULT_SERVER_STATE_OVER           5
+    // we received a reply, timed out, or decided not to send.
+    // Note: we could get a reply even after timing out.

-#define VALIDATE_STATE_INITIAL      0
-#define VALIDATE_STATE_NEED_CHECK   1
-#define VALIDATE_STATE_VALID        2
-#define VALIDATE_STATE_INVALID      3
+#define RESULT_OUTCOME_INIT             0
+#define RESULT_OUTCOME_SUCCESS          1
+#define RESULT_OUTCOME_COULDNT_SEND     2
+#define RESULT_OUTCOME_CLIENT_ERROR     3
+#define RESULT_OUTCOME_NO_REPLY         4
+#define RESULT_OUTCOME_DIDNT_NEED       5
+
+#define VALIDATE_STATE_INIT         0
+#define VALIDATE_STATE_VALID        1
+#define VALIDATE_STATE_INVALID      2

 struct RESULT {
    int id;
    unsigned int create_time;
    int workunitid;
-    int server_state;               // server state (see above)
+    int server_state;               // see above
+    int outcome;                    // see above; defined if server state OVER
+    int client_state;               // phase when client error happened
+                                    // (download, compute, upload)
+                                    // Defined if outcome is CLIENT_ERROR
+                                    // and error details are in stderr_out
    int hostid;                     // host processing this result
    unsigned int report_deadline;   // deadline for receiving result
    unsigned int sent_time;         // when result was sent to host
    unsigned int received_time;     // when result was received from host
    char name[256];
-    int client_state;               // records phase when error happened
-                                    // (download, compute, upload)
    double cpu_time;                // CPU time used to complete result
    char xml_doc_in[MAX_BLOB_SIZE];     // descriptions of output files
    char xml_doc_out[MAX_BLOB_SIZE];    // MD5s of output files
@ -333,7 +340,7 @@ extern int db_workunit(int id, WORKUNIT&);
 extern int db_workunit_update(WORKUNIT& p);
 extern int db_workunit_lookup_name(WORKUNIT&);
 extern int db_workunit_enum_app_need_validate(WORKUNIT&);
-extern int db_workunit_enum_retry_check_time(WORKUNIT&);
+extern int db_workunit_enum_timeout_check_time(WORKUNIT&);
 extern int db_workunit_enum_file_delete_state(WORKUNIT&);
 extern int db_workunit_enum_app_assimilate_state(WORKUNIT&);

--- a/db/db_mysql.C
+++ b/db/db_mysql.C
@ -228,31 +228,33 @@ void BOINC_MYSQL_DB::struct_to_str(void* vp, char* q, int type) {
            "rsc_fpops=%f, rsc_iops=%f, rsc_memory=%f, rsc_disk=%f, "
            "need_validate=%d, "
            "canonical_resultid=%d, canonical_credit=%f, "
-            "retry_check_time=%f, delay_bound=%d, main_state=%d, "
-            "error=%d, file_delete_state=%d, assimilate_state=%d, "
+            "timeout_check_time=%d, delay_bound=%d, "
+            "error_mask=%d, file_delete_state=%d, assimilate_state=%d, "
            "workseq_next=%d",
            wup->id, wup->create_time, wup->appid,
            wup->name, wup->xml_doc, wup->batch,
            wup->rsc_fpops, wup->rsc_iops, wup->rsc_memory, wup->rsc_disk, 
            wup->need_validate,
            wup->canonical_resultid, wup->canonical_credit,
-            wup->retry_check_time, wup->delay_bound, wup->main_state,
-            wup->error, wup->file_delete_state, wup->assimilate_state,
+            wup->timeout_check_time, wup->delay_bound,
+            wup->error_mask, wup->file_delete_state, wup->assimilate_state,
            wup->workseq_next
        );
        break;
    case TYPE_RESULT:
        rp = (RESULT*)vp;
        sprintf(q,
-            "id=%d, create_time=%d, workunitid=%d, server_state=%d, "
+            "id=%d, create_time=%d, workunitid=%d, "
+            "server_state=%d, outcome=%d, client_state=%d, "
            "hostid=%d, report_deadline=%d, sent_time=%d, received_time=%d, "
-            "name='%s', client_state=%d, cpu_time=%f, "
+            "name='%s', cpu_time=%f, "
            "xml_doc_in='%s', xml_doc_out='%s', stderr_out='%s', "
            "batch=%d, file_delete_state=%d, validate_state=%d, "
            "claimed_credit=%f, granted_credit=%f",
-            rp->id, rp->create_time, rp->workunitid, rp->server_state,
+            rp->id, rp->create_time, rp->workunitid,
+            rp->server_state, rp->outcome, rp->client_state,
            rp->hostid, rp->report_deadline, rp->sent_time, rp->received_time,
-            rp->name, rp->client_state, rp->cpu_time,
+            rp->name, rp->cpu_time,
            rp->xml_doc_in, rp->xml_doc_out, rp->stderr_out,
            rp->batch, rp->file_delete_state, rp->validate_state,
            rp->claimed_credit, rp->granted_credit
@ -411,10 +413,9 @@ void BOINC_MYSQL_DB::row_to_struct(MYSQL_ROW& r, void* vp, int type) {
        wup->need_validate = atoi(r[i++]);
        wup->canonical_resultid = atoi(r[i++]);
        wup->canonical_credit = atof(r[i++]);
-        wup->retry_check_time = atof(r[i++]);
+        wup->timeout_check_time = atoi(r[i++]);
        wup->delay_bound = atoi(r[i++]);
-        wup->main_state = atoi(r[i++]);
-        wup->error = atoi(r[i++]);
+        wup->error_mask = atoi(r[i++]);
        wup->file_delete_state = atoi(r[i++]);
        wup->assimilate_state = atoi(r[i++]);
        wup->workseq_next = atoi(r[i++]);
@ -426,12 +427,13 @@ void BOINC_MYSQL_DB::row_to_struct(MYSQL_ROW& r, void* vp, int type) {
        rp->create_time = atoi(r[i++]);
        rp->workunitid = atoi(r[i++]);
        rp->server_state = atoi(r[i++]);
+        rp->outcome = atoi(r[i++]);
+        rp->client_state = atoi(r[i++]);
        rp->hostid = atoi(r[i++]);
        rp->report_deadline = atoi(r[i++]);
        rp->sent_time = atoi(r[i++]);
        rp->received_time = atoi(r[i++]);
        strcpy2(rp->name, r[i++]);
-        rp->client_state = atoi(r[i++]);
        rp->cpu_time = atof(r[i++]);
        strcpy2(rp->xml_doc_in, r[i++]);
        strcpy2(rp->xml_doc_out, r[i++]);
@ -655,14 +657,14 @@ int db_workunit_enum_app_assimilate_state(WORKUNIT& p) {
    return boinc_db.db_enum(e, &p, TYPE_WORKUNIT, buf);
 }

-int db_workunit_enum_retry_check_time(WORKUNIT& p) {
+int db_workunit_enum_timeout_check_time(WORKUNIT& p) {
    static ENUM e;
    char buf[256];

    if (!e.active) {
        sprintf(buf,
-            "where appid=%d and retry_check_time > 0 and retry_check_time < %f",
-            p.appid, p.retry_check_time
+            "where appid=%d and timeout_check_time > 0 and timeout_check_time < %d",
+            p.appid, p.timeout_check_time
        );
    }
    return boinc_db.db_enum(e, &p, TYPE_WORKUNIT, buf);
--- a/db/schema.sql
+++ b/db/schema.sql
@ -135,10 +135,9 @@ create table workunit (
    need_validate   smallint    not null,
    canonical_resultid integer  not null,
    canonical_credit double     not null,
-    retry_check_time double     not null,
+    timeout_check_time integer  not null,
    delay_bound     integer     not null,
-    main_state      integer     not null,
-    error           integer     not null,
+    error_mask      integer     not null,
    file_delete_state   integer not null,
    assimilate_state    integer not null,
    workseq_next    integer     not null,
@ -150,12 +149,13 @@ create table result (
    create_time     integer     not null,
    workunitid      integer     not null,
    server_state    integer     not null,
+    outcome         integer     not null,
+    client_state    integer     not null,
    hostid          integer     not null,
    report_deadline integer     not null,
    sent_time       integer     not null,
    received_time   integer     not null,
    name            varchar(254) not null,
-    client_state    integer     not null,
    cpu_time        double       not null,
    xml_doc_in      blob,
    xml_doc_out     blob,
--- a/doc/backend.html
+++ b/doc/backend.html
@ -1,99 +0,0 @@
-<title>The components of a project back end</title>
-<h2>The components of a project back end</h2>
-
-<p>
-The <b>back end</b> of a BOINC project consists
-of a number of components that together are responsible
-for generating work and handling results.
-Some of these components are supplied by BOINC;
-other parts are project- or application-specific:
-
-<br>
-<img vspace=10 src=backend.png>
-<br>
-
-<p>
-<table border=1 cellpadding=8>
-<tr>
-<th>Component</th>
-<th>BOINC-supplied part</th>
-<th>project-supplied part</th>
-</tr>
-<tr>
-<td valign=top>
-<b>Work generator</b>: generates work units, work sequences, results,
-and the corresponding input files.
-</td>
-<td valign=top>
-Functions and programs that handle the details of
-creating workunit, work sequence, and result database records.
-</td>
-<td valign=top>
-Programs or scripts that generate input files,
-install them on data servers,
-and call the BOINC functions.
-</td></tr>
-<tr>
-<td valign=top><b>Result retry generator</b>:
-generates additional results if some of those initially sent are lost
-not completed, or completed erroneously.</td>
-<td valign=top>A program, <b>result_retry</b>, that does the job.</td>
-<td valign=top>Some parameters used by result_retry.</td>
-</tr>
-<tr>
-<td valign=top><b>Result validation and accounting</b>:
-compare redundant results; select a <b>canonical result</b>
-representing the correct output,
-and a <b>canonical credit</b> granted to users and hosts
-that return the correct output.</td>
-<td valign=top>A program, <b>validate</b>, that contains the
-basic logic for validation.</td>
-<td valign=top>An application-specific function, linked with <b>validate</b>,
-that compares sets of redundant results.</td>
-</tr>
-<tr>
-<td valign=top><b>Assimilator</b>: parse output files of
-completed results, and take appropriate action
-(record results in a database, and/or generate more work)</td>
-<td valign=top>
-A main program that enumerates unassimilated workunits
-and updates the database.
-</td>
-<td valign=top>
-A function that assimilates a workunit,
-either by processing its canonical result
-or handling an error return.
-</td>
-</tr>
-<tr>
-<td valign=top><b>File deleter</b>: delete input and output files
-when they are no longer needed.</td>
-<td valign=top>A program, <b>file_deleter</b>, that does the job.</td>
-<td valign=top>None.</td>
-</tr>
-</table>
-<p>
-Project that use work sequences will need two additional processes:
-<p>
-<table border=1 cellpadding=8>
-<tr>
-<th>Component</th>
-<th>BOINC-supplied part</th>
-<th>project-supplied part</th>
-</tr>
-<tr>
-<td valign=top><b>Work sequence relocater</b>:
-detects work sequences whose hosts have failed,
-and relocates them to other hosts.</td>
-<td valign=top>A program <b>seq_relocate</b></td>
-<td valign=top>Some parameters used by seq_relocate</td>
-</tr>
-<tr>
-<td valign=top><b>Work sequence validation and accounting</b>:
-Similar to result validation, but for work sequences.
-</td>
-<td valign=top>A program <b>seq_validate</b></td>
-<td valign=top>An application-specific function,
-linked with <b>seq_validate</b>, that compares sets of redundant results.</td>
-</tr>
-</table>
--- a/doc/backend_functions.html
+++ b/doc/backend_functions.html
@ -0,0 +1,44 @@
+<title>Back end functions</title>
+<h2>Back end functions</h2>
+
+<p>
+The general functions of a project back end include:
+<ul>
+<li> <b>Generate work</b>.
+<p>
+<li> <b>Reissue results</b>.
+Hosts may fail to return results for various reasons;
+such results are "lost".
+Lost and erroneous results may prevent
+finding a canonical result for a workunit.
+The "result reissue" mechanism generates additional
+results as needed to find a canonical result.
+
+<p>
+<li> <b>Select canonical results</b>.
+Communication from the core client can easily be altered or forged.
+Output files may be wrong.
+This can result from tampering or hardware failures.
+This problem can be addressed
+by <b>redundant computing</b>.
+In this approach, each workunit is processed at least twice.
+The project back end waits until a minimum number of results have been returned, then compares the results and decides which are considered correct.
+The notion of equality of results,
+and the policy for deciding which are correct, are project-specific.
+<p>
+
+<li> <b>Grant credit</b>.
+Some users will attempt to get undeserved credit
+by falsifying their CPU metrics or CPU times.
+The back end
+finds the minimum reported credit for the correct results of a given workunit,
+and assigns this amount of credit to all the correct results.
+This ensures that as long as a reasonable majority of participants
+don't falsify credit, almost all credit accounting will be correct.
+<p>
+<li> <b>Assimilate results</b>.
+<p>
+<li> <b>Delete files</b>.
+<p>
+<li> <b>Detect "problem" workunits</b>.
+</ul>
--- a/doc/backend_programs.html
+++ b/doc/backend_programs.html
@ -0,0 +1,207 @@
+<title>Back end programs</title>
+<body bgcolor=ffffff>
+<h2>Back end programs</h2>
+
+<p>
+A project back end is implemented as a set of programs.
+Some parts of these programs are supplied by BOINC;
+other parts are project- or application-specific:
+
+<br>
+<img vspace=10 src=backend.png>
+<br>
+
+<p>
+<table border=1 cellpadding=8>
+<tr>
+<th>Component</th>
+<th>BOINC-supplied part</th>
+<th>project-supplied part</th>
+</tr>
+<tr>
+<td valign=top>
+<b>Work generator</b>: generates work units, results,
+and the corresponding input files.
+</td>
+<td valign=top>
+Functions and programs that handle the details of
+creating workunit and result database records.
+</td>
+<td valign=top>
+Programs or scripts that generate input files,
+install them on data servers, and call the BOINC functions.
+</td></tr>
+<tr>
+<td valign=top><b>Timeout check</b>:
+Checks for various timeout conditions,
+such as result timeout.
+Reissues results for workunits as needed.
+</td>
+<td valign=top>A program <b>timeout_check</b>.</td>
+<td valign=top>Some parameters used by timeout_check.</td>
+</tr>
+<tr>
+<td valign=top><b>Result validation and accounting</b>:
+compare redundant results; select a <b>canonical result</b>
+representing the correct output,
+and a <b>canonical credit</b> granted to users and hosts
+that return the correct output.</td>
+<td valign=top>A program, <b>validate</b>, that contains the
+basic logic for validation.</td>
+<td valign=top>An application-specific function, linked with <b>validate</b>,
+that compares sets of redundant results.</td>
+</tr>
+<tr>
+<td valign=top><b>Assimilator</b>:
+handles workunits that are "completed":
+that is, which have a canonical result or for which
+an error condition has occurred.
+Handling a successfully completed result might involve
+recording results in a database and perhaps generating more work.</td>
+<td valign=top>
+A main program that enumerates unassimilated workunits,
+calls a project-supplied "handler" function,
+and updates the database.
+</td>
+<td valign=top>
+A handler function that assimilates a workunit,
+either by processing its canonical result
+or handling an error return.
+</td>
+</tr>
+<tr>
+<td valign=top><b>File deleter</b>: delete input and output files
+when they are no longer needed.</td>
+<td valign=top>A program <b>file_deleter</b>.</td>
+<td valign=top>None.</td>
+</tr>
+</table>
+
+<h3>Timeout checker</h3>
+<p>
+The timeout checker is passed the following parameters:
+
+max_errors
+give up on a workunit if it gets this many error results (i.e., there must be a bug in the application).
+
+max_results
+give up on a workunit if it gets this many
+non-error results without finding a canonical result
+
+redundancy
+try to get at least this many non-error results.
+
+application
+which application to handle
+
+Use crontab to run timeout_check continuously.
+
+<pre>
+    for each WU with timeout_check_time < now
+        for each result of WU
+            if result.server_state=IN_PROGRESS and now > result.report_deadline
+                result.server_state = OVER
+                result.outcome = NO_REPLY
+        if any result has outcome COULDNT_SEND
+            wu.error_mask |= COULDNT_SEND
+            got_error = true
+        if too many error results
+            wu.error_mask |= TOO_MANY_ERROR_RESULTS
+            got_error = true
+        if too many results
+            wu.error_mask |= TOO_MANY_RESULTS
+            got_error = true
+        else
+            generate new results as needed
+
+        if got_error
+            for all results server_state UNSENT
+                result.server_state = OVER
+                result.outcome = DIDNT_NEED
+            if wu.assimilate_state == INIT
+                wu.assimilate_state = READY
+
+    if all results are OVER and wu.assimilate_state = DONE
+        wu.file_delete_state = READY
+        wu.timeout_check_time = 0
+    else
+        wu.timeout_check_time = now + delay_bound
+
+</pre>
+
+<h3>Validater</h3>
+<p>
+BOINC supplies a utility program <b>validate</b>
+to perform validation and credit-granting.
+This program must be linked with two project-specific functions:
+<pre>
+int check_set(vector&lt;RESULT&gt; results, int& canonicalid, double& credit);
+int check_pair(RESULT& r1, RESULT& r2, bool& match);
+</pre>
+<b>check_set()</b> takes a set of results.
+If there is sufficient agreement,
+it selects one of them as the "canonical" result
+(returning its ID) and also decides what credit should
+be granted for correct results for this workunit.
+<p>
+<b>check_pair()</b> compares two results and returns match=true
+if they agree.
+
+<p>
+The file <b>validate_test.C</b> contains an example
+implementation of check_set() and check_pair().
+
+<pre>
+    for each WU with need_validate = true
+        if already have canonical result
+            for each result with validate_state = INIT and outcome = SUCCESS
+                if matches canonical, grant credit
+                set result.validate_state to VALID or INVALID
+        else
+            build set of results with outcome = SUCCESS
+            if find canonical result
+                wu.assimilate_state = READY
+                for all results server_state = UNSENT
+                    result.server_state = OVER
+                    result.outcome = DIDNT_NEED
+</pre>
+
+<h3>scheduler</h3>
+<pre>
+    - when send a result
+        result.server_state = IN_PROGRESS
+        result.report_deadline = now + wu.delay_bound
+        ??? should do lookup before updating?  shmem may be stale
+            doesn't matter; can't be stale
+    - when receive a result
+        switch result.server_state
+        client_state = (from reply msg)
+        case IN_PROGRESS:
+            result.server_state = OVER
+        case OVER:
+            result.file_delete_state = READY;
+
+        if client_state is DONE
+            result.outcome = SUCCESS
+            wu.need_validate = true
+        else
+            result.outcome = CLIENT_ERROR
+            result.validate_state = INVALID
+
+</pre>
+
+<h3>Assimilator</h3>
+<pre>
+    for each WU with assimilate_state = READY
+        call project-specific handler function
+            NOTE: canonical_resultid and error_mask are not mutually exclusive
+        if all results are OVER with outcomes SUCCESS or CLIENT_ERROR
+            set result.file_delete = READY for all results
+        else
+            for each non-canonical result
+                if state is OVER and outcome is SUCCESS or CLIENT_ERROR
+                    set result.file_delete = READY
+        wu.assimilate_state = DONE
+        if all results are OVER
+            wu.file_delete_state = READY
+</pre>
--- a/doc/backend_state.html
+++ b/doc/backend_state.html
@ -0,0 +1,220 @@
+<title>Back end state transitions</title>
+<h2>Back end state transitions</h2>
+
+<p>
+The processing of workunits and results involves
+several independent activities.
+To keep track of these activities,
+workunit and result database records have several "state" fields,
+and their processing can be viewed as the combination
+of several finite-state machines.
+
+<p>
+A workunit has the following state fields:
+<ul>
+<li>
+<b>delay_bound</b>.
+Upper bound for the interval between sending this WU to a host
+and getting the result.
+Should be several times the execution time on an average host.
+If it's exceeded, the server "gives up" on the result
+and may delete its input files.
+If the result is returned later,
+it will still be validated and credited.
+
+<li>
+<b>canonical_resultid</b>.
+<li>
+<b>timeout_check_time</b>.
+
+<li>
+<b>file_delete_state</b>:
+Initially INIT.
+When all results are OVER and the workunit has been assimilated,
+it transitions to READY,
+indicating that input files can be deleted.
+When file deletion is completed (by file_deleter)
+it transitions to DONE.
+
+<li>
+<b>assimilate_state</b>:
+Initially INIT.
+When a canonical result is found or the workunit has an error,
+it transitions to READY,
+indicating that the workunit can be assimilated.
+When assimilation is completed (by assimilator)
+it transitions to DONE.
+
+<li>
+<b>need_validate</b>:
+A boolean, true whenever
+the workunit has a result whose validate state is NEED_CHECK.
+The validate program sets it back to false.
+<li>
+<b>error_mask</b>.
+bit mask for error conditions
+
+</ul>
+Invariants:
+<ul>
+<li> eventually either canonical_resultid or error_mask is set
+<li> eventually timeout_check_time=0
+<li> WUs are eventually assimilated
+<li> input files are eventually deleted,
+but only when all results have state=OVER
+            (since may need to validate results that arrive after assimilation)
+        and wu.assimilate_state = DONE
+            (since project may want to do something with WU in error case)
+
+
+</ul>
+
+
+<p>
+
+A result has the following state fields:
+<ul>
+<li> <b>report_deadline</b>:
+        give up on result (and possibly delete input files)
+        if don't get reply by this time.
+        Assignment: when send result; now + WU.delay_bound
+
+<li> <b>server_state</b>:
+UNSENT, IN_PROGRESS, OVER.
+Initially UNSENT.
+Becomes IN_PROGRESS when the result has been sent to a client.
+Becomes OVER if we get a host reply,
+or the result times out, or we decide not to send it.
+<li> <b>outcome</b>:
+SUCCESS, COULDNT_SEND, CLIENT_ERROR, NO_REPLY, DIDNT_NEED.
+Defined if server_state = OVER.
+<li>
+<b>client_state</b>:
+Records the client state (upload, process, or download)
+where an error occurred.
+Defined if outcome is CLIENT_ERROR.
+<li>
+<b>file_delete_state</b>:
+INIT, READY, DONE.
+<li>
+<b>validate_state</b>:
+INIT, VALID, INVALID.
+When a canonical result has been found for the workunit,
+becomes either VALID or INVALID.
+</ul>
+
+Invariants:
+<ul>
+<li> results eventually have server_state = OVER.
+<li> output files are eventually deleted.
+Non-canonical results can be deleted as soon as the WU is assimilated.
+Canonical results can be deleted only when all results have server_state=OVER.
+If a result reply arrives after its timeout,
+the output files can be immediately deleted.
+How do we delete output files that arrive REALLY late?
+(e.g. uploaded after all results have timed out, and never reported)?
+Let X = create time of oldest unassimilated WU.
+Any output files created before X can be deleted.
+</ul>
+<h3>A note on scheduling</h3>
+<p>
+    - when is it feasible to send a result to a host?
+        Request msg should include X = amount of work currently queued.
+        TODO: include % time active in calculation??
+        Decision for each WU:
+            is X + time for WUs sent so far < delay_bound?
+    - When is a result declared "unsendable"?
+        Not a good idea to do on the basis of time;
+        do it only if a result is flushed from FIFO (see below)
+
+<h3>State transitions</h3>
+<pre>
+
+fields of "result" table:
+
+server_state
+    UNSENT
+        (on creation)
+    IN_PROGRESS
+        from UNSENT
+            scheduler: when send
+    OVER
+        from IN_PROGRESS
+            scheduler: get reply from host
+            timeout_check: now > report_deadline
+        from UNSENT
+            validate: got canonical result for this WU and server_state=UNSENT
+            timeout_check: WU has error
+
+file_delete_state
+    INIT
+        (on creation)
+    READY
+        from INIT:
+            scheduler: got reply and server_state = OVER
+            timeout_check: all results are OVER
+            assimilator: all results are OVER or result is not canonical
+        from DONE:
+            scheduler: got reply and server_state = OVER
+    DONE
+        from READY
+            file_deleter: tried to delete files
+
+validate_state
+    INIT
+    VALID
+        from INIT:
+            validate: outcome = SUCCESS and matched canonical result
+    INVALID
+        from INIT:
+            scheduler: got reply, client error
+            validate: didn't match canonical result
+
+-------------
+fields of "workunit" table
+
+need_validate
+    FALSE
+        (on creation)
+        from TRUE:
+            validate: done checking
+    TRUE
+        from FALSE:
+            scheduler: got reply w/ client_state = DONE (i.e. no error)
+
+file_delete_state
+    INIT
+        (on creation)
+    READY
+        timeout_check: all results have server_state=OVER
+            and wu.assimilate_state = DONE
+        assimilate:
+            all results have server_state = OVER
+            (and wu.assimilate_state = DONE)
+    DONE
+
+assimilate_state
+    INIT
+        (on creation)
+    READY
+        from INIT:
+            timeout_check: WU has error
+            validate: found canonical result
+    DONE
+        from READY:
+            assimilator: done
+
+error_mask
+    COULDNT_SEND
+        timeout_check: some result has outcome COULDNT_SEND
+    TOO_MANY_ERROR_RESULTS
+        timeout_check: too many error results
+    TOO_MANY_RESULTS
+        timeout_check: too many results
+
+timeout_check_time:
+    nonzero
+        (on creation)
+    zero
+        timeout_check: all results are OVER and assimilate_state = DONE
+</pre>
--- a/doc/backend_work_sequence.html
+++ b/doc/backend_work_sequence.html
@ -0,0 +1,27 @@
+<title>Back ends and work sequences</title>
+<h2>Back ends and work sequences</h2>
+<p>
+Projects that use work sequences will need two additional processes:
+<p>
+<table border=1 cellpadding=8>
+<tr>
+<th>Component</th>
+<th>BOINC-supplied part</th>
+<th>project-supplied part</th>
+</tr>
+<tr>
+<td valign=top><b>Work sequence relocater</b>:
+detects work sequences whose hosts have failed,
+and relocates them to other hosts.</td>
+<td valign=top>A program <b>seq_relocate</b></td>
+<td valign=top>Some parameters used by seq_relocate</td>
+</tr>
+<tr>
+<td valign=top><b>Work sequence validation and accounting</b>:
+Similar to result validation, but for work sequences.
+</td>
+<td valign=top>A program <b>seq_validate</b></td>
+<td valign=top>An application-specific function,
+linked with <b>seq_validate</b>, that compares sets of redundant results.</td>
+</tr>
+</table>
--- a/doc/create_project.html
+++ b/doc/create_project.html
@ -68,13 +68,11 @@ Developing and operating a BOINC project back end
 How to feed work into a BOINC system and collect the results.
 </font>
 <ul>
-<li> <a href=backend.html>The components of a project back end</a>
-<li> <a href=work_states.html>Workunit and result states</a>
+<li> <a href=backend_functions.html>Back end functions</a>
+<li> <a href=backend_state.html>Back end state transitions</a>
+<li> <a href=backend_programs.html>Back end programs</a>
 <li> <a href=tools_work.html>Generating work</a>
-<li> <a href=retry.html>Generating result retries</a>
-<li> <a href=validation.html>Result validation and accounting</a>
-<li> <a href=result_process.html>Assimilation</a>
-<li> <a href=garbage.html>File deletion</a>
+<li> <a href=backend_work_sequence.html>Back ends and work sequences</a>
 <li> <a href=back_end.html>Back end examples</a>
 </ul>

--- a/doc/garbage.html
+++ b/doc/garbage.html
@ -1,2 +0,0 @@
-<title>Garbage collection</title>
-<h2>Garbage collection</h2>
--- a/doc/retry.html
+++ b/doc/retry.html
@ -1,66 +0,0 @@
-<title>Generating result retries</title>
-<h2>Generating result retries</h2>
-
-<p>
-Hosts may fail to process and return results for various reasons;
-such results are said to be <b>lost</b>.
-A combination of lost and erroneous results may prevent
-finding canonical result for a workunit.
-The <b>result retry</b> mechanism generates additional
-results as needed to find a canonical result.
-
-<p>
-The result retry mechanism has the following project-supplied parameters:
-<ul>
-<li> D<sub>WU</sub>: the expected delay (in seconds) between
-creating a WU and getting a canonical result.
-<li> D<sub>result</sub>: the expected delay (in seconds) between
-creating a result and getting a confirmation.
-<li> N<sub>Error</sub>: give up on a workunit if it gets this many error results
-(i.e., there must be a bug in the application).
-<li> N<sub>det</sub>: give up on a workunit if it gets this many
-non-error results without finding a canonical result
-(i.e., the algorithm must nondeterministic).
-<li> N<sub>redundancy</sub>: try to get at least this many non-error results.
-</ul>
-
-<p>
-Each workunit has a <b>retry check time</b>.
-This is initially set to now + D<sub>WU</sub>,
-and is set to zero if a canonical result is found for the WU.
-
-<p>
-Each result has a <b>deadline</b>,
-a time by which a confirmation is expected for the result.
-This is initially set to now + D<sub>result</sub>,
-
-<p>
-Retry generation is handled by the program <b>result_retry</b>, invoked as
-<pre>
-result_retry -appname name
-</pre>
-This program continually checks for workunits past their check time
-and without pending validation.
-For each such workunit, the program does the following:
-
-<ul>
-<li> If any result is not sent, generate an error message,
-and give up on the WU (i.e., set its check time to zero).
-This condition indicates that either
-1) the resource requirements of the WU are too much for
-any host;
-2) there are insufficient hosts to handle the rate of work generation; or
-3) scheduling servers have been out of service.
-<li> If at least N<sub>error</sub> results have an error,
-generate an error message and give up on the WU.
-<li> If at least N<sub>det</sub> results are done,
-generate an error message and give up on the WU.
-<li> Generate N<sub>redundancy</sub> - n new results for the WU,
-where n is the number of results that are done.
-The deadline of these results is now + D<sub>result</sub>.
-<li> Set the check time of the WU to now + D<sub>WU</sub>
-
-</ul>
-
-<p>
-Use crontab to run <b>result_retry</b> continuously.
--- a/doc/validation.html
+++ b/doc/validation.html
@ -1,77 +0,0 @@
-<title>Validation of results and credit</title>
-<body bgcolor=ffffff>
-<h2>Validation of results and credit</h2>
-<p>
-Communication from the core client can easily be altered or forged.
-Some users will attempt to get undeserved credit
-by falsifying their CPU metrics or CPU times.
-Output files may be wrong.
-This can result from tampering or hardware failures.
-<p>
-Both problems - credit-cheating and wrong results - can be addressed
-by <b>redundant computing</b> and <b>result validation</b>.
-In this approach, each workunit is processed at least twice.
-The project back end waits until a minimum number of results have been returned,
-then compares the results and decides which are considered correct.
-The notion of equality of results,
-and the policy for deciding which are correct,
-are project-specific.
-<p>
-The back end then marks correct results as "validated",
-finds the minimum reported credit for the correct results of a given workunit,
-and assigns this amount of credit to all the correct results.
-This ensures that as long as a reasonable majority of participants
-don't falsify credit, almost all credit accounting will be correct.
-
-<h3>The validation program</h3>
-
-BOINC supplies a utility program <b>validate</b>
-to perform validation and credit-granting.
-This program must be linked with two project-specific functions:
-<pre>
-int check_set(vector<RESULT> results, int& canonicalid, double& credit);
-int check_pair(RESULT& r1, RESULT& r2, bool& match);
-</pre>
-<b>check_set()</b> takes a set of results.
-If there is sufficient agreement,
-it selects one of them as the "canonical" result
-(returning its ID) and also decides what credit should
-be granted for correct results for this workunit.
-<p>
-<b>check_pair()</b> compares two results and returns match=true
-if they agree.
-
-<p>
-The file <b>validate_test.C</b> contains an example
-implementation of check_set() and check_pair().
-
-
-<hr>
-<h3>Implementation</h3>
-
-The following database fields are used:
-
-<p>
-<b>WORKUNIT</b>
-<dt> bool need_validate
-<dd>
-        true iff this workunit has one or more results in state DONE
-        and validate_state UNCHECKED
-<dt>
-    int canonical_resultid
-<dd>
-        nonzero if a conclusive check has been done for this WU;
-        indicates the canonical result
-                            
-<p>
-<b>RESULT</b>
-<dt>
-    int state
-<dd> INACTIVE, ..., DONE, ERROR
-<dt>
-    int validate_state
-<dd>
-        NEED_CHECK,
-        VALID,
-        INVALID
-
--- a/doc/work.html
+++ b/doc/work.html
@ -2,6 +2,9 @@
 <body bgcolor=ffffff>
 <h2>Workunits</h2> 
 <p>
+TODO: don't separate into XML/other parts;
+describe XML format only in the tools doc.
+<p>
 A <b>workunit</b> describes a computation to be performed.
 Workunits are maintained in the <b>workunit</b> table in the BOINC DB.
 The attributes of a workunit include:
@ -13,13 +16,6 @@ The attributes of a workunit include:
 (see below).
 <li> The estimated resource requirements of the work unit
 (computation, memory, disk space).
-<li> The maximum processing
-(measured in <a href=credit.html>Cobblestones</a>)
-and maximum disk space to be used for the computation.
-An instance of the computation that exceeds either of these bounds
-will be aborted.
-This mechanism is used to prevent an infinite-loop bug from
-indefinitely incapacitating a host.
 <li> A <b>delay bound</b>: upper bound on how long (in real time)
 a result associated with this work unit should take to complete.
 This determines which hosts the workunit can be sent to,
@ -49,8 +45,12 @@ The components are:
 <td>A list of environment variables in the form
 name=value&name=value&name=value.
 </td></tr>
-<tr><td>&lt;max_processing></td>
-<td>Maximum processing (in Cobblestones).
+<tr><td valign=top>&lt;max_processing></td>
+<td>Maximum processing
+(measured in <a href=credit.html>Cobblestones</a>).
+An instance of the computation that exceeds this bound will be aborted.
+This mechanism prevents an infinite-loop bug from
+indefinitely incapacitating a host.
 The default is determined by the client; typically it is 1.
 </td></tr>
 <tr><td>&lt;max_disk></td>
--- a/doc/work_states.html
+++ b/doc/work_states.html
@ -1,80 +0,0 @@
-<title>Workunit and result states</title>
-<h2>Workunit and result states</h2>
-
-<p>
-The processing of workunits and results involves
-several independent activities.
-To keep track of these activities,
-workunit and result database records have several "state" fields,
-and their processing can be viewed as the combination
-of several finite-state machines.
-
-<p>
-A workunit has the following state fields:
-<ul>
-<li>
-<b>main_state</b>:
-Initially INIT.
-If a canonical result is found (by validate)
-it transitions to DONE.
-If it's decided that no canonical result can be found
-(by result_retry)
-it transitions to ERROR.
-
-<li>
-<b>file_delete_state</b>:
-Initially INIT.
-When the main state transitions to either DONE or ERROR,
-it transitions to READY,
-indicating that input files can be deleted.
-When file deletion is completed (by file_deleter)
-it transitions to DONE.
-
-<li>
-<b>assimilate_state</b>:
-Initially INIT.
-When the main state transitions to either DONE or ERROR,
-it transitions to READY,
-indicating that the workunit can be assimilated.
-When assimplateion is completed (by assimilator)
-it transitions to DONE.
-
-<li>
-<b>need_validate</b>:
-A boolean, true whenever
-the workunit has a result whose validate state is NEED_CHECK.
-The validate program sets it back to false.
-
-</ul>
-
-
-<p>
-
-A result has the following state fields:
-<ul>
-<li>
-<b>server_state</b>:
-Initially UNSENT.
-Becomes IN_PROGRESS when the result has been sent to a client.
-Becomes DONE or ERROR if the client returns the result,
-or TIMEOUT if result_retry gives up on the result.
-<li>
-<b>client_state</b>:
-Records the client state (upload, process, or download)
-where an error occurred.
-Not relevant here.
-<li>
-<b>file_delete_state</b>:
-Initially INIT.
-When the parent workunit becomes ERROR, becomes READY,
-indication that the output files can be deleted.
-When the parent workunit becomes DONE, becomes READY
-except for the canonical result, which remains INIT.
-When the output files have been deleted (by file_deleter) becomes DONE.
-<li>
-<b>validate_state</b>:
-Initially INITIAL.
-When the result is returned, becomes NEED_CHECK.
-When a canonical result has been found for the workunit,
-becomes either VALID or INVALID.
-</ul>
--- a/html/ops/db.inc
+++ b/html/ops/db.inc
@ -112,22 +112,13 @@ function show_host($host) {

 }

-function main_state_str($s) {
-    switch($s) {
-    case 0: return "Initial";
-    case 1: return "Done";
-    case 2: return "Error";
-    }
-}
-
-function error_str($s) {
-    switch($s) {
-    case 0: return "No error";
-    case 1: return "Send failed";
-    case 2: return "Too many errors (may have bug)";
-    case 3: return "Too many done (may be nondeterministic)";
-    }
-    return "unknown";
+function wu_error_mask_str($s) {  // Describe workunit error mask $s as HTML text: one message per set bit, then a <br>.
+    $x = "";
+    if ($s & 1) $x = $x."Couldn't send result; ";  // bit value 1
+    if ($s & 2) $x = $x."Too many errors (may have bug); ";  // bit value 2
+    if ($s & 4) $x = $x."Too many results (may be nondeterministic)";  // bit value 4; no trailing separator
+    $x = $x."<br>";  // appended even when no bit is set
+    return $x;
 }

 function assimilate_state_str($s) {
@ -162,10 +153,9 @@ function show_workunit($wu,$show_xml_doc) {
    row("Need validate?", $wu->need_validate?"yes":"no");
    row("Canonical resultid", $wu->canonical_resultid);
    row("Canonical credit", $wu->canonical_credit);
-    row("Retry check time", time_str($wu->retry_check_time));
+    row("Timeout check time", time_str($wu->timeout_check_time));
    row("Delay bound", $wu->delay_bound);
-    row("Main state", main_state_str($wu->main_state));
-    row("Error", error_str($wu->error));
+    row("Error mask", wu_error_mask_str($wu->error_mask));
    row("File delete state", file_delete_state_str($wu->file_delete_state));
    row("Assimilation state", assimilate_state_str($wu->assimilate_state));
    row("","<a href=db.php?show=result&wu_id=$wu->id>Show associated results</a>");
@ -173,19 +163,28 @@ function show_workunit($wu,$show_xml_doc) {
    echo "<p>";
 }

-function res_server_state_string($s) {
+function result_server_state_string($s) {
    switch($s) {
    case 1: return "Inactive";
    case 2: return "Unsent";
-    case 3: return "In Progress";
-    case 4: return "Done";
-    case 5: return "Timeout";
-    case 6: return "Error";
+    case 4: return "In Progress";
+    case 5: return "Over";
    }
    return "unknown";
 }

-function result_client_state_str($s) {
+function result_outcome_string($s) {  // Map a result outcome code (1-5) to its display label.
+    switch($s) {
+    case 1: return "Success";
+    case 2: return "Couldn't send";
+    case 3: return "Client error";
+    case 4: return "No reply";
+    case 5: return "Didn't need";
+    }
+    return "unknown";  // any code not listed above
+}
+
+function result_client_state_string($s) {
    switch($s) {
    case 1: return "Downloading";
    case 2: return "Computing";
@ -197,9 +196,8 @@ function result_client_state_str($s) {
 function validate_state_str($s) {
    switch($s) {
    case 0: return "Initial";
-    case 1: return "Need check";
-    case 2: return "Valid";
-    case 3: return "Invalid";
+    case 1: return "Valid";
+    case 2: return "Invalid";
    }
    return "unknown";
 }
@ -213,10 +211,11 @@ function show_result($result,$show_xml_docs,$show_stderr,$show_times) {
    }
    row("Name", $result->name);
    row("Workunit", "<a href=db.php?show=workunit&id=$result->workunitid>" . wu_name_by_id($result->workunitid) . "</a>" );
-    row("Server state", res_server_state_string($result->server_state));
-    row("Deadline", time_str($result->report_deadline));
+    row("Server state", result_server_state_string($result->server_state));
+    row("Outcome", result_outcome_string($result->outcome));
+    row("Client state", result_client_state_string($result->client_state));
    row("Host ID", "<a href=db.php?show=host&id=$result->hostid>" . host_name_by_id($result->hostid) . "</a>");
-    row("Client state", result_client_state_str($result->client_state));
+    row("Report deadline", time_str($result->report_deadline));
    row("CPU time", $result->cpu_time);
    if ($show_xml_docs) {
        row("XML doc in", "<pre>".htmlspecialchars($result->xml_doc_in)."</pre>");
--- a/html/ops/db.php
+++ b/html/ops/db.php
@ -66,7 +66,7 @@

    if (strlen($result_state) && $result_state != 0) {
        $query = append_sql_query( $query, "server_state = $result_state", $first );
-        $english_query = append_sql_query( $english_query, "server state is ".res_server_state_string($result_state), $first );
+        $english_query = append_sql_query( $english_query, "server state is ".result_server_state_string($result_state), $first );
        $rstate = $result_state;
        $first = 0;
    } else {
@ -172,7 +172,7 @@
            . "<option value=\"0\"" . ($rstate == 0 ? "selected" : "") . "> All\n"
        );
        for( $i=1;$i<=6;$i++ ) {
-            printf( "<option value=\"$i\"" . ($rstate == $i ? "selected" : "") . ">" . res_server_state_string($i) . "\n" );
+            printf( "<option value=\"$i\"" . ($rstate == $i ? "selected" : "") . ">" . result_server_state_string($i) . "\n" );
        }
        printf( "</select>\n<p>\n" );
        print_text_field( "Result in batch number:", "batch", $batch );
--- a/sched/Makefile.in
+++ b/sched/Makefile.in
@ -77,6 +77,7 @@ FILE_DELETER_OBJS = \

 ASSIMILATOR_OBJS = \
    assimilator.o \
+    assimilate_handler.o \
    config.o \
    ../db/db_mysql.o \
    ../db/mysql_util.o \
--- a/sched/assimilate_handler.C
+++ b/sched/assimilate_handler.C
@ -0,0 +1,24 @@
+#include <stdio.h>
+
+#include "db.h"
+#include "assimilate_handler.h"
+// Dummy assimilation handler: reports on stdout what the assimilator passed in for one workunit (the results vector is not examined).
+void assimilate_handler(
+    WORKUNIT& wu, vector<RESULT>& results, RESULT& canonical_result
+) {
+    printf("assimilating WU %s\n", wu.name);
+    if (wu.canonical_resultid) {  // nonzero ID: print the XML output of the canonical_result argument
+        printf("canonical result:\n%s", canonical_result.xml_doc_out);
+    } else {
+        printf("no canonical result\n");
+    }
+    if (wu.error_mask&WU_ERROR_COULDNT_SEND_RESULT) {  // error bits are tested independently, so several messages may print
+        printf("Error: couldn't send a result\n");
+    }
+    if (wu.error_mask&WU_ERROR_TOO_MANY_ERROR_RESULTS) {
+        printf("Error: too many error results\n");
+    }
+    if (wu.error_mask&WU_ERROR_TOO_MANY_RESULTS) {
+        printf("Error: too many total results\n");
+    }
+}
--- a/sched/assimilate_handler.h
+++ b/sched/assimilate_handler.h
@ -0,0 +1,4 @@
+#include <vector>
+#include "db.h"
+
+extern void assimilate_handler(WORKUNIT&, vector<RESULT>&, RESULT&);
--- a/sched/assimilator.C
+++ b/sched/assimilator.C
@ -21,10 +21,12 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <time.h>
+#include <vector>

 #include "db.h"
 #include "parse.h"
 #include "config.h"
+#include "assimilate_handler.h"

 CONFIG config;

@ -35,48 +37,75 @@ void write_log(char* p) {
    fprintf(stderr, "%s: %s", timestr, p);
 }

+// assimilate all WUs that need it
 // return nonzero if did anything
 //
-bool do_pass(APP app) {
+bool do_pass(APP& app) {
    WORKUNIT wu;
-    RESULT result;
-    bool did_something = false;
-    int retval;
+    RESULT canonical_result, result;
+    vector<RESULT> results;
+    bool did_something = false, delete_inputs, delete_outputs;
    char buf[MAX_BLOB_SIZE];
+    unsigned int i;

    wu.appid = app.id;
    wu.assimilate_state = ASSIMILATE_READY;
    while (!db_workunit_enum_app_assimilate_state(wu)) {
        did_something = true;

-        sprintf(buf, "Assimilating WU %s, assim state %d\n", wu.name, wu.assimilate_state);
+        sprintf(buf,
+            "Assimilating WU %s, assim state %d\n",
+            wu.name, wu.assimilate_state
+        );
        write_log(buf);

-        switch(wu.main_state) {
-        case WU_MAIN_STATE_INIT:
-            write_log("ERROR; WU shouldn't be in init state\n");
-            break;
-        case WU_MAIN_STATE_DONE:
-            if (!wu.canonical_resultid) {
-                write_log("ERROR: canonical resultid zero\n");
-                break;
+        result.workunitid = wu.id;
+        while (!db_result_enum_wuid(result)) {
+            results.push_back(result);
+            if (result.id == wu.canonical_resultid) {
+                canonical_result = result;
            }
-            retval = db_result(wu.canonical_resultid, result);
-            if (retval) {
-                write_log("can't get canonical result\n");
-                break;
-            }
-            sprintf(buf, "canonical result for WU %s:\n%s", wu.name, result.xml_doc_out);
-            write_log(buf);
-
-            result.file_delete_state = FILE_DELETE_READY;
-            db_result_update(result);
-            break;
-        case WU_MAIN_STATE_ERROR:
-            printf("WU %s had an error\n", wu.name);
-            break;
        }
+
+        assimilate_handler(wu, results, canonical_result);
+
+        delete_outputs = true;
+        delete_inputs = true;
+        for (i=0; i<results.size(); i++) {
+            result = results[i];
+            if (result.server_state != RESULT_SERVER_STATE_OVER
+                || (result.outcome != RESULT_OUTCOME_SUCCESS && result.outcome != RESULT_OUTCOME_CLIENT_ERROR)
+            ) {
+                delete_outputs = false;
+            }
+            if (result.server_state != RESULT_SERVER_STATE_OVER) {
+                delete_inputs = false;
+            }
+        }
+
+        if (delete_outputs) {
+            for (i=0; i<results.size(); i++) {
+                result = results[i];
+                result.file_delete_state = FILE_DELETE_READY;
+                db_result_update(result);
+            }
+        } else {
+            for (i=0; i<results.size(); i++) {
+                result = results[i];
+                if (result.server_state == RESULT_SERVER_STATE_OVER
+                    && result.id != wu.canonical_resultid
+                    && (result.outcome == RESULT_OUTCOME_SUCCESS || result.outcome == RESULT_OUTCOME_CLIENT_ERROR)
+                ) {
+                    result.file_delete_state = FILE_DELETE_READY;
+                    db_result_update(result);
+                }
+            }
+        }
+
        wu.assimilate_state = ASSIMILATE_DONE;
+        if (delete_inputs) {
+            wu.file_delete_state = FILE_DELETE_READY;
+        }
        db_workunit_update(wu);
    }
    return did_something;
--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@ -332,12 +332,13 @@ int handle_results(

    for (i=0; i<sreq.results.size(); i++) {
        rp = &sreq.results[i];
+
        // acknowledge the result even if we couldn't find it --
        // don't want it to keep coming back
        //
        reply.result_acks.push_back(*rp);

-        sprintf(buf, "got ack for result %s\n", rp->name);
+        sprintf(buf, "got result %s\n", rp->name);
        write_log(buf);

        strncpy(result.name, rp->name, sizeof(result.name));
@ -347,7 +348,8 @@ int handle_results(
            write_log(buf);
            continue;
        }
-        if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
+
+        if (result.server_state == RESULT_SERVER_STATE_UNSENT) {
            sprintf(buf,
                "got unexpected result for %s: server state is %d\n",
                rp->name, result.server_state
@ -355,6 +357,12 @@ int handle_results(
            write_log(buf);
            continue;
        }
+        if (result.server_state == RESULT_SERVER_STATE_OVER) {
+            result.file_delete_state = FILE_DELETE_READY;
+        }
+        if (result.server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
+            result.server_state = RESULT_SERVER_STATE_OVER;
+        }

        if (result.hostid != sreq.hostid) {
            sprintf(buf,
@ -365,22 +373,32 @@ int handle_results(
            continue;
        }

-        // TODO: handle error returns
-        //
        result.hostid = reply.host.id;
        result.received_time = time(0);
        result.client_state = rp->client_state;
        result.cpu_time = rp->cpu_time;
        result.claimed_credit = result.cpu_time * host.credit_per_cpu_sec;
-        result.validate_state = VALIDATE_STATE_NEED_CHECK;
-        if (result.client_state != CLIENT_DONE) {
-            result.validate_state = VALIDATE_STATE_INVALID;
-                //so we won't try to validate this result anymore
-            result.server_state = RESULT_SERVER_STATE_ERROR;
+        result.server_state = RESULT_SERVER_STATE_OVER;
+        if (result.client_state == CLIENT_DONE) {
+            result.outcome = RESULT_OUTCOME_SUCCESS;
+            retval = db_workunit(result.workunitid, wu);
+            if (retval) {
+                sprintf(buf,
+                    "can't find WU %d for result %d\n",
+                    result.workunitid, result.id
+                );
+                write_log(buf);
+            } else {
+                wu.need_validate = 1;
+                retval = db_workunit_update(wu);
+                if (retval) {
+                    write_log("Can't update WU\n");
+                }
+            }
        } else {
-            result.server_state = RESULT_SERVER_STATE_DONE;
+            result.outcome = RESULT_OUTCOME_CLIENT_ERROR;
+            result.validate_state = VALIDATE_STATE_INVALID;
        }
-
     
        strncpy(result.stderr_out, rp->stderr_out, sizeof(result.stderr_out));
        strncpy(result.xml_doc_out, rp->xml_doc_out, sizeof(result.xml_doc_out));
@ -390,20 +408,6 @@ int handle_results(
            write_log(buf);
        }

-        retval = db_workunit(result.workunitid, wu);
-        if (retval) {
-            sprintf(buf,
-                "can't find WU %d for result %d\n",
-                result.workunitid, result.id
-            );
-            write_log(buf);
-        } else {
-            wu.need_validate = 1;
-            retval = db_workunit_update(wu);
-            if (retval) {
-                write_log("Can't update WU\n");
-            }
-        }
    }
    return 0;
 }
@ -487,6 +491,7 @@ int send_work(
        result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
        result.hostid = reply.host.id;
        result.sent_time = time(0);
+        result.report_deadline = result.sent_time + wu.delay_bound;
        db_result_update(result);

        nresults++;
--- a/sched/result_retry.C
+++ b/sched/result_retry.C
@ -17,9 +17,12 @@
 // Contributor(s):
 //

-// result_retry - create new results to make up for lost ones
+// timeout_check - do various time-based tasks
+//    - time out results
+//    - create new results to make up for lost ones
+//    - check for WU error conditions
 //
-// result_retry
+// timeout_check
 //   -app appname
 //   [ -nerror n ]          if get this many errors, bail on WU
 //   [ -ndet n ]            if get this results w/o consensus, bail
@ -155,95 +158,92 @@ int assign_new_names(char* in) {
    return 0;
 }

-bool do_pass(APP& app) {
-    WORKUNIT wu;
+void handle_wu(WORKUNIT& wu) {
+    vector<RESULT> results;
    RESULT result;
    int nerrors, ndone, retval;
    unsigned int i, n;
-    bool did_something = false;
    char buf[256];
+    unsigned int now = time(0);
+    bool wu_error = false, all_over;

-    wu.retry_check_time = time(0);
-    wu.appid = app.id;
-
-    // loop over WUs that are due to be checked
+    // scan the results for the WU
    //
-    while (!db_workunit_enum_retry_check_time(wu)) {
-        vector<RESULT> results;
+    result.workunitid = wu.id;
+    while (!db_result_enum_wuid(result)) {
+        results.push_back(result);
+    }

-        did_something = true;
-        // if this WU has a canonical result, we're done
-        // (this normally doesn't happen since the retry check time
-        // is zeroed when canonical result found, but just in case).
-        //
-        if (wu.canonical_resultid) {
-            wu.retry_check_time = 0;
-            goto update_wu;
+    nerrors = 0;
+    ndone = 0;
+    for (i=0; i<results.size(); i++) {
+        result = results[i];
+
+        switch (result.server_state) {
+        case RESULT_SERVER_STATE_IN_PROGRESS:
+            if (result.report_deadline < now) {
+                result.server_state = RESULT_SERVER_STATE_OVER;
+                result.outcome = RESULT_OUTCOME_NO_REPLY;
+                db_result_update(result);
+            }
+            break;
+        case RESULT_SERVER_STATE_OVER:
+            switch (result.outcome) {
+            case RESULT_OUTCOME_COULDNT_SEND:
+                sprintf(buf, "WU %s has couldn't-send result\n", wu.name);
+                write_log(buf);
+                wu.error_mask |= WU_ERROR_COULDNT_SEND_RESULT;
+                wu_error = true;
+                break;
+            case RESULT_OUTCOME_SUCCESS:
+                ndone++;
+                break;
+            default:
+                nerrors++;
+                break;
+            }
+            break;
        }
+    }

-        // enumerate all the results for the WU
-        //
-        result.workunitid = wu.id;
-        while (!db_result_enum_wuid(result)) {
-            results.push_back(result);
-        }
+    // check for too many errors or too many results
+    //
+    if (nerrors > max_errors) {
+        sprintf(buf, "WU %s has too many errors\n", wu.name);
+        write_log(buf);
+        wu.error_mask |= WU_ERROR_TOO_MANY_ERROR_RESULTS;
+        wu_error = true;
+    }
+    if (ndone > max_done) {
+        sprintf(buf, "WU %s has too many answers\n", wu.name);
+        write_log(buf);
+        wu.error_mask |= WU_ERROR_TOO_MANY_RESULTS;
+        wu_error = true;
+    }

-        nerrors = 0;
-        ndone = 0;
+    // if this WU had an error, don't send any unsent results
+    //
+    if (wu_error) {
        for (i=0; i<results.size(); i++) {
            result = results[i];
-
-            // if any result is unsent, give up on the WU
-            //
            if (result.server_state == RESULT_SERVER_STATE_UNSENT) {
-                sprintf(buf, "WU %s has unsent result\n", wu.name);
-                write_log(buf);
-                wu.main_state = WU_MAIN_STATE_ERROR;
-                wu.error = SEND_FAIL;
-                wu.file_delete_state = FILE_DELETE_READY;
-                wu.assimilate_state = ASSIMILATE_READY;
-                wu.retry_check_time = 0;
-                goto update_wu;
-            }
-            if (result.server_state == RESULT_SERVER_STATE_ERROR) {
-                nerrors++;
-            }
-            if (result.server_state == RESULT_SERVER_STATE_DONE) {
-                ndone++;
+                result.server_state = RESULT_SERVER_STATE_OVER;
+                result.outcome = RESULT_OUTCOME_DIDNT_NEED;
+                db_result_update(result);
            }
        }
-
-        // it too many errors or too many different results, bail
-        //
-        if (nerrors > max_errors) {
-            sprintf(buf, "WU %s has too many errors\n", wu.name);
-            write_log(buf);
-            wu.main_state = WU_MAIN_STATE_ERROR;
-            wu.error = TOO_MANY_ERRORS;
-            wu.file_delete_state = FILE_DELETE_READY;
+        if (wu.assimilate_state == ASSIMILATE_INIT) {
            wu.assimilate_state = ASSIMILATE_READY;
-            wu.retry_check_time = 0;
-            goto update_wu;
        }
-        if (ndone > max_done) {
-            sprintf(buf, "WU %s has too many answers\n", wu.name);
-            write_log(buf);
-            wu.main_state = WU_MAIN_STATE_ERROR;
-            wu.error = TOO_MANY_DONE;
-            wu.file_delete_state = FILE_DELETE_READY;
-            wu.assimilate_state = ASSIMILATE_READY;
-            wu.retry_check_time = 0;
-            goto update_wu;
-        }
-
-        // Generate new results if needed.
+    } else {
+        // If no error, generate new results if needed.
        // Munge the XML of an existing result
        // to create unique new output filenames.
        //
        if (nredundancy > ndone) {
            n = nredundancy - ndone;
            
-	    for (i=0; i<n; i++) {
+            for (i=0; i<n; i++) {
                result = results[0];
                make_unique_name(result.name);
                initialize_result(result, wu);
@ -258,17 +258,46 @@ bool do_pass(APP& app) {
                }
            }
        }
+    }

-        // update the WU's result retry check time
-        //
-        wu.retry_check_time = time(0) + wu.delay_bound;
-update_wu:
-        retval = db_workunit_update(wu);
-        if (retval) {
-            sprintf(buf, "db_workunit_update %d\n", retval);
-            write_log(buf);
+    // see if all results are OVER and the WU is assimilated;
+    // if so we don't need to check this WU ever again.
+    //
+    all_over = true;
+    for (i=0; i<results.size(); i++) {
+        result = results[i];
+        if (result.server_state != RESULT_SERVER_STATE_OVER) {
+            all_over = false;
+            break;
        }
    }
+
+    if (all_over && wu.assimilate_state == ASSIMILATE_DONE) {
+        wu.file_delete_state = FILE_DELETE_READY;
+        wu.timeout_check_time = 0;
+    } else {
+        wu.timeout_check_time = now + wu.delay_bound;
+    }
+
+    retval = db_workunit_update(wu);
+    if (retval) {
+        sprintf(buf, "db_workunit_update %d\n", retval);
+        write_log(buf);
+    }
+}
+
+bool do_pass(APP& app) {
+    WORKUNIT wu;
+    bool did_something = false;
+
+    // loop over WUs that are due to be checked
+    //
+    wu.timeout_check_time = time(0);
+    wu.appid = app.id;
+    while (!db_workunit_enum_timeout_check_time(wu)) {
+        did_something = true;
+        handle_wu(wu);
+    }
    return did_something;
 }

--- a/sched/validate.C
+++ b/sched/validate.C
@ -122,43 +122,37 @@ int grant_credit(RESULT& result, double credit) {
    return 0;
 }

-// make one pass through the workunits with need_validate set.
-// return true if there were any
-//
-bool do_validate_scan(APP& app, int min_quorum) {
-    WORKUNIT wu;
+void handle_wu(WORKUNIT& wu) {
    RESULT result, canonical_result;
-    bool found=false, match;
+    bool match, update_result;
    int retval, canonicalid;
    double credit;
    unsigned int i;
    char buf[256];

-    wu.appid = app.id;
-    while(!db_workunit_enum_app_need_validate(wu)) {
-        found = true;
-        if (wu.canonical_resultid) {
-            sprintf(buf,
-                "validating WU %s; already have canonical result\n", wu.name
-            );
-            write_log(buf);
+    if (wu.canonical_resultid) {
+        sprintf(buf,
+            "validating WU %s; already have canonical result\n", wu.name
+        );
+        write_log(buf);

-            // Here if WU already has a canonical result.
-            // Get unchecked results and see if they match the canonical result
-            //
-            retval = db_result(wu.canonical_resultid, canonical_result);
-            if (retval) {
-                write_log("can't read canonical result\n");
-                continue;
-            }
+        // Here if WU already has a canonical result.
+        // Get unchecked results and see if they match the canonical result
+        //
+        retval = db_result(wu.canonical_resultid, canonical_result);
+        if (retval) {
+            write_log("can't read canonical result\n");
+            return;
+        }

-            // scan this WU's results, and check any that need checking
-            //
-            result.workunitid = wu.id;
-            while (!db_result_enum_wuid(result)) {
-                if (result.validate_state != VALIDATE_STATE_NEED_CHECK) {
-                    continue;
-                }
+        // scan this WU's results, and check the unchecked ones
+        //
+        result.workunitid = wu.id;
+        while (!db_result_enum_wuid(result)) {
+            if (result.validate_state == VALIDATE_STATE_INIT 
+                && result.server_state == RESULT_SERVER_STATE_OVER
+                && result.outcome == RESULT_OUTCOME_SUCCESS
+            ) {
                retval = check_pair(result, canonical_result, match);
                if (retval) {
                    sprintf(buf,
@ -188,55 +182,66 @@ bool do_validate_scan(APP& app, int min_quorum) {
                    continue;
                }
            }
-        } else {
-            // Here if WU doesn't have a canonical result yet.
-            // Try to get one
+        }
+    } else {
+        vector<RESULT> results;

-            sprintf(buf, "validating WU %s; no canonical result\n", wu.name);
-            write_log(buf);
+        // Here if WU doesn't have a canonical result yet.
+        // Try to get one

-            vector<RESULT> results;
-            result.workunitid = wu.id;
-            while (!db_result_enum_wuid(result)) {
-                if (result.server_state == RESULT_SERVER_STATE_DONE) {
-                    results.push_back(result);
-                }
+        sprintf(buf, "validating WU %s; no canonical result\n", wu.name);
+        write_log(buf);
+
+        result.workunitid = wu.id;
+        while (!db_result_enum_wuid(result)) {
+            if (result.server_state == RESULT_SERVER_STATE_OVER
+                && result.outcome == RESULT_OUTCOME_SUCCESS
+            ) {
+                results.push_back(result);
            }
-            sprintf(buf, "found %d results\n", results.size());
-            write_log(buf);
-            if (results.size() >= (unsigned int)min_quorum) {
-                retval = check_set(results, canonicalid, credit);
-                if (!retval && canonicalid) {
-                    write_log("found a canonical result\n");
-                    wu.canonical_resultid = canonicalid;
-                    wu.canonical_credit = credit;
-                    wu.main_state = WU_MAIN_STATE_DONE;
-                    wu.file_delete_state = FILE_DELETE_READY;
-                    wu.assimilate_state = ASSIMILATE_READY;
-                    for (i=0; i<results.size(); i++) {
+        }
+        sprintf(buf, "found %d successful results\n", results.size());
+        write_log(buf);
+        if (results.size() >= (unsigned int)min_quorum) {
+            retval = check_set(results, canonicalid, credit);
+            if (!retval && canonicalid) {
+                write_log("found a canonical result\n");
+                wu.canonical_resultid = canonicalid;
+                wu.canonical_credit = credit;
+                wu.assimilate_state = ASSIMILATE_READY;
+                for (i=0; i<results.size(); i++) {
+                    result = results[i];
+                    update_result = false;

-                        // if result is not canonical, arrange to delete
-                        // its output files
-                        //
-                        if (results[i].id != canonicalid) {
-                            results[i].file_delete_state = FILE_DELETE_READY;
+                    // grant credit for valid results
+                    //
+                    if (result.validate_state == VALIDATE_STATE_VALID) {
+                        update_result = true;
+                        retval = grant_credit(result, credit);
+                        if (retval) {
+                            sprintf(buf,
+                                "validate: grant_credit %d\n", retval
+                            );
+                            write_log(buf);
                        }
-
-                        // grant credit for valid results
-                        //
-                        if (results[i].validate_state == VALIDATE_STATE_VALID) {
-                            retval = grant_credit(results[i], credit);
-                            if (retval) {
-                                sprintf(buf,
-                                    "validate: grant_credit %d\n", retval
-                                );
-                                write_log(buf);
-                            }
-                            results[i].granted_credit = credit;
-                        }
-                        sprintf(buf, "updating result %d to %d; credit %f\n", results[i].id, results[i].validate_state, credit);
+                        result.granted_credit = credit;
+                        sprintf(buf,
+                            "updating result %d to %d; credit %f\n",
+                            result.id, result.validate_state, credit
+                        );
                        write_log(buf);
-                        retval = db_result_update(results[i]);
+                    }
+
+                    // don't send any unsent results
+                    //
+                    if (result.server_state == RESULT_SERVER_STATE_UNSENT) {
+                        update_result = true;
+                        result.server_state = RESULT_SERVER_STATE_OVER;
+                        result.outcome = RESULT_OUTCOME_DIDNT_NEED;
+                    }
+
+                    if (update_result) {
+                        retval = db_result_update(result);
                        if (retval) {
                            sprintf(buf,
                                "validate: db_result_update %d\n", retval
@ -247,15 +252,29 @@ bool do_validate_scan(APP& app, int min_quorum) {
                }
            }
        }
+    }

-        // we've checked all results for this WU, so turn off flag
-        //
-        wu.need_validate = 0;
-        retval = db_workunit_update(wu);
-        if (retval) {
-            sprintf(buf, "db_workunit_update: %d\n", retval);
-            write_log(buf);
-        }
+    // we've checked all results for this WU, so turn off flag
+    //
+    wu.need_validate = 0;
+    retval = db_workunit_update(wu);
+    if (retval) {
+        sprintf(buf, "db_workunit_update: %d\n", retval);
+        write_log(buf);
+    }
+}
+
+// Make one pass through the workunits with need_validate set,
+// passing each one to handle_wu().  Return true if there were any.
+//
+bool do_validate_scan(APP& app, int min_quorum) {
+    WORKUNIT wu;
+    bool found=false;
+
+    wu.appid = app.id;  // presumably limits the enumeration below to this app; confirm in db layer
+    while(!db_workunit_enum_app_need_validate(wu)) {  // loop runs while the enum call returns zero
+        handle_wu(wu);  // NOTE(review): min_quorum is not passed along or otherwise used in this body
+        found = true;
    }
    return found;
 }
--- a/stripchart/samples/datafiles
+++ b/stripchart/samples/datafiles
@ -10,3 +10,6 @@
 #
 get_load_out:machine load:3
 count_results_out:Number of results:3
+assimilated_wus_out:Number of assimilated workunits:3
+download_size_out:Bytes in download dir:3
+upload_size_out:Bytes in upload dir:3
--- a/test/test.inc
+++ b/test/test.inc
@ -424,12 +424,17 @@ class Project {
        PassThru("cp $source_dir/stripchart/stripchart $this->project_dir/cgi/");
        PassThru("cp $source_dir/stripchart/stripchart.cnf $this->project_dir/cgi/");
        PassThru("cp $source_dir/stripchart/samples/looper $this->project_dir/cgi/");
+        PassThru("cp $source_dir/stripchart/samples/db_looper $this->project_dir/cgi/");
        PassThru("cp $source_dir/stripchart/samples/datafiles $this->project_dir/cgi/");
        PassThru("cp $source_dir/stripchart/samples/get_load $this->project_dir/cgi/");
-        macro_substitute("BOINC_DB_NAME", $this->db_name, "$source_dir/stripchart/samples/count_results", "$this->project_dir/cgi/count_results");
-        make_executable("$this->project_dir/cgi/count_results");
+        PassThru("cp $source_dir/stripchart/samples/dir_size $this->project_dir/cgi/");
+        macro_substitute("BOINC_DB_NAME", $this->db_name, "$source_dir/stripchart/samples/db_count", "$this->project_dir/cgi/db_count");
+        make_executable("$this->project_dir/cgi/db_count");
        PassThru("cd $this->project_dir/cgi; looper get_load 1 > get_load_out &");
-        PassThru("cd $this->project_dir/cgi; looper count_results 1 > count_results_out &");
+        PassThru("cd $this->project_dir/cgi; db_looper 'result' 1 > count_results_out &");
+        PassThru("cd $this->project_dir/cgi; db_looper 'workunit where assimilate_state=2' 1 > assimilated_wus_out &");
+        PassThru("cd $this->project_dir/cgi; looper 'dir_size ../download' 1 > download_size_out &");
+        PassThru("cd $this->project_dir/cgi; looper 'dir_size ../upload' 1 > upload_size_out &");
    }

    // this should stop the feeder and any other daemons
--- a/test/test_loop.php
+++ b/test/test_loop.php
@ -33,16 +33,9 @@
    $project->start_feeder();
    $project->start_make_work($work);
    $project->start_validate($app, 3);
-    //$project->start_file_delete();
+    $project->start_file_delete();
    $project->start_assimilator($app);
    $project->start_stripchart();
    $host->run();
-    //$project->stop();
-
-    //$result->state = RESULT_STATE_DONE;
-    //$result->stderr_out = "APP: upper_case: starting, argc 1";
-    //$result->exit_status = 0;
-    //$project->check_results(2, $result);
-    //$project->compare_file("uc_wu_0_0", "uc_correct_output");
-    //$project->compare_file("uc_wu_1_0", "uc_correct_output");
+    $project->stop();
 ?>
--- a/tools/backend_lib.C
+++ b/tools/backend_lib.C
@ -153,15 +153,16 @@ void initialize_result(RESULT& result, WORKUNIT& wu) {
    result.workunitid = wu.id;
    result.server_state = RESULT_SERVER_STATE_UNSENT;
    result.hostid = 0;
-    result.report_deadline = time(0) + wu.delay_bound;
+    result.report_deadline = 0;
    result.sent_time = 0;
    result.received_time = 0;
    result.client_state = 0;
    result.cpu_time = 0;
    strcpy(result.xml_doc_out, "");
    strcpy(result.stderr_out, "");
+    result.outcome = RESULT_OUTCOME_INIT;
    result.file_delete_state = ASSIMILATE_INIT;
-    result.validate_state = VALIDATE_STATE_INITIAL;
+    result.validate_state = VALIDATE_STATE_INIT;
    result.claimed_credit = 0;
    result.granted_credit = 0;
 }
--- a/tools/create_work.C
+++ b/tools/create_work.C
@ -144,7 +144,7 @@ int main(int argc, char** argv) {
    }

    wu.appid = app.id;
-    wu.retry_check_time = time(0) + wu.delay_bound;
+    wu.timeout_check_time = time(0) + wu.delay_bound;

    retval = read_key_file(keyfile, key);
    if (retval) {