diff --git a/checkin_notes b/checkin_notes index fc85246a0e..3676618b34 100755 --- a/checkin_notes +++ b/checkin_notes @@ -2263,3 +2263,16 @@ Eric October 29, 2002 mac_carbon_dsp.c,h mac_carbon_gl.c,h +David Nov 5 2002 + - Continued work on result retry mechanism: + - new fields "retry_check_time" and "state" in workunit table + - logic for creating new output filenames + - needs a bit more work + db/ + db.h + db_mysql.C + schema.sql + sched/ + Makefile.in + make_work.C + result_retry.C diff --git a/db/db.h b/db/db.h index 044a8835de..29bf5f9077 100644 --- a/db/db.h +++ b/db/db.h @@ -188,6 +188,13 @@ struct HOST { int parse_net_stats(FILE*); }; +#define WU_STATE_SEND_FAIL 1 + // failed to send results for this WU +#define WU_STATE_TOO_MANY_ERRORS 2 + // too many errors; may have bug +#define WU_STATE_TOO_MANY_DONE 3 + // too many results without consensus; may be nondeterministic + struct WORKUNIT { int id; unsigned int create_time; // time of record creation @@ -206,6 +213,8 @@ struct WORKUNIT { // VALIDATE_STATE_NEED_CHECK state int canonical_resultid; // ID of canonical result, or zero double canonical_credit; // credit that all correct results get + double retry_check_time; // when to check for result retry + int state; // see above // the following not used in the DB char app_name[256]; @@ -296,6 +305,7 @@ extern int db_workunit_update(WORKUNIT& p); extern int db_workunit_lookup_name(WORKUNIT&); //extern int db_workunit_enum_dynamic_to_send(WORKUNIT&, int); extern int db_workunit_enum_app_need_validate(WORKUNIT&); +extern int db_workunit_enum_retry_check_time(WORKUNIT&); extern int db_result_new(RESULT& p); extern int db_result(int id, RESULT&); diff --git a/db/db_mysql.C b/db/db_mysql.C index c7ec071a18..ea2791d88a 100644 --- a/db/db_mysql.C +++ b/db/db_mysql.C @@ -194,12 +194,14 @@ void struct_to_str(void* vp, char* q, int type) { "has_successor=%d, name='%s', xml_doc='%s', batch=%d, " "rsc_fpops=%f, rsc_iops=%f, rsc_memory=%f, rsc_disk=%f, " 
"need_validate=%d, " - "canonical_resultid=%d, canonical_credit=%f", + "canonical_resultid=%d, canonical_credit=%f, " + "retry_check_time=%f, state=%d", wup->id, wup->create_time, wup->appid, wup->previous_wuid, wup->has_successor?1:0, wup->name, wup->xml_doc, wup->batch, wup->rsc_fpops, wup->rsc_iops, wup->rsc_memory, wup->rsc_disk, wup->need_validate, - wup->canonical_resultid, wup->canonical_credit + wup->canonical_resultid, wup->canonical_credit, + wup->retry_check_time, wup->state ); break; case TYPE_RESULT: @@ -360,6 +362,8 @@ void row_to_struct(MYSQL_ROW& r, void* vp, int type) { wup->need_validate = atoi(r[i++]); wup->canonical_resultid = atoi(r[i++]); wup->canonical_credit = atof(r[i++]); + wup->retry_check_time = atof(r[i++]); + wup->state = atoi(r[i++]); break; case TYPE_RESULT: rp = (RESULT*)vp; @@ -574,6 +578,16 @@ int db_workunit_enum_app_need_validate(WORKUNIT& p) { return db_enum(e, &p, TYPE_WORKUNIT, buf); } +int db_workunit_enum_retry_check_time(WORKUNIT& p) { + static ENUM e; + char buf[256]; + + if (!e.active) { + sprintf(buf, "where retry_check_time > 0 and retry_check_time < %f", p.retry_check_time); + } + return db_enum(e, &p, TYPE_WORKUNIT, buf); +} + ////////// RESULT ///////// diff --git a/db/schema.sql b/db/schema.sql index a2e83d889a..06440a3e54 100644 --- a/db/schema.sql +++ b/db/schema.sql @@ -128,6 +128,8 @@ create table workunit ( need_validate smallint not null, canonical_resultid integer not null, canonical_credit double not null, + retry_check_time double not null, + state integer not null, primary key (id) ); diff --git a/doc/backend.html b/doc/backend.html index 64df7e42f9..864205b4ec 100644 --- a/doc/backend.html +++ b/doc/backend.html @@ -4,7 +4,7 @@

The back end of a BOINC project consists of a number of components that together are responsible -for generating work and absorbing the results. +for generating work and handling results. Some of these components are supplied by BOINC; other parts are project- or application-specific: @@ -14,47 +14,68 @@ other parts are project- or application-specific: BOINC-supplied part project-supplied part - -Work generator: generates work units and their input files, -and generates one or more results per work unit. + +Work generator: generates work units, work sequences, results, +and the corresponding input files. - + Functions and programs that handle the details of -creating workunit and result database records. +creating workunit, work sequence, and result database records. - + Programs or scripts that generate input files, install them on data servers, and call the BOINC functions. -Result retry generator: -generates additional results if some of those initially sent are lost. -A program, result_retry, that does the job. -Some parameters used by result_retry. +Result retry generator: +generates additional results if some of those initially sent are lost, +not completed, or completed erroneously. +A program, result_retry, that does the job. +Some parameters used by result_retry. -Result validation and accounting: +Result validation and accounting: compare redundant results; select a canonical result representing the correct output, and a canonical credit granted to users and hosts that return the correct output. -A program, validate, that contains the +A program, validate, that contains the basic logic for validation. -An application-specific function, linked with validate, +An application-specific function, linked with validate, that compares sets of redundant results. -Result processing: parse output files of -completed results, and take appropriate action -(record results in a database, and/or generate more work) -Function for enumerating unprocessed results. 
-The rest. +Work sequence relocater: +detects work sequences whose hosts have failed, +and relocates them to other hosts. +A program seq_relocate +Some parameters used by seq_relocate -Garbage collector: delete input and output files +Work sequence validation and accounting: +Similar to result validation, but for work sequences. + +A program seq_validate +An application-specific function, +linked with seq_validate, that compares sets of redundant results. + + +Result processing: parse output files of +completed results, and take appropriate action +(record results in a database, and/or generate more work) +Function for enumerating unprocessed results. +Functions and programs that handle the details of +creating workunit, work sequence, and result database records. + + +The rest. + + + +Garbage collector: delete input and output files when they are no longer needed. -A program, garbage_collect, that does the job. -None. +A program, garbage_collect, that does the job. +None. diff --git a/sched/Makefile.in b/sched/Makefile.in index cd71d616eb..07aa60d2f8 100644 --- a/sched/Makefile.in +++ b/sched/Makefile.in @@ -79,6 +79,11 @@ MAKE_WORK_OBJS = \ ../lib/crypt.o \ ../RSAEuro/source/rsaeuro.a +RESULT_RETRY_OBJS = \ + result_retry.o \ + ../db/db_mysql.o \ + ../db/mysql_util.o + FCGI_OBJS = \ handle_request.fcgi.o \ main.fcgi.o \ diff --git a/sched/make_work.C b/sched/make_work.C index 378b290eb6..750b171738 100644 --- a/sched/make_work.C +++ b/sched/make_work.C @@ -6,7 +6,8 @@ // [ -redundancy n ] // [ -cushion n ] // -// Create WU and result records as needed to maintain a pool of work. +// Create WU and result records as needed to maintain a pool of work +// (for testing purposes). // Makes a new WU for every "redundancy" results. // Clones the WU of the given name. 
//
diff --git a/sched/result_retry.C b/sched/result_retry.C
index 243a2fe1ce..9c1c5799b0 100644
--- a/sched/result_retry.C
+++ b/sched/result_retry.C
@@ -7,29 +7,113 @@
 // [ -ndet n ]
 // [ -nredundancy n ]

+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <vector>
+
+#include "db.h"
+
+int max_errors = 999;
+int max_done = 999;
+int nredundancy = 999;
+int startup_time;
+
+// The scheme for generating unique output filenames is as follows.
+// If the original filename is of the form x__y,
+// then y is replaced with a string of the form time_seqno,
+// where "time" is when this program started up
+// and "seqno" is a counter bumped on every call.
+// NOTE: if you ever need to start up multiple copies of this,
+// you'll need to add a PID in there somewhere.
+//
+// If the original filename doesn't have __, add a string
+// of the form __time_seqno
+//
+// "name" is modified in place; the caller's buffer must have
+// room for the suffix.
+
+void make_unique_name(char* name) {
+    char buf[256], *p;
+    static int seqno;
+
+    // buf carries its own "__" so that both branches below
+    // produce x__time_seqno; seqno++ keeps successive names distinct
+    sprintf(buf, "__%d_%d", startup_time, seqno++);
+    p = strstr(name, "__");
+    if (p) {
+        strcpy(p, buf);
+    } else {
+        strcat(name, buf);
+    }
+}
+
+// Convert a result's XML document to generate new output filenames.
+// Look for <name>...</name> elements and convert the name;
+// apply the same conversion to the <file_name> element later on.
+// NOTE(review): the tag names <name> and <file_name> are assumed here;
+// confirm against the result XML schema.
+// The document is rewritten in place in "in" (which may grow);
+// "out" is not written.
+// Returns 0 on success, 1 on malformed XML or an over-long name.
+//
+int assign_new_names(char* in, char* out) {
+    char *p = in, *n1, *n2;
+    char name[256], buf[MAX_BLOB_SIZE];
+    int len;
+
+    while (1) {
+        n1 = strstr(p, "<name>");
+        if (!n1) break;
+        n1 += strlen("<name>");
+        // search from n1, not p: on later passes p precedes the
+        // previous element's </name>, which would give len < 0
+        n2 = strstr(n1, "</name>");
+        if (!n2) {
+            fprintf(stderr, "malformed XML:\n%s", in);
+            return 1;
+        }
+        len = n2 - n1;
+        // keep room in name[] for the suffix make_unique_name() adds
+        if (len >= (int)sizeof(name) - 32) {
+            fprintf(stderr, "file name too long:\n%s", in);
+            return 1;
+        }
+        memcpy(name, n1, len);
+        name[len] = 0;
+        make_unique_name(name);
+        strcpy(buf, n2);
+        strcpy(n1, name);
+        strcat(n1, buf);
+        p = n1;
+    }
+    return 0;
+}
+
 void main_loop() { WORKUNIT wu; RESULT result; int nerrors, ndone; + unsigned int i, n; wu.retry_check_time = time(0); // loop over WUs that are due to be checked // - while (db_workunit_enum_check_time(wu)) { + while (db_workunit_enum_retry_check_time(wu)) { vector results; // enumerate all the results for the WU // result.workunitid = wu.id; - while (db_result_enum_workunitid(result)) { + while (db_result_enum_wuid(result)) { results.push_back(result); } nerrors = 0; ndone = 0; for (i=0; i max_done) { fprintf(stderr, "WU %s has too many answers\n", wu.name); wu.state = WU_STATE_TOO_MANY_DONE; db_workunit_update(wu); - go next_wu; + goto next_wu; } - // generate new results if needed + // Generate new results if needed. + // Munge the XML of an existing result + // to create unique new output filenames. // n = nredundancy - ndone; for (i=0; i