result retry

svn path=/trunk/boinc/; revision=570
This commit is contained in:
David Anderson 2002-11-05 18:36:09 +00:00
parent 646718a2b8
commit d3942cc197
8 changed files with 189 additions and 31 deletions

View File

@ -2263,3 +2263,16 @@ Eric October 29, 2002
mac_carbon_dsp.c,h
mac_carbon_gl.c,h
David Nov 5 2002
- Continued work on result retry mechanism:
- new fields "retry_check_time" and "state" in workunit table
- logic for creating new output filenames
- needs a bit more work
db/
db.h
db_mysql.C
schema.sql
sched/
Makefile.in
make_work.C
result_retry.C

10
db/db.h
View File

@ -188,6 +188,13 @@ struct HOST {
int parse_net_stats(FILE*);
};
// Values for the "state" field of WORKUNIT (declared below).
// The comment under each value describes the condition it records.
#define WU_STATE_SEND_FAIL 1
// failed to send results for this WU
#define WU_STATE_TOO_MANY_ERRORS 2
// too many results came back with errors; the WU may have a bug
#define WU_STATE_TOO_MANY_DONE 3
// too many results without consensus; the WU may be nondeterministic
struct WORKUNIT {
int id;
unsigned int create_time; // time of record creation
@ -206,6 +213,8 @@ struct WORKUNIT {
// VALIDATE_STATE_NEED_CHECK state
int canonical_resultid; // ID of canonical result, or zero
double canonical_credit; // credit that all correct results get
double retry_check_time; // when to check for result retry
int state; // see above
// the following not used in the DB
char app_name[256];
@ -296,6 +305,7 @@ extern int db_workunit_update(WORKUNIT& p);
extern int db_workunit_lookup_name(WORKUNIT&);
//extern int db_workunit_enum_dynamic_to_send(WORKUNIT&, int);
extern int db_workunit_enum_app_need_validate(WORKUNIT&);
extern int db_workunit_enum_retry_check_time(WORKUNIT&);
extern int db_result_new(RESULT& p);
extern int db_result(int id, RESULT&);

View File

@ -194,12 +194,14 @@ void struct_to_str(void* vp, char* q, int type) {
"has_successor=%d, name='%s', xml_doc='%s', batch=%d, "
"rsc_fpops=%f, rsc_iops=%f, rsc_memory=%f, rsc_disk=%f, "
"need_validate=%d, "
"canonical_resultid=%d, canonical_credit=%f",
"canonical_resultid=%d, canonical_credit=%f, "
"retry_check_time=%f, state=%d",
wup->id, wup->create_time, wup->appid, wup->previous_wuid,
wup->has_successor?1:0, wup->name, wup->xml_doc, wup->batch,
wup->rsc_fpops, wup->rsc_iops, wup->rsc_memory, wup->rsc_disk,
wup->need_validate,
wup->canonical_resultid, wup->canonical_credit
wup->canonical_resultid, wup->canonical_credit,
wup->retry_check_time, wup->state
);
break;
case TYPE_RESULT:
@ -360,6 +362,8 @@ void row_to_struct(MYSQL_ROW& r, void* vp, int type) {
wup->need_validate = atoi(r[i++]);
wup->canonical_resultid = atoi(r[i++]);
wup->canonical_credit = atof(r[i++]);
wup->retry_check_time = atof(r[i++]);
wup->state = atoi(r[i++]);
break;
case TYPE_RESULT:
rp = (RESULT*)vp;
@ -574,6 +578,16 @@ int db_workunit_enum_app_need_validate(WORKUNIT& p) {
return db_enum(e, &p, TYPE_WORKUNIT, buf);
}
// Enumerate workunits whose retry check is due, i.e. rows with
// 0 < retry_check_time < p.retry_check_time.
//
// p: on the call that starts an enumeration, p.retry_check_time supplies
//    the cutoff used in the query; p is also handed to db_enum() as the
//    record for the current row.
//
// Returns whatever db_enum() returns.
//
int db_workunit_enum_retry_check_time(WORKUNIT& p) {
    static ENUM e;      // enumeration state kept across calls
    char buf[256];

    // The where-clause is only built when no enumeration is in progress.
    // NOTE(review): presumably db_enum() ignores buf once e.active is set,
    // as in the other enumerators in this file -- confirm.
    if (!e.active) {
        // snprintf: "%f" of a very large double can be hundreds of digits
        // long, which would overrun buf with plain sprintf.
        snprintf(buf, sizeof(buf),
            "where retry_check_time > 0 and retry_check_time < %f",
            p.retry_check_time
        );
    }
    return db_enum(e, &p, TYPE_WORKUNIT, buf);
}
////////// RESULT /////////

View File

@ -128,6 +128,8 @@ create table workunit (
need_validate smallint not null,
canonical_resultid integer not null,
canonical_credit double not null,
retry_check_time double not null,
state integer not null,
primary key (id)
);

View File

@ -4,7 +4,7 @@
<p>
The <b>back end</b> of a BOINC project consists
of a number of components that together are responsible
for generating work and absorbing the results.
for generating work and handling results.
Some of these components are supplied by BOINC;
other parts are project- or application-specific:
@ -14,47 +14,68 @@ other parts are project- or application-specific:
<th>BOINC-supplied part</th>
<th>project-supplied part</th>
<tr>
<td>
<b>Work generator</b>: generates work units and their input files,
and generates one or more results per work unit.
<td valign=top>
<b>Work generator</b>: generates work units, work sequences, results,
and the corresponding input files.
</td>
<td>
<td valign=top>
Functions and programs that handle the details of
creating workunit and result database records.
creating workunit, work sequence, and result database records.
</td>
<td>
<td valign=top>
Programs or scripts that generate input files,
install them on data servers,
and call the BOINC functions.
</td></tr>
<tr>
<td><b>Result retry generator</b>:
generates additional results if some of those initially sent are lost.</td>
<td>A program, <b>result_retry</b>, that does the job.</td>
<td>Some parameters used by result_retry.</td>
<td valign=top><b>Result retry generator</b>:
generates additional results if some of those initially sent are lost,
not completed, or completed erroneously.</td>
<td valign=top>A program, <b>result_retry</b>, that does the job.</td>
<td valign=top>Some parameters used by result_retry.</td>
</tr>
<tr>
<td><b>Result validation and accounting</b>:
<td valign=top><b>Result validation and accounting</b>:
compare redundant results; select a <b>canonical result</b>
representing the correct output,
and a <b>canonical credit</b> granted to users and hosts
that return the correct output.</td>
<td>A program, <b>validate</b>, that contains the
<td valign=top>A program, <b>validate</b>, that contains the
basic logic for validation.</td>
<td>An application-specific function, linked with <b>validate</b>,
<td valign=top>An application-specific function, linked with <b>validate</b>,
that compares sets of redundant results.</td>
</tr>
<tr>
<td><b>Result processing</b>: parse output files of
completed results, and take appropriate action
(record results in a database, and/or generate more work)</td>
<td>Function for enumerating unprocessed results.</td>
<td>The rest.</td>
<td valign=top><b>Work sequence relocater</b>:
detects work sequences whose hosts have failed,
and relocates them to other hosts.</td>
<td valign=top>A program <b>seq_relocate</b></td>
<td valign=top>Some parameters used by seq_relocate</td>
</tr>
<tr>
<td><b>Garbage collector</b>: delete input and output files
<td valign=top><b>Work sequence validation and accounting</b>:
Similar to result validation, but for work sequences.
</td>
<td valign=top>A program <b>seq_validate</b></td>
<td valign=top>An application-specific function,
linked with <b>seq_validate</b>, that compares sets of redundant results.</td>
</tr>
<tr>
<td valign=top><b>Result processing</b>: parse output files of
completed results, and take appropriate action
(record results in a database, and/or generate more work)</td>
<td valign=top>Function for enumerating unprocessed results.
Functions and programs that handle the details of
creating workunit, work sequence, and result database records.
</td>
<td valign=top>
The rest.
</td>
</tr>
<tr>
<td valign=top><b>Garbage collector</b>: delete input and output files
when they are no longer needed.</td>
<td>A program, <b>garbage_collect</b>, that does the job.</td>
<td>None.</td>
<td valign=top>A program, <b>garbage_collect</b>, that does the job.</td>
<td valign=top>None.</td>
</tr>
</table>

View File

@ -79,6 +79,11 @@ MAKE_WORK_OBJS = \
../lib/crypt.o \
../RSAEuro/source/rsaeuro.a
RESULT_RETRY_OBJS = \
result_retry.o \
../db/db_mysql.o \
../db/mysql_util.o
FCGI_OBJS = \
handle_request.fcgi.o \
main.fcgi.o \

View File

@ -6,7 +6,8 @@
// [ -redundancy n ]
// [ -cushion n ]
//
// Create WU and result records as needed to maintain a pool of work.
// Create WU and result records as needed to maintain a pool of work
// (for testing purposes).
// Makes a new WU for every "redundancy" results.
// Clones the WU of the given name.
//

View File

@ -7,29 +7,93 @@
// [ -ndet n ]
// [ -nredundancy n ]
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
#include <vector>
#include "db.h"
// Tunable limits; main() overrides the defaults from the command line.
int max_errors = 999;
    // set by -nerror; presumably the error count above which a WU
    // is marked WU_STATE_TOO_MANY_ERRORS (verify in main_loop)
int max_done = 999;
    // set by -ndet; main_loop marks a WU WU_STATE_TOO_MANY_DONE
    // when its "ndone" count exceeds this
int nredundancy = 999;
    // set by -nredundancy; main_loop generates (nredundancy - ndone)
    // new results for a WU
int startup_time;

// The scheme for generating unique output filenames is as follows.
// If the original filename is of the form x__y,
// then y is replaced with a string of the form time_seqno,
// where "time" is when this program started up (startup_time)
// and "seqno" is a counter that advances on every call.
// NOTE: if you ever need to start up multiple copies of this,
// you'll need to add a PID in there somewhere.
//
// If the original filename doesn't have __, append a string
// of the form __time_seqno
//
// name: NUL-terminated filename, rewritten in place.
//       The caller must leave room for the suffix
//       (at most 25 extra characters with 32-bit ints).
//
void make_unique_name(char* name) {
    char suffix[64];
    static int seqno;
    char* sep;

    // Post-increment so that successive calls never reuse a suffix.
    snprintf(suffix, sizeof(suffix), "%d_%d", startup_time, seqno++);

    sep = strstr(name, "__");
    if (sep) {
        // keep "x__", replace everything after it
        strcpy(sep + 2, suffix);
    } else {
        // no separator yet: add one so the result is again of the form x__y
        strcat(name, "__");
        strcat(name, suffix);
    }
}
// Convert a result's XML document so that it refers to new, unique
// output filenames: the text of every <name>...</name> element is
// replaced by the name produced by make_unique_name().
//
// in:  NUL-terminated XML document, rewritten in place.
//      Names can get longer, so the caller must supply a buffer
//      with spare room beyond the current text.
// out: currently unused (the conversion is done in place);
//      kept so that existing callers are unaffected.
//
// Returns 0 on success.  Returns 1 (after printing the document to
// stderr) if a <name> has no closing tag or a name is too long to
// process; elements before the offending one have already been rewritten.
//
// TODO: apply the same conversion to the matching <file_name> elements.
//
int assign_new_names(char* in, char* out) {
    static const char open_tag[] = "<name>";
    static const char close_tag[] = "</name>";
    // space kept free in name[] for what make_unique_name() may append
    // ("__" plus two decimal ints and a '_')
    enum { NAME_SUFFIX_ROOM = 32 };
    char name[256];
    char *p = in, *n1, *n2;
    size_t len, new_len;

    (void)out;

    while (1) {
        n1 = strstr(p, open_tag);
        if (!n1) break;
        n1 += strlen(open_tag);

        // Look for the closing tag starting at the name itself,
        // so that a </name> belonging to an earlier element
        // can never be picked up.
        n2 = strstr(n1, close_tag);
        if (!n2) {
            fprintf(stderr, "malformed XML:\n%s", in);
            return 1;
        }

        len = (size_t)(n2 - n1);
        if (len + NAME_SUFFIX_ROOM >= sizeof(name)) {
            fprintf(stderr, "name too long in XML:\n%s", in);
            return 1;
        }
        memcpy(name, n1, len);
        name[len] = 0;
        make_unique_name(name);
        new_len = strlen(name);

        // Slide the rest of the document (closing tag onward, including
        // the terminating NUL) to its new position, then drop in the name.
        memmove(n1 + new_len, n2, strlen(n2) + 1);
        memcpy(n1, name, new_len);

        // continue scanning after this element's closing tag
        p = n1 + new_len + strlen(close_tag);
    }
    return 0;
}
void main_loop() {
WORKUNIT wu;
RESULT result;
int nerrors, ndone;
unsigned int i, n;
wu.retry_check_time = time(0);
// loop over WUs that are due to be checked
//
while (db_workunit_enum_check_time(wu)) {
while (db_workunit_enum_retry_check_time(wu)) {
vector<RESULT> results;
// enumerate all the results for the WU
//
result.workunitid = wu.id;
while (db_result_enum_workunitid(result)) {
while (db_result_enum_wuid(result)) {
results.push_back(result);
}
nerrors = 0;
ndone = 0;
for (i=0; i<results.size(); i++) {
result = result[i];
result = results[i];
// if any result is unsent, give up on the WU
//
@ -53,16 +117,18 @@ void main_loop() {
fprintf(stderr, "WU %s has too many errors\n", wu.name);
wu.state = WU_STATE_TOO_MANY_ERRORS;
db_workunit_update(wu);
go next_wu;
goto next_wu;
}
if (ndone > max_done) {
fprintf(stderr, "WU %s has too many answers\n", wu.name);
wu.state = WU_STATE_TOO_MANY_DONE;
db_workunit_update(wu);
go next_wu;
goto next_wu;
}
// generate new results if needed
// Generate new results if needed.
// Munge the XML of an existing result
// to create unique new output filenames.
//
n = nredundancy - ndone;
for (i=0; i<n; i++) {
@ -76,4 +142,30 @@ next_wu:
}
// Entry point: parse options, then run main_loop() forever.
//
// Options:
//   -nerror n        sets max_errors
//   -ndet n          sets max_done
//   -nredundancy n   sets nredundancy
//   -asynch          fork; the child runs the loop and the parent exits
// Unrecognized arguments are ignored.
//
// Returns 1 if an option is missing its value or fork() fails;
// the parent of an -asynch run returns 0.  Otherwise never returns.
//
int main(int argc, char** argv) {
    int i;
    bool asynch = false;

    // recorded once; make_unique_name() builds filenames from it
    startup_time = time(0);

    for (i=1; i<argc; i++) {
        int* target = 0;

        if (!strcmp(argv[i], "-asynch")) {
            asynch = true;
            continue;
        } else if (!strcmp(argv[i], "-nerror")) {
            target = &max_errors;
        } else if (!strcmp(argv[i], "-ndet")) {
            target = &max_done;
        } else if (!strcmp(argv[i], "-nredundancy")) {
            target = &nredundancy;
        } else {
            continue;
        }

        // Make sure the option's value is really there before reading it;
        // argv[argc] is a null pointer and must not reach atoi().
        if (i+1 >= argc) {
            fprintf(stderr, "%s requires an integer argument\n", argv[i]);
            return 1;
        }
        *target = atoi(argv[++i]);
    }

    if (asynch) {
        int pid = fork();
        if (pid < 0) {
            perror("fork");
            return 1;
        }
        if (pid > 0) {
            // parent: the child carries on with the work
            return 0;
        }
    }

    while (1) {
        main_loop();
    }
}