result retry

svn path=/trunk/boinc/; revision=584
This commit is contained in:
David Anderson 2002-11-07 19:31:34 +00:00
parent 22e153233f
commit 783239db37
26 changed files with 424 additions and 197 deletions

View File

@ -2276,3 +2276,45 @@ David Nov 5 2002
Makefile.in
make_work.C
result_retry.C
David Nov 7 2002
- Finished and did basic testing of result retry mechanism.
It now correctly generates new results with unique names,
unique filenames, and upload signatures.
TODO: make a test script
- added DB field workunit.delay_bound:
determines deadlines of result, determines retry check period
Mandatory argument to create_work.
- Added DB indices for all enumeration functions
client/
client_types.C
db/
constraints.sql
db.h
db_mysql.C
schema.sql
doc/
result.html
tools_work.html
work.html
html_ops/
db.inc
sched/
Makefile.in
feeder.C
handle_request.C
result_retry.C
test/
1sec_wu
concat_wu
sah_result
sah_wu
test.inc
test_uc.php
uc_wu
ucs_wu
tools/
backend_lib.C,h
create_work.C
process_result_template.C

View File

@ -368,7 +368,7 @@ int FILE_INFO::write(FILE* out, bool to_server) {
pers_file_xfer->write(out);
}
if (!to_server) {
if (signed_xml) {
if (signed_xml && xml_signature) {
fprintf(out, "<signed_xml>\n%s</signed_xml>\n", signed_xml);
}
if (xml_signature) {

View File

@ -11,14 +11,18 @@ alter table app_version
alter table user
add unique(email_addr);
alter table user
add index ind_tid (teamid);
alter table team
add unique(name);
alter table workunit
add unique(name);
create index wu_val on workunit(appid, need_validate);
create index wu_retry on workunit(appid, retry_check_time);
alter table result
add unique(name);
create index res_wuid on result(workunitid);
create index ind_res_st on result(state);

15
db/db.h
View File

@ -197,12 +197,9 @@ struct HOST {
struct WORKUNIT {
int id;
unsigned int create_time; // time of record creation
unsigned int check_time; // when to check for result retury, or zero
int appid; // ID of APP record tied to this workunit
int previous_wuid;
bool has_successor;
char name[256]; // Name of the workunit
unsigned int create_time;
int appid; // associated app
char name[256];
char xml_doc[MAX_BLOB_SIZE];
int batch;
double rsc_fpops; // estimated # of FP operations
@ -214,7 +211,8 @@ struct WORKUNIT {
int canonical_resultid; // ID of canonical result, or zero
double canonical_credit; // credit that all correct results get
double retry_check_time; // when to check for result retry
int state; // see above
int state; // see values above
int delay_bound; // determines result deadline, retry check time
// the following not used in the DB
char app_name[256];
@ -303,7 +301,6 @@ extern int db_workunit_new(WORKUNIT& p);
extern int db_workunit(int id, WORKUNIT&);
extern int db_workunit_update(WORKUNIT& p);
extern int db_workunit_lookup_name(WORKUNIT&);
//extern int db_workunit_enum_dynamic_to_send(WORKUNIT&, int);
extern int db_workunit_enum_app_need_validate(WORKUNIT&);
extern int db_workunit_enum_retry_check_time(WORKUNIT&);
@ -311,7 +308,7 @@ extern int db_result_new(RESULT& p);
extern int db_result(int id, RESULT&);
extern int db_result_update(RESULT& p);
extern int db_result_lookup_name(RESULT& p);
extern int db_result_enum_to_send(RESULT&, int);
extern int db_result_enum_state(RESULT&, int);
extern int db_result_enum_wuid(RESULT&);
extern int db_result_count_state(int state, int&);

View File

@ -190,18 +190,18 @@ void struct_to_str(void* vp, char* q, int type) {
case TYPE_WORKUNIT:
wup = (WORKUNIT*)vp;
sprintf(q,
"id=%d, create_time=%d, appid=%d, previous_wuid=%d, "
"has_successor=%d, name='%s', xml_doc='%s', batch=%d, "
"id=%d, create_time=%d, appid=%d, "
"name='%s', xml_doc='%s', batch=%d, "
"rsc_fpops=%f, rsc_iops=%f, rsc_memory=%f, rsc_disk=%f, "
"need_validate=%d, "
"canonical_resultid=%d, canonical_credit=%f, "
"retry_check_time=%f, state=%d",
wup->id, wup->create_time, wup->appid, wup->previous_wuid,
wup->has_successor?1:0, wup->name, wup->xml_doc, wup->batch,
"retry_check_time=%f, delay_bound=%d, state=%d",
wup->id, wup->create_time, wup->appid,
wup->name, wup->xml_doc, wup->batch,
wup->rsc_fpops, wup->rsc_iops, wup->rsc_memory, wup->rsc_disk,
wup->need_validate,
wup->canonical_resultid, wup->canonical_credit,
wup->retry_check_time, wup->state
wup->retry_check_time, wup->delay_bound, wup->state
);
break;
case TYPE_RESULT:
@ -350,8 +350,6 @@ void row_to_struct(MYSQL_ROW& r, void* vp, int type) {
wup->id = atoi(r[i++]);
wup->create_time = atoi(r[i++]);
wup->appid = atoi(r[i++]);
wup->previous_wuid = atoi(r[i++]);
wup->has_successor = (atoi(r[i++])!=0);
strcpy2(wup->name, r[i++]);
strcpy2(wup->xml_doc, r[i++]);
wup->batch = atoi(r[i++]);
@ -363,6 +361,7 @@ void row_to_struct(MYSQL_ROW& r, void* vp, int type) {
wup->canonical_resultid = atoi(r[i++]);
wup->canonical_credit = atof(r[i++]);
wup->retry_check_time = atof(r[i++]);
wup->delay_bound = atoi(r[i++]);
wup->state = atoi(r[i++]);
break;
case TYPE_RESULT:
@ -583,7 +582,10 @@ int db_workunit_enum_retry_check_time(WORKUNIT& p) {
char buf[256];
if (!e.active) {
sprintf(buf, "where retry_check_time > 0 and retry_check_time < %f", p.retry_check_time);
sprintf(buf,
"where appid=%d and retry_check_time > 0 and retry_check_time < %f",
p.appid, p.retry_check_time
);
}
return db_enum(e, &p, TYPE_WORKUNIT, buf);
}
@ -610,11 +612,11 @@ int db_result_lookup_name(RESULT& p) {
return db_lookup(&p, TYPE_RESULT, buf);
}
int db_result_enum_to_send(RESULT& p, int limit) {
int db_result_enum_state(RESULT& p, int limit) {
static ENUM e;
char buf[256];
if (!e.active) sprintf(buf, "where state=%d", RESULT_STATE_UNSENT);
if (!e.active) sprintf(buf, "where state=%d", p.state);
return db_enum(e, &p, TYPE_RESULT, buf, limit);
}

View File

@ -116,8 +116,6 @@ create table workunit (
id integer not null auto_increment,
create_time integer not null,
appid integer not null,
previous_wuid integer not null,
has_successor smallint not null,
name varchar(254) not null,
xml_doc blob,
batch integer not null,
@ -129,6 +127,7 @@ create table workunit (
canonical_resultid integer not null,
canonical_credit double not null,
retry_check_time double not null,
delay_bound integer not null,
state integer not null,
primary key (id)
);

View File

@ -9,20 +9,14 @@ The attributes of a result include:
<ul>
<li> The name of the result (unique across all results in the project).
<li> The name of the associated workunit.
<li> The associated workunit.
<li> The time when the completed result should be reported to a
scheduling server.
This is assigned by the project, and is used by
clients to prioritize operations and to initiate scheduler RPCs.
There is no guarantee that the result will actually be reported by this time.
<li> An XML document listing the names of its output files; see below.
<li> An XML document giving the sizes and checksums of its output
files (filled in after the result is completed).
<li> The stderr output of the result.
<li> The host that computed the result.
<li> The times when the result was dispatched and received.
<li> The exit status of the application.
<li> The reported CPU time.
<li> The time when the result was dispatched.
<li> Its <b>state</b>. Values include:
<ul>
<li> Inactive (not ready to dispatch)
@ -34,49 +28,41 @@ files (filled in after the result is completed).
</ul>
</ul>
<p>
The following attributes are defined after the result is completed:
<ul>
<li> An XML document giving the sizes and checksums of its output
files (filled in after the result is completed).
<li> The stderr output of the result.
<li> The host that was sent the result.
<li> The time when the result was received.
<li> The exit status of the application.
<li> The reported CPU time.
</ul>
<p>
Results are normally created using the
<a href=tools_work.html>create_work</a> utility.
<p>
The XML document listing the output files has the form: <pre>
&lt;file_info&gt;...&lt;/file_info&gt;
&lt;file_info>...&lt;/file_info>
[ ... ]
&lt;result&gt;
&lt;name&gt;foobar&lt;/name&gt;
&lt;wu_name&gt;blah&lt;/wu_name&gt;
&lt;exit_status&gt;blah&lt;/exit_status&gt;
&lt;file_ref&gt;...&lt;/file_ref&gt;
&lt;result>
&lt;name>foobar&lt;/name>
&lt;wu_name>blah&lt;/wu_name>
&lt;exit_status>blah&lt;/exit_status>
&lt;file_ref>...&lt;/file_ref>
[ ... ]
&lt;/result&gt;
&lt;/result>
</pre>
The components are:
<ul>
<li> The <b>&lt;name&gt;</b> element is the result name.
<li> The <b>&lt;wu_name&gt;</b> element is the workunit name.
<li> Each <b>&lt;file_ref&gt;</b> element is an association to an
output file, described by a corresponding <b>&lt;file_info&gt;</b> element.
<li> The <b>&lt;name></b> element is the result name.
<li> The <b>&lt;wu_name></b> element is the workunit name.
<li> Each <b>&lt;file_ref></b> element is an association to an
output file, described by a corresponding <b>&lt;file_info></b> element.
</ul>
<p>
The XML document describing the sizes and checksums of the output
files is just a list of <b>&lt;file_info&gt;</b> elements, with the
files is just a list of <b>&lt;file_info></b> elements, with the
<b>nbytes</b> and <b>md5_cksum</b> fields present.
The project back end
must parse this field to find the locations and checksums of output files.
<p>
Several results may be associated with a single workunit.
Results
may be generated in either of two ways (selected as part of the application):
<ul>
<li> <b>Advance generation</b> of results.
One or more result records are stored in the database
when the workunit is produced.
The scheduling server dispatches each result to a single participant host.
When all result records have been dispatched,
participant hosts are "turned away".
<li> <b>On-demand generation</b> of results.
The application specifies a "result template",
which has place-holder tokens for the output filenames.
The scheduling server, in response to a host request,
generates a new result record and sends the result template.
The host generates unique output filenames,
and returns them when the computation is done.
</ul>

View File

@ -1,77 +1,72 @@
<title>Generating work</title>
<title>Generating work</title>
<body bgcolor=ffffff>
<h2>Generating work</h2>
<h2>Generating work</h2>
<p>
Workunits and results can be created using either a utility program
or a C++ function.
or a C++ function.
<p>
The program
The utility program is
<pre>
create_work
-appname name
-wu_name name
-wu_template file
-result_template file
-nresults n
infile_1 ... infile_m
</pre>
<p> <b>-appname</b> specifies the name of the application
<p> <b>-wu_name</b> gives the name of the workunit.
-appname name // application name
-wu_name name // workunit name
-wu_template filename // WU template filename
-result_template filename // result template filename
-nresults n // # of results to create
-db_name x // database name
-db_passwd x // database password
-upload_url x // URL for output file upload
-download_url x // base URL for input file download
-download_dir x // where to move input files
-rsc_fpops x // est. # floating-point ops
-rsc_iops x // est. # integer ops
-rsc_memory x // est. RAM working set size, bytes
-rsc_disk x // est. disk space required
-keyfile x // path of upload private key
-delay_bound x // delay bound for result completion
infile_1 ... infile_m // input files
</pre>
<p>
<b>-wu_template</b> gives the filename of a template for the
workunit XML document.
This template is macro-substituted as follows:
The workunit template file is macro-substituted as follows:
<ul>
<li>
&lt;INFILE_n/&gt; is replaced with the name of the nth input
&lt;INFILE_n/> is replaced with the name of the nth input
file.
<li>
&lt;MD5_n/&gt; is replaced with the MD5 checksum of the nth
&lt;MD5_n/> is replaced with the MD5 checksum of the nth
input file.
<li>
&lt;WU_NAME/&gt; is replaced with the workunit name.
&lt;DOWNLOAD_URL/> is replaced with the download URL.
</ul>
<p>
<b>-result_template</b> gives the filename of a template for the
result XML document.
This template is macro-substituted as follows:
The result file template is macro-substituted as follows:
<ul>
<li>
&lt;OUTFILE_n&gt; is replaced with a string of the form
&lt;OUTFILE_n> is replaced with a string of the form
"wuname_resultnum_n" where wuname is the workunit name and resultnum is
the ordinal number of the result (0, 1, ...).
<li>
&lt;WU_NAME&gt; is replaced with the workunit name.
<li>
&lt;RESULT_NAME&gt; is replaced with a string of the form
"wuname_resultnum".
&lt;UPLOAD_URL> is replaced with the upload URL.
</ul>
<p>
<b>-nresults</b> gives the number of results to be created.
<p>
<b>infile_1 ... infile_m</b> are the names of the input files.
<hr>
<p>
The C++ library (backend_lib.C,h) provides the function:
The C++ library (backend_lib.C,h) provides the function:
<pre>
int create_work(
int appid,
char* wu_name,
WORKUNIT&,
char* wu_template,
char* result_template,
char* result_template_filename,
int nresults,
char* infile_dir,
char** infiles,
int ninfiles
R_RSA_PRIVATE_KEY& key,
char* upload_url,
char* download_url
);
</pre>
</pre>
<p>
This creates a workunit and one or more results.
The arguments are
the same as for the utility program, except that the application is
identified by its database ID.
<p>
"infile_dir" is the path of the directory containing the input
files.
<p>
"infiles" is an array of names of the input files, of length "ninfiles".
The arguments are similar to those of the utility program;
some of the information is passed in the WORKUNIT structure.

View File

@ -7,15 +7,17 @@ Workunits are maintained in the <b>workunit</b> table in the BOINC DB.
The attributes of a workunit include:
</p>
<ul>
<li> Its application.
<li> Its name (unique across all workunits in the project).
<li> Its application.
<li> An XML document describing its input files and other parameters
(see below).
<li> The estimated resource requirements of the work unit
(computation, memory, disk space, network traffic).
<li> Counts of how many times this workunit should be dispatched, how
many times it has been dispatched, how many results have been returned,
and how many failures have occurred.
(computation, memory, disk space).
<li> A <b>delay bound</b>: upper bound on how long
an instance of this work unit should take to complete.
This limits what hosts the workunit is sent to,
and it's used to assign result deadlines and
times for retrying results.
</ul>
<p>
The inputs to a workunit are described by an XML document of the form
@ -23,9 +25,7 @@ The inputs to a workunit are described by an XML document of the form
[ &lt;file_info&gt;...&lt;/file_info&gt; ]
[ ... ]
&lt;workunit&gt;
&lt;name&gt;foobar&lt;/name&gt;
&lt;app_name&gt;blah&lt;/app_name&gt;
[ &lt;version_num&gt;1&lt;/version_num&gt; ]
[ &lt;command_line&gt;-flags xyz&lt;/command_line&gt; ]
[ &lt;env_vars&gt;name=val&amp;name=val&lt;/env_vars&gt; ]
[ &lt;file_ref&gt;...&lt;/file_ref&gt; ]
@ -34,10 +34,6 @@ The inputs to a workunit are described by an XML document of the form
</pre>
The components are:
<ul>
<li> The <b>&lt;name&gt;</b> element is the name of the workunit.
<li> The <b>&lt;app_name&gt;</b> element is the name of the
application.
<li> The <b>&lt;version_num&gt;</b> element is ???.
<li> The <b>&lt;command_line&gt;</b> element, if present, is the
command-line arguments to be passed to the main program.
<li> The <b>&lt;env_vars&gt;</b> element, if present, is a list of
@ -48,7 +44,7 @@ described by a <b>&lt;file_info&gt;</b> element.
</ul>
<p>
A workunit is associated with an application, not with a particular
version or set of versions???.
version or range of versions.
If the format of your input data changes in
a way that is incompatible with older versions,
you must create a new application.

View File

@ -113,6 +113,16 @@ function show_host($host) {
}
// Translate a workunit's numeric retry state into a display label.
// Codes 0 through 3 have labels; anything else is reported as "unknown".
// Comparisons are loose (==), matching the semantics of a PHP switch.
function retry_state_str($s) {
    if ($s == 0) {
        return "Default";
    }
    if ($s == 1) {
        return "Send failed";
    }
    if ($s == 2) {
        return "Too many errors (may have bug)";
    }
    if ($s == 3) {
        return "Too many done (may be nondeterministic)";
    }
    return "unknown";
}
function show_workunit($wu,$show_xml_doc) {
start_table();
row("Created", time_str($wu->create_time));
@ -129,7 +139,10 @@ function show_workunit($wu,$show_xml_doc) {
row("Need validate?", $wu->need_validate?"yes":"no");
row("Canonical resultid", $wu->canonical_resultid);
row("Canonical credit", $wu->canonical_credit);
row("","<a href=db.php?show=result&wu_id=$wu->id>Show Associated Results</a>");
row("Retry check time", time_str($wu->retry_check_time));
row("Retry state", retry_state_str($wu->state));
row("Delay bound", $wu->delay_bound);
row("","<a href=db.php?show=result&wu_id=$wu->id>Show associated results</a>");
end_table();
echo "<p>";
}
@ -143,6 +156,17 @@ function res_state_string($s) {
case 5: return "Timeout";
case 6: return "Error";
}
return "unknown";
}
// Translate a result's numeric validate state into a display label.
// Codes 0 through 3 have labels; anything else is reported as "unknown".
// Comparisons are loose (==), matching the semantics of a PHP switch.
function validate_state_str($s) {
    if ($s == 0) {
        return "Initial";
    }
    if ($s == 1) {
        return "Need check";
    }
    if ($s == 2) {
        return "Valid";
    }
    if ($s == 3) {
        return "Invalid";
    }
    return "unknown";
}
function show_result($result,$show_xml_docs,$show_stderr,$show_times) {
@ -155,6 +179,7 @@ function show_result($result,$show_xml_docs,$show_stderr,$show_times) {
row("Name", $result->name);
row("Workunit", "<a href=db.php?show=workunit&id=$result->workunitid>" . wu_name_by_id($result->workunitid) . "</a>" );
row("state", res_state_string($result->state));
row("Deadline", time_str($result->report_deadline));
row("Host ID", "<a href=db.php?show=host&id=$result->hostid>" . host_name_by_id($result->hostid) . "</a>");
row("Exit Status", $result->exit_status);
row("CPU time", $result->cpu_time);
@ -166,7 +191,10 @@ function show_result($result,$show_xml_docs,$show_stderr,$show_times) {
row("stderr out", "<pre>".htmlspecialchars($result->stderr_out)."</pre>");
}
row("batch", $result->batch);
row("validated", $result->validated);
row("project state", $result->project_state);
row("Validate state", validate_state_str($result->validate_state));
row("claimed credit", $result->claimed_credit);
row("granted credit", $result->granted_credit);
end_table();
echo "<p>";
}

View File

@ -16,7 +16,7 @@ CC = g++ $(CFLAGS)
CLIBS = @LIBS@
PROGS = cgi feeder show_shmem file_upload_handler validate_test make_work
PROGS = cgi feeder show_shmem file_upload_handler validate_test make_work result_retry
all: $(PROGS)
@ -81,8 +81,16 @@ MAKE_WORK_OBJS = \
RESULT_RETRY_OBJS = \
result_retry.o \
config.o \
../db/db_mysql.o \
../db/mysql_util.o
../db/mysql_util.o \
../lib/parse.o \
../lib/md5_file.o \
../lib/md5.o \
../lib/crypt.o \
../tools/backend_lib.o \
../tools/process_result_template.o \
../RSAEuro/source/rsaeuro.a
FCGI_OBJS = \
handle_request.fcgi.o \
@ -137,6 +145,9 @@ validate_test: $(VALIDATE_OBJS)
make_work: $(MAKE_WORK_OBJS)
$(CC) $(MAKE_WORK_OBJS) $(MYSQL_LIBS) $(CLIBS) -o make_work
result_retry: $(RESULT_RETRY_OBJS)
$(CC) $(RESULT_RETRY_OBJS) $(MYSQL_LIBS) $(CLIBS) -o result_retry
fcgi: $(FCGI_OBJS)
$(CC) $(FCGI_OBJS) $(MYSQL_LIBS) $(CLIBS) $(FCGI_LIBS) \
-o fcgi

View File

@ -116,7 +116,8 @@ void feeder_loop(SCHED_SHMEM* ssp) {
restarted_enum = false;
for (i=0; i<ssp->nwu_results; i++) {
if (!ssp->wu_results[i].present) {
retval = db_result_enum_to_send(result, RESULTS_PER_ENUM);
result.state = RESULT_STATE_UNSENT;
retval = db_result_enum_state(result, RESULTS_PER_ENUM);
if (retval) {
// if we already restarted the enum on this pass,
@ -130,7 +131,8 @@ void feeder_loop(SCHED_SHMEM* ssp) {
// restart the enumeration
//
restarted_enum = true;
retval = db_result_enum_to_send(result, RESULTS_PER_ENUM);
result.state = RESULT_STATE_UNSENT;
retval = db_result_enum_state(result, RESULTS_PER_ENUM);
printf("feeder: restarting enumeration: %d\n", retval);
if (retval) {
printf("feeder: enumeration returned nothing\n");

View File

@ -73,9 +73,10 @@ int insert_after(char* buffer, char* after, char* text) {
}
// add elements in xml_doc:
// WU name, and estimation of how many seconds it will take
// WU name, app name,
// and estimate of how many seconds it will take on this host
//
int insert_wu_tags(WORKUNIT& wu, double seconds) {
int insert_wu_tags(WORKUNIT& wu, double seconds, APP& app) {
char buf[256];
int retval;
@ -85,7 +86,10 @@ int insert_wu_tags(WORKUNIT& wu, double seconds) {
);
retval = insert_after(wu.xml_doc, "<workunit>\n", buf);
if (retval) return retval;
sprintf(buf, " <name>%s</name>\n", wu.name);
sprintf(buf,
" <name>%s</name>\n <app_name>%s</app_name>\n",
wu.name, app.name
);
return insert_after(wu.xml_doc, "<workunit>\n", buf);
}
@ -116,7 +120,7 @@ int add_wu_to_reply(
// add time estimate to reply
//
wu2 = wu; // make copy since we're going to modify its XML field
retval = insert_wu_tags(wu2, seconds_to_complete);
retval = insert_wu_tags(wu2, seconds_to_complete, *app);
if (retval) return retval;
reply.insert_workunit_unique(wu2);
return 0;
@ -382,7 +386,7 @@ int send_work(
) {
int i, retval, nresults = 0, seconds_to_fill;
WORKUNIT wu;
RESULT result;
RESULT result, result_copy;
#if 0
APP* app;
char prefix [256];
@ -415,13 +419,21 @@ int send_work(
);
if (retval) continue;
fprintf(stderr, "sending result name %s, id %d\n", result.name, result.id);
fprintf(stderr,
"sending result name %s, id %d\n",
result.name, result.id
);
// copy the result so we don't overwrite its XML fields
//
result_copy = result;
retval = insert_name_tags(result, wu);
retval = insert_name_tags(result_copy, wu);
if (retval) {
fprintf(stderr, "send_work: can't insert name tags\n");
}
reply.insert_result(result);
reply.insert_result(result_copy);
seconds_to_fill -= (int)estimate_duration(wu, reply.host);
result.state = RESULT_STATE_IN_PROGRESS;

View File

@ -1,21 +1,27 @@
// result_retry - create new results to make up for lost ones
//
// result_retry
// [ -dwu n ]
// [ -dresult n ]
// [ -nerror n ]
// [ -ndet n ]
// [ -nredundancy n ]
// -app appname
// [ -nerror n ] if get this many errors, bail on WU
// [ -ndet n ] if get this many results w/o consensus, bail
// [ -nredundancy n ] try to get at least this many done results
// [ -asynch ] be asynchronous
#include <vector>
#include <unistd.h>
#include <sys/time.h>
#include "db.h"
#include "backend_lib.h"
#include "config.h"
int max_errors = 999;
int max_done = 999;
int nredundancy = 999;
int nredundancy = 0;
int startup_time;
CONFIG config;
R_RSA_PRIVATE_KEY key;
char app_name[256];
// The scheme for generating unique output filenames is as follows.
// If the original filename is of the form x__y,
@ -31,62 +37,116 @@ void make_unique_name(char* name) {
char buf[256], *p;
static int seqno;
sprintf(buf, "%d_%d", startup_time, seqno);
sprintf(buf, "%d_%d", startup_time, seqno++);
p = strstr(name, "__");
if (p) {
strcpy(p+2, buf);
} else {
strcat(name, "__");
strcat(name, buf);
}
}
// convert a result's XML document to generate new output filenames
// Look for <name>...</name> elements and convert the name;
// convert a result's XML document to generate new output filenames.
// The input has the form
// <file_info>
// <name>xxx</name>
// ...
// </file_info>
// ...
// <result>
// <file_ref>
// <file_name>xxx</file_name>
// ...
// </file_ref>
// ...
// </result>
//
// Look for <name>...</name> elements within <file_info>
// and make a unique name based on it;
// apply the same conversion to the <file_name> element later on.
//
int assign_new_names(char* in, char* out) {
char *p = in, *n1, *n2;
char name[256], buf[MAX_BLOB_SIZE];
// TODO: this is ad-hoc. Would be nice to use some generic
// XML parsing routines, or XSLT or something.
//
int assign_new_names(char* in) {
char *p = in, *n1, *n2, *r;
char name[256], newname[256], element[256], buf[MAX_BLOB_SIZE];
int len;
// notice where the <result> is so we don't try to convert
// the result name
//
r = strstr(in, "<result>");
while (1) {
n1 = strstr(p, "<name>");
if (!n1) break;
if (n1 > r) break; // don't go past <result>
n1 += strlen("<name>");
n2 = strstr(p, "</name>");
if (!n2) {
fprintf(stderr, "malformed XML:\n%s", in);
fprintf(stderr, "assign_new_names(): malformed XML:\n%s", in);
return 1;
}
len = n2 - n1;
memcpy(name, n1, len);
name[len] = 0;
make_unique_name(name);
strcpy(newname, name);
make_unique_name(newname);
strcpy(buf, n2);
strcpy(n1, name);
strcpy(n1, newname);
strcat(n1, buf);
// replace the name in the <file_name> element
//
sprintf(element, "<file_name>%s</file_name>", name);
n2 = strstr(n1, element);
if (!n2) {
fprintf(stderr, "assign_new_names(): no <file_name>:\n%s", in);
return 1;
}
strcpy(buf, n2+strlen(element));
sprintf(element, "<file_name>%s</file_name>", newname);
strcpy(n2, element);
strcat(n2, buf);
p = n1;
}
return 0;
}
void main_loop() {
bool do_pass(APP& app) {
WORKUNIT wu;
RESULT result;
int nerrors, ndone;
int nerrors, ndone, retval;
unsigned int i, n;
bool did_something = false;
wu.retry_check_time = time(0);
wu.appid = app.id;
// loop over WUs that are due to be checked
//
while (db_workunit_enum_retry_check_time(wu)) {
while (!db_workunit_enum_retry_check_time(wu)) {
vector<RESULT> results;
did_something = true;
// if this WU has a canonical result, we're done
// (this normally doesn't happen since the retry check time
// is zeroed when canonical result found, but just in case).
//
if (wu.canonical_resultid) {
wu.retry_check_time = 0;
goto update_wu;
}
// enumerate all the results for the WU
//
result.workunitid = wu.id;
while (db_result_enum_wuid(result)) {
while (!db_result_enum_wuid(result)) {
results.push_back(result);
}
@ -100,8 +160,8 @@ void main_loop() {
if (result.state == RESULT_STATE_UNSENT) {
fprintf(stderr, "WU %s has unsent result\n", wu.name);
wu.state = WU_STATE_SEND_FAIL;
db_workunit_update(wu);
goto next_wu;
wu.retry_check_time = 0;
goto update_wu;
}
if (result.state == RESULT_STATE_ERROR) {
nerrors++;
@ -116,38 +176,96 @@ void main_loop() {
if (nerrors > max_errors) {
fprintf(stderr, "WU %s has too many errors\n", wu.name);
wu.state = WU_STATE_TOO_MANY_ERRORS;
db_workunit_update(wu);
goto next_wu;
wu.retry_check_time = 0;
goto update_wu;
}
if (ndone > max_done) {
fprintf(stderr, "WU %s has too many answers\n", wu.name);
wu.state = WU_STATE_TOO_MANY_DONE;
db_workunit_update(wu);
goto next_wu;
wu.retry_check_time = 0;
goto update_wu;
}
// Generate new results if needed.
// Munge the XML of an existing result
// to create unique new output filenames.
//
n = nredundancy - ndone;
for (i=0; i<n; i++) {
create_result(
wu, result_template_file, suffix, key,
config.upload_url, config.download_url
);
if (nredundancy > ndone) {
n = nredundancy - ndone;
for (i=0; i<n; i++) {
result = results[0];
make_unique_name(result.name);
initialize_result(result, wu);
remove_signatures(result.xml_doc_in);
assign_new_names(result.xml_doc_in);
add_signatures(result.xml_doc_in, key);
retval = db_result_new(result);
if (retval) {
fprintf(stderr, "result_retry: db_result_new %d\n", retval);
break;
}
}
}
next_wu:
// update the WU's result retry check time
//
wu.retry_check_time = time(0) + wu.delay_bound;
update_wu:
retval = db_workunit_update(wu);
if (retval) {
fprintf(stderr, "result_retry: db_workunit_update %d\n", retval);
}
}
return did_something;
}
void main_loop() {
APP app;
bool did_something;
int retval;
retval = db_open(config.db_name, config.db_passwd);
if (retval) {
fprintf(stderr, "result_retry: db_open: %d\n", retval);
exit(1);
}
strcpy(app.name, app_name);
retval = db_app_lookup_name(app);
if (retval) {
fprintf(stderr, "result_retry: can't find app %s\n", app.name);
exit(1);
}
while (1) {
did_something = do_pass(app);
if (!did_something) sleep(1);
}
}
int main(int argc, char** argv) {
int i;
int i, retval;
bool asynch = false;
char path[256];
retval = config.parse_file();
if (retval) {
fprintf(stderr, "can't read config file\n");
exit(1);
}
sprintf(path, "%s/upload_private", config.key_dir);
retval = read_key_file(path, key);
if (retval) {
fprintf(stderr, "can't read key\n");
exit(1);
}
startup_time = time(0);
for (i=1; i<argc; i++) {
if (!strcmp(argv[i], "-nerror")) {
if (!strcmp(argv[i], "-app")) {
strcpy(app_name, argv[++i]);
} else if (!strcmp(argv[i], "-nerror")) {
max_errors = atoi(argv[++i]);
} else if (!strcmp(argv[i], "-ndet")) {
max_done = atoi(argv[++i]);
@ -159,13 +277,9 @@ int main(int argc, char** argv) {
}
if (asynch) {
if (fork()==0) {
while(1) {
main_loop();
}
}
} else {
while (1) {
main_loop();
}
} else {
main_loop();
}
}

View File

@ -1,3 +1,2 @@
<workunit>
<app_name>1sec</app_name>
</workunit>

View File

@ -9,7 +9,6 @@
<md5_cksum><MD5_1/></md5_cksum>
</file_info>
<workunit>
<app_name>concat</app_name>
<file_ref>
<file_name><INFILE_0/></file_name>
<open_name>in1</open_name>

View File

@ -6,8 +6,8 @@
<max_nbytes>100000</max_nbytes>
</file_info>
<result>
<output_file>
<file_ref>
<file_name><OUTFILE_0/></file_name>
<open_name>outfile.sah</open_name>
</output_file>
</file_ref>
</result>

View File

@ -4,7 +4,6 @@
<md5_cksum><MD5_0/></md5_cksum>
</file_info>
<workunit>
<app_name>setiathome-3.06</app_name>
<input_file>
<file_name><INFILE_0/></file_name>
<open_name>work_unit.sah</open_name>

View File

@ -54,6 +54,7 @@ function db_query($query) {
function run_tool($cmd) {
$tool_dir = get_env_var("BOINC_SRC_DIR")."/tools/";
$cmd = $tool_dir.$cmd;
//echo $cmd;
PassThru($cmd);
}
@ -205,6 +206,7 @@ class Project {
PassThru("cp $source_dir/sched/file_upload_handler $this->project_dir/cgi/");
PassThru("cp $source_dir/sched/make_work $this->project_dir/cgi/");
PassThru("cp $source_dir/sched/feeder $this->project_dir/cgi/");
PassThru("cp $source_dir/sched/result_retry $this->project_dir/cgi/");
PassThru("cp $source_dir/sched/validate_test $this->project_dir/cgi/");
$f = fopen("$this->project_dir/cgi/config.xml", "w");
fputs($f, "<config>\n");
@ -435,6 +437,7 @@ class Work {
var $rsc_iops;
var $rsc_fpops;
var $rsc_disk;
var $delay_bound;
function Work($app) {
$this->app = $app;
@ -442,6 +445,7 @@ class Work {
$this->rcs_iops = 180000000000;
$this->rcs_fpops = 100000000000;
$this->rcs_disk = 1000000;
$this->delay_bound = 1000;
}
function install($project) {
@ -450,7 +454,7 @@ class Work {
$x = $this->input_files[$i];
PassThru("cp $x $project->project_dir/download");
}
$cmd = "create_work -db_name $project->db_name -download_dir $project->project_dir/download -upload_url $project->upload_url -download_url $project->download_url/ -keyfile $project->key_dir/upload_private -appname $app->name -rsc_iops $this->rcs_iops -rsc_fpops $this->rsc_fpops -rsc_disk $this->rsc_disk -wu_template $this->wu_template -result_template $this->result_template -nresults $this->nresults -wu_name $this->wu_template";
$cmd = "create_work -db_name $project->db_name -download_dir $project->project_dir/download -upload_url $project->upload_url -download_url $project->download_url/ -keyfile $project->key_dir/upload_private -appname $app->name -rsc_iops $this->rcs_iops -rsc_fpops $this->rsc_fpops -rsc_disk $this->rsc_disk -wu_template $this->wu_template -result_template $this->result_template -nresults $this->nresults -wu_name $this->wu_template -delay_bound $this->delay_bound";
for ($i=0; $i<sizeof($this->input_files); $i++) {
$x = $this->input_files[$i];
$cmd = $cmd." ".$x;

View File

@ -26,6 +26,7 @@
$work->wu_template = "uc_wu";
$work->result_template = "uc_result";
$work->nresults = 2;
$work->delay_bound = 10;
array_push($work->input_files, "input");
$work->install($project);

View File

@ -4,7 +4,6 @@
<md5_cksum><MD5_0/></md5_cksum>
</file_info>
<workunit>
<app_name>upper_case</app_name>
<file_ref>
<file_name><INFILE_0/></file_name>
<open_name>in</open_name>

View File

@ -4,7 +4,6 @@
<md5_cksum><MD5_0/></md5_cksum>
</file_info>
<workunit>
<app_name>upper_case</app_name>
<file_ref>
<file_name><INFILE_0/></file_name>
<open_name>in</open_name>

View File

@ -140,6 +140,29 @@ static int process_wu_template(
return 0;
}
// Set the time-varying fields of a result to their initial state.
// This is used to create clones of existing results,
// so only the time-varying fields are assigned;
// every other field of "result" is left as the caller supplied it.
//
// result: record to reset (modified in place)
// wu:     workunit the result belongs to; supplies the workunit ID
//         and the delay bound used to compute the report deadline
//
void initialize_result(RESULT& result, WORKUNIT& wu) {
    // Read the clock once so that create_time and report_deadline
    // are derived from the same instant
    // (two separate time() calls can straddle a second boundary).
    //
    time_t now = time(0);

    result.id = 0;
    result.create_time = now;
    result.workunitid = wu.id;
    result.state = RESULT_STATE_UNSENT;
    result.hostid = 0;
    result.report_deadline = now + wu.delay_bound;
    result.sent_time = 0;
    result.received_time = 0;
    result.exit_status = 0;
    result.cpu_time = 0;
    strcpy(result.xml_doc_out, "");
    strcpy(result.stderr_out, "");
    result.project_state = 0;
    result.validate_state = VALIDATE_STATE_INITIAL;
    result.claimed_credit = 0;
    result.granted_credit = 0;
}
// Create a new result for the given WU.
//
int create_result(
@ -153,18 +176,13 @@ int create_result(
int retval;
memset(&r, 0, sizeof(r));
r.report_deadline = time(0) + 1000;
// TODO: pass this in
r.create_time = time(0);
r.workunitid = wu.id;
r.state = RESULT_STATE_UNSENT;
r.validate_state = VALIDATE_STATE_INITIAL;
initialize_result(r, wu);
sprintf(r.name, "%s_%s", wu.name, result_name_suffix);
sprintf(base_outfile_name, "%s_", r.name);
strcpy(result_template_copy, result_template);
retval = process_result_template(
result_template,
result_template_copy,
key,
base_outfile_name,
upload_url, download_url
@ -216,9 +234,13 @@ int create_work(
}
for (i=0; i<nresults; i++) {
sprintf(suffix, "%d", i);
create_result(
retval = create_result(
wu, result_template, suffix, key, upload_url, download_url
);
if (retval) {
fprintf(stderr, "create_result: %d\n", retval);
break;
}
}
return 0;
}

View File

@ -19,6 +19,9 @@
#include "crypt.h"
extern int add_signatures(char*, R_RSA_PRIVATE_KEY&);
extern int remove_signatures(char*);
extern int process_result_template(
char* result_template,
R_RSA_PRIVATE_KEY& key,
@ -31,6 +34,7 @@ extern int read_filename(char* path, char* buf);
extern int read_key_file(char* keyfile, R_RSA_PRIVATE_KEY& key);
extern void initialize_result(RESULT&, WORKUNIT&);
extern int create_result(
WORKUNIT& wu, char* result_template_filename, char* suffix,
R_RSA_PRIVATE_KEY& key, char* upload_url, char* download_url

View File

@ -19,16 +19,21 @@
// create_work
// -appname name
// -wu_name name
// -wu_template filename
// -result_template filename
// -nresults n
// -db_name x
// -db_passwd x
// -upload_url x
// -download_url x
// -download_dir x
// -rsc_fpops n
// -rsc_iops n
// -rsc_memory n
// -rsc_disk n
// -wu_name name
// -wu_template filename
// -result_template filename
// -nresults n
// -keyfile path
// -delay_bound x
// infile1 infile2 ...
//
// Create a workunit and results.
@ -99,6 +104,8 @@ int main(int argc, char** argv) {
wu.rsc_disk = atof(argv[++i]);
} else if (!strcmp(argv[i], "-keyfile")) {
strcpy(keyfile, argv[++i]);
} else if (!strcmp(argv[i], "-delay_bound")) {
wu.delay_bound = atoi(argv[++i]);
} else {
infiles = argv+i;
ninfiles = argc - i;
@ -106,8 +113,12 @@ int main(int argc, char** argv) {
}
i++;
}
if (!strlen(app.name) || !strlen(wu.name) || !strlen(wu_template_file)
if (
!strlen(app.name)
|| !strlen(wu.name)
|| !strlen(wu_template_file )
|| !strlen(result_template_file)
|| wu.delay_bound==0
) {
fprintf(stderr, "create_work: bad cmdline\n");
exit(1);
@ -130,6 +141,7 @@ int main(int argc, char** argv) {
}
wu.appid = app.id;
wu.retry_check_time = time(0) + wu.delay_bound;
retval = read_key_file(keyfile, key);
if (retval) {

View File

@ -79,9 +79,10 @@ int add_signatures(char* xml, R_RSA_PRIVATE_KEY& key) {
retval = generate_signature(buf, signature, key);
if (retval) return retval;
strcpy(buf2, q2);
strcpy(q2, signature);
strcat(q2, buf2);
p = q2;
strcpy(q1, buf);
strcat(q1, signature);
strcat(q1, buf2);
p = q1;
}
return 0;
}
@ -115,7 +116,7 @@ int process_result_template(
char* upload_url, char* download_url
) {
char* p,*q;
char temp[256];
char temp[MAX_BLOB_SIZE];
char num;
int i;