diff --git a/checkin_notes b/checkin_notes index 14306cde52..ef784e82b0 100755 --- a/checkin_notes +++ b/checkin_notes @@ -21621,7 +21621,7 @@ David 19 Dec 2004 ViewResources.cpp ViewTransfers.cpp -David +David 20 Dec 2004 - User web: don't call db_init() in profile.inc. This should be done by top-level files, not .inc files @@ -21646,3 +21646,28 @@ David edit_forum_preferences_action.php profile_menu.php view_profile.php + +David 20 Dec 2004 + - have assimilate_handler return an error code. + If nonzero, assimilator exits + (don't want to mark WUs as assimilated if temporary NFS failure, e.g.) + - improve docs on file management, added doc on file_deleter + - added doc on backend utility funcs + - improve docs on assimilate_handler() + - core client: if an output file is too big, print info on size, limit + + client/ + cs_apps.C + doc/ + various + backend_util.php (new) + benchmark.php (new) + file_deleter.php (new) + sched/ + assimilate_handler.h + assimilator.C + assimilator_placeholder.C + file_deleter.C + sample_dummy_assimilator.C + sched_util.C + validate_util.C diff --git a/client/cs_apps.C b/client/cs_apps.C index 7dd28ef834..96eab456a0 100644 --- a/client/cs_apps.C +++ b/client/cs_apps.C @@ -101,6 +101,11 @@ int CLIENT_STATE::app_finished(ACTIVE_TASK& at) { "Output file %s for result %s exceeds size limit.", fip->name, rp->name ); + msg_printf( + rp->project, MSG_INFO, + "File size: %f bytes. Limit: %f bytes", + size, fip->max_nbytes + ); fip->delete_file(); fip->status = ERR_FILE_TOO_BIG; diff --git a/doc/assimilate.php b/doc/assimilate.php index 1c2de54321..cd99b1b999 100644 --- a/doc/assimilate.php +++ b/doc/assimilate.php @@ -3,13 +3,43 @@ require_once("docutil.php"); page_head("Result assimilation"); echo " Projects must create one assimilator program per application. -This is best done by linking the program sched/assimilate.C +This is done by linking the program sched/assimilate.C with an application-specific function of the form
 int assimilate_handler(
    WORKUNIT& wu, vector<RESULT>& results, RESULT& canonical_result
 );
 
+ +This is called when either +    • the workunit has a canonical result (i.e. it was completed and validated), or +    • the workunit has an error condition, such as too many error results. +In both cases the 'results' vector will be populated with +all the workunit's results (including unsuccessful and unsent ones). +All files (both input and output) will generally be on disk. +

+It's possible that both conditions might hold. +

+If assimilate_handler() returns zero, +the workunit record will be marked as assimilated. +If assimilate_handler() returns nonzero, +the assimilator will print an error message and exit. +Typically you should do this in any error situation. +
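As an illustration, here is a minimal handler sketch in the style of sched/sample_dummy_assimilator.C (also touched by this checkin). The store_result() call is a hypothetical placeholder for project-specific logic, such as copying the canonical output into a science database:

int assimilate_handler(
    WORKUNIT& wu, vector<RESULT>& results, RESULT& canonical_result
) {
    if (wu.canonical_resultid) {
        // the workunit has a validated canonical result;
        // hand it to project-specific code (store_result() is a placeholder)
        int retval = store_result(canonical_result);
        if (retval) {
            // e.g. a transient NFS failure: returning nonzero makes the
            // assimilator exit without marking the WU as assimilated,
            // so the workunit is retried on a later pass
            return retval;
        }
    } else {
        // the workunit failed; wu.error_mask says why
        log_messages.printf(SCHED_MSG_LOG::NORMAL,
            "[%s] assimilating failed workunit; error_mask %d\n",
            wu.name, wu.error_mask
        );
    }
    return 0;    // the workunit record will be marked as assimilated
}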

+You can use BOINC's +back-end utility functions +to get file pathnames and open files. + "; page_tail(); ?> diff --git a/doc/backend_util.php b/doc/backend_util.php new file mode 100644 index 0000000000..51a7951efa --- /dev/null +++ b/doc/backend_util.php @@ -0,0 +1,79 @@ + + +

+int get_output_file_path(RESULT const&, std::string&);
+
+Returns the path of a result's output file +(parses result.xml_doc_out and computes the file's position in the + hierarchical directory structure). + +

+Note: this function doesn't handle multiple output files +(if there are multiple files, it returns the path of the first one). +If your application has multiple output files, see below. +


+
+int try_fopen(char* path, FILE*& f, char* mode);
+
+Open a file, distinguishing between recoverable and nonrecoverable errors. +Returns zero on success. +Returns ERR_FOPEN if the directory is present but not the file +(this is considered a nonrecoverable error). +Returns ERR_OPENDIR if the directory is not there +(this is generally a recoverable error, like NFS mount failure). +
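For example, a validator's init_result() or an assimilate_handler() might combine get_output_file_path() and try_fopen() roughly as follows. This is a sketch: parse_my_output() is a placeholder for project-specific code, and the cast is only there because try_fopen() takes a char*.

int read_result_file(RESULT const& result) {
    std::string path;
    FILE* f;
    int retval;

    retval = get_output_file_path(result, path);
    if (retval) return retval;

    retval = try_fopen((char*)path.c_str(), f, "r");
    if (retval == ERR_OPENDIR) {
        // directory missing: probably transient (e.g. NFS mount failure);
        // pass the error up so the caller can give up and retry later
        return retval;
    }
    if (retval) {
        // ERR_FOPEN: the directory is there but the file isn't;
        // treat this as a permanent error for this result
        return retval;
    }
    retval = parse_my_output(f);    // project-specific placeholder
    fclose(f);
    return retval;
}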
+
+double median_mean_credit(vector<RESULT> const& results);
+
+Given a vector of N correct results, computes a canonical credit as follows: + +
+

Multiple output files

+If your application has multiple output files +you'll need to generalize get_output_file_path(). +To do this you'll need to know the following: + + +

+The database field 'result.xml_doc_out' +describes a result's output files. +It has the form +

+",htmlspecialchars("
+...
+[ ... ]
+
+    foobar
+    blah
+    blah
+    ...
+    [ ... ]
+
+"),"
+
+The components are: +    • The <name> element is the result name. +    • The <wu_name> element is the workunit name. +    • Each <file_ref> element is an association to an output file, +described by a corresponding <file_info> element. +
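Putting the above together, a generalized lookup might look roughly like the sketch below. It assumes you have already parsed the <name> of each <file_ref> out of xml_doc_out, and it reuses dir_hier_path() with the SCHED_CONFIG fields upload_dir and uldl_dir_fanout, as sched/file_deleter.C does; check these assumptions against your server code before relying on them.

int get_output_file_paths(
    std::vector<std::string> const& names,    // file names taken from xml_doc_out
    std::vector<std::string>& paths
) {
    char name[256], path[256];
    for (unsigned int i=0; i<names.size(); i++) {
        strncpy(name, names[i].c_str(), sizeof(name)-1);
        name[sizeof(name)-1] = 0;
        int retval = dir_hier_path(
            name, config.upload_dir, config.uldl_dir_fanout, path
        );
        if (retval) return retval;
        paths.push_back(path);
    }
    return 0;
}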

+The XML document describing the sizes and checksums of the output +files is a list of <file_info> elements, +with the nbytes and md5_cksum fields present. +The project back end +must parse this field to find the locations and checksums of output files. +"; +page_tail(); +?> diff --git a/doc/benchmark.php b/doc/benchmark.php new file mode 100644 index 0000000000..8c1dfebd41 --- /dev/null +++ b/doc/benchmark.php @@ -0,0 +1,80 @@ +How benchmarks are calculated + +

+'Whetstone' is the name of the benchmark that is reported on your +[Show computers] web page as 'Measured floating point speed'. +Dhrystone is the name of the benchmark used for 'Measured integer speed'. +Floats can have fractional parts (like 1.48283 or 3.141592); +integers are whole numbers like 1, 2, 938283 or 2004. + +Whetstone does 8 different groups of tests (repeatedly of course), +times how long they take to finish, and produces a number, +[ops performed]/[time]. +These tests all use floating point math operations of the CPU being tested. +Some of them are simple math (addition, multiplication, division) +while others compute trigonometric and exponential functions +(sine, cosine, tangent, exponent). + + +Dhrystone checks repeated integer operations +and several operating system file handling operations. +

+Neither of the tests really checks how well/fast a system can access memory, +and SETI@home (for example) accesses memory a lot. + +

+Here is an example of memory introducing a delay: +A Pentium 4 CPU of any speed can calculate the sine of an angle in +approximately 170 ticks of its internal clock. +It could have performed 170 regular integer additions in this time. + +

+But if it wanted to do an integer addition on a number somewhere out in memory +(say it was working on a table of numbers), the +CPU might have to wait as much as 260 ticks +for this memory integer to be delivered to the CPU. +So a badly timed integer+memory +operation would take far longer than a sine calculation. + +

+This is where Celeron CPUs can really slow down. +The Pentium has many features to predict when the CPU will need data from memory, +and it begins fetching that data long before the CPU actually calculates with it. +Thus there is much less delay for most memory operations. + +

Why 'predicted time' can be wrong

+Each WU delivered to your machine includes an estimated number of +floating point (FP) calculations. +BOINC divides this by the FP +benchmark number to estimate completion time. +SETI@home's estimate is currently always 27.9 trillion (US trillion, i.e. 27.9x10^12) FP operations per WU; +however, the actual number of FP ops varies greatly, +which is why WUs take different amounts of time to finish. + +
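As an illustration (the host figures here are made up): if a host's Whetstone benchmark works out to 2 billion FP operations per second, the predicted time for a SETI@home WU is roughly 27.9x10^12 / 2x10^9 = 13,950 seconds, a bit under four hours. The sections below explain why the actual time can differ substantially.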

+SETI@home uses almost all single-precision floating point math, +while Whetstone is all double-precision math. +On Intel x86 processors +the speed difference in calculating single vs. double isn't very large. + +

+SETI@home uses mostly add, sub, multiply and divide. +About 20% of its time is spent in trigonometry. +Almost all the time in Whetstone is used for trigonometry. + +

+Memory access speed and trigonometry are the two major reasons that +the benchmark results and SETI@home processing speed don't match +up on many systems. + +

+Thanks to Ben Herndon for this writeup +"; +page_tail(); +?> diff --git a/doc/create_project.php b/doc/create_project.php index 073a8f123e..0a60f97a06 100644 --- a/doc/create_project.php +++ b/doc/create_project.php @@ -101,6 +101,7 @@ How to generate tasks and handle the results.

  • Generating work
  • Result validation
  • Result assimilation +
  • Server-side file deletion @@ -128,7 +129,7 @@ Using disk space on participant hosts
  • Uploading file lists
  • Uploading files
  • Downloading files -
  • Deleting files +
  • Deleting files on client hosts diff --git a/doc/file_deleter.php b/doc/file_deleter.php new file mode 100644 index 0000000000..429584ff44 --- /dev/null +++ b/doc/file_deleter.php @@ -0,0 +1,42 @@ +file_deleter daemon. +Typically you don't need to customize this. +The default file deletion policy is: +
      +
    • A workunit's input files are deleted when +all results are 'over' (reported or timed out) +and the workunit is assimilated. +
    • A result's output files are deleted +after the workunit is assimilated. +The canonical result is handled differently, +since its output files may be needed to validate +results that are reported after assimilation; +hence its files are deleted only after all results are over, +and all successful results have been validated. +
    + +

    +In some cases you may not want files to be deleted. +There are two ways to accomplish this: +

      +
    • Run the file_deleter daemon with +the -preserve_wu_files command-line option +(to preserve all input files) +and/or the -preserve_result_files command-line option +(to preserve all output files). +
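As a sketch of the second approach, the <file_info> element for the file you want to keep carries the flag directly (the <number> element and the rest of the workunit template are abbreviated here; adjust to your own template layout):

<file_info>
    <number>0</number>
    <no_delete/>
</file_info>

The first approach is just a matter of how the daemon is started, e.g. adding -preserve_result_files to the file_deleter command line wherever your project starts its daemons.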
    • Include <no_delete/> +in the <file_info> element for a file in a +workunit or result template. +This lets you suppress deletion on a file-by-file basis. + +
    +In either case you may need to implement your +own scheme for deleting files, +to avoid overflowing data server storage. +"; +page_tail(); +?> diff --git a/doc/files.php b/doc/files.php index eebb9e643a..a8734f9de0 100644 --- a/doc/files.php +++ b/doc/files.php @@ -5,11 +5,14 @@ echo "

    Files and data servers

    The BOINC storage model is based on files. -The inputs and outputs of applications, -and the application executables, are files. +Examples of files: +

      +
    • The inputs and outputs of applications; +
    • Application executables, libraries, etc. +

    -The BOINC core client transfers files to and from data servers -operated by the project, using HTTP. +The BOINC core client transfers files to and from project-operated +data servers using HTTP.

    A file is described by an XML element of the form ".html_text(" @@ -76,16 +79,16 @@ list_item("signature_required", ); list_item("no_delete", "If present for an input (workunit) file, - indicates that the file should NOT be removed from the download/ - directory when the workunit is completed. You should use this - if a particular input file or files are used by more than one - workunit, or will be used by future, unqueued workunits." + indicates that the file should NOT be removed from the data server's + download directory when the workunit is completed. + Use this if a particular input file or files are used by more than one + workunit, or will be used by future workunits." ); list_item("no_delete", "If present for an output (result) file, - indicates that the file should NOT be removed from the upload/ + indicates that the file should NOT be removed from the data server's upload directory when the corresponding workunit is completed. - Use with caution - this may cause your upload/ directory to overflow." + Use with caution - this may cause your upload directory to overflow." ); list_item("report_on_rpc", "Include a description of this file in scheduler RPC requests, @@ -94,12 +97,11 @@ list_item("report_on_rpc", ); list_end(); echo " -These attributes allow the specification of various types of files: for -example, input or output files that are retained for use as input to -later computations.

    Once a file is created (on a data server or a participant host) it -is immutable. +is immutable. +This means that all replicas of that file are assumed to be identical. +

    File references

    Files may be associated with workunits, @@ -123,5 +125,29 @@ list_item("main_program", "Used for files associated with application versions. It indicates that this file is the application's main program."); list_end(); + +echo " +

    File management

    +

+BOINC's default behavior is to delete files +when they aren't needed any more. +Specifically: +

      +
    • On the client, input files are deleted when no workunit refers to them, +and output files are deleted when no result refers to them. +Application-version files are deleted when they are referenced +only from superseded application versions.
    • On the client, the 'sticky' flag overrides the above mechanisms +and suppresses the deletion of the file. +The file may be deleted by an explicit +server request. +The file may also be deleted at any time by the core client +in order to honor limits on disk-space usage.
    • On the server, the file deleter daemon +deletes input and output files that are no longer needed. +This can be suppressed using the 'no_delete' flag, +or using command-line options to the file deleter. +
    +"; page_tail(); ?> diff --git a/doc/validate.php b/doc/validate.php index e257d900b4..abfca31528 100644 --- a/doc/validate.php +++ b/doc/validate.php @@ -20,6 +20,7 @@ int check_set(vector results, DB_WORKUNIT& wu, int& canonicalid, double& "
    • check_set() takes a set of results (all with outcome=SUCCESS). +It reads and compares their output files. If there is a quorum of matching results, it selects one of them as the canonical result, returning its ID. In this case it also returns the credit to @@ -31,8 +32,11 @@ If, when an output file for a result has a nonrecoverable error or the file is present but has invalid contents), then it must set the result's outcome (in memory, not database) to VALIDATE_ERROR. -Note: the function try_fopen() (in lib/util.C) can be used -to distinguish recoverable and nonrecoverable file-open errors. +

      +Note: use BOINC's +back-end utility functions +to get file pathnames +and to distinguish recoverable and nonrecoverable file-open errors.

    • If a canonical result is found, check_set() must set the validate_state field of each non-ERROR result @@ -75,7 +79,7 @@ This tells the validator to write an error message and exit.

    -Neither function should delete files. +Neither function should delete files or access the BOINC database.

    A more detailed description is here.

    @@ -90,46 +94,7 @@ and regards results as equivalent only if they agree byte for byte. regards any two results as equivalent if their CPU time exceeds a given minimum. -

    -validate_util.C contains support functions for both of the above. -


    -NOTE: the above code assumes that each result -has a single output file. -Revisions will be needed to handle multiple output files. -To do this you will need to know the following: - - -

    -The database field 'result.xml_doc_out' -describes a result's output files. -It has the form -

    -",htmlspecialchars("
    -...
    -[ ... ]
    -
    -    foobar
    -    blah
    -    blah
    -    ...
    -    [ ... ]
    -
    -"),"
    -
    -The components are: -
      -
    • The <name> element is the result name. -
    • The <wu_name> element is the workunit name. -
    • Each <file_ref> element is an association to an output file, -described by a corresponding <file_info> element. -
    -

    -The XML document describing the sizes and checksums of the output -files is a list of <file_info> elements, -with the nbytes and md5_cksum fields present. -The project back end -must parse this field to find the locations and checksums of output files. "; page_tail(); ?> diff --git a/sched/assimilate_handler.h b/sched/assimilate_handler.h index 27f36d2543..910bc3be38 100644 --- a/sched/assimilate_handler.h +++ b/sched/assimilate_handler.h @@ -1,4 +1,4 @@ #include #include "boinc_db.h" -extern void assimilate_handler(WORKUNIT&, std::vector&, RESULT&); +extern int assimilate_handler(WORKUNIT&, std::vector&, RESULT&); diff --git a/sched/assimilator.C b/sched/assimilator.C index 27725ed23d..6cd732f65b 100644 --- a/sched/assimilator.C +++ b/sched/assimilator.C @@ -84,7 +84,12 @@ bool do_pass(APP& app) { } } - assimilate_handler(wu, results, canonical_result); + retval = assimilate_handler(wu, results, canonical_result); + if (retval) { + log_messages.printf(SCHED_MSG_LOG::CRITICAL, + "[%s] handler returned error %d; exiting\n", wu.name, retval + ); + } if (update_db) { sprintf( diff --git a/sched/assimilator_placeholder.C b/sched/assimilator_placeholder.C index 8c32137e7b..bff9a3022a 100644 --- a/sched/assimilator_placeholder.C +++ b/sched/assimilator_placeholder.C @@ -35,7 +35,7 @@ using std::vector; -void assimilate_handler( +int assimilate_handler( WORKUNIT& wu, vector& results, RESULT& canonical_result ) { SCOPE_MSG_LOG scope_messages(log_messages, SCHED_MSG_LOG::NORMAL); @@ -61,6 +61,7 @@ void assimilate_handler( if (wu.error_mask&WU_ERROR_TOO_MANY_SUCCESS_RESULTS) { log_messages.printf(SCHED_MSG_LOG::CRITICAL, "[%s] Error: too many success results\n", wu.name); } + return 0; } #ifdef __GNUC__ diff --git a/sched/file_deleter.C b/sched/file_deleter.C index 3348d6c79f..0e07ad0dc8 100644 --- a/sched/file_deleter.C +++ b/sched/file_deleter.C @@ -63,7 +63,7 @@ int wu_delete_files(WORKUNIT& wu) { retval = dir_hier_path( filename, config.download_dir, config.uldl_dir_fanout, pathname - ); + ); if (retval) { log_messages.printf(SCHED_MSG_LOG::CRITICAL, "[%s] dir_hier_path: %d\n", wu.name, retval); } else { @@ -147,8 +147,9 @@ bool do_pass() { while (!wu.enumerate(buf)) { did_something = true; - if (!preserve_wu_files) + if (!preserve_wu_files) { wu_delete_files(wu); + } wu.file_delete_state = FILE_DELETE_DONE; sprintf(buf, "file_delete_state=%d", wu.file_delete_state); retval= wu.update_field(buf); @@ -163,8 +164,9 @@ bool do_pass() { sprintf(buf, "where file_delete_state=%d limit 1000", FILE_DELETE_READY); while (!result.enumerate(buf)) { did_something = true; - if (!preserve_result_files) + if (!preserve_result_files) { result_delete_files(result); + } result.file_delete_state = FILE_DELETE_DONE; sprintf(buf, "file_delete_state=%d", result.file_delete_state); retval= result.update_field(buf); @@ -190,18 +192,22 @@ int main(int argc, char** argv) { } else if (!strcmp(argv[i], "-one_pass")) { one_pass = true; } else if (!strcmp(argv[i], "-preserve_wu_files")) { - // This option is primarily for testing. If enabled, the - // file_deleter will function 'normally' and will update - // the database, but will not actually delete the workunit - // input files. It's equivalent to setting - // [undocumented] for all workunit input files. + // This option is primarily for testing. + // If enabled, the file_deleter will function 'normally' + // and will update the database, + // but will not actually delete the workunit input files. + // It's equivalent to setting + // for all workunit input files. 
+ // preserve_wu_files = true; } else if (!strcmp(argv[i], "-preserve_result_files")) { - // This option is primarily for testing. If enabled, the - // file_deleter will function 'normally' and will update - // the database but will not actually delete the result - // output files. It's equivalent to setting - // [undocumented] for all result output files. + // This option is primarily for testing. + // If enabled, the file_deleter will function 'normally' + // and will update the database, + // but will not actually delete the result output files. + // It's equivalent to setting + // for all result output files. + // preserve_result_files = true; } else if (!strcmp(argv[i], "-d")) { log_messages.set_debug_level(atoi(argv[++i])); diff --git a/sched/sample_dummy_assimilator.C b/sched/sample_dummy_assimilator.C index c013ccf456..f5092cac61 100644 --- a/sched/sample_dummy_assimilator.C +++ b/sched/sample_dummy_assimilator.C @@ -28,7 +28,7 @@ using std::vector; -void assimilate_handler( +int assimilate_handler( WORKUNIT& wu, vector& results, RESULT& canonical_result ) { SCOPE_MSG_LOG scope_messages(log_messages, SCHED_MSG_LOG::NORMAL); @@ -54,6 +54,7 @@ void assimilate_handler( if (wu.error_mask&WU_ERROR_TOO_MANY_SUCCESS_RESULTS) { log_messages.printf(SCHED_MSG_LOG::CRITICAL, "[%s] Error: too many success results\n", wu.name); } + return 0; } #ifdef __GNUC__ diff --git a/sched/sched_util.C b/sched/sched_util.C index 9c9782c950..cd2e8e2aeb 100644 --- a/sched/sched_util.C +++ b/sched/sched_util.C @@ -93,7 +93,7 @@ int try_fopen(char* path, FILE*& f, char* mode) { f = fopen(path, mode); if (!f) { - memset(dirpath, '\0', sizeof(dirpath)); + memset(dirpath, '\0', sizeof(dirpath)); p = strrchr(path, '/'); if (p) { strncpy(dirpath, path, (int)(p-path)); diff --git a/sched/validate_util.C b/sched/validate_util.C index e780b50bb0..466471d88d 100644 --- a/sched/validate_util.C +++ b/sched/validate_util.C @@ -142,7 +142,8 @@ int generic_check_set( log_messages.printf( SCHED_MSG_LOG::CRITICAL, "generic_check_set: init_result([RESULT#%d %s]) failed\n", - results[i].id, results[i].name); + results[i].id, results[i].name + ); goto cleanup; } } @@ -161,7 +162,8 @@ int generic_check_set( log_messages.printf( SCHED_MSG_LOG::CRITICAL, "generic_check_set: check_pair_with_data([RESULT#%d %s], [RESULT#%d %s]) failed\n", - results[i].id, results[i].name, results[j].id, results[j].name); + results[i].id, results[i].name, results[j].id, results[j].name + ); } else if (match) { ++neq; matches[j] = true; @@ -197,7 +199,8 @@ int generic_check_set_majority( return generic_check_set( results, canonicalid, credit, init_result_f, check_pair_with_data_f, cleanup_result_f, - results.size() / 2); + results.size() / 2 + ); } int generic_check_pair( @@ -227,7 +230,7 @@ int generic_check_pair( SCHED_MSG_LOG::CRITICAL, "[RESULT#%d %s] [RESULT#%d %s] Couldn't initialize result 2\n", r1.id, r1.name, r2.id, r2.name - ); + ); cleanup_result_f(r1, data1); return retval; }