back off if no work

svn path=/trunk/boinc/; revision=793
This commit is contained in:
David Anderson 2003-01-02 23:12:05 +00:00
parent 38e20515b0
commit 44ea1f7fc1
11 changed files with 647 additions and 169 deletions

View File

@ -2802,4 +2802,23 @@ David Dec 23 2002
test_masterurl_failure.php
test_sched_failure.php
David Jan 2 2003
- Back off on a project if we make a scheduler RPC asking for work
and don't get any
- fixed "http_proxy_name" XML parsing problem in client
- added drand() function.
NOTE: always make repeated code blocks into functions
- make_work was using uninitialized seqno, caused negative
numbers in filenames
client/
client_state.C,h
cs_scheduler.C
scheduler_op.C
doc/
credit.html
lib/
util.C,h
sched/
make_work.C
validate.C

View File

@ -514,13 +514,13 @@ int CLIENT_STATE::parse_state_file() {
} else if (match_tag(buf, "<core_client_minor_version>")) {
// TODO: handle old client state file if different version
} else if (match_tag(buf, "<confirm_before_connect/>")) {
global_prefs.confirm_before_connecting = true;
global_prefs.confirm_before_connecting = true;
} else if (match_tag(buf, "<hangup_if_dialed/>")) {
global_prefs.hangup_if_dialed = true;
global_prefs.hangup_if_dialed = true;
} else if (match_tag(buf, "<use_http_proxy/>")) {
use_http_proxy = true;
} else if (parse_str(buf, "<http_proxy_server>", proxy_server_name, sizeof(proxy_server_name))) {
} else if (parse_int(buf, "<http_proxy_port>", proxy_server_port)) {
use_http_proxy = true;
} else if (parse_str(buf, "<proxy_server_name>", proxy_server_name, sizeof(proxy_server_name))) {
} else if (parse_int(buf, "<proxy_server_port>", proxy_server_port)) {
} else {
fprintf(stderr, "CLIENT_STATE::parse_state_file: unrecognized: %s\n", buf);
retval = ERR_XML_PARSE;
@ -580,7 +580,9 @@ int CLIENT_STATE::write_state_file() {
core_client_major_version,
core_client_minor_version
);
// save proxy info
// save proxy and preferences info
//
fprintf(f,
"%s"
"%s"
@ -878,11 +880,18 @@ bool CLIENT_STATE::garbage_collect() {
} else {
// See if the files for this result's workunit had
// any errors (MD5, RSA, etc)
//
if(rp->wup->had_failure(failnum)) {
// If we don't already have an error for this file
if (rp->state < RESULT_READY_TO_ACK) {
// the result wu corresponding to this result had an error downloading some input file(s).
report_project_error(*rp,0,"The work_unit corresponding to this result had an error",CLIENT_DOWNLOADING);
// the wu corresponding to this result
// had an error downloading some input file(s).
//
report_project_error(
*rp,0,
"The work_unit corresponding to this result had an error",
CLIENT_DOWNLOADING
);
}
} else {
rp->wup->ref_cnt++;
@ -895,9 +904,13 @@ bool CLIENT_STATE::garbage_collect() {
//
if(rp->output_files[i].file_info->had_failure(failnum)) {
if (rp->state < RESULT_READY_TO_ACK) {
// had an error uploading a file for this result
// had an error uploading a file for this result
//
rp->client_state = CLIENT_UPLOADING;
report_project_error(*rp,0,"The outputfile corresponding to this result had an error",CLIENT_UPLOADING);
report_project_error(*rp,0,
"An output file of this result had an error",
CLIENT_UPLOADING
);
}
} else {
rp->output_files[i].file_info->ref_cnt++;

View File

@ -141,7 +141,7 @@ public:
PROJECT* next_project_master_pending();
double work_needed_secs();
int make_scheduler_request(PROJECT*, double);
void handle_scheduler_reply(PROJECT*, char* scheduler_url);
int handle_scheduler_reply(PROJECT*, char* scheduler_url, int& nresults);
void set_client_state_dirty(char*);
int report_project_error( RESULT &res,int err_num, char *err_msg, int state);
// flag a result as having an error

View File

@ -287,8 +287,8 @@ bool CLIENT_STATE::scheduler_rpc_poll() {
// Handle the reply from a scheduler
//
void CLIENT_STATE::handle_scheduler_reply(
PROJECT* project, char* scheduler_url
int CLIENT_STATE::handle_scheduler_reply(
PROJECT* project, char* scheduler_url, int& nresults
) {
SCHEDULER_REPLY sr;
FILE* f;
@ -296,6 +296,7 @@ void CLIENT_STATE::handle_scheduler_reply(
unsigned int i;
bool signature_valid;
nresults = 0;
contacted_sched_server = true;
if (log_flags.sched_op_debug) {
f = fopen(SCHED_OP_RESULT_FILE, "r");
@ -306,7 +307,8 @@ void CLIENT_STATE::handle_scheduler_reply(
}
f = fopen(SCHED_OP_RESULT_FILE, "r");
retval = sr.parse(f);
if (!f) return ERR_FOPEN;
retval = sr.parse(f); // check return?
fclose(f);
if (strlen(sr.project_name)) {
@ -326,10 +328,10 @@ void CLIENT_STATE::handle_scheduler_reply(
project->min_rpc_time = time(0) + sr.request_delay;
}
project->host_total_credit = sr.host_total_credit;
project->host_expavg_credit = sr.host_expavg_credit;
if (sr.hostid) {
project->hostid = sr.hostid;
project->host_total_credit = sr.host_total_credit;
project->host_expavg_credit = sr.host_expavg_credit;
project->host_create_time = sr.host_create_time;
project->rpc_seqno = 0;
}
@ -339,6 +341,7 @@ void CLIENT_STATE::handle_scheduler_reply(
//
if (sr.global_prefs_xml) {
f = fopen(GLOBAL_PREFS_FILE_NAME, "w");
if (!f) return ERR_FOPEN;
fprintf(f,
"<global_preferences>\n"
" <source_project>%s</source_project>\n"
@ -358,6 +361,7 @@ void CLIENT_STATE::handle_scheduler_reply(
if (sr.project_prefs_xml) {
char path[256];
f = fopen(TEMP_FILE_NAME, "w");
if (!f) return ERR_FOPEN;
fprintf(f,
"<account>\n"
" <master_url>%s</master_url>\n"
@ -371,7 +375,9 @@ void CLIENT_STATE::handle_scheduler_reply(
fclose(f);
get_account_filename(project->master_url, path);
retval = boinc_rename(TEMP_FILE_NAME, path);
if (retval) return ERR_RENAME;
f = fopen(path, "r");
if (!f) return ERR_FOPEN;
project->parse_account(f);
fclose(f);
}
@ -455,6 +461,7 @@ void CLIENT_STATE::handle_scheduler_reply(
retval = link_result(project, rp);
if (!retval) results.push_back(rp);
rp->state = RESULT_NEW;
nresults++;
}
}
@ -479,4 +486,5 @@ void CLIENT_STATE::handle_scheduler_reply(
printf("State after handle_scheduler_reply():\n");
print_counts();
}
return 0;
}

View File

@ -16,17 +16,20 @@
//
// Contributor(s):
//
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "util.h"
#include "parse.h"
#include "error_numbers.h"
#include "client_state.h"
#include "client_types.h"
#include "error_numbers.h"
#include "file_names.h"
#include "log_flags.h"
#include "parse.h"
#include "scheduler_op.h"
SCHEDULER_OP::SCHEDULER_OP(HTTP_OP_SET* h) {
@ -45,16 +48,17 @@ int SCHEDULER_OP::init_get_work() {
must_get_work = true;
project = gstate.next_project(0);
if (project) {
if( (retval=init_op_project(ns)) ) {
retval = init_op_project(ns);
if (retval) {
sprintf(err_msg, "init_get_work failed, error %d\n", retval);
backoff(project, err_msg);
return retval;
}
}
else {
} else {
project = gstate.next_project_master_pending();
if (project) {
if ((retval=init_master_fetch(project))) {
retval = init_master_fetch(project);
if (retval) {
sprintf(err_msg, "init_master_fetch failed, error %d\n", retval);
backoff(project, err_msg);
}
@ -78,6 +82,7 @@ int SCHEDULER_OP::init_return_results(PROJECT* p, double ns) {
//
int SCHEDULER_OP::init_op_project(double ns) {
int retval;
char err_msg[256];
if (log_flags.sched_op_debug) {
printf("init_op_project: starting op for %s\n", project->master_url);
@ -88,15 +93,24 @@ int SCHEDULER_OP::init_op_project(double ns) {
//
if (project->scheduler_urls.size() == 0) {
retval = init_master_fetch(project);
return retval;
goto done;
}
url_index = 0;
retval = gstate.make_scheduler_request(project, ns);
if (retval) {
fprintf(stderr, "make_scheduler_request: %d\n", retval);
return retval;
goto done;
}
return start_rpc();
retval = start_rpc();
done:
if (retval) {
sprintf(err_msg,
"scheduler init_op_project to %s failed, error %d\n",
project->scheduler_urls[url_index].text, retval
);
backoff(project, err_msg);
}
return retval;
}
// Set a project's min RPC time to something in the future,
@ -109,23 +123,19 @@ int SCHEDULER_OP::set_min_rpc_time(PROJECT* p) {
int n = p->nrpc_failures;
if (n > RETRY_CAP) n = RETRY_CAP;
// we've hit the limit on master_url fetches
if(p->master_fetch_failures >= MASTER_FETCH_RETRY_CAP) {
//
if (p->master_fetch_failures >= MASTER_FETCH_RETRY_CAP) {
if (log_flags.sched_op_debug) {
printf("we've hit the limit on master_url fetches\n");
}
//backoff e^MASTER_FETCH_INTERVAL * random
exp_backoff = (int) exp(((double)rand()/(double)RAND_MAX)*MASTER_FETCH_INTERVAL);
exp_backoff = (int) exp(drand()*MASTER_FETCH_INTERVAL);
p->min_rpc_time = time(0) + exp_backoff;
} else {
x = RETRY_BASE_PERIOD * exp(drand() * n);
exp_backoff = (int)max(SCHED_RETRY_DELAY_MIN,min(SCHED_RETRY_DELAY_MAX,(int) x));
p->min_rpc_time = time(0) + exp_backoff;
}
else {
//backoff RETRY_BASE_PERIOD * e^nrpc_failures * random
x = RETRY_BASE_PERIOD * exp(((double)rand()/(double)RAND_MAX) * n);
exp_backoff = (int)max(SCHED_RETRY_DELAY_MIN,min(SCHED_RETRY_DELAY_MAX,(int) x));
p->min_rpc_time = time(0) + exp_backoff;
}
if (log_flags.sched_op_debug) {
printf(
@ -148,8 +158,11 @@ int SCHEDULER_OP::backoff( PROJECT* p, char *error_msg ) {
p->master_url_fetch_pending = true;
set_min_rpc_time(p);
return 0;
}
// if nrpc failures a multiple of master_fetch_period, then set master_url_fetch_pending and initialize again
}
// if nrpc failures a multiple of master_fetch_period,
// then set master_url_fetch_pending and initialize again
//
if (p->nrpc_failures == MASTER_FETCH_PERIOD) {
p->master_url_fetch_pending = true;
p->min_rpc_time = 0;
@ -164,6 +177,7 @@ int SCHEDULER_OP::backoff( PROJECT* p, char *error_msg ) {
}
// low-level routine to initiate an RPC
// If successful, creates an HTTP_OP that must be polled
//
int SCHEDULER_OP::start_rpc() {
FILE *f;
@ -242,10 +256,12 @@ int SCHEDULER_OP::parse_master_file(vector<STRING256> &urls) {
printf("Parsed master file; got %d scheduler URLs\n", (int)urls.size());
}
//if couldn't find any urls in the master file.
if((int) urls.size() == 0)
return -1;
// couldn't find any urls in the master file?
//
if ((int) urls.size() == 0) {
return -1;
}
return 0;
}
@ -280,9 +296,9 @@ bool SCHEDULER_OP::update_urls(PROJECT& project, vector<STRING256> &urls) {
// poll routine. If an operation is in progress, check for completion
//
bool SCHEDULER_OP::poll() {
int retval;
int retval, nresults;
vector<STRING256> urls;
bool changed, scheduler_op_done, get_master_success;
bool changed, scheduler_op_done;
bool action = false;
char err_msg[256],*err_url;
@ -290,7 +306,6 @@ bool SCHEDULER_OP::poll() {
case SCHEDULER_OP_STATE_GET_MASTER:
// here we're fetching the master file for a project
//
get_master_success = true;
if (http_op.http_op_state == HTTP_STATE_DONE) {
action = true;
project->master_url_fetch_pending = false;
@ -303,35 +318,49 @@ bool SCHEDULER_OP::poll() {
);
}
retval = parse_master_file(urls);
if (retval == 0) {
if (retval) {
// master file parse failed.
//
project->master_fetch_failures++;
backoff(project, "Master file parse failed\n");
err_url = project->master_url;
} else {
// everything succeeded. Clear error counters
//
changed = update_urls(*project, urls);
if (changed) {
project->min_rpc_time = 0;
project->nrpc_failures = 0;
project->master_fetch_failures = 0;
}
} else {
// master file parse failed. treat like RPC error
//
project->master_fetch_failures++;
backoff(project, "Master file parse failed\n");
get_master_success = false;
err_url = project->master_url;
}
}
} else {
// fetch of master file failed. Treat like RPC error
// master file fetch failed.
//
project->master_fetch_failures++;
project->master_fetch_failures++;
backoff(project, "Master file fetch failed\n");
get_master_success = false;
err_url = project->master_url;
}
// If don't have any schedulers for this project,
// it may be the wrong URL. notify the user
//
if (project->scheduler_urls.size() == 0) {
sprintf(err_msg,
"Could not contact %s. Make sure this is the correct project URL.",
err_url
);
show_message(err_msg, "high");
}
// See if need to read master file for another project
//
project = gstate.next_project_master_pending();
if (project) {
if ((retval = init_master_fetch(project))) {
project->master_fetch_failures++;
retval = init_master_fetch(project);
if (retval) {
project->master_fetch_failures++;
backoff(project, "Master file fetch failed\n");
get_master_success = false;
err_url = project->master_url;
}
} else {
@ -340,18 +369,12 @@ bool SCHEDULER_OP::poll() {
printf("Scheduler_op: return to idle state\n");
}
}
// If we haven't been able to successfully get the master URL file
// recently then notify the user
if (!get_master_success) {
char buf[256];
sprintf(buf, "Could not contact %s. Make sure this is the correct project URL.",
err_url);
show_message( buf, "high" );
}
}
break;
case SCHEDULER_OP_STATE_RPC:
// here we're doing a scheduler RPC to some project
// here we're doing a scheduler RPC
//
scheduler_op_done = false;
if (http_op.http_op_state == HTTP_STATE_DONE) {
@ -364,26 +387,22 @@ bool SCHEDULER_OP::poll() {
project->scheduler_urls[url_index].text
);
}
// scheduler RPC failed. Try another scheduler if one exists
//
url_index++;
if (url_index < project->scheduler_urls.size()) {
start_rpc();
} else {
backoff(project,"");
backoff(project, "No schedulers responded");
if (must_get_work) {
project = gstate.next_project(project);
if (project) {
if( (retval=init_op_project(gstate.work_needed_secs())) ) {
sprintf( err_msg,
"scheduler init_op_project to %s failed, error %d\n",
project->scheduler_urls[url_index].text, retval
);
backoff(project, err_msg);
}
retval = init_op_project(gstate.work_needed_secs());
} else {
scheduler_op_done = true;
}
}
}
else {
scheduler_op_done = true;
}
@ -395,21 +414,27 @@ bool SCHEDULER_OP::poll() {
project->scheduler_urls[url_index].text
);
}
project->nrpc_failures = 0;
project->min_rpc_time = 0;
gstate.handle_scheduler_reply(project, scheduler_url);
gstate.handle_scheduler_reply(project, scheduler_url, nresults);
// if we asked for work and didn't get any,
// back off this project
//
if (must_get_work && nresults==0) {
backoff(project, "No work from project\n");
} else {
project->nrpc_failures = 0;
project->min_rpc_time = 0;
}
// if we didn't get all the work we needed,
// ask another project for work
//
if (must_get_work) {
double x = gstate.work_needed_secs();
if (x > 0) {
project = gstate.next_project(project);
if (project) {
if( (retval=init_op_project(x)) ) {
sprintf( err_msg,
"scheduler init_op_project to %s failed, error %d\n",
project->scheduler_urls[url_index].text, retval
);
backoff(project, err_msg);
}
retval = init_op_project(x);
} else {
scheduler_op_done = true;
}
@ -421,10 +446,14 @@ bool SCHEDULER_OP::poll() {
}
}
}
// If no outstanding ops, see if need a master fetch
//
if (scheduler_op_done) {
project = gstate.next_project_master_pending();
project = gstate.next_project_master_pending();
if (project) {
if ((retval = init_master_fetch(project))) {
retval = init_master_fetch(project);
if (retval) {
if (log_flags.sched_op_debug) {
printf("Scheduler op: init_master_fetch failed.\n" );
}
@ -472,7 +501,7 @@ int SCHEDULER_REPLY::parse(FILE* in) {
code_sign_key_signature = 0;
p = fgets(buf, 256, in);
// First part of content should either be tag (HTTP 1.0) or
// First part of content should either be tag (HTTP 1.0) or
// hex length of response (HTTP 1.1)
if (!match_tag(buf, "<scheduler_reply>")) {
fprintf(stderr, "SCHEDULER_REPLY::parse(): bad first tag %s\n", buf);

View File

@ -12,17 +12,13 @@ ranked by credit.
<p>
BOINC's credit system is based on a "reference computer" that does
<ul>
<li>1 billion floating-point multiply/adds per second
<li>1 billion integer multiply/adds per second
<li>1 billion floating-point multiplies per second
<li>1 billion integer multiplies per second
<li>4 billion bytes per second of traffic to and from main memory
(sequential, half reads and half writes)
</ul>
BOINC's unit of credit, the <b>Cobblestone</b>,
is one day of CPU time on the reference computer.
(Credit should ideally reflect network transfer and disk storage as well
as computation.
But it's hard to verify these activities,
so for now they aren't included.)
<p>
Each project maintains two types of credit:
@ -37,3 +33,31 @@ This average decreases by a factor of two every week.
<p>
Both types of credit (total and recent average)
are maintained for each user and host.
<h3>Leader boards</h3>
The PHP pages supplied by BOINC include basic leaderboards:
top users and hosts, by total and average.
BOINC lets projects export the credit-related
parts of their database as XML files.
These XML files can be used to generate
other breakdowns of users, hosts and teams,
or to generate leaderboards based on the sum of
credit from different projects.
<h3>Possible future improvements</h3>
<ul>
<li>
Ideally, credit should reflect network transfer and disk storage as well
as computation.
But it's hard to verify these activities,
so for now they aren't included.
<li>
Eventually projects will develop applications that use
graphics coprocessors or other non-CPU hardware.
Credit should reflect the usage of such hardware.
To accomplish this, we will need to let
projects supply their own benchmarking functions.
This will also handle the situation where a project's
application does e.g. all integer arithmetic.
</ul>

View File

@ -18,6 +18,7 @@
//
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#include <sys/types.h>
@ -176,3 +177,6 @@ int lock_file(char* filename) {
return retval;
}
// Return a uniformly distributed pseudo-random double in [0, 1],
// based on the C library rand() generator.
//
double drand() {
    double numerator = (double)rand();
    double denominator = (double)RAND_MAX;
    return numerator / denominator;
}

View File

@ -24,6 +24,7 @@ extern double dtime();
extern void boinc_sleep( int seconds );
extern int parse_command_line( char *, char ** );
extern int lock_file(char*);
extern double drand();
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))

View File

@ -46,37 +46,37 @@ int cushion = 10;
int redundancy = 10;
char wu_name[256], result_template_file[256];
void replace_file_name(char * xml_doc, char * filename, char * new_filename,char * download_url)
{
char buf[MAX_BLOB_SIZE], temp[256], download_path[256], new_download_path[256];
char * p;
// edit a WU XML doc, replacing one filename by another
// (should appear twice, within <file_info> and <file_ref>)
// Also patch the download URL (redundant)
//
void replace_file_name(
char* xml_doc, char* filename, char* new_filename, char* download_url
) {
char buf[MAX_BLOB_SIZE], temp[256], download_path[256],
new_download_path[256];
char * p;
sprintf(download_path,"%s/%s",download_url,filename);
sprintf(new_download_path,"%s/%s",download_url,new_filename);
strcpy(buf,xml_doc);
p = strtok(buf,"\n");
while (p) {
if (parse_str(p, "<name>", temp, sizeof(temp))) {
if(!strcmp(filename, temp))
{
replace_element(xml_doc + (p - buf),"<name>","</name>",new_filename);
}
sprintf(download_path,"%s/%s", download_url, filename);
sprintf(new_download_path,"%s/%s", download_url, new_filename);
strcpy(buf, xml_doc);
p = strtok(buf,"\n");
while (p) {
if (parse_str(p, "<name>", temp, sizeof(temp))) {
if(!strcmp(filename, temp)) {
replace_element(xml_doc + (p - buf),"<name>","</name>",new_filename);
}
} else if (parse_str(p, "<file_name>", temp, sizeof(temp))) {
if(!strcmp(filename, temp)) {
replace_element(xml_doc + (p - buf),"<file_name>","</file_name>",new_filename);
}
} else if (parse_str(p, "<url>", temp, sizeof(temp))) {
if(!strcmp(temp, download_path)) {
replace_element(xml_doc + (p - buf),"<url>","</url>",new_download_path);
}
}
p = strtok(0, "\n");
}
else if (parse_str(p, "<file_name>", temp, sizeof(temp))) {
if(!strcmp(filename, temp))
{
replace_element(xml_doc + (p - buf),"<file_name>","</file_name>",new_filename);
}
}
else if (parse_str(p, "<url>", temp, sizeof(temp))) {
if(!strcmp(temp, download_path))
{
replace_element(xml_doc + (p - buf),"<url>","</url>",new_download_path);
}
}
p = strtok(0, "\n");
}
}
void check_trigger() {
@ -88,12 +88,15 @@ void check_trigger() {
void make_work() {
CONFIG config;
char * p;
int retval, i, start_time=time(0), n, nresults_left;
char keypath[256], suffix[256], result_template[MAX_BLOB_SIZE], file_name[256], buf[MAX_BLOB_SIZE],pathname[256],new_file_name[256],new_pathname[256],command[256], starting_xml[MAX_BLOB_SIZE],new_buf[MAX_BLOB_SIZE];
int retval, start_time=time(0), n, nresults_left;
char keypath[256], suffix[256], result_template[MAX_BLOB_SIZE];
char file_name[256], buf[MAX_BLOB_SIZE], pathname[256];
char new_file_name[256], new_pathname[256], command[256];
char starting_xml[MAX_BLOB_SIZE], new_buf[MAX_BLOB_SIZE];
R_RSA_PRIVATE_KEY key;
WORKUNIT wu;
int seqno = 0;
retval = config.parse_file();
if (retval) {
fprintf(stderr,"make_work: can't read config file\n");
@ -135,39 +138,53 @@ void make_work() {
fprintf(stderr,"make_work: can't counts results\n");
exit(1);
}
printf("make_work: %d results\n", n);
printf("make_work: %d results available to send\n", n);
if (n > cushion) {
sleep(1);
continue;
}
// make a new workunit every "redundancy" results
//
if (nresults_left == 0) {
strcpy(buf,starting_xml);
p = strtok(buf, "\n");
strcpy(file_name, "");
while (p) {
if (parse_str(p, "<name>", file_name, sizeof(file_name))) {
sprintf(new_file_name,"%s_%d_%d",file_name,start_time,i++);
sprintf(pathname, "%s/%s", config.download_dir, file_name);
sprintf(new_pathname,"%s/%s",config.download_dir, new_file_name);
sprintf(command,"cp %s %s",pathname,new_pathname);
system(command);
strcpy(new_buf,starting_xml);
replace_file_name(new_buf,file_name,new_file_name,config.download_url);
strcpy(wu.xml_doc, new_buf);
}
p = strtok(0, "\n");
}
nresults_left = redundancy;
sprintf(wu.name, "wu_%d_%d", start_time, i++);
wu.id = 0;
wu.create_time = time(0);
retval = db_workunit_new(wu);
wu.id = db_insert_id();
strcpy(buf, starting_xml);
p = strtok(buf, "\n");
strcpy(file_name, "");
// make new copies of all the WU's input files
//
while (p) {
if (parse_str(p, "<name>", file_name, sizeof(file_name))) {
sprintf(
new_file_name, "%s_%d_%d", file_name, start_time, i++
);
sprintf(pathname, "%s/%s", config.download_dir, file_name);
sprintf(
new_pathname, "%s/%s",config.download_dir, new_file_name
);
sprintf(command,"cp %s %s",pathname,new_pathname);
if (system(command)) {
fprintf(stderr, "make_work: ERROR\n");
perror(command);
exit();
}
strcpy(new_buf, starting_xml);
replace_file_name(
new_buf, file_name, new_file_name, config.download_url
);
strcpy(wu.xml_doc, new_buf);
}
p = strtok(0, "\n");
}
nresults_left = redundancy;
sprintf(wu.name, "wu_%d_%d", start_time, seqno);
wu.id = 0;
wu.create_time = time(0);
retval = db_workunit_new(wu);
wu.id = db_insert_id();
}
sprintf(suffix, "%d_%d", start_time, i++);
sprintf(suffix, "%d_%d", start_time, seqno++);
create_result(
wu, result_template, suffix, key,
config.upload_url, config.download_url

333
stripchart/README Normal file
View File

@ -0,0 +1,333 @@
Stripchart version 2.0
----------------------
Author: Matt Lebofsky
BOINC/SETI@home - University of California, Berkeley
mattl@ssl.berkeley.edu
Date of recent version: November 4, 2002
Requirements:
* a gnuplot with the ability to generate gifs
* perl
* apache or another CGI-enabled web server
Send all thoughts and queries to: mattl@ssl.berkeley.edu
This software is free to edit, distribute and use by anybody, as long as
I get credit for it in some form or another. Thanks.
----------------------
Contents:
I. Some questions and answers
II. So how does it work?
III. Known bugs, things to do, etc.
----------------------
I. Some questions and answers
Q: What is stripchart?
A: Well, it's actually two relatively small perl programs:
1. stripchart
stripchart reads in time-based user data and, depending on a flurry of
command line options, generates a web-friendly .gif plotting the data.
The user can supply the time range, the y axis range, even the color
scheme, and more.
2. stripchart.cgi
stripchart.cgi is a web-based GUI interface that allows users to easily
select multiple data sources and various parameters to plot, allowing
fast comparisons without having to deal with a command line interface.
Q: Why do you bother writing this program?
A: Working as a systems administrator (amongst other things) for SETI@home,
we kept finding ourselves in dire problem-solving situations, i.e. Why
did the database stop working? Why is load on our web server so high?
So we started collecting data in flat files, keeping track of server
loads, database checkpoint times, even CPU temperatures. When these files
grew too large and unwieldy, I found myself writing (and rewriting) simple
scripts to generate plots on this data. Sick of constant revision whenever
a new problem arose, I wrote stripchart version 1.0.
Its usefulness became immediately apparent when I added on stripchart.cgi.
I couldn't bear to teach everybody the many command line options to
stripchart, so I wrote this CGI to do all the dirty work. Suddenly we were
able to line up several plots, look for causes and effects, or just enjoy
watching the counts in our database tables grow to impossibly high numbers.
The SETI@home network has proven to be a delicate system, and keeping track
of all the data server, user, and web statistics has proven to be quite a
life saver. So when BOINC came around we felt that any project aiming to
embark on a similar project may need this tool. So I rewrote stripchart to
be a bit more friendly and general.
Q: Why don't you make .pngs or .jpgs instead of .gifs? The latest gnuplot
doesn't support .gifs.
A: Basically gnuplot support for other graphic file formats isn't as good. For
example, you cannot control exact window size, font size, and colors unless
you make .gifs. I'm not exactly sure why this is the case, but there you have it.
Anywho, you can find older gnuplot distributions out there - you'll need to
get the gd libs first, by the way.
----------------------
II. So how does it work?
You can use stripchart as a stand alone command-line program to produce plots
whenever you like, but we highly recommend using it in conjunction with the
stripchart.cgi for ease of use. But here's how to do it both ways.
stripchart (stand alone)
Before anything, look at the section GLOBAL/DEFAULT VARS in the program
stripchart and see if you need to edit anything (usually pathnames to
executables and such).
Let's just start with the usage (obtained by typing "stripchart -h"):
stripchart: creates stripchart .gif graphic based on data in flat files
options:
-i: input FILE - name of input data file (mandatory)
-o: output FILE - name of output .gif file (default: STDOUT)
-O: output FILE - name of output .gif file and dump to STDOUT as well
-f: from TIME - stripchart with data starting at TIME
(default: 24 hours ago)
-t: to TIME - stripchart with data ending at TIME (default: now)
-r: range RANGE - stripchart data centered around "from" time the size
of RANGE (overrides -t)
-l: last LINES - stripchart last number of LINES in data file
(overrides -f and -t and -r)
-T: title TITLE - title to put on graphic (default: FILE RANGE)
-x: column X - time or "x" column (default: 2)
-y: column Y - value or "y" column (default: 3)
-Y: column Y' - overplot second "y" column (default: none)
-b: baseline VALUE - overplot baseline of arbitrary value VALUE
-B: baseline-avg - overrides -b, it plots baseline of computed average
-d: dump low VALUE - ignore data less than VALUE
-D: dump high VALUE - ignore data higher than VALUE
-v: verbose - puts verbose runtime output to STDERR
-L: log - makes y axis log scale
-c: colors "COLORS" - set gnuplot colors for graph/axis/fonts/data (default:
"xffffff x000000 xc0c0c0 x00a000 x0000a0 x2020c0"
in order: bground, axis/fonts, grids, pointcolor1,2,3)
-C: cgi - output CGI header to STDOUT if being called as CGI
-s: stats - turn extra plot stats on (current, avg, min, max)
-j: julian times - time columns is in local julian date (legacy stuff)
notes:
* TIME either unix date, julian date, or civil date in the form:
YYYY:MM:DD:HH:MM (year, month, day, hour, minute)
If you enter something with colons, it assumes it is civil date
If you have a decimal point, it assumes it is julian date
If it is an integer, it assumes it is unix date (epoch seconds)
If it is a negative number, it is in decimal days from current time
(i.e. -2.5 = two and a half days ago)
* All times on command line are assumed to be "local" times
* All times in the data file must be in unix date (epoch seconds)
* RANGE is given in decimal days (i.e. 1.25 = 1 day, 6 hours)
* if LINES == 0, (i.e. -l 0) then the whole data file is read in
* columns (given with -x, -y, -Y flags) start at 1
* titles given with -T can contain the following key words which will
be converted:
FILE - basename of input file
RANGE - pretty civil date range (in local time zone)
the default title is: FILE RANGE
...okay that's a lot to ingest, but it's really simple. Let's take a look at an
example (you'll find in the samples directory two files get_load and crontab).
Suppose you have a machine whose load you want to monitor. Here's a script
that outputs a single line containing two fields for the time and a third
field with the actual data. For example:
2002:11:05:12:51 1036529480 0.25
The first field is time in an arbitrary human readable format
(year:month:day:hour:minute), the second in epoch seconds (standard
unix time format - the number of seconds since 00:00 1/1/1970 GMT),
and the third is the load at this time.
And we'll start collecting data every five minutes on this particular machine
by adding a line like the following to the crontab:
0,5,10,15,20,25,30,35,40,45,50,55 * * * * /usr/local/stripchart/samples/get_load >> /disks/matt/data/machine_load
So the file "machine_load" will quickly fill with lines such as the above.
Now you may ask yourself - why two columns representing time in two different
formats? Well sometime you just want to look at the data file itself, in which
case the human-readable first column is quite handy to have around, but when
making linear time plots, having time in epoch seconds is much faster to
manipulate. So generally, we like to have at least the two time fields first,
and the actual data in the third column. That's what stripchart expects by
default.
Note: stripchart will understand time in both epoch seconds and julian date.
If the second time field is in julian date, you should supply the command line
flag "-j" to warn stripchart so it knows how to handle it.
Okay. So you have this data file now. A very common thing to plot would be the
data over the past 24 hours. Turns out that's the default! If you type on the
command line:
stripchart -i machine_load -o machine_load.gif
you will quickly get a new file "machine_load.gif" with all the goods.
Note: you always have to supply an input file via -i. If you don't supply
an output file via "-o", the .gif gets dumped to stdout. If you supply an
output file via "-O", the output is written both to the file and to stdout.
Now let's play with the time ranges. You can supply times in a variety of
formats on the command line:
"civil date" i.e. 2002:11:05:12:51 (YYYY:MM:DD:hh:mm)
"epoch seconds" i.e. 1036529480
"julian date" i.e. 2452583.52345
You can supply a date range using the -f and -t flags (from and to):
stripchart -i machine_load -f 2002:11:01:00:00 -t 2002:11:04:00:00
Usually the "to" time is right now, so you can quickly tell stripchart
to plot starting at some arbitrary time "ago." This is done also via the
"-f" flag - if it's negative it will assume you mean that many decimal
days from now as a starting point. So "-f -3.5" will plot from 3 and a
half days ago until now.
You can also supply a "range" centered around the from time. For example,
to plot the 24 hours centered around 2002:11:01:13:40:
stripchart -i machine_load -f 2002:11:01:13:40 -r 1
On some rare occasions you might want to plot the last number of lines
in a file, regardless of what time they were. If you supply the number
of lines via the "-l" flag, it overrides any time ranges you may have
supplied.
Moving on to some other useful flags in no particular order:
To change the default title (which is the basename of the file and
the time range being plotted), you can do so via the "-T" command.
Make sure to put the title in quotes. Within the title string the
all-uppercase string "FILE" will be replaced with the file basename,
and the string "RANGE" will be replaced by the time range. So in
essence, the default title string is "FILE RANGE".
If you have data files in different formats, you can specify the data
columns using the "-x" and "-y" flags. By default -x is 2 and -y is 3.
Sometimes we have datafiles with many columns so we actively have to tell
stripchart which is the correct data column.
However, you might want to overplot one column on top of another. If your
data file has a second data column, you can specify what that is via the
-Y flag, and this data will be overplotted onto the data from the first
data column.
Sometimes you want to plot a horizontal rule or a "baseline". You can
turn this feature on by specifying the value with the "-b" flag. If you
use the "-B" flag (without any values) it automatically computes the
average over the time range and plots that as the baseline. Simple!
If you want to excise certain y values, you can do so with the dump
flags, i.e. "-d" and "-D". In particular, any values lower than the one
supplied with "-d" will be dumped, and any values higher than the one
supplied by "-D" will be dumped.
To log the y axis, use the "-L" flag. Quite straightforward.
A very useful flag is "-s" which outputs a line of stats underneath
the plot title. It shows the current value, and the minimum, maximum
and average values during the plot range.
For verbose output to stderr, use the "-v" flag. It may not make much
sense, but it's useful for debugging.
Using the "-C" flag causes stripchart to spit out the "Content-type"
lines necessary for incorporating stripchart plots into CGIs. This
doesn't work so well now, but there it is.
Okay. That's enough about the flags, and hopefully enough to get you
playing around with stripchart and plotting some stuff. Now onto:
stripchart.cgi
First and foremost, you need to do the following before running the
CGI version of stripchart:
1. Put stripchart.cgi in a cgi-enabled web-accessible directory
2. Make a "lib" directory somewhere that the web server can read/write to
3. Edit stripchart.cgi GLOBAL/DEFAULT VARS to point to proper paths, including
the files "querylist" and "datafiles" in the aforementioned "lib" directory.
4. Edit the "lib/datafiles" file to contain entries for all your data files.
You can find an example datafiles in the samples directory. Follow the
instructions in the comment lines, adding your entries below the header.
That should be it, I think. Now go to the URL wherever your stripchart.cgi
is sitting. If all is well..
You will be immediately presented with a web form. Ignore the "select query"
pulldown menu for now. Underneath that you will see a line:
Number of stripcharts: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
By default stripchart.cgi presents you with the ability to plot 4 simultaneous
stripcharts, but you can select any number 1-20 by clicking on those numbers.
The less plots, the faster a web page gets generated.
For each plot, you get a pull down menu which should contain all the entries
you already put in "datafiles". Here you are selecting your data source.
Then you can select the type of time range: last x hours, last x days, or
an arbitrary date range. By default the last x hours radio button is selected -
to pick another type of time range make sure you select the radio button
before it. Then enter the range via the pull down menus.
Then you get a simple list of checkbox/input options. You can check to log
the y axis, baseline the average, baseline an arbitrary value (which you
enter in the window), enter a y minimum, or enter a y maximum.
When everything is selected, click on the "click here" button to plot.
Depending on the speed of your machine, you should soon be presented with
all the plots you desired, and the form underneath the plots which you can
edit to your heart's content. If you want to reset the form values, click
on the "reset form" link.
Note the "save images in /tmp" checkbox. If that is checked and you plot
the stripcharts, numbered .gif files will be placed in /tmp on the web
server machine so you can copy them elsewhere (files will be named:
stripchart_plot_1.gif, etc.).
On the topmost "click here" button you will note an "enter name to save
query" balloon. If you enter a name here (any old string) this exact query
will be saved into the "querylist" file which will then later appear in the
pulldown menu at the top. That way if you have a favorite set of diagnostic
plots which you check every morning, you don't have to enter the entire form
every time.
If you want to delete a query, enter the name in that same field but click
the "delete" checkbox next to it. Next time you "click here" the query will
be deleted.
----------------------
III. Known bugs, things to do, etc.
* stripchart -C flag is kind of pointless and doesn't work in practice.
* plots on data collected over small time ranges (points every few seconds, for
example) hasn't been tested.
* plots that don't work via stripchart.cgi either show ugly broken image icons
or nothing at all - either way it's ungraceful.
* pulldown menus and various plots sometimes need to be refreshed via a hard
refresh (i.e. shift-refresh).
* this readme kinda stinks.
* and many many other issues I'm failing to detail now!
If you have any problems using the product, feel free to e-mail me at:
mattl@ssl.berkeley.edu

52
todo
View File

@ -1,32 +1,62 @@
-----------------------
BUGS (arranged from high to low priority)
-----------------------
- Suspend/resume not fully functional on Windows, no way to suspend/resume on UNIX
- Suspend/resume not fully functional on Windows,
no way to suspend/resume on UNIX
- Currently, if there are multiple CPUs they work on the same result
- "Show Graphics" menu item brings up minimized window, client does not remember window size/pos after close/reopen, window closes and does not reopen when workunit finishes and new workunit starts
- No easy way to quit projects on UNIX
- "Show Graphics" menu item brings up minimized window,
client does not remember window size/pos after close/reopen,
window closes and does not reopen when workunit finishes
and new workunit starts
- No easy way to quit/add projects on UNIX
- Write a README file
what should it say?
- "ACTIVE_TASK.check_app_status_files: could not delete
slots\0\fraction_done.xml: -110" appears in stderr.txt on Windows
- "no work available" appears sporadically though work is eventually assigned,
not sure if it is assigned immediately or on next RPC
- Should include option in Windows client or installer whether to
run client at startup or not
- Screensaver "blank screen" functionality not implemented
- "ACTIVE_TASK.check_app_status_files: could not delete slots\0\fraction_done.xml: -110" appears in stderr.txt on Windows
- "no work available" appears sporadically though work is eventually assigned, not sure if it is assigned immediately or on next RPC
- Should include option in Windows client or installer whether to run client at startup or not
- Scheduler reply includes blank lines that XML parser complains about
- Time to completion isn't too accurate, this is more of an Astropulse problem involving fraction_done
- boinc_gui.exe priority should be lower (?),
launched app priorities should be very low
- on final panel of install, add checkbox to let user view readme
- Time to completion isn't too accurate,
this is more of an Astropulse problem involving fraction_done
- Report problems page on maggie doesn't link to anything
- Host stats incorrectly reports number of times connected with same ip (unconfirmed)
- CPU time updates infrequently (every 10 seconds), should there be a user control for this?
- Client treats URL "maggie/ap/" different than URL "maggie/ap", though this isn't really a bug it might be good to fix anyway
- Astropulse uses a lot of memory (~70 MB) b/c of the dispersion table, should this be decreased?
- Host stats incorrectly reports number of times connected with same ip
(unconfirmed)
- CPU time updates infrequently (every 10 seconds),
should there be a user control for this?
- Client treats URL "maggie/ap/" different than URL "maggie/ap",
though this isn't really a bug it might be good to fix anyway
- Astropulse uses a lot of memory (~70 MB) b/c of the dispersion table,
should this be decreased?
- CPU time for a completed workunit is incorrect (unconfirmed)
- verify that if file xfer is interrupted, it resumes at right place
(and progress bar is correct)
- client died quickly on Mandrake 9.0 linux (unconfirmed)
- make pie chart colors/labels easier to understand
- need a way to refresh prefs from client
- columns expand when window expands
- % done display screwed up in progress bar; also dark color is hard to read
- Download speed is not as fast as it should be
- Result status should say "downloading files", "uploading files", etc.
- result % done should have 2 digits after dec pt
-----------------------
HIGH-PRIORITY (should do for beta test)
-----------------------
Windows: put text (e.g. column headings, menus) in a file
so that people can develop alternate versions of the file
for different languages.
Windows: use messages rather than interrupt for timing
make get_local_ip_addr() work in all cases
est_time_to_completion doesn't work for non-running tasks
run backend programs (validate/file_deleter/assimilate)