From 44ea1f7fc1f24ed84983af7bdd4ed13e9a1cea67 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 2 Jan 2003 23:12:05 +0000 Subject: [PATCH] back off if no work svn path=/trunk/boinc/; revision=793 --- checkin_notes | 21 ++- client/client_state.C | 33 +++-- client/client_state.h | 2 +- client/cs_scheduler.C | 18 ++- client/scheduler_op.C | 183 +++++++++++++---------- doc/credit.html | 36 ++++- lib/util.C | 4 + lib/util.h | 1 + sched/make_work.C | 133 +++++++++-------- stripchart/README | 333 ++++++++++++++++++++++++++++++++++++++++++ todo | 52 +++++-- 11 files changed, 647 insertions(+), 169 deletions(-) create mode 100644 stripchart/README diff --git a/checkin_notes b/checkin_notes index 3204daf39c..a59ecd713f 100755 --- a/checkin_notes +++ b/checkin_notes @@ -2802,4 +2802,23 @@ David Dec 23 2002 test_masterurl_failure.php test_sched_failure.php - +David Jan 2 2003 + - Back off on a project if we make a scheduler RPC asking for work + and don't get any + - fixed "http_proxy_name" XML parsing problem in client + - added drand() function. 
+ NOTE: always make repeated code blocks into functions + - make_work was using uninitialized seqno, caused negative + numbers in filenames + + client/ + client_state.C,h + cs_scheduler.C + scheduler_op.C + doc/ + credit.html + lib/ + util.C,h + sched/ + make_work.C + validate.C diff --git a/client/client_state.C b/client/client_state.C index fdcab6b57b..4b863e9723 100644 --- a/client/client_state.C +++ b/client/client_state.C @@ -514,13 +514,13 @@ int CLIENT_STATE::parse_state_file() { } else if (match_tag(buf, "")) { // TODO: handle old client state file if different version } else if (match_tag(buf, "")) { - global_prefs.confirm_before_connecting = true; + global_prefs.confirm_before_connecting = true; } else if (match_tag(buf, "")) { - global_prefs.hangup_if_dialed = true; + global_prefs.hangup_if_dialed = true; } else if (match_tag(buf, "")) { - use_http_proxy = true; - } else if (parse_str(buf, "", proxy_server_name, sizeof(proxy_server_name))) { - } else if (parse_int(buf, "", proxy_server_port)) { + use_http_proxy = true; + } else if (parse_str(buf, "", proxy_server_name, sizeof(proxy_server_name))) { + } else if (parse_int(buf, "", proxy_server_port)) { } else { fprintf(stderr, "CLIENT_STATE::parse_state_file: unrecognized: %s\n", buf); retval = ERR_XML_PARSE; @@ -580,7 +580,9 @@ int CLIENT_STATE::write_state_file() { core_client_major_version, core_client_minor_version ); - // save proxy info + + // save proxy and preferences info + // fprintf(f, "%s" "%s" @@ -878,11 +880,18 @@ bool CLIENT_STATE::garbage_collect() { } else { // See if the files for this result's workunit had // any errors (MD5, RSA, etc) + // if(rp->wup->had_failure(failnum)) { // If we don't already have an error for this file if (rp->state < RESULT_READY_TO_ACK) { - // the result wu corresponding to this result had an error downloading some input file(s). 
- report_project_error(*rp,0,"The work_unit corresponding to this result had an error",CLIENT_DOWNLOADING); + // the wu corresponding to this result + // had an error downloading some input file(s). + // + report_project_error( + *rp,0, + "The work_unit corresponding to this result had an error", + CLIENT_DOWNLOADING + ); } } else { rp->wup->ref_cnt++; @@ -895,9 +904,13 @@ bool CLIENT_STATE::garbage_collect() { // if(rp->output_files[i].file_info->had_failure(failnum)) { if (rp->state < RESULT_READY_TO_ACK) { - // had an error uploading a file for this result + // had an error uploading a file for this result + // rp->client_state = CLIENT_UPLOADING; - report_project_error(*rp,0,"The outputfile corresponding to this result had an error",CLIENT_UPLOADING); + report_project_error(*rp,0, + "An output file of this result had an error", + CLIENT_UPLOADING + ); } } else { rp->output_files[i].file_info->ref_cnt++; diff --git a/client/client_state.h b/client/client_state.h index 097a604708..f71fde0b4e 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -141,7 +141,7 @@ public: PROJECT* next_project_master_pending(); double work_needed_secs(); int make_scheduler_request(PROJECT*, double); - void handle_scheduler_reply(PROJECT*, char* scheduler_url); + int handle_scheduler_reply(PROJECT*, char* scheduler_url, int& nresults); void set_client_state_dirty(char*); int report_project_error( RESULT &res,int err_num, char *err_msg, int state); // flag a result as having an error diff --git a/client/cs_scheduler.C b/client/cs_scheduler.C index 1531c35361..eab09d5467 100644 --- a/client/cs_scheduler.C +++ b/client/cs_scheduler.C @@ -287,8 +287,8 @@ bool CLIENT_STATE::scheduler_rpc_poll() { // Handle the reply from a scheduler // -void CLIENT_STATE::handle_scheduler_reply( - PROJECT* project, char* scheduler_url +int CLIENT_STATE::handle_scheduler_reply( + PROJECT* project, char* scheduler_url, int& nresults ) { SCHEDULER_REPLY sr; FILE* f; @@ -296,6 +296,7 @@ void 
CLIENT_STATE::handle_scheduler_reply( unsigned int i; bool signature_valid; + nresults = 0; contacted_sched_server = true; if (log_flags.sched_op_debug) { f = fopen(SCHED_OP_RESULT_FILE, "r"); @@ -306,7 +307,8 @@ void CLIENT_STATE::handle_scheduler_reply( } f = fopen(SCHED_OP_RESULT_FILE, "r"); - retval = sr.parse(f); + if (!f) return ERR_FOPEN; + retval = sr.parse(f); // check return? fclose(f); if (strlen(sr.project_name)) { @@ -326,10 +328,10 @@ void CLIENT_STATE::handle_scheduler_reply( project->min_rpc_time = time(0) + sr.request_delay; } + project->host_total_credit = sr.host_total_credit; + project->host_expavg_credit = sr.host_expavg_credit; if (sr.hostid) { project->hostid = sr.hostid; - project->host_total_credit = sr.host_total_credit; - project->host_expavg_credit = sr.host_expavg_credit; project->host_create_time = sr.host_create_time; project->rpc_seqno = 0; } @@ -339,6 +341,7 @@ void CLIENT_STATE::handle_scheduler_reply( // if (sr.global_prefs_xml) { f = fopen(GLOBAL_PREFS_FILE_NAME, "w"); + if (!f) return ERR_FOPEN; fprintf(f, "\n" " %s\n" @@ -358,6 +361,7 @@ void CLIENT_STATE::handle_scheduler_reply( if (sr.project_prefs_xml) { char path[256]; f = fopen(TEMP_FILE_NAME, "w"); + if (!f) return ERR_FOPEN; fprintf(f, "\n" " %s\n" @@ -371,7 +375,9 @@ void CLIENT_STATE::handle_scheduler_reply( fclose(f); get_account_filename(project->master_url, path); retval = boinc_rename(TEMP_FILE_NAME, path); + if (retval) return ERR_RENAME; f = fopen(path, "r"); + if (!f) return ERR_FOPEN; project->parse_account(f); fclose(f); } @@ -455,6 +461,7 @@ void CLIENT_STATE::handle_scheduler_reply( retval = link_result(project, rp); if (!retval) results.push_back(rp); rp->state = RESULT_NEW; + nresults++; } } @@ -479,4 +486,5 @@ void CLIENT_STATE::handle_scheduler_reply( printf("State after handle_scheduler_reply():\n"); print_counts(); } + return 0; } diff --git a/client/scheduler_op.C b/client/scheduler_op.C index 5b0b398393..76b523323b 100644 --- a/client/scheduler_op.C 
+++ b/client/scheduler_op.C @@ -16,17 +16,20 @@ // // Contributor(s): // + #include #include #include #include +#include "util.h" +#include "parse.h" +#include "error_numbers.h" + #include "client_state.h" #include "client_types.h" -#include "error_numbers.h" #include "file_names.h" #include "log_flags.h" -#include "parse.h" #include "scheduler_op.h" SCHEDULER_OP::SCHEDULER_OP(HTTP_OP_SET* h) { @@ -45,16 +48,17 @@ int SCHEDULER_OP::init_get_work() { must_get_work = true; project = gstate.next_project(0); if (project) { - if( (retval=init_op_project(ns)) ) { + retval = init_op_project(ns); + if (retval) { sprintf(err_msg, "init_get_work failed, error %d\n", retval); backoff(project, err_msg); return retval; } - } - else { + } else { project = gstate.next_project_master_pending(); if (project) { - if ((retval=init_master_fetch(project))) { + retval = init_master_fetch(project); + if (retval) { sprintf(err_msg, "init_master_fetch failed, error %d\n", retval); backoff(project, err_msg); } @@ -78,6 +82,7 @@ int SCHEDULER_OP::init_return_results(PROJECT* p, double ns) { // int SCHEDULER_OP::init_op_project(double ns) { int retval; + char err_msg[256]; if (log_flags.sched_op_debug) { printf("init_op_project: starting op for %s\n", project->master_url); @@ -88,15 +93,24 @@ int SCHEDULER_OP::init_op_project(double ns) { // if (project->scheduler_urls.size() == 0) { retval = init_master_fetch(project); - return retval; + goto done; } url_index = 0; retval = gstate.make_scheduler_request(project, ns); if (retval) { fprintf(stderr, "make_scheduler_request: %d\n", retval); - return retval; + goto done; } - return start_rpc(); + retval = start_rpc(); +done: + if (retval) { + sprintf(err_msg, + "scheduler init_op_project to %s failed, error %d\n", + project->scheduler_urls[url_index].text, retval + ); + backoff(project, err_msg); + } + return retval; } // Set a project's min RPC time to something in the future, @@ -109,23 +123,19 @@ int SCHEDULER_OP::set_min_rpc_time(PROJECT* p) 
{ int n = p->nrpc_failures; if (n > RETRY_CAP) n = RETRY_CAP; - + // we've hit the limit on master_url fetches - if(p->master_fetch_failures >= MASTER_FETCH_RETRY_CAP) { + // + if (p->master_fetch_failures >= MASTER_FETCH_RETRY_CAP) { if (log_flags.sched_op_debug) { printf("we've hit the limit on master_url fetches\n"); } - //backoff e^MASTER_FETCH_INTERVAL * random - exp_backoff = (int) exp(((double)rand()/(double)RAND_MAX)*MASTER_FETCH_INTERVAL); + exp_backoff = (int) exp(drand()*MASTER_FETCH_INTERVAL); + p->min_rpc_time = time(0) + exp_backoff; + } else { + x = RETRY_BASE_PERIOD * exp(drand() * n); + exp_backoff = (int)max(SCHED_RETRY_DELAY_MIN,min(SCHED_RETRY_DELAY_MAX,(int) x)); p->min_rpc_time = time(0) + exp_backoff; - - } - else { - //backoff RETRY_BASE_PERIOD * e^nrpc_failures * random - x = RETRY_BASE_PERIOD * exp(((double)rand()/(double)RAND_MAX) * n); - exp_backoff = (int)max(SCHED_RETRY_DELAY_MIN,min(SCHED_RETRY_DELAY_MAX,(int) x)); - p->min_rpc_time = time(0) + exp_backoff; - } if (log_flags.sched_op_debug) { printf( @@ -148,8 +158,11 @@ int SCHEDULER_OP::backoff( PROJECT* p, char *error_msg ) { p->master_url_fetch_pending = true; set_min_rpc_time(p); return 0; - } - // if nrpc failures a multiple of master_fetch_period, then set master_url_fetch_pending and initialize again + } + + // if nrpc failures a multiple of master_fetch_period, + // then set master_url_fetch_pending and initialize again + // if (p->nrpc_failures == MASTER_FETCH_PERIOD) { p->master_url_fetch_pending = true; p->min_rpc_time = 0; @@ -164,6 +177,7 @@ int SCHEDULER_OP::backoff( PROJECT* p, char *error_msg ) { } // low-level routine to initiate an RPC +// If successful, creates an HTTP_OP that must be polled // int SCHEDULER_OP::start_rpc() { FILE *f; @@ -242,10 +256,12 @@ int SCHEDULER_OP::parse_master_file(vector &urls) { printf("Parsed master file; got %d scheduler URLs\n", (int)urls.size()); } - //if couldn't find any urls in the master file. 
- if((int) urls.size() == 0) - return -1; - + // couldn't find any urls in the master file? + // + if ((int) urls.size() == 0) { + return -1; + } + return 0; } @@ -280,9 +296,9 @@ bool SCHEDULER_OP::update_urls(PROJECT& project, vector &urls) { // poll routine. If an operation is in progress, check for completion // bool SCHEDULER_OP::poll() { - int retval; + int retval, nresults; vector urls; - bool changed, scheduler_op_done, get_master_success; + bool changed, scheduler_op_done; bool action = false; char err_msg[256],*err_url; @@ -290,7 +306,6 @@ bool SCHEDULER_OP::poll() { case SCHEDULER_OP_STATE_GET_MASTER: // here we're fetching the master file for a project // - get_master_success = true; if (http_op.http_op_state == HTTP_STATE_DONE) { action = true; project->master_url_fetch_pending = false; @@ -303,35 +318,49 @@ bool SCHEDULER_OP::poll() { ); } retval = parse_master_file(urls); - if (retval == 0) { + if (retval) { + // master file parse failed. + // + project->master_fetch_failures++; + backoff(project, "Master file parse failed\n"); + err_url = project->master_url; + } else { + // everything succeeded. Clear error counters + // changed = update_urls(*project, urls); if (changed) { project->min_rpc_time = 0; project->nrpc_failures = 0; project->master_fetch_failures = 0; } - } else { - // master file parse failed. treat like RPC error - // - project->master_fetch_failures++; - backoff(project, "Master file parse failed\n"); - get_master_success = false; - err_url = project->master_url; - } + } } else { - // fetch of master file failed. Treat like RPC error + // master file fetch failed. // - project->master_fetch_failures++; + project->master_fetch_failures++; backoff(project, "Master file fetch failed\n"); - get_master_success = false; err_url = project->master_url; } + + // If don't have any schedulers for this project, + // it may be the wrong URL. 
notify the user + // + if (project->scheduler_urls.size() == 0) { + sprintf(err_msg, + "Could not contact %s. Make sure this is the correct project URL.", + err_url + ); + show_message(err_msg, "high"); + } + + // See if need to read master file for another project + // project = gstate.next_project_master_pending(); if (project) { - if ((retval = init_master_fetch(project))) { - project->master_fetch_failures++; + retval = init_master_fetch(project); + if (retval) { + project->master_fetch_failures++; backoff(project, "Master file fetch failed\n"); - get_master_success = false; err_url = project->master_url; } } else { @@ -340,18 +369,12 @@ bool SCHEDULER_OP::poll() { printf("Scheduler_op: return to idle state\n"); } } - // If we haven't been able to successfully get the master URL file - // recently then notify the user - if (!get_master_success) { - char buf[256]; - sprintf(buf, "Could not contact %s. Make sure this is the correct project URL.", - err_url); - show_message( buf, "high" ); - } + } break; case SCHEDULER_OP_STATE_RPC: - // here we're doing a scheduler RPC to some project + + // here we're doing a scheduler RPC // scheduler_op_done = false; if (http_op.http_op_state == HTTP_STATE_DONE) { @@ -364,26 +387,22 @@ bool SCHEDULER_OP::poll() { project->scheduler_urls[url_index].text ); } + + // scheduler RPC failed. 
Try another scheduler if one exists + // url_index++; if (url_index < project->scheduler_urls.size()) { start_rpc(); } else { - backoff(project,""); - + backoff(project, "No schedulers responded"); if (must_get_work) { project = gstate.next_project(project); if (project) { - if( (retval=init_op_project(gstate.work_needed_secs())) ) { - sprintf( err_msg, - "scheduler init_op_project to %s failed, error %d\n", - project->scheduler_urls[url_index].text, retval - ); - backoff(project, err_msg); - } + retval = init_op_project(gstate.work_needed_secs()); } else { scheduler_op_done = true; } - } + } else { scheduler_op_done = true; } @@ -395,21 +414,27 @@ bool SCHEDULER_OP::poll() { project->scheduler_urls[url_index].text ); } - project->nrpc_failures = 0; - project->min_rpc_time = 0; - gstate.handle_scheduler_reply(project, scheduler_url); + gstate.handle_scheduler_reply(project, scheduler_url, nresults); + + // if we asked for work and didn't get any, + // back off this project + // + if (must_get_work && nresults==0) { + backoff(project, "No work from project\n"); + } else { + project->nrpc_failures = 0; + project->min_rpc_time = 0; + } + + // if we didn't get all the work we needed, + // ask another project for work + // if (must_get_work) { double x = gstate.work_needed_secs(); if (x > 0) { project = gstate.next_project(project); if (project) { - if( (retval=init_op_project(x)) ) { - sprintf( err_msg, - "scheduler init_op_project to %s failed, error %d\n", - project->scheduler_urls[url_index].text, retval - ); - backoff(project, err_msg); - } + retval = init_op_project(x); } else { scheduler_op_done = true; } @@ -421,10 +446,14 @@ bool SCHEDULER_OP::poll() { } } } + + // If no outstanding ops, see if need a master fetch + // if (scheduler_op_done) { - project = gstate.next_project_master_pending(); + project = gstate.next_project_master_pending(); if (project) { - if ((retval = init_master_fetch(project))) { + retval = init_master_fetch(project); + if (retval) { if 
(log_flags.sched_op_debug) { printf("Scheduler op: init_master_fetch failed.\n" ); } @@ -472,7 +501,7 @@ int SCHEDULER_REPLY::parse(FILE* in) { code_sign_key_signature = 0; p = fgets(buf, 256, in); - // First part of content should either be tag (HTTP 1.0) or + // First part of content should either be tag (HTTP 1.0) or // hex length of response (HTTP 1.1) if (!match_tag(buf, "")) { fprintf(stderr, "SCHEDULER_REPLY::parse(): bad first tag %s\n", buf); diff --git a/doc/credit.html b/doc/credit.html index 60dc583a96..5bf56c33b5 100644 --- a/doc/credit.html +++ b/doc/credit.html @@ -12,17 +12,13 @@ ranked by credit.

BOINC's credit system is based on a "reference computer" that does

    -
  • 1 billion floating-point multiply/adds per second -
  • 1 billion integer multiply/adds per second +
  • 1 billion floating-point multiplies per second +
  • 1 billion integer multiplies per second
  • 4 billion bytes per second of traffic to and from main memory (sequential, half reads and half writes)
BOINC's unit of credit, the Cobblestone, is one day of CPU time on the reference computer. -(Credit should ideally reflect network transfer and disk storage as well -as computation. -But it's hard to verify these activities, -so for now they aren't included.)

Each project maintains two types of credit: @@ -37,3 +33,31 @@ This average decreases by a factor of two every week.

Both types of credit (total and recent average) are maintained for each user and host. + +

Leader boards

+The PHP pages supplied by BOINC include basic leaderboards: +top users and hosts, by total and average. +BOINC lets projects export the credit-related +parts of their database as XML files. +These XML files can be used to generate +other breakdowns of users, hosts and teams, +or to generate leaderboards based on the sum of +credit from different projects. + + +

Possible future improvements

+
    +
  • +Ideally, credit should reflect network transfer and disk storage as well +as computation. +But it's hard to verify these activities, +so for now they aren't included. +
  • +Eventually projects will develop applications that use +graphics coprocessors or other non-CPU hardware. +Credit should reflect the usage of such hardware. +To accomplish this, we will need to let +projects supply their own benchmarking functions. +This will also handle the situation where a project's +application does e.g. all integer arithmetic. +
diff --git a/lib/util.C b/lib/util.C index 7909c70682..9d167e071b 100755 --- a/lib/util.C +++ b/lib/util.C @@ -18,6 +18,7 @@ // #include #include +#include #include #include #include @@ -176,3 +177,6 @@ int lock_file(char* filename) { return retval; } +double drand() { + return (double)rand()/(double)RAND_MAX; +} diff --git a/lib/util.h b/lib/util.h index 1bb264bc7c..a910c42c95 100755 --- a/lib/util.h +++ b/lib/util.h @@ -24,6 +24,7 @@ extern double dtime(); extern void boinc_sleep( int seconds ); extern int parse_command_line( char *, char ** ); extern int lock_file(char*); +extern double drand(); #ifndef max #define max(a,b) (((a) > (b)) ? (a) : (b)) diff --git a/sched/make_work.C b/sched/make_work.C index 0cdb932550..7078000ffe 100644 --- a/sched/make_work.C +++ b/sched/make_work.C @@ -46,37 +46,37 @@ int cushion = 10; int redundancy = 10; char wu_name[256], result_template_file[256]; -void replace_file_name(char * xml_doc, char * filename, char * new_filename,char * download_url) -{ - char buf[MAX_BLOB_SIZE], temp[256], download_path[256], new_download_path[256]; - char * p; +// edit a WU XML doc, replacing one filename by another +// (should appear twice, within and ) +// Also patch the download URL (redundant) +// +void replace_file_name( + char* xml_doc, char* filename, char* new_filename, char* download_url +) { + char buf[MAX_BLOB_SIZE], temp[256], download_path[256], + new_download_path[256]; + char * p; - sprintf(download_path,"%s/%s",download_url,filename); - sprintf(new_download_path,"%s/%s",download_url,new_filename); - strcpy(buf,xml_doc); - p = strtok(buf,"\n"); - while (p) { - if (parse_str(p, "", temp, sizeof(temp))) { - if(!strcmp(filename, temp)) - { - replace_element(xml_doc + (p - buf),"","",new_filename); - } + sprintf(download_path,"%s/%s", download_url, filename); + sprintf(new_download_path,"%s/%s", download_url, new_filename); + strcpy(buf, xml_doc); + p = strtok(buf,"\n"); + while (p) { + if (parse_str(p, "", temp, sizeof(temp))) { + 
if(!strcmp(filename, temp)) { + replace_element(xml_doc + (p - buf),"","",new_filename); + } + } else if (parse_str(p, "", temp, sizeof(temp))) { + if(!strcmp(filename, temp)) { + replace_element(xml_doc + (p - buf),"","",new_filename); + } + } else if (parse_str(p, "", temp, sizeof(temp))) { + if(!strcmp(temp, download_path)) { + replace_element(xml_doc + (p - buf),"","",new_download_path); + } + } + p = strtok(0, "\n"); } - else if (parse_str(p, "", temp, sizeof(temp))) { - if(!strcmp(filename, temp)) - { - replace_element(xml_doc + (p - buf),"","",new_filename); - } - } - else if (parse_str(p, "", temp, sizeof(temp))) { - if(!strcmp(temp, download_path)) - { - replace_element(xml_doc + (p - buf),"","",new_download_path); - } - } - p = strtok(0, "\n"); - } - } void check_trigger() { @@ -88,12 +88,15 @@ void check_trigger() { void make_work() { CONFIG config; char * p; - int retval, i, start_time=time(0), n, nresults_left; - char keypath[256], suffix[256], result_template[MAX_BLOB_SIZE], file_name[256], buf[MAX_BLOB_SIZE],pathname[256],new_file_name[256],new_pathname[256],command[256], starting_xml[MAX_BLOB_SIZE],new_buf[MAX_BLOB_SIZE]; + int retval, start_time=time(0), n, nresults_left; + char keypath[256], suffix[256], result_template[MAX_BLOB_SIZE]; + char file_name[256], buf[MAX_BLOB_SIZE], pathname[256]; + char new_file_name[256], new_pathname[256], command[256]; + char starting_xml[MAX_BLOB_SIZE], new_buf[MAX_BLOB_SIZE]; R_RSA_PRIVATE_KEY key; WORKUNIT wu; + int seqno = 0; - retval = config.parse_file(); if (retval) { fprintf(stderr,"make_work: can't read config file\n"); @@ -135,39 +138,53 @@ void make_work() { fprintf(stderr,"make_work: can't counts results\n"); exit(1); } - printf("make_work: %d results\n", n); + printf("make_work: %d results available to send\n", n); if (n > cushion) { sleep(1); continue; } + // make a new workunit every "redundancy" results + // if (nresults_left == 0) { - strcpy(buf,starting_xml); - p = strtok(buf, "\n"); - 
strcpy(file_name, ""); - - while (p) { - if (parse_str(p, "", file_name, sizeof(file_name))) { - sprintf(new_file_name,"%s_%d_%d",file_name,start_time,i++); - sprintf(pathname, "%s/%s", config.download_dir, file_name); - sprintf(new_pathname,"%s/%s",config.download_dir, new_file_name); - sprintf(command,"cp %s %s",pathname,new_pathname); - system(command); - strcpy(new_buf,starting_xml); - replace_file_name(new_buf,file_name,new_file_name,config.download_url); - strcpy(wu.xml_doc, new_buf); - } - p = strtok(0, "\n"); - } - nresults_left = redundancy; - sprintf(wu.name, "wu_%d_%d", start_time, i++); - wu.id = 0; - wu.create_time = time(0); - retval = db_workunit_new(wu); - wu.id = db_insert_id(); - + strcpy(buf, starting_xml); + p = strtok(buf, "\n"); + strcpy(file_name, ""); + + // make new copies of all the WU's input files + // + while (p) { + if (parse_str(p, "", file_name, sizeof(file_name))) { + sprintf( + new_file_name, "%s_%d_%d", file_name, start_time, i++ + ); + sprintf(pathname, "%s/%s", config.download_dir, file_name); + sprintf( + new_pathname, "%s/%s",config.download_dir, new_file_name + ); + sprintf(command,"cp %s %s",pathname,new_pathname); + if (system(command)) { + fprintf(stderr, "make_work: ERROR\n"); + perror(command); + exit(); + } + strcpy(new_buf, starting_xml); + replace_file_name( + new_buf, file_name, new_file_name, config.download_url + ); + strcpy(wu.xml_doc, new_buf); + } + p = strtok(0, "\n"); + } + nresults_left = redundancy; + sprintf(wu.name, "wu_%d_%d", start_time, seqno); + wu.id = 0; + wu.create_time = time(0); + retval = db_workunit_new(wu); + wu.id = db_insert_id(); + } - sprintf(suffix, "%d_%d", start_time, i++); + sprintf(suffix, "%d_%d", start_time, seqno++); create_result( wu, result_template, suffix, key, config.upload_url, config.download_url diff --git a/stripchart/README b/stripchart/README new file mode 100644 index 0000000000..75f5eb9969 --- /dev/null +++ b/stripchart/README @@ -0,0 +1,333 @@ +Stripchart version 2.0 
+---------------------- +Author: Matt Lebofsky + BOINC/SETI@home - University of California, Berkeley + mattl@ssl.berkeley.edu + +Date of recent version: November 4, 2002 + +Requirements: + * a gnuplot with the ability to generate gifs + * perl + * apache or other cgi-enabled web browser + +Send all thoughts and queries to: mattl@ssl.berkeley.edu + +This software is free to edit, distribute and use by anybody, as long as +I get credit for it in some form or another. Thanks. +---------------------- + +Contents: + +I. Some questions and answers +II. So how does it work? +III. Known bugs, things to do, etc. + +---------------------- +I. Some questions and answers + +Q: What is stripchart? + +A: Well, it's actually two relatively small perl programs: + + 1. stripchart + + stripchart reads in time-based user data and, depending on a flurry of + command line options, generates a web-friendly .gif plotting the data. + The user can supply the time range, the y axis range, even the color + scheme, and more. + + 2. stripchart.cgi + + stripchart.cgi is a web-based GUI interface that allows users to easily + select multiple data sources and various parameters to plot, allowing + fast comparisons without having to deal with a command line interface. + +Q: Why do you bother writing this program? + +A: Working as a systems administrator (amongst other things) for SETI@home, + we kept finding ourselves in dire problem-solving situations, i.e. Why + did the database stop working? Why is load on our web server so high? + + So we started collecting data in flat files, keeping track of server + loads, database checkpoint times, even CPU temperatures. When these files + grew too large and unwieldy, I found myself writing (and rewriting) simple + scripts to generate plots on this data. Sick of constant revision whenever + a new problem arose, I wrote stripchart version 1.0. + + Its usefulness became immediately apparent when I added on stripchart.cgi. 
+ I couldn't bear to teach everybody the many command line options to + stripchart, so I wrote this CGI to do all the dirty work. Suddenly we were + able to line up several plots, look for causes and effects, or just enjoy + watching the counts in our database tables grow to impossibly high numbers. + + The SETI@home network has proven to be a delicate system, and keeping track + of all the data server, user, and web statistics has proven to be quite a + life saver. So when BOINC came around we felt that any project aiming to + embark on a similar project may need this tool. So I rewrote stripchart to + be a bit more friendly and general. + +Q: Why don't you make .pngs or .jpgs instead of .gifs? The latest gnuplot + doesn't support .gifs. + +A: Basically gnuplot support for other graphic file formats isn't as good. For + example, you cannot control exact window size, font size, and colors unless + you make .gifs. I'm not exactly sure why this is the case, but there you have it. + Anywho, you can find older gnuplot distributions out there - you'll need to + get the gd libs first, by the way. + +---------------------- +II. So how does it work? + +You can use stripchart as a stand alone command-line program to produce plots +whenever you like, but we highly recommend using it in conjunction with the +stripchart.cgi for ease of use. But here's how to do it both ways. + +stripchart (stand alone) + +Before anything, look at the section GLOBAL/DEFAULT VARS in the program +stripchart and see if you need to edit anything (usually pathnames to +executables and such). 
+ +Let's just start with the usage (obtained by typing "stripchart -h"): + +stripchart: creates stripchart .gif graphic based on data in flat files +options: + -i: input FILE - name of input data file (mandatory) + -o: output FILE - name of output .gif file (default: STDOUT) + -O: output FILE - name of output .gif file and dump to STDOUT as well + -f: from TIME - stripchart with data starting at TIME + (default: 24 hours ago) + -t: to TIME - stripchart with data ending at TIME (default: now) + -r: range RANGE - stripchart data centered around "from" time the size + of RANGE (overrides -t) + -l: last LINES - stripchart last number of LINES in data file + (overrides -f and -t and -r) + -T: title TITLE - title to put on graphic (default: FILE RANGE) + -x: column X - time or "x" column (default: 2) + -y: column Y - value or "y" column (default: 3) + -Y: column Y' - overplot second "y" column (default: none) + -b: baseline VALUE - overplot baseline of arbitrary value VALUE + -B: baseline-avg - overrides -b, it plots baseline of computed average + -d: dump low VALUE - ignore data less than VALUE + -D: dump high VALUE - ignore data higher than VALUE + -v: verbose - puts verbose runtime output to STDERR + -L: log - makes y axis log scale + -c: colors "COLORS" - set gnuplot colors for graph/axis/fonts/data (default: + "xffffff x000000 xc0c0c0 x00a000 x0000a0 x2020c0" + in order: bground, axis/fonts, grids, pointcolor1,2,3) + -C: cgi - output CGI header to STDOUT if being called as CGI + -s: stats - turn extra plot stats on (current, avg, min, max) + -j: julian times - time columns is in local julian date (legacy stuff) + +notes: + * TIME either unix date, julian date, or civil date in the form: + YYYY:MM:DD:HH:MM (year, month, day, hour, minute) + If you enter something with colons, it assumes it is civil date + If you have a decimal point, it assumes it is julian date + If it is an integer, it assumes it is unix date (epoch seconds) + If it is a negative number, it is in 
decimal days from current time + (i.e. -2.5 = two and a half days ago) + * All times on command line are assumed to be "local" times + * All times in the data file must be in unix date (epoch seconds) + * RANGE is given in decimal days (i.e. 1.25 = 1 day, 6 hours) + * if LINES == 0, (i.e. -l 0) then the whole data file is read in + * columns (given with -x, -y, -Y flags) start at 1 + * titles given with -T can contain the following key words which will + be converted: + FILE - basename of input file + RANGE - pretty civil date range (in local time zone) + the default title is: FILE RANGE + +...okay that's a lot to ingest, but it's really simple. Let's take a look at an +example (you'll find in the samples directory two files get_load and crontab). + +You have a machine that you want to monitor it's load. Here's a script that +will output a single line containing two fields for time and the third with the +actual data. For example: + +2002:11:05:12:51 1036529480 0.25 + +The first field is time in an arbitrary human readable format +(year:month:day:hour:minute), the second in epoch seconds (standard +unix time format - the number of seconds since 00:00 1/1/1970 GMT), +and the third is the load at this time. + +And we'll start collecting data every five minutes on this particular machine +by add such a line to the crontab: + +0,5,10,15,20,25,30,35,40,45,50,55 * * * * /usr/local/stripchart/samples/get_load >> /disks/matt/data/machine_load + +So the file "machine_load" will quickly fill with lines such as the above. +Now you may ask yourself - why two columns representing time in two different +formats? Well sometime you just want to look at the data file itself, in which +case the human-readable first column is quite handy to have around, but when +making linear time plots, having time in epoch seconds is much faster to +manipulate. So generally, we like to have at least the two time fields first, +and the actual data in the third column. 
That's what stripchart expects by
+default.
+
+Note: stripchart will understand time in both epoch seconds and julian date.
+If the second time field is in julian date, you should supply the command line
+flag "-j" to warn stripchart so it knows how to handle it.
+
+Okay. So you have this data file now. A very common thing to plot would be the
+data over the past 24 hours. Turns out that's the default! If you type on the
+command line:
+
+stripchart -i machine_load -o machine_load.gif
+
+you will quickly get a new file "machine_load.gif" with all the goods.
+
+Note: you always have to supply an input file via -i. If you don't supply
+an output file via "-o" the .gif gets dumped to stdout. If you supply an
+output file via "-O" the output is stored in both the file and to stdout.
+
+Now let's play with the time ranges. You can supply times in a variety of
+formats on the command line:
+
+  "civil date" i.e. 2002:11:05:12:51 (YYYY:MM:DD:hh:mm)
+  "epoch seconds" i.e. 1036529480
+  "julian date" i.e. 2452583.52345
+
+You can supply a date range using the -f and -t flags (from and to):
+
+stripchart -i machine_load -f 2002:11:01:00:00 -t 2002:11:04:00:00
+
+Usually the "to" time is right now, so you can quickly tell stripchart
+to plot starting at some arbitrary time "ago." This is done also via the
+"-f" flag - if it's negative it will assume you mean that many decimal
+days from now as a starting point. So "-f -3.5" will plot from 3 and a
+half days ago until now.
+
+You can also supply a "range" centered around the from time. For example,
+to plot the 24 hours centered around 2002:11:01:13:40:
+
+stripchart -i machine_load -f 2002:11:01:13:40 -r 1
+
+On some rare occasions you might want to plot the last number of lines
+in a file, regardless of what time they were. If you supply the number
+of lines via the "-l" flag, it overrides any time ranges you may have
+supplied. 
+
+Moving on to some other useful flags in no particular order:
+
+To change the default title (which is the basename of the file and
+the time range being plotted), you can do so via the "-T" command.
+Make sure to put the title in quotes. Within the title string the
+all-uppercase string "FILE" will be replaced with the file basename,
+and the string "RANGE" will be replaced by the time range. So in
+essence, the default title string is "FILE RANGE".
+
+If you have data files in different formats, you can specify the data
+columns using the "-x" and "-y" flags. By default -x is 2 and -y is 3.
+Sometimes we have datafiles with many columns so we actively have to tell
+stripchart which is the correct data column.
+
+However, you might want to overplot one column on top of another. If your
+data file has a second data column, you can specify what that is via the
+-Y flag, and this data will be overplotted onto the data from the first
+data column.
+
+Sometimes you want to plot a horizontal rule or a "baseline". You can
+turn this feature on by specifying the value with the "-b" flag. If you
+use the "-B" flag (without any values) it automatically computes the
+average over the time range and plots that as the baseline. Simple!
+
+If you want to excise certain y values, you can do so with the dump
+flags, i.e. "-d" and "-D". In particular, any values lower than the one
+supplied with "-d" will be dumped, and any values higher than the one
+supplied by "-D" will be dumped.
+
+To log the y axis, use the "-L" flag. Quite straightforward.
+
+A very useful flag is "-s" which outputs a line of stats underneath
+the plot title. It shows the current value, and the minimum, maximum
+and average values during the plot range.
+
+For verbose output to stderr, use the "-v" flag. It may not make much
+sense, but it's useful for debugging.
+
+Using the "-C" flag causes stripchart to spit out the "Content-type"
+lines necessary for incorporating stripchart plots into CGIs. 
This
+doesn't work so well now, but there it is.
+
+Okay. That's enough about the flags, and hopefully enough to get you
+playing around with stripchart and plotting some stuff. Now onto:
+
+stripchart.cgi
+
+First and foremost, you need to do the following before running the
+CGI version of stripchart:
+
+1. Put stripchart.cgi in a cgi-enabled web-accessible directory
+2. Make a "lib" directory somewhere that the web server can read/write to
+3. Edit stripchart.cgi GLOBAL/DEFAULT VARS to point to proper paths, including
+   the files "querylist" and "datafiles" in the aforementioned "lib" directory.
+4. Edit the "lib/datafiles" file to contain entries for all your data files.
+   You can find an example datafiles in the samples directory. Follow the
+   instructions in the comment lines, adding your entries below the header.
+
+That should be it, I think. Now go to the URL wherever your stripchart.cgi
+is sitting. If all is well..
+
+You will be immediately presented with a web form. Ignore the "select query"
+pulldown menu for now. Underneath that you will see a line:
+
+Number of stripcharts: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+
+By default stripchart.cgi presents you with the ability to plot 4 simultaneous
+stripcharts, but you can select any number 1-20 by clicking on those numbers.
+The fewer plots, the faster a web page gets generated.
+
+For each plot, you get a pull down menu which should contain all the entries
+you already put in "datafiles". Here you are selecting your data source.
+
+Then you can select the type of time range: last x hours, last x days, or
+an arbitrary date range. By default the last x hours radio button is selected -
+to pick another type of time range make sure you select the radio button
+before it. Then enter the range via the pull down menus.
+
+Then you get a simple list of checkbox/input options. 
You can check to log
+the y axis, baseline the average, baseline an arbitrary value (which you
+enter in the window), enter a y minimum, or enter a y maximum.
+
+When everything is selected, click on the "click here" button to plot.
+Depending on the speed of your machine, you should soon be presented with
+all the plots you desired, and the form underneath the plots which you can
+edit to your heart's content. If you want to reset the form values, click
+on the "reset form" link.
+
+Note the "save images in /tmp" checkbox. If that is checked and you plot
+the stripcharts, numbered .gif files will be placed in /tmp on the web
+server machine so you can copy them elsewhere (files will be named:
+stripchart_plot_1.gif, etc.).
+
+On the topmost "click here" button you will note an "enter name to save
+query" balloon. If you enter a name here (any old string) this exact query
+will be saved into the "querylist" file which will then later appear in the
+pulldown menu at the top. That way if you have a favorite set of diagnostic
+plots which you check every morning, you don't have to enter the entire form
+every time.
+
+If you want to delete a query, enter the name in that same field but click
+the "delete" checkbox next to it. Next time you "click here" the query will
+be deleted.
+
+----------------------
+III. Known bugs, things to do, etc.
+
+* stripchart -C flag is kind of pointless and doesn't work in practice.
+* plots on data collected over small time ranges (points every few seconds, for
+  example) haven't been tested.
+* plots that don't work via stripchart.cgi either show ugly broken image icons
+  or nothing at all - either way it's ungraceful.
+* pulldown menus and various plots sometimes need to be refreshed via a hard
+  refresh (i.e. shift-refresh).
+* this readme kinda stinks.
+* and many many other issues I'm failing to detail now! 
+ +If you have any problems using the product, feel free to e-mail me at: + + mattl@ssl.berkeley.edu + diff --git a/todo b/todo index 4df2462f9d..626d981f99 100755 --- a/todo +++ b/todo @@ -1,32 +1,62 @@ ----------------------- BUGS (arranged from high to low priority) ----------------------- -- Suspend/resume not fully functional on Windows, no way to suspend/resume on UNIX +- Suspend/resume not fully functional on Windows, + no way to suspend/resume on UNIX - Currently, if there are multiple CPUs they work on the same result -- "Show Graphics" menu item brings up minimized window, client does not remember window size/pos after close/reopen, window closes and does not reopen when workunit finishes and new workunit starts -- No easy way to quit projects on UNIX +- "Show Graphics" menu item brings up minimized window, + client does not remember window size/pos after close/reopen, + window closes and does not reopen when workunit finishes + and new workunit starts +- No easy way to quit/add projects on UNIX - Write a README file + what should it say? 
+- "ACTIVE_TASK.check_app_status_files: could not delete + slots\0\fraction_done.xml: -110" appears in stderr.txt on Windows +- "no work available" appears sporadically though work is eventually assigned, + not sure if it is assigned immediately or on next RPC +- Should include option in Windows client or installer whether to + run client at startup or not - Screensaver "blank screen" functionality not implemented -- "ACTIVE_TASK.check_app_status_files: could not delete slots\0\fraction_done.xml: -110" appears in stderr.txt on Windows -- "no work available" appears sporadically though work is eventually assigned, not sure if it is assigned immediately or on next RPC -- Should include option in Windows client or installer whether to run client at startup or not - Scheduler reply includes blank lines that XML parser complains about -- Time to completion isn't too accurate, this is more of an Astropulse problem involving fraction_done +- boinc_gui.exe priority should be lower (?), + launched app priorities should be very low +- on final panel of install, add checkbox to let user view readme +- Time to completion isn't too accurate, + this is more of an Astropulse problem involving fraction_done - Report problems page on maggie doesn't link to anything -- Host stats incorrectly reports number of times connected with same ip (unconfirmed) -- CPU time updates infrequently (every 10 seconds), should there be a user control for this? -- Client treats URL "maggie/ap/" different than URL "maggie/ap", though this isn't really a bug it might be good to fix anyway -- Astropulse uses a lot of memory (~70 MB) b/c of the dispersion table, should this be decreased? +- Host stats incorrectly reports number of times connected with same ip + (unconfirmed) +- CPU time updates infrequently (every 10 seconds), + should there be a user control for this? 
+- Client treats URL "maggie/ap/" different than URL "maggie/ap", + though this isn't really a bug it might be good to fix anyway +- Astropulse uses a lot of memory (~70 MB) b/c of the dispersion table, + should this be decreased? - CPU time for a completed workunit is incorrect (unconfirmed) +- verify that if file xfer is interrupted, it resumes at right place + (and progress bar is correct) - client died quickly on Mandrake 9.0 linux (unconfirmed) - make pie chart colors/labels easier to understand - need a way to refresh prefs from client - columns expand when window expands +- % done display screwed up in progress bar; also dark color is hard to read +- Download speed is not as fast as it should be +- Result status should say "downloading files", "uploading files", etc. +- result % done should have 2 digits after dec pt ----------------------- HIGH-PRIORITY (should do for beta test) ----------------------- + +Windows: put text (e.g. column headings, menus) in a file + so that people can develop alternate versions of the file + for different languages. + +Windows: use messages rather than interrupt for timing + make get_local_ip_addr() work in all cases + est_time_to_completion doesn't work for non-running tasks run backend programs (validate/file_deleter/assimilate)