back off if no work

svn path=/trunk/boinc/; revision=793
This commit is contained in:
David Anderson 2003-01-02 23:12:05 +00:00
parent 38e20515b0
commit 44ea1f7fc1
11 changed files with 647 additions and 169 deletions

View File

@ -2802,4 +2802,23 @@ David Dec 23 2002
test_masterurl_failure.php
test_sched_failure.php
David Jan 2 2003
- Back off on a project if we make a scheduler RPC asking for work
and don't get any
- fixed "http_proxy_name" XML parsing problem in client
- added drand() function.
NOTE: always make repeated code blocks into functions
- make_work was using uninitialized seqno, caused negative
numbers in filenames
client/
client_state.C,h
cs_scheduler.C
scheduler_op.C
doc/
credit.html
lib/
util.C,h
sched/
make_work.C
validate.C

View File

@ -514,13 +514,13 @@ int CLIENT_STATE::parse_state_file() {
} else if (match_tag(buf, "<core_client_minor_version>")) {
// TODO: handle old client state file if different version
} else if (match_tag(buf, "<confirm_before_connect/>")) {
global_prefs.confirm_before_connecting = true;
global_prefs.confirm_before_connecting = true;
} else if (match_tag(buf, "<hangup_if_dialed/>")) {
global_prefs.hangup_if_dialed = true;
global_prefs.hangup_if_dialed = true;
} else if (match_tag(buf, "<use_http_proxy/>")) {
use_http_proxy = true;
} else if (parse_str(buf, "<http_proxy_server>", proxy_server_name, sizeof(proxy_server_name))) {
} else if (parse_int(buf, "<http_proxy_port>", proxy_server_port)) {
use_http_proxy = true;
} else if (parse_str(buf, "<proxy_server_name>", proxy_server_name, sizeof(proxy_server_name))) {
} else if (parse_int(buf, "<proxy_server_port>", proxy_server_port)) {
} else {
fprintf(stderr, "CLIENT_STATE::parse_state_file: unrecognized: %s\n", buf);
retval = ERR_XML_PARSE;
@ -580,7 +580,9 @@ int CLIENT_STATE::write_state_file() {
core_client_major_version,
core_client_minor_version
);
// save proxy info
// save proxy and preferences info
//
fprintf(f,
"%s"
"%s"
@ -878,11 +880,18 @@ bool CLIENT_STATE::garbage_collect() {
} else {
// See if the files for this result's workunit had
// any errors (MD5, RSA, etc)
//
if(rp->wup->had_failure(failnum)) {
// If we don't already have an error for this file
if (rp->state < RESULT_READY_TO_ACK) {
// the result wu corresponding to this result had an error downloading some input file(s).
report_project_error(*rp,0,"The work_unit corresponding to this result had an error",CLIENT_DOWNLOADING);
// the wu corresponding to this result
// had an error downloading some input file(s).
//
report_project_error(
*rp,0,
"The work_unit corresponding to this result had an error",
CLIENT_DOWNLOADING
);
}
} else {
rp->wup->ref_cnt++;
@ -895,9 +904,13 @@ bool CLIENT_STATE::garbage_collect() {
//
if(rp->output_files[i].file_info->had_failure(failnum)) {
if (rp->state < RESULT_READY_TO_ACK) {
// had an error uploading a file for this result
// had an error uploading a file for this result
//
rp->client_state = CLIENT_UPLOADING;
report_project_error(*rp,0,"The outputfile corresponding to this result had an error",CLIENT_UPLOADING);
report_project_error(*rp,0,
"An output file of this result had an error",
CLIENT_UPLOADING
);
}
} else {
rp->output_files[i].file_info->ref_cnt++;

View File

@ -141,7 +141,7 @@ public:
PROJECT* next_project_master_pending();
double work_needed_secs();
int make_scheduler_request(PROJECT*, double);
void handle_scheduler_reply(PROJECT*, char* scheduler_url);
int handle_scheduler_reply(PROJECT*, char* scheduler_url, int& nresults);
void set_client_state_dirty(char*);
int report_project_error( RESULT &res,int err_num, char *err_msg, int state);
// flag a result as having an error

View File

@ -287,8 +287,8 @@ bool CLIENT_STATE::scheduler_rpc_poll() {
// Handle the reply from a scheduler
//
void CLIENT_STATE::handle_scheduler_reply(
PROJECT* project, char* scheduler_url
int CLIENT_STATE::handle_scheduler_reply(
PROJECT* project, char* scheduler_url, int& nresults
) {
SCHEDULER_REPLY sr;
FILE* f;
@ -296,6 +296,7 @@ void CLIENT_STATE::handle_scheduler_reply(
unsigned int i;
bool signature_valid;
nresults = 0;
contacted_sched_server = true;
if (log_flags.sched_op_debug) {
f = fopen(SCHED_OP_RESULT_FILE, "r");
@ -306,7 +307,8 @@ void CLIENT_STATE::handle_scheduler_reply(
}
f = fopen(SCHED_OP_RESULT_FILE, "r");
retval = sr.parse(f);
if (!f) return ERR_FOPEN;
retval = sr.parse(f); // check return?
fclose(f);
if (strlen(sr.project_name)) {
@ -326,10 +328,10 @@ void CLIENT_STATE::handle_scheduler_reply(
project->min_rpc_time = time(0) + sr.request_delay;
}
project->host_total_credit = sr.host_total_credit;
project->host_expavg_credit = sr.host_expavg_credit;
if (sr.hostid) {
project->hostid = sr.hostid;
project->host_total_credit = sr.host_total_credit;
project->host_expavg_credit = sr.host_expavg_credit;
project->host_create_time = sr.host_create_time;
project->rpc_seqno = 0;
}
@ -339,6 +341,7 @@ void CLIENT_STATE::handle_scheduler_reply(
//
if (sr.global_prefs_xml) {
f = fopen(GLOBAL_PREFS_FILE_NAME, "w");
if (!f) return ERR_FOPEN;
fprintf(f,
"<global_preferences>\n"
" <source_project>%s</source_project>\n"
@ -358,6 +361,7 @@ void CLIENT_STATE::handle_scheduler_reply(
if (sr.project_prefs_xml) {
char path[256];
f = fopen(TEMP_FILE_NAME, "w");
if (!f) return ERR_FOPEN;
fprintf(f,
"<account>\n"
" <master_url>%s</master_url>\n"
@ -371,7 +375,9 @@ void CLIENT_STATE::handle_scheduler_reply(
fclose(f);
get_account_filename(project->master_url, path);
retval = boinc_rename(TEMP_FILE_NAME, path);
if (retval) return ERR_RENAME;
f = fopen(path, "r");
if (!f) return ERR_FOPEN;
project->parse_account(f);
fclose(f);
}
@ -455,6 +461,7 @@ void CLIENT_STATE::handle_scheduler_reply(
retval = link_result(project, rp);
if (!retval) results.push_back(rp);
rp->state = RESULT_NEW;
nresults++;
}
}
@ -479,4 +486,5 @@ void CLIENT_STATE::handle_scheduler_reply(
printf("State after handle_scheduler_reply():\n");
print_counts();
}
return 0;
}

View File

@ -16,17 +16,20 @@
//
// Contributor(s):
//
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include "util.h"
#include "parse.h"
#include "error_numbers.h"
#include "client_state.h"
#include "client_types.h"
#include "error_numbers.h"
#include "file_names.h"
#include "log_flags.h"
#include "parse.h"
#include "scheduler_op.h"
SCHEDULER_OP::SCHEDULER_OP(HTTP_OP_SET* h) {
@ -45,16 +48,17 @@ int SCHEDULER_OP::init_get_work() {
must_get_work = true;
project = gstate.next_project(0);
if (project) {
if( (retval=init_op_project(ns)) ) {
retval = init_op_project(ns);
if (retval) {
sprintf(err_msg, "init_get_work failed, error %d\n", retval);
backoff(project, err_msg);
return retval;
}
}
else {
} else {
project = gstate.next_project_master_pending();
if (project) {
if ((retval=init_master_fetch(project))) {
retval = init_master_fetch(project);
if (retval) {
sprintf(err_msg, "init_master_fetch failed, error %d\n", retval);
backoff(project, err_msg);
}
@ -78,6 +82,7 @@ int SCHEDULER_OP::init_return_results(PROJECT* p, double ns) {
//
int SCHEDULER_OP::init_op_project(double ns) {
int retval;
char err_msg[256];
if (log_flags.sched_op_debug) {
printf("init_op_project: starting op for %s\n", project->master_url);
@ -88,15 +93,24 @@ int SCHEDULER_OP::init_op_project(double ns) {
//
if (project->scheduler_urls.size() == 0) {
retval = init_master_fetch(project);
return retval;
goto done;
}
url_index = 0;
retval = gstate.make_scheduler_request(project, ns);
if (retval) {
fprintf(stderr, "make_scheduler_request: %d\n", retval);
return retval;
goto done;
}
return start_rpc();
retval = start_rpc();
done:
if (retval) {
sprintf(err_msg,
"scheduler init_op_project to %s failed, error %d\n",
project->scheduler_urls[url_index].text, retval
);
backoff(project, err_msg);
}
return retval;
}
// Set a project's min RPC time to something in the future,
@ -109,23 +123,19 @@ int SCHEDULER_OP::set_min_rpc_time(PROJECT* p) {
int n = p->nrpc_failures;
if (n > RETRY_CAP) n = RETRY_CAP;
// we've hit the limit on master_url fetches
if(p->master_fetch_failures >= MASTER_FETCH_RETRY_CAP) {
//
if (p->master_fetch_failures >= MASTER_FETCH_RETRY_CAP) {
if (log_flags.sched_op_debug) {
printf("we've hit the limit on master_url fetches\n");
}
//backoff e^MASTER_FETCH_INTERVAL * random
exp_backoff = (int) exp(((double)rand()/(double)RAND_MAX)*MASTER_FETCH_INTERVAL);
exp_backoff = (int) exp(drand()*MASTER_FETCH_INTERVAL);
p->min_rpc_time = time(0) + exp_backoff;
} else {
x = RETRY_BASE_PERIOD * exp(drand() * n);
exp_backoff = (int)max(SCHED_RETRY_DELAY_MIN,min(SCHED_RETRY_DELAY_MAX,(int) x));
p->min_rpc_time = time(0) + exp_backoff;
}
else {
//backoff RETRY_BASE_PERIOD * e^nrpc_failures * random
x = RETRY_BASE_PERIOD * exp(((double)rand()/(double)RAND_MAX) * n);
exp_backoff = (int)max(SCHED_RETRY_DELAY_MIN,min(SCHED_RETRY_DELAY_MAX,(int) x));
p->min_rpc_time = time(0) + exp_backoff;
}
if (log_flags.sched_op_debug) {
printf(
@ -148,8 +158,11 @@ int SCHEDULER_OP::backoff( PROJECT* p, char *error_msg ) {
p->master_url_fetch_pending = true;
set_min_rpc_time(p);
return 0;
}
// if nrpc failures a multiple of master_fetch_period, then set master_url_fetch_pending and initialize again
}
// if nrpc failures a multiple of master_fetch_period,
// then set master_url_fetch_pending and initialize again
//
if (p->nrpc_failures == MASTER_FETCH_PERIOD) {
p->master_url_fetch_pending = true;
p->min_rpc_time = 0;
@ -164,6 +177,7 @@ int SCHEDULER_OP::backoff( PROJECT* p, char *error_msg ) {
}
// low-level routine to initiate an RPC
// If successful, creates an HTTP_OP that must be polled
//
int SCHEDULER_OP::start_rpc() {
FILE *f;
@ -242,10 +256,12 @@ int SCHEDULER_OP::parse_master_file(vector<STRING256> &urls) {
printf("Parsed master file; got %d scheduler URLs\n", (int)urls.size());
}
//if couldn't find any urls in the master file.
if((int) urls.size() == 0)
return -1;
// couldn't find any urls in the master file?
//
if ((int) urls.size() == 0) {
return -1;
}
return 0;
}
@ -280,9 +296,9 @@ bool SCHEDULER_OP::update_urls(PROJECT& project, vector<STRING256> &urls) {
// poll routine. If an operation is in progress, check for completion
//
bool SCHEDULER_OP::poll() {
int retval;
int retval, nresults;
vector<STRING256> urls;
bool changed, scheduler_op_done, get_master_success;
bool changed, scheduler_op_done;
bool action = false;
char err_msg[256],*err_url;
@ -290,7 +306,6 @@ bool SCHEDULER_OP::poll() {
case SCHEDULER_OP_STATE_GET_MASTER:
// here we're fetching the master file for a project
//
get_master_success = true;
if (http_op.http_op_state == HTTP_STATE_DONE) {
action = true;
project->master_url_fetch_pending = false;
@ -303,35 +318,49 @@ bool SCHEDULER_OP::poll() {
);
}
retval = parse_master_file(urls);
if (retval == 0) {
if (retval) {
// master file parse failed.
//
project->master_fetch_failures++;
backoff(project, "Master file parse failed\n");
err_url = project->master_url;
} else {
// everything succeeded. Clear error counters
//
changed = update_urls(*project, urls);
if (changed) {
project->min_rpc_time = 0;
project->nrpc_failures = 0;
project->master_fetch_failures = 0;
}
} else {
// master file parse failed. treat like RPC error
//
project->master_fetch_failures++;
backoff(project, "Master file parse failed\n");
get_master_success = false;
err_url = project->master_url;
}
}
} else {
// fetch of master file failed. Treat like RPC error
// master file fetch failed.
//
project->master_fetch_failures++;
project->master_fetch_failures++;
backoff(project, "Master file fetch failed\n");
get_master_success = false;
err_url = project->master_url;
}
// If don't have any schedulers for this project,
// it may be the wrong URL. notify the user
//
if (project->scheduler_urls.size() == 0) {
sprintf(err_msg,
"Could not contact %s. Make sure this is the correct project URL.",
err_url
);
show_message(err_msg, "high");
}
// See if need to read master file for another project
//
project = gstate.next_project_master_pending();
if (project) {
if ((retval = init_master_fetch(project))) {
project->master_fetch_failures++;
retval = init_master_fetch(project);
if (retval) {
project->master_fetch_failures++;
backoff(project, "Master file fetch failed\n");
get_master_success = false;
err_url = project->master_url;
}
} else {
@ -340,18 +369,12 @@ bool SCHEDULER_OP::poll() {
printf("Scheduler_op: return to idle state\n");
}
}
// If we haven't been able to successfully get the master URL file
// recently then notify the user
if (!get_master_success) {
char buf[256];
sprintf(buf, "Could not contact %s. Make sure this is the correct project URL.",
err_url);
show_message( buf, "high" );
}
}
break;
case SCHEDULER_OP_STATE_RPC:
// here we're doing a scheduler RPC to some project
// here we're doing a scheduler RPC
//
scheduler_op_done = false;
if (http_op.http_op_state == HTTP_STATE_DONE) {
@ -364,26 +387,22 @@ bool SCHEDULER_OP::poll() {
project->scheduler_urls[url_index].text
);
}
// scheduler RPC failed. Try another scheduler if one exists
//
url_index++;
if (url_index < project->scheduler_urls.size()) {
start_rpc();
} else {
backoff(project,"");
backoff(project, "No schedulers responded");
if (must_get_work) {
project = gstate.next_project(project);
if (project) {
if( (retval=init_op_project(gstate.work_needed_secs())) ) {
sprintf( err_msg,
"scheduler init_op_project to %s failed, error %d\n",
project->scheduler_urls[url_index].text, retval
);
backoff(project, err_msg);
}
retval = init_op_project(gstate.work_needed_secs());
} else {
scheduler_op_done = true;
}
}
}
else {
scheduler_op_done = true;
}
@ -395,21 +414,27 @@ bool SCHEDULER_OP::poll() {
project->scheduler_urls[url_index].text
);
}
project->nrpc_failures = 0;
project->min_rpc_time = 0;
gstate.handle_scheduler_reply(project, scheduler_url);
gstate.handle_scheduler_reply(project, scheduler_url, nresults);
// if we asked for work and didn't get any,
// back off this project
//
if (must_get_work && nresults==0) {
backoff(project, "No work from project\n");
} else {
project->nrpc_failures = 0;
project->min_rpc_time = 0;
}
// if we didn't get all the work we needed,
// ask another project for work
//
if (must_get_work) {
double x = gstate.work_needed_secs();
if (x > 0) {
project = gstate.next_project(project);
if (project) {
if( (retval=init_op_project(x)) ) {
sprintf( err_msg,
"scheduler init_op_project to %s failed, error %d\n",
project->scheduler_urls[url_index].text, retval
);
backoff(project, err_msg);
}
retval = init_op_project(x);
} else {
scheduler_op_done = true;
}
@ -421,10 +446,14 @@ bool SCHEDULER_OP::poll() {
}
}
}
// If no outstanding ops, see if need a master fetch
//
if (scheduler_op_done) {
project = gstate.next_project_master_pending();
project = gstate.next_project_master_pending();
if (project) {
if ((retval = init_master_fetch(project))) {
retval = init_master_fetch(project);
if (retval) {
if (log_flags.sched_op_debug) {
printf("Scheduler op: init_master_fetch failed.\n" );
}
@ -472,7 +501,7 @@ int SCHEDULER_REPLY::parse(FILE* in) {
code_sign_key_signature = 0;
p = fgets(buf, 256, in);
// First part of content should either be tag (HTTP 1.0) or
// First part of content should either be tag (HTTP 1.0) or
// hex length of response (HTTP 1.1)
if (!match_tag(buf, "<scheduler_reply>")) {
fprintf(stderr, "SCHEDULER_REPLY::parse(): bad first tag %s\n", buf);

View File

@ -12,17 +12,13 @@ ranked by credit.
<p>
BOINC's credit system is based on a "reference computer" that does
<ul>
<li>1 billion floating-point multiply/adds per second
<li>1 billion integer multiply/adds per second
<li>1 billion floating-point multiplies per second
<li>1 billion integer multiplies per second
<li>4 billion bytes per second of traffic to and from main memory
(sequential, half reads and half writes)
</ul>
BOINC's unit of credit, the <b>Cobblestone</b>,
is one day of CPU time on the reference computer.
(Credit should ideally reflect network transfer and disk storage as well
as computation.
But it's hard to verify these activities,
so for now they aren't included.)
<p>
Each project maintains two types of credit:
@ -37,3 +33,31 @@ This average decreases by a factor of two every week.
<p>
Both types of credit (total and recent average)
are maintained for each user and host.
<h3>Leader boards</h3>
The PHP pages supplied by BOINC include basic leaderboards:
top users and hosts, by total and average.
BOINC lets projects export the credit-related
parts of their database as XML files.
These XML files can be used to generate
other breakdowns of users, hosts and teams,
or to generate leaderboards based on the sum of
credit from different projects.
<h3>Possible future improvements</h3>
<ul>
<li>
Ideally, credit should reflect network transfer and disk storage as well
as computation.
But it's hard to verify these activities,
so for now they aren't included.
<li>
Eventually projects will develop applications that use
graphics coprocessors or other non-CPU hardware.
Credit should reflect the usage of such hardware.
To accomplish this, we will need to let
projects supply their own benchmarking functions.
This will also handle the situation where a project's
application does e.g. all integer arithmetic.
</ul>

View File

@ -18,6 +18,7 @@
//
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#include <sys/types.h>
@ -176,3 +177,6 @@ int lock_file(char* filename) {
return retval;
}
// Return a uniformly distributed pseudo-random double in [0, 1],
// based on the C library rand() generator.
//
double drand() {
    double numerator = (double)rand();
    double denominator = (double)RAND_MAX;
    return numerator / denominator;
}

View File

@ -24,6 +24,7 @@ extern double dtime();
extern void boinc_sleep( int seconds );
extern int parse_command_line( char *, char ** );
extern int lock_file(char*);
extern double drand();
#ifndef max
#define max(a,b) (((a) > (b)) ? (a) : (b))

View File

@ -46,37 +46,37 @@ int cushion = 10;
int redundancy = 10;
char wu_name[256], result_template_file[256];
void replace_file_name(char * xml_doc, char * filename, char * new_filename,char * download_url)
{
char buf[MAX_BLOB_SIZE], temp[256], download_path[256], new_download_path[256];
char * p;
// edit a WU XML doc, replacing one filename by another
// (should appear twice, within <file_info> and <file_ref>)
// Also patch the download URL (redundant)
//
void replace_file_name(
char* xml_doc, char* filename, char* new_filename, char* download_url
) {
char buf[MAX_BLOB_SIZE], temp[256], download_path[256],
new_download_path[256];
char * p;
sprintf(download_path,"%s/%s",download_url,filename);
sprintf(new_download_path,"%s/%s",download_url,new_filename);
strcpy(buf,xml_doc);
p = strtok(buf,"\n");
while (p) {
if (parse_str(p, "<name>", temp, sizeof(temp))) {
if(!strcmp(filename, temp))
{
replace_element(xml_doc + (p - buf),"<name>","</name>",new_filename);
}
sprintf(download_path,"%s/%s", download_url, filename);
sprintf(new_download_path,"%s/%s", download_url, new_filename);
strcpy(buf, xml_doc);
p = strtok(buf,"\n");
while (p) {
if (parse_str(p, "<name>", temp, sizeof(temp))) {
if(!strcmp(filename, temp)) {
replace_element(xml_doc + (p - buf),"<name>","</name>",new_filename);
}
} else if (parse_str(p, "<file_name>", temp, sizeof(temp))) {
if(!strcmp(filename, temp)) {
replace_element(xml_doc + (p - buf),"<file_name>","</file_name>",new_filename);
}
} else if (parse_str(p, "<url>", temp, sizeof(temp))) {
if(!strcmp(temp, download_path)) {
replace_element(xml_doc + (p - buf),"<url>","</url>",new_download_path);
}
}
p = strtok(0, "\n");
}
else if (parse_str(p, "<file_name>", temp, sizeof(temp))) {
if(!strcmp(filename, temp))
{
replace_element(xml_doc + (p - buf),"<file_name>","</file_name>",new_filename);
}
}
else if (parse_str(p, "<url>", temp, sizeof(temp))) {
if(!strcmp(temp, download_path))
{
replace_element(xml_doc + (p - buf),"<url>","</url>",new_download_path);
}
}
p = strtok(0, "\n");
}
}
void check_trigger() {
@ -88,12 +88,15 @@ void check_trigger() {
void make_work() {
CONFIG config;
char * p;
int retval, i, start_time=time(0), n, nresults_left;
char keypath[256], suffix[256], result_template[MAX_BLOB_SIZE], file_name[256], buf[MAX_BLOB_SIZE],pathname[256],new_file_name[256],new_pathname[256],command[256], starting_xml[MAX_BLOB_SIZE],new_buf[MAX_BLOB_SIZE];
int retval, start_time=time(0), n, nresults_left;
char keypath[256], suffix[256], result_template[MAX_BLOB_SIZE];
char file_name[256], buf[MAX_BLOB_SIZE], pathname[256];
char new_file_name[256], new_pathname[256], command[256];
char starting_xml[MAX_BLOB_SIZE], new_buf[MAX_BLOB_SIZE];
R_RSA_PRIVATE_KEY key;
WORKUNIT wu;
int seqno = 0;
retval = config.parse_file();
if (retval) {
fprintf(stderr,"make_work: can't read config file\n");
@ -135,39 +138,53 @@ void make_work() {
fprintf(stderr,"make_work: can't counts results\n");
exit(1);
}
printf("make_work: %d results\n", n);
printf("make_work: %d results available to send\n", n);
if (n > cushion) {
sleep(1);
continue;
}
// make a new workunit every "redundancy" results
//
if (nresults_left == 0) {
strcpy(buf,starting_xml);
p = strtok(buf, "\n");
strcpy(file_name, "");
while (p) {
if (parse_str(p, "<name>", file_name, sizeof(file_name))) {
sprintf(new_file_name,"%s_%d_%d",file_name,start_time,i++);
sprintf(pathname, "%s/%s", config.download_dir, file_name);
sprintf(new_pathname,"%s/%s",config.download_dir, new_file_name);
sprintf(command,"cp %s %s",pathname,new_pathname);
system(command);
strcpy(new_buf,starting_xml);
replace_file_name(new_buf,file_name,new_file_name,config.download_url);
strcpy(wu.xml_doc, new_buf);
}
p = strtok(0, "\n");
}
nresults_left = redundancy;
sprintf(wu.name, "wu_%d_%d", start_time, i++);
wu.id = 0;
wu.create_time = time(0);
retval = db_workunit_new(wu);
wu.id = db_insert_id();
strcpy(buf, starting_xml);
p = strtok(buf, "\n");
strcpy(file_name, "");
// make new copies of all the WU's input files
//
while (p) {
if (parse_str(p, "<name>", file_name, sizeof(file_name))) {
sprintf(
new_file_name, "%s_%d_%d", file_name, start_time, i++
);
sprintf(pathname, "%s/%s", config.download_dir, file_name);
sprintf(
new_pathname, "%s/%s",config.download_dir, new_file_name
);
sprintf(command,"cp %s %s",pathname,new_pathname);
if (system(command)) {
fprintf(stderr, "make_work: ERROR\n");
perror(command);
exit();
}
strcpy(new_buf, starting_xml);
replace_file_name(
new_buf, file_name, new_file_name, config.download_url
);
strcpy(wu.xml_doc, new_buf);
}
p = strtok(0, "\n");
}
nresults_left = redundancy;
sprintf(wu.name, "wu_%d_%d", start_time, seqno);
wu.id = 0;
wu.create_time = time(0);
retval = db_workunit_new(wu);
wu.id = db_insert_id();
}
sprintf(suffix, "%d_%d", start_time, i++);
sprintf(suffix, "%d_%d", start_time, seqno++);
create_result(
wu, result_template, suffix, key,
config.upload_url, config.download_url

333
stripchart/README Normal file
View File

@ -0,0 +1,333 @@
Stripchart version 2.0
----------------------
Author: Matt Lebofsky
BOINC/SETI@home - University of California, Berkeley
mattl@ssl.berkeley.edu
Date of recent version: November 4, 2002
Requirements:
* a gnuplot with the ability to generate gifs
* perl
* apache or another CGI-enabled web server
Send all thoughts and queries to: mattl@ssl.berkeley.edu
This software is free to edit, distribute and use by anybody, as long as
I get credit for it in some form or another. Thanks.
----------------------
Contents:
I. Some questions and answers
II. So how does it work?
III. Known bugs, things to do, etc.
----------------------
I. Some questions and answers
Q: What is stripchart?
A: Well, it's actually two relatively small perl programs:
1. stripchart
stripchart reads in time-based user data and, depending on a flurry of
command line options, generates a web-friendly .gif plotting the data.
The user can supply the time range, the y axis range, even the color
scheme, and more.
2. stripchart.cgi
stripchart.cgi is a web-based GUI interface that allows users to easily
select multiple data sources and various parameters to plot, allowing
fast comparisons without having to deal with a command line interface.
Q: Why do you bother writing this program?
A: Working as a systems administrator (amongst other things) for SETI@home,
we kept finding ourselves in dire problem-solving situations, i.e. Why
did the database stop working? Why is load on our web server so high?
So we started collecting data in flat files, keeping track of server
loads, database checkpoint times, even CPU temperatures. When these files
grew too large and unwieldy, I found myself writing (and rewriting) simple
scripts to generate plots on this data. Sick of constant revision whenever
a new problem arose, I wrote stripchart version 1.0.
Its usefulness became immediately apparent when I added on stripchart.cgi.
I couldn't bear to teach everybody the many command line options to
stripchart, so I wrote this CGI to do all the dirty work. Suddenly we were
able to line up several plots, look for causes and effects, or just enjoy
watching the counts in our database tables grow to impossibly high numbers.
The SETI@home network has proven to be a delicate system, and keeping track
of all the data server, user, and web statistics has proven to be quite a
life saver. So when BOINC came around we felt that any project aiming to
embark on a similar project may need this tool. So I rewrote stripchart to
be a bit more friendly and general.
Q: Why don't you make .pngs or .jpgs instead of .gifs? The latest gnuplot
doesn't support .gifs.
A: Basically gnuplot support for other graphic file formats isn't as good. For
example, you cannot control exact window size, font size, and colors unless
you make .gifs. I'm not exactly sure why this is the case, but there you have it.
Anywho, you can find older gnuplot distributions out there - you'll need to
get the gd libs first, by the way.
----------------------
II. So how does it work?
You can use stripchart as a stand alone command-line program to produce plots
whenever you like, but we highly recommend using it in conjunction with the
stripchart.cgi for ease of use. But here's how to do it both ways.
stripchart (stand alone)
Before anything, look at the section GLOBAL/DEFAULT VARS in the program
stripchart and see if you need to edit anything (usually pathnames to
executables and such).
Let's just start with the usage (obtained by typing "stripchart -h"):
stripchart: creates stripchart .gif graphic based on data in flat files
options:
-i: input FILE - name of input data file (mandatory)
-o: output FILE - name of output .gif file (default: STDOUT)
-O: output FILE - name of output .gif file and dump to STDOUT as well
-f: from TIME - stripchart with data starting at TIME
(default: 24 hours ago)
-t: to TIME - stripchart with data ending at TIME (default: now)
-r: range RANGE - stripchart data centered around "from" time the size
of RANGE (overrides -t)
-l: last LINES - stripchart last number of LINES in data file
(overrides -f and -t and -r)
-T: title TITLE - title to put on graphic (default: FILE RANGE)
-x: column X - time or "x" column (default: 2)
-y: column Y - value or "y" column (default: 3)
-Y: column Y' - overplot second "y" column (default: none)
-b: baseline VALUE - overplot baseline of arbitrary value VALUE
-B: baseline-avg - overrides -b, it plots baseline of computed average
-d: dump low VALUE - ignore data less than VALUE
-D: dump high VALUE - ignore data higher than VALUE
-v: verbose - puts verbose runtime output to STDERR
-L: log - makes y axis log scale
-c: colors "COLORS" - set gnuplot colors for graph/axis/fonts/data (default:
"xffffff x000000 xc0c0c0 x00a000 x0000a0 x2020c0"
in order: bground, axis/fonts, grids, pointcolor1,2,3)
-C: cgi - output CGI header to STDOUT if being called as CGI
-s: stats - turn extra plot stats on (current, avg, min, max)
-j: julian times - time columns is in local julian date (legacy stuff)
notes:
* TIME either unix date, julian date, or civil date in the form:
YYYY:MM:DD:HH:MM (year, month, day, hour, minute)
If you enter something with colons, it assumes it is civil date
If you have a decimal point, it assumes it is julian date
If it is an integer, it assumes it is unix date (epoch seconds)
If it is a negative number, it is in decimal days from current time
(i.e. -2.5 = two and a half days ago)
* All times on command line are assumed to be "local" times
* All times in the data file must be in unix date (epoch seconds)
* RANGE is given in decimal days (i.e. 1.25 = 1 day, 6 hours)
* if LINES == 0, (i.e. -l 0) then the whole data file is read in
* columns (given with -x, -y, -Y flags) start at 1
* titles given with -T can contain the following key words which will
be converted:
FILE - basename of input file
RANGE - pretty civil date range (in local time zone)
the default title is: FILE RANGE
...okay that's a lot to ingest, but it's really simple. Let's take a look at an
example (you'll find in the samples directory two files get_load and crontab).
Suppose you have a machine whose load you want to monitor. Here's a script
that outputs a single line containing two fields for the time and a third
field with the actual data. For example:
2002:11:05:12:51 1036529480 0.25
The first field is time in an arbitrary human readable format
(year:month:day:hour:minute), the second in epoch seconds (standard
unix time format - the number of seconds since 00:00 1/1/1970 GMT),
and the third is the load at this time.
And we'll start collecting data every five minutes on this particular machine
by adding a line like the following to the crontab:
0,5,10,15,20,25,30,35,40,45,50,55 * * * * /usr/local/stripchart/samples/get_load >> /disks/matt/data/machine_load
So the file "machine_load" will quickly fill with lines such as the above.
Now you may ask yourself - why two columns representing time in two different
formats? Well sometime you just want to look at the data file itself, in which
case the human-readable first column is quite handy to have around, but when
making linear time plots, having time in epoch seconds is much faster to
manipulate. So generally, we like to have at least the two time fields first,
and the actual data in the third column. That's what stripchart expects by
default.
Note: stripchart will understand time in both epoch seconds and julian date.
If the second time field is in julian date, you should supply the command line
flag "-j" to warn stripchart so it knows how to handle it.
Okay. So you have this data file now. A very common thing to plot would be the
data over the past 24 hours. Turns out that's the default! If you type on the
command line:
stripchart -i machine_load -o machine_load.gif
you will quickly get a new file "machine_load.gif" with all the goods.
Note: you always have to supply an input file via -i. If you don't supply
an output file via "-o", the .gif gets dumped to stdout. If you supply an
output file via "-O", the output is written both to the file and to stdout.
Now let's play with the time ranges. You can supply times in a variety of
formats on the command line:
"civil date" i.e. 2002:11:05:12:51 (YYYY:MM:DD:hh:mm)
"epoch seconds" i.e. 1036529480
"julian date" i.e. 2452583.52345
You can supply a date range using the -f and -t flags (from and to):
stripchart -i machine_load -f 2002:11:01:00:00 -t 2002:11:04:00:00
Usually the "to" time is right now, so you can quickly tell stripchart
to plot starting at some arbitrary time "ago." This is done also via the
"-f" flag - if it's negative it will assume you mean that many decimal
days from now as a starting point. So "-f -3.5" will plot from 3 and a
half days ago until now.
You can also supply a "range" centered around the from time. For example,
to plot the 24 hours centered around 2002:11:01:13:40:
stripchart -i machine_load -f 2002:11:01:13:40 -r 1
On some rare occasions you might want to plot the last number of lines
in a file, regardless of what time they were. If you supply the number
of lines via the "-l" flag, it overrides any time ranges you may have
supplied.
Moving on to some other useful flags in no particular order:
To change the default title (which is the basename of the file and
the time range being plotted), you can do so via the "-T" command.
Make sure to put the title in quotes. Within the title string the
all-uppercase string "FILE" will be replaced with the file basename,
and the string "RANGE" will be replaced by the time range. So in
essence, the default title string is "FILE RANGE".
If you have data files in different formats, you can specify the data
columns using the "-x" and "-y" flags. By default -x is 2 and -y is 3.
Sometimes we have datafiles with many columns so we actively have to tell
stripchart which is the correct data column.
However, you might want to overplot one column on top of another. If your
data file has a second data column, you can specify what that is via the
-Y flag, and this data will be overplotted onto the data from the first
data column.
Sometimes you want to plot a horizontal rule or a "baseline". You can
turn this feature on by specifying the value with the "-b" flag. If you
use the "-B" flag (without any values) it automatically computes the
average over the time range and plots that as the baseline. Simple!
If you want to excise certain y values, you can do so with the dump
flags, i.e. "-d" and "-D". In particular, any values lower than the one
supplied with "-d" will be dumped, and any values higher than the one
supplied by "-D" will be dumped.
To log the y axis, use the "-L" flag. Quite straightforward.
A very useful flag is "-s" which outputs a line of stats underneath
the plot title. It shows the current value, and the minimum, maximum
and average values during the plot range.
For verbose output to stderr, use the "-v" flag. It may not make much
sense, but it's useful for debugging.
Using the "-C" flag causes stripchart to spit out the "Content-type"
lines necessary for incorporating stripchart plots into CGIs. This
doesn't work so well now, but there it is.
Okay. That's enough about the flags, and hopefully enough to get you
playing around with stripchart and plotting some stuff. Now onto:
stripchart.cgi
First and foremost, you need to do the following before running the
CGI version of stripchart:
1. Put stripchart.cgi in a cgi-enabled web-accessible directory
2. Make a "lib" directory somewhere that the web server can read/write to
3. Edit stripchart.cgi GLOBAL/DEFAULT VARS to point to proper paths, including
the files "querylist" and "datafiles" in the aforementioned "lib" directory.
4. Edit the "lib/datafiles" file to contain entries for all your data files.
You can find an example datafiles in the samples directory. Follow the
instructions in the comment lines, adding your entries below the header.
That should be it, I think. Now go to the URL wherever your stripchart.cgi
is sitting. If all is well..
You will be immediately presented with a web form. Ignore the "select query"
pulldown menu for now. Underneath that you will see a line:
Number of stripcharts: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
By default stripchart.cgi presents you with the ability to plot 4 simultaneous
stripcharts, but you can select any number 1-20 by clicking on those numbers.
The less plots, the faster a web page gets generated.
For each plot, you get a pull down menu which should contain all the entries
you already put in "datafiles". Here you are selecting your data source.
Then you can select the type of time range: last x hours, last x days, or
an arbitrary date range. By default the last x hours radio button is selected -
to pick another type of time range make sure you select the radio button
before it. Then enter the range via the pull down menus.
Then you get a simple list of checkbox/input options. You can check to log
the y axis, baseline the average, baseline an arbitrary value (which you
enter in the window), enter a y minimum, or enter a y maximum.
When everything is selected, click on the "click here" button to plot.
Depending on the speed of your machine, you should soon be presented with
all the plots you desired, and the form underneath the plots which you can
edit to your heart's content. If you want to reset the form values, click
on the "reset form" link.
Note the "save images in /tmp" checkbox. If that is checked and you plot
the stripcharts, numbered .gif files will be placed in /tmp on the web
server machine so you can copy them elsewhere (files will be named:
stripchart_plot_1.gif, etc.).
On the topmost "click here" button you will note an "enter name to save
query" balloon. If you enter a name here (any old string) this exact query
will be saved into the "querylist" file which will then later appear in the
pulldown menu at the top. That way if you have a favorite set of diagnostic
plots which you check every morning, you don't have to enter the entire form
every time.
If you want to delete a query, enter the name in that same field but click
the "delete" checkbox next to it. Next time you "click here" the query will
be deleted.
----------------------
III. Known bugs, things to do, etc.
* stripchart -C flag is kind of pointless and doesn't work in practice.
* plots on data collected over small time ranges (points every few seconds, for
example) hasn't been tested.
* plots that don't work via stripchart.cgi either show ugly broken image icons
or nothing at all - either way it's ungraceful.
* pulldown menus and various plots sometimes need to be refreshed via a hard
refresh (i.e. shift-refresh).
* this readme kinda stinks.
* and many many other issues I'm failing to detail now!
If you have any problems using the product, feel free to e-mail me at:
mattl@ssl.berkeley.edu

52
todo
View File

@ -1,32 +1,62 @@
-----------------------
BUGS (arranged from high to low priority)
-----------------------
- Suspend/resume not fully functional on Windows, no way to suspend/resume on UNIX
- Suspend/resume not fully functional on Windows,
no way to suspend/resume on UNIX
- Currently, if there are multiple CPUs they work on the same result
- "Show Graphics" menu item brings up minimized window, client does not remember window size/pos after close/reopen, window closes and does not reopen when workunit finishes and new workunit starts
- No easy way to quit projects on UNIX
- "Show Graphics" menu item brings up minimized window,
client does not remember window size/pos after close/reopen,
window closes and does not reopen when workunit finishes
and new workunit starts
- No easy way to quit/add projects on UNIX
- Write a README file
what should it say?
- "ACTIVE_TASK.check_app_status_files: could not delete
slots\0\fraction_done.xml: -110" appears in stderr.txt on Windows
- "no work available" appears sporadically though work is eventually assigned,
not sure if it is assigned immediately or on next RPC
- Should include option in Windows client or installer whether to
run client at startup or not
- Screensaver "blank screen" functionality not implemented
- "ACTIVE_TASK.check_app_status_files: could not delete slots\0\fraction_done.xml: -110" appears in stderr.txt on Windows
- "no work available" appears sporadically though work is eventually assigned, not sure if it is assigned immediately or on next RPC
- Should include option in Windows client or installer whether to run client at startup or not
- Scheduler reply includes blank lines that XML parser complains about
- Time to completion isn't too accurate, this is more of an Astropulse problem involving fraction_done
- boinc_gui.exe priority should be lower (?),
launched app priorities should be very low
- on final panel of install, add checkbox to let user view readme
- Time to completion isn't too accurate,
this is more of an Astropulse problem involving fraction_done
- Report problems page on maggie doesn't link to anything
- Host stats incorrectly reports number of times connected with same ip (unconfirmed)
- CPU time updates infrequently (every 10 seconds), should there be a user control for this?
- Client treats URL "maggie/ap/" different than URL "maggie/ap", though this isn't really a bug it might be good to fix anyway
- Astropulse uses a lot of memory (~70 MB) b/c of the dispersion table, should this be decreased?
- Host stats incorrectly reports number of times connected with same ip
(unconfirmed)
- CPU time updates infrequently (every 10 seconds),
should there be a user control for this?
- Client treats URL "maggie/ap/" different than URL "maggie/ap",
though this isn't really a bug it might be good to fix anyway
- Astropulse uses a lot of memory (~70 MB) b/c of the dispersion table,
should this be decreased?
- CPU time for a completed workunit is incorrect (unconfirmed)
- verify that if file xfer is interrupted, it resumes at right place
(and progress bar is correct)
- client died quickly on Mandrake 9.0 linux (unconfirmed)
- make pie chart colors/labels easier to understand
- need a way to refresh prefs from client
- columns expand when window expands
- % done display screwed up in progress bar; also dark color is hard to read
- Download speed is not as fast as it should be
- Result status should say "downloading files", "uploading files", etc.
- result % done should have 2 digits after dec pt
-----------------------
HIGH-PRIORITY (should do for beta test)
-----------------------
Windows: put text (e.g. column headings, menus) in a file
so that people can develop alternate versions of the file
for different languages.
Windows: use messages rather than interrupt for timing
make get_local_ip_addr() work in all cases
est_time_to_completion doesn't work for non-running tasks
run backend programs (validate/file_deleter/assimilate)