diff --git a/checkin_notes b/checkin_notes
index c45e0d4810..0f7da882e3 100755
--- a/checkin_notes
+++ b/checkin_notes
@@ -23737,3 +23737,38 @@ Bruce 2 Feb 2005
     sched/
         server_types.C
+
+Bruce 2 Feb 2005
+    - Improvements to the file deletion mechanism.  Now try removing files if
+      no work was sent to the host and available space < 0, OR if available
+      space > 0 but work was infeasible because the disk bound requirements
+      of the work exceeded the available space.
+    - Added a new config.xml boolean element called 'choose_download_url_by_timezone'.
+      This requires that projects provide a 2-column file in the project root named
+      'download_servers'.  An example is:
+          3600     http://einstein.aei.mpg.de
+          -21600   http://einstein.phys.uwm.edu
+      The first column is the offset from UTC in seconds, and the second column is
+      the URL of the download server.  When enabled, the scheduler will replace the
+      download path for data and executables with a list of download URLs, ordered
+      by proximity to the host's timezone.  The download path must start with the
+      BOINC default download/, and the different download servers must have identical
+      file paths under download/; in other words, they must be mirrored.
+    - Really exciting news, David: I have finally gotten emacs to obey your
+      indentation and formatting conventions, apart from doing this:
+          some_function(foo, bar
+                        );
+      instead of this:
+          some_function(foo, bar
+          );
+      Emacs experts, advice appreciated!
+
+    doc/
+        configuration.php
+    sched/
+        sched_config.h
+        sched_config.C
+        sched_send.C
+        handle_request.C
+
+
diff --git a/doc/configuration.php b/doc/configuration.php
index aa96fa0f59..0bb4479a23 100644
--- a/doc/configuration.php
+++ b/doc/configuration.php
@@ -34,16 +34,17 @@ htmlspecialchars("
     http://A/URL
     /path/to/directory
-    [ ]
-    [ ]
-    [ ]
-    [ N ]
-    [ N ]
-    [ N ]
-    [ ]
-    [ ]
-    [ N ]
-    [ N ]
+    [ ]
+    [ ]
+    [ N ]
+    [ N ]
+    [ N ]
+    [ ]
+    [ ]
+    [ N ]
+    [ N ]
     ../
@@ -194,7 +195,20 @@ list_item("min_core_client_version",
     a version number less than this, it returns an error message and
     doesn't do any other processing."
 );
-
+list_item("choose_download_url_by_timezone",
+    "When the scheduler sends work to hosts, it replaces the download
+    URL appearing in the data and executable file descriptions with
+    the download URL closest to the host's timezone.  The project
+    must provide a two-column file called 'download_servers' in the
+    project root directory.  This is a list of all download servers
+    that will be inserted when work is sent to hosts.  The first column
+    is an integer giving the server's offset in seconds from UTC.
+    The second column is the server URL, for example
+    http://einstein.phys.uwm.edu.  The download servers must
+    have identical file hierarchies and contents, and the path to
+    data files and executables must start with '/download/...' as in
+    'http://einstein.phys.uwm.edu/download/123/some_file_name'."
+);
 list_end();
 
 // THE INFORMATION BELOW NEEDS TO BE ORGANIZED AND PUT INTO TABLES OR SOME OTHER LESS CRAMPED FORM
 
diff --git a/sched/handle_request.C b/sched/handle_request.C
index cbb4f5fa58..a8f8d9ad92 100644
--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@@ -908,7 +908,7 @@ extern double watch_diskspace[3];
 int delete_file_from_host(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply) {
     int nfiles = (int)sreq.file_infos.size();
     char buf[256];
-    
+
     if (!nfiles) {
         log_messages.printf(
             SCHED_MSG_LOG::CRITICAL,
@@ -949,11 +949,10 @@ int delete_file_from_host(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply) {
             SCHED_MSG_LOG::DEBUG, "[HOST#%d]: delete file %s (make space)\n",
             sreply.host.id, fi.name
         );
-        // give host an hour to nuke the file and come back. This might
+        // give host 4 hours to nuke the file and come back. This might
         // in general be too soon, since host needs to complete any work
         // that depends upon this file, before it will be removed by core client.
         //
-        
         sprintf(buf, "Removing file %s to free up disk space", fi.name);
         USER_MESSAGE um(buf, "low");
         sreply.insert_message(um);
@@ -970,11 +969,11 @@ void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *t
     }
 
     sprintf(tmpfilename, "sched_reply_%06d_%06d", sreq.hostid, sreq.rpc_seqno);
-    // use _XXXXXX if you want random filenames rather than deterministic
-    // mkstemp(tmpfilename);
-    
+    // use _XXXXXX if you want random filenames rather than
+    // deterministic mkstemp(tmpfilename);
+
     fp=fopen(tmpfilename, "w");
-    
+
     if (!fp) {
         log_messages.printf(
             SCHED_MSG_LOG::CRITICAL,
@@ -982,15 +981,15 @@ void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *t
         );
         return;
     }
-    
+
     log_messages.printf(
         SCHED_MSG_LOG::DEBUG, "Found %s, so writing %s\n", trigger, tmpfilename
     );
-    
+
     sreply.write(fp);
     fclose(fp);
-    
+
     sprintf(tmpfilename, "sched_request_%06d_%06d", sreq.hostid, sreq.rpc_seqno);
     fp=fopen(tmpfilename, "w");
@@ -1040,21 +1039,26 @@ void handle_request(
         sreply.insert_message(um);
         sreply.nucleus_only = true;
     }
-    
+
     // if we got no work, and we have no file space, delete some files
-    if (sreply.results.size()==0 && max_allowable_disk(sreq)<0) {
-        // try to delete a file to make more space. Also give some
-        // hints to the user about what's going wrong (lack of disk
-        // space).
-        delete_file_from_host(sreq, sreply);
+    if (sreply.results.size()==0 && (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0)) {
+        // try to delete a file to make more space. Also give some
+        // hints to the user about what's going wrong (lack of disk
+        // space).
+        delete_file_from_host(sreq, sreply);
     }
-    
+
 #if 1
-    if (sreply.results.size()==0) {
+    // You can call debug_sched() for whatever situation is of
+    // interest to you.  It won't do anything unless you create
+    // (touch) the file 'debug_sched' in the project root directory.
+    //
+    if (sreply.results.size()==0 && sreply.hostid && sreq.work_req_seconds>1.0)
+        debug_sched(sreq, sreply, "../debug_sched");
+    else if (max_allowable_disk(sreq)<0 || (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0))
         debug_sched(sreq, sreply, "../debug_sched");
-    }
 #endif
-    
+
     sreply.write(fout);
 }
 
diff --git a/sched/sched_config.C b/sched/sched_config.C
index b107842c52..f5aec5b5a8 100644
--- a/sched/sched_config.C
+++ b/sched/sched_config.C
@@ -90,6 +90,8 @@ int SCHED_CONFIG::parse(char* buf) {
         parse_int(buf, "<locality_scheduling_wait_period>", locality_scheduling_wait_period);
         parse_int(buf, "<locality_scheduling_send_timeout>", locality_scheduling_send_timeout);
         parse_int(buf, "<min_core_client_version>", min_core_client_version);
+        parse_bool(buf, "choose_download_url_by_timezone", choose_download_url_by_timezone);
+
         if (match_tag(buf, "")) {
             char hostname[256];
             gethostname(hostname, 256);
diff --git a/sched/sched_config.h b/sched/sched_config.h
index 18c0267c57..acf85b0b9e 100644
--- a/sched/sched_config.h
+++ b/sched/sched_config.h
@@ -59,6 +59,7 @@ public:
     int locality_scheduling_wait_period;
     int locality_scheduling_send_timeout;
     int min_core_client_version;
+    bool choose_download_url_by_timezone;
 
     int parse(char*);
     int parse_file(char* dir=".");
diff --git a/sched/sched_send.C b/sched/sched_send.C
index 1b3f76a1c8..18069899b1 100644
--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@@ -23,6 +23,8 @@
 #include 
 #include 
 #include 
+#include 
+
 using namespace std;
 
 #include "error_numbers.h"
@@ -34,6 +36,7 @@ using namespace std;
 #include "sched_msgs.h"
 #include "sched_send.h"
 #include "sched_locality.h"
+#include "../lib/parse.h"
 
 #ifdef _USING_FCGI_
 #include "fcgi_stdio.h"
@@ -229,12 +232,186 @@ int insert_after(char* buffer, char* after, char* text) {
     return 0;
 }
 
+typedef struct urltag {
+    int zone;
+    char name[124];
+} URLTYPE;
+
+
+// these global variables are needed to pass information into the
+// compare function below.
+static int tzone=0;
+static int hostid=0;
+
+static int compare(const void *x, const void *y) {
+    URLTYPE *a=(URLTYPE *)x;
+    URLTYPE *b=(URLTYPE *)y;
+
+    char longname[512];
+
+    if (abs(tzone - (a->zone)) < abs(tzone - (b->zone)))
+        return -1;
+
+    if (abs(tzone - (a->zone)) > abs(tzone - (b->zone)))
+        return +1;
+
+    // In order to ensure uniform distribution, we hash paths that are
+    // equidistant from the host's timezone in a way that gives a
+    // unique ordering for each host but which is effectively random
+    // between hosts.
+    sprintf(longname, "%s%d", a->name, hostid);
+    std::string sa = md5_string((const unsigned char *)longname, strlen((const char *)longname));
+    sprintf(longname, "%s%d", b->name, hostid);
+    std::string sb = md5_string((const unsigned char *)longname, strlen((const char *)longname));
+    int xa = strtol(sa.substr(1, 7).c_str(), 0, 16);
+    int xb = strtol(sb.substr(1, 7).c_str(), 0, 16);
+
+    if (xa < xb)
+        return -1;
+
+    if (xa > xb)
+        return 1;
+
+    return 0;
+}
+
+// cached contents of the project's 'download_servers' file
+//
+static URLTYPE *cached=NULL;
+#define BLOCKSIZE 32
+
+URLTYPE* read_download_list() {
+    FILE *fp;
+    int count=0;
+    int i;
+
+    if (cached)
+        return cached;
+
+    if (!(fp=fopen("../download_servers", "r")))
+        return NULL;
+
+    // read in lines from file
+    while (1) {
+        // allocate memory in blocks
+        if ((count % BLOCKSIZE)==0) {
+            cached=(URLTYPE *)realloc(cached, (count+BLOCKSIZE)*sizeof(URLTYPE));
+            if (!cached)
+                return NULL;
+        }
+        // read timezone offset and URL from file, and store in cache
+        // list
+        if (2==fscanf(fp, "%d %s", &(cached[count].zone), cached[count].name))
+            count++;
+        else {
+            // provide a null terminator so we don't need to keep
+            // another global variable for count.
+            cached[count].name[0]='\0';
+            break;
+        }
+    }
+    fclose(fp);
+
+    if (!count) {
+        log_messages.printf(
+            SCHED_MSG_LOG::CRITICAL,
+            "file ../download_servers contained no valid entries!\n"
+            "format of this file is one or more lines containing:\n"
+            "TIMEZONE_OFFSET http://some.url.path\n"
+        );
+        free(cached);
+        return NULL;
+    }
+
+    // sort URLs by distance from host timezone. See compare() above
+    // for details.
+    qsort(cached, count, sizeof(URLTYPE), compare);
+
+    log_messages.printf(
+        SCHED_MSG_LOG::DEBUG, "Sorted list of URLs follows\n"
+    );
+    for (i=0; i<count; i++) {
+        log_messages.printf(
+            SCHED_MSG_LOG::DEBUG, "zone=%d url=%s\n", cached[i].zone, cached[i].name
+        );
+    }
+    return cached;
+}
+
+// write the list of <url> elements for the given file path, in
+// timezone-sorted order; returns the number of bytes written into
+// the buffer, or a negative value to indicate an error
+//
+static int make_download_list(char *buffer, char *path, int timezone) {
+    char *start=buffer;
+    int i;
+
+    // global variable used by the compare() function above
+    tzone=timezone;
+
+    URLTYPE *serverlist=read_download_list();
+    if (!serverlist)
+        return -1;
+
+    // print list of servers in sorted order
+    for (i=0; strlen(serverlist[i].name); i++)
+        start+=sprintf(start, "%s<url>%s/%s</url>", i?"\n    ":"", serverlist[i].name, path);
+
+    // make a second copy in the same order
+    for (i=0; strlen(serverlist[i].name); i++)
+        start+=sprintf(start, "%s<url>%s/%s</url>", "\n    ", serverlist[i].name, path);
+
+    return (start-buffer);
+}
+
+// returns zero on success, non-zero to indicate an error
+//
+int add_download_servers(char *old_xml, char *new_xml, int timezone) {
+    char *p, *q, *r;
+
+    p=r=old_xml;
+
+    // search for next <url> to do surgery on
+    while ((q=strstr(p, "<url>"))) {
+        char *s;
+        char path[1024];
+        int len = q-p;
+
+        strncpy(new_xml, p, len);
+
+        new_xml += len;
+
+        // locate next instance of </url>
+        if (!(r=strstr(q, "</url>")))
+            return 1;
+        r += strlen("</url>");
+
+        // parse out the URL
+        if (!parse_str(q, "<url>", path, 1024))
+            return 1;
+
+        // find start of 'download/'
+        if (!(s=strstr(path,"download/")))
+            return 1;
+
+        // insert new download list in place of the original one
+        len = make_download_list(new_xml, s, timezone);
+        if (len<0)
+            return 1;
+        new_xml += len;
+
+        // advance pointer to start looking for next <url> tag.
+        p=r;
+    }
+
+    strcpy(new_xml, r);
+    return 0;
+}
+
 // add elements to WU's xml_doc, in preparation for sending
 // it to a client
 //
 int insert_wu_tags(WORKUNIT& wu, APP& app) {
     char buf[LARGE_BLOB_SIZE];
-    
+
     sprintf(buf,
         " %f\n"
         " %f\n"
@@ -305,18 +482,40 @@ int add_wu_to_reply(
     APP* app, APP_VERSION* avp
 ) {
     int retval;
-    WORKUNIT wu2;
-    
+    WORKUNIT wu2, wu3;
+
     // add the app, app_version, and workunit to the reply,
     // but only if they aren't already there
     //
     if (avp) {
+        APP_VERSION av2=*avp, *avp2=&av2;
+
+        if (config.choose_download_url_by_timezone) {
+            // replace the download URL for apps with a list of
+            // multiple download servers.
+
+            // set these global variables, needed by the compare()
+            // function so that the download URL list can be sorted by
+            // timezone
+            tzone=reply.host.timezone;
+            hostid=reply.host.id;
+
+            retval = add_download_servers(avp->xml_doc, av2.xml_doc, reply.host.timezone);
+            if (retval) {
+                log_messages.printf(
+                    SCHED_MSG_LOG::CRITICAL,
+                    "add_download_servers(to APP version) failed\n"
+                );
+                // restore original app version!
+                av2=*avp;
+            }
+        }
+
         reply.insert_app_unique(*app);
-        reply.insert_app_version_unique(*avp);
+        reply.insert_app_version_unique(*avp2);
         log_messages.printf(
             SCHED_MSG_LOG::DEBUG,
             "[HOST#%d] Sending app_version %s %s %d\n",
-            reply.host.id, app->name, platform.name, avp->version_num
+            reply.host.id, app->name, platform.name, avp2->version_num
         );
     }
 
@@ -328,7 +527,29 @@ int add_wu_to_reply(
         log_messages.printf(SCHED_MSG_LOG::NORMAL, "insert_wu_tags failed\n");
         return retval;
     }
-    reply.insert_workunit_unique(wu2);
+    wu3=wu2;
+    if (config.choose_download_url_by_timezone) {
+        // replace the download URL for WU files with a list of
+        // multiple download servers.
+
+        // set these global variables, needed by the compare()
+        // function so that the download URL list can be sorted by
+        // timezone
+        tzone=reply.host.timezone;
+        hostid=reply.host.id;
+
+        retval = add_download_servers(wu2.xml_doc, wu3.xml_doc, reply.host.timezone);
+        if (retval) {
+            log_messages.printf(
+                SCHED_MSG_LOG::CRITICAL,
+                "add_download_servers(to WU) failed\n"
+            );
+            // restore original WU!
+            wu3=wu2;
+        }
+    }
+
+    reply.insert_workunit_unique(wu3);
     return 0;
 }
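
Note (illustration only, not part of the patch): to see the URL ordering that
choose_download_url_by_timezone produces without running a scheduler, the minimal
standalone sketch below reproduces the sorting rule from compare() above.  It
hard-codes the two example download_servers entries from the checkin notes, uses
an example host timezone and host ID, and substitutes std::hash for md5_string in
the tie-break; everything else follows the patch.

    // Standalone sketch: std::hash stands in for md5_string, and the server
    // list is hard-coded instead of being read from ../download_servers.
    #include <algorithm>
    #include <cstdio>
    #include <cstdlib>
    #include <functional>
    #include <string>
    #include <vector>

    struct Server {
        int zone;           // offset from UTC in seconds
        std::string url;    // base URL of the download mirror
    };

    int main() {
        std::vector<Server> servers = {
            {   3600, "http://einstein.aei.mpg.de"   },  // UTC+1
            { -21600, "http://einstein.phys.uwm.edu" },  // UTC-6
        };
        int host_tz = -28800;   // example: host in UTC-8
        int host_id = 12345;    // example host ID, used only to break ties

        // sort mirrors by distance from the host's timezone; break ties with
        // a per-host hash so equidistant mirrors are spread across hosts
        std::sort(servers.begin(), servers.end(),
            [&](const Server& a, const Server& b) {
                int da = std::abs(host_tz - a.zone);
                int db = std::abs(host_tz - b.zone);
                if (da != db) return da < db;
                std::hash<std::string> h;
                return h(a.url + std::to_string(host_id))
                     < h(b.url + std::to_string(host_id));
            }
        );

        // print the <url> list that would replace the single download URL
        const char* path = "download/123/some_file_name";
        for (size_t i = 0; i < servers.size(); i++) {
            printf("<url>%s/%s</url>\n", servers[i].url.c_str(), path);
        }
        return 0;
    }

For the UTC-8 host in this example, einstein.phys.uwm.edu (UTC-6) sorts ahead of
einstein.aei.mpg.de (UTC+1); two mirrors at equal distance come back in an order
that varies from host to host, which is what the md5-based tie-break in compare()
is for.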