Improvements to the file deletion mechanism. Now try removing files if
no work was sent to hosts, and available space < 0; OR if available space > 0
      but work was unfeasible because the disk bound requirements of the work
      exceeded the available space.
Added a new config.xml boolean element called 'choose_download_url_by_timezone'
      This requires that projects provide a 2-column file in the project root named
      'download_servers'.  An example is:
           3600   http://einstein.aei.mpg.de
         -21600   http://einstein.phys.uwm.edu
      The first column is offset from UTC in seconds, and the second column is the URL
      of the download server.  When enabled, the scheduler will replace the download
      path for data and executables by a list of download URLs, ordered by proximity
      to the host's timezone.  The download path must start with the
      BOINC default download/ and the different download servers must have identical
      file paths under download/, in other words they must be mirrored.

svn path=/trunk/boinc/; revision=5275
This commit is contained in:
Bruce Allen 2005-02-02 18:13:00 +00:00
parent 02d0ac6fd8
commit a27705fd59
6 changed files with 314 additions and 37 deletions

View File

@ -23737,3 +23737,38 @@ Bruce 2 Feb 2005
sched/
server_types.C
Bruce 2 Feb 2005
- Improvements to the file deletion mechanism. Now try removing files if
no work was sent to hosts, and available space<0 OR if available space>0
but work was unfeasible because the disk bound requirements of the work
exceeded the available space.
- Added a new config.xml boolean element called 'choose_download_url_by_timezone'
This requires that projects provide a 2-column file in the project root named
'download_servers'. An example is:
3600 http://einstein.aei.mpg.de
-21600 http://einstein.phys.uwm.edu
The first column is offset from UTC in seconds, and the second column is the URL
of the download server. When enabled, the scheduler will replace the download
path for data and executables by a list of download URLs, ordered by proximity
to the host's timezone. The download path must start with the
BOINC default download/ and the different download servers must have identical
file paths under download/, in other words they must be mirrored.
- Really exciting news, David: I have finally gotten emacs to obey your
indentations and formatting conventions, apart from doing this:
some_function(foo, bar
);
instead of this:
some_function(foo, bar
);
Emacs experts, advice appreciated!
download/
configuration.php
sched/
sched_config.h
sched_config.C
sched_send.C
handle_request.C

View File

@ -34,16 +34,17 @@ htmlspecialchars("
<stripchart_cgi_url> http://A/URL </stripchart_cgi_url>
<log_dir> /path/to/directory </log_dir>
[ <disable_account_creation/> ]
[ <show_results/> ]
[ <one_result_per_user_per_wu/> ]
[ <max_wus_to_send> N </max_wus_to_send> ]
[ <min_sendwork_interval> N </min_sendwork_interval> ]
[ <daily_result_quota> N </daily_result_quota> ]
[ <enforce_delay_bound/> ]
[ <locality_scheduling/> ]
[ <locality_scheduling_wait_period> N </locality_scheduling_wait_period> ]
[ <min_core_client_version> N </min_core_client_version> ]
[ <disable_account_creation/> ]
[ <show_results/> ]
[ <one_result_per_user_per_wu/> ]
[ <max_wus_to_send> N </max_wus_to_send> ]
[ <min_sendwork_interval> N </min_sendwork_interval> ]
[ <daily_result_quota> N </daily_result_quota> ]
[ <enforce_delay_bound/> ]
[ <locality_scheduling/> ]
[ <locality_scheduling_wait_period> N </locality_scheduling_wait_period> ]
[ <min_core_client_version> N </min_core_client_version> ]
[ <choose_download_url_by_timezone/> ]
<!-- optional; defaults as indicated: -->
<project_dir> ../ </project_dir> <!-- relative to location of 'start' -->
@ -194,7 +195,20 @@ list_item("min_core_client_version",
a version number less than this,
it returns an error message and doesn't do any other processing."
);
list_item("choose_download_url_by_timezone",
"When the scheduler sends work to hosts, it replaces the download
URL appearing in the data and executable file descriptions with
the download URL closest to the host's timezone. The project
must provide a two-column file called 'download_servers' in the
project root directory. This is a list of all download servers
that will be inserted when work is sent to hosts. The first column
is an integer listing the server's offset in seconds from UTC.
The second column is the server URL in the format such as
http://einstein.phys.uwm.edu. The download servers must
have identical file hierarchies and contents, and the path to
file and executables must start with '/download/...' as in
'http://einstein.phys.uwm.edu/download/123/some_file_name'."
);
list_end();
// THE INFORMATION BELOW NEEDS TO BE ORGANIZED AND PUT INTO TABLES OR SOME OTHER LESS CRAMPED FORM

View File

@ -908,7 +908,7 @@ extern double watch_diskspace[3];
int delete_file_from_host(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply) {
int nfiles = (int)sreq.file_infos.size();
char buf[256];
if (!nfiles) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
@ -949,11 +949,10 @@ int delete_file_from_host(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply) {
SCHED_MSG_LOG::DEBUG,
"[HOST#%d]: delete file %s (make space)\n", sreply.host.id, fi.name
);
// give host an hour to nuke the file and come back. This might
// give host 4 hours to nuke the file and come back. This might
// in general be too soon, since host needs to complete any work
// that depends upon this file, before it will be removed by core client.
//
sprintf(buf, "Removing file %s to free up disk space", fi.name);
USER_MESSAGE um(buf, "low");
sreply.insert_message(um);
@ -970,11 +969,11 @@ void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *t
}
sprintf(tmpfilename, "sched_reply_%06d_%06d", sreq.hostid, sreq.rpc_seqno);
// use _XXXXXX if you want random filenames rather than deterministic
// mkstemp(tmpfilename);
// use _XXXXXX if you want random filenames rather than
// deterministic mkstemp(tmpfilename);
fp=fopen(tmpfilename, "w");
if (!fp) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
@ -982,15 +981,15 @@ void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *t
);
return;
}
log_messages.printf(
SCHED_MSG_LOG::DEBUG,
"Found %s, so writing %s\n", trigger, tmpfilename
);
sreply.write(fp);
fclose(fp);
sprintf(tmpfilename, "sched_request_%06d_%06d", sreq.hostid, sreq.rpc_seqno);
fp=fopen(tmpfilename, "w");
@ -1040,21 +1039,26 @@ void handle_request(
sreply.insert_message(um);
sreply.nucleus_only = true;
}
// if we got no work, and we have no file space, delete some files
if (sreply.results.size()==0 && max_allowable_disk(sreq)<0) {
// try to delete a file to make more space. Also give some
// hints to the user about what's going wrong (lack of disk
// space).
delete_file_from_host(sreq, sreply);
if (sreply.results.size()==0 && (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0)) {
// try to delete a file to make more space. Also give some
// hints to the user about what's going wrong (lack of disk
// space).
delete_file_from_host(sreq, sreply);
}
#if 1
if (sreply.results.size()==0) {
// You can call debug_sched() for whatever situation is of
// interest to you. It won't do anything unless you create
// (touch) the file 'debug_sched' in the project root directory.
//
if (sreply.results.size()==0 && sreply.hostid && sreq.work_req_seconds>1.0)
debug_sched(sreq, sreply, "../debug_sched");
else if (max_allowable_disk(sreq)<0 || (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0))
debug_sched(sreq, sreply, "../debug_sched");
}
#endif
sreply.write(fout);
}

View File

@ -90,6 +90,8 @@ int SCHED_CONFIG::parse(char* buf) {
parse_int(buf, "<locality_scheduling_wait_period>", locality_scheduling_wait_period);
parse_int(buf, "<locality_scheduling_send_timeout>", locality_scheduling_send_timeout);
parse_int(buf, "<min_core_client_version>", min_core_client_version);
parse_bool(buf, "choose_download_url_by_timezone", choose_download_url_by_timezone);
if (match_tag(buf, "</config>")) {
char hostname[256];
gethostname(hostname, 256);

View File

@ -59,6 +59,7 @@ public:
int locality_scheduling_wait_period;
int locality_scheduling_send_timeout;
int min_core_client_version;
bool choose_download_url_by_timezone;
int parse(char*);
int parse_file(char* dir=".");

View File

@ -23,6 +23,8 @@
#include <string>
#include <ctime>
#include <cstdio>
#include <stdlib.h>
using namespace std;
#include "error_numbers.h"
@ -34,6 +36,7 @@ using namespace std;
#include "sched_msgs.h"
#include "sched_send.h"
#include "sched_locality.h"
#include "../lib/parse.h"
#ifdef _USING_FCGI_
#include "fcgi_stdio.h"
@ -229,12 +232,186 @@ int insert_after(char* buffer, char* after, char* text) {
return 0;
}
// One download server, as read from the project's 'download_servers'
// file: its offset from UTC in seconds, and its URL.
typedef struct urltag {
int zone;
char name[124];
} URLTYPE;
// these global variables are needed to pass information into the
// compare function below.
static int tzone=0;   // host's timezone offset from UTC, in seconds
static int hostid=0;  // host's DB id; used to randomize tie-breaking
static int compare(const void *x, const void *y) {
URLTYPE *a=(URLTYPE *)x;
URLTYPE *b=(URLTYPE *)y;
char longname[512];
if (abs(tzone - (a->zone))<abs(tzone - (b->zone)))
return -1;
if (abs(tzone - (a->zone))>abs(tzone - (b->zone)))
return +1;
// In order to ensure uniform distribution, we hash paths that are
// equidistant from the host's timezone in a way that gives a
// unique ordering for each host but which is effectively random
// between hosts.
sprintf(longname, "%s%d", a->name, hostid);
std::string sa = md5_string((const unsigned char *)longname, strlen((const char *)longname));
sprintf(longname, "%s%d", b->name, hostid);
std::string sb = md5_string((const unsigned char *)longname, strlen((const char *)longname));
int xa = strtol(sa.substr(1, 7).c_str(), 0, 16);
int xb = strtol(sb.substr(1, 7).c_str(), 0, 16);
if (xa<xb)
return -1;
if (xa>xb)
return 1;
return 0;
}
// Cached, timezone-sorted list of download servers, read once from the
// '../download_servers' file by read_download_list() below.  The list
// is terminated by an entry with an empty name.
//
static URLTYPE *cached=NULL;
// the cache array grows in chunks of this many entries
#define BLOCKSIZE 32
// Read the project's list of download servers from the file
// '../download_servers' and sort it by distance from the host's
// timezone (see compare() above).  The result is cached; subsequent
// calls return the cached list.
//
// File format: one or more lines of "TIMEZONE_OFFSET URL", e.g.
//     3600 http://einstein.aei.mpg.de
//
// Returns a pointer to the cached array (terminated by an entry whose
// name is the empty string), or NULL on error.
//
URLTYPE* read_download_list(){
    FILE *fp;
    int count=0;
    int i;

    if (cached)
        return cached;

    if (!(fp=fopen("../download_servers", "r")))
        return NULL;

    // read in lines from file
    while (1) {
        // allocate memory in blocks
        if ((count % BLOCKSIZE)==0) {
            URLTYPE *tmp=(URLTYPE *)realloc(cached, (count+BLOCKSIZE)*sizeof(URLTYPE));
            if (!tmp) {
                // don't leak the old block or the open file on
                // allocation failure
                free(cached);
                cached=NULL;
                fclose(fp);
                return NULL;
            }
            cached=tmp;
        }
        // read timezone offset and URL from file, and store in cache
        // list.  Bound the URL width (%123s) so an over-long line
        // can't overflow name[124].
        if (2==fscanf(fp, "%d %123s", &(cached[count].zone), cached[count].name))
            count++;
        else {
            // provide a null terminator so we don't need to keep
            // another global variable for count.
            cached[count].name[0]='\0';
            break;
        }
    }
    fclose(fp);

    if (!count) {
        log_messages.printf(
            SCHED_MSG_LOG::CRITICAL,
            "file ../download_servers contained no valid entries!\n"
            "format of this file is one or more lines containing:\n"
            "TIMEZONE_OFFSET http://some.url.path\n"
        );
        free(cached);
        // reset the cache so a later call re-reads the file instead
        // of returning a dangling pointer to freed memory
        cached=NULL;
        return NULL;
    }

    // sort URLs by distance from host timezone. See compare() above
    // for details.
    qsort(cached, count, sizeof(URLTYPE), compare);

    log_messages.printf(
        SCHED_MSG_LOG::DEBUG, "Sorted list of URLs follows\n"
    );
    for (i=0; i<count; i++) {
        log_messages.printf(
            SCHED_MSG_LOG::DEBUG, "zone=%d name=%s\n", cached[i].zone, cached[i].name
        );
    }
    return cached;
}
// return number of bytes written, or <0 to indicate an error
//
int make_download_list(char *buffer, char *path, int timezone) {
char *start=buffer;
int i;
// global variable used in the compare() function
tzone=timezone;
URLTYPE *serverlist=read_download_list();
if (!serverlist)
return -1;
// print list of servers in sorted order. Space is to format them
// nicely
for (i=0; strlen(serverlist[i].name); i++)
start+=sprintf(start, "%s<url>%s/%s</url>", i?"\n ":"", serverlist[i].name, path);
// make a second copy in the same order
for (i=0; strlen(serverlist[i].name); i++)
start+=sprintf(start, "%s<url>%s/%s</url>", "\n ", serverlist[i].name, path);
return (start-buffer);
}
// returns zero on success, non-zero to indicate an error
//
// Copy old_xml into new_xml, replacing each single <url>...</url>
// element with the full timezone-sorted list of download servers
// produced by make_download_list().  Each original URL must contain
// 'download/'; its path from that point on is reused for every
// mirror.  Caller must ensure new_xml is large enough; on error
// new_xml may be left partially written.
int add_download_servers(char *old_xml, char *new_xml, int timezone) {
char *p, *q, *r;
p=r=old_xml;
// search for next URL to do surgery on
while ((q=strstr(p, "<url>"))) {
char *s;
char path[1024];
int len = q-p;
// copy everything up to (but not including) the <url> tag verbatim
strncpy(new_xml, p, len);
new_xml += len;
// locate next instance of </url>
if (!(r=strstr(q, "</url>")))
return 1;
// r now points just past the element being replaced
r += strlen("</url>");
// parse out the URL
if (!parse_str(q, "<url>", path, 1024))
return 1;
// find start of 'download/'
if (!(s=strstr(path,"download/")))
return 1;
// insert new download list in place of the original one
len = make_download_list(new_xml, s, timezone);
if (len<0)
return 1;
new_xml += len;
// advance pointer to start looking for next <url> tag.
p=r;
}
// copy the tail after the last </url> (or the entire input if no
// <url> element was found)
strcpy(new_xml, r);
return 0;
}
// add elements to WU's xml_doc, in preparation for sending
// it to a client
//
int insert_wu_tags(WORKUNIT& wu, APP& app) {
char buf[LARGE_BLOB_SIZE];
sprintf(buf,
" <rsc_fpops_est>%f</rsc_fpops_est>\n"
" <rsc_fpops_bound>%f</rsc_fpops_bound>\n"
@ -305,18 +482,40 @@ int add_wu_to_reply(
APP* app, APP_VERSION* avp
) {
int retval;
WORKUNIT wu2;
WORKUNIT wu2, wu3;
// add the app, app_version, and workunit to the reply,
// but only if they aren't already there
//
if (avp) {
APP_VERSION av2=*avp, *avp2=&av2;
if (config.choose_download_url_by_timezone) {
// replace the download URL for apps with a list of
// multiple download servers.
// set these global variables, needed by the compare()
// function so that the download URL list can be sorted by
// timezone
tzone=reply.host.timezone;
hostid=reply.host.id;
retval = add_download_servers(avp->xml_doc, av2.xml_doc, reply.host.timezone);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
"add_download_servers(to APP version) failed\n"
);
// restore original WU!
av2=*avp;
}
}
reply.insert_app_unique(*app);
reply.insert_app_version_unique(*avp);
reply.insert_app_version_unique(*avp2);
log_messages.printf(
SCHED_MSG_LOG::DEBUG,
"[HOST#%d] Sending app_version %s %s %d\n",
reply.host.id, app->name, platform.name, avp->version_num
reply.host.id, app->name, platform.name, avp2->version_num
);
}
@ -328,7 +527,29 @@ int add_wu_to_reply(
log_messages.printf(SCHED_MSG_LOG::NORMAL, "insert_wu_tags failed\n");
return retval;
}
reply.insert_workunit_unique(wu2);
wu3=wu2;
if (config.choose_download_url_by_timezone) {
// replace the download URL for WU files with a list of
// multiple download servers.
// set these global variables, needed by the compare()
// function so that the download URL list can be sorted by
// timezone
tzone=reply.host.timezone;
hostid=reply.host.id;
retval = add_download_servers(wu2.xml_doc, wu3.xml_doc, reply.host.timezone);
if (retval) {
log_messages.printf(
SCHED_MSG_LOG::CRITICAL,
"add_download_servers(to WU) failed\n"
);
// restore original WU!
wu3=wu2;
}
}
reply.insert_workunit_unique(wu3);
return 0;
}