- Overdue checkins on a bunch of minor scheduler code. Some that is

E@H specific is now included (but protected by #ifdef EINSTEIN_AT_HOME) to make it simpler for me to maintain consistency with BOINC cvs. - Added project-specific unacceptable_os() function for rejecting hosts. - Transitioner and scheduler now initialize host.max_results_day correctly in database under all circumstances. - Browser requests are now correctly identified (REQUEST_METHOD=="GET") and properly redirected. This was broken. David, please see comment near one of the probable_user_browser=true in handle_request.C. I think something is wrong here (or I am missing the point!). - More info about requests is logged - If the scheduler hangs (incoming request incomplete) it will normally be killed by Apache after a timeout. But this happens silently. So I now install a signal handler and catch this SIGTERM. In this case an error message is logged and all open files are flushed before exit(1)ing. - If IO is passed through files, check that request length and content length agree and log a message if they do NOT. - active_frac not correctly reported by 4.19 and earlier core clients. Adjust for this in estimating wallclock execution times. - Added a small block into validator code to attach a debugger. svn path=/trunk/boinc/; revision=5688
2005-03-19 18:24:24 +00:00 · 2005-03-19 18:24:24 +00:00 · 58f5b86288
parent 94d0e9b63f
commit 58f5b86288
5 changed files with 210 additions and 25 deletions
--- a/31
+++ b/31
@ -26028,3 +26028,34 @@ David  19 Mar 2005
        license_1.0.txt (removed)
    win_build/installerv2/redist/0409/
        eula.rtf
+
+Bruce  19 Mar 2005
+    - Overdue checkins on a bunch of minor scheduler code.  Some that is
+      E@H specific is now included (but protected by
+      #ifdef EINSTEIN_AT_HOME)
+      to make it simpler for me to maintain consistency with BOINC cvs.
+    - Added project-specific unacceptable_os() function for rejecting hosts.
+    - Transitioner and scheduler now initialize host.max_results_day correctly
+      in database under all circumstances.
+    - Browser requests are now correctly identified (REQUEST_METHOD=="GET") and
+      properly redirected.  This was broken. David, please see comment near
+      one of the probable_user_browser=true in handle_request.C.  I think
+      something is wrong here (or I am missing the point!).
+    - More info about requests is logged
+    - If the scheduler hangs (incoming request incomplete) it will normally be
+      killed by Apache after a timeout.  But this happens silently.  So I now
+      install a signal handler and catch this SIGTERM.  In this case an
+      error message is logged and all open files are flushed before exit(1)ing.
+    - If IO is passed through files, check that request length and content length
+      agree and log a message if they do NOT.
+    - active_frac not correctly reported by 4.19 and earlier core clients.
+      Adjust for this in estimating wallclock execution times.
+    - Added a small block into validator code to attach a debugger.
+
+    sched/
+        handle_request.C
+        sched_send.C
+        main.C
+        transitioner.C
+        validator.C
+
--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@ -99,6 +99,11 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
            goto lookup_user_and_make_new_host;
        }
        reply.host = host;
+        log_messages.printf(
+                SCHED_MSG_LOG::DEBUG,
+                "Request [HOST#%d] Database [HOST#%d] Request [RPC#%d] Database [RPC#%d]\n",
+                sreq.hostid, host.id, sreq.rpc_seqno, host.rpc_seqno
+            );

        strlcpy(
            user.authenticator, sreq.authenticator,
@ -513,7 +518,7 @@ int handle_results(
        srip->exit_status = rp->exit_status;
        srip->app_version_num = rp->app_version_num;
        srip->claimed_credit = rp->cpu_time * reply.host.credit_per_cpu_sec;
-#if 1
+#ifdef EINSTEIN_AT_HOME
        log_messages.printf(SCHED_MSG_LOG::DEBUG,
            "cpu %f cpcs %f, cc %f\n", srip->cpu_time, reply.host.credit_per_cpu_sec, srip->claimed_credit
        );
@ -715,6 +720,41 @@ void warn_user_if_core_client_upgrade_scheduled(
    return;
 }

+#ifdef EINSTEIN_AT_HOME  // project-specific host filter; builds without this macro get the always-false stub under #else
+bool unacceptable_os(  // returns true when the host's OS is rejected; process_request() then clears ok_to_send_work
+        SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
+) {
+    log_messages.printf(  // the reported OS name/version is logged for every request, accepted or not
+        SCHED_MSG_LOG::NORMAL,
+        "OS version %s %s\n",
+        sreq.host.os_name, sreq.host.os_version
+    );
+    // Reject hosts whose os_name is exactly "Darwin" and whose os_version starts with "5." or "6."
+    if (!strcmp(sreq.host.os_name, "Darwin") && 
+           (!strncmp(sreq.host.os_version, "5.", 2) || 
+            !strncmp(sreq.host.os_version, "6.", 2)
+           ) 
+        ) {
+        log_messages.printf(
+            SCHED_MSG_LOG::NORMAL,
+            "Unacceptable OS %s %s\n",
+            sreq.host.os_name, sreq.host.os_version
+        );
+        USER_MESSAGE um("Project only supports MacOS Darwin versions 7.X and above",
+                        "low");
+        reply.insert_message(um);
+        reply.set_delay(3600*24);  // 86400; presumably seconds (one day) -- confirm against set_delay()
+        return true;
+    }
+    return false;
+}
+#else
+bool unacceptable_os(  // stub: no OS is ever rejected; neither argument is used
+        SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
+) {
+    return false;
+}
+#endif // EINSTEIN_AT_HOME

 bool wrong_core_client_version(
    SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
@ -725,9 +765,7 @@ bool wrong_core_client_version(
        // TODO: check for user-agent not empty and not BOINC
        wrong_version = true;
        sprintf(msg,
-            "To participate in this project, "
-            "you must use major version %d of the BOINC core client. "
-            "Your core client is major version %d.",
+            "Need major version %d of the BOINC core client. You have %d.",
            BOINC_MAJOR_VERSION,
            sreq.core_client_major_version
        );
@ -743,9 +781,7 @@ bool wrong_core_client_version(
        if (sreq.core_client_minor_version < minor) {
            wrong_version = true;
            sprintf(msg,
-                "To participate in this project, "
-                "you must use version %d.%02d or higher of the BOINC core client.  "
-                "Your core client is version %d.%02d.",
+                "Need version %d.%02d or higher of the BOINC core client. You have %d.%02d.",
                major, minor,
                sreq.core_client_major_version, sreq.core_client_minor_version
            );
@ -760,6 +796,7 @@ bool wrong_core_client_version(
    if (wrong_version) {
        USER_MESSAGE um(msg, "low");
        reply.insert_message(um);
+        // IS THE FOLLOWING LINE CORRECT?  I DON'T UNDERSTAND IT.  Bruce
        reply.probable_user_browser = true;
        reply.set_delay(3600*24);
        return true;
@ -852,7 +889,7 @@ void process_request(

    // if different major version of BOINC, just send a message
    //
-    if (wrong_core_client_version(sreq, reply)) {
+    if (wrong_core_client_version(sreq, reply) || unacceptable_os(sreq, reply)) {
        ok_to_send_work = false;

        // if no results, return without accessing DB
@ -863,7 +900,7 @@ void process_request(
    } else {
        warn_user_if_core_client_upgrade_scheduled(sreq, reply);
    }
-  
+
    if (config.locality_scheduling) {
        have_no_work = false;
    } else {
@ -927,6 +964,13 @@ void process_request(
    rpc_time_tm = localtime((const time_t*)&reply.host.rpc_time);
    current_rpc_dayofyear = rpc_time_tm->tm_yday;

+    if (config.daily_result_quota) {
+        if (reply.host.max_results_day <= 0 || reply.host.max_results_day > config.daily_result_quota) {
+            reply.host.max_results_day = config.daily_result_quota;
+            log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Initializing max_results_day to %d\n", reply.host.id, config.daily_result_quota);
+        }
+    }
+
    if (last_rpc_dayofyear != current_rpc_dayofyear) {
        log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Resetting nresults_today\n", reply.host.id);
        reply.host.nresults_today = 0;
@ -1139,19 +1183,38 @@ void handle_request(
        );
        process_request(sreq, sreply, ss, code_sign_key);
    } else {
+        // BOINC scheduler requests use method POST.  So method GET
+        // means that someone is trying a browser.
+        char *rm=getenv("REQUEST_METHOD");
+        if (rm && !strcmp(rm, "GET")) sreply.probable_user_browser=true;
+        
        log_messages.printf(
            SCHED_MSG_LOG::NORMAL,
-            "Incomplete request received from IP %s, auth %s, platform %s, version %d.%02d\n",
+            "Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%02d\n",
+            sreply.probable_user_browser?"(probably a browser) ":"",
            get_remote_addr(), sreq.authenticator, sreq.platform_name,
            sreq.core_client_major_version, sreq.core_client_minor_version
        );
+        
        USER_MESSAGE um("Incomplete request received.", "low");
        sreply.insert_message(um);
        sreply.nucleus_only = true;
    }
+
+#ifdef EINSTEIN_AT_HOME
+    // for testing
+    if (sreply.user.id==3) {
+        USER_MESSAGE um("THIS IS A SHORT MESSAGE. \n AND ANOTHER", "high");
+        // USER_MESSAGE um("THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE. \n"
+	//		"THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "low");
+        sreply.insert_message(um);
+        // USER_MESSAGE um2("THIS IS A VERY LONG TEST MESSAGE2. THIS IS A VERY LONG TEST MESSAGE. \n"
+	//		"THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "high");
+        // sreply.insert_message(um2);
+    }
+#endif
    
    // if we got no work, and we have no file space, delete some files
-	//
    if (sreply.results.size()==0 && (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0)) {
        // try to delete a file to make more space.
        // Also give some hints to the user about what's going wrong
@ -1160,7 +1223,17 @@ void handle_request(
        delete_file_from_host(sreq, sreply);
    }
    
-#if 1
+    // write all messages to log file
+    for (unsigned int i=0; i<sreply.messages.size(); i++) {
+        USER_MESSAGE um = sreply.messages[i];
+        log_messages.printf(SCHED_MSG_LOG::DEBUG,
+            "[HOST#%d] MSG(%4s) %s \n", sreply.host.id, um.priority.c_str(), um.message.c_str()
+        );
+    }
+
+    debug_sched(sreq, sreply, "../debug_sched");
+
+#ifdef EINSTEIN_AT_HOME
    // You can call debug_sched() for whatever situation is of
    // interest to you.  It won't do anything unless you create
    // (touch) the file 'debug_sched' in the project root directory.
--- a/sched/main.C
+++ b/sched/main.C
@ -28,6 +28,8 @@
 using namespace std;

 #include <unistd.h>
+#include <signal.h>
+#include <errno.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/time.h>
@ -95,6 +97,48 @@ int open_database() {
    return 0;
 }

+void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *trigger); 
+
+// If the scheduler 'hangs', which it can do if a request is not fully processed
+// or some other problem arises, then Apache will send a SIGTERM to the cgi.
+// This signal handler ensures that rather than dying silently, the cgi process
+// will leave behind some record in the log file.
+// NOTE(review): printf-style logging, fflush() and exit() are not on the POSIX async-signal-safe list -- confirm this is acceptable here.
+void sigterm_handler(int signo) {
+   log_messages.printf(SCHED_MSG_LOG::CRITICAL, 
+       "BOINC scheduler (pid=%d) caught signal %d.  Exit(1)ing\n",
+       (int)getpid(), signo
+    );
+    fflush(NULL);  // a NULL argument flushes every open output stream
+    exit(1);
+    return;  // not reached: exit() does not return
+}
+
+void log_request_info(int& length) {  // logs request environment variables; writes length only when CONTENT_LENGTH is set
+    char *cl=getenv("CONTENT_LENGTH");
+    char *ri=getenv("REMOTE_ADDR");
+    char *rm=getenv("REQUEST_METHOD");
+    char *ct=getenv("CONTENT_TYPE");
+    char *ha=getenv("HTTP_ACCEPT");
+    char *hu=getenv("HTTP_USER_AGENT");
+
+    log_messages.printf(SCHED_MSG_LOG::DEBUG,
+        "REQUEST_METHOD=%s "
+        "CONTENT_TYPE=%s "
+        "HTTP_ACCEPT=%s "
+        "HTTP_USER_AGENT=%s\n",
+        rm?rm:"" , ct?ct:"", ha?ha:"", hu?hu:""  // unset variables are logged as empty strings
+    );
+    // Without CONTENT_LENGTH the caller's length is left unchanged and a CRITICAL message is logged.
+    if (!cl) {
+        log_messages.printf(SCHED_MSG_LOG::CRITICAL, "CONTENT_LENGTH environment variable not set\n");
+    }
+    else {
+        length=atoi(cl);  // atoi() does no error checking; text with no leading number yields 0
+        log_messages.printf(SCHED_MSG_LOG::DEBUG, "CONTENT_LENGTH=%d from %s\n", length, ri?ri:"[Unknown]");
+    }
+}
+
 int main() {
    FILE* fin, *fout;
    int i, retval;
@ -104,6 +148,12 @@ int main() {
    unsigned int counter=0;
    char* code_sign_key;
    bool project_stopped = false;
+    int length=-1;
+
+    // install a signal handler that catches SIGTERMS sent by Apache if the cgi
+    // times out.
+    //
+    signal(SIGTERM, sigterm_handler);

 #ifndef _USING_FCGI_
    char *stderr_buffer, buf[256];
@ -235,8 +285,11 @@ int main() {
        send_message("Project is temporarily shut down for maintenance", 3600);
        goto done;
    }
+    log_request_info(length);
    fprintf(stdout,"Content-type: text/plain\n\n");
+
    if (use_files) {
+        struct stat statbuf;
        // the code below is convoluted because,
        // instead of going from stdin to stdout directly,
        // we go via a pair of disk files
@ -252,6 +305,14 @@ int main() {
        }
        copy_stream(stdin, fout);
        fclose(fout);
+        stat(req_path, &statbuf);
+        if (length>=0 && (statbuf.st_size != length)) {
+            log_messages.printf(SCHED_MSG_LOG::CRITICAL,
+                "Request length %d != CONTENT_LENGTH %d\n",
+                (int)statbuf.st_size, length
+            );
+        }
+
        fin = fopen(req_path, "r");
        if (!fin) {
            log_messages.printf(SCHED_MSG_LOG::CRITICAL, "can't read request file\n");
@ -273,8 +334,13 @@ int main() {
        }
        copy_stream(fin, stdout);
        fclose(fin);
-        //unlink(req_path);
-        //unlink(reply_path);
+#ifdef EINSTEIN_AT_HOME
+        if (getenv("CONTENT_LENGTH")) unlink(req_path);
+        if (getenv("CONTENT_LENGTH")) unlink(reply_path);
+#else
+        // unlink(req_path);
+        // unlink(reply_path);
+#endif
    } else {
        handle_request(stdin, stdout, *ssp, code_sign_key);
    }
--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@ -178,14 +178,19 @@ static double estimate_wallclock_duration(
    WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
 ) {
    double running_frac;
-    running_frac = reply.host.active_frac * reply.host.on_frac;
+    if (reply.wreq.core_client_version<=419) {
+        running_frac = reply.host.on_frac;
+    }
+    else {
+        running_frac = reply.host.active_frac * reply.host.on_frac;
+    }
    if (running_frac < HOST_ACTIVE_FRAC_MIN) {
        running_frac = HOST_ACTIVE_FRAC_MIN;
    }
    if (running_frac > 1) running_frac = 1;
    double ecd = estimate_cpu_duration(wu, reply);
    double ewd = ecd/(running_frac*request.resource_share_fraction);
-#if 0
+#ifdef EINSTEIN_AT_HOME
    log_messages.printf(
        SCHED_MSG_LOG::DEBUG, "est cpu dur %f; running_frac %f; rsf %f; est %f\n",
        ecd, running_frac, request.resource_share_fraction, ewd
@ -533,15 +538,13 @@ bool SCHEDULER_REPLY::work_needed(bool locality_sched) {
        return false;
    }
    if (wreq.nresults >= config.max_wus_to_send) return false;
-    if (config.daily_result_quota) {
-        if (host.max_results_day == 0) {
-            host.max_results_day = config.daily_result_quota;
-        }

+    if (config.daily_result_quota) {
        // scale daily quota by #CPUs, up to a limit of 4
        //
        int ncpus = host.p_ncpus;
        if (ncpus > 4) ncpus = 4;
+        if (ncpus < 1) ncpus = 1;
        wreq.daily_result_quota = ncpus*host.max_results_day;
        if (host.nresults_today >= wreq.daily_result_quota) {
            wreq.daily_result_quota_exceeded = true;
@ -903,11 +906,20 @@ int send_work(
        }
        if (reply.wreq.insufficient_speed) {
            char helpful[512];
-            sprintf(helpful,
-                "(won't finish in time) "
-                "Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that",
-                100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction
-            );
+            if (reply.wreq.core_client_version>419) {
+                sprintf(helpful,
+                    "(won't finish in time) "
+                    "Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that",
+                    100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction
+                );
+            }
+            else {
+                sprintf(helpful,
+                    "(won't finish in time) "
+                    "Computer available %.1f%% of time, this project gets %.1f%% of that",
+                    100.0*reply.host.on_frac, 100.0*sreq.resource_share_fraction
+                );
+            }
            USER_MESSAGE um(helpful, "high");
            reply.insert_message(um);
        }
--- a/sched/transitioner.C
+++ b/sched/transitioner.C
@ -70,6 +70,9 @@ int penalize_host(int hostid, double delay_bound) {
    int retval = host.lookup_id(hostid);
    if (retval) return retval;
    compute_avg_turnaround(host, delay_bound);
+    if (host.max_results_day <= 0 || host.max_results_day > config.daily_result_quota) {
+        host.max_results_day = config.daily_result_quota;
+    }
    host.max_results_day -= 1;
    if (host.max_results_day < 1) {
        host.max_results_day = 1;