From 58f5b862884f13e453f1eb6cfe2a9c8b1741d516 Mon Sep 17 00:00:00 2001
From: Bruce Allen <bruce.allen@aei.mpg.de>
Date: Sat, 19 Mar 2005 18:24:24 +0000
Subject: [PATCH]     - Overdue checkins on a bunch of minor scheduler code. 
 Some that is       E@H specific is now included (but protected by      
 #ifdef EINSTEIN_AT_HOME       to make it simpler for me to maintain
 consistency with BOINC cvs.     - Added project-specific unacceptable_os()
 function for rejecting hosts.     - Transitioner and scheduler now initialize
 host.max_results_day correctly       in database under all circumstances.    
 - Browser requests are now correctly identified (REQUEST_METHOD=="GET") and    
   properly redirected.  This was broken. David, please see comment near      
 one of the probable_user_browser=true in handle_request.C.  I think      
 something is wrong here (or I am missing the point!).     - More info about
 requests is logged     - If the scheduler hangs (incoming request incomplete)
 it will normally be       killed by Apache after a timeout.  But this happens
 silently.  So I now       install a signal handler and catch this SIGTERM. 
 In this case an       error message is logged and all open files are flushed
 before exit(1)ing.     - If IO is passed through files, check that request
 length and content length       agree and log a message if they do NOT.     -
 active_frac not correctly reported by 4.19 and earlier core clients.      
 Adjust for this in estimating wallclock execution times.     - Added a small
 block into validator code to attach a debugger.

svn path=/trunk/boinc/; revision=5688
---
 checkin_notes          | 31 ++++++++++++++
 sched/handle_request.C | 97 ++++++++++++++++++++++++++++++++++++------
 sched/main.C           | 70 +++++++++++++++++++++++++++++-
 sched/sched_send.C     | 34 ++++++++++-----
 sched/transitioner.C   |  3 ++
 5 files changed, 210 insertions(+), 25 deletions(-)

diff --git a/checkin_notes b/checkin_notes
index 0df76bc193..29619c92a9 100755
--- a/checkin_notes
+++ b/checkin_notes
@@ -26028,3 +26028,34 @@ David  19 Mar 2005
         license_1.0.txt (removed)
     win_build/installerv2/redist/0409/
         eula.rtf
+
+Bruce  19 Mar 2005
+    - Overdue checkins on a bunch of minor scheduler code.  Some that is
+      E@H specific is now included (but protected by
+      #ifdef EINSTEIN_AT_HOME
+      to make it simpler for me to maintain consistency with BOINC cvs.
+    - Added project-specific unacceptable_os() function for rejecting hosts.
+    - Transitioner and scheduler now initialize host.max_results_day correctly
+      in database under all circumstances.
+    - Browser requests are now correctly identified (REQUEST_METHOD=="GET") and
+      properly redirected.  This was broken. David, please see comment near
+      one of the probable_user_browser=true in handle_request.C.  I think
+      something is wrong here (or I am missing the point!).
+    - More info about requests is logged
+    - If the scheduler hangs (incoming request incomplete) it will normally be
+      killed by Apache after a timeout.  But this happens silently.  So I now
+      install a signal handler and catch this SIGTERM.  In this case an
+      error message is logged and all open files are flushed before exit(1)ing.
+    - If IO is passed through files, check that request length and content length
+      agree and log a message if they do NOT.
+    - active_frac not correctly reported by 4.19 and earlier core clients.
+      Adjust for this in estimating wallclock execution times.
+    - Added a small block into validator code to attach a debugger.
+
+    sched/
+        handle_request.C
+        sched_send.C
+        main.C
+        transitioner.C
+        validator.C
+
diff --git a/sched/handle_request.C b/sched/handle_request.C
index 585709f125..f96bae55be 100644
--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@@ -99,6 +99,11 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
             goto lookup_user_and_make_new_host;
         }
         reply.host = host;
+        log_messages.printf(
+                SCHED_MSG_LOG::DEBUG,
+                "Request [HOST#%d] Database [HOST#%d] Request [RPC#%d] Database [RPC#%d]\n",
+                sreq.hostid, host.id, sreq.rpc_seqno, host.rpc_seqno
+            );
 
         strlcpy(
             user.authenticator, sreq.authenticator,
@@ -513,7 +518,7 @@ int handle_results(
         srip->exit_status = rp->exit_status;
         srip->app_version_num = rp->app_version_num;
         srip->claimed_credit = rp->cpu_time * reply.host.credit_per_cpu_sec;
-#if 1
+#ifdef EINSTEIN_AT_HOME
         log_messages.printf(SCHED_MSG_LOG::DEBUG,
             "cpu %f cpcs %f, cc %f\n", srip->cpu_time, reply.host.credit_per_cpu_sec, srip->claimed_credit
         );
@@ -715,6 +720,41 @@ void warn_user_if_core_client_upgrade_scheduled(
     return;
 }
 
+#ifdef EINSTEIN_AT_HOME
+// Project-specific host filter: rejects hosts reporting OS "Darwin" with a
+// version starting "5." or "6.".  On rejection a "low" user message is queued
+// and reply.set_delay(3600*24) is called.  Returns true iff the host is rejected.
+bool unacceptable_os(
+        SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
+) {
+    const char* os_name = sreq.host.os_name;
+    const char* os_version = sreq.host.os_version;
+    log_messages.printf(SCHED_MSG_LOG::NORMAL, "OS version %s %s\n", os_name, os_version);
+
+    // only Darwin hosts are ever rejected
+    if (strcmp(os_name, "Darwin") != 0) return false;
+
+    const bool darwin5 = strncmp(os_version, "5.", 2) == 0;
+    const bool darwin6 = strncmp(os_version, "6.", 2) == 0;
+    if (!darwin5 && !darwin6) return false;
+
+    log_messages.printf(
+        SCHED_MSG_LOG::NORMAL, "Unacceptable OS %s %s\n", os_name, os_version
+    );
+    USER_MESSAGE um(
+        "Project only supports MacOS Darwin versions 7.X and above", "low"
+    );
+    reply.insert_message(um);
+    reply.set_delay(3600*24);
+    return true;
+}
+#else
+bool unacceptable_os(
+        SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
+) {
+    return false;   // stub used when EINSTEIN_AT_HOME is not defined: no host is rejected by OS
+}
+#endif // EINSTEIN_AT_HOME
 
 bool wrong_core_client_version(
     SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
@@ -725,9 +765,7 @@ bool wrong_core_client_version(
         // TODO: check for user-agent not empty and not BOINC
         wrong_version = true;
         sprintf(msg,
-            "To participate in this project, "
-            "you must use major version %d of the BOINC core client. "
-            "Your core client is major version %d.",
+            "Need major version %d of the BOINC core client. You have %d.",
             BOINC_MAJOR_VERSION,
             sreq.core_client_major_version
         );
@@ -743,9 +781,7 @@ bool wrong_core_client_version(
         if (sreq.core_client_minor_version < minor) {
             wrong_version = true;
             sprintf(msg,
-                "To participate in this project, "
-                "you must use version %d.%02d or higher of the BOINC core client.  "
-                "Your core client is version %d.%02d.",
+                "Need version %d.%02d or higher of the BOINC core client. You have %d.%02d.",
                 major, minor,
                 sreq.core_client_major_version, sreq.core_client_minor_version
             );
@@ -760,6 +796,7 @@ bool wrong_core_client_version(
     if (wrong_version) {
         USER_MESSAGE um(msg, "low");
         reply.insert_message(um);
+        // IS THE FOLLOWING LINE CORRECT?  I DON'T UNDERSTAND IT.  Bruce
         reply.probable_user_browser = true;
         reply.set_delay(3600*24);
         return true;
@@ -852,7 +889,7 @@ void process_request(
 
     // if different major version of BOINC, just send a message
     //
-    if (wrong_core_client_version(sreq, reply)) {
+    if (wrong_core_client_version(sreq, reply) || unacceptable_os(sreq, reply)) {
         ok_to_send_work = false;
 
         // if no results, return without accessing DB
@@ -863,7 +900,7 @@ void process_request(
     } else {
         warn_user_if_core_client_upgrade_scheduled(sreq, reply);
     }
-  
+
     if (config.locality_scheduling) {
         have_no_work = false;
     } else {
@@ -927,6 +964,13 @@ void process_request(
     rpc_time_tm = localtime((const time_t*)&reply.host.rpc_time);
     current_rpc_dayofyear = rpc_time_tm->tm_yday;
 
+    if (config.daily_result_quota) {
+        if (reply.host.max_results_day <= 0 || reply.host.max_results_day > config.daily_result_quota) {
+            reply.host.max_results_day = config.daily_result_quota;
+            log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Initializing max_results_day to %d\n", reply.host.id, config.daily_result_quota);
+        }
+    }
+
     if (last_rpc_dayofyear != current_rpc_dayofyear) {
         log_messages.printf(SCHED_MSG_LOG::DEBUG, "[HOST#%d] Resetting nresults_today\n", reply.host.id);
         reply.host.nresults_today = 0;
@@ -1139,19 +1183,38 @@ void handle_request(
         );
         process_request(sreq, sreply, ss, code_sign_key);
     } else {
+        // BOINC scheduler requests use method POST.  So method GET
+        // means that someone is trying a browser.
+        char *rm=getenv("REQUEST_METHOD");
+        if (rm && !strcmp(rm, "GET")) sreply.probable_user_browser=true;
+        
         log_messages.printf(
             SCHED_MSG_LOG::NORMAL,
-            "Incomplete request received from IP %s, auth %s, platform %s, version %d.%02d\n",
+            "Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%02d\n",
+            sreply.probable_user_browser?"(probably a browser) ":"",
             get_remote_addr(), sreq.authenticator, sreq.platform_name,
             sreq.core_client_major_version, sreq.core_client_minor_version
         );
+        
         USER_MESSAGE um("Incomplete request received.", "low");
         sreply.insert_message(um);
         sreply.nucleus_only = true;
     }
+
+#ifdef EINSTEIN_AT_HOME
+    // for testing
+    if (sreply.user.id==3) {
+        USER_MESSAGE um("THIS IS A SHORT MESSAGE. \n AND ANOTHER", "high");
+        // USER_MESSAGE um("THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE. \n"
+	//		"THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "low");
+        sreply.insert_message(um);
+        // USER_MESSAGE um2("THIS IS A VERY LONG TEST MESSAGE2. THIS IS A VERY LONG TEST MESSAGE. \n"
+	//		"THIS IS A VERY LONG TEST MESSAGE. THIS IS A VERY LONG TEST MESSAGE.", "high");
+        // sreply.insert_message(um2);
+    }
+#endif
     
     // if we got no work, and we have no file space, delete some files
-	//
     if (sreply.results.size()==0 && (sreply.wreq.insufficient_disk || sreply.wreq.disk_available<0)) {
         // try to delete a file to make more space.
         // Also give some hints to the user about what's going wrong
@@ -1160,7 +1223,17 @@ void handle_request(
         delete_file_from_host(sreq, sreply);
     }
     
-#if 1
+    // write all messages to log file
+    for (unsigned int i=0; i<sreply.messages.size(); i++) {
+        USER_MESSAGE um = sreply.messages[i];
+        log_messages.printf(SCHED_MSG_LOG::DEBUG,
+            "[HOST#%d] MSG(%4s) %s \n", sreply.host.id, um.priority.c_str(), um.message.c_str()
+        );
+    }
+
+    debug_sched(sreq, sreply, "../debug_sched");
+
+#ifdef EINSTEIN_AT_HOME
     // You can call debug_sched() for whatever situation is of
     // interest to you.  It won't do anything unless you create
     // (touch) the file 'debug_sched' in the project root directory.
diff --git a/sched/main.C b/sched/main.C
index 30ca49566b..76a52d9627 100644
--- a/sched/main.C
+++ b/sched/main.C
@@ -28,6 +28,8 @@
 using namespace std;
 
 #include <unistd.h>
+#include <signal.h>
+#include <errno.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <sys/time.h>
@@ -95,6 +97,48 @@ int open_database() {
     return 0;
 }
 
+void debug_sched(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& sreply, const char *trigger); 
+
+// If the scheduler 'hangs' (for example because an incoming request is never
+// completed, or some other problem arises), Apache will send a SIGTERM to the
+// cgi.  This handler makes sure that, rather than dying silently, the cgi
+// process logs the signal and flushes all open output streams before exiting.
+//
+void sigterm_handler(int signo) {
+    log_messages.printf(SCHED_MSG_LOG::CRITICAL,
+        "BOINC scheduler (pid=%d) caught signal %d.  Exit(1)ing\n",
+        (int)getpid(), signo
+    );
+    fflush(NULL);
+    // NOTE(review): stdio and exit() are not async-signal-safe; tolerated because we are terminating.
+    exit(1);
+}
+
+// Log selected environment variables describing the current request.  If
+// CONTENT_LENGTH is set, its atoi() value is stored in 'length'; otherwise
+// 'length' is left untouched and a CRITICAL message is logged.
+void log_request_info(int& length) {
+    const char* content_length = getenv("CONTENT_LENGTH");
+    const char* remote_addr = getenv("REMOTE_ADDR");
+    const char* method = getenv("REQUEST_METHOD");
+    const char* content_type = getenv("CONTENT_TYPE");
+    const char* accept = getenv("HTTP_ACCEPT");
+    const char* user_agent = getenv("HTTP_USER_AGENT");
+
+    log_messages.printf(SCHED_MSG_LOG::DEBUG,
+        "REQUEST_METHOD=%s CONTENT_TYPE=%s HTTP_ACCEPT=%s HTTP_USER_AGENT=%s\n",
+        method ? method : "", content_type ? content_type : "",
+        accept ? accept : "", user_agent ? user_agent : ""
+    );
+    if (content_length) {
+        length = atoi(content_length);
+        log_messages.printf(SCHED_MSG_LOG::DEBUG, "CONTENT_LENGTH=%d from %s\n",
+            length, remote_addr ? remote_addr : "[Unknown]");
+        return;
+    }
+    log_messages.printf(SCHED_MSG_LOG::CRITICAL, "CONTENT_LENGTH environment variable not set\n");
+}
+
 int main() {
     FILE* fin, *fout;
     int i, retval;
@@ -104,6 +148,12 @@ int main() {
     unsigned int counter=0;
     char* code_sign_key;
     bool project_stopped = false;
+    int length=-1;
+
+    // install a signal handler that catches SIGTERMS sent by Apache if the cgi
+    // times out.
+    //
+    signal(SIGTERM, sigterm_handler);
 
 #ifndef _USING_FCGI_
     char *stderr_buffer, buf[256];
@@ -235,8 +285,11 @@ int main() {
         send_message("Project is temporarily shut down for maintenance", 3600);
         goto done;
     }
+    log_request_info(length);
     fprintf(stdout,"Content-type: text/plain\n\n");
+
     if (use_files) {
+        struct stat statbuf;
         // the code below is convoluted because,
         // instead of going from stdin to stdout directly,
         // we go via a pair of disk files
@@ -252,6 +305,14 @@ int main() {
         }
         copy_stream(stdin, fout);
         fclose(fout);
+        stat(req_path, &statbuf);
+        if (length>=0 && (statbuf.st_size != length)) {
+            log_messages.printf(SCHED_MSG_LOG::CRITICAL,
+                "Request length %d != CONTENT_LENGTH %d\n",
+                (int)statbuf.st_size, length
+            );
+        }
+
         fin = fopen(req_path, "r");
         if (!fin) {
             log_messages.printf(SCHED_MSG_LOG::CRITICAL, "can't read request file\n");
@@ -273,8 +334,13 @@ int main() {
         }
         copy_stream(fin, stdout);
         fclose(fin);
-        //unlink(req_path);
-        //unlink(reply_path);
+#ifdef EINSTEIN_AT_HOME
+        if (getenv("CONTENT_LENGTH")) unlink(req_path);
+        if (getenv("CONTENT_LENGTH")) unlink(reply_path);
+#else
+        // unlink(req_path);
+        // unlink(reply_path);
+#endif
     } else {
         handle_request(stdin, stdout, *ssp, code_sign_key);
     }
diff --git a/sched/sched_send.C b/sched/sched_send.C
index 866c502dd6..b76276d579 100644
--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@@ -178,14 +178,19 @@ static double estimate_wallclock_duration(
     WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
 ) {
     double running_frac;
-    running_frac = reply.host.active_frac * reply.host.on_frac;
+    if (reply.wreq.core_client_version<=419) {
+        running_frac = reply.host.on_frac;
+    }
+    else {
+        running_frac = reply.host.active_frac * reply.host.on_frac;
+    }
     if (running_frac < HOST_ACTIVE_FRAC_MIN) {
         running_frac = HOST_ACTIVE_FRAC_MIN;
     }
     if (running_frac > 1) running_frac = 1;
     double ecd = estimate_cpu_duration(wu, reply);
     double ewd = ecd/(running_frac*request.resource_share_fraction);
-#if 0
+#ifdef EINSTEIN_AT_HOME
     log_messages.printf(
         SCHED_MSG_LOG::DEBUG, "est cpu dur %f; running_frac %f; rsf %f; est %f\n",
         ecd, running_frac, request.resource_share_fraction, ewd
@@ -533,15 +538,13 @@ bool SCHEDULER_REPLY::work_needed(bool locality_sched) {
         return false;
     }
     if (wreq.nresults >= config.max_wus_to_send) return false;
-    if (config.daily_result_quota) {
-        if (host.max_results_day == 0) {
-            host.max_results_day = config.daily_result_quota;
-        }
 
+    if (config.daily_result_quota) {
         // scale daily quota by #CPUs, up to a limit of 4
         //
         int ncpus = host.p_ncpus;
         if (ncpus > 4) ncpus = 4;
+        if (ncpus < 1) ncpus = 1;
         wreq.daily_result_quota = ncpus*host.max_results_day;
         if (host.nresults_today >= wreq.daily_result_quota) {
             wreq.daily_result_quota_exceeded = true;
@@ -903,11 +906,20 @@ int send_work(
         }
         if (reply.wreq.insufficient_speed) {
             char helpful[512];
-            sprintf(helpful,
-                "(won't finish in time) "
-                "Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that",
-                100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction
-            );
+            if (reply.wreq.core_client_version>419) {
+                sprintf(helpful,
+                    "(won't finish in time) "
+                    "Computer on %.1f%% of time, BOINC on %.1f%% of that, this project gets %.1f%% of that",
+                    100.0*reply.host.on_frac, 100.0*reply.host.active_frac, 100.0*sreq.resource_share_fraction
+                );
+            }
+            else {
+                sprintf(helpful,
+                    "(won't finish in time) "
+                    "Computer available %.1f%% of time, this project gets %.1f%% of that",
+                    100.0*reply.host.on_frac, 100.0*sreq.resource_share_fraction
+                );
+            }
             USER_MESSAGE um(helpful, "high");
             reply.insert_message(um);
         }
diff --git a/sched/transitioner.C b/sched/transitioner.C
index a5e546231e..ea98c02e46 100644
--- a/sched/transitioner.C
+++ b/sched/transitioner.C
@@ -70,6 +70,9 @@ int penalize_host(int hostid, double delay_bound) {
     int retval = host.lookup_id(hostid);
     if (retval) return retval;
     compute_avg_turnaround(host, delay_bound);
+    if (host.max_results_day <= 0 || host.max_results_day > config.daily_result_quota) {
+        host.max_results_day = config.daily_result_quota;
+    }
     host.max_results_day -= 1;
     if (host.max_results_day < 1) {
         host.max_results_day = 1;