scheduler: Another feature for debugging the scheduler.

Previously (little known) the scheduler could be hacked to preserve the sched_request.xml and sched_reply.xml in their own directories (you had to modify the initial value of use_files in sched_main.cpp). This feature can now be switched on and off on the fly just by changing the project config. When there is an (existing) directory configured as <debug_req_reply_dir>, each scheduler instance will write three files in there: PID_C_sched.log, PID_C_sched_request.xml and (if all goes well) PID_C_sched_reply.xml. PID is the process id of this scheduler instance, C is an internal counter within the process if FCGI is used. The sched.log will contain nothing but the pid and the IP address of the client. This should allow for identifying the scheduler instance responsible for a given apache error log message ("premature end of script headers") when a scheduler crashed. sched_request.xml (obviously) is the scheduler request, and if the scheduler doesn't crash in between, there will also be the reply to the client kept in sched_reply.xml. Remove the <debug_req_reply_dir> tag from the project config to turn this feature off. svn path=/trunk/boinc/; revision=25349
2012-02-27 13:12:24 +00:00 · 2012-02-27 13:12:24 +00:00 · 9cb28dd25c
parent 3fa88ac1e3
commit 9cb28dd25c
4 changed files with 68 additions and 16 deletions
--- a/27
+++ b/27
@ -2262,3 +2262,30 @@ Bernd  27 Feb 2012
        sched_config.h
        sched_config.cpp
        sched_main.cpp
+
+Bernd  27 Feb 2012
+    - scheduler: Another feature for debugging the scheduler.
+        Previously (little known) the scheduler could be hacked to preserve
+        the sched_request.xml and sched_reply.xml in their own directories
+        (you had to modify the initial value of use_files in sched_main.cpp).
+        This feature can now be switched on and off on the fly just by
+        changing the project config.
+        When there is an (existing) directory configured as
+        <debug_req_reply_dir>, each scheduler instance will write three
+        files in there: PID_C_sched.log, PID_C_sched_request.xml and (if all
+        goes well) PID_C_sched_reply.xml. PID is the process id of this
+        scheduler instance, C is an internal counter within the process if
+        FCGI is used. The sched.log will contain nothing but the
+        pid and the IP address of the client. This should allow for
+        identifying the scheduler instance responsible for a given
+        apache error log message ("premature end of script headers") when
+        a scheduler crashed. sched_request.xml (obviously) is the scheduler
+        request, and if the scheduler doesn't crash in between, there will
+        also be the reply to the client kept in sched_reply.xml.
+        Remove the <debug_req_reply_dir> tag from the project config
+        to turn this feature off.
+
+    sched/
+        sched_config.h
+        sched_config.cpp
+        sched_main.cpp
--- a/sched/sched_config.cpp
+++ b/sched/sched_config.cpp
@ -320,6 +320,8 @@ int SCHED_CONFIG::parse(FILE* f) {
        if (xp.parse_bool("debug_user_messages", debug_user_messages)) continue;
        if (xp.parse_bool("debug_version_select", debug_version_select)) continue;

+        if (xp.parse_str("debug_req_reply_dir", debug_req_reply_dir, sizeof(debug_req_reply_dir))) continue;
+
        // don't complain about unparsed XML;
        // there are lots of tags the scheduler doesn't know about

--- a/sched/sched_config.h
+++ b/sched/sched_config.h
@ -190,6 +190,8 @@ struct SCHED_CONFIG {
    bool debug_user_messages;
    bool debug_version_select;

+    char debug_req_reply_dir[256];  // keep sched_request and sched_reply
+                                    // in files in this directory
    int parse(FILE*);
    int parse_aux(FILE*);
    int parse_file(const char *dir = 0);
--- a/sched/sched_main.cpp
+++ b/sched/sched_main.cpp
@ -17,7 +17,6 @@

 // The BOINC scheduling server.

-// Note: use_files is a compile setting that records everything in files.
 // Also, You can call debug_sched() for whatever situation is of
 // interest to you.  It won't do anything unless you create
 // (touch) the file 'debug_sched' in the project root directory.
@ -69,10 +68,6 @@
 #define DEBUG_LEVEL  999
 #define MAX_FCGI_COUNT  20

-#define REQ_FILE_PREFIX "boinc_req/"
-#define REPLY_FILE_PREFIX "boinc_reply/"
-bool use_files = false;     // use disk files for req/reply msgs (for debugging)
-
 GUI_URLS gui_urls;
 PROJECT_FILES project_files;
 key_t sema_key;
@ -348,6 +343,11 @@ void attach_to_feeder_shmem() {
    }
 }

+inline static const char* get_remote_addr() {  // client address for the per-request debug log
+    const char * r = getenv("REMOTE_ADDR");     // getenv() yields NULL when the variable is unset
+    return r ? r : "?.?.?.?";                   // placeholder so the result can always be printed with %s
+}
+
 int main(int argc, char** argv) {
 #ifndef _USING_FCGI_
    FILE* fin, *fout;
@ -355,7 +355,7 @@ int main(int argc, char** argv) {
    FCGI_FILE *fin, *fout;
 #endif
    int i, retval;
-    char req_path[256], reply_path[256], path[256];
+    char req_path[256], reply_path[256], log_path[256], path[256];
    unsigned int counter=0;
    char* code_sign_key;
    int length=-1;
@ -503,7 +503,7 @@ int main(int argc, char** argv) {
        goto done;
    }

-    if (use_files) {
+    if (strlen(config.debug_req_reply_dir)) {
        struct stat statbuf;
        // the code below is convoluted because,
        // instead of going from stdin to stdout directly,
@ -513,8 +513,25 @@ int main(int argc, char** argv) {
        // NOTE: to use this, you must create group-writeable dirs
        // boinc_req and boinc_reply in the project dir
        //
-        sprintf(req_path, "%s%d_%u", config.project_path(REQ_FILE_PREFIX), g_pid, counter);
-        sprintf(reply_path, "%s%d_%u", config.project_path(REPLY_FILE_PREFIX), g_pid, counter);
+        sprintf(req_path, "%s/%d_%u_sched_request.xml", config.debug_req_reply_dir, g_pid, counter);
+        sprintf(reply_path, "%s/%d_%u_sched_reply.xml", config.debug_req_reply_dir, g_pid, counter);
+
+        // keep a separate 'log' per PID in case general logging fails;
+        // this allows associating at least the scheduler request with the client
+        // IP address (as shown in httpd error log) in case of a crash
+        sprintf(log_path, "%s/%d_%u_sched.log", config.debug_req_reply_dir, g_pid, counter);
+#ifndef _USING_FCGI_
+        fout = fopen(log_path, "a");
+#else
+        fout = FCGI::fopen(log_path,"a");
+#endif
+        fprintf(fout, "PID: %d Client IP: %s\n", g_pid, get_remote_addr());
+        fclose(fout);
+
+        log_messages.printf(MSG_DEBUG,
+            "keeping sched_request in %s, sched_reply in %s, custom log in %s\n",
+            req_path, reply_path, log_path
+        );
 #ifndef _USING_FCGI_
        fout = fopen(req_path, "w");
 #else
@ -575,13 +592,17 @@ int main(int argc, char** argv) {
        }
        copy_stream(fin, stdout);
        fclose(fin);
-#ifdef EINSTEIN_AT_HOME
-        if (getenv("CONTENT_LENGTH")) unlink(req_path);
-        if (getenv("CONTENT_LENGTH")) unlink(reply_path);
-#else
-        // unlink(req_path);
-        // unlink(reply_path);
-#endif
+
+        // if not contacted from a client, don't keep the log files
+        /* not sure what led to the assumption of a client setting
+           CONTENT_LENGTH, but it's wrong at least on our current
+           project / Apache / Client configuration. Commented out.
+        if (getenv("CONTENT_LENGTH")) {
+          unlink(req_path);
+          unlink(reply_path);
+        }
+        */
+
 #ifndef _USING_FCGI_
    } else if (batch) {
        while (!feof(stdin)) {