- scheduler: clean up message log.

Merge redundant messages. Condition various messages on config flags. - client (Unix) fix to CUDA detection if LD_LIBRARY_PATH is "" svn path=/trunk/boinc/; revision=15122
2008-05-02 17:48:29 +00:00 · 2008-05-02 17:48:29 +00:00 · 6e6fab3e7c
parent 2411314ddc
commit 6e6fab3e7c
10 changed files with 170 additions and 112 deletions
--- a/15
+++ b/15
@ -3538,3 +3538,18 @@ Rom    May 2 2008 (HEAD)
        configure.ac
        version.h

+David  May 2 2008
+    - scheduler: clean up message log.
+        Merge redundant messages.
+        Condition various messages on config flags.
+    - client (Unix) fix to CUDA detection if LD_LIBRARY_PATH is ""
+
+    lib/
+        coproc.C
+    sched/
+        handle_request.C
+        main.C
+        sched_config.C,h
+        sched_result.C
+        sched_send.C
+        server_types.C,h
--- a/lib/coproc.C
+++ b/lib/coproc.C
@ -132,7 +132,7 @@ void COPROC_CUDA::get(COPROCS& coprocs) {
    // Leave it there, shouldn't hurt.
    //
    char* p = getenv("LD_LIBRARY_PATH");
-    if (p) {
+    if (p && strlen(p)) {
        if (!strstr(p, "/usr/local/cuda/lib")) {
            char libpath[8192];
            sprintf(libpath, "%s:/usr/local/cuda/lib", p);
--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@ -258,10 +258,6 @@ int authenticate_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
        }

        reply.host = host;
-        log_messages.printf(MSG_DEBUG,
-            "Request [HOST#%d] Database [HOST#%d] Request [RPC#%d] Database [RPC#%d]\n",
-            sreq.hostid, host.id, sreq.rpc_seqno, host.rpc_seqno
-        );

        // look up user based on the ID in host record,
        // and see if the authenticator matches (regular or weak)
@ -1133,6 +1129,23 @@ void handle_msgs_to_host(SCHEDULER_REPLY& reply) {
    }
 }

+static void log_request(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
+    log_messages.printf(MSG_NORMAL,
+        "Request: [USER#%d] [HOST#%d] [IP %s] client %d.%d.%d, work req %d sec\n",
+        reply.user.id, reply.host.id, get_remote_addr(),
+        sreq.core_client_major_version, sreq.core_client_minor_version,
+        sreq.core_client_release,
+        (int)sreq.work_req_seconds
+    );
+    if (config.debug_request_details) {
+        log_messages.printf(MSG_DEBUG,
+             "Request details: auth %s, RPC seqno %d, platform %s\n",
+             sreq.authenticator, sreq.rpc_seqno, sreq.platform.name
+        );
+    }
+    log_messages.set_indent_level(2);
+}
+
 void process_request(
    SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, char* code_sign_key
 ) {
@ -1215,12 +1228,7 @@ void process_request(
    initial_host = reply.host;
    reply.host.rpc_seqno = sreq.rpc_seqno;

-    log_messages.printf(MSG_NORMAL,
-        "Processing request from [USER#%d] [HOST#%d] [IP %s] [RPC#%d] core client version %d.%d.%d\n",
-        reply.user.id, reply.host.id, get_remote_addr(), sreq.rpc_seqno,
-        sreq.core_client_major_version, sreq.core_client_minor_version,
-        sreq.core_client_release
-    );
+    log_request(sreq, reply);

    // is host blacklisted?
    //
@ -1361,6 +1369,33 @@ leave:
    }
 }

+static void log_incomplete_request(SCHEDULER_REQUEST& sreq) {
+    // BOINC scheduler requests use method POST.
+    // So method GET means that someone is trying a browser.
+    //
+    char *rm=getenv("REQUEST_METHOD");
+    bool used_get = false;
+    if (rm && !strcmp(rm, "GET")) {
+        used_get = true;
+    }
+    log_messages.printf(MSG_NORMAL,
+        "Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%d.%d\n",
+        used_get?"(used GET method - probably a browser) ":"",
+        get_remote_addr(), sreq.authenticator, sreq.platform.name,
+        sreq.core_client_major_version, sreq.core_client_minor_version,
+        sreq.core_client_release
+    );
+}
+
+static void log_user_messages(SCHEDULER_REPLY& sreply) {
+    for (unsigned int i=0; i<sreply.messages.size(); i++) {
+        USER_MESSAGE um = sreply.messages[i];
+        log_messages.printf(MSG_DEBUG,
+            "[HOST#%d] MSG(%4s) %s \n", sreply.host.id, um.priority.c_str(), um.message.c_str()
+        );
+    }
+}
+
 void handle_request(FILE* fin, FILE* fout, char* code_sign_key) {
    SCHEDULER_REQUEST sreq;
    SCHEDULER_REPLY sreply;
@ -1370,32 +1405,9 @@ void handle_request(FILE* fin, FILE* fout, char* code_sign_key) {
    log_messages.set_indent_level(1);

    if (sreq.parse(fin) == 0){
-        log_messages.printf(MSG_NORMAL,
-             "Handling request: IP %s, auth %s, host %d, platform %s, version %d.%d.%d, work req %d sec\n",
-             get_remote_addr(), sreq.authenticator, sreq.hostid, sreq.platform.name,
-             sreq.core_client_major_version, sreq.core_client_minor_version,
-             sreq.core_client_release,
-             (int)sreq.work_req_seconds
-        );
        process_request(sreq, sreply, code_sign_key);
-		log_messages.set_indent_level(2);
    } else {
-        // BOINC scheduler requests use method POST.
-        // So method GET means that someone is trying a browser.
-        //
-        char *rm=getenv("REQUEST_METHOD");
-        if (rm && !strcmp(rm, "GET")) {
-            sreply.probable_user_browser=true;
-        }
-		log_messages.set_indent_level(2);
-        log_messages.printf(MSG_NORMAL,
-            "Incomplete request received %sfrom IP %s, auth %s, platform %s, version %d.%d.%d\n",
-            sreply.probable_user_browser?"(probably a browser) ":"",
-            get_remote_addr(), sreq.authenticator, sreq.platform.name,
-            sreq.core_client_major_version, sreq.core_client_minor_version,
-            sreq.core_client_release
-        );
-
+        log_incomplete_request(sreq);
        USER_MESSAGE um("Incomplete request received.", "low");
        sreply.insert_message(um);
        sreply.nucleus_only = true;
@ -1405,13 +1417,8 @@ void handle_request(FILE* fin, FILE* fout, char* code_sign_key) {
        send_file_deletes(sreq, sreply);
    }

-    // write all messages to log file
-    //
-    for (unsigned int i=0; i<sreply.messages.size(); i++) {
-        USER_MESSAGE um = sreply.messages[i];
-        log_messages.printf(MSG_DEBUG,
-            "[HOST#%d] MSG(%4s) %s \n", sreply.host.id, um.priority.c_str(), um.message.c_str()
-        );
+    if (config.debug_user_messages) {
+        log_user_messages(sreply);
    }

    sreply.write(fout);
--- a/sched/main.C
+++ b/sched/main.C
@ -186,7 +186,7 @@ void sigterm_handler(int signo) {
    return;
 }

-void log_request_info(int& length) {
+static void log_request_headers(int& length) {
    char *cl=getenv("CONTENT_LENGTH");
    char *ri=getenv("REMOTE_ADDR");
    char *rm=getenv("REQUEST_METHOD");
@ -399,7 +399,9 @@ int main(int argc, char** argv) {
        counter++;
        log_messages.set_indent_level(0);
 #endif
-    log_request_info(length);
+    if (config.debug_request_headers) {
+        log_request_headers(length);
+    }

    if (check_stop_sched()) {
        send_message("Project is temporarily shut down for maintenance", 3600);
--- a/sched/sched_config.C
+++ b/sched/sched_config.C
@ -187,6 +187,10 @@ int SCHED_CONFIG::parse(FILE* f) {
        if (xp.parse_bool(tag, "debug_assignment", debug_assignment)) continue;
        if (xp.parse_bool(tag, "debug_prefs", debug_prefs)) continue;
        if (xp.parse_bool(tag, "debug_send", debug_send)) continue;
+        if (xp.parse_bool(tag, "debug_request_headers", debug_request_headers)) continue;
+        if (xp.parse_bool(tag, "debug_user_messages", debug_user_messages)) continue;
+        if (xp.parse_bool(tag, "debug_request_details", debug_request_details)) continue;
+        if (xp.parse_bool(tag, "debug_handle_results", debug_handle_results)) continue;

        // don't complain about unparsed XML;
        // there are lots of tags the scheduler doesn't know about
--- a/sched/sched_config.h
+++ b/sched/sched_config.h
@ -126,6 +126,10 @@ public:
    bool debug_assignment;
    bool debug_prefs;
    bool debug_send;
+    bool debug_request_headers;
+    bool debug_user_messages;
+    bool debug_request_details;
+    bool debug_handle_results;

    int parse(FILE*);
    int parse_file(const char* dir=".");
--- a/sched/sched_result.C
+++ b/sched/sched_result.C
@ -24,6 +24,7 @@
 #include "server_types.h"
 #include "sched_msgs.h"
 #include "sched_util.h"
+#include "sched_config.h"

 #include "sched_result.h"

@ -86,12 +87,14 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
            continue;
        }

-        log_messages.printf(MSG_NORMAL,
-            "[HOST#%d] [RESULT#%d %s] got result (DB: server_state=%d outcome=%d client_state=%d validate_state=%d delete_state=%d)\n",
-            reply.host.id, srip->id, srip->name, srip->server_state,
-            srip->outcome, srip->client_state, srip->validate_state,
-            srip->file_delete_state
-        );
+        if (config.debug_handle_results) {
+            log_messages.printf(MSG_DEBUG,
+                "[HOST#%d] [RESULT#%d %s] got result (DB: server_state=%d outcome=%d client_state=%d validate_state=%d delete_state=%d)\n",
+                reply.host.id, srip->id, srip->name, srip->server_state,
+                srip->outcome, srip->client_state, srip->validate_state,
+                srip->file_delete_state
+            );
+        }

        // Do various sanity checks.
        // If one of them fails, set srip->id = 0,
@ -166,8 +169,9 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {

        if (srip->received_time) {
            log_messages.printf(MSG_CRITICAL,
-                "[HOST#%d] [RESULT#%d %s] got result twice\n",
-                reply.host.id, srip->id, srip->name
+                "[HOST#%d] [RESULT#%d %s] already got result, at %s \n",
+                reply.host.id, srip->id, srip->name,
+                time_to_string(srip->received_time)
            );
            srip->id = 0;
            reply.result_acks.push_back(std::string(rp->name));
@ -246,9 +250,11 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
        } else {
            srip->claimed_credit = srip->cpu_time * reply.host.claimed_credit_per_cpu_sec;
        }
-        log_messages.printf(MSG_DEBUG,
-            "cpu %f cpcs %f, cc %f\n", srip->cpu_time, reply.host.claimed_credit_per_cpu_sec, srip->claimed_credit
-        );
+        if (config.debug_handle_results) {
+            log_messages.printf(MSG_DEBUG,
+                "cpu time %f credit/sec %f, claimed credit %f\n", srip->cpu_time, reply.host.claimed_credit_per_cpu_sec, srip->claimed_credit
+            );
+        }
        srip->server_state = RESULT_SERVER_STATE_OVER;

        strlcpy(srip->stderr_out, rp->stderr_out, sizeof(srip->stderr_out));
@ -262,16 +268,20 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {

        if ((srip->client_state == RESULT_FILES_UPLOADED) && (srip->exit_status == 0)) {
            srip->outcome = RESULT_OUTCOME_SUCCESS;
-            log_messages.printf(MSG_DEBUG,
-                "[RESULT#%d %s]: setting outcome SUCCESS\n",
-                srip->id, srip->name
-            );
+            if (config.debug_handle_results) {
+                log_messages.printf(MSG_DEBUG,
+                    "[RESULT#%d %s]: setting outcome SUCCESS\n",
+                    srip->id, srip->name
+                );
+            }
            reply.got_good_result();
        } else {
-            log_messages.printf(MSG_DEBUG,
-                "[RESULT#%d %s]: client_state %d exit_status %d; setting outcome ERROR\n",
-                srip->id, srip->name, srip->client_state, srip->exit_status
-            );
+            if (config.debug_handle_results) {
+                log_messages.printf(MSG_DEBUG,
+                    "[RESULT#%d %s]: client_state %d exit_status %d; setting outcome ERROR\n",
+                    srip->id, srip->name, srip->client_state, srip->exit_status
+                );
+            }
            srip->outcome = RESULT_OUTCOME_CLIENT_ERROR;
            srip->validate_state = VALIDATE_STATE_INVALID;
            reply.got_bad_result();
--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@ -330,18 +330,8 @@ double estimate_cpu_duration(WORKUNIT& wu, SCHEDULER_REPLY& reply) {
 static double estimate_wallclock_duration(
    WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
 ) {
-    double running_frac;
-    if (reply.wreq.core_client_version<=419) {
-        running_frac = reply.host.on_frac;
-    } else {
-        running_frac = reply.host.active_frac * reply.host.on_frac;
-    }
-    if (running_frac < HOST_ACTIVE_FRAC_MIN) {
-        running_frac = HOST_ACTIVE_FRAC_MIN;
-    }
-    if (running_frac > 1) running_frac = 1;
    double ecd = estimate_cpu_duration(wu, reply);
-    double ewd = ecd/running_frac;
+    double ewd = ecd/reply.wreq.running_frac;
    if (reply.host.duration_correction_factor) {
        ewd *= reply.host.duration_correction_factor;
    }
@ -350,8 +340,8 @@ static double estimate_wallclock_duration(
    }
    if (config.debug_send) {
        log_messages.printf(MSG_DEBUG,
-            "est cpu dur %f; running_frac %f; est %f\n",
-            ecd, running_frac, ewd
+            "est cpu dur %f;  est wall dur %f\n",
+            ecd, reply.wreq.running_frac, ewd
        );
    }
    return ewd;
@ -401,13 +391,6 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
    double expavg_time = reply.host.expavg_time;
    double avg_turnaround = reply.host.avg_turnaround;
    update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
-    if (config.debug_send) {
-        log_messages.printf(MSG_DEBUG,
-            "[HOST#%d] Checking if reliable (OS = %s) error_rate = %.3f avg_turnaround = %.0f hrs\n",
-            reply.host.id, reply.host.os_name, reply.host.error_rate,
-            reply.host.avg_turnaround/3600
-        );
-    }

 	// Platforms other then Windows, Linux and Intel Macs need a
    // larger set of computers to be marked reliable
@ -428,13 +411,15 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
        && (config.daily_result_quota == 0 || reply.host.max_results_day >= config.daily_result_quota)
     ) {
        reply.wreq.host_info.reliable = true;
-        if (config.debug_send) {
-            log_messages.printf(MSG_DEBUG,
-                "[HOST#%d] is reliable (OS = %s) error_rate = %.3f avg_turnaround(hours) = %.0f \n",
-                reply.host.id, reply.host.os_name, reply.host.error_rate,
-                reply.host.avg_turnaround/3600
-            );
-        }
+    }
+    if (config.debug_send) {
+        log_messages.printf(MSG_DEBUG,
+            "[HOST#%d] is%s reliable (OS = %s) error_rate = %.3f avg_turn_hrs = %.0f \n",
+            reply.host.id,
+            reply.wreq.host_info.reliable?"":" not",
+            reply.host.os_name, reply.host.error_rate,
+            reply.host.avg_turnaround/3600
+        );
    }
    return 0;
 }
@ -563,9 +548,9 @@ static inline int check_deadline(
        if (diff > 0) {
            if (config.debug_send) {
                log_messages.printf(MSG_DEBUG,
-                    "[WU#%d %s] needs %d seconds on [HOST#%d]; delay_bound is %d (estimated_delay is %f)\n",
-                    wu.id, wu.name, (int)ewd, reply.host.id, wu.delay_bound,
-                    request.estimated_delay
+                    "[WU#%d %s] est report delay %d on [HOST#%d]; delay_bound is %d\n",
+                    wu.id, wu.name, (int)est_report_delay,
+                    reply.host.id, wu.delay_bound
                );
            }
            reply.wreq.speed.set_insufficient(diff);
@ -1201,7 +1186,21 @@ static void explain_to_user(SCHEDULER_REPLY& reply) {
    }
 }

+static void get_running_frac(SCHEDULER_REPLY& reply) {
+    if (reply.wreq.core_client_version<=419) {
+        reply.wreq.running_frac = reply.host.on_frac;
+    } else {
+        reply.wreq.running_frac = reply.host.active_frac * reply.host.on_frac;
+    }
+    if (reply.wreq.running_frac < HOST_ACTIVE_FRAC_MIN) {
+        reply.wreq.running_frac = HOST_ACTIVE_FRAC_MIN;
+    }
+    if (reply.wreq.running_frac > 1) reply.wreq.running_frac = 1;
+}
+
 void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
+    if (sreq.work_req_seconds <= 0) return;
+
    reply.wreq.core_client_version =
        sreq.core_client_major_version*100 + sreq.core_client_minor_version;
    reply.wreq.disk_available = max_allowable_disk(sreq, reply);
@ -1214,18 +1213,33 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
    }

    get_host_info(reply); // parse project prefs for app details
-    reply.wreq.beta_only = false;
-    reply.wreq.user_apps_only = true;
+
+    get_running_frac(reply);

    if (config.debug_send) {
        log_messages.printf(MSG_DEBUG,
-            "[HOST#%d] got request for %f seconds of work; available disk %f GB\n",
-            reply.host.id, sreq.work_req_seconds, reply.wreq.disk_available/1e9
+            "%s matchmaking scheduling; %s EDF sim\n",
+#ifdef MATCHMAKER
+            "Using",
+#else
+            "Not using",
+#endif
+            config.workload_sim?"Using":"Not using"
+        );
+        log_messages.printf(MSG_DEBUG,
+            "available disk %f GB, work_buf_min %d\n",
+            reply.wreq.disk_available/1e9,
+            (int)sreq.global_prefs.work_buf_min()
+        );
+        log_messages.printf(MSG_DEBUG,
+            "running frac %f DCF %f CPU effic %f est delay %d\n",
+            reply.wreq.running_frac,
+            reply.host.duration_correction_factor,
+            reply.host.cpu_efficiency,
+            (int)sreq.estimated_delay
        );
    }

-    if (sreq.work_req_seconds <= 0) return;
-
    reply.wreq.seconds_to_fill = sreq.work_req_seconds;
    if (reply.wreq.seconds_to_fill > MAX_SECONDS_TO_SEND) {
        reply.wreq.seconds_to_fill = MAX_SECONDS_TO_SEND;
@ -1258,6 +1272,9 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
 #ifdef MATCHMAKER
        send_work_matchmaker(sreq, reply);
 #else
+        reply.wreq.beta_only = false;
+        reply.wreq.user_apps_only = true;
+
        // give top priority to results that require a 'reliable host'
        //
        if (reply.wreq.host_info.reliable) {
@ -1308,11 +1325,6 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
 #endif
    }

-    log_messages.printf(MSG_NORMAL,
-        "[HOST#%d] Sent %d results [scheduler ran %f seconds]\n",
-        reply.host.id, reply.wreq.nresults, elapsed_wallclock_time() 
-    );
-
    explain_to_user(reply);
 }

@ -1635,9 +1647,11 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
        JOB job;
        job.index = i;
        job.get_score(sreq, reply);
-        log_messages.printf(MSG_NORMAL,
-            "score for %s: %f\n", wu_result.workunit.name, job.score
-        );
+        if (config.debug_send) {
+            log_messages.printf(MSG_DEBUG,
+                "score for %s: %f\n", wu_result.workunit.name, job.score
+            );
+        }
        if (job.score > jobs.lowest_score()) {
            ssp->wu_results[i].state = pid;
            unlock_sema();
--- a/sched/server_types.C
+++ b/sched/server_types.C
@ -496,7 +496,6 @@ SCHEDULER_REPLY::SCHEDULER_REPLY() {
    memset(&team, 0, sizeof(team));
    nucleus_only = false;
    project_is_down = false;
-    probable_user_browser = false;
    send_msg_ack = false;
    strcpy(email_hash, "");
 }
@ -543,10 +542,12 @@ int SCHEDULER_REPLY::write(FILE* fout) {
            request_delay=min_delay_needed; 
        }
        fprintf(fout, "<request_delay>%f</request_delay>\n", request_delay);
-        log_messages.printf(MSG_NORMAL,
-            "sending delay request %f\n", request_delay
-        );
    }
+    log_messages.printf(MSG_NORMAL,
+        "Sending reply to [HOST#%d]: %d results, delay req %f [scheduler ran %f seconds]\n",
+        host.id, wreq.nresults, request_delay, elapsed_wallclock_time() 
+    );
+
    if (wreq.core_client_version <= 419) {
        std::string msg;
        std::string pri = "low";
@ -940,6 +941,7 @@ int RESULT::parse_from_client(FILE* fin) {
        }
        if (match_tag(buf, "<platform>")) continue;
        if (match_tag(buf, "<version_num>")) continue;
+        if (match_tag(buf, "<plan_class>")) continue;
        if (match_tag(buf, "<completed_time>")) continue;
        if (match_tag(buf, "<file_name>")) continue;
        if (match_tag(buf, "<file_ref>")) continue;
--- a/sched/server_types.h
+++ b/sched/server_types.h
@ -110,6 +110,7 @@ struct WORK_REQ {
    double disk_available;
    int nresults;
    int core_client_version;
+    double running_frac;

    RESOURCE disk;
    RESOURCE mem;
@ -301,7 +302,6 @@ struct SCHEDULER_REPLY {
    int lockfile_fd; // file descriptor of lockfile, or -1 if no lock.
    bool send_global_prefs;
    bool nucleus_only;          // send only message
-    bool probable_user_browser;
    USER user;
    char email_hash[MD5_LEN];
    HOST host;                  // after validation, contains full host rec