- client (Unix) Add /usr/local/cuda/lib/ to LD_LIBRARY_PATH
    before looking for CUDA library
- scheduler: some additional work on matchmaker scheduling
    Changed check_app_filter() so that it doesn't depend on
    the current multi-phase approach;
    moved that logic to scan_array()


svn path=/trunk/boinc/; revision=15109
David Anderson 2008-04-30 20:31:33 +00:00
parent 7da4959dff
commit 938d77ce4c
6 changed files with 157 additions and 104 deletions


@ -3453,3 +3453,17 @@ Rom April 30 2008
graphics2_unix.C
graphics2_win.C
David April 30 2008
- client (Unix) Add /usr/local/cuda/lib/ to LD_LIBRARY_PATH
before looking for CUDA library
- scheduler: some additional work on matchmaker scheduling
Changed check_app_filter() so that it doesn't depend on
the current multi-phase approach;
move that logic to scan_array()
lib/
coproc.C
sched/
sched_array.C
sched_locality.C
sched_send.C,h
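
For orientation before the diffs: the matchmaker change replaces a single
feasible/infeasible decision per job with a numeric score, starting every
feasible job at a base value and adding bonuses for properties that make the
host a good match. A minimal sketch of that idea follows; the struct names and
weights are illustrative only, the committed logic is in JOB::get_score() in
sched/sched_send.C below.

    // Illustrative sketch, not the committed code: how a matchmaker-style
    // scheduler might score a candidate job for a host.
    struct HostTraits {
        bool wants_beta;      // user opted in to beta work
        bool reliable;        // host meets the project's reliability criteria
    };
    struct JobTraits {
        bool beta;            // job belongs to a beta app
        bool needs_reliable;  // job should go to a fast/reliable host
        bool hr_committed;    // job already committed to an HR class
    };
    // Returns 0 if the job should not be sent; otherwise higher is better.
    double score_job(const HostTraits& host, const JobTraits& job) {
        double score = 1;                       // base score for a feasible job
        if (job.beta) {
            if (!host.wants_beta) return 0;     // don't send beta work
            score += 1;                         // beta user: prefer beta jobs
        }
        if (job.needs_reliable && host.reliable) score += 1;
        if (job.hr_committed) score += 1;       // keep HR classes together
        return score;
    }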

lib/coproc.C

@ -96,24 +96,24 @@ COPROC* COPROCS::lookup(char* name) {
void COPROC_CUDA::get(COPROCS& coprocs) {
int count, retval;
#ifdef _WIN32
int (__stdcall* __cudaGetDeviceCount)( int * );
int (__stdcall* __cudaGetDeviceProperties) ( cudaDeviceProp*, int );
int bufsize=256;
char buf[256], path[256];
HKEY key;
retval = RegOpenKeyEx(
HKEY_LOCAL_MACHINE,
_T("SOFTWARE\\NVIDIA Corporation\\Installed Products\\NVIDIA CUDA"),
NULL,
KEY_READ,
&key
);
if (retval != ERROR_SUCCESS) return;
retval = RegQueryValueEx(key, "InstallDir", NULL, NULL, (LPBYTE)buf, (LPDWORD)&bufsize);
RegCloseKey(key);
if (retval != ERROR_SUCCESS) return;
sprintf(path, "%s\\bin\\cudart.dll", buf);
HMODULE cudalib = LoadLibrary(path);
__cudaGetDeviceCount = (int(__stdcall*)(int*)) GetProcAddress(cudalib, "cudaGetDeviceCount");
@ -121,11 +121,22 @@ void COPROC_CUDA::get(COPROCS& coprocs) {
__cudaGetDeviceProperties = (int(__stdcall*)(cudaDeviceProp*, int)) GetProcAddress( cudalib, "cudaGetDeviceProperties" );
if (!__cudaGetDeviceProperties) return;
#else
int (*__cudaGetDeviceCount)( int * );
int (*__cudaGetDeviceProperties) ( cudaDeviceProp*, int );
void (*__cudaGetDeviceCount)( int * );
void (*__cudaGetDeviceProperties) ( cudaDeviceProp*, int );
// Add CUDA dir to library path while looking for CUDA lib.
// Leave it there, shouldn't hurt.
//
char* p = getenv("LD_LIBRARY_PATH");
if (!strstr(p, "/usr/local/cuda/lib")) {
char libpath[8192];
sprintf(libpath, "%s:/usr/local/cuda/lib", p);
setenv("LD_LIBRARY_PATH", libpath, 1);
}
#ifdef __APPLE__
void *cudalib = dlopen ("libcudart.dylib", RTLD_NOW );
#else
void *cudalib = dlopen ("libcudart.so", RTLD_NOW );
#endif
if(!cudalib) return;
@ -134,13 +145,12 @@ void COPROC_CUDA::get(COPROCS& coprocs) {
__cudaGetDeviceProperties = (void(*)(cudaDeviceProp*, int)) dlsym( cudalib, "cudaGetDeviceProperties" );
if (!__cudaGetDeviceProperties) return;
#endif
retval = (*__cudaGetDeviceCount)(&count);
if (retval || count < 1) return;
(*__cudaGetDeviceCount)(&count);
if (count < 1) return;
for (int i=0; i<count; i++) {
COPROC_CUDA* cc = new COPROC_CUDA;
retval = (*__cudaGetDeviceProperties)(&cc->prop, i);
if (retval) continue;
(*__cudaGetDeviceProperties)(&cc->prop, i);
cc->count = 1;
strcpy(cc->name, "CUDA");
coprocs.coprocs.push_back(cc);
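
A side note on the Unix branch above: getenv("LD_LIBRARY_PATH") returns NULL
when the variable is unset, so the strstr()/sprintf() sequence assumes the
variable already exists. A defensive version of the same prepend might look
like the sketch below (illustrative only, not the committed code).

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Append the CUDA library directory to LD_LIBRARY_PATH if it isn't
    // already present, tolerating an unset or empty variable.
    static void add_cuda_lib_dir() {
        const char* cuda_dir = "/usr/local/cuda/lib";
        const char* p = getenv("LD_LIBRARY_PATH");
        char libpath[8192];
        if (!p || !*p) {
            snprintf(libpath, sizeof(libpath), "%s", cuda_dir);
        } else if (!strstr(p, cuda_dir)) {
            snprintf(libpath, sizeof(libpath), "%s:%s", p, cuda_dir);
        } else {
            return;   // already there, nothing to do
        }
        setenv("LD_LIBRARY_PATH", libpath, 1);
    }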

sched/sched_array.C

@ -110,15 +110,33 @@ void scan_work_array(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
continue;
}
wu = wu_result.workunit;
// check app filter if needed
//
if (reply.wreq.user_apps_only &&
(!reply.wreq.beta_only || config.distinct_beta_apps)
) {
if (app_not_selected(wu, sreq, reply)) {
reply.wreq.no_allowed_apps_available = true;
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"[USER#%d] [WU#%d] user doesn't want work for this application\n",
reply.user.id, wu.id
);
}
continue;
}
}
// don't send if host can't handle it
//
wu = wu_result.workunit;
retval = wu_is_infeasible(wu, sreq, reply, *app);
retval = wu_is_infeasible_fast(wu, sreq, reply, *app);
if (retval) {
log_messages.printf(MSG_DEBUG,
"[HOST#%d] [WU#%d %s] WU is infeasible: %s\n",
reply.host.id, wu.id, wu.name, infeasible_string(retval)
);
continue;
}

sched/sched_locality.C

@ -296,12 +296,8 @@ static int possibly_send_result(
if (!bavp) return ERR_NO_APP_VERSION;
// wu_is_infeasible() returns the reason why the WU is not feasible;
// INFEASIBLE_MEM, INFEASIBLE_DISK, INFEASIBLE_CPU.
// see sched_send.h.
//
APP* app = ssp->lookup_app(wu.appid);
if (wu_is_infeasible(wu, sreq, reply, *app)) {
if (wu_is_infeasible_fast(wu, sreq, reply, *app)) {
return ERR_INSUFFICIENT_RESOURCE;
}

sched/sched_send.C

@ -362,7 +362,7 @@ static double estimate_wallclock_duration(
//
static int get_host_info(SCHEDULER_REPLY& reply) {
char buf[8096];
std::string str;
string str;
unsigned int pos = 0;
int temp_int;
bool flag;
@ -389,12 +389,7 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
reply.wreq.host_info.allow_beta_work = flag;
}
// Decide whether or not this computer is a 'reliable' computer
//
double expavg_credit = reply.host.expavg_credit;
double expavg_time = reply.host.expavg_time;
double avg_turnaround = reply.host.avg_turnaround;
update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
// Decide whether or not this computer is 'reliable'
// A computer is reliable if the following conditions are true
// (for those that are set in the config file)
// 1) The host average turnaround is less than the config
@ -402,6 +397,10 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
// 2) The host error rate is less then the config max error rate
// 3) The host results per day is equal to the config file value
//
double expavg_credit = reply.host.expavg_credit;
double expavg_time = reply.host.expavg_time;
double avg_turnaround = reply.host.avg_turnaround;
update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"[HOST#%d] Checking if reliable (OS = %s) error_rate = %.3f avg_turnaround = %.0f hrs\n",
@ -414,7 +413,11 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
// larger set of computers to be marked reliable
//
double multiplier = 1.0;
if (strstr(reply.host.os_name,"Windows") || strstr(reply.host.os_name,"Linux") || (strstr(reply.host.os_name,"Darwin") && !(strstr(reply.host.p_vendor,"Power Macintosh")))) {
if (strstr(reply.host.os_name,"Windows")
|| strstr(reply.host.os_name,"Linux")
|| (strstr(reply.host.os_name,"Darwin")
&& !(strstr(reply.host.p_vendor,"Power Macintosh"))
)) {
multiplier = 1.0;
} else {
multiplier = 1.8;
@ -436,36 +439,22 @@ static int get_host_info(SCHEDULER_REPLY& reply) {
return 0;
}
// Check to see if the user has set application preferences.
// If they have, then only send work for the allowed applications
// Return true if the user has set application preferences,
// and this job is not for a selected app
//
static inline int check_app_filter(
bool app_not_selected(
WORKUNIT& wu, SCHEDULER_REQUEST& , SCHEDULER_REPLY& reply
) {
unsigned int i;
if (reply.wreq.host_info.preferred_apps.size() == 0) return 0;
bool app_allowed = false;
if (reply.wreq.host_info.preferred_apps.size() == 0) return false;
for (i=0; i<reply.wreq.host_info.preferred_apps.size(); i++) {
if (wu.appid==reply.wreq.host_info.preferred_apps[i].appid) {
app_allowed = true;
if (wu.appid == reply.wreq.host_info.preferred_apps[i].appid) {
reply.wreq.host_info.preferred_apps[i].work_available = true;
break;
return false;
}
}
if (!app_allowed && reply.wreq.user_apps_only &&
(!reply.wreq.beta_only || config.distinct_beta_apps)
) {
reply.wreq.no_allowed_apps_available = true;
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"[USER#%d] [WU#%d] user doesn't want work for this application\n",
reply.user.id, wu.id
);
}
return INFEASIBLE_APP_SETTING;
}
return 0;
return true;
}
// see how much RAM we can use on this machine
@ -586,7 +575,7 @@ static inline int check_deadline(
return 0;
}
// Quick checks (no DB access) to see if the WU can be sent on the host.
// Fast checks (no DB access) to see if the job can be sent to the host.
// Reasons why not include:
// 1) the host doesn't have enough memory;
// 2) the host doesn't have enough disk space;
@ -594,10 +583,7 @@ static inline int check_deadline(
// the host probably won't get the result done within the delay bound
// 4) app isn't in user's "approved apps" list
//
// TODO: this should be used in locality scheduling case too.
// Should move a few other checks from sched_array.C
//
int wu_is_infeasible(
int wu_is_infeasible_fast(
WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply, APP& app
) {
int retval;
@ -631,8 +617,6 @@ int wu_is_infeasible(
}
}
retval = check_app_filter(wu, request, reply);
if (retval) return retval;
retval = check_memory(wu, request, reply);
if (retval) return retval;
retval = check_disk(wu, request, reply);
@ -1061,6 +1045,7 @@ int add_result_to_reply(
//
static void explain_to_user(SCHEDULER_REPLY& reply) {
char helpful[512];
unsigned int i;
// If work was sent from apps
// the user did not select, explain
//
@ -1069,7 +1054,7 @@ static void explain_to_user(SCHEDULER_REPLY& reply) {
reply.insert_message(um);
// Inform the user about applications with no work
for (int i=0; i<reply.wreq.host_info.preferred_apps.size(); i++) {
for (i=0; i<reply.wreq.host_info.preferred_apps.size(); i++) {
if (!reply.wreq.host_info.preferred_apps[i].work_available) {
APP* app = ssp->lookup_app(reply.wreq.host_info.preferred_apps[i].appid);
// don't write message if the app is deprecated
@ -1087,7 +1072,7 @@ static void explain_to_user(SCHEDULER_REPLY& reply) {
// Tell the user about applications they didn't qualify for
//
for(int i=0;i<preferred_app_message_index;i++){
for(i=0;i<preferred_app_message_index;i++){
reply.insert_message(reply.wreq.no_work_messages.at(i));
}
USER_MESSAGE um1("You have selected to receive work from other applications if no work is available for the applications you selected", "high");
@ -1103,8 +1088,8 @@ static void explain_to_user(SCHEDULER_REPLY& reply) {
USER_MESSAGE um2("No work sent", "high");
reply.insert_message(um2);
// Inform the user about applications with no work
for(int i=0; i<reply.wreq.host_info.preferred_apps.size(); i++) {
if ( !reply.wreq.host_info.preferred_apps[i].work_available ) {
for(i=0; i<reply.wreq.host_info.preferred_apps.size(); i++) {
if (!reply.wreq.host_info.preferred_apps[i].work_available) {
APP* app = ssp->lookup_app(reply.wreq.host_info.preferred_apps[i].appid);
// don't write message if the app is deprecated
if ( app != NULL ) {
@ -1124,7 +1109,7 @@ static void explain_to_user(SCHEDULER_REPLY& reply) {
}
if (reply.wreq.no_allowed_apps_available) {
USER_MESSAGE um(
"No work available for the applications you have selected. Please check your settings on the website.",
"No work available for the applications you have selected. Please check your settings on the web site.",
"high"
);
reply.insert_message(um);
@ -1306,7 +1291,8 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
reply.wreq.infeasible_only = false;
scan_work_array(sreq, reply);
// If the user has said they prefer to only receive work from certain apps
// If user has selected apps but will accept any,
// and we haven't found any jobs for selected apps, try others
//
if (!reply.wreq.nresults && reply.wreq.host_info.allow_non_preferred_apps ) {
reply.wreq.user_apps_only = false;
@ -1334,13 +1320,13 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
struct JOB{
int index;
double value;
double score;
double est_time;
double disk_usage;
APP* app;
BEST_APP_VERSION* bavp;
void get_value(SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
void get_score(SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
};
struct JOB_SET {
@ -1351,8 +1337,8 @@ struct JOB_SET {
std::list<JOB> jobs; // sorted high to low
void add_job(JOB&);
double higher_value_disk_usage(double);
double lowest_value();
double higher_score_disk_usage(double);
double lowest_score();
inline bool request_satisfied() {
return est_time >= work_req;
}
@ -1382,10 +1368,10 @@ int read_sendable_result(DB_RESULT& result) {
return 0;
}
// compute a "value" for sending this WU to this host.
// compute a "score" for sending this job to this host.
// return 0 if the WU is infeasible
//
void JOB::get_value(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
void JOB::get_score(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
bool found;
WORKUNIT wu;
int retval;
@ -1393,7 +1379,7 @@ void JOB::get_value(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
WU_RESULT& wu_result = ssp->wu_results[index];
wu = wu_result.workunit;
value = 0;
score = 0;
// Find the app and app_version for the client's platform.
//
@ -1406,9 +1392,8 @@ void JOB::get_value(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
bavp = get_app_version(sreq, reply, wu);
if (!bavp) return;
}
if (app == NULL) return; // this should never happen
retval = wu_is_infeasible(wu, sreq, reply, *app);
retval = wu_is_infeasible_fast(wu, sreq, reply, *app);
if (retval) {
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
@ -1419,24 +1404,50 @@ void JOB::get_value(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
return;
}
value = 1;
double val = 1;
score = 1;
// check if user has selected apps
//
if (!reply.wreq.host_info.allow_beta_work || config.distinct_beta_apps) {
if (app_not_selected(wu, sreq, reply)) {
if (!reply.wreq.host_info.allow_non_preferred_apps) {
score = 0;
return;
}
} else {
if (reply.wreq.host_info.allow_non_preferred_apps) {
score += 1;
}
}
}
// if it's a beta user, try to send beta jobs
//
if (app->beta) {
if (reply.wreq.host_info.allow_beta_work) {
value += 1;
score += 1;
} else {
value = 0;
score = 0;
return;
}
} else {
if (reply.wreq.host_info.reliable && (wu_result.need_reliable)) {
value += 1;
}
}
// if job needs to get done fast, send to fast/reliable host
//
if (reply.wreq.host_info.reliable && (wu_result.need_reliable)) {
score += 1;
}
// if job already committed to an HR class,
// try to send to host in that class
//
if (wu_result.infeasible_count) {
value += 1;
score += 1;
}
// If user has selected some apps but will accept jobs from others,
// try to send them jobs from the selected apps
//
}
bool wu_is_infeasible_slow(
@ -1523,12 +1534,12 @@ bool wu_is_infeasible_slow(
return false;
}
double JOB_SET::lowest_value() {
double JOB_SET::lowest_score() {
if (jobs.empty()) return 0;
return jobs.back().value;
return jobs.back().score;
}
// add the given job, and remove lowest-value jobs
// add the given job, and remove lowest-score jobs
// that are in excess of work request
// or that cause the disk limit to be exceeded
//
@ -1551,7 +1562,7 @@ void JOB_SET::add_job(JOB& job) {
}
list<JOB>::iterator i = jobs.begin();
while (i != jobs.end()) {
if (i->value < job.value) {
if (i->score < job.score) {
jobs.insert(i, job);
break;
}
@ -1564,13 +1575,13 @@ void JOB_SET::add_job(JOB& job) {
disk_usage += job.disk_usage;
}
// return the disk usage of jobs above the given value
// return the disk usage of jobs above the given score
//
double JOB_SET::higher_value_disk_usage(double v) {
double JOB_SET::higher_score_disk_usage(double v) {
double sum = 0;
list<JOB>::iterator i = jobs.begin();
while (i != jobs.end()) {
if (i->value < v) break;
if (i->score < v) break;
sum += i->disk_usage;
i++;
}
@ -1623,8 +1634,11 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
JOB job;
job.index = i;
job.get_value(sreq, reply);
if (job.value > jobs.lowest_value()) {
job.get_score(sreq, reply);
log_messages.printf(MSG_NORMAL,
"score for %s: %f\n", wu_result.workunit.name, job.score
);
if (job.score > jobs.lowest_score()) {
ssp->wu_results[i].state = pid;
unlock_sema();
if (wu_is_infeasible_slow(wu_result, sreq, reply)) {
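
To summarize the data structure driving send_work_matchmaker() above: JOB_SET
keeps candidate jobs in a list sorted by score, inserting each new job before
the first lower-scoring entry and trimming lowest-scoring jobs from the back
once the total estimated time exceeds the work request. A compact sketch of
that top-scoring-set behavior (simplified; the real JOB_SET also tracks disk
usage against a limit):

    #include <list>

    struct Job {
        double score;
        double est_time;   // estimated runtime on this host, in seconds
    };

    // Simplified analogue of JOB_SET: keep the highest-scoring jobs whose
    // total estimated time covers the host's work request.
    struct JobSet {
        double work_req;       // seconds of work requested by the host
        double est_time = 0;   // total estimated time of the kept jobs
        std::list<Job> jobs;   // sorted from high score to low

        void add(const Job& job) {
            // insert before the first entry with a lower score
            auto it = jobs.begin();
            while (it != jobs.end() && it->score >= job.score) ++it;
            jobs.insert(it, job);
            est_time += job.est_time;

            // drop lowest-scoring jobs that are in excess of the request
            while (!jobs.empty() && est_time - jobs.back().est_time >= work_req) {
                est_time -= jobs.back().est_time;
                jobs.pop_back();
            }
        }
    };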

sched/sched_send.h

@ -44,7 +44,7 @@ extern bool app_core_compatible(WORK_REQ& wreq, APP_VERSION& av);
#define INFEASIBLE_HR 8
#define INFEASIBLE_BANDWIDTH 9
extern int wu_is_infeasible(
extern int wu_is_infeasible_fast(
WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&, APP&
);
@ -59,3 +59,4 @@ extern int update_wu_transition_time(WORKUNIT wu, time_t x);
extern void lock_sema();
extern void unlock_sema();
extern const char* infeasible_string(int);
extern bool app_not_selected(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&);