*** empty log message ***

svn path=/trunk/boinc/; revision=11336
2006-10-22 00:42:44 +00:00 · 2006-10-22 00:42:44 +00:00 · c98a2415af
parent ba8bea7a2d
commit c98a2415af
13 changed files with 240 additions and 58 deletions
--- a/39
+++ b/39
@ -11331,3 +11331,42 @@ Charlie 21 Oct 2006
    mac_build/
        boinc.xcodeproj/
            project.pbxproj
+
+David  21 Oct 2006
+	- Server: apps can be labeled as "beta",
+		and jobs for them will be sent only to users
+		with "<allow_beta_work>1</allow_beta_work>" in their project preferences.
+		This lets you do application testing without
+		creating a separate project,
+		which has advantages both for you (1 project instead of 2)
+		and for testers (their testing gives them points
+		in your main project)
+	- Scheduler: refined the criteria for "reliable" hosts
+		to which old results are sent,
+		and added a param for reducing delay bound
+		when sending results to reliable hosts
+	- add -sleep_interval option to assimilator
+	- move -reliable flag from feeder to sched config (reliable_time, now in seconds rather than days)
+	- add to sched config:
+		reliable_min_avg_credit
+		reliable_min_avg_turnaround
+		reliable_time
+		reliable_reduced_delay_bound
+
+	(from Kevin Reed)
+
+	NOTES: updating your server software to this version or later requires
+		a small database change (see html/ops/db_update.php)
+
+	db/
+		boinc_db.C,h
+	lib/
+		parse.C
+	sched/
+		assimilator.C
+		feeder.C
+		sched_array.C
+		sched_config.C,h
+		sched_locality.C
+		sched_send.C,h
+		server_types.h
--- a/db/boinc_db.C
+++ b/db/boinc_db.C
@ -129,9 +129,9 @@ void DB_PLATFORM::db_parse(MYSQL_ROW &r) {
 void DB_APP::db_print(char* buf){
    sprintf(buf,
        "create_time=%d, name='%s', min_version=%d, "
-        "deprecated=%d, user_friendly_name='%s', homogeneous_redundancy=%d, weight=%f",
+        "deprecated=%d, user_friendly_name='%s', homogeneous_redundancy=%d, weight=%f, beta=%d",
        create_time, name, min_version,
-        deprecated?1:0, user_friendly_name, homogeneous_redundancy?1:0, weight
+        deprecated?1:0, user_friendly_name, homogeneous_redundancy?1:0, weight, beta
    );
 }

@ -146,6 +146,7 @@ void DB_APP::db_parse(MYSQL_ROW &r) {
    strcpy2(user_friendly_name, r[i++]);
    homogeneous_redundancy = atoi(r[i++]);
    weight = atof(r[i++]);
+    beta = atoi(r[i++]);
 }

 void DB_APP_VERSION::db_print(char* buf){
--- a/db/boinc_db.h
+++ b/db/boinc_db.h
@ -69,6 +69,7 @@ struct APP {
    bool homogeneous_redundancy;
    double weight;          // tells the feeder what fraction of results
                            // should come from this app
+    int beta;

    int write(FILE*);
    void clear();
--- a/lib/parse.C
+++ b/lib/parse.C
@ -286,14 +286,19 @@ void extract_venue(char* in, char* venue_name, char* out) {
        q = strstr(out, "</venue");
        if (q) *q = 0;
    } else {
-        strcpy(out, in);
-        while (1) {
-            p = strstr(out, "<venue");
-            if (!p) break;
-            q = strstr(p, "</venue>\n");
-            if (!q) break;
-            strcpy(p, q+strlen("</venue>\n"));
-        }
+        q = in;
+        strcpy(out, "");
+       	while (1) {
+           	p = strstr(q, "<venue");
+           	if (!p) {
+       	        strcat(out, q);
+                break;
+            }
+           	strncat(out, q, p-q);
+           	q = strstr(p, "</venue>");
+           	if (!q) break;
+           	q += strlen("</venue>");
+       	}
    }
 }

--- a/sched/assimilator.C
+++ b/sched/assimilator.C
@ -51,6 +51,8 @@ int wu_id_modulus=0, wu_id_remainder=0;

 #define SLEEP_INTERVAL 10

+int sleep_interval = SLEEP_INTERVAL;
+
 int one_pass_N_WU=0;

 // assimilate all WUs that need it
@ -154,6 +156,8 @@ int main(int argc, char** argv) {
        } else if (!strcmp(argv[i], "-one_pass_N_WU")) {
            one_pass_N_WU = atoi(argv[++i]);
            one_pass = true;
+        } else if (!strcmp(argv[i], "-sleep_interval")) {
+            sleep_interval = atoi(argv[++i]);
        } else if (!strcmp(argv[i], "-one_pass")) {
            one_pass = true;
        } else if (!strcmp(argv[i], "-d")) {
@ -214,7 +218,7 @@ int main(int argc, char** argv) {
    while (1) {
        if (!do_pass(app)) {
            if (one_pass) break;
-            sleep(SLEEP_INTERVAL);
+            sleep(sleep_interval);
        }
    }
 }
--- a/sched/feeder.C
+++ b/sched/feeder.C
@ -32,7 +32,6 @@
 //  [ -purge_stale x ]    remove work items from the shared memory segment
 //                        that have been there for longer then x minutes
 //                        but haven't been assigned
-//  [ -reliable x ]       flag results for workunits older then x days as "need_reliable"
 //
 // Creates a shared memory segment containing DB info,
 // including the work array (results/workunits to send).
@ -143,7 +142,6 @@ char select_clause[256];
 double sleep_interval = DEFAULT_SLEEP_INTERVAL;
 bool all_apps = false;
 int purge_stale_time = 0;
-int reliable_time = 0;

 void cleanup_shmem() {
    ssp->ready = false;
@ -406,8 +404,8 @@ static void scan_work_array(
                // workunits older then the specificed time as needing a reliable
                // host
                wu_result.need_reliable = 0;
-                if (reliable_time) {
-                	if ((wu_result.workunit.create_time + reliable_time*86400) <= time(0)) {
+                if (config.reliable_time) {
+                	if ((wu_result.workunit.create_time + config.reliable_time) <= time(0)) {
                		wu_result.need_reliable = true;
                	}
                }
@ -526,13 +524,11 @@ int main(int argc, char** argv) {
        } else if (!strcmp(argv[i], "-allapps")) {
            all_apps = true;
        } else if (!strcmp(argv[i], "-priority_order")) {
-            order_clause = "order by priority desc ";
+            order_clause = "order by result.priority desc ";
        } else if (!strcmp(argv[i], "-priority_order_create_time")) {
-            order_clause = "order by priority desc, workunit.create_time ";
+            order_clause = "order by result.priority desc, workunit.create_time ";
        } else if (!strcmp(argv[i], "-purge_stale")) {
            purge_stale_time = atoi(argv[++i])*60;
-        } else if (!strcmp(argv[i], "-reliable")) {
-            reliable_time = atoi(argv[++i]);
        } else if (!strcmp(argv[i], "-mod")) {
            int n = atoi(argv[++i]);
            int j = atoi(argv[++i]);
--- a/sched/sched_array.C
+++ b/sched/sched_array.C
@ -79,24 +79,44 @@ void scan_work_array(
            continue;
        }
        
-        // If this is a reliable host and we are checking for results that
-        // need a reliable host, then continue if the result is a normal result
+        // If we are looking for beta results and result is not a beta result
+        // then move on
        //
-        if (reply.wreq.reliable_only && (!wu_result.need_reliable)) {
-        	continue;
+        APP* app = ss.lookup_app(wu_result.workunit.appid);
+        if (app == NULL) continue; // this should never happen
+        if (reply.wreq.beta_only) {
+        	if (!app->beta) {
+        		continue;
+        	}
+            log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
+                "[HOST#%d] beta work found.  Result id %d \n",
+                reply.host.id 
+            );
+        } else {
+         	if (app->beta) {
+        		continue;
+        	}
        }
        
+        // If this is a reliable host and we are checking for results that
+        // need a reliable host, then continue if the result is a normal result
+        // skip if the app is beta (beta apps don't use the reliable mechanism)
+        //
+        if (!app->beta) {
+        	if (reply.wreq.reliable_only && (!wu_result.need_reliable)) {
+        		continue;
+        	} else if (!reply.wreq.reliable_only && wu_result.need_reliable) {
+        		continue;
+        	}
+        }
+        
+        // If we are looking for infeasible results and the result is not infeasible
+        // then move on
+        //
        if (reply.wreq.infeasible_only && (wu_result.infeasible_count==0)) {
            continue;
        }
        
-        // Never send a result that needs a reliable host to one that 
-        // has not earned credit
-        //
-        if (wu_result.need_reliable && reply.host.total_credit == 0) {
-        	continue;
-        }
-
        // don't send if we're already sending a result for same WU
        //
        if (config.one_result_per_user_per_wu) {
@ -108,7 +128,7 @@ void scan_work_array(
        // don't send if host can't handle it
        //
        wu = wu_result.workunit;
-        if (wu_is_infeasible(wu, sreq, reply)) {
+        if (wu_is_infeasible(wu, sreq, reply, ss)) {
           	log_messages.printf(
               	SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible\n",
               	reply.host.id, wu.id, wu.name
--- a/sched/sched_config.C
+++ b/sched/sched_config.C
@ -120,8 +120,12 @@ int SCHED_CONFIG::parse(FILE* f) {
        else if (xp.parse_bool(tag, "dont_delete_batches", dont_delete_batches)) continue;
        else if (xp.parse_int(tag, "sched_debug_level", sched_debug_level)) continue;
        else if (xp.parse_int(tag, "fuh_debug_level", fuh_debug_level)) continue;
-
-        // tags the scheduler doesn't care about
+        else if (xp.parse_int(tag, "reliable_min_avg_credit", reliable_min_avg_credit)) continue;
+        else if (xp.parse_int(tag, "reliable_min_avg_turnaround", reliable_min_avg_turnaround)) continue;
+        else if (xp.parse_int(tag, "reliable_time", reliable_time)) continue;
+        else if (xp.parse_double(tag, "reliable_reduced_delay_bound", reliable_reduced_delay_bound)) continue;
+		
+        // some tags that scheduler doesn't care about
        //
        else if (xp.parse_str(tag, "cgi_url", temp, sizeof(temp))) continue;
        else if (xp.parse_str(tag, "log_dir", temp, sizeof(temp))) continue;
--- a/sched/sched_config.h
+++ b/sched/sched_config.h
@ -76,6 +76,10 @@ public:
    bool dont_delete_batches;
    int sched_debug_level;
    int fuh_debug_level;
+    int reliable_time; // age of workunit before requiring reliable
+    int reliable_min_avg_credit;
+    int reliable_min_avg_turnaround;
+    double reliable_reduced_delay_bound;

    int parse(FILE*);
    int parse_file(const char* dir=".");
--- a/sched/sched_locality.C
+++ b/sched/sched_locality.C
@ -287,7 +287,7 @@ static int possibly_send_result(
    // why the WU is not feasible.  These are defined in sched_send.h.
    // INFEASIBLE_MEM, INFEASIBLE_DISK, INFEASIBLE_CPU.
    // 
-    if (wu_is_infeasible(wu, sreq, reply)) {
+    if (wu_is_infeasible(wu, sreq, reply, ss)) {
        return ERR_INSUFFICIENT_RESOURCE;
    }

--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@ -248,25 +248,76 @@ static double estimate_wallclock_duration(
    return ewd;
 }

-// scan user's project prefs for elements of the form <app_id>N</app_id>,
-// indicating the apps they want to run.
+// Find or compute various details for the host.
+// These parameters affect how work is sent to the host
 //
-static int find_allowed_apps(
-    SCHEDULER_REPLY& reply, std::vector<int> *app_ids
-) {
+static int get_host_details(SCHEDULER_REPLY& reply) {
    char buf[8096];
   	std::string str;
   	extract_venue(reply.user.project_prefs, reply.host.venue, buf);
   	str = buf;
 	unsigned int pos = 0;
 	int temp_int;
+    USER_APP_DTL* app_dtl;
+
+    // scan user's project prefs for elements of the form <app_id>N</app_id>,
+    // indicating the apps they want to run.
+    //
 	while (parse_int(str.substr(pos,str.length()-pos).c_str(), "<app_id>", temp_int)) {
-		(*app_ids).push_back(temp_int);
+        app_dtl = new USER_APP_DTL();
+        app_dtl->appid = temp_int;
+        app_dtl->work_available=0;
+        reply.wreq.host_dtls.preferred_apps.push_back(app_dtl);
+
 		pos = str.find("<app_id>", pos) + 1;
 	}
+    temp_int = parse_int(buf,"<allow_beta_work>",temp_int);
+    reply.wreq.host_dtls.allow_beta_work = temp_int;
+ 
+    // Decide whether or not this computer is a 'reliable' computer
+    //
+    double expavg_credit = reply.host.expavg_credit;
+    double expavg_time = reply.host.expavg_time;
+    double avg_turnaround = reply.host.avg_turnaround;
+    update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
+    if (strstr(reply.host.os_name,"Windows") || strstr(reply.host.os_name,"Linux")
+    ) {
+        if (((expavg_credit/reply.host.p_ncpus) > config.reliable_min_avg_credit || config.reliable_min_avg_credit == 0)
+            && (avg_turnaround < config.reliable_min_avg_turnaround || config.reliable_min_avg_turnaround == 0)
+        ){
+            reply.wreq.host_dtls.reliable = true;
+            log_messages.printf(SCHED_MSG_LOG::MSG_NORMAL,
+                "[HOST#%d] is reliable (OS = %s) expavg_credit = %.0f avg_turnaround(hours) = %.0f \n",
+                reply.host.id, reply.host.os_name, expavg_credit,
+                avg_turnaround/3600
+            );
+        }
+    } else {
+        if (((expavg_credit/reply.host.p_ncpus) > config.reliable_min_avg_credit*.75 || config.reliable_min_avg_credit == 0)
+            && (avg_turnaround < config.reliable_min_avg_turnaround*1.25 || config.reliable_min_avg_turnaround == 0)
+        ){
+            reply.wreq.host_dtls.reliable = true;
+            log_messages.printf(SCHED_MSG_LOG::MSG_NORMAL,
+                "[HOST#%d] is reliable (OS = %s) expavg_credit = %.0f avg_turnaround(hours) = %.0f \n",
+                reply.host.id, reply.host.os_name, expavg_credit,
+                avg_turnaround/3600
+            );
+        }
+    }
 	return 0;
 }

+int find_preferred_app_index(SCHEDULER_REPLY& reply, int appid) {
+    int result = -1;
+    for (int i=0; i<reply.wreq.host_dtls.preferred_apps.size(); i++) {
+        if (reply.wreq.host_dtls.preferred_apps[i]->appid == appid ) {
+            result = i;
+            break;
+        }
+    }
+    return result;
+}
+
 // if the WU can't be executed on the host, return a bitmap of reasons why.
 // Reasons include:
 // 1) the host doesn't have enough memory;
@ -279,7 +330,8 @@ static int find_allowed_apps(
 // In particular it doesn't enforce the one-result-per-user-per-wu rule
 //
 int wu_is_infeasible(
-    WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
+    WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply,
+    SCHED_SHMEM& ss
 ) {
    int reason = 0;
    unsigned int i;
@ -288,17 +340,25 @@ int wu_is_infeasible(
    // If they have then only send work for the allowed applications
    // TODO: call find_allowed_apps() only once, not once for each WU!!
    //
-    std::vector<int> app_ids;
-    find_allowed_apps(reply, &app_ids);
-    if (app_ids.size() > 0) {
-    	bool app_allowed = false;
-    	for(i=0; i<app_ids.size(); i++) {
-    		if (wu.appid==app_ids[i]) {
+    bool app_allowed = false;
+    if (reply.wreq.host_dtls.preferred_apps.size() > 0) {
+        for (i=0; i<reply.wreq.host_dtls.preferred_apps.size(); i++) {
+            log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
+                "Scanning preferred apps. index=%d, appid=%d, work_avail=%d\n",
+                i, reply.wreq.host_dtls.preferred_apps[i]->appid,
+                reply.wreq.host_dtls.preferred_apps[i]->work_available
+            );
+            if (wu.appid==reply.wreq.host_dtls.preferred_apps[i]->appid) {
+
    			app_allowed = true;
+                reply.wreq.host_dtls.preferred_apps[i]->work_available=1;
    			break;
    		}
    	}
-    	if (!app_allowed) {
+
+        // Only mark infeasible if we are looking at user preferred apps only
+        //
+        if (!app_allowed && !reply.wreq.beta_only) {
        	reply.wreq.no_allowed_apps_available = true;
    		reason |= INFEASIBLE_APP_SETTING;
 			log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
@ -661,10 +721,21 @@ int add_result_to_reply(
    result.sent_time = time(0);
    int old_server_state = result.server_state;

+    // If the workunit needs a reliable host and is being sent to one,
+    // then scale the delay bound by config.reliable_reduced_delay_bound
+    //
+    int delay_bound = wu.delay_bound;
+    if (config.reliable_time && reply.wreq.host_dtls.reliable && config.reliable_reduced_delay_bound > 0.01) {
+        if ((wu.create_time + config.reliable_time) <= time(0)) {
+            delay_bound = (int) (delay_bound * config.reliable_reduced_delay_bound);
+        }
+    }
+
+
    if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
        // We are sending this result for the first time
        //
-        result.report_deadline = result.sent_time + wu.delay_bound;
+        result.report_deadline = result.sent_time + delay_bound;
        result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
    } else {
        // Result was ALREADY sent to this host but never arrived.
@ -677,8 +748,8 @@ int add_result_to_reply(
        if (result.report_deadline < result.sent_time) {
            result.report_deadline = result.sent_time + 10;
        }
-        if (result.report_deadline > result.sent_time + wu.delay_bound) {
-            result.report_deadline = result.sent_time + wu.delay_bound;
+        if (result.report_deadline > result.sent_time + delay_bound) {
+            result.report_deadline = result.sent_time + delay_bound;
        }

        log_messages.printf(
@ -767,6 +838,8 @@ int send_work(
    reply.wreq.core_client_version = sreq.core_client_major_version*100
        + sreq.core_client_minor_version;
    reply.wreq.nresults = 0;
+    get_host_details(reply); // parse project prefs for app details
+    reply.wreq.beta_only = false;

    log_messages.printf(
        SCHED_MSG_LOG::MSG_NORMAL,
@ -800,16 +873,27 @@ int send_work(
    } else {
    	// give top priority to results that require a 'reliable host'
        //
-        double expavg_credit = reply.host.expavg_credit;
-        double expavg_time = reply.host.expavg_time;
-        update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
-        if ((expavg_credit/reply.host.p_ncpus) > 70) {
+        if (reply.wreq.host_dtls.reliable) {
        	reply.wreq.reliable_only = true;
        	reply.wreq.infeasible_only = false;
-            log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] is reliable\n", reply.host.id);
        	scan_work_array(sreq, reply, platform, ss);
        }
    	reply.wreq.reliable_only = false;
+
+        // give 2nd priority to results that are for a beta app
+        // (projects should load beta work with care;
+        // otherwise your users won't get production work done!)
+        //
+        if (reply.wreq.host_dtls.allow_beta_work) {
+            reply.wreq.beta_only=true;
+            log_messages.printf(
+                SCHED_MSG_LOG::MSG_DEBUG,
+                "[HOST#%d] will accept beta work.  Scanning for beta work.\n",
+                reply.host.id
+            );
+            scan_work_array(sreq, reply, platform, ss);
+        }
+        reply.wreq.beta_only=false;
    	
        // give next priority to results that were infeasible for some other host
        //
@ -935,6 +1019,14 @@ int send_work(
            reply.set_delay(delay_time);
        }
    }
+
+    // free memory
+    //
+    for (int i=0; i<reply.wreq.host_dtls.preferred_apps.size(); i++) {
+        delete(reply.wreq.host_dtls.preferred_apps[i]);
+    }
+    reply.wreq.host_dtls.preferred_apps.clear();
+
    return 0;
 }

--- a/sched/sched_send.h
+++ b/sched/sched_send.h
@ -49,8 +49,8 @@ extern bool app_core_compatible(WORK_REQ& wreq, APP_VERSION& av);
 #define INFEASIBLE_WORK_BUF 8
 #define INFEASIBLE_APP_SETTING 16

-extern int wu_is_infeasible(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
-
+extern int wu_is_infeasible(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&, SCHED_SHMEM&);
+ 
 extern double max_allowable_disk(SCHEDULER_REQUEST&, SCHEDULER_REPLY&);

 extern bool wu_already_in_reply(WORKUNIT& wu, SCHEDULER_REPLY& reply);
--- a/sched/server_types.h
+++ b/sched/server_types.h
@ -27,11 +27,27 @@
 #include "common_defs.h"
 #include "md5_file.h"

+struct USER_APP_DTL {
+	int appid;
+	int reason;
+	int work_available;
+};
+
+// Details concerning a host's settings
+//
+struct HOST_DETAILS {
+	int allow_beta_work;
+	bool reliable;
+	std::vector<USER_APP_DTL*> preferred_apps;
+};
+
 // summary of a client's request for work, and our response to it
 //
 struct WORK_REQ {
    bool infeasible_only;
    bool reliable_only;
+    bool beta_only;
+    HOST_DETAILS host_dtls;
    double seconds_to_fill;
 		// in "normalized CPU seconds" (see doc/work_req.php)
    double disk_available;