diff --git a/checkin_notes b/checkin_notes
index 4e1ae5c9b7..8b1d6a56cf 100755
--- a/checkin_notes
+++ b/checkin_notes
@@ -11331,3 +11331,42 @@ Charlie 21 Oct 2006
mac_build/
boinc.xcodeproj/
project.pbxproj
+
+David 21 Oct 2006
+ - Server: apps can be labeled as "beta",
+ and jobs for them will be sent only to users
+        with "<allow_beta_work>1</allow_beta_work>" in their project preferences.
+ This lets you do application testing without
+ creating a separate project,
+        which has advantages both for you (1 project instead of 2)
+ and for testers (their testing gives them points
+ in your main project)
+ - Scheduler: refined the criteria for "reliable" hosts
+ to which old results are sent,
+ and added a param for reducing delay bound
+ when sending results to reliable hosts
+ - add -sleep_interval option to assimilator
+    - move -reliable flag from feeder to sched config (reliable_time, now in seconds rather than days)
+ - add to sched config:
+ reliable_min_avg_credit
+ reliable_min_avg_turnaround
+ reliable_time
+ reliable_reduced_delay_bound
+
+ (from Kevin Reed)
+
+ NOTES: updating your server software to this version or later requires
+ a small database change (see html/ops/db_update.php)
+
+ db/
+ boinc_db.C,h
+ lib/
+ parse.C
+ sched/
+ assimilator.C
+ feeder.C
+ sched_array.C
+ sched_config.C,h
+ sched_locality.C
+ sched_send.C,h
+ server_types.h
diff --git a/db/boinc_db.C b/db/boinc_db.C
index 9b8e817493..24e4da3b83 100644
--- a/db/boinc_db.C
+++ b/db/boinc_db.C
@@ -129,9 +129,9 @@ void DB_PLATFORM::db_parse(MYSQL_ROW &r) {
void DB_APP::db_print(char* buf){
sprintf(buf,
"create_time=%d, name='%s', min_version=%d, "
- "deprecated=%d, user_friendly_name='%s', homogeneous_redundancy=%d, weight=%f",
+ "deprecated=%d, user_friendly_name='%s', homogeneous_redundancy=%d, weight=%f, beta=%d",
create_time, name, min_version,
- deprecated?1:0, user_friendly_name, homogeneous_redundancy?1:0, weight
+ deprecated?1:0, user_friendly_name, homogeneous_redundancy?1:0, weight, beta
);
}
@@ -146,6 +146,7 @@ void DB_APP::db_parse(MYSQL_ROW &r) {
strcpy2(user_friendly_name, r[i++]);
homogeneous_redundancy = atoi(r[i++]);
weight = atof(r[i++]);
+ beta = atoi(r[i++]);
}
void DB_APP_VERSION::db_print(char* buf){
diff --git a/db/boinc_db.h b/db/boinc_db.h
index dc872b7763..2855fb84e6 100755
--- a/db/boinc_db.h
+++ b/db/boinc_db.h
@@ -69,6 +69,7 @@ struct APP {
bool homogeneous_redundancy;
double weight; // tells the feeder what fraction of results
// should come from this app
+ int beta;
int write(FILE*);
void clear();
diff --git a/lib/parse.C b/lib/parse.C
index 95c57a3c38..fb88a6d692 100644
--- a/lib/parse.C
+++ b/lib/parse.C
@@ -286,14 +286,19 @@ void extract_venue(char* in, char* venue_name, char* out) {
q = strstr(out, "\n");
- if (!q) break;
- strcpy(p, q+strlen("\n"));
- }
+ q = in;
+ strcpy(out, "");
+ while (1) {
+ p = strstr(q, "");
+ if (!q) break;
+ q += strlen("");
+ }
}
}
diff --git a/sched/assimilator.C b/sched/assimilator.C
index 0b36b75f53..7c47b4f554 100644
--- a/sched/assimilator.C
+++ b/sched/assimilator.C
@@ -51,6 +51,8 @@ int wu_id_modulus=0, wu_id_remainder=0;
#define SLEEP_INTERVAL 10
+int sleep_interval = SLEEP_INTERVAL;
+
int one_pass_N_WU=0;
// assimilate all WUs that need it
@@ -154,6 +156,8 @@ int main(int argc, char** argv) {
} else if (!strcmp(argv[i], "-one_pass_N_WU")) {
one_pass_N_WU = atoi(argv[++i]);
one_pass = true;
+ } else if (!strcmp(argv[i], "-sleep_interval")) {
+ sleep_interval = atoi(argv[++i]);
} else if (!strcmp(argv[i], "-one_pass")) {
one_pass = true;
} else if (!strcmp(argv[i], "-d")) {
@@ -214,7 +218,7 @@ int main(int argc, char** argv) {
while (1) {
if (!do_pass(app)) {
if (one_pass) break;
- sleep(SLEEP_INTERVAL);
+ sleep(sleep_interval);
}
}
}
diff --git a/sched/feeder.C b/sched/feeder.C
index 5d727b5561..60ef070eac 100644
--- a/sched/feeder.C
+++ b/sched/feeder.C
@@ -32,7 +32,6 @@
// [ -purge_stale x ] remove work items from the shared memory segment
// that have been there for longer then x minutes
// but haven't been assigned
-// [ -reliable x ] flag results for workunits older then x days as "need_reliable"
//
// Creates a shared memory segment containing DB info,
// including the work array (results/workunits to send).
@@ -143,7 +142,6 @@ char select_clause[256];
double sleep_interval = DEFAULT_SLEEP_INTERVAL;
bool all_apps = false;
int purge_stale_time = 0;
-int reliable_time = 0;
void cleanup_shmem() {
ssp->ready = false;
@@ -406,8 +404,8 @@ static void scan_work_array(
// workunits older then the specificed time as needing a reliable
// host
wu_result.need_reliable = 0;
- if (reliable_time) {
- if ((wu_result.workunit.create_time + reliable_time*86400) <= time(0)) {
+ if (config.reliable_time) {
+ if ((wu_result.workunit.create_time + config.reliable_time) <= time(0)) {
wu_result.need_reliable = true;
}
}
@@ -526,13 +524,11 @@ int main(int argc, char** argv) {
} else if (!strcmp(argv[i], "-allapps")) {
all_apps = true;
} else if (!strcmp(argv[i], "-priority_order")) {
- order_clause = "order by priority desc ";
+ order_clause = "order by result.priority desc ";
} else if (!strcmp(argv[i], "-priority_order_create_time")) {
- order_clause = "order by priority desc, workunit.create_time ";
+ order_clause = "order by result.priority desc, workunit.create_time ";
} else if (!strcmp(argv[i], "-purge_stale")) {
purge_stale_time = atoi(argv[++i])*60;
- } else if (!strcmp(argv[i], "-reliable")) {
- reliable_time = atoi(argv[++i]);
} else if (!strcmp(argv[i], "-mod")) {
int n = atoi(argv[++i]);
int j = atoi(argv[++i]);
diff --git a/sched/sched_array.C b/sched/sched_array.C
index dbb57090c6..5e862698bd 100644
--- a/sched/sched_array.C
+++ b/sched/sched_array.C
@@ -79,24 +79,44 @@ void scan_work_array(
continue;
}
- // If this is a reliable host and we are checking for results that
- // need a reliable host, then continue if the result is a normal result
+ // If we are looking for beta results and result is not a beta result
+ // then move on
//
- if (reply.wreq.reliable_only && (!wu_result.need_reliable)) {
- continue;
+ APP* app = ss.lookup_app(wu_result.workunit.appid);
+ if (app == NULL) continue; // this should never happen
+ if (reply.wreq.beta_only) {
+ if (!app->beta) {
+ continue;
+ }
+            log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
+                "[HOST#%d] beta work found. Result id %d \n",
+                reply.host.id, wu_result.resultid  // one argument per %d conversion
+            );
+ } else {
+ if (app->beta) {
+ continue;
+ }
}
+ // If this is a reliable host and we are checking for results that
+ // need a reliable host, then continue if the result is a normal result
+ // skip if the app is beta (beta apps don't use the reliable mechanism)
+ //
+ if (!app->beta) {
+ if (reply.wreq.reliable_only && (!wu_result.need_reliable)) {
+ continue;
+ } else if (!reply.wreq.reliable_only && wu_result.need_reliable) {
+ continue;
+ }
+ }
+
+        // If we are looking for infeasible results and the result is not infeasible
+ // then move on
+ //
if (reply.wreq.infeasible_only && (wu_result.infeasible_count==0)) {
continue;
}
- // Never send a result that needs a reliable host to one that
- // has not earned credit
- //
- if (wu_result.need_reliable && reply.host.total_credit == 0) {
- continue;
- }
-
// don't send if we're already sending a result for same WU
//
if (config.one_result_per_user_per_wu) {
@@ -108,7 +128,7 @@ void scan_work_array(
// don't send if host can't handle it
//
wu = wu_result.workunit;
- if (wu_is_infeasible(wu, sreq, reply)) {
+ if (wu_is_infeasible(wu, sreq, reply, ss)) {
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible\n",
reply.host.id, wu.id, wu.name
diff --git a/sched/sched_config.C b/sched/sched_config.C
index f440463c9b..9af81e92e4 100644
--- a/sched/sched_config.C
+++ b/sched/sched_config.C
@@ -120,8 +120,12 @@ int SCHED_CONFIG::parse(FILE* f) {
else if (xp.parse_bool(tag, "dont_delete_batches", dont_delete_batches)) continue;
else if (xp.parse_int(tag, "sched_debug_level", sched_debug_level)) continue;
else if (xp.parse_int(tag, "fuh_debug_level", fuh_debug_level)) continue;
-
- // tags the scheduler doesn't care about
+ else if (xp.parse_int(tag, "reliable_min_avg_credit", reliable_min_avg_credit)) continue;
+ else if (xp.parse_int(tag, "reliable_min_avg_turnaround", reliable_min_avg_turnaround)) continue;
+ else if (xp.parse_int(tag, "reliable_time", reliable_time)) continue;
+ else if (xp.parse_double(tag, "reliable_reduced_delay_bound", reliable_reduced_delay_bound)) continue;
+
+ // some tags that scheduler doesn't care about
//
else if (xp.parse_str(tag, "cgi_url", temp, sizeof(temp))) continue;
else if (xp.parse_str(tag, "log_dir", temp, sizeof(temp))) continue;
diff --git a/sched/sched_config.h b/sched/sched_config.h
index 6bb49076dd..c0e5ed4cd1 100644
--- a/sched/sched_config.h
+++ b/sched/sched_config.h
@@ -76,6 +76,10 @@ public:
bool dont_delete_batches;
int sched_debug_level;
int fuh_debug_level;
+    int reliable_time; // age of workunit (seconds since create_time) before requiring reliable host; 0 = disabled
+    int reliable_min_avg_credit; // host's expavg_credit per CPU must exceed this to be "reliable"; 0 = no check
+    int reliable_min_avg_turnaround; // host's avg_turnaround (seconds) must be below this; 0 = no check
+    double reliable_reduced_delay_bound; // factor multiplied into delay_bound for old WUs sent to reliable hosts; ignored if <= 0.01
int parse(FILE*);
int parse_file(const char* dir=".");
diff --git a/sched/sched_locality.C b/sched/sched_locality.C
index a7f96d16e5..ab5281a736 100644
--- a/sched/sched_locality.C
+++ b/sched/sched_locality.C
@@ -287,7 +287,7 @@ static int possibly_send_result(
// why the WU is not feasible. These are defined in sched_send.h.
// INFEASIBLE_MEM, INFEASIBLE_DISK, INFEASIBLE_CPU.
//
- if (wu_is_infeasible(wu, sreq, reply)) {
+ if (wu_is_infeasible(wu, sreq, reply, ss)) {
return ERR_INSUFFICIENT_RESOURCE;
}
diff --git a/sched/sched_send.C b/sched/sched_send.C
index 9df3f40864..843e871a28 100644
--- a/sched/sched_send.C
+++ b/sched/sched_send.C
@@ -248,25 +248,76 @@ static double estimate_wallclock_duration(
return ewd;
}
-// scan user's project prefs for elements of the form N,
-// indicating the apps they want to run.
+// Find or compute various details for the host.
+// These parameters affect how work is sent to the host
//
-static int find_allowed_apps(
- SCHEDULER_REPLY& reply, std::vector *app_ids
-) {
+static int get_host_details(SCHEDULER_REPLY& reply) {
char buf[8096];
std::string str;
extract_venue(reply.user.project_prefs, reply.host.venue, buf);
str = buf;
unsigned int pos = 0;
int temp_int;
+ USER_APP_DTL* app_dtl;
+
+ // scan user's project prefs for elements of the form N,
+ // indicating the apps they want to run.
+ //
while (parse_int(str.substr(pos,str.length()-pos).c_str(), "", temp_int)) {
- (*app_ids).push_back(temp_int);
+ app_dtl = new USER_APP_DTL();
+ app_dtl->appid = temp_int;
+ app_dtl->work_available=0;
+ reply.wreq.host_dtls.preferred_apps.push_back(app_dtl);
+
pos = str.find("", pos) + 1;
}
+ temp_int = parse_int(buf,"",temp_int);
+ reply.wreq.host_dtls.allow_beta_work = temp_int;
+
+ // Decide whether or not this computer is a 'reliable' computer
+ //
+ double expavg_credit = reply.host.expavg_credit;
+ double expavg_time = reply.host.expavg_time;
+ double avg_turnaround = reply.host.avg_turnaround;
+ update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
+ if (strstr(reply.host.os_name,"Windows") || strstr(reply.host.os_name,"Linux")
+ ) {
+ if (((expavg_credit/reply.host.p_ncpus) > config.reliable_min_avg_credit || config.reliable_min_avg_credit == 0)
+ && (avg_turnaround < config.reliable_min_avg_turnaround || config.reliable_min_avg_turnaround == 0)
+ ){
+ reply.wreq.host_dtls.reliable = true;
+ log_messages.printf(SCHED_MSG_LOG::MSG_NORMAL,
+ "[HOST#%d] is reliable (OS = %s) expavg_credit = %.0f avg_turnaround(hours) = %.0f \n",
+ reply.host.id, reply.host.os_name, expavg_credit,
+ avg_turnaround/3600
+ );
+ }
+ } else {
+ if (((expavg_credit/reply.host.p_ncpus) > config.reliable_min_avg_credit*.75 || config.reliable_min_avg_credit == 0)
+ && (avg_turnaround < config.reliable_min_avg_turnaround*1.25 || config.reliable_min_avg_turnaround == 0)
+ ){
+ reply.wreq.host_dtls.reliable = true;
+ log_messages.printf(SCHED_MSG_LOG::MSG_NORMAL,
+ "[HOST#%d] is reliable (OS = %s) expavg_credit = %.0f avg_turnaround(hours) = %.0f \n",
+ reply.host.id, reply.host.os_name, expavg_credit,
+ avg_turnaround/3600
+ );
+ }
+ }
return 0;
}
+int find_preferred_app_index(SCHEDULER_REPLY& reply, int appid) {
+ int result = -1;
+ for (int i=0; iappid == appid ) {
+ result = i;
+ break;
+ }
+ }
+ return result;
+}
+
// if the WU can't be executed on the host, return a bitmap of reasons why.
// Reasons include:
// 1) the host doesn't have enough memory;
@@ -279,7 +330,8 @@ static int find_allowed_apps(
// In particular it doesn't enforce the one-result-per-user-per-wu rule
//
int wu_is_infeasible(
- WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
+ WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply,
+ SCHED_SHMEM& ss
) {
int reason = 0;
unsigned int i;
@@ -288,17 +340,25 @@ int wu_is_infeasible(
// If they have then only send work for the allowed applications
// TODO: call find_allowed_apps() only once, not once for each WU!!
//
- std::vector app_ids;
- find_allowed_apps(reply, &app_ids);
- if (app_ids.size() > 0) {
- bool app_allowed = false;
- for(i=0; i 0) {
+ for (i=0; iappid,
+ reply.wreq.host_dtls.preferred_apps[i]->work_available
+ );
+ if (wu.appid==reply.wreq.host_dtls.preferred_apps[i]->appid) {
+
app_allowed = true;
+ reply.wreq.host_dtls.preferred_apps[i]->work_available=1;
break;
}
}
- if (!app_allowed) {
+
+ // Only mark infeasible if we are looking at user preferred apps only
+ //
+ if (!app_allowed && !reply.wreq.beta_only) {
reply.wreq.no_allowed_apps_available = true;
reason |= INFEASIBLE_APP_SETTING;
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
@@ -661,10 +721,21 @@ int add_result_to_reply(
result.sent_time = time(0);
int old_server_state = result.server_state;
+ // If the workunit needs reliable and is being sent to a reliable host,
+    // then scale the delay bound by the factor config.reliable_reduced_delay_bound
+ //
+ int delay_bound = wu.delay_bound;
+ if (config.reliable_time && reply.wreq.host_dtls.reliable && config.reliable_reduced_delay_bound > 0.01) {
+ if ((wu.create_time + config.reliable_time) <= time(0)) {
+ delay_bound = (int) (delay_bound * config.reliable_reduced_delay_bound);
+ }
+ }
+
+
if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
// We are sending this result for the first time
//
- result.report_deadline = result.sent_time + wu.delay_bound;
+ result.report_deadline = result.sent_time + delay_bound;
result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
} else {
// Result was ALREADY sent to this host but never arrived.
@@ -677,8 +748,8 @@ int add_result_to_reply(
if (result.report_deadline < result.sent_time) {
result.report_deadline = result.sent_time + 10;
}
- if (result.report_deadline > result.sent_time + wu.delay_bound) {
- result.report_deadline = result.sent_time + wu.delay_bound;
+ if (result.report_deadline > result.sent_time + delay_bound) {
+ result.report_deadline = result.sent_time + delay_bound;
}
log_messages.printf(
@@ -767,6 +838,8 @@ int send_work(
reply.wreq.core_client_version = sreq.core_client_major_version*100
+ sreq.core_client_minor_version;
reply.wreq.nresults = 0;
+ get_host_details(reply); // parse project prefs for app details
+ reply.wreq.beta_only = false;
log_messages.printf(
SCHED_MSG_LOG::MSG_NORMAL,
@@ -800,16 +873,27 @@ int send_work(
} else {
// give top priority to results that require a 'reliable host'
//
- double expavg_credit = reply.host.expavg_credit;
- double expavg_time = reply.host.expavg_time;
- update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
- if ((expavg_credit/reply.host.p_ncpus) > 70) {
+ if (reply.wreq.host_dtls.reliable) {
reply.wreq.reliable_only = true;
reply.wreq.infeasible_only = false;
- log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] is reliable\n", reply.host.id);
scan_work_array(sreq, reply, platform, ss);
}
reply.wreq.reliable_only = false;
+
+ // give 2nd priority to results that are for a beta app
+ // (projects should load beta work with care
+        // otherwise your users won't get production work done!)
+ //
+ if (reply.wreq.host_dtls.allow_beta_work) {
+ reply.wreq.beta_only=true;
+ log_messages.printf(
+ SCHED_MSG_LOG::MSG_DEBUG,
+ "[HOST#%d] will accept beta work. Scanning for beta work.\n",
+ reply.host.id
+ );
+ scan_work_array(sreq, reply, platform, ss);
+ }
+ reply.wreq.beta_only=false;
// give next priority to results that were infeasible for some other host
//
@@ -935,6 +1019,14 @@ int send_work(
reply.set_delay(delay_time);
}
}
+
+ // free memory
+ //
+ for (int i=0; i preferred_apps;
+};
+
// summary of a client's request for work, and our response to it
//
struct WORK_REQ {
bool infeasible_only;
bool reliable_only;
+ bool beta_only;
+ HOST_DETAILS host_dtls;
double seconds_to_fill;
// in "normalized CPU seconds" (see doc/work_req.php)
double disk_available;