*** empty log message ***

svn path=/trunk/boinc/; revision=11336
This commit is contained in:
David Anderson 2006-10-22 00:42:44 +00:00
parent ba8bea7a2d
commit c98a2415af
13 changed files with 240 additions and 58 deletions

View File

@ -11331,3 +11331,42 @@ Charlie 21 Oct 2006
mac_build/
boinc.xcodeproj/
project.pbxproj
David 21 Oct 2006
- Server: apps can be labeled as "beta",
and jobs for them will be sent only to users
with "<allow_beta_work>1</allow_beta_work>" in their project preferences.
This lets you do application testing without
creating a separate project,
which has advantages both for you (one project instead of two)
and for testers (their testing gives them points
in your main project)
- Scheduler: refined the criteria for "reliable" hosts
to which old results are sent,
and added a param for reducing delay bound
when sending results to reliable hosts
- add -sleep_interval option to assimilator
- move -reliable flag from feeder
- add to sched config:
reliable_min_avg_credit
reliable_min_avg_turnaround
reliable_time
reliable_reduced_delay_bound
(from Kevin Reed)
NOTES: updating your server software to this version or later requires
a small database change (see html/ops/db_update.php)
db/
boinc_db.C,h
lib/
parse.C
sched/
assimilator.C
feeder.C
sched_array.C
sched_config.C,h
sched_locality.C
sched_send.C,h
server_types.h

View File

@ -129,9 +129,9 @@ void DB_PLATFORM::db_parse(MYSQL_ROW &r) {
// Write this app's fields into buf as a comma-separated list of
// name=value pairs, including the beta flag.
// buf is written with sprintf(), so no bounds checking is done here;
// the caller must supply a buffer large enough for the formatted text.
//
void DB_APP::db_print(char* buf){
    sprintf(buf,
        "create_time=%d, name='%s', min_version=%d, "
        "deprecated=%d, user_friendly_name='%s', homogeneous_redundancy=%d, weight=%f, beta=%d",
        create_time, name, min_version,
        deprecated?1:0, user_friendly_name, homogeneous_redundancy?1:0, weight, beta
    );
}
@ -146,6 +146,7 @@ void DB_APP::db_parse(MYSQL_ROW &r) {
strcpy2(user_friendly_name, r[i++]);
homogeneous_redundancy = atoi(r[i++]);
weight = atof(r[i++]);
beta = atoi(r[i++]);
}
void DB_APP_VERSION::db_print(char* buf){

View File

@ -69,6 +69,7 @@ struct APP {
bool homogeneous_redundancy;
double weight; // tells the feeder what fraction of results
// should come from this app
int beta;
int write(FILE*);
void clear();

View File

@ -286,14 +286,19 @@ void extract_venue(char* in, char* venue_name, char* out) {
q = strstr(out, "</venue");
if (q) *q = 0;
} else {
strcpy(out, in);
while (1) {
p = strstr(out, "<venue");
if (!p) break;
q = strstr(p, "</venue>\n");
if (!q) break;
strcpy(p, q+strlen("</venue>\n"));
}
q = in;
strcpy(out, "");
while (1) {
p = strstr(q, "<venue");
if (!p) {
strcat(out, q);
break;
}
strncat(out, q, p-q);
q = strstr(p, "</venue>");
if (!q) break;
q += strlen("</venue>");
}
}
}

View File

@ -51,6 +51,8 @@ int wu_id_modulus=0, wu_id_remainder=0;
#define SLEEP_INTERVAL 10
int sleep_interval = SLEEP_INTERVAL;
int one_pass_N_WU=0;
// assimilate all WUs that need it
@ -154,6 +156,8 @@ int main(int argc, char** argv) {
} else if (!strcmp(argv[i], "-one_pass_N_WU")) {
one_pass_N_WU = atoi(argv[++i]);
one_pass = true;
} else if (!strcmp(argv[i], "-sleep_interval")) {
sleep_interval = atoi(argv[++i]);
} else if (!strcmp(argv[i], "-one_pass")) {
one_pass = true;
} else if (!strcmp(argv[i], "-d")) {
@ -214,7 +218,7 @@ int main(int argc, char** argv) {
while (1) {
if (!do_pass(app)) {
if (one_pass) break;
sleep(SLEEP_INTERVAL);
sleep(sleep_interval);
}
}
}

View File

@ -32,7 +32,6 @@
// [ -purge_stale x ] remove work items from the shared memory segment
// that have been there for longer than x minutes
// but haven't been assigned
// [ -reliable x ] flag results for workunits older then x days as "need_reliable"
//
// Creates a shared memory segment containing DB info,
// including the work array (results/workunits to send).
@ -143,7 +142,6 @@ char select_clause[256];
double sleep_interval = DEFAULT_SLEEP_INTERVAL;
bool all_apps = false;
int purge_stale_time = 0;
int reliable_time = 0;
void cleanup_shmem() {
ssp->ready = false;
@ -406,8 +404,8 @@ static void scan_work_array(
// workunits older than the specified time as needing a reliable
// host
wu_result.need_reliable = 0;
if (reliable_time) {
if ((wu_result.workunit.create_time + reliable_time*86400) <= time(0)) {
if (config.reliable_time) {
if ((wu_result.workunit.create_time + config.reliable_time) <= time(0)) {
wu_result.need_reliable = true;
}
}
@ -526,13 +524,11 @@ int main(int argc, char** argv) {
} else if (!strcmp(argv[i], "-allapps")) {
all_apps = true;
} else if (!strcmp(argv[i], "-priority_order")) {
order_clause = "order by priority desc ";
order_clause = "order by result.priority desc ";
} else if (!strcmp(argv[i], "-priority_order_create_time")) {
order_clause = "order by priority desc, workunit.create_time ";
order_clause = "order by result.priority desc, workunit.create_time ";
} else if (!strcmp(argv[i], "-purge_stale")) {
purge_stale_time = atoi(argv[++i])*60;
} else if (!strcmp(argv[i], "-reliable")) {
reliable_time = atoi(argv[++i]);
} else if (!strcmp(argv[i], "-mod")) {
int n = atoi(argv[++i]);
int j = atoi(argv[++i]);

View File

@ -79,24 +79,44 @@ void scan_work_array(
continue;
}
// If this is a reliable host and we are checking for results that
// need a reliable host, then continue if the result is a normal result
// If we are looking for beta results and result is not a beta result
// then move on
//
if (reply.wreq.reliable_only && (!wu_result.need_reliable)) {
continue;
APP* app = ss.lookup_app(wu_result.workunit.appid);
if (app == NULL) continue; // this should never happen
if (reply.wreq.beta_only) {
if (!app->beta) {
continue;
}
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d] beta work found. Result id %d \n",
reply.host.id
);
} else {
if (app->beta) {
continue;
}
}
// If this is a reliable host and we are checking for results that
// need a reliable host, then continue if the result is a normal result
// skip if the app is beta (beta apps don't use the reliable mechanism)
//
if (!app->beta) {
if (reply.wreq.reliable_only && (!wu_result.need_reliable)) {
continue;
} else if (!reply.wreq.reliable_only && wu_result.need_reliable) {
continue;
}
}
// If we are looking for infeasible results and the result is not infeasible
// then move on
//
if (reply.wreq.infeasible_only && (wu_result.infeasible_count==0)) {
continue;
}
// Never send a result that needs a reliable host to one that
// has not earned credit
//
if (wu_result.need_reliable && reply.host.total_credit == 0) {
continue;
}
// don't send if we're already sending a result for same WU
//
if (config.one_result_per_user_per_wu) {
@ -108,7 +128,7 @@ void scan_work_array(
// don't send if host can't handle it
//
wu = wu_result.workunit;
if (wu_is_infeasible(wu, sreq, reply)) {
if (wu_is_infeasible(wu, sreq, reply, ss)) {
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] [WU#%d %s] WU is infeasible\n",
reply.host.id, wu.id, wu.name

View File

@ -120,8 +120,12 @@ int SCHED_CONFIG::parse(FILE* f) {
else if (xp.parse_bool(tag, "dont_delete_batches", dont_delete_batches)) continue;
else if (xp.parse_int(tag, "sched_debug_level", sched_debug_level)) continue;
else if (xp.parse_int(tag, "fuh_debug_level", fuh_debug_level)) continue;
// tags the scheduler doesn't care about
else if (xp.parse_int(tag, "reliable_min_avg_credit", reliable_min_avg_credit)) continue;
else if (xp.parse_int(tag, "reliable_min_avg_turnaround", reliable_min_avg_turnaround)) continue;
else if (xp.parse_int(tag, "reliable_time", reliable_time)) continue;
else if (xp.parse_double(tag, "reliable_reduced_delay_bound", reliable_reduced_delay_bound)) continue;
// some tags that scheduler doesn't care about
//
else if (xp.parse_str(tag, "cgi_url", temp, sizeof(temp))) continue;
else if (xp.parse_str(tag, "log_dir", temp, sizeof(temp))) continue;

View File

@ -76,6 +76,10 @@ public:
bool dont_delete_batches;
int sched_debug_level;
int fuh_debug_level;
int reliable_time; // age of workunit before requiring reliable
int reliable_min_avg_credit;
int reliable_min_avg_turnaround;
double reliable_reduced_delay_bound;
int parse(FILE*);
int parse_file(const char* dir=".");

View File

@ -287,7 +287,7 @@ static int possibly_send_result(
// why the WU is not feasible. These are defined in sched_send.h.
// INFEASIBLE_MEM, INFEASIBLE_DISK, INFEASIBLE_CPU.
//
if (wu_is_infeasible(wu, sreq, reply)) {
if (wu_is_infeasible(wu, sreq, reply, ss)) {
return ERR_INSUFFICIENT_RESOURCE;
}

View File

@ -248,25 +248,76 @@ static double estimate_wallclock_duration(
return ewd;
}
// scan user's project prefs for elements of the form <app_id>N</app_id>,
// indicating the apps they want to run.
// Find or compute various details for the host.
// These parameters affect how work is sent to the host
//
static int find_allowed_apps(
SCHEDULER_REPLY& reply, std::vector<int> *app_ids
) {
// Fill in reply.wreq.host_dtls from the user's project preferences
// (restricted to the host's venue) and from the host record:
//   preferred_apps   one entry per <app_id>N</app_id> element
//   allow_beta_work  value of <allow_beta_work>N</allow_beta_work>, 0 if absent
//   reliable         true if the host meets the configured credit and
//                    turnaround thresholds
// Entries pushed onto preferred_apps are allocated with new;
// send_work() deletes them when it finishes.
// Always returns 0.
//
static int get_host_details(SCHEDULER_REPLY& reply) {
    char buf[8096];
    int temp_int;

    // start from a known state so stale or uninitialized values
    // can't leak into scheduling decisions
    //
    reply.wreq.host_dtls.allow_beta_work = 0;
    reply.wreq.host_dtls.reliable = false;

    extract_venue(reply.user.project_prefs, reply.host.venue, buf);

    // scan user's project prefs for elements of the form <app_id>N</app_id>,
    // indicating the apps they want to run.
    // After each match, resume one character past the tag just consumed.
    //
    const char* scan = buf;
    while (parse_int(scan, "<app_id>", temp_int)) {
        USER_APP_DTL* app_dtl = new USER_APP_DTL();
        app_dtl->appid = temp_int;
        app_dtl->work_available = 0;
        reply.wreq.host_dtls.preferred_apps.push_back(app_dtl);

        const char* tag = strstr(scan, "<app_id>");
        if (!tag) break;    // can't advance; don't rescan the same text forever
        scan = tag + 1;
    }

    // use the parsed value, not parse_int()'s success flag,
    // so that <allow_beta_work>0</allow_beta_work> means "no"
    //
    if (parse_int(buf, "<allow_beta_work>", temp_int)) {
        reply.wreq.host_dtls.allow_beta_work = temp_int;
    }

    // Decide whether or not this computer is a 'reliable' computer
    //
    double expavg_credit = reply.host.expavg_credit;
    double expavg_time = reply.host.expavg_time;
    double avg_turnaround = reply.host.avg_turnaround;
    update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);

    // hosts whose OS name contains neither "Windows" nor "Linux"
    // get relaxed thresholds:
    // 75% of the credit minimum, 125% of the turnaround limit
    //
    double credit_scale = 1.0;
    double turnaround_scale = 1.0;
    if (!strstr(reply.host.os_name, "Windows") && !strstr(reply.host.os_name, "Linux")) {
        credit_scale = .75;
        turnaround_scale = 1.25;
    }

    // treat a missing/zero CPU count as 1 rather than dividing by zero
    //
    double ncpus = (reply.host.p_ncpus > 0) ? reply.host.p_ncpus : 1;

    // a threshold of 0 in the config disables that criterion
    //
    bool credit_ok = (config.reliable_min_avg_credit == 0)
        || ((expavg_credit/ncpus) > config.reliable_min_avg_credit*credit_scale);
    bool turnaround_ok = (config.reliable_min_avg_turnaround == 0)
        || (avg_turnaround < config.reliable_min_avg_turnaround*turnaround_scale);

    if (credit_ok && turnaround_ok) {
        reply.wreq.host_dtls.reliable = true;
        log_messages.printf(SCHED_MSG_LOG::MSG_NORMAL,
            "[HOST#%d] is reliable (OS = %s) expavg_credit = %.0f avg_turnaround(hours) = %.0f \n",
            reply.host.id, reply.host.os_name, expavg_credit,
            avg_turnaround/3600
        );
    }
    return 0;
}
// Search the host's preferred-app list for the given app ID.
// Returns the index of the first entry in
// reply.wreq.host_dtls.preferred_apps whose appid matches,
// or -1 if no entry matches.
//
int find_preferred_app_index(SCHEDULER_REPLY& reply, int appid) {
    unsigned int idx = 0;
    while (idx < reply.wreq.host_dtls.preferred_apps.size()) {
        if (reply.wreq.host_dtls.preferred_apps[idx]->appid == appid) {
            return static_cast<int>(idx);
        }
        idx++;
    }
    return -1;
}
// if the WU can't be executed on the host, return a bitmap of reasons why.
// Reasons include:
// 1) the host doesn't have enough memory;
@ -279,7 +330,8 @@ static int find_allowed_apps(
// In particular it doesn't enforce the one-result-per-user-per-wu rule
//
int wu_is_infeasible(
WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply
WORKUNIT& wu, SCHEDULER_REQUEST& request, SCHEDULER_REPLY& reply,
SCHED_SHMEM& ss
) {
int reason = 0;
unsigned int i;
@ -288,17 +340,25 @@ int wu_is_infeasible(
// If they have then only send work for the allowed applications
// TODO: call find_allowed_apps() only once, not once for each WU!!
//
std::vector<int> app_ids;
find_allowed_apps(reply, &app_ids);
if (app_ids.size() > 0) {
bool app_allowed = false;
for(i=0; i<app_ids.size(); i++) {
if (wu.appid==app_ids[i]) {
bool app_allowed = false;
if (reply.wreq.host_dtls.preferred_apps.size() > 0) {
for (i=0; i<reply.wreq.host_dtls.preferred_apps.size(); i++) {
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"Scanning preferred apps. index=%d, appid=%d, work_avail=%d\n",
i, reply.wreq.host_dtls.preferred_apps[i]->appid,
reply.wreq.host_dtls.preferred_apps[i]->work_available
);
if (wu.appid==reply.wreq.host_dtls.preferred_apps[i]->appid) {
app_allowed = true;
reply.wreq.host_dtls.preferred_apps[i]->work_available=1;
break;
}
}
if (!app_allowed) {
// Only mark infeasible if we are looking at user preferred apps only
//
if (!app_allowed && !reply.wreq.beta_only) {
reply.wreq.no_allowed_apps_available = true;
reason |= INFEASIBLE_APP_SETTING;
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
@ -661,10 +721,21 @@ int add_result_to_reply(
result.sent_time = time(0);
int old_server_state = result.server_state;
// If the workunit needs a reliable host and is being sent to one,
// then scale the delay bound by config.reliable_reduced_delay_bound
//
int delay_bound = wu.delay_bound;
if (config.reliable_time && reply.wreq.host_dtls.reliable && config.reliable_reduced_delay_bound > 0.01) {
if ((wu.create_time + config.reliable_time) <= time(0)) {
delay_bound = (int) (delay_bound * config.reliable_reduced_delay_bound);
}
}
if (result.server_state != RESULT_SERVER_STATE_IN_PROGRESS) {
// We are sending this result for the first time
//
result.report_deadline = result.sent_time + wu.delay_bound;
result.report_deadline = result.sent_time + delay_bound;
result.server_state = RESULT_SERVER_STATE_IN_PROGRESS;
} else {
// Result was ALREADY sent to this host but never arrived.
@ -677,8 +748,8 @@ int add_result_to_reply(
if (result.report_deadline < result.sent_time) {
result.report_deadline = result.sent_time + 10;
}
if (result.report_deadline > result.sent_time + wu.delay_bound) {
result.report_deadline = result.sent_time + wu.delay_bound;
if (result.report_deadline > result.sent_time + delay_bound) {
result.report_deadline = result.sent_time + delay_bound;
}
log_messages.printf(
@ -767,6 +838,8 @@ int send_work(
reply.wreq.core_client_version = sreq.core_client_major_version*100
+ sreq.core_client_minor_version;
reply.wreq.nresults = 0;
get_host_details(reply); // parse project prefs for app details
reply.wreq.beta_only = false;
log_messages.printf(
SCHED_MSG_LOG::MSG_NORMAL,
@ -800,16 +873,27 @@ int send_work(
} else {
// give top priority to results that require a 'reliable host'
//
double expavg_credit = reply.host.expavg_credit;
double expavg_time = reply.host.expavg_time;
update_average(0, 0, CREDIT_HALF_LIFE, expavg_credit, expavg_time);
if ((expavg_credit/reply.host.p_ncpus) > 70) {
if (reply.wreq.host_dtls.reliable) {
reply.wreq.reliable_only = true;
reply.wreq.infeasible_only = false;
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG, "[HOST#%d] is reliable\n", reply.host.id);
scan_work_array(sreq, reply, platform, ss);
}
reply.wreq.reliable_only = false;
// give 2nd priority to results that are for a beta app
// (projects should load beta work with care,
// otherwise your users won't get production work done!)
//
if (reply.wreq.host_dtls.allow_beta_work) {
reply.wreq.beta_only=true;
log_messages.printf(
SCHED_MSG_LOG::MSG_DEBUG,
"[HOST#%d] will accept beta work. Scanning for beta work.\n",
reply.host.id
);
scan_work_array(sreq, reply, platform, ss);
}
reply.wreq.beta_only=false;
// give next priority to results that were infeasible for some other host
//
@ -935,6 +1019,14 @@ int send_work(
reply.set_delay(delay_time);
}
}
// free memory
//
for (int i=0; i<reply.wreq.host_dtls.preferred_apps.size(); i++) {
delete(reply.wreq.host_dtls.preferred_apps[i]);
}
reply.wreq.host_dtls.preferred_apps.clear();
return 0;
}

View File

@ -49,8 +49,8 @@ extern bool app_core_compatible(WORK_REQ& wreq, APP_VERSION& av);
#define INFEASIBLE_WORK_BUF 8
#define INFEASIBLE_APP_SETTING 16
extern int wu_is_infeasible(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
extern int wu_is_infeasible(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&, SCHED_SHMEM&);
extern double max_allowable_disk(SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
extern bool wu_already_in_reply(WORKUNIT& wu, SCHEDULER_REPLY& reply);

View File

@ -27,11 +27,27 @@
#include "common_defs.h"
#include "md5_file.h"
// One application listed in a user's project preferences
// (an <app_id>N</app_id> element), as collected by the scheduler
//
struct USER_APP_DTL {
int appid;
    // application ID parsed from the <app_id> element
int reason;
    // NOTE(review): not referenced in the scheduler code shown with this
    // change; presumably an INFEASIBLE_* bitmap -- confirm
int work_available;
    // initialized to 0; set to 1 when a workunit for this app
    // is seen while checking feasibility
};
// Details concerning a host's settings,
// filled in by get_host_details() in sched_send.C
//
struct HOST_DETAILS {
int allow_beta_work;
    // nonzero if the host may be sent work for beta apps
bool reliable;
    // set to true when the host meets the configured
    // "reliable host" credit and turnaround criteria
std::vector<USER_APP_DTL*> preferred_apps;
    // apps named in the user's project preferences;
    // entries are allocated with new and deleted at the end of send_work()
};
// summary of a client's request for work, and our response to it
//
struct WORK_REQ {
bool infeasible_only;
bool reliable_only;
bool beta_only;
HOST_DETAILS host_dtls;
double seconds_to_fill;
// in "normalized CPU seconds" (see doc/work_req.php)
double disk_available;