- scheduler: add <have_cuda_apps> config flag.

If set, the "effective NCPUS" (which is used to scale
    daily_result_quota and max_wus_in_progress)
    is max'd with the # of CUDA GPUs.

svn path=/trunk/boinc/; revision=16246
David Anderson 2008-10-21 23:16:07 +00:00
parent 1a409685e5
commit 5039207e2c
8 changed files with 62 additions and 34 deletions

checkin_notes

@@ -8511,3 +8511,16 @@ David 21 Oct 2008
- make "make distclean" work in all directories
samples/*/Makefile
David 21 Oct 2008
- scheduler: add <have_cuda_apps> config flag.
If set, the "effective NCPUS" (which is used to scale
daily_result_quota and max_wus_in_progress)
is max'd with the # of CUDA GPUs.
sched/
sched_array.cpp
sched_config.cpp,h
sched_locality.cpp
sched_send.cpp,h
server_types.h
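
As a usage note (not part of this diff): a project that hosts CUDA applications enables the new flag in the scheduler's config.xml. A minimal sketch, assuming the usual <boinc><config> layout that SCHED_CONFIG::parse() reads; the surrounding elements are placeholders:

    <boinc>
        <config>
            <!-- ... other scheduler options ... -->
            <have_cuda_apps/>
        </config>
    </boinc>

Given the parse_bool() call added in sched_config.cpp below, <have_cuda_apps>1</have_cuda_apps> should be accepted as well.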

sched/sched_array.cpp

@@ -54,7 +54,7 @@ void scan_work_array(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
rnd_off = rand() % ssp->max_wu_results;
for (j=0; j<ssp->max_wu_results; j++) {
i = (j+rnd_off) % ssp->max_wu_results;
if (!reply.work_needed()) break;
if (!work_needed(sreq, reply, false)) break;
WU_RESULT& wu_result = ssp->wu_results[i];

sched/sched_config.cpp

@@ -184,6 +184,7 @@ int SCHED_CONFIG::parse(FILE* f) {
if (xp.parse_bool(tag, "use_credit_multiplier", use_credit_multiplier)) continue;
if (xp.parse_bool(tag, "multiple_clients_per_host", multiple_clients_per_host)) continue;
if (xp.parse_bool(tag, "no_vista_sandbox", no_vista_sandbox)) continue;
if (xp.parse_bool(tag, "have_cuda_apps", have_cuda_apps)) continue;
if (xp.parse_bool(tag, "debug_version_select", debug_version_select)) continue;

sched/sched_config.h

@@ -125,6 +125,7 @@ public:
bool use_credit_multiplier;
bool multiple_clients_per_host;
bool no_vista_sandbox;
bool have_cuda_apps;
// log flags
//

sched/sched_locality.cpp

@@ -488,7 +488,7 @@ static int send_results_for_file(
nsent = 0;
if (!reply.work_needed(true)) {
if (!work_needed(sreq, reply, true)) {
return 0;
}
@@ -519,7 +519,7 @@ static int send_results_for_file(
for (i=0; i<100; i++) { // avoid infinite loop
int query_retval;
if (!reply.work_needed(true)) break;
if (!work_needed(sreq, reply, true)) break;
log_messages.printf(MSG_DEBUG,
"in_send_results_for_file(%s, %d) prev_result.id=%d\n", filename, i, prev_result.id
@@ -740,7 +740,7 @@ static int send_new_file_work_deterministic_seeded(
if (retval==ERR_NO_APP_VERSION || retval==ERR_INSUFFICIENT_RESOURCE) return retval;
if (nsent>0 || !reply.work_needed(true)) break;
if (nsent>0 || !work_needed(sreq, reply, true)) break;
// construct a name which is lexically greater than the name of any result
// which uses this file.
sprintf(min_resultname, "%s__~", filename);
@@ -793,7 +793,7 @@ static int send_new_file_work_deterministic(
// continue deterministic search at the lexically first possible
// filename, up to the randomly chosen one
if (!getfile_retval && reply.work_needed(true)) {
if (!getfile_retval && work_needed(sreq, reply, true)) {
send_new_file_work_deterministic_seeded(
sreq, reply, nsent, "", start_filename
);
@@ -834,7 +834,7 @@ static int send_new_file_work(
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
) {
while (reply.work_needed(true)) {
while (work_needed(sreq, reply, true)) {
int retval_sow, retval_snfwws;
double frac=((double)rand())/(double)RAND_MAX;
int now = time(0);
@@ -857,7 +857,7 @@ static int send_new_file_work(
if (retval_sow==ERR_NO_APP_VERSION || retval_sow==ERR_INSUFFICIENT_RESOURCE) return retval_sow;
while (reply.work_needed(true) && retry<5) {
while (work_needed(sreq, reply, true) && retry<5) {
log_messages.printf(MSG_DEBUG,
"send_new_file_work(%d): try to send from working set\n", retry
);
@@ -867,7 +867,7 @@ static int send_new_file_work(
}
if (reply.work_needed(true)) {
if (work_needed(sreq, reply, true)) {
log_messages.printf(MSG_DEBUG,
"send_new_file_work(): try deterministic method\n"
);
@@ -878,7 +878,7 @@ static int send_new_file_work(
return 1;
}
}
} // while reply.work_needed(true)
} // while work_needed(sreq, reply, true)
return 0;
}
@@ -898,7 +898,7 @@ static int send_old_work(
DB_RESULT result;
int now=time(0);
if (!reply.work_needed(true)) {
if (!work_needed(sreq, reply, true)) {
return 0;
}
@@ -1067,7 +1067,7 @@ void send_work_locality(
int k = (i+j)%nfiles;
int retval_srff;
if (!reply.work_needed(true)) break;
if (!work_needed(sreq, reply, true)) break;
FILE_INFO& fi = sreq.file_infos[k];
retval_srff=send_results_for_file(
fi.name, nsent, sreq, reply, false
@@ -1081,7 +1081,7 @@ void send_work_locality(
// If the work was not sent for some other (dynamic) reason, such as
// insufficient cpu, then DON'T delete the file.
//
if (nsent == 0 && reply.work_needed(true) && config.file_deletion_strategy == 1) {
if (nsent == 0 && work_needed(sreq, reply, true) && config.file_deletion_strategy == 1) {
reply.file_deletes.push_back(fi);
log_messages.printf(MSG_DEBUG,
"[HOST#%d]: delete file %s (not needed)\n", reply.host.id, fi.name
@@ -1104,7 +1104,7 @@ void send_work_locality(
// send new files if needed
//
if (reply.work_needed(true)) {
if (work_needed(sreq, reply, true)) {
send_new_file_work(sreq, reply);
}
}

sched/sched_send.cpp

@@ -77,10 +77,21 @@ const char* infeasible_string(int code) {
const int MIN_SECONDS_TO_SEND = 0;
const int MAX_SECONDS_TO_SEND = (28*SECONDS_IN_DAY);
inline int effective_ncpus(HOST& host) {
int ncpus = host.p_ncpus;
// return a number that
// - is the # of CPUs in EDF simulation
// - scales the daily result quota
// - scales max_wus_in_progress
inline int effective_ncpus(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
int ncpus = reply.host.p_ncpus;
if (ncpus > config.max_ncpus) ncpus = config.max_ncpus;
if (ncpus < 1) ncpus = 1;
if (config.have_cuda_apps) {
COPROC* cp = sreq.coprocs.lookup("cuda");
if (cp && cp->count > ncpus) {
ncpus = cp->count;
}
}
return ncpus;
}
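
For intuition, a small standalone sketch (not part of the diff; all numbers hypothetical) of how the value returned by effective_ncpus() scales the two limits that work_needed() checks further down:

    // Illustration only: a host with 2 CPUs and 4 CUDA GPUs, on a project
    // that sets <have_cuda_apps>. Names mirror the scheduler's fields; the
    // values are made up.
    #include <algorithm>
    #include <cstdio>

    int main() {
        int p_ncpus = 2;              // host.p_ncpus
        int cuda_gpus = 4;            // count from sreq.coprocs.lookup("cuda")
        int max_results_day = 100;    // host.max_results_day
        int max_wus_in_progress = 2;  // config.max_wus_in_progress

        // with <have_cuda_apps> set, effective NCPUS is max'd with the GPU count
        int ncpus = std::max(p_ncpus, cuda_gpus);                          // 4

        printf("daily result quota:  %d\n", ncpus * max_results_day);      // 400
        printf("in-progress job cap: %d\n", ncpus * max_wus_in_progress);  // 8
        return 0;
    }

The point of the flag: a GPU-heavy host is no longer throttled by a small CPU count when the project actually has CUDA work to send.
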
@@ -653,7 +664,7 @@ int wu_is_infeasible_fast(
}
IP_RESULT candidate("", wu.delay_bound, est_cpu);
strcpy(candidate.name, wu.name);
if (check_candidate(candidate, effective_ncpus(reply.host), request.ip_results)) {
if (check_candidate(candidate, effective_ncpus(request, reply), request.ip_results)) {
// it passed the feasibility test,
// but don't add it to the workload yet;
// wait until we commit to sending it
@@ -833,47 +844,49 @@ void unlock_sema() {
// and we haven't exceeded results per RPC limit,
// and we haven't exceeded results per day limit
//
bool SCHEDULER_REPLY::work_needed(bool locality_sched) {
bool work_needed(
SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, bool locality_sched
) {
if (locality_sched) {
// if we've failed to send a result because of a transient condition,
// return false to preserve invariant
//
if (wreq.disk.insufficient || wreq.speed.insufficient || wreq.mem.insufficient || wreq.no_allowed_apps_available) {
if (reply.wreq.disk.insufficient || reply.wreq.speed.insufficient || reply.wreq.mem.insufficient || reply.wreq.no_allowed_apps_available) {
return false;
}
}
if (wreq.seconds_to_fill <= 0) return false;
if (wreq.disk_available <= 0) {
if (reply.wreq.seconds_to_fill <= 0) return false;
if (reply.wreq.disk_available <= 0) {
return false;
}
if (wreq.nresults >= config.max_wus_to_send) return false;
if (reply.wreq.nresults >= config.max_wus_to_send) return false;
int ncpus = effective_ncpus(host);
int ncpus = effective_ncpus(sreq, reply);
// host.max_results_day is between 1 and config.daily_result_quota inclusive
// wreq.daily_result_quota is between ncpus
// and ncpus*host.max_results_day inclusive
//
if (config.daily_result_quota) {
if (host.max_results_day == 0 || host.max_results_day>config.daily_result_quota) {
host.max_results_day = config.daily_result_quota;
if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) {
reply.host.max_results_day = config.daily_result_quota;
}
wreq.daily_result_quota = ncpus*host.max_results_day;
if (host.nresults_today >= wreq.daily_result_quota) {
wreq.daily_result_quota_exceeded = true;
reply.wreq.daily_result_quota = ncpus*reply.host.max_results_day;
if (reply.host.nresults_today >= reply.wreq.daily_result_quota) {
reply.wreq.daily_result_quota_exceeded = true;
return false;
}
}
if (config.max_wus_in_progress) {
if (wreq.nresults_on_host >= config.max_wus_in_progress*ncpus) {
if (reply.wreq.nresults_on_host >= config.max_wus_in_progress*ncpus) {
if (config.debug_send) {
log_messages.printf(MSG_DEBUG,
"in-progress job limit exceeded; %d > %d*%d\n",
wreq.nresults_on_host, config.max_wus_in_progress, ncpus
reply.wreq.nresults_on_host, config.max_wus_in_progress, ncpus
);
}
wreq.cache_size_exceeded = true;
reply.wreq.cache_size_exceeded = true;
return false;
}
}
@@ -1022,7 +1035,7 @@ int add_result_to_reply(
result.bavp = bavp;
reply.insert_result(result);
reply.wreq.seconds_to_fill -= wu_seconds_filled;
request.estimated_delay += wu_seconds_filled/effective_ncpus(reply.host);
request.estimated_delay += wu_seconds_filled/effective_ncpus(request, reply);
reply.wreq.nresults++;
reply.wreq.nresults_on_host++;
if (!resent_result) reply.host.nresults_today++;
@@ -1407,7 +1420,7 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
if (config.workload_sim && sreq.have_other_results_list) {
init_ip_results(
sreq.global_prefs.work_buf_min(), effective_ncpus(reply.host), sreq.ip_results
sreq.global_prefs.work_buf_min(), effective_ncpus(sreq, reply), sreq.ip_results
);
}
@@ -1450,7 +1463,7 @@ struct JOB_SET {
disk_usage = 0;
disk_limit = reply.wreq.disk_available;
max_jobs = config.max_wus_to_send;
int ncpus = effective_ncpus(reply.host), n;
int ncpus = effective_ncpus(sreq, reply), n;
if (config.daily_result_quota) {
if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) {

sched/sched_send.h

@@ -62,3 +62,4 @@ extern void lock_sema();
extern void unlock_sema();
extern const char* infeasible_string(int);
extern bool app_not_selected(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
extern bool work_needed(SCHEDULER_REQUEST&, SCHEDULER_REPLY&, bool);

sched/server_types.h

@@ -363,7 +363,6 @@ struct SCHEDULER_REPLY {
void insert_workunit_unique(WORKUNIT&);
void insert_result(RESULT&);
void insert_message(USER_MESSAGE&);
bool work_needed(bool locality_sched=false);
void set_delay(double);
void got_good_result(); // adjust max_results_day
void got_bad_result(); // adjust max_results_day