mirror of https://github.com/BOINC/boinc.git
- scheduler: add <have_cuda_apps> config flag.
If set the "effective NCPUS" (which is used to scale daily_result_quota and max_wus_in_progress) is max'd with the # of CUDA GPUs.

svn path=/trunk/boinc/; revision=16246
parent 1a409685e5 · commit 5039207e2c
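The flag is read by SCHED_CONFIG::parse as a boolean XML element (see the sched_config.cpp hunk below), so a project serving CUDA apps would turn it on in its scheduler configuration. A minimal sketch of what that might look like, assuming the usual config.xml layout of a BOINC project (the surrounding elements and the other settings shown are illustrative, not part of this commit):

<boinc>
    <config>
        <have_cuda_apps>1</have_cuda_apps>
        <daily_result_quota>100</daily_result_quota>
        <max_wus_in_progress>5</max_wus_in_progress>
    </config>
</boinc>

With this set, a host's daily result quota and in-progress limit scale with the larger of its CPU count and its CUDA GPU count.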
checkin_notes
@@ -8511,3 +8511,16 @@ David 21 Oct 2008
 - make "make distclean" work in all directories
 
     samples/*/Makefile
+
+David 21 Oct 2008
+    - scheduler: add <have_cuda_apps> config flag.
+        If set the "effective NCPUS" (which is used to scale
+        daily_result_quota and max_wus_in_progress)
+        is max'd with the # of CUDA GPUs.
+
+    sched/
+        sched_array.cpp
+        sched_config.cpp,h
+        sched_locality.cpp
+        sched_send.cpp,h
+        server_types.h

sched/sched_array.cpp
@@ -54,7 +54,7 @@ void scan_work_array(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
     rnd_off = rand() % ssp->max_wu_results;
     for (j=0; j<ssp->max_wu_results; j++) {
         i = (j+rnd_off) % ssp->max_wu_results;
-        if (!reply.work_needed()) break;
+        if (!work_needed(sreq, reply, false)) break;
 
         WU_RESULT& wu_result = ssp->wu_results[i];

sched/sched_config.cpp
@@ -184,6 +184,7 @@ int SCHED_CONFIG::parse(FILE* f) {
         if (xp.parse_bool(tag, "use_credit_multiplier", use_credit_multiplier)) continue;
         if (xp.parse_bool(tag, "multiple_clients_per_host", multiple_clients_per_host)) continue;
         if (xp.parse_bool(tag, "no_vista_sandbox", no_vista_sandbox)) continue;
+        if (xp.parse_bool(tag, "have_cuda_apps", have_cuda_apps)) continue;
 
         if (xp.parse_bool(tag, "debug_version_select", debug_version_select)) continue;

sched/sched_config.h
@@ -125,6 +125,7 @@ public:
     bool use_credit_multiplier;
     bool multiple_clients_per_host;
     bool no_vista_sandbox;
+    bool have_cuda_apps;
 
     // log flags
     //

sched/sched_locality.cpp
@@ -488,7 +488,7 @@ static int send_results_for_file(
 
     nsent = 0;
 
-    if (!reply.work_needed(true)) {
+    if (!work_needed(sreq, reply, true)) {
         return 0;
     }

@@ -519,7 +519,7 @@ static int send_results_for_file(
     for (i=0; i<100; i++) {     // avoid infinite loop
         int query_retval;
 
-        if (!reply.work_needed(true)) break;
+        if (!work_needed(sreq, reply, true)) break;
 
         log_messages.printf(MSG_DEBUG,
             "in_send_results_for_file(%s, %d) prev_result.id=%d\n", filename, i, prev_result.id

@@ -740,7 +740,7 @@ static int send_new_file_work_deterministic_seeded(
 
         if (retval==ERR_NO_APP_VERSION || retval==ERR_INSUFFICIENT_RESOURCE) return retval;
 
-        if (nsent>0 || !reply.work_needed(true)) break;
+        if (nsent>0 || !work_needed(sreq, reply, true)) break;
         // construct a name which is lexically greater than the name of any result
         // which uses this file.
         sprintf(min_resultname, "%s__~", filename);

@@ -793,7 +793,7 @@ static int send_new_file_work_deterministic(
 
     // continue deterministic search at lexically first possible
     // filename, continue to randomly choosen one
-    if (!getfile_retval && reply.work_needed(true)) {
+    if (!getfile_retval && work_needed(sreq, reply, true)) {
         send_new_file_work_deterministic_seeded(
             sreq, reply, nsent, "", start_filename
         );

@@ -834,7 +834,7 @@ static int send_new_file_work(
     SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply
 ) {
 
-    while (reply.work_needed(true)) {
+    while (work_needed(sreq, reply, true)) {
         int retval_sow, retval_snfwws;
         double frac=((double)rand())/(double)RAND_MAX;
         int now = time(0);

@@ -857,7 +857,7 @@ static int send_new_file_work(
         if (retval_sow==ERR_NO_APP_VERSION || retval_sow==ERR_INSUFFICIENT_RESOURCE) return retval_sow;
 
 
-        while (reply.work_needed(true) && retry<5) {
+        while (work_needed(sreq, reply, true) && retry<5) {
             log_messages.printf(MSG_DEBUG,
                 "send_new_file_work(%d): try to send from working set\n", retry
             );

@@ -867,7 +867,7 @@ static int send_new_file_work(
 
         }
 
-        if (reply.work_needed(true)) {
+        if (work_needed(sreq, reply, true)) {
             log_messages.printf(MSG_DEBUG,
                 "send_new_file_work(): try deterministic method\n"
             );

@@ -878,7 +878,7 @@ static int send_new_file_work(
                 return 1;
             }
         }
-    } // while reply.work_needed(true)
+    } // while reply.work_needed(sreq, reply, true)
     return 0;
 }

@@ -898,7 +898,7 @@ static int send_old_work(
     DB_RESULT result;
     int now=time(0);
 
-    if (!reply.work_needed(true)) {
+    if (!work_needed(sreq, reply, true)) {
         return 0;
     }

@@ -1067,7 +1067,7 @@ void send_work_locality(
         int k = (i+j)%nfiles;
         int retval_srff;
 
-        if (!reply.work_needed(true)) break;
+        if (!work_needed(sreq, reply, true)) break;
         FILE_INFO& fi = sreq.file_infos[k];
         retval_srff=send_results_for_file(
             fi.name, nsent, sreq, reply, false

@@ -1081,7 +1081,7 @@ void send_work_locality(
         // If the work was not sent for other (dynamic) reason such as insufficient
         // cpu, then DON'T delete the file.
         //
-        if (nsent == 0 && reply.work_needed(true) && config.file_deletion_strategy == 1) {
+        if (nsent == 0 && work_needed(sreq, reply, true) && config.file_deletion_strategy == 1) {
            reply.file_deletes.push_back(fi);
            log_messages.printf(MSG_DEBUG,
                "[HOST#%d]: delete file %s (not needed)\n", reply.host.id, fi.name

@@ -1104,7 +1104,7 @@ void send_work_locality(
 
     // send new files if needed
     //
-    if (reply.work_needed(true)) {
+    if (work_needed(sreq, reply, true)) {
         send_new_file_work(sreq, reply);
     }
 }

sched/sched_send.cpp
@@ -77,10 +77,21 @@ const char* infeasible_string(int code) {
 const int MIN_SECONDS_TO_SEND = 0;
 const int MAX_SECONDS_TO_SEND = (28*SECONDS_IN_DAY);
 
-inline int effective_ncpus(HOST& host) {
-    int ncpus = host.p_ncpus;
+// return a number that
+// - is the # of CPUs in EDF simulation
+// - scales the daily result quota
+// - scales max_wus_in_progress
+//
+inline int effective_ncpus(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
+    int ncpus = reply.host.p_ncpus;
     if (ncpus > config.max_ncpus) ncpus = config.max_ncpus;
     if (ncpus < 1) ncpus = 1;
+    if (config.have_cuda_apps) {
+        COPROC* cp = sreq.coprocs.lookup("cuda");
+        if (cp && cp->count > ncpus) {
+            ncpus = cp->count;
+        }
+    }
     return ncpus;
 }

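To see what the new effective_ncpus() does in practice, here is a small standalone sketch (not project code; the simplified structs and numbers are made up for illustration). A host reporting 2 CPUs and 4 CUDA GPUs now gets an effective NCPUS of 4 rather than 2, so the limits that scale with NCPUS grow accordingly.

#include <algorithm>
#include <cstdio>

// Simplified stand-ins for the scheduler's config, host, and coprocessor state.
struct SketchConfig { int max_ncpus = 64; bool have_cuda_apps = true; int max_wus_in_progress = 5; };
struct SketchHost   { int p_ncpus = 2; };
struct SketchCuda   { int count = 4; };   // # of CUDA GPUs reported by the client

// Mirrors the new logic: CPU count, clamped to [1, max_ncpus], then max'd with the GPU count.
int effective_ncpus_sketch(const SketchConfig& cfg, const SketchHost& host, const SketchCuda* cuda) {
    int ncpus = std::min(host.p_ncpus, cfg.max_ncpus);
    if (ncpus < 1) ncpus = 1;
    if (cfg.have_cuda_apps && cuda && cuda->count > ncpus) {
        ncpus = cuda->count;
    }
    return ncpus;
}

int main() {
    SketchConfig cfg;
    SketchHost host;
    SketchCuda cuda;
    int n = effective_ncpus_sketch(cfg, host, &cuda);
    // Prints: effective ncpus = 4, in-progress limit = 20
    std::printf("effective ncpus = %d, in-progress limit = %d\n", n, cfg.max_wus_in_progress * n);
    return 0;
}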
@@ -653,7 +664,7 @@ int wu_is_infeasible_fast(
         }
         IP_RESULT candidate("", wu.delay_bound, est_cpu);
         strcpy(candidate.name, wu.name);
-        if (check_candidate(candidate, effective_ncpus(reply.host), request.ip_results)) {
+        if (check_candidate(candidate, effective_ncpus(request, reply), request.ip_results)) {
             // it passed the feasibility test,
             // but don't add it the the workload yet;
             // wait until we commit to sending it

@@ -833,47 +844,49 @@ void unlock_sema() {
 // and we haven't exceeded result per RPC limit,
 // and we haven't exceeded results per day limit
 //
-bool SCHEDULER_REPLY::work_needed(bool locality_sched) {
+bool work_needed(
+    SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, bool locality_sched
+) {
     if (locality_sched) {
         // if we've failed to send a result because of a transient condition,
         // return false to preserve invariant
         //
-        if (wreq.disk.insufficient || wreq.speed.insufficient || wreq.mem.insufficient || wreq.no_allowed_apps_available) {
+        if (reply.wreq.disk.insufficient || reply.wreq.speed.insufficient || reply.wreq.mem.insufficient || reply.wreq.no_allowed_apps_available) {
             return false;
         }
     }
-    if (wreq.seconds_to_fill <= 0) return false;
-    if (wreq.disk_available <= 0) {
+    if (reply.wreq.seconds_to_fill <= 0) return false;
+    if (reply.wreq.disk_available <= 0) {
         return false;
     }
-    if (wreq.nresults >= config.max_wus_to_send) return false;
+    if (reply.wreq.nresults >= config.max_wus_to_send) return false;
 
-    int ncpus = effective_ncpus(host);
+    int ncpus = effective_ncpus(sreq, reply);
 
     // host.max_results_day is between 1 and config.daily_result_quota inclusive
    // wreq.daily_result_quota is between ncpus
    // and ncpus*host.max_results_day inclusive
    //
     if (config.daily_result_quota) {
-        if (host.max_results_day == 0 || host.max_results_day>config.daily_result_quota) {
-            host.max_results_day = config.daily_result_quota;
+        if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) {
+            reply.host.max_results_day = config.daily_result_quota;
         }
-        wreq.daily_result_quota = ncpus*host.max_results_day;
-        if (host.nresults_today >= wreq.daily_result_quota) {
-            wreq.daily_result_quota_exceeded = true;
+        reply.wreq.daily_result_quota = ncpus*reply.host.max_results_day;
+        if (reply.host.nresults_today >= reply.wreq.daily_result_quota) {
+            reply.wreq.daily_result_quota_exceeded = true;
             return false;
         }
     }
 
     if (config.max_wus_in_progress) {
-        if (wreq.nresults_on_host >= config.max_wus_in_progress*ncpus) {
+        if (reply.wreq.nresults_on_host >= config.max_wus_in_progress*ncpus) {
             if (config.debug_send) {
                 log_messages.printf(MSG_DEBUG,
                     "in-progress job limit exceeded; %d > %d*%d\n",
-                    wreq.nresults_on_host, config.max_wus_in_progress, ncpus
+                    reply.wreq.nresults_on_host, config.max_wus_in_progress, ncpus
                 );
             }
-            wreq.cache_size_exceeded = true;
+            reply.wreq.cache_size_exceeded = true;
             return false;
         }
     }

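A concrete reading of the quota math above (numbers are illustrative, not from the commit): suppose config.daily_result_quota is 100, the host's max_results_day has been adjusted down to 50 by earlier bad results, and effective_ncpus() returns 4 because a dual-core host reports 4 CUDA GPUs. Then wreq.daily_result_quota = 4 * 50 = 200, and once host.nresults_today reaches 200 the function sets daily_result_quota_exceeded and returns false; likewise, with max_wus_in_progress set to 5 the host may hold at most 5 * 4 = 20 results in progress.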
@@ -1022,7 +1035,7 @@ int add_result_to_reply(
     result.bavp = bavp;
     reply.insert_result(result);
     reply.wreq.seconds_to_fill -= wu_seconds_filled;
-    request.estimated_delay += wu_seconds_filled/effective_ncpus(reply.host);
+    request.estimated_delay += wu_seconds_filled/effective_ncpus(request, reply);
     reply.wreq.nresults++;
     reply.wreq.nresults_on_host++;
     if (!resent_result) reply.host.nresults_today++;

@@ -1407,7 +1420,7 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
 
     if (config.workload_sim && sreq.have_other_results_list) {
         init_ip_results(
-            sreq.global_prefs.work_buf_min(), effective_ncpus(reply.host), sreq.ip_results
+            sreq.global_prefs.work_buf_min(), effective_ncpus(sreq, reply), sreq.ip_results
         );
     }

@@ -1450,7 +1463,7 @@ struct JOB_SET {
         disk_usage = 0;
         disk_limit = reply.wreq.disk_available;
         max_jobs = config.max_wus_to_send;
-        int ncpus = effective_ncpus(reply.host), n;
+        int ncpus = effective_ncpus(sreq, reply), n;
 
         if (config.daily_result_quota) {
             if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) {

sched/sched_send.h
@@ -62,3 +62,4 @@ extern void lock_sema();
 extern void unlock_sema();
 extern const char* infeasible_string(int);
 extern bool app_not_selected(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&);
+extern bool work_needed(SCHEDULER_REQUEST&, SCHEDULER_REPLY&, bool);

sched/server_types.h
@@ -363,7 +363,6 @@ struct SCHEDULER_REPLY {
     void insert_workunit_unique(WORKUNIT&);
     void insert_result(RESULT&);
     void insert_message(USER_MESSAGE&);
-    bool work_needed(bool locality_sched=false);
     void set_delay(double);
     void got_good_result();     // adjust max_results_day
     void got_bad_result();      // adjust max_results_day