diff --git a/checkin_notes b/checkin_notes index 9fc472f201..762fd39f20 100644 --- a/checkin_notes +++ b/checkin_notes @@ -8511,3 +8511,16 @@ David 21 Oct 2008 - make "make distclean" work in all directories samples/*/Makefile + +David 21 Oct 2008 + - scheduler: add <have_cuda_apps> config flag. + If set the "effective NCPUS" (which is used to scale + daily_result_quota and max_wus_in_progress) + is max'd with the # of CUDA GPUs. + + sched/ + sched_array.cpp + sched_config.cpp,h + sched_locality.cpp + sched_send.cpp,h + server_types.h diff --git a/sched/sched_array.cpp b/sched/sched_array.cpp index 120049a628..f7a05797a0 100644 --- a/sched/sched_array.cpp +++ b/sched/sched_array.cpp @@ -54,7 +54,7 @@ void scan_work_array(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { rnd_off = rand() % ssp->max_wu_results; for (j=0; j<ssp->max_wu_results; j++) { i = (j+rnd_off) % ssp->max_wu_results; - if (!reply.work_needed()) break; + if (!work_needed(sreq, reply, false)) break; WU_RESULT& wu_result = ssp->wu_results[i]; diff --git a/sched/sched_config.cpp b/sched/sched_config.cpp index 344e67abf9..f0d4d3f1ee 100644 --- a/sched/sched_config.cpp +++ b/sched/sched_config.cpp @@ -184,6 +184,7 @@ int SCHED_CONFIG::parse(FILE* f) { if (xp.parse_bool(tag, "use_credit_multiplier", use_credit_multiplier)) continue; if (xp.parse_bool(tag, "multiple_clients_per_host", multiple_clients_per_host)) continue; if (xp.parse_bool(tag, "no_vista_sandbox", no_vista_sandbox)) continue; + if (xp.parse_bool(tag, "have_cuda_apps", have_cuda_apps)) continue; if (xp.parse_bool(tag, "debug_version_select", debug_version_select)) continue; diff --git a/sched/sched_config.h b/sched/sched_config.h index 384217f795..d43ba175d2 100644 --- a/sched/sched_config.h +++ b/sched/sched_config.h @@ -125,6 +125,7 @@ public: bool use_credit_multiplier; bool multiple_clients_per_host; bool no_vista_sandbox; + bool have_cuda_apps; // log flags // diff --git a/sched/sched_locality.cpp b/sched/sched_locality.cpp index f4943484cd..e050aeced7 
100644 --- a/sched/sched_locality.cpp +++ b/sched/sched_locality.cpp @@ -488,7 +488,7 @@ static int send_results_for_file( nsent = 0; - if (!reply.work_needed(true)) { + if (!work_needed(sreq, reply, true)) { return 0; } @@ -519,7 +519,7 @@ static int send_results_for_file( for (i=0; i<100; i++) { // avoid infinite loop int query_retval; - if (!reply.work_needed(true)) break; + if (!work_needed(sreq, reply, true)) break; log_messages.printf(MSG_DEBUG, "in_send_results_for_file(%s, %d) prev_result.id=%d\n", filename, i, prev_result.id @@ -740,7 +740,7 @@ static int send_new_file_work_deterministic_seeded( if (retval==ERR_NO_APP_VERSION || retval==ERR_INSUFFICIENT_RESOURCE) return retval; - if (nsent>0 || !reply.work_needed(true)) break; + if (nsent>0 || !work_needed(sreq, reply, true)) break; // construct a name which is lexically greater than the name of any result // which uses this file. sprintf(min_resultname, "%s__~", filename); @@ -793,7 +793,7 @@ static int send_new_file_work_deterministic( // continue deterministic search at lexically first possible // filename, continue to randomly choosen one - if (!getfile_retval && reply.work_needed(true)) { + if (!getfile_retval && work_needed(sreq, reply, true)) { send_new_file_work_deterministic_seeded( sreq, reply, nsent, "", start_filename ); @@ -834,7 +834,7 @@ static int send_new_file_work( SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply ) { - while (reply.work_needed(true)) { + while (work_needed(sreq, reply, true)) { int retval_sow, retval_snfwws; double frac=((double)rand())/(double)RAND_MAX; int now = time(0); @@ -857,7 +857,7 @@ static int send_new_file_work( if (retval_sow==ERR_NO_APP_VERSION || retval_sow==ERR_INSUFFICIENT_RESOURCE) return retval_sow; - while (reply.work_needed(true) && retry<5) { + while (work_needed(sreq, reply, true) && retry<5) { log_messages.printf(MSG_DEBUG, "send_new_file_work(%d): try to send from working set\n", retry ); @@ -867,7 +867,7 @@ static int send_new_file_work( } - if 
(reply.work_needed(true)) { + if (work_needed(sreq, reply, true)) { log_messages.printf(MSG_DEBUG, "send_new_file_work(): try deterministic method\n" ); @@ -878,7 +878,7 @@ static int send_new_file_work( return 1; } } - } // while reply.work_needed(true) + } // while work_needed(sreq, reply, true) return 0; } @@ -898,7 +898,7 @@ static int send_old_work( DB_RESULT result; int now=time(0); - if (!reply.work_needed(true)) { + if (!work_needed(sreq, reply, true)) { return 0; } @@ -1067,7 +1067,7 @@ void send_work_locality( int k = (i+j)%nfiles; int retval_srff; - if (!reply.work_needed(true)) break; + if (!work_needed(sreq, reply, true)) break; FILE_INFO& fi = sreq.file_infos[k]; retval_srff=send_results_for_file( fi.name, nsent, sreq, reply, false @@ -1081,7 +1081,7 @@ void send_work_locality( // If the work was not sent for other (dynamic) reason such as insufficient // cpu, then DON'T delete the file. // - if (nsent == 0 && reply.work_needed(true) && config.file_deletion_strategy == 1) { + if (nsent == 0 && work_needed(sreq, reply, true) && config.file_deletion_strategy == 1) { reply.file_deletes.push_back(fi); log_messages.printf(MSG_DEBUG, "[HOST#%d]: delete file %s (not needed)\n", reply.host.id, fi.name @@ -1104,7 +1104,7 @@ void send_work_locality( // send new files if needed // - if (reply.work_needed(true)) { + if (work_needed(sreq, reply, true)) { send_new_file_work(sreq, reply); } } diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp index 172896f8ba..9049beef83 100644 --- a/sched/sched_send.cpp +++ b/sched/sched_send.cpp @@ -77,10 +77,21 @@ const char* infeasible_string(int code) { const int MIN_SECONDS_TO_SEND = 0; const int MAX_SECONDS_TO_SEND = (28*SECONDS_IN_DAY); -inline int effective_ncpus(HOST& host) { - int ncpus = host.p_ncpus; +// return a number that +// - is the # of CPUs in EDF simulation +// - scales the daily result quota +// - scales max_wus_in_progress + +inline int effective_ncpus(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& 
reply) { + int ncpus = reply.host.p_ncpus; if (ncpus > config.max_ncpus) ncpus = config.max_ncpus; if (ncpus < 1) ncpus = 1; + if (config.have_cuda_apps) { + COPROC* cp = sreq.coprocs.lookup("cuda"); + if (cp && cp->count > ncpus) { + ncpus = cp->count; + } + } return ncpus; } @@ -653,7 +664,7 @@ int wu_is_infeasible_fast( } IP_RESULT candidate("", wu.delay_bound, est_cpu); strcpy(candidate.name, wu.name); - if (check_candidate(candidate, effective_ncpus(reply.host), request.ip_results)) { + if (check_candidate(candidate, effective_ncpus(request, reply), request.ip_results)) { // it passed the feasibility test, // but don't add it the the workload yet; // wait until we commit to sending it @@ -833,47 +844,49 @@ void unlock_sema() { // and we haven't exceeded result per RPC limit, // and we haven't exceeded results per day limit // -bool SCHEDULER_REPLY::work_needed(bool locality_sched) { +bool work_needed( + SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply, bool locality_sched +) { if (locality_sched) { // if we've failed to send a result because of a transient condition, // return false to preserve invariant // - if (wreq.disk.insufficient || wreq.speed.insufficient || wreq.mem.insufficient || wreq.no_allowed_apps_available) { + if (reply.wreq.disk.insufficient || reply.wreq.speed.insufficient || reply.wreq.mem.insufficient || reply.wreq.no_allowed_apps_available) { return false; } } - if (wreq.seconds_to_fill <= 0) return false; - if (wreq.disk_available <= 0) { + if (reply.wreq.seconds_to_fill <= 0) return false; + if (reply.wreq.disk_available <= 0) { return false; } - if (wreq.nresults >= config.max_wus_to_send) return false; + if (reply.wreq.nresults >= config.max_wus_to_send) return false; - int ncpus = effective_ncpus(host); + int ncpus = effective_ncpus(sreq, reply); // host.max_results_day is between 1 and config.daily_result_quota inclusive // wreq.daily_result_quota is between ncpus // and ncpus*host.max_results_day inclusive // if 
(config.daily_result_quota) { - if (host.max_results_day == 0 || host.max_results_day>config.daily_result_quota) { - host.max_results_day = config.daily_result_quota; + if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) { + reply.host.max_results_day = config.daily_result_quota; } - wreq.daily_result_quota = ncpus*host.max_results_day; - if (host.nresults_today >= wreq.daily_result_quota) { - wreq.daily_result_quota_exceeded = true; + reply.wreq.daily_result_quota = ncpus*reply.host.max_results_day; + if (reply.host.nresults_today >= reply.wreq.daily_result_quota) { + reply.wreq.daily_result_quota_exceeded = true; return false; } } if (config.max_wus_in_progress) { - if (wreq.nresults_on_host >= config.max_wus_in_progress*ncpus) { + if (reply.wreq.nresults_on_host >= config.max_wus_in_progress*ncpus) { if (config.debug_send) { log_messages.printf(MSG_DEBUG, "in-progress job limit exceeded; %d > %d*%d\n", - wreq.nresults_on_host, config.max_wus_in_progress, ncpus + reply.wreq.nresults_on_host, config.max_wus_in_progress, ncpus ); } - wreq.cache_size_exceeded = true; + reply.wreq.cache_size_exceeded = true; return false; } } @@ -1022,7 +1035,7 @@ int add_result_to_reply( result.bavp = bavp; reply.insert_result(result); reply.wreq.seconds_to_fill -= wu_seconds_filled; - request.estimated_delay += wu_seconds_filled/effective_ncpus(reply.host); + request.estimated_delay += wu_seconds_filled/effective_ncpus(request, reply); reply.wreq.nresults++; reply.wreq.nresults_on_host++; if (!resent_result) reply.host.nresults_today++; @@ -1407,7 +1420,7 @@ void send_work(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) { if (config.workload_sim && sreq.have_other_results_list) { init_ip_results( - sreq.global_prefs.work_buf_min(), effective_ncpus(reply.host), sreq.ip_results + sreq.global_prefs.work_buf_min(), effective_ncpus(sreq, reply), sreq.ip_results ); } @@ -1450,7 +1463,7 @@ struct JOB_SET { disk_usage = 0; disk_limit = 
reply.wreq.disk_available; max_jobs = config.max_wus_to_send; - int ncpus = effective_ncpus(reply.host), n; + int ncpus = effective_ncpus(sreq, reply), n; if (config.daily_result_quota) { if (reply.host.max_results_day == 0 || reply.host.max_results_day>config.daily_result_quota) { diff --git a/sched/sched_send.h b/sched/sched_send.h index 81182bfdf0..a136710061 100644 --- a/sched/sched_send.h +++ b/sched/sched_send.h @@ -62,3 +62,4 @@ extern void lock_sema(); extern void unlock_sema(); extern const char* infeasible_string(int); extern bool app_not_selected(WORKUNIT&, SCHEDULER_REQUEST&, SCHEDULER_REPLY&); +extern bool work_needed(SCHEDULER_REQUEST&, SCHEDULER_REPLY&, bool); diff --git a/sched/server_types.h b/sched/server_types.h index 85d2bbb555..9ed4cf8122 100644 --- a/sched/server_types.h +++ b/sched/server_types.h @@ -363,7 +363,6 @@ struct SCHEDULER_REPLY { void insert_workunit_unique(WORKUNIT&); void insert_result(RESULT&); void insert_message(USER_MESSAGE&); - bool work_needed(bool locality_sched=false); void set_delay(double); void got_good_result(); // adjust max_results_day void got_bad_result(); // adjust max_results_day