// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC.  If not, see <http://www.gnu.org/licenses/>.

// Simulate the processing of the current workload
// (include jobs that are downloading)
// with weighted round-robin (WRR) scheduling.
//
// Outputs are changes to global state:
// - deadline misses (per-project count, per-result flag)
//      Deadline misses are not counted for tasks
//      that are too large to run in RAM right now.
// - number of runnable jobs per project
//      p.pwf.n_runnable_jobs
// - for each resource type (in RSC_WORK_FETCH):
//    - shortfall
//    - nidle_now: # of idle instances
//    - sim_excluded_instances: bitmap of instances idle because of exclusions
//
// For coprocessors, we saturate the resource if possible;
// i.e. with 2 GPUs, we'd let a 1-GPU app and a 2-GPU app run together.
// Otherwise, there'd be the possibility of computing
// a nonzero shortfall inappropriately.
//

// NOTE(review): this copy of the file has been damaged by a lossy text
// extraction. Wherever a "NOTE(review): text missing" comment appears below,
// the source text between a '<' and a following '>' appears to have been
// dropped (loop bounds, template arguments, sometimes whole statements or
// function headers). The surviving text is reproduced verbatim at those
// points; restore the missing text from version control before building.

#include "cpp.h"

#ifdef _WIN32
#include "boinc_win.h"
#else
#include "config.h"
#endif

#include "client_msgs.h"
#include "client_state.h"
#include "coproc.h"
#include "project.h"
#include "result.h"

using std::vector;

// set "nused" bits of the source bitmap in the dest bitmap
//
// src:   bitmap of candidate instances
// nused: number of instances to mark (fractional values are accepted)
// dst:   bitmap that is updated in place
//
static inline void set_bits(
    COPROC_INSTANCE_BITMAP src, double nused, COPROC_INSTANCE_BITMAP& dst
) {
    // if all bits are already set, we're done
    //
    if ((src&dst) == src) return;
    COPROC_INSTANCE_BITMAP bit = 1;
    // NOTE(review): text missing after "i" on the next line.
    // The rest of set_bits() and the opening of the RR_SIM struct
    // (presumably including the element type of active_jobs) are lost.
    for (int i=0; i active_jobs;

    // Add a job to the set of jobs running in the simulation,
    // and charge its CPU and GPU usage to both the global
    // (rsc_work_fetch) and per-project (rsc_pwf) "sim_nused" counters.
    //
    inline void activate(RESULT* rp) {
        PROJECT* p = rp->project;
        active_jobs.push_back(rp);
        // rt is the job's coprocessor resource type; 0 means CPU-only
        int rt = rp->avp->gpu_usage.rsc_type;

        // if this is a GPU app and GPU computing is suspended,
        // don't count its CPU usage.
        // That way we'll fetch more CPU work if needed.
        //
        if (!rt || !gpu_suspend_reason) {
            rsc_work_fetch[0].sim_nused += rp->avp->avg_ncpus;
            p->rsc_pwf[0].sim_nused += rp->avp->avg_ncpus;
        }

        if (rt) {
            rsc_work_fetch[rt].sim_nused += rp->avp->gpu_usage.usage;
            p->rsc_pwf[rt].sim_nused += rp->avp->gpu_usage.usage;
            // with exclusions in effect, also record which instances
            // are used, so that starved instances can be identified later
            //
            if (rsc_work_fetch[rt].has_exclusions) {
                set_bits(
                    rp->app->non_excluded_instances[rt],
                    p->rsc_pwf[rt].nused_total,
                    rsc_work_fetch[rt].sim_used_instances
                );
#if 0
                msg_printf(p, MSG_INFO, "%d non_excl %d used %d",
                    rt,
                    rp->app->non_excluded_instances[rt],
                    rsc_work_fetch[rt].sim_used_instances
                );
#endif
            }
        }
        if (have_max_concurrent) {
            max_concurrent_inc(rp);
            // keep track of the peak simulated usage per project;
            // mc_update_stats() compares current usage against this peak
            //
            if (p->rsc_pwf[0].sim_nused > p->rsc_pwf[0].max_nused) {
                p->rsc_pwf[0].max_nused = p->rsc_pwf[0].sim_nused;
            }
            if (rt && p->rsc_pwf[rt].sim_nused > p->rsc_pwf[rt].max_nused) {
                p->rsc_pwf[rt].max_nused = p->rsc_pwf[rt].sim_nused;
            }
        }
    }

    void init_pending_lists();
    void pick_jobs_to_run(double reltime);
    void simulate();

    RR_SIM() {}
    ~RR_SIM() {}
};

// estimate the long-term FLOPS that this job will get
// (counting unavailability)
//
// The result is stored in rp->rrsim_flops and is never left at zero.
//
void set_rrsim_flops(RESULT* rp) {
    // For coproc jobs, use app version estimate
    //
    if (rp->uses_gpu()) {
        rp->rrsim_flops = rp->avp->flops * gstate.overall_gpu_frac();
    } else if (rp->avp->needs_network) {
        rp->rrsim_flops = rp->avp->flops * gstate.overall_cpu_and_network_frac();
    } else {
        rp->rrsim_flops = rp->avp->flops * gstate.overall_cpu_frac();
    }
    // rrsim_flops is used as a divisor elsewhere in this file,
    // so substitute a small nonzero rate
    //
    if (rp->rrsim_flops == 0) {
        rp->rrsim_flops = 1e6;      // just in case
    }
}

// Write log messages for the results and projects
// that the simulation projects to miss deadlines.
//
void print_deadline_misses() {
    unsigned int i;
    RESULT* rp;
    PROJECT* p;
    // NOTE(review): text missing after "i" on the next line
    // (loop bound, the assignment of rp, and the start of the condition).
    for (i=0; irr_sim_misses_deadline) {
            msg_printf(rp->project, MSG_INFO,
                "[rr_sim] Result %s projected to miss deadline.",
                rp->name
            );
        }
    }
    // NOTE(review): text missing after "i" on the next line
    // (loop bound, the assignment of p, and an inner loop over j).
    for (i=0; irsc_pwf[j].deadlines_missed) {
                msg_printf(p, MSG_INFO,
                    "[rr_sim] Project has %d projected %s deadline misses",
                    p->rsc_pwf[j].deadlines_missed,
                    rsc_name_long(j)
                );
            }
        }
    }
}

// Decide what jobs to include in the simulation;
// build the "pending" lists for each (project, processor type) pair.
// NOTE: "results" is sorted by increasing arrival time.
//
void RR_SIM::init_pending_lists() {
    // first pass: empty every pending list and zero the queue estimates
    //
    // NOTE(review): text missing after "i" on the next line
    // (loop bound, the project pointer p, and an inner loop over j).
    for (unsigned int i=0; irsc_pwf[j].pending.clear();
            p->rsc_pwf[j].queue_est = 0;
        }
    }
    // second pass: reset per-result simulation state and
    // queue each eligible result on its project's pending list
    //
    // NOTE(review): text missing after "i" on the next line
    // (loop bound and the assignment of rp).
    for (unsigned int i=0; irr_sim_misses_deadline = false;
        rp->already_selected = false;
        if (!rp->nearly_runnable()) continue;
        if (rp->some_download_stalled()) continue;
        if (rp->always_run()) continue;
        rp->rrsim_flops_left = rp->estimated_flops_remaining();

        //if (rp->rrsim_flops_left <= 0) continue;
            // job may have fraction_done=1 but not be done;
            // if it's past its deadline, we need to mark it as such

        PROJECT* p = rp->project;
        p->pwf.n_runnable_jobs++;
        p->rsc_pwf[0].nused_total += rp->avp->avg_ncpus;
        set_rrsim_flops(rp);
        int rt = rp->avp->gpu_usage.rsc_type;
        if (rt) {
            p->rsc_pwf[rt].nused_total += rp->avp->gpu_usage.usage;
            p->rsc_pwf[rt].n_runnable_jobs++;
            p->rsc_pwf[rt].queue_est += rp->rrsim_flops_left/rp->rrsim_flops;
        }
        // a GPU job goes on its GPU type's list only; CPU jobs use list 0
        p->rsc_pwf[rt].pending.push_back(rp);
        rp->rrsim_done = false;
    }
}

// Pick jobs to run from pending lists, putting them in "active_jobs" list.
// Approximate what the job scheduler would do:
// pick a job from the project P with highest scheduling priority,
// then adjust P's scheduling priority.
//
// This is called:
// - at the start of the simulation
//      It will pick jobs to use all resources
// - each time a job finishes in the simulation
//      It will generally pick one new job to use the resource just freed
//
// reltime: simulated time relative to now
//      (used for log messages and for last_mc_limit_reltime)
//
void RR_SIM::pick_jobs_to_run(double reltime) {
    if (log_flags.rr_simulation) {
        msg_printf(NULL, MSG_INFO, "pick_jobs_to_run() start");
    }
    active_jobs.clear();
    if (have_max_concurrent) {
        max_concurrent_init();
    }

    // save and restore rec_temp
    //
    // NOTE(review): text missing after "i" on the next line
    // (loop bound and the project pointer p).
    for (unsigned int i=0; ipwf.rec_temp_save = p->pwf.rec_temp;
        p->pwf.at_max_concurrent_limit = false;
    }

    rsc_work_fetch[0].sim_nused = 0;

    // loop over resource types; do the GPUs first
    //
    for (int rt=coprocs.n_rsc-1; rt>=0; rt--) {
        // NOTE(review): the template argument of "vector" appears
        // to be missing on the next line.
        vector project_heap;

        // CPU usage was zeroed once above and must not be reset here:
        // GPU jobs activated in earlier iterations also add CPU usage
        //
        if (rt) rsc_work_fetch[rt].sim_nused = 0;

        // Make a heap of projects with runnable jobs for this resource,
        // ordered by scheduling priority.
        // Clear usage counts.
        // Initialize iterators to the pending list of each project.
        //
        // NOTE(review): text missing after "i" on the next line
        // (loop bound, the project pointer p, and the start of the
        // declaration of rsc_pwf).
        for (unsigned int i=0; irsc_pwf[rt];
            size_t s = rsc_pwf.pending.size();
#if 0
            if (log_flags.rrsim_detail) {
                msg_printf(p, MSG_INFO, "[rr_sim] %u jobs for rsc %zu", s, rt);
            }
#endif
            if (s == 0) continue;
            rsc_pwf.pending_iter = rsc_pwf.pending.begin();
            rsc_pwf.sim_nused = 0;
            p->pwf.rec_temp = p->pwf.rec;
            p->compute_sched_priority();
            project_heap.push_back(p);
        }
        make_heap(project_heap.begin(), project_heap.end());

        // Loop over jobs.
        // Keep going until the resource is saturated or there are no more jobs.
        //
        while (1) {
            if (project_heap.empty()) break;

            // p is the highest-priority project with work for this resource
            //
            PROJECT* p = project_heap.front();
            RSC_PROJECT_WORK_FETCH& rsc_pwf = p->rsc_pwf[rt];
            RESULT* rp = *rsc_pwf.pending_iter;

            // garbage-collect jobs that already completed in our simulation
            // (this is just a handy place to do this)
            //
            if (rp->rrsim_done) {
                rsc_pwf.pending_iter = rsc_pwf.pending.erase(
                    rsc_pwf.pending_iter
                );
            } else {
                // add job to active_jobs list, and adjust project priority
                //
                activate(rp);
                adjust_rec_sched(rp);
                if (log_flags.rrsim_detail && !rp->already_selected) {
                    char buf[256];
                    rp->rsc_string(buf, sizeof(buf));
                    msg_printf(rp->project, MSG_INFO,
                        "[rr_sim_detail] %.2f: starting %s (%s) (%.2fG/%.2fG)",
                        reltime, rp->name, buf, rp->rrsim_flops_left/1e9,
                        rp->rrsim_flops/1e9
                    );
                    // log the start of each job only once
                    rp->already_selected = true;
                }

                // Check if project is at a max_concurrent limit
                //
                if (have_max_concurrent) {
                    switch (max_concurrent_exceeded(rp)) {
                    case CONCURRENT_LIMIT_PROJECT:
                        rsc_pwf.last_mc_limit_reltime = reltime;
                        p->pwf.at_max_concurrent_limit = true;
                        if (log_flags.rr_simulation) {
                            msg_printf(p, MSG_INFO,
                                "[rr_sim] at project max concurrent: t %f",
                                reltime
                            );
                        }
                        break;
                    case CONCURRENT_LIMIT_APP:
                        // no more jobs for this project/app
                        //
                        p->pwf.at_max_concurrent_limit = true;
                        rsc_pwf.last_mc_limit_reltime = reltime;
                        if (log_flags.rr_simulation) {
                            msg_printf(p, MSG_INFO,
                                "[rr_sim] at app max concurrent for %s; t %f",
                                rp->app->name, reltime
                            );
                        }
                        break;
                    }
                }

                // check whether resource is saturated
                //
                if (rt) {
                    if (rsc_work_fetch[rt].sim_nused >= coprocs.coprocs[rt].count) {
                        break;
                    }

                    // if a GPU isn't saturated but this project is using
                    // its max given exclusions, remove it from project heap
                    //
                    if (rsc_pwf.sim_nused >= coprocs.coprocs[rt].count - p->rsc_pwf[rt].ncoprocs_excluded) {
                        pop_heap(project_heap.begin(), project_heap.end());
                        project_heap.pop_back();
                        continue;
                    }
                } else {
                    if (rsc_work_fetch[rt].sim_nused >= gstate.n_usable_cpus) break;
                }
                ++rsc_pwf.pending_iter;
            }

            if (rsc_pwf.pending_iter == rsc_pwf.pending.end()
                || p->pwf.at_max_concurrent_limit
            ) {
                // if this project now has no more jobs for the resource,
                // remove it from the project heap
                //
                pop_heap(project_heap.begin(), project_heap.end());
                project_heap.pop_back();
            } else if (!rp->rrsim_done) {
                // Otherwise reshuffle the project heap
                //
                make_heap(project_heap.begin(), project_heap.end());
            }
        }
    }

    // restore the rec_temp values saved at the top of this function
    //
    // NOTE(review): text missing after "i" on the next line
    // (loop bound and the project pointer p).
    for (unsigned int i=0; ipwf.rec_temp = p->pwf.rec_temp_save;
    }
    if (log_flags.rr_simulation) {
        msg_printf(NULL, MSG_INFO, "pick_jobs_to_run() end");
    }
}

// compute the number of idle instances (count - nused)
// Called at the start of RR simulation,
// after the initial assignment of jobs
//
static void record_nidle_now() {
    rsc_work_fetch[0].nidle_now = gstate.n_usable_cpus - rsc_work_fetch[0].sim_nused;
    if (rsc_work_fetch[0].nidle_now < 0) rsc_work_fetch[0].nidle_now = 0;
    // NOTE(review): text missing after "i" on the next line.
    // The rest of record_nidle_now() is lost, together with the header of
    // the following function. That function is invoked in simulate() as
    // handle_missed_deadline(rpbest, diff, ar); its body below also uses
    // "atp" and "pbest", whose declarations are not visible here.
    for (int i=1; iproject;
    // remember when this task was last projected to miss its deadline
    if (atp) {
        atp->last_deadline_miss_time = gstate.now;
    }
    // a task whose smoothed working set exceeds "ar" is only logged;
    // it is not counted as a deadline miss
    //
    if (atp && atp->procinfo.working_set_size_smoothed > ar) {
        if (log_flags.rr_simulation) {
            msg_printf(pbest, MSG_INFO,
                "[rr_sim] %s misses deadline but too large to run",
                rpbest->name
            );
        }
    } else {
        rpbest->rr_sim_misses_deadline = true;
        // charge the miss to the GPU type if the job uses one, else to the CPU
        int rt = rpbest->avp->gpu_usage.rsc_type;
        if (rt) {
            pbest->rsc_pwf[rt].deadlines_missed++;
            rsc_work_fetch[rt].deadline_missed_instances += rpbest->avp->gpu_usage.usage;
        } else {
            pbest->rsc_pwf[0].deadlines_missed++;
            rsc_work_fetch[0].deadline_missed_instances += rpbest->avp->avg_ncpus;
        }
        if (log_flags.rr_simulation) {
            msg_printf(pbest, MSG_INFO,
                "[rr_sim] %s misses deadline by %.2f",
                rpbest->name, diff
            );
        }
    }
}

// update "MC shortfall" for projects with max concurrent restrictions
//
// sim_now: current simulated time
// dt:      length of the simulation step just taken
// buf_end: end of the work buffer period; no shortfall accrues past it
//
static void mc_update_stats(double sim_now, double dt, double buf_end) {
    // NOTE(review): text missing after "i" on the next line
    // (loop bound, the project pointer p, and the start of the condition).
    for (unsigned int i=0; iapp_configs.project_has_mc) continue;
        // NOTE(review): text missing after "rt" on the next line
        // (loop bound and the start of the declaration of rsc_pwf).
        for (int rt=0; rtrsc_pwf[rt];
            RSC_WORK_FETCH& rwf = rsc_work_fetch[rt];
            // x = instances this project could still use: its peak usage
            // minus its current usage, capped by the instances that are
            // idle overall
            //
            double x = rsc_pwf.max_nused - rsc_pwf.sim_nused;
            x = std::min(x, rwf.ninstances - rwf.sim_nused);
            if (x > 1e-6 && sim_now < buf_end) {
                // count only the part of the step inside the buffer period
                double dt2;
                if (sim_now + dt > buf_end) {
                    dt2 = buf_end - sim_now;
                } else {
                    dt2 = dt;
                }
                rsc_pwf.mc_shortfall += x*dt2;
            }
        }
    }
}

// do a round_robin simulation,
// for either CPU scheduling (to find deadline misses)
// or work fetch (to compute idleness and shortfall)
//
void RR_SIM::simulate() {
    PROJECT* pbest;
    RESULT* rp, *rpbest;
    unsigned int u;

    // available RAM; passed to handle_missed_deadline()
    double ar = gstate.available_ram();

    // initialize work-fetch data structures in either case
    //
    work_fetch.rr_init();

    if (log_flags.rr_simulation) {
        msg_printf(0, MSG_INFO,
            "[rr_sim] start: work_buf min %.0f additional %.0f total %.0f on_frac %.3f active_frac %.3f",
            gstate.work_buf_min(),
            gstate.work_buf_additional(),
            gstate.work_buf_total(),
            gstate.time_stats.on_frac,
            gstate.time_stats.active_frac
        );
    }

    project_priority_init(false);
    init_pending_lists();

    if (have_max_concurrent) {
        // NOTE(review): text missing after "i" on the next line
        // (loop bound and the project pointer p).
        for (unsigned int i=0; ipwf.at_max_concurrent_limit = false;
        }
    }

    // Simulation loop.  Keep going until all jobs done
    //
    double buf_end = gstate.now + gstate.work_buf_total();
    double sim_now = gstate.now;
    bool first = true;
    while (1) {
        pick_jobs_to_run(sim_now-gstate.now);
        // idle instances are recorded once, after the first job selection
        if (first) {
            record_nidle_now();
            first = false;
        }

        if (!active_jobs.size()) break;

        // compute finish times and see which job finishes first
        //
        rpbest = NULL;
        // NOTE(review): text missing after "u" on the next line
        // (loop bound and the assignment of rp).
        for (u=0; urrsim_finish_delay = rp->rrsim_flops_left/rp->rrsim_flops;
            if (!rpbest || rp->rrsim_finish_delay < rpbest->rrsim_finish_delay) {
                rpbest = rp;
            }
        }

        // see if we finish a time slice before first job ends
        //
        double delta_t = rpbest->rrsim_finish_delay;
        if (log_flags.rrsim_detail) {
            msg_printf(NULL, MSG_INFO,
                "[rrsim_detail] next job to finish: %s (will finish in %.2f sec)",
                rpbest->name,
                delta_t
            );
        }
        if (delta_t > 3600) {
            // no job finishes in this step; just advance the clock
            rpbest = 0;

            // limit the granularity
            //
            if (delta_t > 36000) {
                delta_t /= 10;
            } else {
                delta_t = 3600;
            }
            if (log_flags.rrsim_detail) {
                msg_printf(NULL, MSG_INFO,
                    "[rrsim_detail] taking time-slice step of %.2f sec", delta_t
                );
            }
        } else {
            rpbest->rrsim_done = true;
            pbest = rpbest->project;
            if (log_flags.rr_simulation) {
                char buf[256];
                rpbest->rsc_string(buf, sizeof(buf));
                msg_printf(pbest, MSG_INFO,
                    "[rr_sim] %.2f: %s finishes (%s) (%.2fG/%.2fG)",
                    sim_now + delta_t - gstate.now,
                    rpbest->name,
                    buf,
                    rpbest->estimated_flops_remaining()/1e9, rpbest->rrsim_flops/1e9
                );
            }

            // Does it miss its deadline?
            //
            double diff = (sim_now + rpbest->rrsim_finish_delay) - rpbest->computation_deadline();
            if (diff > 0) {
                handle_missed_deadline(rpbest, diff, ar);

                // update busy time of relevant processor types
                //
                double frac = rpbest->uses_gpu()?gstate.overall_gpu_frac():gstate.overall_cpu_frac();
                double dur = rpbest->estimated_runtime_remaining() / frac;
                rsc_work_fetch[0].update_busy_time(dur, rpbest->avp->avg_ncpus);
                int rt = rpbest->avp->gpu_usage.rsc_type;
                if (rt) {
                    rsc_work_fetch[rt].update_busy_time(dur, rpbest->avp->gpu_usage.usage);
                }
            }
        }

        // adjust FLOPS left of other active jobs
        //
        // NOTE(review): text missing after "i" on the next line
        // (loop bound and the assignment of rp).
        for (unsigned int i=0; irrsim_flops_left -= rp->rrsim_flops*delta_t;

            // can be slightly less than 0 due to roundoff
            //
            if (rp->rrsim_flops_left < -1e6) {
                if (log_flags.rr_simulation) {
                    msg_printf(rp->project, MSG_INTERNAL_ERROR,
                        "%s: negative FLOPs left %f", rp->name, rp->rrsim_flops_left
                    );
                }
            }
            if (rp->rrsim_flops_left < 0) {
                rp->rrsim_flops_left = 0;
            }
        }

        // update shortfall and saturated time for each resource
        //
        // NOTE(review): text missing after "i" on the next line.
        // The per-resource update itself is lost, together with the start of
        // what looks like a per-project loop that accumulates "x" over
        // resources j (declarations of p, x, f and dtemp are not visible).
        for (int i=0; irsc_pwf[j].sim_nused * delta_t * f * rsc_work_fetch[j].relative_speed;
            }
            x *= COBBLESTONE_SCALE;
            update_average(
                sim_now+delta_t,
                sim_now,
                x,
                cc_config.rec_half_life,
                p->pwf.rec_temp,
                dtemp
            );
            p->compute_sched_priority();
        }

        sim_now += delta_t;
    }

    // identify GPU instances starved because of exclusions
    //
    // NOTE(review): text missing after "i" on the next line.
    // The remainder of simulate() and everything that followed it are lost,
    // up to the middle of the header comment of n_idle_resources();
    // "0 idle instance" is the surviving tail of that comment's first line.
    for (int i=1; i 0 idle instance
// Put results in global state (rsc_work_fetch)
// This is called from the account manager logic,
// to decide if we need to get new projects from the AM.
// int n_idle_resources() { int nidle_rsc = coprocs.n_rsc; for (int i=0; inearly_runnable()) continue; if (rp->some_download_stalled()) continue; APP_VERSION* avp = rp->avp; if (rsc_work_fetch[0].nidle_now) { rsc_work_fetch[0].nidle_now -= avp->avg_ncpus; if (rsc_work_fetch[0].nidle_now <= 0) { nidle_rsc--; rsc_work_fetch[0].nidle_now = 0; } } int j = avp->gpu_usage.rsc_type; if (!j) { continue; } if (rsc_work_fetch[j].nidle_now) { rsc_work_fetch[j].nidle_now -= avp->gpu_usage.usage; if (rsc_work_fetch[j].nidle_now <= 0) { nidle_rsc--; rsc_work_fetch[j].nidle_now = 0; } } if (nidle_rsc == 0) { // no idle resources - no need to look further // break; } } return nidle_rsc; }