boinc/client/sim.cpp

1586 lines
46 KiB
C++

// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
// BOINC client simulator.
//
// usage: directory options
//
// [--infile_prefix dir/]
// Prefix of input filenames; default is blank.
// Input files are:
// client_state.xml
// global_prefs.xml
// global_prefs_override.xml
// [--outfile_prefix X]
// Prefix of output filenames; default is blank.
// Output files are:
// index.html (lists other files)
// timeline.html
// log.txt
// results.dat (simulation results, machine-readable)
// results.txt (simulation results, human-readable)
// inputs.txt (sim parameters)
// summary.txt (summary of inputs; detailed outputs)
// rec.png
//
// Simulation params:
// [--existing_jobs_only]
// If set, simulate the specific set of jobs in the state file.
// Otherwise simulate an infinite stream of jobs
// modeled after those found in the state file.
// [--duration x]
// simulation duration (default 86400)
// [--delta x]
// delta = simulation time step (default 10)
//
// Policy options:
// [--server_uses_workload]
// simulate use of EDF sim by scheduler
// [--cpu_sched_rr_only]
// use only RR scheduling
// [--rec_half_life X]
// half-life of recent est credit
#include <cmath>
#include "error_numbers.h"
#include "str_util.h"
#include "util.h"
#include "log_flags.h"
#include "filesys.h"
#include "../sched/edf_sim.h"
#include "client_msgs.h"
#include "client_state.h"
#include "project.h"
#include "result.h"
#include "scheduler_op.h"
#include "sim.h"
#define SCHED_RETRY_DELAY_MIN 60 // 1 minute
#define SCHED_RETRY_DELAY_MAX (60*60*4) // 4 hours
const char* infile_prefix = "./";
const char* outfile_prefix = "./";
#define TIMELINE_FNAME "timeline.html"
#define LOG_FNAME "log.txt"
#define INPUTS_FNAME "inputs.txt"
#define RESULTS_DAT_FNAME "results.dat"
#define RESULTS_TXT_FNAME "results.txt"
#define SUMMARY_FNAME "summary.txt"
#define REC_FNAME "rec.dat"
bool user_active;
double duration = 86400, delta = 60;
FILE* logfile;
FILE* html_out;
FILE* rec_file;
FILE* index_file;
FILE* summary_file;
char log_filename[256];
string html_msg;
double active_time = 0;
double gpu_active_time = 0;
bool server_uses_workload = false;
bool cpu_sched_rr_only = false;
bool existing_jobs_only = false;
bool include_empty_projects;
RANDOM_PROCESS on_proc;
RANDOM_PROCESS active_proc;
RANDOM_PROCESS gpu_active_proc;
RANDOM_PROCESS connected_proc;
bool on;
bool active;
bool gpu_active;
bool connected;
extern double rec_adjust_period;
SIM_RESULTS sim_results;
int njobs;
void usage(char* prog) {
fprintf(stderr, "usage: %s\n"
"[--infile_prefix F]\n"
"[--outfile_prefix F]\n"
"[--existing_jobs_only]\n"
"[--duration X]\n"
"[--delta X]\n"
"[--server_uses_workload]\n"
"[--cpu_sched_rr_only]\n"
"[--rec_half_life X]\n",
prog
);
exit(1);
}
// peak flops of an app version
//
double app_peak_flops(APP_VERSION* avp, double cpu_scale) {
double x = avp->avg_ncpus*cpu_scale;
int rt = avp->gpu_usage.rsc_type;
if (rt) {
x += avp->gpu_usage.usage * rsc_work_fetch[rt].relative_speed;
}
x *= gstate.host_info.p_fpops;
return x;
}
double gpu_peak_flops() {
double x = 0;
for (int i=1; i<coprocs.n_rsc; i++) {
x += coprocs.coprocs[i].count * rsc_work_fetch[i].relative_speed * gstate.host_info.p_fpops;
}
return x;
}
double cpu_peak_flops() {
return gstate.ncpus * gstate.host_info.p_fpops;
}
void print_project_results(FILE* f) {
for (unsigned int i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
p->print_results(f, sim_results);
}
}
APP* choose_app(vector<APP*>& apps) {
double x = drand();
double sum = 0;
unsigned int i;
for (i=0; i<apps.size(); i++) {
sum += apps[i]->weight;
}
for (i=0; i<apps.size(); i++) {
APP* app = apps[i];
x -= app->weight/sum;
if (x <= 0) {
return app;
}
}
return apps.back();
}
bool app_version_needs_work(APP_VERSION* avp) {
if (avp->dont_use) return false;
int rt = avp->gpu_usage.rsc_type;
if (rt) {
return (rsc_work_fetch[rt].req_secs>0 || rsc_work_fetch[rt].req_instances>0);
}
return (rsc_work_fetch[0].req_secs>0 || rsc_work_fetch[0].req_instances>0);
}
bool has_app_version_needing_work(APP* app) {
for (unsigned int i=0; i<gstate.app_versions.size(); i++) {
APP_VERSION* avp = gstate.app_versions[i];
if (avp->app != app) continue;
if (app_version_needs_work(avp)) return true;
}
return false;
}
// choose a version for this app for which we need work
//
APP_VERSION* choose_app_version(APP* app) {
APP_VERSION* best_avp = NULL;
for (unsigned int i=0; i<gstate.app_versions.size(); i++) {
APP_VERSION* avp = gstate.app_versions[i];
if (avp->app != app) continue;
if (!app_version_needs_work(avp)) continue;
if (!best_avp) {
best_avp = avp;
} else if (avp->flops > best_avp->flops) {
best_avp = avp;
}
}
return best_avp;
}
// generate a job; pick a random app for this project,
// and pick a FLOP count from its distribution
//
void make_job(
PROJECT* p, WORKUNIT* wup, RESULT* rp, vector<APP*>app_list
) {
APP* app = choose_app(app_list);
APP_VERSION* avp = choose_app_version(app);
rp->clear();
rp->avp = avp;
rp->app = app;
if (!rp->avp) {
fprintf(stderr, "ERROR - NO APP VERSION\n");
exit(1);
}
rp->project = p;
rp->wup = wup;
sprintf(rp->name, "%s_%d", p->project_name, p->result_index++);
wup->project = p;
wup->rsc_fpops_est = app->fpops_est;
rp->sim_flops_left = rp->wup->rsc_fpops_est;
safe_strcpy(wup->name, rp->name);
safe_strcpy(wup->app_name, app->name);
wup->app = app;
double ops = app->fpops.sample();
if (ops < 0) ops = 0;
wup->rsc_fpops_est = ops;
rp->report_deadline = gstate.now + app->latency_bound;
}
// process ready-to-report results
//
void CLIENT_STATE::handle_completed_results(PROJECT* p) {
char buf[256];
vector<RESULT*>::iterator result_iter;
result_iter = results.begin();
while (result_iter != results.end()) {
RESULT* rp = *result_iter;
if (rp->project == p && rp->ready_to_report) {
if (gstate.now > rp->report_deadline) {
sprintf(buf, "result %s reported; "
"<font color=#cc0000>MISSED DEADLINE by %f</font><br>\n",
rp->name, gstate.now - rp->report_deadline
);
} else {
sprintf(buf, "result %s reported; "
"<font color=#00cc00>MADE DEADLINE</font><br>\n",
rp->name
);
}
PROJECT* spp = rp->project;
if (gstate.now > rp->report_deadline) {
sim_results.flops_wasted += rp->peak_flop_count;
sim_results.nresults_missed_deadline++;
spp->project_results.nresults_missed_deadline++;
spp->project_results.flops_wasted += rp->peak_flop_count;
} else {
sim_results.nresults_met_deadline++;
spp->project_results.nresults_met_deadline++;
}
html_msg += buf;
delete rp;
result_iter = results.erase(result_iter);
} else {
++result_iter;
}
}
}
// convert results in progress to IP_RESULTs,
// and get an initial schedule for them
//
void CLIENT_STATE::get_workload(vector<IP_RESULT>& ip_results) {
for (unsigned int i=0; i<results.size(); i++) {
RESULT* rp = results[i];
double x = rp->estimated_runtime_remaining();
if (x == 0) continue;
IP_RESULT ipr(rp->name, rp->report_deadline-now, x);
ip_results.push_back(ipr);
}
//init_ip_results(work_buf_min(), ncpus, ip_results);
init_ip_results(0, ncpus, ip_results);
}
void get_apps_needing_work(PROJECT* p, vector<APP*>& apps) {
apps.clear();
for (unsigned int i=0; i<gstate.apps.size(); i++) {
APP* app = gstate.apps[i];
if (app->project != p) continue;
if (app->ignore) continue;
if (!has_app_version_needing_work(app)) continue;
apps.push_back(app);
}
}
void decrement_request_rsc(
RSC_WORK_FETCH& rwf, double ninstances, double est_runtime
) {
rwf.req_secs -= est_runtime * ninstances;
rwf.req_instances -= ninstances;
rwf.estimated_delay += est_runtime*(ninstances/rwf.ninstances);
}
void decrement_request(RESULT* rp) {
APP_VERSION* avp = rp->avp;
double est_runtime = rp->wup->rsc_fpops_est/avp->flops;
est_runtime /= (gstate.time_stats.on_frac*gstate.time_stats.active_frac);
decrement_request_rsc(rsc_work_fetch[0], avp->avg_ncpus, est_runtime);
int rt = avp->gpu_usage.rsc_type;
if (rt) {
decrement_request_rsc(rsc_work_fetch[rt], avp->gpu_usage.usage, est_runtime);
}
}
double get_estimated_delay(RESULT* rp) {
int rt = rp->avp->gpu_usage.rsc_type;
return rsc_work_fetch[rt].estimated_delay;
}
// simulate trying to do an RPC;
// return true if we actually did one
//
bool CLIENT_STATE::simulate_rpc(PROJECT* p) {
char buf[256], buf2[256];
vector<IP_RESULT> ip_results;
vector<RESULT*> new_results;
bool avail;
if (p->last_rpc_time) {
double delta = now - p->last_rpc_time;
avail = p->available.sample(delta);
} else {
avail = p->available.sample(0);
}
p->last_rpc_time = now;
if (!avail) {
sprintf(buf, "RPC to %s skipped - project down<br>", p->project_name);
html_msg += buf;
msg_printf(p, MSG_INFO, "RPC skipped: project down");
gstate.scheduler_op->project_rpc_backoff(p, "project down");
p->master_url_fetch_pending = false;
return false;
}
// save request params for WORK_FETCH::handle_reply
//
double save_cpu_req_secs = rsc_work_fetch[0].req_secs;
for (int i=1; i<coprocs.n_rsc; i++) {
COPROC& cp = coprocs.coprocs[i];
if (!strcmp(cp.type, "NVIDIA")) {
coprocs.nvidia.req_secs = rsc_work_fetch[i].req_secs;
}
if (!strcmp(cp.type, "ATI")) {
coprocs.ati.req_secs = rsc_work_fetch[i].req_secs;
}
}
if (!server_uses_workload) {
for (int i=0; i<coprocs.n_rsc; i++) {
rsc_work_fetch[i].estimated_delay = rsc_work_fetch[i].busy_time_estimator.get_busy_time();
}
}
for (unsigned int i=0; i<app_versions.size(); i++) {
app_versions[i]->dont_use = false;
}
work_fetch.request_string(buf2);
sprintf(buf, "RPC to %s: %s<br>", p->project_name, buf2);
html_msg += buf;
msg_printf(p, MSG_INFO, "RPC: %s", buf2);
handle_completed_results(p);
if (server_uses_workload) {
get_workload(ip_results);
}
bool sent_something = false;
while (!existing_jobs_only) {
vector<APP*> apps;
get_apps_needing_work(p, apps);
if (apps.empty()) break;
RESULT* rp = new RESULT;
WORKUNIT* wup = new WORKUNIT;
make_job(p, wup, rp, apps);
double et = wup->rsc_fpops_est / rp->avp->flops;
if (server_uses_workload) {
IP_RESULT c(rp->name, rp->report_deadline-now, et);
if (check_candidate(c, ncpus, ip_results)) {
ip_results.push_back(c);
} else {
msg_printf(p, MSG_INFO, "job for %s misses deadline sim\n", rp->app->name);
APP_VERSION* avp = rp->avp;
delete rp;
delete wup;
avp->dont_use = true;
continue;
}
} else {
double est_delay = get_estimated_delay(rp);
if (est_delay + et > wup->app->latency_bound) {
msg_printf(p, MSG_INFO,
"job for %s misses deadline approx: del %f + et %f > %f\n",
rp->app->name,
est_delay, et, wup->app->latency_bound
);
APP_VERSION* avp = rp->avp;
delete rp;
delete wup;
avp->dont_use = true;
continue;
}
}
sent_something = true;
rp->set_state(RESULT_FILES_DOWNLOADED, "simulate_rpc");
results.push_back(rp);
new_results.push_back(rp);
#if 0
sprintf(buf, "got job %s: CPU time %.2f, deadline %s<br>",
rp->name, rp->final_cpu_time, time_to_string(rp->report_deadline)
);
html_msg += buf;
#endif
decrement_request(rp);
}
njobs += new_results.size();
msg_printf(0, MSG_INFO, "Got %lu tasks", new_results.size());
sprintf(buf, "got %lu tasks<br>", new_results.size());
html_msg += buf;
SCHEDULER_REPLY sr;
rsc_work_fetch[0].req_secs = save_cpu_req_secs;
work_fetch.handle_reply(p, &sr, new_results);
p->nrpc_failures = 0;
p->sched_rpc_pending = 0;
//p->min_rpc_time = now + 900;
p->min_rpc_time = now;
if (sent_something) {
request_schedule_cpus("simulate_rpc");
request_work_fetch("simulate_rpc");
}
sim_results.nrpcs++;
return true;
}
void PROJECT::backoff() {
nrpc_failures++;
double backoff = calculate_exponential_backoff(
nrpc_failures, SCHED_RETRY_DELAY_MIN, SCHED_RETRY_DELAY_MAX
);
min_rpc_time = gstate.now + backoff;
}
bool CLIENT_STATE::scheduler_rpc_poll() {
PROJECT *p;
bool action = false;
static double last_time=0;
static double last_work_fetch_time = 0;
double elapsed_time;
// check only every 5 sec
//
if (now - last_time < SCHEDULER_RPC_POLL_PERIOD) {
#if 0
msg_printf(NULL, MSG_INFO, "RPC poll: not time %f - %f < %f",
now, last_time, SCHEDULER_RPC_POLL_PERIOD
);
#endif
return false;
}
last_time = now;
//msg_printf(NULL, MSG_INFO, "RPC poll start");
while (1) {
#if 0
p = next_project_sched_rpc_pending();
if (p) {
work_fetch.piggyback_work_request(p);
action = simulate_rpc(p);
break;
}
#endif
p = find_project_with_overdue_results(false);
if (p) {
msg_printf(p, MSG_INFO, "doing RPC to report results");
p->sched_rpc_pending = RPC_REASON_RESULTS_DUE;
work_fetch.piggyback_work_request(p);
action = simulate_rpc(p);
break;
}
// should we check work fetch? Do this at most once/minute
if (must_check_work_fetch) {
last_work_fetch_time = 0;
}
elapsed_time = now - last_work_fetch_time;
if (elapsed_time < WORK_FETCH_PERIOD) {
return false;
}
must_check_work_fetch = false;
last_work_fetch_time = now;
p = work_fetch.choose_project();
if (p) {
msg_printf(p, MSG_INFO, "doing RPC to get work");
action = simulate_rpc(p);
break;
}
break;
}
#if 0
if (action) {
msg_printf(p, MSG_INFO, "RPC poll: did an RPC");
} else {
msg_printf(0, MSG_INFO, "RPC poll: didn't do an RPC");
}
#endif
return action;
}
bool ACTIVE_TASK_SET::poll() {
unsigned int i;
char buf[256];
bool action = false;
static double last_time = START_TIME;
double diff = gstate.now - last_time;
if (diff < 1.0) return false;
last_time = gstate.now;
if (diff > delta) {
diff = 0;
}
PROJECT* p;
for (i=0; i<gstate.projects.size(); i++) {
p = gstate.projects[i];
p->idle = true;
}
// we do two kinds of FLOPs accounting:
// 1) actual FLOPS (for job completion)
// 2) peak FLOPS (for total and per-project resource usage)
//
// CPU may be overcommitted, in which case we compute
// a "cpu_scale" factor that is < 1.
// GPUs are never overcommitted.
//
// actual FLOPS is based on app_version.flops, scaled by cpu_scale for CPU jobs
// peak FLOPS is based on device peak FLOPS,
// with CPU component scaled by cpu_scale for all jobs
// get CPU usage by GPU and CPU jobs
//
double cpu_usage_cpu=0;
double cpu_usage_gpu=0;
for (i=0; i<active_tasks.size(); i++) {
ACTIVE_TASK* atp = active_tasks[i];
if (atp->task_state() != PROCESS_EXECUTING) continue;
RESULT* rp = atp->result;
if (rp->uses_gpu()) {
if (gpu_active) {
cpu_usage_gpu += rp->avp->avg_ncpus;
}
} else {
cpu_usage_cpu += rp->avp->avg_ncpus;
}
}
double cpu_usage = cpu_usage_cpu + cpu_usage_gpu;
// if CPU is overcommitted, compute cpu_scale
//
double cpu_scale = 1;
if (cpu_usage > gstate.ncpus) {
cpu_scale = (gstate.ncpus - cpu_usage_gpu) / (cpu_usage - cpu_usage_gpu);
}
double used = 0;
for (i=0; i<active_tasks.size(); i++) {
ACTIVE_TASK* atp = active_tasks[i];
if (atp->task_state() != PROCESS_EXECUTING) continue;
RESULT* rp = atp->result;
if (!gpu_active && rp->uses_gpu()) {
continue;
}
atp->elapsed_time += diff;
double flops = rp->avp->flops;
if (!rp->uses_gpu()) {
flops *= cpu_scale;
}
rp->sim_flops_left -= diff*flops;
atp->fraction_done = 1 - rp->sim_flops_left / rp->wup->rsc_fpops_est;
atp->checkpoint_wall_time = gstate.now;
if (rp->sim_flops_left <= 0) {
atp->set_task_state(PROCESS_EXITED, "poll");
rp->exit_status = 0;
rp->ready_to_report = true;
gstate.request_schedule_cpus("job finished");
gstate.request_work_fetch("job finished");
sprintf(buf, "result %s finished<br>", rp->name);
html_msg += buf;
action = true;
}
double pf = diff * app_peak_flops(rp->avp, cpu_scale);
rp->project->project_results.flops_used += pf;
rp->peak_flop_count += pf;
sim_results.flops_used += pf;
used += pf;
rp->project->idle = false;
}
for (i=0; i<gstate.projects.size(); i++) {
p = gstate.projects[i];
if (p->idle) {
p->idle_time += diff;
p->idle_time_sumsq += diff*(p->idle_time*p->idle_time);
} else {
p->idle_time = 0;
}
}
active_time += diff;
if (gpu_active) {
gpu_active_time += diff;
}
return action;
}
// return the fraction of flops that was spent in violation of shares
// i.e., if a project got X and it should have got Y,
// add up |X-Y| over all projects, and divide by total flops
//
double CLIENT_STATE::share_violation() {
unsigned int i;
double tot = 0, trs=0;
for (i=0; i<projects.size(); i++) {
PROJECT* p = projects[i];
tot += p->project_results.flops_used;
trs += p->resource_share;
}
double sum = 0;
for (i=0; i<projects.size(); i++) {
PROJECT* p = projects[i];
double t = p->project_results.flops_used;
double rs = p->resource_share/trs;
double rt = tot*rs;
sum += fabs(t - rt);
}
return sum/tot;
}
// "monotony" is defined as follows:
// for each project P, maintain R(P), the time since P last ran,
// let S(P) be the RMS of R(P).
// Let X = mean(S(P))/(sched_interval*nprojects)
// (the *nprojects reflects the fact that in the limit of nprojects,
// each one waits for a time to run proportional to nprojects)
// X varies from zero (no monotony) to infinity.
// X is one in the case of round-robin on 1 CPU.
// Let monotony = 1-1/(x+1)
//
double CLIENT_STATE::monotony() {
double sum = 0;
double schedint = global_prefs.cpu_scheduling_period();
unsigned int i;
for (i=0; i<projects.size(); i++) {
PROJECT* p = projects[i];
double avg_ss = p->idle_time_sumsq/active_time;
double s = sqrt(avg_ss);
sum += s;
}
int n = (int)projects.size();
double x = sum/(n*schedint*n);
double m = 1-(1/(x+1));
//printf("sum: %f; x: %f m: %f\n", sum, x, m);
return m;
}
// the CPU totals are there; compute the other fields
//
void SIM_RESULTS::compute_figures_of_merit() {
double flops_total = cpu_peak_flops()*active_time
+ gpu_peak_flops()*gpu_active_time;
double flops_idle = flops_total - flops_used;
if (flops_idle<0) flops_idle=0;
wasted_frac = flops_wasted/flops_total;
idle_frac = flops_idle/flops_total;
share_violation = gstate.share_violation();
monotony = gstate.monotony();
}
void SIM_RESULTS::print(FILE* f, bool human_readable) {
double r = njobs?((double)nrpcs)/(njobs*2):0;
if (human_readable) {
fprintf(f,
"wasted fraction %f\n"
"Idle fraction %f\n"
"Share violation %f\n"
"Monotony %f\n"
"RPCs per job %f\n",
wasted_frac, idle_frac, share_violation, monotony, r
);
} else {
fprintf(f, "wf %f if %f sv %f m %f r %f\n",
wasted_frac, idle_frac, share_violation, monotony, r
);
}
}
void SIM_RESULTS::parse(FILE* f) {
fscanf(f, "wasted_frac %lf idle_frac %lf share_violation %lf monotony %lf",
&wasted_frac, &idle_frac, &share_violation, &monotony
);
}
void SIM_RESULTS::add(SIM_RESULTS& r) {
wasted_frac += r.wasted_frac;
idle_frac += r.idle_frac;
share_violation += r.share_violation;
monotony += r.monotony;
}
void SIM_RESULTS::divide(int n) {
wasted_frac /= n;
idle_frac /= n;
share_violation /= n;
monotony /= n;
}
void SIM_RESULTS::clear() {
memset(this, 0, sizeof(*this));
}
void PROJECT::print_results(FILE* f, SIM_RESULTS& sr) {
double t = project_results.flops_used;
double gt = sr.flops_used;
fprintf(f, "%s: share %.2f total flops %.2fG (%.2f%%)\n"
" used %.2fG wasted %.2fG\n"
" deadlines: met %d missed %d\n",
project_name, resource_share,
t/1e9, (t/gt)*100,
project_results.flops_used/1e9,
project_results.flops_wasted/1e9,
project_results.nresults_met_deadline,
project_results.nresults_missed_deadline
);
}
const char* colors[] = {
"#000088",
"#008800",
"#880000",
"#880088",
"#888800",
"#008888",
"#0000aa",
"#00aa00",
"#aa0000",
"#aa00aa",
"#aaaa00",
"#00aaaa",
"#8800aa",
"#aa0088",
"#88aa00",
"#aa8800",
"#00aa88",
"#0088aa",
};
#define NCOLORS 18
#define WIDTH1 100
#define WIDTH2 400
void show_project_colors() {
fprintf(html_out,
"<table>\n"
" <tr><th>Project</th><th>Resource share</th></tr>\n"
);
for (unsigned int i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
fprintf(html_out,
"<tr><td bgcolor=%s><font color=ffffff>%s</font></td><td>%.0f</td></tr>\n",
colors[p->index%NCOLORS], p->project_name, p->resource_share
);
}
fprintf(html_out, "</table>\n");
}
void job_count(PROJECT* p, int rsc_type, int& in_progress, int& done) {
in_progress = done = 0;
unsigned int i;
for (i=0; i<gstate.results.size(); i++) {
RESULT* rp = gstate.results[i];
if (rp->project != p) continue;
if (rp->resource_type() != rsc_type) continue;
if (rp->state() < RESULT_FILES_UPLOADED) {
in_progress++;
} else {
done++;
}
}
}
void show_resource(int rsc_type) {
unsigned int i;
char buf[256];
fprintf(html_out, "<td width=%d valign=top>", WIDTH2);
bool found = false;
for (i=0; i<gstate.active_tasks.active_tasks.size(); i++) {
ACTIVE_TASK* atp = gstate.active_tasks.active_tasks[i];
RESULT* rp = atp->result;
if (atp->task_state() != PROCESS_EXECUTING) continue;
double ninst=0;
if (rsc_type) {
if (rp->avp->gpu_usage.rsc_type != rsc_type) continue;
ninst = rp->avp->gpu_usage.usage;
} else {
ninst = rp->avp->avg_ncpus;
}
PROJECT* p = rp->project;
if (!found) {
found = true;
fprintf(html_out,
"<table>\n"
"<tr><th>#devs</th><th>Job name (* = high priority)</th><th>GFLOPs left</th>%s</tr>\n",
rsc_type?"<th>GPU</th>":""
);
}
if (rsc_type) {
sprintf(buf, "<td>%d</td>", rp->coproc_indices[0]);
} else {
strcpy(buf, "");
}
fprintf(html_out, "<tr><td>%.2f</td><td bgcolor=%s><font color=#ffffff>%s%s</font></td><td>%.0f</td>%s</tr>\n",
ninst,
colors[p->index%NCOLORS],
rp->edf_scheduled?"*":"",
rp->name,
rp->sim_flops_left/1e9,
buf
);
}
if (found) {
fprintf(html_out, "</table>\n");
} else {
fprintf(html_out, "IDLE\n");
}
fprintf(html_out,
"<table><tr><td>Project</td><td>In progress</td><td>done</td><td>REC</td></tr>\n"
);
found = false;
for (i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
int in_progress, done;
job_count(p, rsc_type, in_progress, done);
if (in_progress || done) {
fprintf(html_out, "<td bgcolor=%s><font color=#ffffff>%s</font></td><td>%d</td><td>%d</td><td>%.3f</td></tr>\n",
colors[p->index%NCOLORS], p->project_name, in_progress, done,
p->pwf.rec
);
found = true;
}
}
//if (!found) fprintf(html_out, " ---\n");
fprintf(html_out, "</table></td>");
}
void html_start() {
char buf[256];
sprintf(buf, "%s%s", outfile_prefix, TIMELINE_FNAME);
html_out = fopen(buf, "w");
if (!html_out) {
fprintf(stderr, "can't open %s for writing\n", buf);
exit(1);
}
setbuf(html_out, 0);
fprintf(index_file, "<br><a href=%s>Timeline</a>\n", TIMELINE_FNAME);
fprintf(html_out,
"<head><style> body, td, th { font-family: Verdana; font-size: 12px;} th {white-space: nowrap;}</style></head>\n"
"<h2>BOINC client emulator results</h2>\n"
);
show_project_colors();
fprintf(html_out,
"<table border=0 cellpadding=4><tr><th width=%d>Time</th>\n", WIDTH1
);
fprintf(html_out,
"<th width=%d>CPU</th>", WIDTH2
);
for (int i=1; i<coprocs.n_rsc; i++) {
int pt = coproc_type_name_to_num(coprocs.coprocs[i].type);
const char* name;
if (pt) {
name = proc_type_name(pt);
} else {
name = coprocs.coprocs[i].type;
}
fprintf(html_out, "<th width=%d>%s</th>\n", WIDTH2, name);
}
fprintf(html_out, "</tr></table>\n");
}
void html_rec() {
if (html_msg.size()) {
fprintf(html_out,
"<table border=0 cellpadding=4><tr><td width=%d valign=top>%s</td>",
WIDTH1, sim_time_string(gstate.now)
);
fprintf(html_out,
"<td width=%d valign=top><font size=-2>%s</font></td></tr></table>\n",
coprocs.n_rsc*WIDTH2,
html_msg.c_str()
);
html_msg = "";
}
fprintf(html_out, "<table border=0 cellpadding=4><tr><td width=%d valign=top>%s</td>", WIDTH1, sim_time_string(gstate.now));
if (active) {
show_resource(0);
if (gpu_active) {
for (int i=1; i<coprocs.n_rsc; i++) {
show_resource(i);
}
} else {
for (int i=1; i<coprocs.n_rsc; i++) {
fprintf(html_out, "<td width=%d valign=top bgcolor=#aaaaaa>OFF</td>", WIDTH2);
}
}
} else {
fprintf(html_out, "<td width=%d valign=top bgcolor=#aaaaaa>OFF</td>", WIDTH2);
for (int i=1; i<coprocs.n_rsc; i++) {
fprintf(html_out, "<td width=%d valign=top bgcolor=#aaaaaa>OFF</td>", WIDTH2);
}
}
fprintf(html_out, "</tr></table>\n");
}
void html_end() {
fprintf(html_out, "<pre>\n");
sim_results.compute_figures_of_merit();
sim_results.print(html_out);
print_project_results(html_out);
fprintf(html_out, "</pre>\n");
fclose(html_out);
}
void set_initial_rec() {
unsigned int i;
double sum=0;
double x = cpu_peak_flops() + gpu_peak_flops();
for (i=0; i<gstate.projects.size(); i++) {
sum += gstate.projects[i]->resource_share;
}
for (i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
p->pwf.rec = 86400*x*(p->resource_share/sum)/1e9;
}
}
static bool compare_names(PROJECT* p1, PROJECT* p2) {
return (strcmp(p1->project_name, p2->project_name) < 0);
}
void write_recs() {
fprintf(rec_file, "%f ", gstate.now);
std::sort(
gstate.projects.begin(),
gstate.projects.end(),
compare_names
);
for (unsigned int i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
fprintf(rec_file, "%f ", p->pwf.rec);
}
fprintf(rec_file, "\n");
}
void make_graph(const char* title, const char* fname, int field) {
char gp_fname[256], cmd[256], png_fname[256];
sprintf(gp_fname, "%s%s.gp", outfile_prefix, fname);
FILE* f = fopen(gp_fname, "w");
fprintf(f,
"set terminal png small size 1024, 768\n"
"set title \"%s\"\n"
"set yrange[0:]\n"
"plot ",
title
);
for (unsigned int i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
fprintf(f, "\"%srec.dat\" using 1:%d title \"%s\" with lines%s",
outfile_prefix, 2+i+field, p->project_name,
(i==gstate.projects.size()-1)?"\n":", \\\n"
);
}
fclose(f);
sprintf(png_fname, "%s%s.png", outfile_prefix, fname);
sprintf(cmd, "gnuplot < %s > %s", gp_fname, png_fname);
fprintf(index_file, "<br><a href=%s.png>Graph of %s</a>\n", fname, title);
system(cmd);
}
static void write_inputs() {
char buf[256];
sprintf(buf, "%s/%s", outfile_prefix, INPUTS_FNAME);
FILE* f = fopen(buf, "w");
fprintf(f,
"Existing jobs only: %s\n"
"Round-robin only: %s\n"
"scheduler EDF sim: %s\n"
"Include empty projects: %s\n",
existing_jobs_only?"yes":"no",
cpu_sched_rr_only?"yes":"no",
server_uses_workload?"yes":"no",
include_empty_projects?"yes":"no"
);
fprintf(f,
"REC half-life: %f\n", cc_config.rec_half_life
);
fprintf(f,
"Simulation duration: %f\nTime step: %f\n",
duration, delta
);
fclose(f);
}
void simulate() {
bool action;
double start = START_TIME;
gstate.now = start;
html_start();
fprintf(summary_file,
"Hardware summary\n %d CPUs, %.1f GFLOPS\n",
gstate.host_info.p_ncpus, gstate.host_info.p_fpops/1e9
);
for (int i=1; i<coprocs.n_rsc; i++) {
fprintf(summary_file,
" %d %s GPUs, %.1f GFLOPS\n",
coprocs.coprocs[i].count,
coprocs.coprocs[i].type,
coprocs.coprocs[i].peak_flops/1e9
);
}
fprintf(summary_file,
"Preferences summary\n"
" work buf min %f max %f\n"
" Scheduling period %f\n"
"Scheduling policies\n"
" Round-robin only: %s\n"
" Scheduler EDF simulation: %s\n"
" REC half-life: %f\n",
gstate.work_buf_min(), gstate.work_buf_total(),
gstate.global_prefs.cpu_scheduling_period(),
cpu_sched_rr_only?"yes":"no",
server_uses_workload?"yes":"no",
cc_config.rec_half_life
);
fprintf(summary_file, "Jobs\n");
for (unsigned int i=0; i<gstate.results.size(); i++) {
RESULT* rp = gstate.results[i];
fprintf(summary_file,
" %s time left %s deadline %s\n",
rp->name,
timediff_format(rp->sim_flops_left/rp->avp->flops).c_str(),
timediff_format(rp->report_deadline - START_TIME).c_str()
);
}
fprintf(summary_file,
"Simulation parameters\n"
" time step %f, duration %f\n"
"-------------------\n",
delta, duration
);
write_inputs();
while (1) {
on = on_proc.sample(delta);
if (on) {
active = active_proc.sample(delta);
if (active) {
gpu_active = gpu_active_proc.sample(delta);
} else {
gpu_active = false;
}
connected = connected_proc.sample(delta);
} else {
active = gpu_active = connected = false;
}
// do accounting for the period that just ended,
// even if we're now in an "off" state.
//
// need both of the following, else crash
//
action |= gstate.active_tasks.poll();
action |= gstate.handle_finished_apps();
if (on) {
while (1) {
action = false;
action |= gstate.schedule_cpus();
if (connected) {
action |= gstate.scheduler_rpc_poll();
// this deletes completed results
}
action |= gstate.active_tasks.poll();
action |= gstate.handle_finished_apps();
gpu_suspend_reason = gpu_active?0:1;
//msg_printf(0, MSG_INFO, action?"did action":"did no action");
if (!action) break;
}
}
//msg_printf(0, MSG_INFO, "took time step");
for (unsigned int i=0; i<gstate.active_tasks.active_tasks.size(); i++) {
ACTIVE_TASK* atp = gstate.active_tasks.active_tasks[i];
if (atp->task_state() == PROCESS_EXECUTING) {
atp->elapsed_time += delta;
}
}
html_rec();
write_recs();
gstate.now += delta;
if (gstate.now > start + duration) break;
}
html_end();
}
void show_app(APP* app) {
fprintf(summary_file,
" app %s\n"
" job params: fpops_est %.0fG fpops mean %.0fG std_dev %.0fG\n"
" latency %.2f weight %.2f",
app->name, app->fpops_est/1e9,
app->fpops.mean/1e9, app->fpops.std_dev/1e9,
app->latency_bound,
app->weight
);
if (app->max_concurrent) {
fprintf(summary_file, " max_concurrent %d\n", app->max_concurrent);
} else {
fprintf(summary_file, "\n");
}
for (unsigned int i=0; i<gstate.app_versions.size(); i++) {
APP_VERSION* avp = gstate.app_versions[i];
if (avp->app != app) continue;
if (avp->gpu_usage.rsc_type) {
fprintf(summary_file,
" app version %d (%s)\n"
" %.2f CPUs, %.2f %s GPUs, %.0f GFLOPS\n",
avp->version_num, avp->plan_class,
avp->avg_ncpus,
avp->gpu_usage.usage,
rsc_name(avp->gpu_usage.rsc_type),
avp->flops/1e9
);
} else {
fprintf(summary_file,
" app version %d (%s)\n"
" %.2f CPUs, %.0f GFLOPS\n",
avp->version_num, avp->plan_class,
avp->avg_ncpus,
avp->flops/1e9
);
}
}
}
// get application params,
// and set "ignore" for apps that have no versions or no params.
//
// App params can be specified in 2 ways:
// - the presence of a WU and result for that app
// - app.latency_bound and app.fpops_est are populated
//
void get_app_params() {
APP* app;
unsigned int i, j;
for (i=0; i<gstate.results.size(); i++) {
RESULT* rp = gstate.results[i];
app = rp->app;
double latency_bound = rp->report_deadline - rp->received_time;
if (!app->latency_bound) {
app->latency_bound = latency_bound;
}
rp->received_time = START_TIME;
rp->report_deadline = START_TIME + latency_bound;
rp->sim_flops_left = rp->wup->rsc_fpops_est;
}
for (i=0; i<gstate.workunits.size(); i++) {
WORKUNIT* wup = gstate.workunits[i];
app = wup->app;
if (!app->fpops_est) {
app->fpops_est = wup->rsc_fpops_est;
}
}
for (i=0; i<gstate.apps.size(); i++) {
app = gstate.apps[i];
app->ignore = true;
}
for (i=0; i<gstate.app_versions.size(); i++) {
APP_VERSION* avp = gstate.app_versions[i];
if (avp->missing_coproc) continue;
avp->app->ignore = false;
}
fprintf(summary_file, "Applications and version\n");
for (j=0; j<gstate.projects.size(); j++) {
PROJECT* p = gstate.projects[j];
fprintf(summary_file, "%s\n", p->project_name);
for (i=0; i<gstate.apps.size(); i++) {
app = gstate.apps[i];
if (app->project != p) continue;
if (app->ignore) {
fprintf(summary_file,
" app %s: ignoring - no usable app versions\n",
app->name
);
continue;
}
if (app->non_cpu_intensive) {
fprintf(summary_file,
" app %s: ignoring - non CPU intensive\n",
app->name
);
app->ignore = true;
continue;
}
// if missing app params, fill in defaults
//
if (!app->fpops_est) {
app->fpops_est = 3600e11;
}
if (!app->latency_bound) {
app->latency_bound = 864000;
}
if (!app->fpops_est || !app->latency_bound) {
app->ignore = true;
fprintf(summary_file,
" app %s: ignoring - no job parameters (see below)\n",
app->name
);
} else if (app->ignore) {
fprintf(summary_file,
" app %s: ignoring - no app versions\n",
app->name
);
} else {
if (!app->fpops.mean) {
app->fpops.mean = app->fpops_est;
}
if (!app->weight) {
app->weight = 1;
}
show_app(app);
}
}
}
fprintf(summary_file,
"\n"
"Note: an app's job parameters are taken from a job for that app.\n"
" They can also be specified by adding tags to client_state.xml.\n"
" See http://boinc.berkeley.edu/trac/wiki/ClientSim.\n"
"\n"
);
}
// zero backoffs and REC
//
void clear_backoff() {
unsigned int i;
for (i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
for (int j=0; j<coprocs.n_rsc; j++) {
p->rsc_pwf[j].reset();
}
p->min_rpc_time = 0;
}
}
// remove apps with no app versions,
// then projects with no apps
//
void cull_projects() {
unsigned int i;
PROJECT* p;
for (i=0; i<gstate.projects.size(); i++) {
p = gstate.projects[i];
p->no_apps = true;
}
for (i=0; i<gstate.app_versions.size(); i++) {
APP_VERSION* avp = gstate.app_versions[i];
if (avp->app->ignore) continue;
}
for (i=0; i<gstate.apps.size(); i++) {
APP* app = gstate.apps[i];
if (!app->ignore) {
app->project->no_apps = false;
}
}
for (i=0; i<gstate.projects.size(); i++) {
p = gstate.projects[i];
if (p->no_apps) {
fprintf(summary_file,
"%s: Removing from simulation - no apps\n",
p->project_name
);
p->ignore = true;
} else if (p->non_cpu_intensive) {
fprintf(summary_file,
"%s: Removing from simulation - non CPU intensive\n",
p->project_name
);
p->ignore = true;
}
}
// remove results and active tasks of projects we're culling
//
vector<ACTIVE_TASK*>::iterator ati = gstate.active_tasks.active_tasks.begin();
while (ati != gstate.active_tasks.active_tasks.end()) {
ACTIVE_TASK* atp = *ati;
if (atp->wup->project->ignore) {
ati = gstate.active_tasks.active_tasks.erase(ati);
} else {
++ati;
}
}
vector<RESULT*>::iterator ri = gstate.results.begin();
while (ri != gstate.results.end()) {
RESULT* rp = *ri;
if (rp->project->ignore) {
ri = gstate.results.erase(ri);
} else {
++ri;
}
}
vector<PROJECT*>::iterator iter = gstate.projects.begin();
while (iter != gstate.projects.end()) {
p = *iter;
if (p->ignore) {
iter = gstate.projects.erase(iter);
} else {
++iter;
}
}
}
void do_client_simulation() {
char buf[256], buf2[256];
int retval;
FILE* f;
sprintf(buf, "%s%s", infile_prefix, CONFIG_FILE);
cc_config.defaults();
read_config_file(true, buf);
log_flags.init();
sprintf(buf, "%s%s", outfile_prefix, "log_flags.xml");
f = fopen(buf, "r");
if (f) {
MIOFILE mf;
mf.init_file(f);
XML_PARSER xp(&mf);
xp.get_tag(); // skip open tag
log_flags.parse(xp);
fclose(f);
}
gstate.add_platform("client simulator");
sprintf(buf, "%s%s", infile_prefix, STATE_FILE_NAME);
if (!boinc_file_exists(buf)) {
fprintf(stderr, "No client state file\n");
exit(1);
}
retval = gstate.parse_state_file_aux(buf);
if (retval) {
fprintf(stderr, "state file parse error %d\n", retval);
exit(1);
}
// if tasks have pending transfers, mark as completed
//
for (unsigned int i=0; i<gstate.results.size(); i++) {
RESULT* rp = gstate.results[i];
if (rp->state() < RESULT_FILES_DOWNLOADED) {
rp->set_state(RESULT_FILES_DOWNLOADED, "init");
} else if (rp->state() == RESULT_FILES_UPLOADING) {
rp->set_state(RESULT_FILES_UPLOADED, "init");
}
}
cc_config.show();
log_flags.show();
sprintf(buf, "%s%s", infile_prefix, GLOBAL_PREFS_FILE_NAME);
sprintf(buf2, "%s%s", infile_prefix, GLOBAL_PREFS_OVERRIDE_FILE);
gstate.read_global_prefs(buf, buf2);
fprintf(index_file,
"<h3>Output files</h3>\n"
"<a href=%s>Summary</a>\n"
"<br><a href=%s>Log file</a>\n",
SUMMARY_FNAME, LOG_FNAME
);
// fill in GPU device nums and OpenCL flags
//
for (int i=0; i<coprocs.n_rsc; i++) {
COPROC& cp = coprocs.coprocs[i];
for (int j=0; j<cp.count; j++) {
cp.device_nums[j] = j;
if (cp.have_opencl) {
cp.instance_has_opencl[j] = true;
}
}
}
set_no_rsc_config();
process_gpu_exclusions();
get_app_params();
if (!include_empty_projects) {
cull_projects();
}
fprintf(summary_file, "--------------------------\n");
int j=0;
for (unsigned int i=0; i<gstate.projects.size(); i++) {
gstate.projects[i]->index = j++;
}
clear_backoff();
gstate.log_show_projects();
gstate.set_ncpus();
work_fetch.init();
//set_initial_rec();
rec_adjust_period = delta;
gstate.request_work_fetch("init");
simulate();
sim_results.compute_figures_of_merit();
sprintf(buf, "%s%s", outfile_prefix, RESULTS_DAT_FNAME);
f = fopen(buf, "w");
sim_results.print(f);
fclose(f);
sprintf(buf, "%s%s", outfile_prefix, RESULTS_TXT_FNAME);
f = fopen(buf, "w");
sim_results.print(f, true);
fclose(f);
fprintf(summary_file,
"Simulation done.\n"
"-------------------------\n"
"Figures of merit:\n"
);
sim_results.print(summary_file, true);
double cpu_time;
boinc_calling_thread_cpu_time(cpu_time);
fprintf(summary_file,
"-------------------------\n"
"Simulator CPU time: %f secs\n"
"-------------------------\n"
"Peak FLOPS: CPU %.2fG GPU %.2fG\n",
cpu_time,
cpu_peak_flops()/1e9,
gpu_peak_flops()/1e9
);
print_project_results(summary_file);
fclose(rec_file);
make_graph("REC", "rec", 0);
}
char* next_arg(int argc, char** argv, int& i) {
if (i >= argc) {
fprintf(stderr, "Missing command-line argument\n");
usage(argv[0]);
}
return argv[i++];
}
int main(int argc, char** argv) {
int i;
char buf[256];
sim_results.clear();
for (i=1; i<argc;) {
char* opt = argv[i++];
if (!strcmp(opt, "--infile_prefix")) {
infile_prefix = argv[i++];
} else if (!strcmp(opt, "--outfile_prefix")) {
outfile_prefix = argv[i++];
} else if (!strcmp(opt, "--existing_jobs_only")) {
existing_jobs_only = true;
} else if (!strcmp(opt, "--duration")) {
duration = atof(next_arg(argc, argv, i));
} else if (!strcmp(opt, "--delta")) {
delta = atof(next_arg(argc, argv, i));
} else if (!strcmp(opt, "--server_uses_workload")) {
server_uses_workload = true;
} else if (!strcmp(opt, "--cpu_sched_rr_only")) {
cpu_sched_rr_only = true;
} else if (!strcmp(opt, "--include_empty_projects")) {
include_empty_projects = true;
} else if (!strcmp(opt, "--rec_half_life")) {
cc_config.rec_half_life = atof(argv[i++]);
} else {
usage(argv[0]);
}
}
if (duration <= 0) {
fprintf(stderr, "duration <= 0\n");
exit(1);
}
if (delta <= 0) {
fprintf(stderr, "delta <= 0\n");
exit(1);
}
sprintf(buf, "%s%s", outfile_prefix, "index.html");
index_file = fopen(buf, "w");
sprintf(log_filename, "%s%s", outfile_prefix, LOG_FNAME);
logfile = fopen(log_filename, "w");
if (!logfile) {
fprintf(stderr, "Can't open %s\n", buf);
exit(1);
}
setbuf(logfile, 0);
sprintf(buf, "%s%s", outfile_prefix, REC_FNAME);
rec_file = fopen(buf, "w");
sprintf(buf, "%s%s", outfile_prefix, SUMMARY_FNAME);
summary_file = fopen(buf, "w");
srand(1); // make it deterministic
do_client_simulation();
}