// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see .
// Modify claimed credit based on the historical granted credit if
// the project is configured to do this
//
// functions related to the computation and granting of credit
// Note: this is credit.cpp rather than sched_credit.cpp
// because you might grant credit e.g. from a trickle handler
#include
#include "boinc_db.h"
#include "error_numbers.h"
#include "sched_config.h"
#include "sched_msgs.h"
#include "sched_util.h"
#include "sched_shmem.h"
#include "sched_types.h"
#include "credit.h"
double fpops_to_credit(double fpops) {
return fpops*COBBLESTONE_SCALE;
}
double cpu_time_to_credit(double cpu_time, double cpu_flops_sec) {
return fpops_to_credit(cpu_time*cpu_flops_sec);
}
// Grant the host (and associated user and team)
// the given amount of credit for work that started at the given time.
// Update the user and team records,
// but not the host record (caller must update)
//
int grant_credit(DB_HOST& host, double start_time, double credit) {
DB_USER user;
DB_TEAM team;
int retval;
char buf[256];
double now = dtime();
// first, process the host
update_average(
now,
start_time, credit, CREDIT_HALF_LIFE,
host.expavg_credit, host.expavg_time
);
host.total_credit += credit;
// then the user
retval = user.lookup_id(host.userid);
if (retval) {
log_messages.printf(MSG_CRITICAL,
"lookup of user %d failed: %s\n",
host.userid, boincerror(retval)
);
return retval;
}
update_average(
now,
start_time, credit, CREDIT_HALF_LIFE,
user.expavg_credit, user.expavg_time
);
sprintf(
buf, "total_credit=total_credit+%.15e, expavg_credit=%.15e, expavg_time=%.15e",
credit, user.expavg_credit, user.expavg_time
);
retval = user.update_field(buf);
if (retval) {
log_messages.printf(MSG_CRITICAL,
"update of user %d failed: %s\n",
host.userid, boincerror(retval)
);
}
// and finally the team
if (user.teamid) {
retval = team.lookup_id(user.teamid);
if (retval) {
log_messages.printf(MSG_CRITICAL,
"lookup of team %d failed: %s\n",
user.teamid, boincerror(retval)
);
return retval;
}
update_average(
now,
start_time, credit, CREDIT_HALF_LIFE,
team.expavg_credit, team.expavg_time
);
sprintf(buf,
"total_credit=total_credit+%.15e, expavg_credit=%.15e, expavg_time=%.15e",
credit, team.expavg_credit, team.expavg_time
);
retval = team.update_field(buf);
if (retval) {
log_messages.printf(MSG_CRITICAL,
"update of team %d failed: %s\n",
team.id, boincerror(retval)
);
}
}
return 0;
}
///////////////////// V2 CREDIT STUFF STARTS HERE ///////////////////
// levels of confidence in a credit value
//
#define PFC_MODE_NORMAL 0
// PFC was computed in the "normal" way, i.e.
// - claimed PFC
// - app version scaling (i.e. not anonymous platform)
// - host scaling
#define PFC_MODE_APPROX 1
// PFC was approximated, but still (in the absence of cheating)
// reflects the size of the particular job
#define PFC_MODE_WU_EST 2
// PFC was set to the WU estimate.
// If this doesn't reflect the WU size, neither does the PFC estimate
// This is a last resort, and can be way off.
// used in the computation of AV scale factors
//
struct RSC_INFO {
double pfc_sum;
double pfc_n;
int nvers_thresh; // # app versions w/ lots of samples
int nvers_total;
RSC_INFO() {
pfc_sum = 0;
pfc_n = 0;
nvers_thresh = 0;
nvers_total = 0;
}
void update(APP_VERSION& av) {
nvers_total++;
if (av.pfc.n > MIN_VERSION_SAMPLES) {
nvers_thresh++;
pfc_sum += av.pfc.get_avg() * av.pfc.n;
pfc_n += av.pfc.n;
}
}
double avg() {
return pfc_sum/pfc_n;
}
};
// "avg" is the average PFC for this app
// over CPU versions or GPU versions, whichever is lowest.
// Update the pfc_scale of this app's versions in the DB,
// and update app.min_avg_pfc
//
int scale_versions(APP& app, double avg, SCHED_SHMEM* ssp) {
char buf[256];
int retval;
for (int j=0; jnapp_versions; j++) {
APP_VERSION& av = ssp->app_versions[j];
if (av.appid != app.id) continue;
if (av.pfc.n < MIN_VERSION_SAMPLES) continue;
av.pfc_scale= avg/av.pfc.get_avg();
DB_APP_VERSION dav;
dav.id = av.id;
sprintf(buf, "pfc_scale=%.15e", av.pfc_scale);
retval = dav.update_field(buf);
if (retval) return retval;
if (config.debug_credit) {
PLATFORM* p = ssp->lookup_platform_id(av.platformid);
log_messages.printf(MSG_NORMAL,
" updating scale factor for %d (%s %s)\n",
av.id, p->name, av.plan_class
);
log_messages.printf(MSG_NORMAL,
" n: %g avg PFC: %g new scale: %g\n",
av.pfc.n, av.pfc.get_avg(), av.pfc_scale
);
}
}
app.min_avg_pfc = avg;
DB_APP da;
da.id = app.id;
sprintf(buf, "min_avg_pfc=%.15e", avg);
retval = da.update_field(buf);
if (retval) return retval;
return 0;
}
// Update app version scale factors,
// and find the min average PFC for each app.
// Called periodically from the master feeder.
//
int update_av_scales(SCHED_SHMEM* ssp) {
int i, j, retval;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL, "-- updating app version scales --\n");
}
for (i=0; inapps; i++) {
APP& app = ssp->apps[i];
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL, "app %s (%d)\n", app.name, app.id);
}
RSC_INFO cpu_info, gpu_info;
// find the average PFC of CPU and GPU versions
for (j=0; jnapp_versions; j++) {
APP_VERSION& avr = ssp->app_versions[j];
if (avr.appid != app.id) continue;
DB_APP_VERSION av;
retval = av.lookup_id(avr.id);
if (retval) return retval;
avr = av; // update shared mem array
if (strstr(av.plan_class, "cuda") || strstr(av.plan_class, "ati")) {
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"add to gpu totals: (%d %s) %g %g\n",
av.id, av.plan_class, av.pfc.n, av.pfc.get_avg()
);
}
gpu_info.update(av);
} else {
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"add to cpu totals: (%d %s) %g %g\n",
av.id, av.plan_class, av.pfc.n, av.pfc.get_avg()
);
}
cpu_info.update(av);
}
}
// If there are only CPU or only GPU versions,
// and at least 2 are above threshold, normalize to the average
//
// If there are both, and at least 1 of each is above threshold,
// normalize to the min of the averages
//
if (cpu_info.nvers_total) {
if (gpu_info.nvers_total) {
if (cpu_info.nvers_thresh && gpu_info.nvers_thresh) {
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"CPU avg: %g; GPU avg: %g\n",
cpu_info.avg(), gpu_info.avg()
);
}
scale_versions(app,
cpu_info.avg() 1) {
log_messages.printf(MSG_NORMAL,
"CPU avg: %g\n", cpu_info.avg()
);
scale_versions(app, cpu_info.avg(), ssp);
}
}
} else {
if (gpu_info.nvers_thresh > 1) {
log_messages.printf(MSG_NORMAL,
"GPU avg: %g\n", gpu_info.avg()
);
scale_versions(app, gpu_info.avg(), ssp);
}
}
}
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL, "-------------\n");
}
return 0;
}
// look up HOST_APP_VERSION record; called from validator and transitioner.
// Normally the record will exist; if not create it (transitional case)
//
int hav_lookup(DB_HOST_APP_VERSION& hav, int hostid, int avid) {
int retval;
char buf[256];
sprintf(buf, "where host_id=%d and app_version_id=%d", hostid, avid);
retval = hav.lookup(buf);
if (retval == ERR_DB_NOT_FOUND) {
hav.clear();
hav.host_id = hostid;
hav.app_version_id = avid;
retval = hav.insert();
}
return retval;
}
DB_APP_VERSION* av_lookup(int id, vector& app_versions) {
for (unsigned int i=0; i 1e4 || x < 1e-4) {
log_messages.printf(MSG_CRITICAL,
"Bad FLOP ratio (%f): check workunit.rsc_fpops_est for %s (app %s)\n",
x, wu.name, app.name
);
return false;
}
return true;
}
// Compute or estimate "claimed peak FLOP count".
// Possibly update host_app_version records and write to DB.
// Possibly update app_version records in memory and let caller write to DB,
// to merge DB writes
//
int get_pfc(
RESULT& r, WORKUNIT& wu, DB_APP& app, // in
vector&app_versions, // in/out
DB_HOST_APP_VERSION& hav, // in/out
double& pfc, int& mode // out
) {
DB_APP_VERSION* avp=0;
int retval;
mode = PFC_MODE_APPROX;
// is result from old scheduler that didn't set r.app_version_id correctly?
// if so, use WU estimate (this is a transient condition)
//
if (r.app_version_id == 0 || r.app_version_id == 1) {
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] missing app_version_id (%d): returning WU default %.2f\n",
r.id, r.app_version_id, wu_estimated_credit(wu, app)
);
}
mode = PFC_MODE_WU_EST;
pfc = wu_estimated_pfc(wu, app);
return 0;
}
// temporary kludge for SETI@home:
// if GPU initialization fails the app falls back to CPU.
//
if (strstr(r.stderr_out, "Device Emulation (CPU)")) {
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d][AV#%d] CUDA app fell back to CPU; returning WU default %.2f\n",
r.id, r.app_version_id, wu.rsc_fpops_est*COBBLESTONE_SCALE
);
}
mode = PFC_MODE_WU_EST;
pfc = wu_estimated_pfc(wu, app);
return 0;
}
int gavid = generalized_app_version_id(r.app_version_id, r.appid);
// transition case
//
if (!hav.host_id) {
mode = PFC_MODE_WU_EST;
pfc = wu_estimated_pfc(wu, app);
return 0;
}
// old clients report CPU time but not elapsed time.
// Use HOST_APP_VERSION.et to track statistics of CPU time.
//
if (r.elapsed_time < 1e-6) {
// in case buggy client reports elapsed time like 1e-304
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] old client (elapsed time not reported)\n",
r.id
);
}
hav.et.update_var(
r.cpu_time/wu.rsc_fpops_est,
HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
);
pfc = wu_estimated_pfc(wu, app);
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] old client: raw credit %.2f\n",
r.id, pfc*COBBLESTONE_SCALE
);
}
bool do_scale = true;
if (hav.et.n < MIN_HOST_SAMPLES || (hav.et.get_avg() <= 0)) {
do_scale = false;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] old client: no host scaling - zero or too few samples %f\n",
r.id, hav.et.n
);
}
}
if (do_scale
&& app.host_scale_check
&& hav.consecutive_valid < CONS_VALID_HOST_SCALE
) {
do_scale = false;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] old client: no host scaling - cons valid %d\n",
r.id, hav.consecutive_valid
);
}
}
if (do_scale) {
double s = r.cpu_time / (hav.et.get_avg()*wu.rsc_fpops_est);
pfc *= s;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] old client: scaling (based on CPU time) by %g, return %.2f\n",
r.id, s, pfc*COBBLESTONE_SCALE
);
}
}
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] old client: returning PFC %.2f\n",
r.id, pfc*COBBLESTONE_SCALE
);
}
return 0;
}
// r.flops_estimate shouldn't be zero,
// but (because of scheduler bug) it can be.
// At this point we don't have much to go on, so use 1e10.
//
if (!r.flops_estimate) {
r.flops_estimate = 1e10;
}
double raw_pfc = (r.elapsed_time * r.flops_estimate);
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] raw credit: %.2f (%.2f sec, %.2f est GFLOPS)\n",
r.id, raw_pfc*COBBLESTONE_SCALE, r.elapsed_time,
r.flops_estimate/1e9
);
}
// Sanity check
//
if (raw_pfc > wu.rsc_fpops_bound) {
char query[256], clause[256];
pfc = wu_estimated_pfc(wu, app);
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] sanity check failed: %.2f>%.2f, return %.2f\n",
r.id, raw_pfc*COBBLESTONE_SCALE,
wu.rsc_fpops_bound*COBBLESTONE_SCALE, pfc*COBBLESTONE_SCALE
);
}
sprintf(query, "consecutive_valid=0");
sprintf(clause, "host_id=%d and app_version_id=%d", r.hostid, gavid);
retval = hav.update_fields_noid(query, clause);
return retval;
}
if (r.app_version_id < 0) {
// anon platform
//
bool do_scale = true;
if (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()<=0) {
do_scale = false;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] anon platform, not scaling, PFC avg zero or too few samples %.0f\n",
r.id, hav.pfc.n
);
}
}
if (do_scale
&& app.host_scale_check
&& hav.consecutive_valid < CONS_VALID_HOST_SCALE
) {
do_scale = false;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] anon platform, not scaling, cons valid %d\n",
r.id, hav.consecutive_valid
);
}
}
if (do_scale) {
double scale = app.min_avg_pfc / hav.pfc.get_avg();
pfc = raw_pfc * scale;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] anon platform, scaling by %g (%.2f/%.2f)\n",
r.id, scale, app.min_avg_pfc, hav.pfc.get_avg()
);
}
} else {
pfc = wu_estimated_pfc(wu, app);
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] not scaling, using app avg %.2f\n",
r.id, pfc*COBBLESTONE_SCALE
);
}
}
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] anon platform, returning %.2f\n",
r.id, pfc*COBBLESTONE_SCALE
);
}
} else {
avp = av_lookup(r.app_version_id, app_versions);
if (!avp) {
log_messages.printf(MSG_CRITICAL,
"get_pfc() [RESULT#%d]: No AVP %d!!\n", r.id, r.app_version_id
);
return ERR_NOT_FOUND;
}
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] [AV#%d] normal case. %.0f sec, %.1f GFLOPS. raw credit: %.2f\n",
r.id, avp->id, r.elapsed_time, r.flops_estimate/1e9,
raw_pfc*COBBLESTONE_SCALE
);
}
bool do_scale = true;
double host_scale = 0;
if (app.host_scale_check
&& hav.consecutive_valid < CONS_VALID_HOST_SCALE
) {
do_scale = false;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] not host scaling - cons valid %d\n",
r.id, hav.consecutive_valid
);
}
}
if (do_scale && (hav.pfc.n < MIN_HOST_SAMPLES || hav.pfc.get_avg()==0)) {
do_scale = false;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] not host scaling - HAV PFC zero or too few samples %.0f\n",
r.id, hav.pfc.n
);
}
}
if (do_scale && avp->pfc.n < MIN_VERSION_SAMPLES) {
do_scale = false;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] not host scaling - app_version PFC too few samples%.0f\n",
r.id, avp->pfc.n
);
}
}
if (do_scale && hav.pfc.get_avg() <= 0) {
do_scale = false;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] not host scaling - HAV PFC is zero\n",
r.id
);
}
}
if (do_scale) {
host_scale = avp->pfc.get_avg() / hav.pfc.get_avg();
if (host_scale > 10) host_scale = 10;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] host scale: %.2f (%f/%f)\n",
r.id, host_scale, avp->pfc.get_avg(), hav.pfc.get_avg()
);
}
}
pfc = raw_pfc;
if (avp->pfc_scale) {
pfc *= avp->pfc_scale;
if (host_scale) {
pfc *= host_scale;
mode = PFC_MODE_NORMAL;
}
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] applying app version scale %.3f\n",
r.id, avp->pfc_scale
);
}
} else {
if (host_scale) {
pfc *= host_scale;
}
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] no app version scale\n",
r.id
);
}
}
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] [AV#%d] PFC avgs with %g (%g/%g)\n",
r.id, avp->id,
raw_pfc/wu.rsc_fpops_est,
raw_pfc, wu.rsc_fpops_est
);
}
double x = raw_pfc / wu.rsc_fpops_est;
if (is_pfc_sane(x, wu, app)) {
avp->pfc_samples.push_back(x);
}
}
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [RESULT#%d] updating HAV PFC %.2f et %g turnaround %d\n",
r.id, raw_pfc / wu.rsc_fpops_est,
r.elapsed_time / wu.rsc_fpops_est,
(r.received_time - r.sent_time)
);
}
double x = raw_pfc / wu.rsc_fpops_est;
if (is_pfc_sane(x, wu, app)) {
hav.pfc.update(x, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT);
}
hav.et.update_var(
r.elapsed_time / wu.rsc_fpops_est,
HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
);
hav.turnaround.update_var(
(r.received_time - r.sent_time),
HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
);
// keep track of credit per app version
//
if (avp) {
avp->credit_samples.push_back(pfc*COBBLESTONE_SCALE);
avp->credit_times.push_back(r.sent_time);
}
return 0;
}
// compute the average of some numbers,
// where each value is weighted by the sum of the other values.
// (reduces the weight of large outliers)
//
double low_average(vector& v) {
int i;
int n = v.size();
if (n == 1) {
return v[0];
}
double sum=0;
for (i=0; i& v) {
double x = v[0];
for (unsigned int i=1; i& results,
DB_APP& app,
vector& app_versions,
vector& host_app_versions,
double max_granted_credit, double& credit
) {
unsigned int i;
int mode, retval;
double pfc;
vector normal;
vector approx;
for (i=0; i wu.rsc_fpops_bound) {
log_messages.printf(MSG_NORMAL,
"[credit] PFC too high: %f\n", pfc*COBBLESTONE_SCALE
);
pfc = wu_estimated_pfc(wu, app);
}
// max_granted_credit trumps rsc_fpops_bound;
// the latter may be set absurdly high
//
if (max_granted_credit && pfc*COBBLESTONE_SCALE > max_granted_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] Credit too high: %f\n", pfc*COBBLESTONE_SCALE
);
pfc = max_granted_credit/COBBLESTONE_SCALE;
}
if (mode == PFC_MODE_NORMAL) {
normal.push_back(pfc);
} else {
approx.push_back(pfc);
}
}
// averaging policy: if there is least one normal result,
// use the "low average" of normal results.
// Otherwise use the min of all results
//
double x;
if (normal.size()) {
x = low_average(normal);
} else if (approx.size()) {
x = vec_min(approx);
} else {
x = 0;
}
x *= COBBLESTONE_SCALE;
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [WU#%d] assign_credit_set: credit %g\n",
wu.id, x
);
}
credit = x;
return 0;
}
// A job has:
// - errored out (scheduler)
// - timed out (transitioner)
// - failed validation (validator).
// Put (host/app_version) on "host scale probation",
// so that we won't use host scaling for a while.
//
void got_error(DB_HOST_APP_VERSION& hav) {
if (config.debug_credit) {
log_messages.printf(MSG_NORMAL,
"[credit] [HAV#%d] got error, setting error rate to %f\n",
hav.app_version_id, ERROR_RATE_INIT
);
}
}
// carefully write any app_version records that have changed;
// done at the end of every validator scan.
//
int write_modified_app_versions(vector& app_versions) {
unsigned int i, j;
int retval = 0;
double now = dtime();
for (i=0; i