2009-03-19 16:35:35 +00:00
|
|
|
// This file is part of BOINC.
|
|
|
|
// http://boinc.berkeley.edu
|
|
|
|
// Copyright (C) 2008 University of California
|
|
|
|
//
|
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
|
|
|
//
|
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
|
|
|
//
|
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
|
|
|
|
#include "boinc_db.h"
|
|
|
|
|
2009-08-10 04:49:02 +00:00
|
|
|
#include "sched_main.h"
|
2009-03-19 16:35:35 +00:00
|
|
|
#include "sched_msgs.h"
|
|
|
|
#include "sched_config.h"
|
2009-07-29 18:55:50 +00:00
|
|
|
#include "sched_customize.h"
|
2009-08-10 04:49:02 +00:00
|
|
|
#include "sched_types.h"
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
#include "sched_util.h"
|
2010-04-08 23:14:47 +00:00
|
|
|
#include "credit.h"
|
2009-03-19 16:35:35 +00:00
|
|
|
|
|
|
|
#include "sched_version.h"
|
|
|
|
|
|
|
|
inline void dont_need_message(
|
|
|
|
const char* p, APP_VERSION* avp, CLIENT_APP_VERSION* cavp
|
|
|
|
) {
|
|
|
|
if (!config.debug_version_select) return;
|
|
|
|
if (avp) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
2010-06-03 20:26:02 +00:00
|
|
|
"[version] [AV#%d] Don't need %s jobs, skipping\n",
|
|
|
|
avp->id, p
|
2009-03-19 16:35:35 +00:00
|
|
|
);
|
|
|
|
} else if (cavp) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] Don't need %s jobs, skipping anonymous version %d for %s (%s)\n",
|
|
|
|
p, cavp->version_num, cavp->app_name, cavp->plan_class
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// for new-style requests, check that the app version uses a
|
|
|
|
// resource for which we need work
|
|
|
|
//
|
|
|
|
bool need_this_resource(
|
|
|
|
HOST_USAGE& host_usage, APP_VERSION* avp, CLIENT_APP_VERSION* cavp
|
|
|
|
) {
|
2012-03-26 19:26:12 +00:00
|
|
|
if (!g_wreq->rsc_spec_request) {
|
|
|
|
return true;
|
|
|
|
}
|
2012-06-22 07:35:54 +00:00
|
|
|
int pt = host_usage.proc_type;
|
|
|
|
if (!g_wreq->need_proc_type(pt)) {
|
|
|
|
dont_need_message(proc_type_name(pt), avp, cavp);
|
|
|
|
return false;
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// Linear search of g_wreq->host_app_versions for the record whose
// app_version_id equals gavid.
// Returns a pointer to the matching element (not a copy),
// or NULL if there is none.
//
static DB_HOST_APP_VERSION* lookup_host_app_version(int gavid) {
    unsigned int idx = 0;
    while (idx < g_wreq->host_app_versions.size()) {
        DB_HOST_APP_VERSION* candidate = &g_wreq->host_app_versions[idx];
        if (candidate->app_version_id == gavid) {
            return candidate;
        }
        idx++;
    }
    return NULL;
}
|
|
|
|
|
|
|
|
static inline bool app_version_is_trusted(int gavid) {
|
|
|
|
DB_HOST_APP_VERSION* havp = lookup_host_app_version(gavid);
|
|
|
|
if (!havp) return false;
|
|
|
|
return havp->trusted;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline bool app_version_is_reliable(int gavid) {
|
|
|
|
DB_HOST_APP_VERSION* havp = lookup_host_app_version(gavid);
|
|
|
|
if (!havp) return false;
|
|
|
|
return havp->reliable;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Build the ID under which an anonymous-platform version of "app"
// is looked up in the host_app_version list:
// (app ID * 1000000) minus the resource type of the host usage.
// NOTE(review): "gavid" presumably stands for "generalized app version ID";
// this should agree with generalized_app_version_id(), defined elsewhere.
//
inline int host_usage_to_gavid(HOST_USAGE& hu, APP& app) {
    const int app_id_stride = 1000000;
    return (app.id * app_id_stride) - hu.resource_type();
}
|
|
|
|
|
2010-06-15 22:21:57 +00:00
|
|
|
// scale daily quota by # processors and/or by config.gpu_multiplier
|
|
|
|
//
|
|
|
|
inline int scaled_max_jobs_per_day(DB_HOST_APP_VERSION& hav, HOST_USAGE& hu) {
|
|
|
|
int n = hav.max_jobs_per_day;
|
2012-06-25 23:09:45 +00:00
|
|
|
if (hu.proc_type == PROC_TYPE_CPU) {
|
|
|
|
if (g_reply->host.p_ncpus) {
|
|
|
|
n *= g_reply->host.p_ncpus;
|
2010-06-15 22:21:57 +00:00
|
|
|
}
|
2012-06-25 23:09:45 +00:00
|
|
|
} else {
|
2014-07-25 19:40:35 +00:00
|
|
|
COPROC* cp = g_request->coprocs.proc_type_to_coproc(hu.proc_type);
|
2012-06-25 23:09:45 +00:00
|
|
|
if (cp->count) {
|
|
|
|
n *= cp->count;
|
2010-06-15 22:21:57 +00:00
|
|
|
}
|
|
|
|
if (config.gpu_multiplier) {
|
|
|
|
n *= config.gpu_multiplier;
|
|
|
|
}
|
|
|
|
}
|
2010-06-25 18:54:37 +00:00
|
|
|
if (config.debug_quota) {
|
2010-06-29 03:20:19 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2010-07-06 23:31:26 +00:00
|
|
|
"[quota] [AV#%d] scaled max jobs per day: %d\n",
|
|
|
|
hav.app_version_id,
|
|
|
|
n
|
2010-06-25 18:54:37 +00:00
|
|
|
);
|
|
|
|
}
|
2010-06-15 22:21:57 +00:00
|
|
|
return n;
|
|
|
|
}
|
|
|
|
|
|
|
|
inline bool daily_quota_exceeded(int gavid, HOST_USAGE& hu) {
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
DB_HOST_APP_VERSION* havp = lookup_host_app_version(gavid);
|
|
|
|
if (!havp) return false;
|
2010-06-25 18:54:37 +00:00
|
|
|
int q = scaled_max_jobs_per_day(*havp, hu);
|
|
|
|
if (havp->n_jobs_today >= q) {
|
|
|
|
if (config.debug_quota) {
|
2010-06-29 03:20:19 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2010-07-06 23:31:26 +00:00
|
|
|
"[quota] [AV#%d] daily quota exceeded: %d >= %d\n",
|
|
|
|
gavid, havp->n_jobs_today, q
|
2010-06-25 18:54:37 +00:00
|
|
|
);
|
|
|
|
}
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
havp->daily_quota_exceeded = true;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2009-03-19 16:35:35 +00:00
|
|
|
// scan through client's anonymous apps and pick the best one
|
|
|
|
//
|
2010-10-01 19:54:09 +00:00
|
|
|
CLIENT_APP_VERSION* get_app_version_anonymous(
|
|
|
|
APP& app, bool need_64b, bool reliable_only
|
|
|
|
) {
|
2009-03-19 16:35:35 +00:00
|
|
|
unsigned int i;
|
|
|
|
CLIENT_APP_VERSION* best = NULL;
|
|
|
|
bool found = false;
|
|
|
|
char message[256];
|
|
|
|
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] get_app_version_anonymous: app %s%s\n",
|
2010-05-20 22:49:00 +00:00
|
|
|
app.name, reliable_only?" (reliable only)":""
|
2010-05-20 17:50:00 +00:00
|
|
|
);
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
for (i=0; i<g_request->client_app_versions.size(); i++) {
|
|
|
|
CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];
|
2010-07-23 17:43:20 +00:00
|
|
|
if (!cav.app) continue;
|
2010-04-11 04:42:52 +00:00
|
|
|
if (cav.app->id != app.id) {
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
continue;
|
|
|
|
}
|
2010-10-01 19:54:09 +00:00
|
|
|
if (need_64b && !is_64b_platform(cav.platform)) {
|
|
|
|
continue;
|
|
|
|
}
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
int gavid = host_usage_to_gavid(cav.host_usage, app);
|
|
|
|
if (reliable_only && !app_version_is_reliable(gavid)) {
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] %d %s not reliable\n",
|
|
|
|
cav.version_num, cav.plan_class
|
|
|
|
);
|
|
|
|
}
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
continue;
|
|
|
|
}
|
2010-06-15 22:21:57 +00:00
|
|
|
if (daily_quota_exceeded(gavid, cav.host_usage)) {
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] %d %s daily quota exceeded\n",
|
|
|
|
cav.version_num, cav.plan_class
|
|
|
|
);
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (cav.version_num < app.min_version) {
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] %d %s version < min version\n",
|
|
|
|
cav.version_num, cav.plan_class
|
|
|
|
);
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
found = true;
|
|
|
|
if (!need_this_resource(cav.host_usage, NULL, &cav)) {
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] %d %s don't need resource\n",
|
|
|
|
cav.version_num, cav.plan_class
|
|
|
|
);
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (best) {
|
2010-04-10 05:49:51 +00:00
|
|
|
if (cav.host_usage.projected_flops > best->host_usage.projected_flops) {
|
2009-03-19 16:35:35 +00:00
|
|
|
best = &cav;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
best = &cav;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!best) {
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
2009-03-19 16:35:35 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2010-05-20 17:50:00 +00:00
|
|
|
"[version] Didn't find anonymous platform app for %s\n",
|
2009-03-19 16:35:35 +00:00
|
|
|
app.name
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!found) {
|
|
|
|
sprintf(message,
|
2010-07-13 02:49:35 +00:00
|
|
|
"%s %s.",
|
|
|
|
_("Your app_info.xml file doesn't have a usable version of"),
|
2009-03-19 16:35:35 +00:00
|
|
|
app.user_friendly_name
|
|
|
|
);
|
2009-08-21 20:38:39 +00:00
|
|
|
add_no_work_message(message);
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
|
|
|
return best;
|
|
|
|
}
|
|
|
|
|
2012-04-27 06:17:59 +00:00
|
|
|
#define ET_RATIO_LIMIT 250.
|
2011-09-15 06:53:01 +00:00
|
|
|
// if the FLOPS estimate based on elapsed time
|
2012-04-05 11:01:38 +00:00
|
|
|
// exceeds projected_flops by more than this factor, cap it.
|
2011-09-15 06:53:01 +00:00
|
|
|
// The host may have received a bunch of short jobs recently
|
|
|
|
|
2012-06-28 06:27:37 +00:00
|
|
|
#define GPU_CPU_RATIO 10.
|
|
|
|
// a conservative estimate of the ratio of a typical GPU to CPU
|
|
|
|
|
2010-07-15 21:38:24 +00:00
|
|
|
// input:
|
|
|
|
// cav.host_usage.projected_flops
|
|
|
|
// This is the <flops> specified in app_info.xml
|
2011-12-16 19:45:31 +00:00
|
|
|
// If not specified there, it's a conservative estimate
|
2012-06-28 06:27:37 +00:00
|
|
|
// (CPU speed * (ncpus + 10*ngpus))
|
2010-07-15 21:38:24 +00:00
|
|
|
// In either case, this value will be used by the client
|
|
|
|
// to estimate job runtime and runtime limit
|
|
|
|
// est runtime = wu.rsc_fpops_est/x
|
|
|
|
// runtime limit = wu.rsc_fpops_bound/x
|
|
|
|
// x may be way off from the actual speed.
|
2011-12-16 19:45:31 +00:00
|
|
|
// So to get accurate runtime est, we need to adjust wu.rsc_fpops_est
|
2010-07-15 21:38:24 +00:00
|
|
|
//
|
|
|
|
// output:
|
|
|
|
// cav.host_usage.projected_flops
|
|
|
|
// An estimate of the actual FLOPS the app will get,
|
|
|
|
// based on elapsed time history (if possible).
|
|
|
|
// This is used by the scheduler to estimate runtime.
|
|
|
|
// cav.rsc_fpops_scale
|
|
|
|
// wu.rsc_fpops_est and wu.rsc_fpops_bound will be scaled by this
|
|
|
|
//
|
2010-04-08 23:14:47 +00:00
|
|
|
// called at start of send_work().
|
|
|
|
//
|
|
|
|
void estimate_flops_anon_platform() {
|
|
|
|
unsigned int i;
|
|
|
|
for (i=0; i<g_request->client_app_versions.size(); i++) {
|
|
|
|
CLIENT_APP_VERSION& cav = g_request->client_app_versions[i];
|
2010-07-23 17:43:20 +00:00
|
|
|
if (!cav.app) continue;
|
2010-04-08 23:14:47 +00:00
|
|
|
|
|
|
|
cav.rsc_fpops_scale = 1;
|
|
|
|
|
2012-06-25 23:09:45 +00:00
|
|
|
if (cav.host_usage.avg_ncpus == 0
|
|
|
|
&& cav.host_usage.proc_type == PROC_TYPE_CPU
|
|
|
|
) {
|
2010-04-08 23:14:47 +00:00
|
|
|
cav.host_usage.avg_ncpus = 1;
|
|
|
|
}
|
|
|
|
|
2010-07-15 21:38:24 +00:00
|
|
|
// if projected_flops is missing, make a wild guess
|
2011-10-16 06:04:13 +00:00
|
|
|
// Note: 6.12+ clients supply a project FLOPS,
|
|
|
|
// even if the user didn't
|
2010-04-08 23:14:47 +00:00
|
|
|
//
|
2010-04-10 05:49:51 +00:00
|
|
|
if (cav.host_usage.projected_flops == 0) {
|
|
|
|
cav.host_usage.projected_flops = g_reply->host.p_fpops;
|
2010-04-08 23:14:47 +00:00
|
|
|
}
|
|
|
|
|
2011-10-16 06:04:13 +00:00
|
|
|
// If data is available, estimate FLOPS based on average elapsed time
|
|
|
|
//
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
DB_HOST_APP_VERSION* havp = gavid_to_havp(
|
2010-04-08 23:14:47 +00:00
|
|
|
generalized_app_version_id(
|
|
|
|
cav.host_usage.resource_type(), cav.app->id
|
|
|
|
)
|
|
|
|
);
|
2010-07-15 21:38:24 +00:00
|
|
|
if (havp
|
|
|
|
&& (havp->et.n > MIN_HOST_SAMPLES)
|
|
|
|
&& (havp->et.get_avg() > 0)
|
|
|
|
) {
|
2010-04-08 23:14:47 +00:00
|
|
|
double new_flops = 1./havp->et.get_avg();
|
2011-09-05 17:29:53 +00:00
|
|
|
|
2011-09-15 06:53:01 +00:00
|
|
|
// cap this at ET_RATIO_LIMIT*projected,
|
2011-09-05 17:29:53 +00:00
|
|
|
// in case we've had a bunch of short jobs recently
|
|
|
|
//
|
2011-09-15 06:53:01 +00:00
|
|
|
if (new_flops > ET_RATIO_LIMIT*cav.host_usage.projected_flops) {
|
2011-09-05 17:29:53 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
2011-09-15 06:53:01 +00:00
|
|
|
"[version] (%s) capping new_flops; %.1fG > %.0f*%.1fG\n",
|
|
|
|
cav.plan_class, new_flops/1e9,
|
|
|
|
ET_RATIO_LIMIT,
|
|
|
|
cav.host_usage.projected_flops/1e9
|
2011-09-05 17:29:53 +00:00
|
|
|
);
|
|
|
|
}
|
2011-09-15 06:53:01 +00:00
|
|
|
new_flops = ET_RATIO_LIMIT*cav.host_usage.projected_flops;
|
2011-09-05 17:29:53 +00:00
|
|
|
}
|
2010-04-10 05:49:51 +00:00
|
|
|
cav.rsc_fpops_scale = cav.host_usage.projected_flops/new_flops;
|
|
|
|
cav.host_usage.projected_flops = new_flops;
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
2010-04-10 05:49:51 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2010-05-20 17:50:00 +00:00
|
|
|
"[version] (%s) setting projected flops to %fG based on ET\n",
|
2010-04-16 18:07:08 +00:00
|
|
|
cav.plan_class, new_flops/1e9
|
2010-04-10 05:49:51 +00:00
|
|
|
);
|
2010-06-24 22:58:05 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] setting rsc_fpops_scale to %g\n",
|
|
|
|
cav.rsc_fpops_scale
|
|
|
|
);
|
2010-04-10 05:49:51 +00:00
|
|
|
}
|
|
|
|
} else {
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
2010-04-10 05:49:51 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2010-05-20 17:50:00 +00:00
|
|
|
"[version] (%s) using client-supplied flops %fG\n",
|
2010-06-24 22:58:05 +00:00
|
|
|
cav.plan_class, cav.host_usage.projected_flops/1e9
|
2010-04-10 05:49:51 +00:00
|
|
|
);
|
|
|
|
}
|
2010-04-08 23:14:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-07-14 21:13:33 +00:00
|
|
|
// compute HOST_USAGE::projected_flops, which is used to estimate job runtime:
|
|
|
|
// est. runtime = wu.rsc_fpops_est / projected_flops
|
|
|
|
// so project_flops must reflect systematic errors in rsc_fpops_est
|
2013-04-30 23:30:27 +00:00
|
|
|
//
|
|
|
|
// 1) if we have statistics for (host, app version) and
|
|
|
|
// <estimate_flops_from_hav_pfc> is not set use elapsed time,
|
|
|
|
// otherwise use pfc_avg.
|
2011-12-16 19:45:31 +00:00
|
|
|
// 2) if we have statistics for app version elapsed time, use those.
|
2014-06-10 21:04:40 +00:00
|
|
|
// 3) else use a conservative estimate (p_fpops*(cpu usage + gpu usage))
|
2011-12-16 19:45:31 +00:00
|
|
|
// This prevents jobs from aborting with "time limit exceeded"
|
|
|
|
// even if the estimate supplied by the plan class function is way off
|
2010-04-08 23:14:47 +00:00
|
|
|
//
|
|
|
|
void estimate_flops(HOST_USAGE& hu, APP_VERSION& av) {
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
DB_HOST_APP_VERSION* havp = gavid_to_havp(av.id);
|
2010-04-08 23:14:47 +00:00
|
|
|
if (havp && havp->et.n > MIN_HOST_SAMPLES) {
|
2013-04-30 23:30:27 +00:00
|
|
|
double new_flops;
|
|
|
|
if (config.estimate_flops_from_hav_pfc) {
|
|
|
|
new_flops = hu.peak_flops / (havp->pfc.get_avg()+1e-18);
|
|
|
|
} else {
|
|
|
|
new_flops = 1./havp->et.get_avg();
|
|
|
|
}
|
2011-09-15 06:53:01 +00:00
|
|
|
// cap this at ET_RATIO_LIMIT*projected,
|
|
|
|
// in case we've had a bunch of short jobs recently
|
|
|
|
//
|
|
|
|
if (new_flops > ET_RATIO_LIMIT*hu.projected_flops) {
|
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] (%s) capping new_flops; %.1fG > %.0f*%.1fG\n",
|
|
|
|
av.plan_class, new_flops/1e9,
|
|
|
|
ET_RATIO_LIMIT,
|
|
|
|
hu.projected_flops/1e9
|
|
|
|
);
|
|
|
|
}
|
|
|
|
new_flops = ET_RATIO_LIMIT*hu.projected_flops;
|
|
|
|
}
|
2010-04-10 05:49:51 +00:00
|
|
|
hu.projected_flops = new_flops;
|
2011-09-15 06:53:01 +00:00
|
|
|
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
2013-04-30 23:30:27 +00:00
|
|
|
if (config.estimate_flops_from_hav_pfc) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] [AV#%d] (%s) setting projected flops based on host_app_version pfc: %.2fG\n",
|
|
|
|
av.id, av.plan_class, hu.projected_flops/1e9
|
|
|
|
);
|
|
|
|
} else {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] [AV#%d] (%s) setting projected flops based on host elapsed time avg: %.2fG\n",
|
|
|
|
av.id, av.plan_class, hu.projected_flops/1e9
|
|
|
|
);
|
|
|
|
}
|
2010-04-10 05:49:51 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2013-04-30 23:30:27 +00:00
|
|
|
"[version] [AV#%d] (%s) comparison pfc: %.2fG et: %.2fG\n",
|
|
|
|
av.id, av.plan_class, hu.peak_flops/(havp->pfc.get_avg()+1e-18)/1e+9,
|
|
|
|
1e-9/havp->et.get_avg()
|
2010-04-10 05:49:51 +00:00
|
|
|
);
|
|
|
|
}
|
2010-04-08 23:14:47 +00:00
|
|
|
} else {
|
2011-06-03 19:53:52 +00:00
|
|
|
if (av.pfc.n > MIN_VERSION_SAMPLES) {
|
|
|
|
hu.projected_flops = hu.peak_flops/av.pfc.get_avg();
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
2010-04-10 05:49:51 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2011-06-03 19:53:52 +00:00
|
|
|
"[version] [AV#%d] (%s) adjusting projected flops based on PFC avg: %.2fG\n",
|
2010-04-16 18:07:08 +00:00
|
|
|
av.id, av.plan_class, hu.projected_flops/1e9
|
2010-04-10 05:49:51 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
} else {
|
2012-06-28 06:27:37 +00:00
|
|
|
hu.projected_flops = g_reply->host.p_fpops * (hu.avg_ncpus + GPU_CPU_RATIO*hu.gpu_usage);
|
2010-05-20 17:50:00 +00:00
|
|
|
if (config.debug_version_select) {
|
2010-04-10 05:49:51 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2011-12-16 19:45:31 +00:00
|
|
|
"[version] [AV#%d] (%s) using conservative projected flops: %.2fG\n",
|
2010-04-16 18:07:08 +00:00
|
|
|
av.id, av.plan_class, hu.projected_flops/1e9
|
2010-04-10 05:49:51 +00:00
|
|
|
);
|
|
|
|
}
|
2010-04-08 23:14:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-06-04 16:57:33 +00:00
|
|
|
// return a string describing an app version
|
|
|
|
//
|
|
|
|
static void app_version_desc(BEST_APP_VERSION& bav, char* buf) {
|
|
|
|
if (!bav.present) {
|
2013-06-04 05:42:53 +00:00
|
|
|
safe_strcpy(buf, "none");
|
2010-06-04 16:57:33 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
if (bav.cavp) {
|
2012-06-25 23:09:45 +00:00
|
|
|
sprintf(buf, "anonymous platform (%s)", proc_type_name(bav.host_usage.proc_type));
|
2010-06-04 16:57:33 +00:00
|
|
|
} else {
|
|
|
|
sprintf(buf, "[AV#%d]", bav.avp->id);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-10-27 22:58:16 +00:00
|
|
|
// different OSs have different max user address space for 32 bit apps
|
|
|
|
//
|
|
|
|
static double max_32b_address_space() {
|
|
|
|
if (strstr(g_request->platform.name, "windows")) {
|
|
|
|
return 2*GIGA;
|
|
|
|
} else if (strstr(g_request->platform.name, "linux")) {
|
|
|
|
return 3*GIGA;
|
|
|
|
} else if (strstr(g_request->platform.name, "darwin")) {
|
|
|
|
return 4*GIGA;
|
|
|
|
} else if (strstr(g_request->platform.name, "solaris")) {
|
|
|
|
return 4*GIGA;
|
|
|
|
} else if (strstr(g_request->platform.name, "anonymous")) {
|
|
|
|
// problem case. assume windows
|
|
|
|
return 2*GIGA;
|
|
|
|
}
|
|
|
|
return 2*GIGA;
|
|
|
|
}
|
|
|
|
|
2011-06-06 03:40:42 +00:00
|
|
|
// The WU is already committed to an app version.
|
|
|
|
// - check if this host supports that platform
|
|
|
|
// - if plan class, check if this host can handle it
|
|
|
|
// - check if we need work for the resource
|
|
|
|
//
|
2012-09-06 19:41:49 +00:00
|
|
|
// If all these are satisfied, return a pointer to a BEST_APP_VERSION struct
|
|
|
|
// with HOST_USAGE filled in correctly.
|
2012-09-06 22:46:02 +00:00
|
|
|
// Else return NULL.
|
2012-09-06 19:41:49 +00:00
|
|
|
//
|
2011-06-06 03:40:42 +00:00
|
|
|
static BEST_APP_VERSION* check_homogeneous_app_version(
|
2012-05-23 00:25:19 +00:00
|
|
|
WORKUNIT& wu, bool /* reliable_only */
|
2011-11-03 05:26:19 +00:00
|
|
|
// TODO: enforce reliable_only
|
2011-06-06 03:40:42 +00:00
|
|
|
) {
|
2012-09-06 19:25:05 +00:00
|
|
|
BEST_APP_VERSION bav;
|
2011-06-06 03:40:42 +00:00
|
|
|
|
2011-12-17 22:11:26 +00:00
|
|
|
bool found;
|
2011-06-06 03:40:42 +00:00
|
|
|
APP_VERSION *avp = ssp->lookup_app_version(wu.app_version_id);
|
2011-12-12 22:57:58 +00:00
|
|
|
if (!avp) {
|
2011-12-17 22:11:26 +00:00
|
|
|
// If the app version is not in shmem,
|
|
|
|
// it's been superceded or deprecated.
|
|
|
|
// Use it anyway.
|
|
|
|
// Keep an array of such app versions in
|
|
|
|
// SCHEDULER_REPLY::old_app_versions
|
2011-12-12 22:57:58 +00:00
|
|
|
//
|
2011-12-17 22:11:26 +00:00
|
|
|
found = false;
|
|
|
|
for (unsigned int i=0; i<g_reply->old_app_versions.size(); i++) {
|
|
|
|
APP_VERSION& av = g_reply->old_app_versions[i];
|
|
|
|
if (av.id == wu.app_version_id) {
|
|
|
|
avp = &av;
|
|
|
|
found = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!found) {
|
|
|
|
DB_APP_VERSION av;
|
|
|
|
int retval = av.lookup_id(wu.app_version_id);
|
|
|
|
if (retval) return NULL;
|
|
|
|
g_reply->old_app_versions.push_back(av);
|
|
|
|
avp = &(g_reply->old_app_versions.back());
|
|
|
|
}
|
2011-12-12 22:57:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// see if this host supports the version's platform
|
|
|
|
//
|
2011-12-17 22:11:26 +00:00
|
|
|
found = false;
|
2011-06-06 03:40:42 +00:00
|
|
|
for (unsigned int i=0; i<g_request->platforms.list.size(); i++) {
|
|
|
|
PLATFORM* p = g_request->platforms.list[i];
|
|
|
|
if (p->id == avp->platformid) {
|
|
|
|
found = true;
|
2011-10-08 08:16:24 +00:00
|
|
|
bav.avp = avp;
|
2011-06-06 03:40:42 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (!found) return NULL;
|
2011-12-12 22:57:58 +00:00
|
|
|
|
|
|
|
// and see if it supports the plan class
|
|
|
|
//
|
2011-06-06 03:40:42 +00:00
|
|
|
if (strlen(avp->plan_class)) {
|
|
|
|
if (!app_plan(*g_request, avp->plan_class, bav.host_usage)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
} else {
|
2012-01-09 17:35:48 +00:00
|
|
|
bav.host_usage.sequential_app(capped_host_fpops());
|
2011-06-06 03:40:42 +00:00
|
|
|
}
|
2011-12-12 22:57:58 +00:00
|
|
|
|
|
|
|
// and see if the client is asking for this resource
|
|
|
|
//
|
2011-06-06 03:40:42 +00:00
|
|
|
if (!need_this_resource(bav.host_usage, avp, NULL)) {
|
|
|
|
return NULL;
|
|
|
|
}
|
2012-09-06 22:46:02 +00:00
|
|
|
|
|
|
|
// dynamically allocate the BEST_APP_VERSION.
|
|
|
|
// This is a memory leak, but that's OK
|
|
|
|
//
|
|
|
|
BEST_APP_VERSION* bavp = new BEST_APP_VERSION;
|
|
|
|
*bavp = bav;
|
|
|
|
return bavp;
|
2011-06-06 03:40:42 +00:00
|
|
|
}
|
|
|
|
|
2012-02-25 03:54:23 +00:00
|
|
|
// return the app version with greatest projected FLOPS
|
|
|
|
// for the given job and host, or NULL if none is available
|
2009-03-19 16:35:35 +00:00
|
|
|
//
|
2012-09-06 22:46:02 +00:00
|
|
|
// NOTE: the BEST_APP_VERSION structure returned by this
|
|
|
|
// must not be modified or reused;
|
|
|
|
// a pointer to it is stored in APP_VERSION.
|
2012-09-06 19:25:05 +00:00
|
|
|
//
|
2012-02-25 03:54:23 +00:00
|
|
|
// check_req: if set, return only app versions that use resources
|
|
|
|
// for which the work request is nonzero.
|
|
|
|
// This check is not done for:
|
2010-06-01 23:41:07 +00:00
|
|
|
// - assigned jobs
|
|
|
|
// - resent jobs
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// reliable_only: use only versions for which this host is "reliable"
|
2009-03-19 16:35:35 +00:00
|
|
|
//
|
2010-10-01 19:54:09 +00:00
|
|
|
// We "memoize" the results, maintaining an array g_wreq->best_app_versions
|
|
|
|
// that maps app ID to the best app version (or NULL).
|
|
|
|
//
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
BEST_APP_VERSION* get_app_version(
|
|
|
|
WORKUNIT& wu, bool check_req, bool reliable_only
|
|
|
|
) {
|
2009-03-19 16:35:35 +00:00
|
|
|
unsigned int i;
|
2010-06-25 18:54:37 +00:00
|
|
|
int j;
|
2010-05-12 21:07:39 +00:00
|
|
|
BEST_APP_VERSION* bavp;
|
2011-08-27 02:54:39 +00:00
|
|
|
char buf[256];
|
2010-10-27 22:58:16 +00:00
|
|
|
bool job_needs_64b = (wu.rsc_memory_bound > max_32b_address_space());
|
2010-10-01 19:54:09 +00:00
|
|
|
|
|
|
|
if (config.debug_version_select) {
|
2013-02-19 08:48:59 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] get_app_version(): getting app version for WU#%d (%s) appid:%d\n",
|
|
|
|
wu.id, wu.name, wu.appid
|
|
|
|
);
|
2010-10-01 19:54:09 +00:00
|
|
|
if (job_needs_64b) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] job needs 64-bit app version: mem bnd %f\n",
|
|
|
|
wu.rsc_memory_bound
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
|
2010-06-01 23:41:07 +00:00
|
|
|
APP* app = ssp->lookup_app(wu.appid);
|
|
|
|
if (!app) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
|
|
|
"WU refers to nonexistent app: %d\n", wu.appid
|
|
|
|
);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2013-04-18 19:43:17 +00:00
|
|
|
// if the app uses homogeneous app version,
|
2012-11-26 03:12:08 +00:00
|
|
|
// don't send to anonymous platform client.
|
|
|
|
// Then check if the WU is already committed to an app version
|
2011-06-06 03:40:42 +00:00
|
|
|
//
|
2012-11-26 03:12:08 +00:00
|
|
|
if (app->homogeneous_app_version) {
|
|
|
|
if (g_wreq->anonymous_platform) {
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
if ( wu.app_version_id) {
|
|
|
|
return check_homogeneous_app_version(wu, reliable_only);
|
|
|
|
}
|
2011-06-06 03:40:42 +00:00
|
|
|
}
|
|
|
|
|
2009-03-19 16:35:35 +00:00
|
|
|
// see if app is already in memoized array
|
|
|
|
//
|
2010-05-12 21:07:39 +00:00
|
|
|
std::vector<BEST_APP_VERSION*>::iterator bavi;
|
2009-03-19 16:35:35 +00:00
|
|
|
bavi = g_wreq->best_app_versions.begin();
|
|
|
|
while (bavi != g_wreq->best_app_versions.end()) {
|
2010-05-12 21:07:39 +00:00
|
|
|
bavp = *bavi;
|
2010-10-01 19:54:09 +00:00
|
|
|
if (bavp->appid == wu.appid && (job_needs_64b == bavp->for_64b_jobs)) {
|
2010-05-12 21:07:39 +00:00
|
|
|
if (!bavp->present) {
|
2010-05-20 17:50:00 +00:00
|
|
|
#if 0
|
2010-04-11 04:42:52 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] returning cached NULL\n"
|
|
|
|
);
|
|
|
|
}
|
2010-05-20 17:50:00 +00:00
|
|
|
#endif
|
2010-04-11 04:42:52 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
|
2010-06-01 23:41:07 +00:00
|
|
|
// if we're at the jobs-in-progress limit for this
|
|
|
|
// app and resource type, fall through and find another version
|
|
|
|
//
|
2011-05-17 19:11:44 +00:00
|
|
|
if (config.max_jobs_in_progress.exceeded(
|
2014-03-08 19:17:16 +00:00
|
|
|
app, bavp->host_usage.proc_type
|
|
|
|
)) {
|
2010-06-04 16:57:33 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
app_version_desc(*bavp, buf);
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] %s: max jobs in progress exceeded\n", buf
|
|
|
|
);
|
|
|
|
}
|
2010-06-01 23:41:07 +00:00
|
|
|
g_wreq->best_app_versions.erase(bavi);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2012-06-25 23:09:45 +00:00
|
|
|
// if we previously chose an app version but don't need more work
|
|
|
|
// for that processor type, fall through and find another version
|
2009-03-19 16:35:35 +00:00
|
|
|
//
|
2012-06-25 23:09:45 +00:00
|
|
|
if (check_req && g_wreq->rsc_spec_request) {
|
|
|
|
int pt = bavp->host_usage.proc_type;
|
|
|
|
if (!g_wreq->need_proc_type(pt)) {
|
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] have %s version but no more %s work needed\n",
|
|
|
|
proc_type_name(pt),
|
|
|
|
proc_type_name(pt)
|
|
|
|
);
|
|
|
|
}
|
|
|
|
g_wreq->best_app_versions.erase(bavi);
|
|
|
|
break;
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-04-11 04:42:52 +00:00
|
|
|
if (config.debug_version_select) {
|
2010-06-04 16:57:33 +00:00
|
|
|
app_version_desc(*bavp, buf);
|
2010-04-11 04:42:52 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2010-06-04 16:57:33 +00:00
|
|
|
"[version] returning cached version: %s\n", buf
|
2010-04-11 04:42:52 +00:00
|
|
|
);
|
|
|
|
}
|
2010-05-12 21:07:39 +00:00
|
|
|
return bavp;
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
2015-01-14 20:18:29 +00:00
|
|
|
++bavi;
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
|
|
|
|
2011-05-17 19:11:44 +00:00
|
|
|
// here if app was not in memoized array,
|
|
|
|
// or we couldn't use the app version there.
|
|
|
|
|
2010-04-16 18:07:08 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] looking for version of %s\n",
|
|
|
|
app->name
|
|
|
|
);
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
|
2010-05-12 21:07:39 +00:00
|
|
|
bavp = new BEST_APP_VERSION;
|
|
|
|
bavp->appid = wu.appid;
|
2010-10-01 19:54:09 +00:00
|
|
|
bavp->for_64b_jobs = job_needs_64b;
|
2009-03-19 16:35:35 +00:00
|
|
|
if (g_wreq->anonymous_platform) {
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
CLIENT_APP_VERSION* cavp = get_app_version_anonymous(
|
2010-10-01 19:54:09 +00:00
|
|
|
*app, job_needs_64b, reliable_only
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
);
|
2009-03-19 16:35:35 +00:00
|
|
|
if (!cavp) {
|
2010-05-12 21:07:39 +00:00
|
|
|
bavp->present = false;
|
2009-03-19 16:35:35 +00:00
|
|
|
} else {
|
2010-05-12 21:07:39 +00:00
|
|
|
bavp->present = true;
|
|
|
|
bavp->host_usage = cavp->host_usage;
|
|
|
|
bavp->cavp = cavp;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
int gavid = host_usage_to_gavid(cavp->host_usage, *app);
|
2010-05-12 21:07:39 +00:00
|
|
|
bavp->reliable = app_version_is_reliable(gavid);
|
|
|
|
bavp->trusted = app_version_is_trusted(gavid);
|
2010-06-04 16:57:33 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
app_version_desc(*bavp, buf);
|
|
|
|
log_messages.printf(MSG_NORMAL, "[version] using %s\n", buf);
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
2010-05-12 21:07:39 +00:00
|
|
|
g_wreq->best_app_versions.push_back(bavp);
|
|
|
|
if (!bavp->present) return NULL;
|
|
|
|
return bavp;
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
|
|
|
|
2010-08-22 19:13:25 +00:00
|
|
|
// Go through the client's platforms,
|
|
|
|
// and scan the app versions for each platform.
|
|
|
|
// Pick the one with highest expected FLOPS
|
2009-03-19 16:35:35 +00:00
|
|
|
//
|
2010-08-22 19:13:25 +00:00
|
|
|
// if config.prefer_primary_platform is set:
|
|
|
|
// stop scanning platforms once we find a feasible version
|
|
|
|
|
2010-05-12 21:07:39 +00:00
|
|
|
bavp->host_usage.projected_flops = 0;
|
|
|
|
bavp->avp = NULL;
|
2009-03-19 16:35:35 +00:00
|
|
|
for (i=0; i<g_request->platforms.list.size(); i++) {
|
2010-08-22 19:13:25 +00:00
|
|
|
bool found_feasible_version = false;
|
2009-03-19 16:35:35 +00:00
|
|
|
PLATFORM* p = g_request->platforms.list[i];
|
2010-10-01 19:54:09 +00:00
|
|
|
if (job_needs_64b && !is_64b_platform(p->name)) {
|
|
|
|
continue;
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
for (j=0; j<ssp->napp_versions; j++) {
|
|
|
|
HOST_USAGE host_usage;
|
|
|
|
APP_VERSION& av = ssp->app_versions[j];
|
|
|
|
if (av.appid != wu.appid) continue;
|
|
|
|
if (av.platformid != p->id) continue;
|
2014-06-05 16:51:01 +00:00
|
|
|
if (av.beta) {
|
|
|
|
if (!g_wreq->allow_beta_work) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should take precedence over those in config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if there is at least one reliable version,
do a pass for jobs that need a reliable version,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When we get back a successful result from the client:
increase daily quota
When we get back an error result from the client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
|
2009-03-19 16:35:35 +00:00
|
|
|
if (strlen(av.plan_class)) {
|
2009-08-21 20:38:39 +00:00
|
|
|
if (!app_plan(*g_request, av.plan_class, host_usage)) {
|
2011-10-16 06:04:13 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] [AV#%d] app_plan() returned false\n",
|
|
|
|
av.id
|
|
|
|
);
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
continue;
|
|
|
|
}
|
2010-12-13 22:58:15 +00:00
|
|
|
if (!g_request->client_cap_plan_class) {
|
|
|
|
if (!host_usage.is_sequential_app()) {
|
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] [AV#%d] client %d lacks plan class capability\n",
|
|
|
|
av.id, g_request->core_client_version
|
|
|
|
);
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
2009-03-19 16:35:35 +00:00
|
|
|
} else {
|
|
|
|
host_usage.sequential_app(g_reply->host.p_fpops);
|
|
|
|
}
|
|
|
|
|
|
|
|
// skip versions that go against resource prefs
|
|
|
|
//
|
2012-06-25 23:09:45 +00:00
|
|
|
int pt = host_usage.proc_type;
|
|
|
|
if (g_wreq->dont_use_proc_type[pt]) {
|
2009-03-19 16:35:35 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-06-25 23:09:45 +00:00
|
|
|
"[version] [AV#%d] Skipping %s version - user prefs say no %s\n",
|
|
|
|
av.id,
|
|
|
|
proc_type_name(pt),
|
|
|
|
proc_type_name(pt)
|
2009-03-19 16:35:35 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2010-08-22 19:13:25 +00:00
|
|
|
if (reliable_only && !app_version_is_reliable(av.id)) {
|
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] [AV#%d] not reliable\n", av.id
|
|
|
|
);
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (daily_quota_exceeded(av.id, host_usage)) {
|
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] [AV#%d] daily quota exceeded\n", av.id
|
|
|
|
);
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// skip versions for which we're at the jobs-in-progress limit
|
|
|
|
//
|
2014-03-12 22:31:12 +00:00
|
|
|
if (config.max_jobs_in_progress.exceeded(app, host_usage.proc_type)) {
|
2011-05-17 19:11:44 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] [AV#%d] jobs in progress limit exceeded\n",
|
|
|
|
av.id
|
|
|
|
);
|
2011-08-30 21:34:27 +00:00
|
|
|
config.max_jobs_in_progress.print_log();
|
2011-05-17 19:11:44 +00:00
|
|
|
}
|
2010-08-22 19:13:25 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// skip versions for resources we don't need
|
|
|
|
//
|
2012-02-25 03:54:23 +00:00
|
|
|
if (check_req && !need_this_resource(host_usage, &av, NULL)) {
|
2010-08-22 19:13:25 +00:00
|
|
|
continue;
|
|
|
|
}
|
2012-06-06 09:11:41 +00:00
|
|
|
|
|
|
|
// skip versions which require a newer core client
|
|
|
|
//
|
|
|
|
if (g_request->core_client_version < av.min_core_version) {
|
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] [AV#%d] client version %d < min core version %d\n",
|
|
|
|
av.id, g_request->core_client_version, av.min_core_version
|
|
|
|
);
|
|
|
|
}
|
|
|
|
// Do not tell the user he needs to update the client
|
|
|
|
// just because the client is too old for a particular app version
|
|
|
|
// g_wreq->outdated_client = true;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (av.max_core_version && g_request->core_client_version > av.max_core_version) {
|
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] [AV#%d] client version %d > max core version %d\n",
|
|
|
|
av.id, g_request->core_client_version, av.max_core_version
|
|
|
|
);
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
|
2010-08-22 19:13:25 +00:00
|
|
|
|
2010-09-09 20:01:28 +00:00
|
|
|
// at this point we know the version is feasible,
|
|
|
|
// so if config.prefer_primary_platform is set
|
|
|
|
// we won't look any further.
|
|
|
|
//
|
|
|
|
found_feasible_version = true;
|
|
|
|
|
2010-12-09 00:32:50 +00:00
|
|
|
// pick the fastest version.
|
|
|
|
// Throw in a random factor in case the estimates are off.
|
2009-03-19 16:35:35 +00:00
|
|
|
//
|
2012-09-06 22:46:02 +00:00
|
|
|
DB_HOST_APP_VERSION* havp = gavid_to_havp(av.id);
|
2012-02-22 19:51:09 +00:00
|
|
|
double r = 1;
|
2012-09-06 22:46:02 +00:00
|
|
|
long n=1;
|
2012-08-02 23:18:51 +00:00
|
|
|
if (havp) {
|
2013-05-19 18:08:36 +00:00
|
|
|
// slowly move from raw calc to measured performance as number
|
|
|
|
// of results increases
|
2012-08-02 23:18:51 +00:00
|
|
|
n=std::max((long)havp->pfc.n,(long)n);
|
2013-05-19 18:08:36 +00:00
|
|
|
double old_projected_flops=host_usage.projected_flops;
|
|
|
|
estimate_flops(host_usage, av);
|
|
|
|
host_usage.projected_flops=(host_usage.projected_flops*(n-1)+old_projected_flops)/n;
|
2013-06-04 21:00:09 +00:00
|
|
|
|
|
|
|
// special case for versions that don't work on a given host.
|
|
|
|
// This is defined as:
|
|
|
|
// 1. pfc.n is 0
|
|
|
|
// 2. The max_jobs_per_day is 1
|
|
|
|
// 3. Consecutive valid is 0.
|
|
|
|
// In that case, heavily penalize this app_version most of the
|
|
|
|
// time.
|
|
|
|
if ((havp->pfc.n==0) && (havp->max_jobs_per_day==1) && (havp->consecutive_valid==0)) {
|
|
|
|
if (drand()>0.01) {
|
|
|
|
host_usage.projected_flops*=0.01;
|
|
|
|
if (config.debug_version_select && bavp && bavp->avp) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] App version AV#%d is failing on HOST#%d\n",
|
|
|
|
havp->app_version_id,havp->host_id
|
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2013-05-19 18:08:36 +00:00
|
|
|
}
|
2012-02-22 19:51:09 +00:00
|
|
|
if (config.version_select_random_factor) {
|
2012-08-02 23:18:51 +00:00
|
|
|
r += config.version_select_random_factor*rand_normal()/n;
|
2012-02-22 19:51:09 +00:00
|
|
|
}
|
2013-05-19 18:08:36 +00:00
|
|
|
if (config.debug_version_select && bavp && bavp->avp) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] Comparing AV#%d (%.2f GFLOP) against AV#%d (%.2f GFLOP)\n",
|
|
|
|
av.id,host_usage.projected_flops/1e+9,bavp->avp->id,bavp->host_usage.projected_flops/1e+9
|
|
|
|
);
|
|
|
|
}
|
2010-12-09 00:32:50 +00:00
|
|
|
if (r*host_usage.projected_flops > bavp->host_usage.projected_flops) {
|
2012-08-02 23:18:51 +00:00
|
|
|
if (config.debug_version_select && (host_usage.projected_flops <= bavp->host_usage.projected_flops)) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-08-08 23:09:43 +00:00
|
|
|
"[version] [AV#%d] Random factor wins. r=%f n=%ld\n",
|
2012-08-02 23:18:51 +00:00
|
|
|
av.id, r, n
|
|
|
|
);
|
|
|
|
}
|
|
|
|
host_usage.projected_flops*=r;
|
2010-05-12 21:07:39 +00:00
|
|
|
bavp->host_usage = host_usage;
|
|
|
|
bavp->avp = &av;
|
|
|
|
bavp->reliable = app_version_is_reliable(av.id);
|
|
|
|
bavp->trusted = app_version_is_trusted(av.id);
|
2013-05-19 18:08:36 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] Best app version is now AV%d (%.2f GFLOP)\n",
|
|
|
|
bavp->avp->id, bavp->host_usage.projected_flops/1e+9
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
2010-08-22 19:13:25 +00:00
|
|
|
} // loop over app versions
|
|
|
|
|
|
|
|
if (config.prefer_primary_platform && found_feasible_version) {
|
|
|
|
break;
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
2010-08-22 19:13:25 +00:00
|
|
|
} // loop over client platforms
|
|
|
|
|
2010-05-12 21:07:39 +00:00
|
|
|
if (bavp->avp) {
|
2011-10-12 23:59:38 +00:00
|
|
|
estimate_flops(bavp->host_usage, *bavp->avp);
|
2009-03-19 16:35:35 +00:00
|
|
|
if (config.debug_version_select) {
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
2010-05-20 17:50:00 +00:00
|
|
|
"[version] Best version of app %s is [AV#%d] (%.2f GFLOPS)\n",
|
2010-05-12 21:07:39 +00:00
|
|
|
app->name, bavp->avp->id, bavp->host_usage.projected_flops/1e9
|
2009-03-19 16:35:35 +00:00
|
|
|
);
|
|
|
|
}
|
2010-05-12 21:07:39 +00:00
|
|
|
bavp->present = true;
|
|
|
|
g_wreq->best_app_versions.push_back(bavp);
|
2009-03-19 16:35:35 +00:00
|
|
|
} else {
|
|
|
|
// Here if there's no app version we can use.
|
|
|
|
//
|
|
|
|
if (config.debug_version_select) {
|
2010-04-11 04:42:52 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[version] returning NULL; platforms:\n"
|
|
|
|
);
|
2009-03-19 16:35:35 +00:00
|
|
|
for (i=0; i<g_request->platforms.list.size(); i++) {
|
|
|
|
PLATFORM* p = g_request->platforms.list[i];
|
|
|
|
log_messages.printf(MSG_NORMAL,
|
2010-04-16 18:07:08 +00:00
|
|
|
"[version] %s\n",
|
|
|
|
p->name
|
2009-03-19 16:35:35 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
2010-05-12 21:07:39 +00:00
|
|
|
g_wreq->best_app_versions.push_back(bavp);
|
2009-03-19 16:35:35 +00:00
|
|
|
return NULL;
|
|
|
|
}
|
2010-05-12 21:07:39 +00:00
|
|
|
return bavp;
|
2009-03-19 16:35:35 +00:00
|
|
|
}
|
|
|
|
|