// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC.  If not, see <http://www.gnu.org/licenses/>.

// transitioner - handle transitions in the state of a WU
// - a result has become DONE (via timeout or client reply)
// - the WU error mask is set (e.g. by validator)
// - assimilation is finished
//
// cmdline:
// [ --one_pass ]          do one pass, then exit
// [ --d x ]               debug level x
// [ --mod n i ]           process only WUs with (id mod n) == i
// [ --sleep_interval x ]  sleep x seconds if nothing to do
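//
// Example invocation using the flags above:
//   transitioner --one_pass --d 3 --mod 2 0
// does a single pass at debug level 3 over WUs with even IDs, then exits.
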
#include "config.h"
#include <vector>
#include <unistd.h>
#include <cstring>
#include <climits>
#include <cstdlib>
#include <string>
#include <signal.h>
#include <sys/time.h>
#include <sys/param.h>

#include "backend_lib.h"
#include "boinc_db.h"
#include "common_defs.h"
#include "error_numbers.h"
#include "filesys.h"
#include "str_util.h"
#include "svn_version.h"
#include "util.h"

#include "sched_config.h"
#include "credit.h"
#include "sched_util.h"
#include "sched_msgs.h"
#ifdef GCL_SIMULATOR
#include "gcl_simulator.h"
#endif

#define LOCKFILE "transitioner.out"
#define PIDFILE "transitioner.pid"

#define SELECT_LIMIT 1000
#define DEFAULT_SLEEP_INTERVAL 5

int startup_time;
R_RSA_PRIVATE_KEY key;
int mod_n, mod_i;
bool do_mod = false;
bool one_pass = false;
int sleep_interval = DEFAULT_SLEEP_INTERVAL;

void signal_handler(int) {
    log_messages.printf(MSG_NORMAL, "Signaled by simulator\n");
}
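
// return the numeric suffix of a result name
// (e.g. "wu_name_3" => 3; 0 if the name has no '_')
//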
int result_suffix(char* name) {
    char* p = strrchr(name, '_');
    if (p) return atoi(p+1);
    return 0;
}

// A result timed out; penalize the corresponding host_app_version
//
static int result_timed_out(
    TRANSITIONER_ITEM res_item, TRANSITIONER_ITEM& wu_item
) {
    DB_HOST_APP_VERSION hav;
    char query[512], clause[512];

    int gavid = generalized_app_version_id(
        res_item.res_app_version_id, wu_item.appid
    );
    int retval = hav_lookup(hav, res_item.res_hostid, gavid);
    if (retval) {
        log_messages.printf(MSG_NORMAL,
            "result_timed_out(): hav_lookup failed: %s\n", boincerror(retval)
        );
        return 0;
    }
    hav.turnaround.update_var(
        (double)wu_item.delay_bound,
        HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT
    );
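
    // A timeout counts against the host's daily quota for this app version:
    // clamp the quota to the configured starting value,
    // then decrement it, but never below 1.
    //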
    int n = hav.max_jobs_per_day;
    if (n == 0) {
        n = config.daily_result_quota;
    }
    if (n > config.daily_result_quota) {
        n = config.daily_result_quota;
    }
    n -= 1;
    if (n < 1) {
        n = 1;
    }
    if (config.debug_quota) {
        log_messages.printf(MSG_NORMAL,
            "[quota] max_jobs_per_day for %d; %d->%d\n",
            gavid, hav.max_jobs_per_day, n
        );
    }
    hav.max_jobs_per_day = n;

    hav.consecutive_valid = 0;

    sprintf(query,
        "turnaround_n=%.15e, turnaround_avg=%.15e, turnaround_var=%.15e, turnaround_q=%.15e, max_jobs_per_day=%d, consecutive_valid=%d",
        hav.turnaround.n,
        hav.turnaround.avg,
        hav.turnaround.var,
        hav.turnaround.q,
        hav.max_jobs_per_day,
        hav.consecutive_valid
    );
    sprintf(clause,
        "host_id=%d and app_version_id=%d",
        hav.host_id, hav.app_version_id
    );
    retval = hav.update_fields_noid(query, clause);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "result_timed_out(): hav update failed: %s\n",
            boincerror(retval)
        );
    }
    return 0;
}
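
// Handle one WU: count its results by state, time out overdue ones,
// generate new results if needed, and advance the WU's validation,
// assimilation, and file-deletion state.
// Each element of "items" joins the WU fields with one of its results;
// items[0] supplies the WU fields.
//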
int handle_wu(
    DB_TRANSITIONER_ITEM_SET& transitioner,
    std::vector<TRANSITIONER_ITEM>& items
) {
    int ntotal, nerrors, retval, ninprogress, nsuccess;
    int nunsent, ncouldnt_send, nover, ndidnt_need, nno_reply;
    int canonical_result_index, j;
    char suffix[256];
    time_t now = time(0), x;
    bool all_over_and_validated, have_new_result_to_validate, do_delete;
    unsigned int i;

    TRANSITIONER_ITEM& wu_item = items[0];
    TRANSITIONER_ITEM wu_item_original = wu_item;
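        // wu_item_original preserves the WU as read from the DB,
        // presumably so update_workunit() can write back
        // only the fields modified here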

    // count up the number of results in various states,
    // and check for timed-out results
    //
    ntotal = 0;
    nunsent = 0;        // including INACTIVE
    ninprogress = 0;
    nover = 0;
    nerrors = 0;
    nsuccess = 0;       // not counting invalid results
    ncouldnt_send = 0;
    nno_reply = 0;
    ndidnt_need = 0;
    have_new_result_to_validate = false;
    int rs, max_result_suffix = -1;

    // Scan the WU's results, and find the canonical result if there is one
    //
    canonical_result_index = -1;
    if (wu_item.canonical_resultid) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            if (res_item.res_id == wu_item.canonical_resultid) {
                canonical_result_index = i;
            }
        }
    }

    if (wu_item.canonical_resultid && (canonical_result_index == -1)) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%u %s] can't find canonical result\n",
            wu_item.id, wu_item.name
        );
    }

    // if there is a canonical result, see if its files have been deleted
    //
    bool canonical_result_files_deleted = false;
    if (canonical_result_index >= 0) {
        TRANSITIONER_ITEM& cr = items[canonical_result_index];
        if (cr.res_file_delete_state == FILE_DELETE_DONE) {
            canonical_result_files_deleted = true;
        }
    }

    // Scan this WU's results, and
    // 1) count those in various server states;
    // 2) identify timed-out results and update their server state and outcome;
    // 3) find the max result suffix (in case we need to generate new ones);
    // 4) see if we have a new result to validate
    //    (outcome SUCCESS and validate_state INIT)
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];

        if (!res_item.res_id) continue;
        ntotal++;

        rs = result_suffix(res_item.res_name);
        if (rs > max_result_suffix) max_result_suffix = rs;

        switch (res_item.res_server_state) {
        case RESULT_SERVER_STATE_INACTIVE:
        case RESULT_SERVER_STATE_UNSENT:
            nunsent++;
            break;
        case RESULT_SERVER_STATE_IN_PROGRESS:
            if (res_item.res_report_deadline < now) {
                log_messages.printf(MSG_NORMAL,
                    "[WU#%u %s] [RESULT#%u %s] result timed out (%d < %d) server_state:IN_PROGRESS=>OVER; outcome:NO_REPLY\n",
                    wu_item.id, wu_item.name, res_item.res_id,
                    res_item.res_name,
                    res_item.res_report_deadline, (int)now
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_NO_REPLY;
                retval = transitioner.update_result(res_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%u %s] [RESULT#%u %s] update_result(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id,
                        res_item.res_name, boincerror(retval)
                    );
                }
                retval = result_timed_out(res_item, wu_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "result_timed_out() error: %s\n", boincerror(retval)
                    );
                    exit(1);
                }
                nover++;
                nno_reply++;
            } else {
                ninprogress++;
            }
            break;
        case RESULT_SERVER_STATE_OVER:
            nover++;
            switch (res_item.res_outcome) {
            case RESULT_OUTCOME_COULDNT_SEND:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%u %s] [RESULT#%u %s] result couldn't be sent\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                ncouldnt_send++;
                break;
            case RESULT_OUTCOME_SUCCESS:
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    if (canonical_result_files_deleted) {
                        res_item.res_validate_state = VALIDATE_STATE_TOO_LATE;
                        retval = transitioner.update_result(res_item);
                        if (retval) {
                            log_messages.printf(MSG_CRITICAL,
                                "[WU#%u %s] [RESULT#%u %s] update_result(): %s\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name, boincerror(retval)
                            );
                        } else {
                            log_messages.printf(MSG_NORMAL,
                                "[WU#%u %s] [RESULT#%u %s] validate_state:INIT=>TOO_LATE\n",
                                wu_item.id, wu_item.name, res_item.res_id,
                                res_item.res_name
                            );
                        }
                    } else {
                        have_new_result_to_validate = true;
                    }
                }
                // don't count invalid results as successful
                //
                if (res_item.res_validate_state != VALIDATE_STATE_INVALID) {
                    nsuccess++;
                }
                break;
            case RESULT_OUTCOME_CLIENT_ERROR:
                // if the user aborted the job, don't count it as an error
                //
                if (res_item.res_exit_status == EXIT_ABORTED_VIA_GUI) {
                    nno_reply++;
                } else {
                    nerrors++;
                }
                break;
            case RESULT_OUTCOME_VALIDATE_ERROR:
                nerrors++;
                break;
            case RESULT_OUTCOME_CLIENT_DETACHED:
            case RESULT_OUTCOME_NO_REPLY:
                nno_reply++;
                break;
            case RESULT_OUTCOME_DIDNT_NEED:
                ndidnt_need++;
                break;
            }
            break;
        }
    }

    log_messages.printf(MSG_DEBUG,
        "[WU#%u %s] %d results: unsent %d, in_progress %d, over %d (success %d, error %d, couldnt_send %d, no_reply %d, didnt_need %d)\n",
        wu_item.id, wu_item.name, ntotal, nunsent, ninprogress, nover,
        nsuccess, nerrors, ncouldnt_send, nno_reply, ndidnt_need
    );

    // if there's a new result to validate, trigger validation
    //
    if (have_new_result_to_validate && (nsuccess >= wu_item.min_quorum)) {
        wu_item.need_validate = true;
        log_messages.printf(MSG_NORMAL,
            "[WU#%u %s] need_validate:=>true\n", wu_item.id, wu_item.name
        );
    }

    // check for WU error conditions
    // NOTE: check on max # of success results is done in the validator
    //
    if (ncouldnt_send > 0) {
        wu_item.error_mask |= WU_ERROR_COULDNT_SEND_RESULT;
    }

    // if the WU has results with errors, and there are no results that are
    // - successful
    // - in progress
    // - timed out (but could still be returned)
    // then reset the homogeneous redundancy class to give other platforms a try;
    // also reset the app version ID if using HAV
    //
    if (nerrors && !(nsuccess || ninprogress || nno_reply)) {
        wu_item.hr_class = 0;
        wu_item.app_version_id = 0;
    }

    if (nerrors > wu_item.max_error_results) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%u %s] WU has too many errors (%d errors for %d results)\n",
            wu_item.id, wu_item.name, nerrors, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_ERROR_RESULTS;
    }

    // see how many new results we need to make
    //
    int n_new_results_needed = wu_item.target_nresults - nunsent - ninprogress - nsuccess;
    if (n_new_results_needed < 0) n_new_results_needed = 0;
    int n_new_results_allowed = wu_item.max_total_results - ntotal;

    // if we're already at the limit and need more, error out the WU
    //
    bool too_many = false;
    if (n_new_results_allowed < 0) {
        too_many = true;
    } else if (n_new_results_allowed == 0) {
        if (n_new_results_needed > 0) {
            too_many = true;
        }
    } else {
        if (n_new_results_needed > n_new_results_allowed) {
            n_new_results_needed = n_new_results_allowed;
        }
    }
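
    // Example: with target_nresults=3, max_total_results=10, ntotal=5,
    // 1 result unsent, 1 in progress, and 0 successes, we need
    // 3-1-1-0 = 1 new result and may create up to 10-5 = 5, so 1 is made.
    //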
    if (too_many) {
        log_messages.printf(MSG_NORMAL,
            "[WU#%u %s] WU has too many total results (%d)\n",
            wu_item.id, wu_item.name, ntotal
        );
        wu_item.error_mask |= WU_ERROR_TOO_MANY_TOTAL_RESULTS;
    }

    // if this WU had an error, don't send any unsent results,
    // and trigger assimilation if needed
    //
    if (wu_item.error_mask) {
        for (i=0; i<items.size(); i++) {
            TRANSITIONER_ITEM& res_item = items[i];
            if (!res_item.res_id) continue;
            bool update_result = false;
            switch (res_item.res_server_state) {
            case RESULT_SERVER_STATE_INACTIVE:
            case RESULT_SERVER_STATE_UNSENT:
                log_messages.printf(MSG_NORMAL,
                    "[WU#%u %s] [RESULT#%u %s] server_state:UNSENT=>OVER; outcome:=>DIDNT_NEED\n",
                    wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                );
                res_item.res_server_state = RESULT_SERVER_STATE_OVER;
                res_item.res_outcome = RESULT_OUTCOME_DIDNT_NEED;
                update_result = true;
                break;
            case RESULT_SERVER_STATE_OVER:
                switch (res_item.res_outcome) {
                case RESULT_OUTCOME_SUCCESS:
                    switch (res_item.res_validate_state) {
                    case VALIDATE_STATE_INIT:
                    case VALIDATE_STATE_INCONCLUSIVE:
                        res_item.res_validate_state = VALIDATE_STATE_NO_CHECK;
                        update_result = true;
                        break;
                    }
                }
            }
            if (update_result) {
                retval = transitioner.update_result(res_item);
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%u %s] [RESULT#%u %s] result.update(): %s\n",
                        wu_item.id, wu_item.name, res_item.res_id,
                        res_item.res_name, boincerror(retval)
                    );
                }
            }
        }
        if (wu_item.assimilate_state == ASSIMILATE_INIT) {
            wu_item.assimilate_state = ASSIMILATE_READY;
            log_messages.printf(MSG_NORMAL,
                "[WU#%u %s] error_mask:%d assimilate_state:INIT=>READY\n",
                wu_item.id, wu_item.name, wu_item.error_mask
            );
        }
    } else if (wu_item.canonical_resultid == 0) {
        // Here if no WU-level error.
        // Generate new results if needed.
        //
        std::string values;
        char value_buf[MAX_QUERY_LEN];
        if (wu_item.transitioner_flags != TRANSITION_NO_NEW_RESULTS
            && n_new_results_needed > 0
        ) {
            log_messages.printf(MSG_NORMAL,
                "[WU#%u %s] Generating %d more results (%d target - %d unsent - %d in progress - %d success)\n",
                wu_item.id, wu_item.name, n_new_results_needed,
                wu_item.target_nresults, nunsent, ninprogress, nsuccess
            );
            for (j=0; j<n_new_results_needed; j++) {
                sprintf(suffix, "%d", max_result_suffix+j+1);
                const char *rtfpath = config.project_path("%s", wu_item.result_template_file);
                int priority_increase = 0;
                if (nover && config.reliable_priority_on_over) {
                    priority_increase += config.reliable_priority_on_over;
                } else if (nover && !nerrors && config.reliable_priority_on_over_except_error) {
                    priority_increase += config.reliable_priority_on_over_except_error;
                }
                retval = create_result_ti(
                    wu_item, (char *)rtfpath, suffix, key, config, value_buf, priority_increase
                );
                if (retval) {
                    log_messages.printf(MSG_CRITICAL,
                        "[WU#%u %s] create_result_ti(): %s\n",
                        wu_item.id, wu_item.name, boincerror(retval)
                    );
                    return retval;
                }
                if (j == 0) {
                    values = value_buf;
                } else {
                    values += ",";
                    values += value_buf;
                }
            }
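
            // insert all the new results with a single batch query,
            // which avoids one DB round trip per result
            //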
            DB_RESULT r;
            retval = r.insert_batch(values);
            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[WU#%u %s] insert_batch(): %s\n",
                    wu_item.id, wu_item.name, boincerror(retval)
                );
                return retval;
            }
        }
    }

    // scan results:
    // - see if all over and validated
    //
    all_over_and_validated = true;
    bool all_over_and_ready_to_assimilate = true;
    double most_recently_returned = 0;  // used for deferred assimilation
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_OVER) {
            if (res_item.res_received_time > most_recently_returned) {
                most_recently_returned = res_item.res_received_time;
            }
            if (res_item.res_outcome == RESULT_OUTCOME_SUCCESS) {
                if (res_item.res_validate_state == VALIDATE_STATE_INIT) {
                    all_over_and_validated = false;
                    all_over_and_ready_to_assimilate = false;
                }
            } else if (res_item.res_outcome == RESULT_OUTCOME_NO_REPLY) {
                if (now < res_item.res_report_deadline) {
                    all_over_and_validated = false;
                }
            }
        } else {
            all_over_and_validated = false;
            all_over_and_ready_to_assimilate = false;
        }
    }

    // If we're deferring assimilation until all results are over and validated,
    // make sure the WU state is advanced to ASSIMILATE_READY when that happens.
    // (The items.size() check is a kludge.)
    //
    if (all_over_and_ready_to_assimilate
        && wu_item.assimilate_state == ASSIMILATE_INIT
        && items.size() > 0
        && wu_item.canonical_resultid > 0
    ) {
        wu_item.assimilate_state = ASSIMILATE_READY;
        log_messages.printf(MSG_NORMAL,
            "[WU#%u %s] Deferred assimilation now set to ASSIMILATE_STATE_READY\n",
            wu_item.id, wu_item.name
        );
    }

    // if WU is assimilated, trigger file deletion
    //
    double deferred_file_delete_time = 0;
    if (wu_item.assimilate_state == ASSIMILATE_DONE) {
        if (now >= (most_recently_returned + config.delete_delay)) {
            // can delete input files if all results are OVER
            //
            if (all_over_and_validated && wu_item.file_delete_state == FILE_DELETE_INIT) {
                wu_item.file_delete_state = FILE_DELETE_READY;
                log_messages.printf(MSG_DEBUG,
                    "[WU#%u %s] ASSIMILATE_DONE: file_delete_state:=>READY\n",
                    wu_item.id, wu_item.name
                );
            }

            // output of error results can be deleted immediately;
            // output of success results can be deleted if validated
            //
            for (i=0; i<items.size(); i++) {
                TRANSITIONER_ITEM& res_item = items[i];

                // can delete canonical result outputs only if all successful
                // results have been validated
                //
                if (((int)i == canonical_result_index) && !all_over_and_validated) {
                    continue;
                }

                if (!res_item.res_id) continue;
                do_delete = false;
                switch (res_item.res_outcome) {
                case RESULT_OUTCOME_CLIENT_ERROR:
                    do_delete = true;
                    break;
                case RESULT_OUTCOME_SUCCESS:
                    do_delete = (res_item.res_validate_state != VALIDATE_STATE_INIT);
                    break;
                }
                if (do_delete && res_item.res_file_delete_state == FILE_DELETE_INIT) {
                    log_messages.printf(MSG_NORMAL,
                        "[WU#%u %s] [RESULT#%u %s] file_delete_state:=>READY\n",
                        wu_item.id, wu_item.name, res_item.res_id, res_item.res_name
                    );
                    res_item.res_file_delete_state = FILE_DELETE_READY;

                    retval = transitioner.update_result(res_item);
                    if (retval) {
                        log_messages.printf(MSG_CRITICAL,
                            "[WU#%u %s] [RESULT#%u %s] result.update(): %s\n",
                            wu_item.id, wu_item.name, res_item.res_id,
                            res_item.res_name, boincerror(retval)
                        );
                    }
                }
            }
        } else {
            deferred_file_delete_time = most_recently_returned + config.delete_delay;
            log_messages.printf(MSG_DEBUG,
                "[WU#%u %s] deferring file deletion for %.0f seconds\n",
                wu_item.id, wu_item.name,
                deferred_file_delete_time - now
            );
        }
    }

    // Compute the next transition time.
    // This is the min of
    // - timeouts of in-progress results
    // - deferred file deletion time
    // - safety net
    //
    // It is then adjusted to deal with transitioner congestion.
    //
    if (wu_item.canonical_resultid || wu_item.error_mask) {
        wu_item.transition_time = INT_MAX;
    } else {
|
2011-02-15 22:07:14 +00:00
|
|
|
// Safety net: if there is no canonical result and no WU-level error,
|
2010-01-25 23:35:16 +00:00
|
|
|
// make sure that the transitioner will process this WU again.
|
2011-02-15 22:07:14 +00:00
|
|
|
// In principle this is not needed,
|
|
|
|
// but it makes the BOINC back-end more robust.
|
2006-05-23 06:08:55 +00:00
|
|
|
//
|
|
|
|
const int ten_days = 10*86400;
|
2006-06-07 18:47:37 +00:00
|
|
|
int long_delay = (int)(1.5*wu_item.delay_bound);
|
2006-05-23 06:08:55 +00:00
|
|
|
wu_item.transition_time = (long_delay > ten_days) ? long_delay : ten_days;
|
|
|
|
wu_item.transition_time += time(0);
|
2007-01-30 18:19:30 +00:00
|
|
|
}
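    // Illustrative numbers: with a 14-day delay_bound, long_delay is 21 days,
    // which exceeds the 10-day floor, so the WU is revisited 21 days from now;
    // with a 1-day delay_bound the 10-day floor applies instead.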

    // handle timeout of in-progress results
    //
    for (i=0; i<items.size(); i++) {
        TRANSITIONER_ITEM& res_item = items[i];
        if (!res_item.res_id) continue;
        if (res_item.res_server_state == RESULT_SERVER_STATE_IN_PROGRESS) {
            x = res_item.res_report_deadline;
            if (x < wu_item.transition_time) {
                wu_item.transition_time = x;
            }
        }
    }
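    // (i.e. the transitioner will wake at the earliest report deadline
    // among this WU's in-progress results)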

    // handle deferred file deletion
    //
    if (deferred_file_delete_time
        && deferred_file_delete_time < wu_item.transition_time
    ) {
        wu_item.transition_time = (int)deferred_file_delete_time;
    }

    // Handle transitioner overload.
    // If the transition time is in the past,
    // the system is bogged down and behind schedule.
    // Delay processing of the WU by double the amount we are behind,
    // but by no less than 60 seconds and no more than one day.
    //
    if (wu_item.transition_time < now) {
        int extra_delay = 2*(now - wu_item.transition_time);
        if (extra_delay < 60) extra_delay = 60;
        if (extra_delay > 86400) extra_delay = 86400;
        log_messages.printf(MSG_DEBUG,
            "[WU#%u %s] transition time in past: adding extra delay %d sec\n",
            wu_item.id, wu_item.name, extra_delay
        );
        wu_item.transition_time = now + extra_delay;
    }
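    // Worked example: if this WU was due 30 minutes ago, extra_delay is
    // 2*1800 = 3600 s, so it is next processed an hour from now; doubling
    // the backlog spreads retries out, letting a congested transitioner
    // catch up instead of thrashing on overdue WUs.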

    log_messages.printf(MSG_DEBUG,
        "[WU#%u %s] setting transition_time to %d\n",
        wu_item.id, wu_item.name, wu_item.transition_time
    );
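
    // Write the (possibly modified) WU back. The pre-transition snapshot
    // wu_item_original is passed along, presumably so the DB layer can
    // limit the UPDATE to fields that actually changed.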
    retval = transitioner.update_workunit(wu_item, wu_item_original);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[WU#%u %s] workunit.update(): %s\n",
            wu_item.id, wu_item.name, boincerror(retval)
        );
        return retval;
    }
    return 0;
}

bool do_pass() {
    int retval;
    DB_TRANSITIONER_ITEM_SET transitioner;
    std::vector<TRANSITIONER_ITEM> items;
    bool did_something = false;

    if (!one_pass) check_stop_daemons();

    // loop over entries that are due to be checked
    //
    while (1) {
        retval = transitioner.enumerate(
            (int)time(0), SELECT_LIMIT, mod_n, mod_i, items
        );
        if (retval) {
            if (retval != ERR_DB_NOT_FOUND) {
                log_messages.printf(MSG_CRITICAL,
                    "WU enum error: %s; exiting\n", boincerror(retval)
                );
                exit(1);
            }
            break;
        }
        did_something = true;

        // each enumeration returns the rows for one workunit;
        // every row carries the WU fields, and each row's res_* fields
        // describe one of its results (res_id == 0 if there are none)
        //
        TRANSITIONER_ITEM& wu_item = items[0];
        retval = handle_wu(transitioner, items);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[WU#%u %s] handle_wu: %s; quitting\n",
                wu_item.id, wu_item.name, boincerror(retval)
            );
            exit(1);
        }

        if (!one_pass) check_stop_daemons();
    }
    return did_something;
}
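
// do_pass() returns true iff the enumeration yielded at least one due WU;
// main_loop() uses the return value to decide whether to sleep before the
// next pass.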

void main_loop() {
    int retval;

    retval = boinc_db.open(config.db_name, config.db_host, config.db_user, config.db_passwd);
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "boinc_db.open: %s\n", boincerror(retval)
        );
        exit(1);
    }

    while (1) {
        log_messages.printf(MSG_DEBUG, "doing a pass\n");
        if (!do_pass()) {
            if (one_pass) break;
#ifdef GCL_SIMULATOR
            continue_simulation("transitioner");
            signal(SIGUSR2, simulator_signal_handler);
            pause();
#else
            log_messages.printf(MSG_DEBUG, "sleeping %d\n", sleep_interval);
            daemon_sleep(sleep_interval);
#endif
        }
    }
}
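
// Note: main_loop() returns only when --one_pass is set (via the break
// above, once a pass finds nothing to do); otherwise it runs until the
// stop signal or an error exits the process.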

void usage(char *name) {
    fprintf(stderr,
        "Handles transitions in the state of a WU\n"
        " - a result has become DONE (via timeout or client reply)\n"
        " - the WU error mask is set (e.g. by the validator)\n"
        " - assimilation is finished\n\n"
        "Usage: %s [OPTION]...\n\n"
        "Options:\n"
        " [ --one_pass ] do one pass, then exit\n"
        " [ --d x ] debug level x\n"
        " [ --mod n i ] process only WUs with (id mod n) == i\n"
        " [ --sleep_interval x ] sleep x seconds if nothing to do\n"
        " [ -h | --help ] show this help text\n"
        " [ -v | --version ] show version information\n",
        name
    );
}
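
// Illustrative usage: to shard the workunit table across two transitioner
// daemons, run
//   transitioner --mod 2 0
//   transitioner --mod 2 1
// Each instance then handles only WUs whose id has the matching remainder.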

int main(int argc, char** argv) {
    int i, retval;
    char path[MAXPATHLEN];

    startup_time = time(0);
    for (i=1; i<argc; i++) {
        if (is_arg(argv[i], "one_pass")) {
            one_pass = true;
        } else if (is_arg(argv[i], "d")) {
            if (!argv[++i]) {
                log_messages.printf(MSG_CRITICAL, "%s requires an argument\n\n", argv[--i]);
                usage(argv[0]);
                exit(1);
            }
            int dl = atoi(argv[i]);
            log_messages.set_debug_level(dl);
            // at the highest debug level, also log the SQL queries we issue
            if (dl == 4) g_print_queries = true;
        } else if (is_arg(argv[i], "mod")) {
            if (!argv[i+1] || !argv[i+2]) {
                log_messages.printf(MSG_CRITICAL, "%s requires two arguments\n\n", argv[i]);
                usage(argv[0]);
                exit(1);
            }
            mod_n = atoi(argv[++i]);
            mod_i = atoi(argv[++i]);
            do_mod = true;
        } else if (is_arg(argv[i], "sleep_interval")) {
            if (!argv[++i]) {
                log_messages.printf(MSG_CRITICAL, "%s requires an argument\n\n", argv[--i]);
                usage(argv[0]);
                exit(1);
            }
            sleep_interval = atoi(argv[i]);
        } else if (is_arg(argv[i], "h") || is_arg(argv[i], "help")) {
            usage(argv[0]);
            exit(0);
        } else if (is_arg(argv[i], "v") || is_arg(argv[i], "version")) {
            printf("%s\n", SVN_VERSION);
            exit(0);
        } else {
            log_messages.printf(MSG_CRITICAL, "unknown command line argument: %s\n\n", argv[i]);
            usage(argv[0]);
            exit(1);
        }
    }

    if (!one_pass) check_stop_daemons();

    retval = config.parse_file();
    if (retval) {
        log_messages.printf(MSG_CRITICAL, "Can't parse config.xml: %s\n", boincerror(retval));
        exit(1);
    }

    // use snprintf() to guard against overrunning path[]
    snprintf(path, sizeof(path), "%s/upload_private", config.key_dir);
    retval = read_key_file(path, key);
    if (retval) {
        log_messages.printf(MSG_CRITICAL, "can't read key file %s\n", path);
        exit(1);
    }

    log_messages.printf(MSG_NORMAL, "Starting\n");

    install_stop_signal_handler();

    main_loop();
}

const char *BOINC_RCSID_be98c91511 = "$Id$";