2008-08-06 18:36:30 +00:00
|
|
|
// This file is part of BOINC.
|
2005-01-20 23:22:22 +00:00
|
|
|
// http://boinc.berkeley.edu
|
2019-01-12 21:43:48 +00:00
|
|
|
// Copyright (C) 2019 University of California
|
2004-03-17 01:26:44 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
2004-03-17 01:26:44 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
2005-01-20 23:22:22 +00:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
2004-03-17 01:26:44 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
2011-02-27 00:10:14 +00:00
|
|
|
|
|
|
|
|
2006-10-11 16:32:59 +00:00
|
|
|
// trickle_handler - framework for trickle-up message handler
|
2004-03-17 01:26:44 +00:00
|
|
|
//
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// --variety variety
|
|
|
|
// [-d debug_level]
|
2018-08-02 19:32:51 +00:00
|
|
|
// [--one_pass] // make one pass through table, then exit
|
2004-03-17 01:26:44 +00:00
|
|
|
//
|
2018-08-02 19:32:51 +00:00
|
|
|
// This program must be linked with app-specific functions:
|
2004-03-17 01:26:44 +00:00
|
|
|
//
|
2018-08-02 19:32:51 +00:00
|
|
|
// int handle_trickle_init(int argc, char** argv);
|
|
|
|
// initialize
|
2006-07-17 16:38:53 +00:00
|
|
|
// int handle_trickle(MSG_FROM_HOST&)
|
2018-08-02 19:32:51 +00:00
|
|
|
// handle a trickle message
|
2004-03-17 01:26:44 +00:00
|
|
|
//
|
|
|
|
// return nonzero on error
|
|
|
|
|
2005-11-21 18:34:44 +00:00
|
|
|
#include "config.h"
|
2004-03-17 01:26:44 +00:00
|
|
|
#include <unistd.h>
|
|
|
|
|
|
|
|
#include "boinc_db.h"
|
|
|
|
#include "util.h"
|
2008-10-27 21:23:07 +00:00
|
|
|
#include "error_numbers.h"
|
2013-06-07 00:31:46 +00:00
|
|
|
#include "str_replace.h"
|
2008-02-05 23:30:28 +00:00
|
|
|
#include "str_util.h"
|
2009-09-17 17:56:59 +00:00
|
|
|
#include "svn_version.h"
|
2008-10-27 21:23:07 +00:00
|
|
|
|
2004-03-17 01:26:44 +00:00
|
|
|
#include "sched_config.h"
|
|
|
|
#include "sched_util.h"
|
2004-04-08 08:15:23 +00:00
|
|
|
#include "sched_msgs.h"
|
2011-02-27 00:10:14 +00:00
|
|
|
#include "trickle_handler.h"
|
2004-03-17 01:26:44 +00:00
|
|
|
|
2004-07-06 04:10:51 +00:00
|
|
|
// Variety of trickle-up message this handler processes.
// Filled in from the --variety option in main() and used by
// do_trickle_scan() to select messages.
char variety[256];
|
2004-03-17 01:26:44 +00:00
|
|
|
|
2018-08-05 04:17:43 +00:00
|
|
|
// Values used for the "handled" field of a message
// (mfh.handled in do_trickle_scan()).
// handle_trickle_init() may change any of the following.
//

// messages whose "handled" field equals this are enumerated
int handled_enum = 0;

// "handled" is set to this when handle_trickle() succeeds
int handled_set = 1;

// "handled" is set to this when handle_trickle() returns an error
int handled_error = 1;
|
2018-08-02 19:32:51 +00:00
|
|
|
|
|
|
|
// make one pass through trickle_ups with handled == handled_enum
|
2004-03-17 01:26:44 +00:00
|
|
|
// return true if there were any
|
|
|
|
//
|
2008-06-26 21:20:07 +00:00
|
|
|
bool do_trickle_scan() {
|
2004-06-22 22:56:50 +00:00
|
|
|
DB_MSG_FROM_HOST mfh;
|
2004-03-17 01:26:44 +00:00
|
|
|
char buf[256];
|
|
|
|
bool found=false;
|
2004-05-12 21:21:09 +00:00
|
|
|
int retval;
|
2004-03-17 01:26:44 +00:00
|
|
|
|
2018-08-02 19:32:51 +00:00
|
|
|
sprintf(buf, "where variety='%s' and handled=%d", variety, handled_enum);
|
2008-10-27 21:23:07 +00:00
|
|
|
while (1) {
|
|
|
|
retval = mfh.enumerate(buf);
|
|
|
|
if (retval) {
|
|
|
|
if (retval != ERR_DB_NOT_FOUND) {
|
|
|
|
fprintf(stderr, "lost DB conn\n");
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
2004-06-22 22:56:50 +00:00
|
|
|
retval = handle_trickle(mfh);
|
2018-08-05 04:17:43 +00:00
|
|
|
if (retval) {
|
2014-03-29 16:25:01 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
|
|
|
"handle_trickle(): %s", boincerror(retval)
|
|
|
|
);
|
2004-05-12 21:21:09 +00:00
|
|
|
}
|
2018-08-05 04:17:43 +00:00
|
|
|
mfh.handled = retval?handled_error:handled_set;
|
2014-03-29 16:25:01 +00:00
|
|
|
mfh.update();
|
2004-03-17 01:26:44 +00:00
|
|
|
found = true;
|
|
|
|
}
|
|
|
|
return found;
|
|
|
|
}
|
|
|
|
|
|
|
|
int main_loop(bool one_pass) {
|
2015-11-04 07:20:30 +00:00
|
|
|
// coverity[loop_top] - infinite loop is intended
|
2004-03-17 01:26:44 +00:00
|
|
|
while (1) {
|
2004-05-03 19:30:01 +00:00
|
|
|
check_stop_daemons();
|
2011-09-16 19:16:12 +00:00
|
|
|
bool did_something = do_trickle_scan();
|
2004-03-17 01:26:44 +00:00
|
|
|
if (one_pass) break;
|
|
|
|
if (!did_something) {
|
2012-05-23 18:11:59 +00:00
|
|
|
daemon_sleep(5);
|
2004-03-17 01:26:44 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-09-17 17:56:59 +00:00
|
|
|
// Print the help text to stderr.
// name: program name shown in the "Usage:" line.
//
void usage(char *name) {
    // description of the framework and the functions the app must supply
    fputs(
        "Framework for trickle-up message handler\n"
        "This program must be linked with app-specific functions:\n\n"
        "int handle_trickle_init(int argc, char** argv)\n"
        " - initialize\n\n"
        "int handle_trickle(MSG_FROM_HOST&)\n"
        " - handle a trickle message\n\n"
        "return nonzero on error\n\n",
        stderr
    );

    // the only line that needs formatting
    fprintf(stderr, "Usage: %s [OPTION]...\n\n", name);

    // option table
    fputs(
        "Options:\n"
        " --variety X Set Variety to X\n"
        " [ -d X ] Set debug level to X\n"
        " [ --one_pass ] Make one pass through table, then exit\n"
        " [ -h | --help ] Show this help text\n"
        " [ -v | --version ] Shows version information\n",
        stderr
    );
}
|
2004-03-17 01:26:44 +00:00
|
|
|
|
|
|
|
int main(int argc, char** argv) {
|
|
|
|
int i, retval;
|
2007-04-18 20:49:58 +00:00
|
|
|
bool one_pass = false;
|
2004-03-17 01:26:44 +00:00
|
|
|
|
2004-05-03 19:30:01 +00:00
|
|
|
check_stop_daemons();
|
2004-03-17 01:26:44 +00:00
|
|
|
|
2011-09-13 21:01:42 +00:00
|
|
|
int j=1;
|
2004-03-17 01:26:44 +00:00
|
|
|
for (i=1; i<argc; i++) {
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
if (is_arg(argv[i], "one_pass")) {
|
2004-03-17 01:26:44 +00:00
|
|
|
one_pass = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (is_arg(argv[i], "variety")) {
|
2009-09-17 17:56:59 +00:00
|
|
|
if (!argv[++i]) {
|
2010-11-08 17:51:57 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
|
|
|
"%s requires an argument\n\n", argv[--i]
|
|
|
|
);
|
2009-09-17 17:56:59 +00:00
|
|
|
usage(argv[0]);
|
|
|
|
exit(1);
|
|
|
|
}
|
2013-06-04 05:42:53 +00:00
|
|
|
safe_strcpy(variety, argv[i]);
|
2004-03-17 01:26:44 +00:00
|
|
|
} else if (!strcmp(argv[i], "-d")) {
|
2009-09-17 17:56:59 +00:00
|
|
|
if (!argv[++i]) {
|
2010-11-08 17:51:57 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
|
|
|
"%s requires an argument\n\n", argv[--i]
|
|
|
|
|
|
|
|
);
|
2009-09-17 17:56:59 +00:00
|
|
|
usage(argv[0]);
|
|
|
|
exit(1);
|
|
|
|
}
|
2010-04-05 21:59:33 +00:00
|
|
|
int dl = atoi(argv[i]);
|
|
|
|
log_messages.set_debug_level(dl);
|
|
|
|
if (dl == 4) g_print_queries = true;
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--version")) {
|
2009-09-17 17:56:59 +00:00
|
|
|
printf("%s\n", SVN_VERSION);
|
|
|
|
exit(0);
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
} else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) {
|
2009-09-17 17:56:59 +00:00
|
|
|
usage(argv[0]);
|
|
|
|
exit(0);
|
2004-03-17 01:26:44 +00:00
|
|
|
} else {
|
2011-09-13 21:01:42 +00:00
|
|
|
// unknown arg - pass to handler
|
|
|
|
argv[j++] = argv[i];
|
2004-03-17 01:26:44 +00:00
|
|
|
}
|
|
|
|
}
|
2009-05-07 13:54:51 +00:00
|
|
|
retval = config.parse_file();
|
2004-03-17 01:26:44 +00:00
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-05-07 13:54:51 +00:00
|
|
|
"Can't parse config.xml: %s\n", boincerror(retval)
|
2004-03-17 01:26:44 +00:00
|
|
|
);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
2011-09-13 21:01:42 +00:00
|
|
|
retval = boinc_db.open(
|
|
|
|
config.db_name, config.db_host, config.db_user, config.db_passwd
|
2008-02-05 23:30:28 +00:00
|
|
|
);
|
2011-09-13 21:01:42 +00:00
|
|
|
if (retval) {
|
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2020-05-06 20:01:21 +00:00
|
|
|
"boinc_db.open failed: %s\n", boinc_db.error_string()
|
2011-09-13 21:01:42 +00:00
|
|
|
);
|
|
|
|
exit(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
argv[j] = 0;
|
2013-08-23 20:01:53 +00:00
|
|
|
retval = handle_trickle_init(j, argv);
|
2011-09-13 21:01:42 +00:00
|
|
|
if (retval) exit(1);
|
|
|
|
|
|
|
|
log_messages.printf(MSG_NORMAL, "Starting trickle handler\n");
|
2004-03-17 01:26:44 +00:00
|
|
|
|
|
|
|
install_stop_signal_handler();
|
|
|
|
|
|
|
|
main_loop(one_pass);
|
|
|
|
}
|