// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.

#include "boinc_db.h"
#include "str_util.h"
#include "str_replace.h"
#include "parse.h"

#include "credit.h"
#include "sched_types.h"
#include "sched_msgs.h"
#include "sched_util.h"
#include "sched_main.h"
#include "sched_config.h"

#include "sched_result.h"

// got a SUCCESS result. Doesn't mean it's valid!
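//
// Reward the host: double max_jobs_per_day for this (host, app version),
// capped at the initial quota from config.xml. For example, assuming
// a config.xml entry <daily_result_quota>100</daily_result_quota>,
// a host knocked down to probation climbs back 1 -> 2 -> 4 -> ... -> 100
// as it keeps returning successful results.
//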
static inline void got_good_result(SCHED_RESULT_ITEM& sri) {
    int gavid = generalized_app_version_id(sri.app_version_id, sri.appid);
    DB_HOST_APP_VERSION* havp = gavid_to_havp(gavid);
    if (!havp) {
        if (config.debug_handle_results) {
            log_messages.printf(MSG_NORMAL,
                "[handle] No app version for %d\n", gavid
            );
        }
        return;
    }
    if (havp->max_jobs_per_day < config.daily_result_quota) {
        int n = havp->max_jobs_per_day*2;
        if (n > config.daily_result_quota) {
            n = config.daily_result_quota;
        }
        if (config.debug_quota) {
            log_messages.printf(MSG_NORMAL,
                "[quota] increasing max_jobs_per_day for %d: %d->%d\n",
                gavid, havp->max_jobs_per_day, n
            );
        }
        havp->max_jobs_per_day = n;
    }
}
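
// The client reported an error (or aborted) result for this app version.
// Shrink the daily quota: clamp it to the initial config.xml quota,
// then decrement it (never below 1), and zero consecutive_valid,
// which re-imposes scale probation for this (host, app version).
//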
static inline void got_bad_result(SCHED_RESULT_ITEM& sri) {
    int gavid = generalized_app_version_id(sri.app_version_id, sri.appid);
    DB_HOST_APP_VERSION* havp = gavid_to_havp(gavid);
    if (!havp) {
        if (config.debug_handle_results) {
            log_messages.printf(MSG_NORMAL,
                "[handle] No app version for %d\n", gavid
            );
        }
        return;
    }

    int n = havp->max_jobs_per_day;
    if (n > config.daily_result_quota) {
        n = config.daily_result_quota;
    }
    n -= 1;
    if (n < 1) {
        n = 1;
    }
    if (config.debug_quota) {
        log_messages.printf(MSG_NORMAL,
            "[quota] decreasing max_jobs_per_day for %d: %d->%d\n",
            gavid, havp->max_jobs_per_day, n
        );
    }
    havp->max_jobs_per_day = n;

    havp->consecutive_valid = 0;
}

// handle completed results
//
int handle_results() {
    DB_SCHED_RESULT_ITEM_SET result_handler;
    SCHED_RESULT_ITEM* srip;
    unsigned int i;
    int retval;
    RESULT* rp;
    bool changed_host = false;

    if (g_request->results.size() == 0) return 0;

    // allow projects to limit the # of results handled
    // (in case of server memory limits)
    //
    if (config.report_max
        && (int)g_request->results.size() > config.report_max
    ) {
        g_request->results.resize(config.report_max);
    }
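    // (Assuming the usual config.xml convention, the cap above would be
    // set with e.g. <report_max>64</report_max>; the value 64 is purely
    // illustrative.)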

    // copy reported results to a separate vector, "result_handler",
    // initially with only the "name" field present
    //
    for (i=0; i<g_request->results.size(); i++) {
        result_handler.add_result(g_request->results[i].name);
    }

    // read results from database into "result_handler".
    //
    // Quantities that must be read from the DB are those
    // where srip (see below) appears as an rval.
    // These are: id, name, server_state, received_time, hostid, validate_state.
    //
    // Quantities that must be written to the DB are those for
    // which srip appears as an lval. These are:
    // hostid, teamid, received_time, client_state, cpu_time, exit_status,
    // app_version_num, claimed_credit, server_state, stderr_out,
    // xml_doc_out, outcome, validate_state, elapsed_time
    //
    retval = result_handler.enumerate();
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[HOST#%d] Batch query failed\n",
            g_reply->host.id
        );
    }
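    // Note: on failure we fall through rather than bail out; each
    // lookup_result() call below will then fail, and every reported
    // result is simply acked.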

    // loop over results reported by client
    //
    // A note about acks: we send an ack for result received if either
    // 1) there's some problem with it (wrong state, host, not in DB) or
    // 2) we update it successfully.
    // In other words, the only time we don't ack a result is when
    // it looks OK but the update failed.
    //
    for (i=0; i<g_request->results.size(); i++) {
        rp = &g_request->results[i];

        retval = result_handler.lookup_result(rp->name, &srip);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[HOST#%d] [RESULT#? %s] reported result not in DB\n",
                g_reply->host.id, rp->name
            );
            g_reply->result_acks.push_back(std::string(rp->name));
            continue;
        }

        if (config.debug_handle_results) {
            log_messages.printf(MSG_NORMAL,
                "[handle] [HOST#%d] [RESULT#%d] [WU#%d] got result (DB: server_state=%d outcome=%d client_state=%d validate_state=%d delete_state=%d)\n",
                g_reply->host.id, srip->id, srip->workunitid, srip->server_state,
                srip->outcome, srip->client_state, srip->validate_state,
                srip->file_delete_state
            );
        }

        // Do various sanity checks.
        // If one of them fails, set srip->id = 0,
        // which suppresses the DB update later on
        //

        // If result has server_state OVER
        //   if outcome NO_REPLY accept it (it's just late).
        //   else ignore it
        //
        if (srip->server_state == RESULT_SERVER_STATE_OVER) {
            const char *msg = NULL;
            switch (srip->outcome) {
            case RESULT_OUTCOME_INIT:
                // should never happen!
                msg = "this result was never sent";
                break;
            case RESULT_OUTCOME_SUCCESS:
                // don't replace a successful result!
                msg = "result already reported as success";

                // Client is reporting a result twice.
                // That could mean it didn't get the first reply.
                // That reply may have contained new jobs.
                // So make sure we resend lost jobs
                //
                g_wreq->resend_lost_results = true;
                break;
            case RESULT_OUTCOME_COULDNT_SEND:
                // should never happen!
                msg = "this result couldn't be sent";
                break;
            case RESULT_OUTCOME_CLIENT_ERROR:
                // should never happen!
                msg = "result already reported as error";
                break;
            case RESULT_OUTCOME_CLIENT_DETACHED:
            case RESULT_OUTCOME_NO_REPLY:
                // result is late in arriving, but keep it anyhow
                break;
            case RESULT_OUTCOME_DIDNT_NEED:
                // should never happen
                msg = "this result wasn't sent (not needed)";
                break;
            case RESULT_OUTCOME_VALIDATE_ERROR:
                // we already passed through the validator, so
                // don't keep the new result
                msg = "result already reported, validate error";
                break;
            default:
                msg = "server logic bug; please alert BOINC developers";
                break;
            }
            if (msg) {
                if (config.debug_handle_results) {
                    log_messages.printf(MSG_NORMAL,
                        "[handle] [HOST#%d] [RESULT#%d] [WU#%d] result already over [outcome=%d validate_state=%d]: %s\n",
                        g_reply->host.id, srip->id, srip->workunitid,
                        srip->outcome, srip->validate_state, msg
                    );
                }
                srip->id = 0;
                g_reply->result_acks.push_back(std::string(rp->name));
                continue;
            }
        }

        if (srip->server_state == RESULT_SERVER_STATE_UNSENT) {
            log_messages.printf(MSG_CRITICAL,
                "[HOST#%d] [RESULT#%d] [WU#%d] got unexpected result: server state is %d\n",
                g_reply->host.id, srip->id, srip->workunitid, srip->server_state
            );
            srip->id = 0;
            g_reply->result_acks.push_back(std::string(rp->name));
            continue;
        }

        if (srip->received_time) {
            log_messages.printf(MSG_CRITICAL,
                "[HOST#%d] [RESULT#%d] [WU#%d] already got result, at %s\n",
                g_reply->host.id, srip->id, srip->workunitid,
                time_to_string(srip->received_time)
            );
            srip->id = 0;
            g_reply->result_acks.push_back(std::string(rp->name));
            continue;
        }

        if (srip->hostid != g_reply->host.id) {
            log_messages.printf(MSG_CRITICAL,
                "[HOST#%d] [RESULT#%d] [WU#%d] got result from wrong host; expected [HOST#%d]\n",
                g_reply->host.id, srip->id, srip->workunitid, srip->hostid
            );
            DB_HOST result_host;
            retval = result_host.lookup_id(srip->hostid);

            if (retval) {
                log_messages.printf(MSG_CRITICAL,
                    "[RESULT#%d] [WU#%d] Can't lookup [HOST#%d]\n",
                    srip->id, srip->workunitid, srip->hostid
                );
                srip->id = 0;
                g_reply->result_acks.push_back(std::string(rp->name));
                continue;
            } else if (result_host.userid != g_reply->host.userid) {
                log_messages.printf(MSG_CRITICAL,
                    "[USER#%d] [HOST#%d] [RESULT#%d] [WU#%d] Not even the same user; expected [USER#%d]\n",
                    g_reply->host.userid, g_reply->host.id, srip->id, srip->workunitid, result_host.userid
                );
                srip->id = 0;
                g_reply->result_acks.push_back(std::string(rp->name));
                continue;
            } else {
                log_messages.printf(MSG_CRITICAL,
                    "[HOST#%d] [RESULT#%d] [WU#%d] Allowing result because same USER#%d\n",
                    g_reply->host.id, srip->id, srip->workunitid, g_reply->host.userid
                );
                changed_host = true;
            }
        } // hostids do not match

        // Modify the in-memory copy obtained from the DB earlier.
        // If we found a problem above,
        // we have continued and skipped this modify
        //
        srip->hostid = g_reply->host.id;
        srip->teamid = g_reply->user.teamid;
        srip->received_time = time(0);
        srip->client_state = rp->client_state;
        srip->cpu_time = rp->cpu_time;
        srip->elapsed_time = rp->elapsed_time;

        // check for impossible elapsed time
        //
        double turnaround_time = srip->received_time - srip->sent_time;
        if (turnaround_time < 0) {
            log_messages.printf(MSG_CRITICAL,
                "[HOST#%d] [RESULT#%d] [WU#%d] inconsistent sent/received times\n",
                srip->hostid, srip->id, srip->workunitid
            );
        } else {
            if (srip->elapsed_time > turnaround_time) {
                log_messages.printf(MSG_NORMAL,
                    "[HOST#%d] [RESULT#%d] [WU#%d] impossible elapsed time: reported %f > turnaround %f\n",
                    srip->hostid, srip->id, srip->workunitid,
                    srip->elapsed_time, turnaround_time
                );
                srip->elapsed_time = turnaround_time;
            }
        }

        // Some buggy clients sporadically report very low elapsed time
        // but actual CPU time.
        // Try to fix the elapsed time, since it's critical to credit
        //
        if (srip->elapsed_time < srip->cpu_time) {
            int avid = srip->app_version_id;
            if (avid > 0) {
                APP_VERSION* avp = ssp->lookup_app_version(avid);
                if (avp && !avp->is_multithread()) {
                    srip->elapsed_time = srip->cpu_time;
                }
            }
        }
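        // (Rationale: a single-threaded app cannot use more CPU time
        // than wall-clock time, so cpu_time is a safe lower bound on
        // elapsed_time; a multithreaded app can legitimately report
        // cpu_time > elapsed_time, so it is left alone.)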

        srip->exit_status = rp->exit_status;
        srip->app_version_num = rp->app_version_num;

        // TODO: this is outdated, and doesn't belong here
        //
        if (rp->fpops_cumulative || rp->intops_cumulative) {
            srip->claimed_credit = fpops_to_credit(rp->fpops_cumulative, rp->intops_cumulative);
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] claimed credit %.2f based on fpops_cumulative\n",
                    srip->id, srip->claimed_credit
                );
            }
        } else if (rp->fpops_per_cpu_sec || rp->intops_per_cpu_sec) {
            srip->claimed_credit = fpops_to_credit(
                rp->fpops_per_cpu_sec*srip->cpu_time,
                rp->intops_per_cpu_sec*srip->cpu_time
            );
            if (config.debug_credit) {
                log_messages.printf(MSG_NORMAL,
                    "[credit] [RESULT#%d] claimed credit %.2f based on fpops_per_cpu_sec\n",
                    srip->id, srip->claimed_credit
                );
            }
        } else {
            srip->claimed_credit = 0;
        }
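        // (fpops_to_credit() converts the claimed FP/integer op counts
        // into credit; as the TODO above notes, this path is outdated.)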

        srip->server_state = RESULT_SERVER_STATE_OVER;

        strlcpy(srip->stderr_out, rp->stderr_out, sizeof(srip->stderr_out));
        strlcpy(srip->xml_doc_out, rp->xml_doc_out, sizeof(srip->xml_doc_out));

        // look for exit status and app version in stderr_out
        // (historical - can be deleted at some point)
        //
        parse_int(srip->stderr_out, "<exit_status>", srip->exit_status);
        parse_int(srip->stderr_out, "<app_version>", srip->app_version_num);

        if ((srip->client_state == RESULT_FILES_UPLOADED) && (srip->exit_status == 0)) {
            srip->outcome = RESULT_OUTCOME_SUCCESS;
            if (config.debug_handle_results) {
                log_messages.printf(MSG_NORMAL,
                    "[handle] [RESULT#%d] [WU#%d]: setting outcome SUCCESS\n",
                    srip->id, srip->workunitid
                );
            }
            got_good_result(*srip);

            if (config.dont_store_success_stderr) {
                strcpy(srip->stderr_out, "");
            }
        } else {
            if (config.debug_handle_results) {
                log_messages.printf(MSG_NORMAL,
                    "[handle] [RESULT#%d] [WU#%d]: client_state %d exit_status %d; setting outcome ERROR\n",
                    srip->id, srip->workunitid, srip->client_state, srip->exit_status
                );
            }
            srip->outcome = RESULT_OUTCOME_CLIENT_ERROR;
            srip->validate_state = VALIDATE_STATE_INVALID;

            // adjust quota and reset error rate
            //
            got_bad_result(*srip);
        }
    } // loop over all incoming results

    // Update the result records
    // (skip items that we previously marked to skip)
    //
    for (i=0; i<result_handler.results.size(); i++) {
        SCHED_RESULT_ITEM& sri = result_handler.results[i];
        if (sri.id == 0) continue;
        retval = result_handler.update_result(sri);
        if (retval) {
            log_messages.printf(MSG_CRITICAL,
                "[HOST#%d] [RESULT#%d] [WU#%d] can't update result: %s\n",
                g_reply->host.id, sri.id, sri.workunitid, boinc_db.error_string()
            );
        } else {
            g_reply->result_acks.push_back(std::string(sri.name));
        }
    }

    // set transition_time for the results' WUs
    //
    retval = result_handler.update_workunits();
    if (retval) {
        log_messages.printf(MSG_CRITICAL,
            "[HOST#%d] can't update WUs: %s\n",
            g_reply->host.id, boincerror(retval)
        );
    }
    return 0;
}