2008-08-06 18:36:30 +00:00
|
|
|
// This file is part of BOINC.
|
2008-02-21 00:47:50 +00:00
|
|
|
// http://boinc.berkeley.edu
|
|
|
|
// Copyright (C) 2008 University of California
|
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
2008-02-21 00:47:50 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
2008-02-21 00:47:50 +00:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
2008-02-21 00:47:50 +00:00
|
|
|
|
|
|
|
#include "boinc_db.h"
|
|
|
|
#include "str_util.h"
|
2009-06-16 20:54:44 +00:00
|
|
|
#include "str_replace.h"
|
2008-02-21 00:47:50 +00:00
|
|
|
#include "parse.h"
|
|
|
|
|
2009-08-10 04:49:02 +00:00
|
|
|
#include "credit.h"
|
|
|
|
#include "sched_types.h"
|
2008-02-21 00:47:50 +00:00
|
|
|
#include "sched_msgs.h"
|
|
|
|
#include "sched_util.h"
|
2008-05-02 17:48:29 +00:00
|
|
|
#include "sched_config.h"
|
2008-02-21 00:47:50 +00:00
|
|
|
|
|
|
|
#include "sched_result.h"
|
|
|
|
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// Reward a successfully returned job:
// double the daily job quota of the (host, app version) record
// that the result belongs to, never exceeding config.daily_result_quota.
// If no record is found for the result, nothing is changed
// (a message is logged when debug_handle_results is set).
//
static inline void got_good_result(SCHED_RESULT_ITEM& sri) {
    const int version_key = generalized_app_version_id(
        sri.app_version_id, sri.appid
    );
    DB_HOST_APP_VERSION* const hav = gavid_to_havp(version_key);

    if (hav) {
        hav->max_jobs_per_day *= 2;
        if (hav->max_jobs_per_day > config.daily_result_quota) {
            // clamp to the project-wide ceiling
            hav->max_jobs_per_day = config.daily_result_quota;
        }
        return;
    }

    // no matching record: nothing to update
    //
    if (config.debug_handle_results) {
        log_messages.printf(MSG_NORMAL,
            "[handle] No app version for %d\n", version_key
        );
    }
}
|
|
|
|
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// Penalize an errored job:
// - unless the job was aborted, lower the daily job quota of the
//   (host, app version) record by one, but never below 1;
// - in every case, call host_scale_probation() on the record,
//   passing the workunit's delay bound.
// If no record is found for the result, nothing is changed
// (a message is logged when debug_handle_results is set).
//
static inline void got_bad_result(SCHED_RESULT_ITEM& sri, double delay_bound) {
    const int version_key = generalized_app_version_id(
        sri.app_version_id, sri.appid
    );
    DB_HOST_APP_VERSION* const hav = gavid_to_havp(version_key);

    if (hav == NULL) {
        if (config.debug_handle_results) {
            log_messages.printf(MSG_NORMAL,
                "[handle] No app version for %d\n", version_key
            );
        }
        return;
    }

    // an aborted job (possibly aborted by the client scheduler)
    // costs no quota
    //
    const bool was_aborted = (sri.client_state == RESULT_ABORTED);
    if (!was_aborted) {
        --hav->max_jobs_per_day;
        if (hav->max_jobs_per_day < 1) {
            hav->max_jobs_per_day = 1;
        }
    }

    // scale probation applies whether or not the job was aborted
    //
    host_scale_probation(*hav, delay_bound);
}
|
|
|
|
|
2008-02-21 00:47:50 +00:00
|
|
|
// handle completed results
|
|
|
|
//
|
2008-12-19 18:14:02 +00:00
|
|
|
int handle_results() {
|
2008-02-21 00:47:50 +00:00
|
|
|
DB_SCHED_RESULT_ITEM_SET result_handler;
|
|
|
|
SCHED_RESULT_ITEM* srip;
|
|
|
|
unsigned int i;
|
|
|
|
int retval;
|
|
|
|
RESULT* rp;
|
|
|
|
bool changed_host=false;
|
|
|
|
|
2008-12-19 18:14:02 +00:00
|
|
|
if (g_request->results.size() == 0) return 0;
|
2008-02-21 00:47:50 +00:00
|
|
|
|
2009-05-14 19:01:40 +00:00
|
|
|
// allow projects to limit the # of results handled
|
|
|
|
// (in case of server memory limits)
|
|
|
|
//
|
|
|
|
if (config.report_max && g_request->results.size() > config.report_max) {
|
|
|
|
g_request->results.resize(config.report_max);
|
|
|
|
}
|
|
|
|
|
2008-02-21 00:47:50 +00:00
|
|
|
// copy reported results to a separate vector, "result_handler",
|
|
|
|
// initially with only the "name" field present
|
|
|
|
//
|
2008-12-19 18:14:02 +00:00
|
|
|
for (i=0; i<g_request->results.size(); i++) {
|
|
|
|
result_handler.add_result(g_request->results[i].name);
|
2008-02-21 00:47:50 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// read results from database into "result_handler".
|
2009-09-03 20:26:31 +00:00
|
|
|
//
|
2008-02-21 00:47:50 +00:00
|
|
|
// Quantities that must be read from the DB are those
|
|
|
|
// where srip (see below) appears as an rval.
|
|
|
|
// These are: id, name, server_state, received_time, hostid, validate_state.
|
2009-09-03 20:26:31 +00:00
|
|
|
//
|
2008-02-21 00:47:50 +00:00
|
|
|
// Quantities that must be written to the DB are those for
|
|
|
|
// which srip appears as an lval. These are:
|
|
|
|
// hostid, teamid, received_time, client_state, cpu_time, exit_status,
|
|
|
|
// app_version_num, claimed_credit, server_state, stderr_out,
|
2009-09-03 20:26:31 +00:00
|
|
|
// xml_doc_out, outcome, validate_state, elapsed_time
|
2008-02-21 00:47:50 +00:00
|
|
|
//
|
|
|
|
retval = result_handler.enumerate();
|
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2008-02-21 00:47:50 +00:00
|
|
|
"[HOST#%d] Batch query failed\n",
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->host.id
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
// loop over results reported by client
|
|
|
|
//
|
|
|
|
// A note about acks: we send an ack for result received if either
|
|
|
|
// 1) there's some problem with it (wrong state, host, not in DB) or
|
|
|
|
// 2) we update it successfully.
|
|
|
|
// In other words, the only time we don't ack a result is when
|
|
|
|
// it looks OK but the update failed.
|
|
|
|
//
|
2008-12-19 18:14:02 +00:00
|
|
|
for (i=0; i<g_request->results.size(); i++) {
|
|
|
|
rp = &g_request->results[i];
|
2008-02-21 00:47:50 +00:00
|
|
|
|
|
|
|
retval = result_handler.lookup_result(rp->name, &srip);
|
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2008-02-21 00:47:50 +00:00
|
|
|
"[HOST#%d] [RESULT#? %s] can't find result\n",
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->host.id, rp->name
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->result_acks.push_back(std::string(rp->name));
|
2008-02-21 00:47:50 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2008-05-02 17:48:29 +00:00
|
|
|
if (config.debug_handle_results) {
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[handle] [HOST#%d] [RESULT#%d] [WU#%d] got result (DB: server_state=%d outcome=%d client_state=%d validate_state=%d delete_state=%d)\n",
|
|
|
|
g_reply->host.id, srip->id, srip->workunitid, srip->server_state,
|
2008-05-02 17:48:29 +00:00
|
|
|
srip->outcome, srip->client_state, srip->validate_state,
|
|
|
|
srip->file_delete_state
|
|
|
|
);
|
|
|
|
}
|
2008-02-21 00:47:50 +00:00
|
|
|
|
|
|
|
// Do various sanity checks.
|
|
|
|
// If one of them fails, set srip->id = 0,
|
|
|
|
// which suppresses the DB update later on
|
|
|
|
//
|
|
|
|
|
|
|
|
// If result has server_state OVER
|
|
|
|
// if outcome NO_REPLY accept it (it's just late).
|
|
|
|
// else ignore it
|
|
|
|
//
|
|
|
|
if (srip->server_state == RESULT_SERVER_STATE_OVER) {
|
|
|
|
const char *dont_replace_result = NULL;
|
|
|
|
switch (srip->outcome) {
|
|
|
|
case RESULT_OUTCOME_INIT:
|
|
|
|
// should never happen!
|
|
|
|
dont_replace_result = "this result was never sent";
|
|
|
|
break;
|
|
|
|
case RESULT_OUTCOME_SUCCESS:
|
|
|
|
// don't replace a successful result!
|
|
|
|
dont_replace_result = "result already reported as success";
|
|
|
|
break;
|
|
|
|
case RESULT_OUTCOME_COULDNT_SEND:
|
|
|
|
// should never happen!
|
|
|
|
dont_replace_result = "this result couldn't be sent";
|
|
|
|
break;
|
|
|
|
case RESULT_OUTCOME_CLIENT_ERROR:
|
|
|
|
// should never happen!
|
|
|
|
dont_replace_result = "result already reported as error";
|
|
|
|
break;
|
|
|
|
case RESULT_OUTCOME_CLIENT_DETACHED:
|
|
|
|
case RESULT_OUTCOME_NO_REPLY:
|
|
|
|
// result is late in arriving, but keep it anyhow
|
|
|
|
break;
|
|
|
|
case RESULT_OUTCOME_DIDNT_NEED:
|
|
|
|
// should never happen
|
|
|
|
dont_replace_result = "this result wasn't sent (not needed)";
|
|
|
|
break;
|
|
|
|
case RESULT_OUTCOME_VALIDATE_ERROR:
|
|
|
|
// we already passed through the validator, so
|
|
|
|
// don't keep the new result
|
|
|
|
dont_replace_result = "result already reported, validate error";
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
dont_replace_result = "server logic bug; please alert BOINC developers";
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (dont_replace_result) {
|
|
|
|
char buf[256];
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[HOST#%d] [RESULT#%d] [WU#%d] result already over [outcome=%d validate_state=%d]: %s\n",
|
|
|
|
g_reply->host.id, srip->id, srip->workunitid, srip->outcome,
|
2008-02-21 21:00:58 +00:00
|
|
|
srip->validate_state, dont_replace_result
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
sprintf(buf, "Completed result %s refused: %s", srip->name, dont_replace_result);
|
2009-08-21 19:14:15 +00:00
|
|
|
g_reply->insert_message(buf, "high");
|
2008-02-21 00:47:50 +00:00
|
|
|
srip->id = 0;
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->result_acks.push_back(std::string(rp->name));
|
2008-02-21 00:47:50 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (srip->server_state == RESULT_SERVER_STATE_UNSENT) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[HOST#%d] [RESULT#%d] [WU#%d] got unexpected result: server state is %d\n",
|
|
|
|
g_reply->host.id, srip->id, srip->workunitid, srip->server_state
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
srip->id = 0;
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->result_acks.push_back(std::string(rp->name));
|
2008-02-21 00:47:50 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (srip->received_time) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[HOST#%d] [RESULT#%d] [WU#%d] already got result, at %s \n",
|
|
|
|
g_reply->host.id, srip->id, srip->workunitid,
|
2008-05-02 17:48:29 +00:00
|
|
|
time_to_string(srip->received_time)
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
srip->id = 0;
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->result_acks.push_back(std::string(rp->name));
|
2008-02-21 00:47:50 +00:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2008-12-19 18:14:02 +00:00
|
|
|
if (srip->hostid != g_reply->host.id) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[HOST#%d] [RESULT#%d] [WU#%d] got result from wrong host; expected [HOST#%d]\n",
|
|
|
|
g_reply->host.id, srip->id, srip->workunitid, srip->hostid
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
DB_HOST result_host;
|
|
|
|
retval = result_host.lookup_id(srip->hostid);
|
|
|
|
|
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[RESULT#%d] [WU#%d] Can't lookup [HOST#%d]\n",
|
|
|
|
srip->id, srip->workunitid, srip->hostid
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
srip->id = 0;
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->result_acks.push_back(std::string(rp->name));
|
2008-02-21 00:47:50 +00:00
|
|
|
continue;
|
2008-12-19 18:14:02 +00:00
|
|
|
} else if (result_host.userid != g_reply->host.userid) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[USER#%d] [HOST#%d] [RESULT#%d] [WU#%d] Not even the same user; expected [USER#%d]\n",
|
|
|
|
g_reply->host.userid, g_reply->host.id, srip->id, srip->workunitid, result_host.userid
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
srip->id = 0;
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->result_acks.push_back(std::string(rp->name));
|
2008-02-21 00:47:50 +00:00
|
|
|
continue;
|
|
|
|
} else {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[HOST#%d] [RESULT#%d] [WU#%d] Allowing result because same USER#%d\n",
|
|
|
|
g_reply->host.id, srip->id, srip->workunitid, g_reply->host.userid
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
changed_host = true;
|
|
|
|
}
|
|
|
|
} // hostids do not match
|
|
|
|
|
|
|
|
// Modify the in-memory copy obtained from the DB earlier.
|
|
|
|
// If we found a problem above,
|
|
|
|
// we have continued and skipped this modify
|
|
|
|
//
|
2008-12-19 18:14:02 +00:00
|
|
|
srip->hostid = g_reply->host.id;
|
|
|
|
srip->teamid = g_reply->user.teamid;
|
2008-02-21 00:47:50 +00:00
|
|
|
srip->received_time = time(0);
|
|
|
|
srip->client_state = rp->client_state;
|
|
|
|
srip->cpu_time = rp->cpu_time;
|
2009-09-03 20:26:31 +00:00
|
|
|
srip->elapsed_time = rp->elapsed_time;
|
2008-02-21 00:47:50 +00:00
|
|
|
|
|
|
|
// check for impossible CPU time
|
|
|
|
//
|
2009-09-03 20:26:31 +00:00
|
|
|
double turnaround_time = srip->received_time - srip->sent_time;
|
|
|
|
if (turnaround_time < 0) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[HOST#%d] [RESULT#%d] [WU#%d] inconsistent sent/received times\n", srip->hostid, srip->id, srip->workunitid
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
} else {
|
2009-09-03 20:26:31 +00:00
|
|
|
if (srip->elapsed_time > turnaround_time) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-09-03 20:26:31 +00:00
|
|
|
"[HOST#%d] [RESULT#%d] [WU#%d] excessive elapsed time: reported %f > elapsed %f%s\n",
|
|
|
|
srip->hostid, srip->id, srip->workunitid,
|
|
|
|
srip->elapsed_time, turnaround_time,
|
|
|
|
changed_host?" [OK: HOST changed]":""
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
srip->exit_status = rp->exit_status;
|
|
|
|
srip->app_version_num = rp->app_version_num;
|
2009-09-03 20:26:31 +00:00
|
|
|
|
|
|
|
// TODO: this is outdated, and doesn't belong here
|
|
|
|
|
2008-02-21 00:47:50 +00:00
|
|
|
if (rp->fpops_cumulative || rp->intops_cumulative) {
|
|
|
|
srip->claimed_credit = fpops_to_credit(rp->fpops_cumulative, rp->intops_cumulative);
|
|
|
|
} else if (rp->fpops_per_cpu_sec || rp->intops_per_cpu_sec) {
|
|
|
|
srip->claimed_credit = fpops_to_credit(
|
|
|
|
rp->fpops_per_cpu_sec*srip->cpu_time,
|
|
|
|
rp->intops_per_cpu_sec*srip->cpu_time
|
|
|
|
);
|
|
|
|
} else {
|
2008-12-19 18:14:02 +00:00
|
|
|
srip->claimed_credit = srip->cpu_time * g_reply->host.claimed_credit_per_cpu_sec;
|
2008-02-21 00:47:50 +00:00
|
|
|
}
|
2008-08-06 23:30:22 +00:00
|
|
|
|
|
|
|
if (config.use_credit_multiplier) {
|
|
|
|
// Regardless of the method of claiming credit,
|
|
|
|
// multiply by the application's credit multiplier
|
|
|
|
// at the time of result creation.
|
|
|
|
//
|
|
|
|
srip->claimed_credit *= credit_multiplier(srip->appid,srip->sent_time);
|
|
|
|
}
|
|
|
|
|
2008-05-02 17:48:29 +00:00
|
|
|
if (config.debug_handle_results) {
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[handle] cpu time %f credit/sec %f, claimed credit %f\n", srip->cpu_time, g_reply->host.claimed_credit_per_cpu_sec, srip->claimed_credit
|
2008-05-02 17:48:29 +00:00
|
|
|
);
|
|
|
|
}
|
2008-02-21 00:47:50 +00:00
|
|
|
srip->server_state = RESULT_SERVER_STATE_OVER;
|
|
|
|
|
|
|
|
strlcpy(srip->stderr_out, rp->stderr_out, sizeof(srip->stderr_out));
|
|
|
|
strlcpy(srip->xml_doc_out, rp->xml_doc_out, sizeof(srip->xml_doc_out));
|
|
|
|
|
|
|
|
// look for exit status and app version in stderr_out
|
|
|
|
// (historical - can be deleted at some point)
|
|
|
|
//
|
|
|
|
parse_int(srip->stderr_out, "<exit_status>", srip->exit_status);
|
|
|
|
parse_int(srip->stderr_out, "<app_version>", srip->app_version_num);
|
|
|
|
|
|
|
|
if ((srip->client_state == RESULT_FILES_UPLOADED) && (srip->exit_status == 0)) {
|
|
|
|
srip->outcome = RESULT_OUTCOME_SUCCESS;
|
2008-05-02 17:48:29 +00:00
|
|
|
if (config.debug_handle_results) {
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[handle] [RESULT#%d] [WU#%d]: setting outcome SUCCESS\n",
|
|
|
|
srip->id, srip->workunitid
|
2008-05-02 17:48:29 +00:00
|
|
|
);
|
|
|
|
}
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
got_good_result(*srip);
|
2009-06-30 18:00:58 +00:00
|
|
|
|
|
|
|
if (config.dont_store_success_stderr) {
|
|
|
|
strcpy(srip->stderr_out, "");
|
|
|
|
}
|
2008-02-21 00:47:50 +00:00
|
|
|
} else {
|
2008-05-02 17:48:29 +00:00
|
|
|
if (config.debug_handle_results) {
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[handle] [RESULT#%d] [WU#%d]: client_state %d exit_status %d; setting outcome ERROR\n",
|
|
|
|
srip->id, srip->workunitid, srip->client_state, srip->exit_status
|
2008-05-02 17:48:29 +00:00
|
|
|
);
|
|
|
|
}
|
2008-02-21 00:47:50 +00:00
|
|
|
srip->outcome = RESULT_OUTCOME_CLIENT_ERROR;
|
|
|
|
srip->validate_state = VALIDATE_STATE_INVALID;
|
2009-01-20 00:54:16 +00:00
|
|
|
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
// penalize result quota
|
2009-01-20 00:54:16 +00:00
|
|
|
//
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
DB_WORKUNIT wu;
|
|
|
|
int delay_bound;
|
|
|
|
wu.id = srip->workunitid;
|
|
|
|
retval = wu.get_field_int("delay_bound", delay_bound);
|
|
|
|
if (!retval) {
|
|
|
|
got_bad_result(*srip, (double) delay_bound);
|
2009-01-20 00:54:16 +00:00
|
|
|
}
|
2008-02-21 00:47:50 +00:00
|
|
|
}
|
|
|
|
} // loop over all incoming results
|
|
|
|
|
|
|
|
// Update the result records
|
|
|
|
// (skip items that we previously marked to skip)
|
|
|
|
//
|
|
|
|
for (i=0; i<result_handler.results.size(); i++) {
|
|
|
|
SCHED_RESULT_ITEM& sri = result_handler.results[i];
|
|
|
|
if (sri.id == 0) continue;
|
|
|
|
retval = result_handler.update_result(sri);
|
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2009-06-10 17:42:18 +00:00
|
|
|
"[HOST#%d] [RESULT#%d] [WU#%d] can't update result: %s\n",
|
|
|
|
g_reply->host.id, sri.id, sri.workunitid, boinc_db.error_string()
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
} else {
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->result_acks.push_back(std::string(sri.name));
|
2008-02-21 00:47:50 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// set transition_time for the results' WUs
|
|
|
|
//
|
|
|
|
retval = result_handler.update_workunits();
|
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2008-02-21 00:47:50 +00:00
|
|
|
"[HOST#%d] can't update WUs: %d\n",
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->host.id, retval
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|