2008-08-06 18:36:30 +00:00
|
|
|
// This file is part of BOINC.
|
2008-02-21 00:47:50 +00:00
|
|
|
// http://boinc.berkeley.edu
|
|
|
|
// Copyright (C) 2008 University of California
|
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is free software; you can redistribute it and/or modify it
|
|
|
|
// under the terms of the GNU Lesser General Public License
|
|
|
|
// as published by the Free Software Foundation,
|
|
|
|
// either version 3 of the License, or (at your option) any later version.
|
2008-02-21 00:47:50 +00:00
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// BOINC is distributed in the hope that it will be useful,
|
2008-02-21 00:47:50 +00:00
|
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
|
|
// See the GNU Lesser General Public License for more details.
|
|
|
|
//
|
2008-08-06 18:36:30 +00:00
|
|
|
// You should have received a copy of the GNU Lesser General Public License
|
|
|
|
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
|
2009-01-13 23:06:02 +00:00
|
|
|
//
|
2011-11-16 19:47:40 +00:00
|
|
|
|
|
|
|
// scheduler functions to send assigned jobs.
|
|
|
|
|
2009-01-13 23:06:02 +00:00
|
|
|
#include "config.h"
|
|
|
|
|
|
|
|
#ifdef _USING_FCGI_
|
|
|
|
#include "boinc_fcgi.h"
|
|
|
|
#else
|
|
|
|
#include <cstdio>
|
|
|
|
#endif
|
2008-02-21 00:47:50 +00:00
|
|
|
|
2008-03-07 21:13:01 +00:00
|
|
|
#include <unistd.h>
|
|
|
|
|
2008-02-21 00:47:50 +00:00
|
|
|
#include "boinc_db.h"
|
|
|
|
#include "crypt.h"
|
|
|
|
#include "backend_lib.h"
|
|
|
|
#include "error_numbers.h"
|
|
|
|
|
2009-08-10 04:49:02 +00:00
|
|
|
#include "sched_main.h"
|
2008-02-21 00:47:50 +00:00
|
|
|
#include "sched_msgs.h"
|
|
|
|
#include "sched_send.h"
|
2009-03-19 16:35:35 +00:00
|
|
|
#include "sched_version.h"
|
2009-08-10 04:49:02 +00:00
|
|
|
#include "sched_types.h"
|
2008-02-21 00:47:50 +00:00
|
|
|
|
|
|
|
#include "sched_assign.h"
|
|
|
|
|
2012-01-30 22:39:13 +00:00
|
|
|
// send a job for the given assignment
|
|
|
|
//
|
2008-12-19 18:14:02 +00:00
|
|
|
static int send_assigned_job(ASSIGNMENT& asg) {
|
2008-02-21 00:47:50 +00:00
|
|
|
int retval;
|
|
|
|
DB_WORKUNIT wu;
|
2012-01-30 22:39:13 +00:00
|
|
|
char suffix[256], path[256];
|
2009-05-07 13:54:51 +00:00
|
|
|
const char *rtfpath;
|
2008-02-21 00:47:50 +00:00
|
|
|
static bool first=true;
|
|
|
|
static int seqno=0;
|
|
|
|
static R_RSA_PRIVATE_KEY key;
|
2008-03-27 18:25:29 +00:00
|
|
|
BEST_APP_VERSION* bavp;
|
2008-02-21 00:47:50 +00:00
|
|
|
|
|
|
|
if (first) {
|
|
|
|
first = false;
|
|
|
|
sprintf(path, "%s/upload_private", config.key_dir);
|
|
|
|
retval = read_key_file(path, key);
|
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL, "can't read key\n");
|
2008-02-21 00:47:50 +00:00
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
retval = wu.lookup_id(asg.workunitid);
|
|
|
|
if (retval) {
|
2008-02-27 22:26:37 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
|
|
|
"assigned WU %d not found\n", asg.workunitid
|
|
|
|
);
|
2008-02-21 00:47:50 +00:00
|
|
|
return retval;
|
|
|
|
}
|
2008-03-27 18:25:29 +00:00
|
|
|
|
- server: change the following from per-host to per-(host, app version):
- daily quota mechanism
- reliable mechanism (accelerated retries)
- "trusted" mechanism (adaptive replication)
- scheduler: enforce host scale probation only for apps with
host_scale_check set.
- validator: do scale probation on invalid results
(need this in addition to error and timeout cases)
- feeder: update app version scales every 10 min, not 10 sec
- back-end apps: support --foo as well as -foo for options
Notes:
- If you have, say, cuda, cuda23 and cuda_fermi plan classes,
a host will have separate quotas for each one.
That means it could error out on 100 jobs for cuda_fermi,
and when its quota goes to zero,
error out on 100 jobs for cuda23, etc.
This is intentional; there may be cases where one version
works but not the others.
- host.error_rate and host.max_results_day are deprecated
TODO:
- the values in the app table for limits on jobs in progress etc.
should override rather than config.xml.
Implementation notes:
scheduler:
process_request():
read all host_app_versions for host at start;
Compute "reliable" and "trusted" for each one.
write modified records at end
get_app_version():
add "reliable_only" arg; if set, use only reliable versions
skip over-quota versions
Multi-pass scheduling: if have at least one reliable version,
do a pass for jobs that need reliable,
and use only reliable versions.
Then clear best_app_versions cache.
Score-based scheduling: for need-reliable jobs,
it will pick the fastest version,
then give a score bonus if that version happens to be reliable.
When get back a successful result from client:
increase daily quota
When get back an error result from client:
impose scale probation
decrease daily quota if not aborted
Validator:
when handling a WU, create a vector of HOST_APP_VERSION
parallel to vector of RESULT.
Pass it to assign_credit_set().
Make copies of originals so we can update only modified ones
update HOST_APP_VERSION error rates
Transitioner:
decrease quota on timeout
svn path=/trunk/boinc/; revision=21181
2010-04-15 03:13:56 +00:00
|
|
|
bavp = get_app_version(wu, false, false);
|
2008-03-27 18:25:29 +00:00
|
|
|
if (!bavp) {
|
2008-02-27 22:26:37 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
|
|
|
"App version for assigned WU not found\n"
|
|
|
|
);
|
2008-02-21 00:47:50 +00:00
|
|
|
return ERR_NOT_FOUND;
|
|
|
|
}
|
|
|
|
|
2009-05-07 13:54:51 +00:00
|
|
|
rtfpath = config.project_path("%s", wu.result_template_file);
|
- server code: at some point I made a global var "SCHED_CONFIG config",
mostly so that the parse function could assume
that everything was initially zero.
However, various back-end functions pass around SCHED_CONFIG&
as an argument (also named "config").
This creates a shadow, which is always bad.
Worse is the possibility that some projects have back-end programs
that have a SCHED_CONFIG variable that's automatic,
and therefore isn't zero initially,
and therefore isn't parsing correctly.
To fix this, I changed the 2 vectors in SCHED_CONFIG into pointers,
and have the parse routine zero the structure.
I was tempted to remove the SCHED_CONFIG& args to back-end functions,
but this would have broken some projects' code.
I did, however, change the name from config to config_loc
to avoid shadowing.
Also fixed various other compiler warnings.
svn path=/trunk/boinc/; revision=15541
2008-07-02 17:24:53 +00:00
|
|
|
sprintf(suffix, "%d_%d_%d", getpid(), (int)time(0), seqno++);
|
2011-09-07 17:37:50 +00:00
|
|
|
retval = create_result(
|
|
|
|
wu, const_cast<char*>(rtfpath), suffix, key, config, 0, 0
|
|
|
|
);
|
2008-02-21 00:47:50 +00:00
|
|
|
if (retval) {
|
2008-02-21 21:00:58 +00:00
|
|
|
log_messages.printf(MSG_CRITICAL,
|
2010-11-08 17:51:57 +00:00
|
|
|
"[WU#%d %s] create_result(): %s\n", wu.id, wu.name, boincerror(retval)
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
int result_id = boinc_db.insert_id();
|
2011-06-06 03:40:42 +00:00
|
|
|
SCHED_DB_RESULT result;
|
2008-02-21 00:47:50 +00:00
|
|
|
retval = result.lookup_id(result_id);
|
2009-03-03 16:38:54 +00:00
|
|
|
add_result_to_reply(result, wu, bavp, false);
|
2008-02-21 00:47:50 +00:00
|
|
|
|
2008-08-19 03:00:17 +00:00
|
|
|
if (config.debug_assignment) {
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
|
|
|
"[assign] [WU#%d] [RESULT#%d] [HOST#%d] send assignment %d\n",
|
2008-12-19 18:14:02 +00:00
|
|
|
wu.id, result_id, g_reply->host.id, asg.id
|
2008-08-19 03:00:17 +00:00
|
|
|
);
|
|
|
|
}
|
2008-02-21 00:47:50 +00:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-01-30 22:39:13 +00:00
|
|
|
// Send this host any "multi" assigned jobs.
|
2008-02-21 00:47:50 +00:00
|
|
|
// Return true iff we sent anything
|
|
|
|
//
|
2012-01-30 22:39:13 +00:00
|
|
|
bool send_assigned_jobs_multi() {
|
2008-02-21 00:47:50 +00:00
|
|
|
DB_RESULT result;
|
|
|
|
int retval;
|
|
|
|
char buf[256];
|
|
|
|
bool sent_something = false;
|
|
|
|
|
|
|
|
for (int i=0; i<ssp->nassignments; i++) {
|
|
|
|
ASSIGNMENT& asg = ssp->assignments[i];
|
|
|
|
|
2008-03-18 21:22:44 +00:00
|
|
|
if (config.debug_assignment) {
|
2009-01-15 20:23:20 +00:00
|
|
|
log_messages.printf(MSG_NORMAL,
|
2012-01-30 22:39:13 +00:00
|
|
|
"[assign] processing multi assignment type %d\n",
|
|
|
|
asg.target_type
|
2008-03-18 21:22:44 +00:00
|
|
|
);
|
|
|
|
}
|
2008-02-21 00:47:50 +00:00
|
|
|
// see if this assignment applies to this host
|
|
|
|
//
|
|
|
|
switch (asg.target_type) {
|
|
|
|
case ASSIGN_NONE:
|
2008-02-25 18:05:04 +00:00
|
|
|
sprintf(buf, "where hostid=%d and workunitid=%d",
|
2008-12-19 18:14:02 +00:00
|
|
|
g_reply->host.id, asg.workunitid
|
2008-02-21 00:47:50 +00:00
|
|
|
);
|
|
|
|
retval = result.lookup(buf);
|
2008-02-27 22:26:37 +00:00
|
|
|
if (retval == ERR_DB_NOT_FOUND) {
|
2008-12-19 18:14:02 +00:00
|
|
|
retval = send_assigned_job(asg);
|
2008-02-21 00:47:50 +00:00
|
|
|
if (!retval) sent_something = true;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case ASSIGN_USER:
|
2008-12-19 18:14:02 +00:00
|
|
|
if (g_reply->user.id != asg.target_id) continue;
|
2012-01-30 22:39:13 +00:00
|
|
|
sprintf(buf, "where workunitid=%d and hostid=%d",
|
|
|
|
asg.workunitid, g_reply->host.id
|
|
|
|
);
|
2008-02-21 00:47:50 +00:00
|
|
|
retval = result.lookup(buf);
|
2008-02-27 22:26:37 +00:00
|
|
|
if (retval == ERR_DB_NOT_FOUND) {
|
2008-12-19 18:14:02 +00:00
|
|
|
retval = send_assigned_job(asg);
|
2008-02-21 00:47:50 +00:00
|
|
|
if (!retval) sent_something = true;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case ASSIGN_TEAM:
|
2008-12-19 18:14:02 +00:00
|
|
|
if (g_reply->team.id != asg.target_id) continue;
|
2012-01-30 22:39:13 +00:00
|
|
|
sprintf(buf, "where workunitid=%d and hostid=%d", asg.workunitid, g_reply->host.id);
|
2008-02-21 00:47:50 +00:00
|
|
|
retval = result.lookup(buf);
|
2008-02-27 22:26:37 +00:00
|
|
|
if (retval == ERR_DB_NOT_FOUND) {
|
2008-12-19 18:14:02 +00:00
|
|
|
retval = send_assigned_job(asg);
|
2008-02-21 00:47:50 +00:00
|
|
|
if (!retval) sent_something = true;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return sent_something;
|
|
|
|
}
|
2012-01-30 22:39:13 +00:00
|
|
|
|
|
|
|
// send non-multi assigned jobs
|
|
|
|
//
|
|
|
|
bool send_assigned_jobs() {
|
|
|
|
DB_ASSIGNMENT asg;
|
|
|
|
DB_RESULT result;
|
|
|
|
DB_WORKUNIT wu;
|
|
|
|
bool sent_something = false;
|
|
|
|
int retval;
|
|
|
|
|
|
|
|
// for now, only look for user assignments
|
|
|
|
//
|
|
|
|
char buf[256];
|
|
|
|
sprintf(buf, "target_type=%d and target_id=%d and multi=0",
|
|
|
|
ASSIGN_USER, g_reply->user.id
|
|
|
|
);
|
|
|
|
while (asg.enumerate(buf)) {
|
|
|
|
if (!work_needed(false)) continue;
|
|
|
|
|
|
|
|
// if the WU doesn't exist, delete the assignment record.
|
|
|
|
//
|
|
|
|
retval = wu.lookup_id(asg.workunitid);
|
|
|
|
if (retval) {
|
|
|
|
asg.delete_from_db();
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// don't send if WU is validation pending or completed,
|
|
|
|
// or has transition pending
|
|
|
|
//
|
|
|
|
if (wu.need_validate) continue;
|
|
|
|
if (wu.canonical_resultid) continue;
|
|
|
|
if (wu.transition_time < time(0)) continue;
|
|
|
|
|
|
|
|
// don't send if we already sent one to this host
|
|
|
|
//
|
|
|
|
sprintf(buf, "where workunitid=%d and hostid=%d",
|
|
|
|
asg.workunitid,
|
|
|
|
g_request->host.id
|
|
|
|
);
|
|
|
|
retval = result.lookup(buf);
|
|
|
|
if (retval != ERR_DB_NOT_FOUND) continue;
|
|
|
|
|
|
|
|
// don't send if there's already one in progress to this user
|
|
|
|
//
|
|
|
|
sprintf(buf,
|
|
|
|
"where workunitid=%d and userid=%d and server_state=%d",
|
|
|
|
asg.workunitid,
|
|
|
|
g_reply->user.id,
|
|
|
|
RESULT_SERVER_STATE_IN_PROGRESS
|
|
|
|
);
|
|
|
|
retval = result.lookup(buf);
|
|
|
|
if (retval != ERR_DB_NOT_FOUND) continue;
|
|
|
|
|
|
|
|
// OK, send the job
|
|
|
|
//
|
|
|
|
retval = send_assigned_job(asg);
|
|
|
|
if (retval) continue;
|
|
|
|
|
|
|
|
sent_something = true;
|
|
|
|
|
|
|
|
// update the WU's transition time to time out this job
|
|
|
|
//
|
|
|
|
retval = wu.lookup_id(asg.workunitid);
|
|
|
|
if (retval) continue;
|
|
|
|
int new_tt = time(0) + wu.delay_bound;
|
|
|
|
if (new_tt < wu.transition_time) {
|
|
|
|
char buf2[256];
|
|
|
|
sprintf(buf2, "transition_time=%d", new_tt);
|
|
|
|
wu.update_field(buf2);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return sent_something;
|
|
|
|
}
|