mirror of https://github.com/BOINC/boinc.git
- scheduler: notify user if we're not sending work
because we don't have any (matchmaker only). - back-end programs: for programs that do enumerations, check for error returns and exit (otherwise we'll get stuck forever if the DB fails). NOTE: In the course of researching this I came across a bug in the transitioner: if there's a WU with more than 1000 results, the enumeration will always return ERR_DB_NOT_FOUND, and the transitioner won't ever do anything again. Fixing this is a little tricky, so I'm not going to do it right now. svn path=/trunk/boinc/; revision=16324
This commit is contained in:
parent
eae866f743
commit
be4cd9bb79
|
@ -8719,4 +8719,27 @@ David 27 Oct 2008
|
|||
time_stats.cpp
|
||||
work_fetch.cpp
|
||||
clientgui/
|
||||
BOINCBaseView.cpp
|
||||
BOINCBaseView.cpp
|
||||
|
||||
David 27 Oct 2008
|
||||
- scheduler: notify user if we're not sending work
|
||||
because we don't have any (matchmaker only).
|
||||
- back end programs: for programs that do enumerations,
|
||||
check for error returns and exit
|
||||
(otherwise we'll get stuck forever if DB fails)
|
||||
|
||||
NOTE: In the course of researching this I came across a bug
|
||||
in the transitioner: if there's a WU with more than 1000 results,
|
||||
the enumeration will always return ERR_DB_NOT_FOUND,
|
||||
and the transitioner won't ever do anything again.
|
||||
Fixing this is a little tricky, so I'm not going to do it right now.
|
||||
|
||||
sched/
|
||||
file_deleter.cpp
|
||||
request_file_list.cpp
|
||||
sched_send.cpp
|
||||
server_types.h
|
||||
transitioner.cpp
|
||||
trickle_handler.cpp
|
||||
update_stats.cpp
|
||||
|
||||
|
|
|
@ -275,9 +275,7 @@ bool do_pass(bool retry_error) {
|
|||
retval = wu.enumerate(buf);
|
||||
if (retval) {
|
||||
if (retval != ERR_DB_NOT_FOUND) {
|
||||
log_messages.printf(MSG_DEBUG,
|
||||
"DB connection lost, exiting\n"
|
||||
);
|
||||
log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
|
||||
exit(0);
|
||||
}
|
||||
break;
|
||||
|
@ -306,7 +304,16 @@ bool do_pass(bool retry_error) {
|
|||
clause, RESULTS_PER_ENUM
|
||||
);
|
||||
|
||||
while (!result.enumerate(buf)) {
|
||||
while (1) {
|
||||
retval = result.enumerate(buf);
|
||||
if (retval) {
|
||||
if (retval != ERR_DB_NOT_FOUND) {
|
||||
log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
|
||||
exit(0);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
did_something = true;
|
||||
retval = 0;
|
||||
if (!preserve_result_files) {
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
|
||||
#include "boinc_db.h"
|
||||
#include "str_util.h"
|
||||
#include "error_numbers.h"
|
||||
|
||||
#include "sched_config.h"
|
||||
#include "sched_util.h"
|
||||
|
@ -55,7 +56,16 @@ int request_file_list(int host_id) {
|
|||
|
||||
int request_files_from_all() {
|
||||
DB_HOST host;
|
||||
while(!host.enumerate()) {
|
||||
|
||||
while(1) {
|
||||
int retval = host.enumerate();
|
||||
if (retval) {
|
||||
if (retval != ERR_DB_NOT_FOUND) {
|
||||
fprintf(stderr, "lost DB connection\n");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
request_file_list(host.get_id());
|
||||
}
|
||||
return 0;
|
||||
|
|
|
@ -985,7 +985,7 @@ int add_result_to_reply(
|
|||
}
|
||||
}
|
||||
retval = result.mark_as_sent(old_server_state);
|
||||
if (retval==ERR_DB_NOT_FOUND) {
|
||||
if (retval == ERR_DB_NOT_FOUND) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"[RESULT#%d] [HOST#%d]: CAN'T SEND, already sent to another host\n",
|
||||
result.id, reply.host.id
|
||||
|
@ -1163,9 +1163,11 @@ static void explain_to_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
|
|||
if (!reply.wreq.host_info.preferred_apps[i].work_available) {
|
||||
APP* app = ssp->lookup_app(reply.wreq.host_info.preferred_apps[i].appid);
|
||||
// don't write message if the app is deprecated
|
||||
if ( app != NULL ) {
|
||||
if (app != NULL) {
|
||||
char explanation[256];
|
||||
sprintf(explanation,"No work is available for %s",find_user_friendly_name(reply.wreq.host_info.preferred_apps[i].appid));
|
||||
sprintf(explanation, "No work is available for %s",
|
||||
find_user_friendly_name(reply.wreq.host_info.preferred_apps[i].appid)
|
||||
);
|
||||
USER_MESSAGE um(explanation, "high");
|
||||
reply.insert_message(um);
|
||||
}
|
||||
|
@ -1234,6 +1236,13 @@ static void explain_to_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
|
|||
);
|
||||
reply.insert_message(um);
|
||||
}
|
||||
if (reply.wreq.no_jobs_available) {
|
||||
USER_MESSAGE um(
|
||||
"(Project has no jobs available)",
|
||||
"high"
|
||||
);
|
||||
reply.insert_message(um);
|
||||
}
|
||||
if (reply.wreq.daily_result_quota_exceeded) {
|
||||
struct tm *rpc_time_tm;
|
||||
int delay_time;
|
||||
|
@ -1778,7 +1787,7 @@ void JOB_SET::send(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
|
|||
}
|
||||
|
||||
void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
|
||||
int i, slots_locked=0;
|
||||
int i, slots_locked=0, slots_nonempty=0;
|
||||
JOB_SET jobs (sreq, reply);
|
||||
int min_slots = config.mm_min_slots;
|
||||
if (!min_slots) min_slots = ssp->max_wu_results/2;
|
||||
|
@ -1799,8 +1808,10 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
|
|||
case WR_STATE_EMPTY:
|
||||
continue;
|
||||
case WR_STATE_PRESENT:
|
||||
slots_nonempty++;
|
||||
break;
|
||||
default:
|
||||
slots_nonempty++;
|
||||
if (wu_result.state == g_pid) break;
|
||||
slots_locked++;
|
||||
continue;
|
||||
|
@ -1839,6 +1850,13 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
|
|||
if (jobs.request_satisfied() && slots_scanned>=min_slots) break;
|
||||
}
|
||||
|
||||
if (!slots_nonempty) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"Job cache is empty - check feeder\n"
|
||||
);
|
||||
reply.wreq.no_jobs_available = true;
|
||||
}
|
||||
|
||||
// TODO: trim jobs from tail of list until we pass the EDF check
|
||||
//
|
||||
jobs.send(sreq, reply);
|
||||
|
|
|
@ -162,6 +162,7 @@ struct WORK_REQ {
|
|||
bool daily_result_quota_exceeded;
|
||||
int daily_result_quota; // for this machine: number of cpus * daily_quota/cpu
|
||||
bool cache_size_exceeded;
|
||||
bool no_jobs_available; // project has no work right now
|
||||
int nresults_on_host;
|
||||
// How many results from this project are in progress on the host.
|
||||
// Initially this is the number of "other_results"
|
||||
|
|
|
@ -40,6 +40,7 @@ using namespace std;
|
|||
#include "util.h"
|
||||
#include "backend_lib.h"
|
||||
#include "common_defs.h"
|
||||
#include "error_numbers.h"
|
||||
|
||||
#include "sched_config.h"
|
||||
#include "sched_util.h"
|
||||
|
@ -619,11 +620,21 @@ bool do_pass() {
|
|||
|
||||
// loop over entries that are due to be checked
|
||||
//
|
||||
while (!transitioner.enumerate((int)time(0), SELECT_LIMIT, mod_n, mod_i, items)) {
|
||||
while (1) {
|
||||
retval = transitioner.enumerate(
|
||||
(int)time(0), SELECT_LIMIT, mod_n, mod_i, items
|
||||
);
|
||||
if (retval) {
|
||||
if (retval != ERR_DB_NOT_FOUND) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
"WU enum error%d; exiting\n", retval
|
||||
);
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
did_something = true;
|
||||
|
||||
TRANSITIONER_ITEM& wu_item = items[0];
|
||||
|
||||
retval = handle_wu(transitioner, items);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
|
|
|
@ -28,13 +28,14 @@
|
|||
//
|
||||
// return nonzero on error
|
||||
|
||||
using namespace std;
|
||||
#include "config.h"
|
||||
#include <unistd.h>
|
||||
|
||||
#include "boinc_db.h"
|
||||
#include "util.h"
|
||||
#include "error_numbers.h"
|
||||
#include "str_util.h"
|
||||
|
||||
#include "sched_config.h"
|
||||
#include "sched_util.h"
|
||||
#include "sched_msgs.h"
|
||||
|
@ -82,7 +83,15 @@ bool do_trickle_scan() {
|
|||
int retval;
|
||||
|
||||
sprintf(buf, "where variety='%s' and handled=0", variety);
|
||||
while (!mfh.enumerate(buf)) {
|
||||
while (1) {
|
||||
retval = mfh.enumerate(buf);
|
||||
if (retval) {
|
||||
if (retval != ERR_DB_NOT_FOUND) {
|
||||
fprintf(stderr, "lost DB conn\n");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
retval = handle_trickle(mfh);
|
||||
if (!retval) {
|
||||
mfh.handled = true;
|
||||
|
|
|
@ -35,6 +35,7 @@
|
|||
#include "boinc_db.h"
|
||||
#include "util.h"
|
||||
#include "str_util.h"
|
||||
#include "error_numbers.h"
|
||||
|
||||
#include "sched_config.h"
|
||||
#include "sched_util.h"
|
||||
|
@ -53,7 +54,16 @@ int update_users() {
|
|||
int retval;
|
||||
char buf[256];
|
||||
|
||||
while (!user.enumerate("where expavg_credit>0.1")) {
|
||||
while (1) {
|
||||
retval = user.enumerate("where expavg_credit>0.1");
|
||||
if (retval) {
|
||||
if (retval != ERR_DB_NOT_FOUND) {
|
||||
log_messages.printf(MSG_CRITICAL, "lost DB conn\n");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (user.expavg_time > update_time_cutoff) continue;
|
||||
update_average(0, 0, CREDIT_HALF_LIFE, user.expavg_credit, user.expavg_time);
|
||||
sprintf( buf, "expavg_credit=%f, expavg_time=%f",
|
||||
|
@ -74,7 +84,16 @@ int update_hosts() {
|
|||
int retval;
|
||||
char buf[256];
|
||||
|
||||
while (!host.enumerate("where expavg_credit>0.1")) {
|
||||
while (1) {
|
||||
retval = host.enumerate("where expavg_credit>0.1");
|
||||
if (retval) {
|
||||
if (retval != ERR_DB_NOT_FOUND) {
|
||||
log_messages.printf(MSG_CRITICAL, "lost DB conn\n");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (host.expavg_time > update_time_cutoff) continue;
|
||||
update_average(0, 0, CREDIT_HALF_LIFE, host.expavg_credit, host.expavg_time);
|
||||
sprintf(
|
||||
|
@ -123,7 +142,16 @@ int update_teams() {
|
|||
int retval;
|
||||
char buf[256];
|
||||
|
||||
while (!team.enumerate("where expavg_credit>0.1")) {
|
||||
while (1) {
|
||||
retval = team.enumerate("where expavg_credit>0.1");
|
||||
if (retval) {
|
||||
if (retval != ERR_DB_NOT_FOUND) {
|
||||
log_messages.printf(MSG_CRITICAL, "lost DB conn\n");
|
||||
exit(1);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
retval = get_team_totals(team);
|
||||
if (retval) {
|
||||
log_messages.printf(MSG_CRITICAL,
|
||||
|
|
Loading…
Reference in New Issue