- scheduler: notify user if we're not sending work
    because we don't have any (matchmaker only).
- back end programs: for programs that do enumerations,
    check for error returns and exit
    (otherwise we'll get stuck forever if DB fails)

NOTE: In the course of researching this I came across a bug
in the transitioner: if there's a WU with more than 1000 results,
the enumeration will always return ERR_DB_NOT_FOUND,
and the transitioner won't ever do anything again.
Fixing this is a little tricky, so I'm not going to do it right now.


svn path=/trunk/boinc/; revision=16324
This commit is contained in:
David Anderson 2008-10-27 21:23:07 +00:00
parent eae866f743
commit be4cd9bb79
8 changed files with 125 additions and 18 deletions

View File

@ -8719,4 +8719,27 @@ David 27 Oct 2008
time_stats.cpp
work_fetch.cpp
clientgui/
BOINCBaseView.cpp
BOINCBaseView.cpp
David 27 Oct 2008
- scheduler: notify user if we're not sending work
because we don't have any (matchmaker only).
- back end programs: for programs that do enumerations,
check for error returns and exit
(otherwise we'll get stuck forever if DB fails)
NOTE: In the course of researching this I came across a bug
in the transitioner: if there's a WU with more than 1000 results,
the enumeration will always return ERR_DB_NOT_FOUND,
and the transitioner won't ever do anything again.
Fixing this is a little tricky, so I'm not going to do it right now.
sched/
file_deleter.cpp
request_file_list.cpp
sched_send.cpp
server_types.h
transitioner.cpp
trickle_handler.cpp
update_stats.cpp

View File

@ -275,9 +275,7 @@ bool do_pass(bool retry_error) {
retval = wu.enumerate(buf);
if (retval) {
if (retval != ERR_DB_NOT_FOUND) {
log_messages.printf(MSG_DEBUG,
"DB connection lost, exiting\n"
);
log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
exit(0);
}
break;
@ -306,7 +304,16 @@ bool do_pass(bool retry_error) {
clause, RESULTS_PER_ENUM
);
while (!result.enumerate(buf)) {
while (1) {
retval = result.enumerate(buf);
if (retval) {
if (retval != ERR_DB_NOT_FOUND) {
log_messages.printf(MSG_DEBUG, "DB connection lost, exiting\n");
exit(0);
}
break;
}
did_something = true;
retval = 0;
if (!preserve_result_files) {

View File

@ -32,6 +32,7 @@
#include "boinc_db.h"
#include "str_util.h"
#include "error_numbers.h"
#include "sched_config.h"
#include "sched_util.h"
@ -55,7 +56,16 @@ int request_file_list(int host_id) {
int request_files_from_all() {
DB_HOST host;
while(!host.enumerate()) {
while(1) {
int retval = host.enumerate();
if (retval) {
if (retval != ERR_DB_NOT_FOUND) {
fprintf(stderr, "lost DB connection\n");
exit(1);
}
break;
}
request_file_list(host.get_id());
}
return 0;

View File

@ -985,7 +985,7 @@ int add_result_to_reply(
}
}
retval = result.mark_as_sent(old_server_state);
if (retval==ERR_DB_NOT_FOUND) {
if (retval == ERR_DB_NOT_FOUND) {
log_messages.printf(MSG_CRITICAL,
"[RESULT#%d] [HOST#%d]: CAN'T SEND, already sent to another host\n",
result.id, reply.host.id
@ -1163,9 +1163,11 @@ static void explain_to_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
if (!reply.wreq.host_info.preferred_apps[i].work_available) {
APP* app = ssp->lookup_app(reply.wreq.host_info.preferred_apps[i].appid);
// don't write message if the app is deprecated
if ( app != NULL ) {
if (app != NULL) {
char explanation[256];
sprintf(explanation,"No work is available for %s",find_user_friendly_name(reply.wreq.host_info.preferred_apps[i].appid));
sprintf(explanation, "No work is available for %s",
find_user_friendly_name(reply.wreq.host_info.preferred_apps[i].appid)
);
USER_MESSAGE um(explanation, "high");
reply.insert_message(um);
}
@ -1234,6 +1236,13 @@ static void explain_to_user(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
);
reply.insert_message(um);
}
if (reply.wreq.no_jobs_available) {
USER_MESSAGE um(
"(Project has no jobs available)",
"high"
);
reply.insert_message(um);
}
if (reply.wreq.daily_result_quota_exceeded) {
struct tm *rpc_time_tm;
int delay_time;
@ -1778,7 +1787,7 @@ void JOB_SET::send(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
}
void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
int i, slots_locked=0;
int i, slots_locked=0, slots_nonempty=0;
JOB_SET jobs (sreq, reply);
int min_slots = config.mm_min_slots;
if (!min_slots) min_slots = ssp->max_wu_results/2;
@ -1799,8 +1808,10 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
case WR_STATE_EMPTY:
continue;
case WR_STATE_PRESENT:
slots_nonempty++;
break;
default:
slots_nonempty++;
if (wu_result.state == g_pid) break;
slots_locked++;
continue;
@ -1839,6 +1850,13 @@ void send_work_matchmaker(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
if (jobs.request_satisfied() && slots_scanned>=min_slots) break;
}
if (!slots_nonempty) {
log_messages.printf(MSG_CRITICAL,
"Job cache is empty - check feeder\n"
);
reply.wreq.no_jobs_available = true;
}
// TODO: trim jobs from tail of list until we pass the EDF check
//
jobs.send(sreq, reply);

View File

@ -162,6 +162,7 @@ struct WORK_REQ {
bool daily_result_quota_exceeded;
int daily_result_quota; // for this machine: number of cpus * daily_quota/cpu
bool cache_size_exceeded;
bool no_jobs_available; // project has no work right now
int nresults_on_host;
// How many results from this project are in progress on the host.
// Initially this is the number of "other_results"

View File

@ -40,6 +40,7 @@ using namespace std;
#include "util.h"
#include "backend_lib.h"
#include "common_defs.h"
#include "error_numbers.h"
#include "sched_config.h"
#include "sched_util.h"
@ -619,11 +620,21 @@ bool do_pass() {
// loop over entries that are due to be checked
//
while (!transitioner.enumerate((int)time(0), SELECT_LIMIT, mod_n, mod_i, items)) {
while (1) {
retval = transitioner.enumerate(
(int)time(0), SELECT_LIMIT, mod_n, mod_i, items
);
if (retval) {
if (retval != ERR_DB_NOT_FOUND) {
log_messages.printf(MSG_CRITICAL,
"WU enum error%d; exiting\n", retval
);
exit(1);
}
break;
}
did_something = true;
TRANSITIONER_ITEM& wu_item = items[0];
retval = handle_wu(transitioner, items);
if (retval) {
log_messages.printf(MSG_CRITICAL,

View File

@ -28,13 +28,14 @@
//
// return nonzero on error
using namespace std;
#include "config.h"
#include <unistd.h>
#include "boinc_db.h"
#include "util.h"
#include "error_numbers.h"
#include "str_util.h"
#include "sched_config.h"
#include "sched_util.h"
#include "sched_msgs.h"
@ -82,7 +83,15 @@ bool do_trickle_scan() {
int retval;
sprintf(buf, "where variety='%s' and handled=0", variety);
while (!mfh.enumerate(buf)) {
while (1) {
retval = mfh.enumerate(buf);
if (retval) {
if (retval != ERR_DB_NOT_FOUND) {
fprintf(stderr, "lost DB conn\n");
exit(1);
}
break;
}
retval = handle_trickle(mfh);
if (!retval) {
mfh.handled = true;

View File

@ -35,6 +35,7 @@
#include "boinc_db.h"
#include "util.h"
#include "str_util.h"
#include "error_numbers.h"
#include "sched_config.h"
#include "sched_util.h"
@ -53,7 +54,16 @@ int update_users() {
int retval;
char buf[256];
while (!user.enumerate("where expavg_credit>0.1")) {
while (1) {
retval = user.enumerate("where expavg_credit>0.1");
if (retval) {
if (retval != ERR_DB_NOT_FOUND) {
log_messages.printf(MSG_CRITICAL, "lost DB conn\n");
exit(1);
}
break;
}
if (user.expavg_time > update_time_cutoff) continue;
update_average(0, 0, CREDIT_HALF_LIFE, user.expavg_credit, user.expavg_time);
sprintf( buf, "expavg_credit=%f, expavg_time=%f",
@ -74,7 +84,16 @@ int update_hosts() {
int retval;
char buf[256];
while (!host.enumerate("where expavg_credit>0.1")) {
while (1) {
retval = host.enumerate("where expavg_credit>0.1");
if (retval) {
if (retval != ERR_DB_NOT_FOUND) {
log_messages.printf(MSG_CRITICAL, "lost DB conn\n");
exit(1);
}
break;
}
if (host.expavg_time > update_time_cutoff) continue;
update_average(0, 0, CREDIT_HALF_LIFE, host.expavg_credit, host.expavg_time);
sprintf(
@ -123,7 +142,16 @@ int update_teams() {
int retval;
char buf[256];
while (!team.enumerate("where expavg_credit>0.1")) {
while (1) {
retval = team.enumerate("where expavg_credit>0.1");
if (retval) {
if (retval != ERR_DB_NOT_FOUND) {
log_messages.printf(MSG_CRITICAL, "lost DB conn\n");
exit(1);
}
break;
}
retval = get_team_totals(team);
if (retval) {
log_messages.printf(MSG_CRITICAL,