- client and API: improve the way an app checks for the death of the client

Old: heartbeat mechanism
    Problem: if the client is blocked for > 30 secs
        (e.g. because it takes a long time to write the state file,
        of because it's stopped in a debugger)
        then apps exit.
        This is bad is the app doesn't checkpoint and has been
        running for a long time.
    New: the client passes its PID to the app.
        The app periodically (10 sec) checks that the process still exists.
    Notes:
    - For backward compatibility (e.g. new API w/ old client,
        or vice versa) the client still sends heartbeats,
        and the API checks heartbeats if the client doesn't pass a PID.
    - The new mechanism works only if the client's PID isn't assigned
        to a new process within 10 secs of the client exiting.
        Windows 2000 reuses PIDs immediately, so check for Win2K
        and don't use this mechanism if so.

TODO: For Unix multithread apps,
    critical sections aren't currently being enforced.
    Need to fix this by masking signals.


svn path=/trunk/boinc/; revision=26147
This commit is contained in:
David Anderson 2012-10-11 16:41:31 +00:00
parent acd1ad07ae
commit 7d40c46923
8 changed files with 106 additions and 22 deletions

View File

@ -39,6 +39,8 @@
// new process call boinc_init_options() with flags to
// send status messages and handle checkpoint stuff,
// and returns from boinc_init_parallel()
// NOTE: THIS DOESN'T RESPECT CRITICAL SECTIONS.
// NEED TO MASK SIGNALS IN CHILD DURING CRITICAL SECTIONS
// Win:
// like sequential case, except suspend/resume must enumerate
// all threads (except timer) and suspend/resume them all
@ -48,6 +50,13 @@
//
// 3) For compatibility with C, we use int instead of bool various places
//
// 4) We must periodically check that the client is still alive and exit if not.
// Originally this was done using heartbeat msgs from client.
// This is unreliable, e.g. if the client is blocked for a long time.
// As of Oct 11 2012 we use a different mechanism:
// the client passes its PID and we periodically check whether it exists.
// But we need to support the heartbeat mechanism also for compatibility.
//
// Terminology:
// The processing of a result can be divided
// into multiple "episodes" (executions of the app),
@ -127,7 +136,7 @@ static volatile double initial_wu_cpu_time;
static volatile bool have_new_trickle_up = false;
static volatile bool have_trickle_down = true;
// on first call, scan slot dir for msgs
static volatile int heartbeat_giveup_time;
static volatile int heartbeat_giveup_count;
// interrupt count value at which to give up on core client
#ifdef _WIN32
static volatile int nrunning_ticks = 0;
@ -156,12 +165,13 @@ int app_min_checkpoint_period = 0;
#define TIMER_PERIOD 0.1
// period of worker-thread timer interrupts.
// Determines rate of handlling messages from client.
// Determines rate of handling messages from client.
#define TIMERS_PER_SEC 10
// This determines the resolution of fraction done and CPU time reporting
// to the core client, and of checkpoint enabling.
// It doesn't influence graphics, so 1 sec is enough.
#define HEARTBEAT_GIVEUP_COUNT ((int)(30/TIMER_PERIOD))
#define HEARTBEAT_GIVEUP_SECS 30
#define HEARTBEAT_GIVEUP_COUNT ((int)(HEARTBEAT_GIVEUP_SECS/TIMER_PERIOD))
// quit if no heartbeat from core in this #interrupts
#define LOCKFILE_TIMEOUT_PERIOD 35
// quit if we cannot aquire slot lock file in this #secs after startup
@ -354,7 +364,7 @@ static void handle_heartbeat_msg() {
if (app_client_shm->shm->heartbeat.get_msg(buf)) {
boinc_status.network_suspended = false;
if (match_tag(buf, "<heartbeat/>")) {
heartbeat_giveup_time = interrupt_count + HEARTBEAT_GIVEUP_COUNT;
heartbeat_giveup_count = interrupt_count + HEARTBEAT_GIVEUP_COUNT;
}
if (parse_double(buf, "<wss>", dtemp)) {
boinc_status.working_set_size = dtemp;
@ -368,6 +378,30 @@ static void handle_heartbeat_msg() {
}
}
static bool client_dead() {
if (aid.client_pid) {
// check every 10 sec
//
if (interrupt_count%(TIMERS_PER_SEC*10)) return false;
#ifdef _WIN32
// Windows doesn't have waitpid() :-(
//
DWORD pids[4096], nb;
BOOL r = EnumProcesses(pids, sizeof(pids), nb);
if (!r) return false;
int n = nb/sizeof(DWORD);
for (int i=0; i<n; i++) {
if (pids[i] == aid.client_pid) return false;
}
return true;
#else
return (waitpid(aid.client_pid, 0, WNOHANG) < 0);
#endif
} else {
return (interrupt_count > heartbeat_giveup_count);
}
}
#ifndef _WIN32
// For multithread apps on Unix, the main process executes the following.
//
@ -393,7 +427,7 @@ static void parallel_master(int child_pid) {
}
}
if (heartbeat_giveup_time < interrupt_count) {
if (client_dead()) {
kill(child_pid, SIGKILL);
exit(0);
}
@ -551,7 +585,7 @@ int boinc_init_options_general(BOINC_OPTIONS& opt) {
if (standalone) {
options.check_heartbeat = false;
}
heartbeat_giveup_time = interrupt_count + HEARTBEAT_GIVEUP_COUNT;
heartbeat_giveup_count = interrupt_count + HEARTBEAT_GIVEUP_COUNT;
return 0;
}
@ -1171,10 +1205,10 @@ static void timer_handler() {
// (unless we're in a critical section)
//
if (in_critical_section==0 && options.check_heartbeat) {
if (heartbeat_giveup_time < interrupt_count) {
if (client_dead()) {
boinc_msg_prefix(buf, sizeof(buf));
fputs(buf, stderr);
fputs(" No heartbeat from core client for 30 sec - exiting\n", stderr);
fputs(buf, stderr); // don't use fprintf() here
fputs(" No heartbeat from client for 30 sec - exiting\n", stderr);
if (options.direct_process_action) {
exit_from_timer_thread(0);
} else {

View File

@ -6096,3 +6096,36 @@ David 10 Oct 2012
vda/
sched_vda.cpp
vda_lib2.cpp
David 11 Oct 2012
- client and API: improve the way an app checks for the death of the client
Old: heartbeat mechanism
Problem: if the client is blocked for > 30 secs
(e.g. because it takes a long time to write the state file,
of because it's stopped in a debugger)
then apps exit.
This is bad is the app doesn't checkpoint and has been
running for a long time.
New: the client passes its PID to the app.
The app periodically (10 sec) checks that the process still exists.
Notes:
- For backward compatibility (e.g. new API w/ old client,
or vice versa) the client still sends heartbeats,
and the API checks heartbeats if the client doesn't pass a PID.
- The new mechanism works only if the client's PID isn't assigned
to a new process within 10 secs of the client exiting.
Windows 2000 reuses PIDs immediately, so check for Win2K
and don't use this mechanism if so.
TODO: For Unix multithread apps,
critical sections aren't currently being enforced.
Need to fix this by masking signals.
api/
boinc_api.cpp
client/
hostinfo_win.cpp
app_start.cpp
lib/
app_ipc.cpp,h
proc_control.cpp

View File

@ -218,6 +218,17 @@ void ACTIVE_TASK::init_app_init_data(APP_INIT_DATA& aid) {
relative_to_absolute("", aid.boinc_dir);
strcpy(aid.authenticator, wup->project->authenticator);
aid.slot = slot;
#ifdef _WIN32
if (strstr(gstate.hostinfo.os_name, "Windows 2000")) {
// Win2K immediately reuses PIDs, so can't use this mechanism
//
aid.client_pid = 0;
} else {
aid.client_pid = GetCurrentProcessId();
}
#else
aid.client_pid = getpid();
#endif
strcpy(aid.wu_name, wup->name);
strcpy(aid.result_name, result->name);
aid.user_total_credit = wup->project->user_total_credit;

View File

@ -478,8 +478,7 @@ int get_os_information(
strcat(os_name, "Windows 2000");
}
if ( osvi.dwMajorVersion <= 4 )
{
if ( osvi.dwMajorVersion <= 4 ) {
strcat(os_name, "Windows NT");
}

View File

@ -516,16 +516,16 @@ array('scr0.9.dmg',
);
$browser = array(
array('http://setihometoolbar.ourtoolbar.com/',
'SETI@home Toolbar',
'',
'Toolbar for Firefox and IE (Windows);
includes search, links, radio, RSS',
'http://setihometoolbar.ourtoolbar.com/',
'',
'',
1162833635
),
//array('http://setihometoolbar.ourtoolbar.com/',
// 'SETI@home Toolbar',
// '',
// 'Toolbar for Firefox and IE (Windows);
// includes search, links, radio, RSS',
// 'http://setihometoolbar.ourtoolbar.com/',
// '',
// '',
// 1162833635
//),
array('http://widgets.yahoo.com/gallery/view.php?widget=41595',
'BOINC Statistics',
'',

View File

@ -100,6 +100,7 @@ void APP_INIT_DATA::copy(const APP_INIT_DATA& a) {
teamid = a.teamid;
hostid = a.hostid;
slot = a.slot;
client_pid = a.client_pid;
user_total_credit = a.user_total_credit;
user_expavg_credit = a.user_expavg_credit;
host_total_credit = a.host_total_credit;
@ -193,6 +194,7 @@ int write_init_data_file(FILE* f, APP_INIT_DATA& ai) {
#endif
fprintf(f,
"<slot>%d</slot>\n"
"<client_pid>%d</client_pid>\n"
"<wu_cpu_time>%f</wu_cpu_time>\n"
"<starting_elapsed_time>%f</starting_elapsed_time>\n"
"<using_sandbox>%d</using_sandbox>\n"
@ -215,6 +217,7 @@ int write_init_data_file(FILE* f, APP_INIT_DATA& ai) {
"<computation_deadline>%f</computation_deadline>\n"
"<vbox_window>%d</vbox_window>\n",
ai.slot,
ai.client_pid,
ai.wu_cpu_time,
ai.starting_elapsed_time,
ai.using_sandbox?1:0,
@ -266,6 +269,7 @@ void APP_INIT_DATA::clear() {
strcpy(result_name, "");
strcpy(authenticator, "");
slot = 0;
client_pid = 0;
user_total_credit = 0;
user_expavg_credit = 0;
host_total_credit = 0;
@ -371,6 +375,7 @@ int parse_init_data_file(FILE* f, APP_INIT_DATA& ai) {
if (xp.parse_int("shm_key", ai.shmem_seg_name)) continue;
#endif
if (xp.parse_int("slot", ai.slot)) continue;
if (xp.parse_int("client_pid", ai.client_pid)) continue;
if (xp.parse_double("user_total_credit", ai.user_total_credit)) continue;
if (xp.parse_double("user_expavg_credit", ai.user_expavg_credit)) continue;
if (xp.parse_double("host_total_credit", ai.host_total_credit)) continue;

View File

@ -169,6 +169,7 @@ struct APP_INIT_DATA {
char result_name[256];
char authenticator[256];
int slot;
int client_pid;
double user_total_credit;
double user_expavg_credit;
double host_total_credit;

View File

@ -166,7 +166,8 @@ void kill_descendants() {
kill_all(descendants);
}
#else
// Same, but if child_pid is nonzero, give it a chance to exit gracefully on Unix
// Same, but if child_pid is nonzero,
// give it a chance to exit gracefully on Unix
//
void kill_descendants(int child_pid) {
vector<int> descendants;