mirror of https://github.com/BOINC/boinc.git
- client and API: improve the way an app checks for the death of the client
Old: heartbeat mechanism Problem: if the client is blocked for > 30 secs (e.g. because it takes a long time to write the state file, or because it's stopped in a debugger) then apps exit. This is bad if the app doesn't checkpoint and has been running for a long time. New: the client passes its PID to the app. The app periodically (10 sec) checks that the process still exists. Notes: - For backward compatibility (e.g. new API w/ old client, or vice versa) the client still sends heartbeats, and the API checks heartbeats if the client doesn't pass a PID. - The new mechanism works only if the client's PID isn't assigned to a new process within 10 secs of the client exiting. Windows 2000 reuses PIDs immediately, so check for Win2K and don't use this mechanism if so. TODO: For Unix multithread apps, critical sections aren't currently being enforced. Need to fix this by masking signals. svn path=/trunk/boinc/; revision=26147
This commit is contained in:
parent
acd1ad07ae
commit
7d40c46923
|
@ -39,6 +39,8 @@
|
|||
// new process call boinc_init_options() with flags to
|
||||
// send status messages and handle checkpoint stuff,
|
||||
// and returns from boinc_init_parallel()
|
||||
// NOTE: THIS DOESN'T RESPECT CRITICAL SECTIONS.
|
||||
// NEED TO MASK SIGNALS IN CHILD DURING CRITICAL SECTIONS
|
||||
// Win:
|
||||
// like sequential case, except suspend/resume must enumerate
|
||||
// all threads (except timer) and suspend/resume them all
|
||||
|
@ -48,6 +50,13 @@
|
|||
//
|
||||
// 3) For compatibility with C, we use int instead of bool in various places
|
||||
//
|
||||
// 4) We must periodically check that the client is still alive and exit if not.
|
||||
// Originally this was done using heartbeat msgs from client.
|
||||
// This is unreliable, e.g. if the client is blocked for a long time.
|
||||
// As of Oct 11 2012 we use a different mechanism:
|
||||
// the client passes its PID and we periodically check whether it exists.
|
||||
// But we need to support the heartbeat mechanism also for compatibility.
|
||||
//
|
||||
// Terminology:
|
||||
// The processing of a result can be divided
|
||||
// into multiple "episodes" (executions of the app),
|
||||
|
@ -127,7 +136,7 @@ static volatile double initial_wu_cpu_time;
|
|||
static volatile bool have_new_trickle_up = false;
|
||||
static volatile bool have_trickle_down = true;
|
||||
// on first call, scan slot dir for msgs
|
||||
static volatile int heartbeat_giveup_time;
|
||||
static volatile int heartbeat_giveup_count;
|
||||
// interrupt count value at which to give up on core client
|
||||
#ifdef _WIN32
|
||||
static volatile int nrunning_ticks = 0;
|
||||
|
@ -156,12 +165,13 @@ int app_min_checkpoint_period = 0;
|
|||
|
||||
#define TIMER_PERIOD 0.1
|
||||
// period of worker-thread timer interrupts.
|
||||
// Determines rate of handlling messages from client.
|
||||
// Determines rate of handling messages from client.
|
||||
#define TIMERS_PER_SEC 10
|
||||
// This determines the resolution of fraction done and CPU time reporting
|
||||
// to the core client, and of checkpoint enabling.
|
||||
// It doesn't influence graphics, so 1 sec is enough.
|
||||
#define HEARTBEAT_GIVEUP_COUNT ((int)(30/TIMER_PERIOD))
|
||||
#define HEARTBEAT_GIVEUP_SECS 30
|
||||
#define HEARTBEAT_GIVEUP_COUNT ((int)(HEARTBEAT_GIVEUP_SECS/TIMER_PERIOD))
|
||||
// quit if no heartbeat from core in this #interrupts
|
||||
#define LOCKFILE_TIMEOUT_PERIOD 35
|
||||
// quit if we cannot acquire slot lock file in this #secs after startup
|
||||
|
@ -354,7 +364,7 @@ static void handle_heartbeat_msg() {
|
|||
if (app_client_shm->shm->heartbeat.get_msg(buf)) {
|
||||
boinc_status.network_suspended = false;
|
||||
if (match_tag(buf, "<heartbeat/>")) {
|
||||
heartbeat_giveup_time = interrupt_count + HEARTBEAT_GIVEUP_COUNT;
|
||||
heartbeat_giveup_count = interrupt_count + HEARTBEAT_GIVEUP_COUNT;
|
||||
}
|
||||
if (parse_double(buf, "<wss>", dtemp)) {
|
||||
boinc_status.working_set_size = dtemp;
|
||||
|
@ -368,6 +378,30 @@ static void handle_heartbeat_msg() {
|
|||
}
|
||||
}
|
||||
|
||||
static bool client_dead() {
|
||||
if (aid.client_pid) {
|
||||
// check every 10 sec
|
||||
//
|
||||
if (interrupt_count%(TIMERS_PER_SEC*10)) return false;
|
||||
#ifdef _WIN32
|
||||
// Windows doesn't have waitpid() :-(
|
||||
//
|
||||
DWORD pids[4096], nb;
|
||||
BOOL r = EnumProcesses(pids, sizeof(pids), nb);
|
||||
if (!r) return false;
|
||||
int n = nb/sizeof(DWORD);
|
||||
for (int i=0; i<n; i++) {
|
||||
if (pids[i] == aid.client_pid) return false;
|
||||
}
|
||||
return true;
|
||||
#else
|
||||
return (waitpid(aid.client_pid, 0, WNOHANG) < 0);
|
||||
#endif
|
||||
} else {
|
||||
return (interrupt_count > heartbeat_giveup_count);
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef _WIN32
|
||||
// For multithread apps on Unix, the main process executes the following.
|
||||
//
|
||||
|
@ -393,7 +427,7 @@ static void parallel_master(int child_pid) {
|
|||
}
|
||||
}
|
||||
|
||||
if (heartbeat_giveup_time < interrupt_count) {
|
||||
if (client_dead()) {
|
||||
kill(child_pid, SIGKILL);
|
||||
exit(0);
|
||||
}
|
||||
|
@ -551,7 +585,7 @@ int boinc_init_options_general(BOINC_OPTIONS& opt) {
|
|||
if (standalone) {
|
||||
options.check_heartbeat = false;
|
||||
}
|
||||
heartbeat_giveup_time = interrupt_count + HEARTBEAT_GIVEUP_COUNT;
|
||||
heartbeat_giveup_count = interrupt_count + HEARTBEAT_GIVEUP_COUNT;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -1171,10 +1205,10 @@ static void timer_handler() {
|
|||
// (unless we're in a critical section)
|
||||
//
|
||||
if (in_critical_section==0 && options.check_heartbeat) {
|
||||
if (heartbeat_giveup_time < interrupt_count) {
|
||||
if (client_dead()) {
|
||||
boinc_msg_prefix(buf, sizeof(buf));
|
||||
fputs(buf, stderr);
|
||||
fputs(" No heartbeat from core client for 30 sec - exiting\n", stderr);
|
||||
fputs(buf, stderr); // don't use fprintf() here
|
||||
fputs(" No heartbeat from client for 30 sec - exiting\n", stderr);
|
||||
if (options.direct_process_action) {
|
||||
exit_from_timer_thread(0);
|
||||
} else {
|
||||
|
|
|
@ -6096,3 +6096,36 @@ David 10 Oct 2012
|
|||
vda/
|
||||
sched_vda.cpp
|
||||
vda_lib2.cpp
|
||||
|
||||
David 11 Oct 2012
|
||||
- client and API: improve the way an app checks for the death of the client
|
||||
Old: heartbeat mechanism
|
||||
Problem: if the client is blocked for > 30 secs
|
||||
(e.g. because it takes a long time to write the state file,
|
||||
or because it's stopped in a debugger)
|
||||
then apps exit.
|
||||
This is bad if the app doesn't checkpoint and has been
|
||||
running for a long time.
|
||||
New: the client passes its PID to the app.
|
||||
The app periodically (10 sec) checks that the process still exists.
|
||||
Notes:
|
||||
- For backward compatibility (e.g. new API w/ old client,
|
||||
or vice versa) the client still sends heartbeats,
|
||||
and the API checks heartbeats if the client doesn't pass a PID.
|
||||
- The new mechanism works only if the client's PID isn't assigned
|
||||
to a new process within 10 secs of the client exiting.
|
||||
Windows 2000 reuses PIDs immediately, so check for Win2K
|
||||
and don't use this mechanism if so.
|
||||
|
||||
TODO: For Unix multithread apps,
|
||||
critical sections aren't currently being enforced.
|
||||
Need to fix this by masking signals.
|
||||
|
||||
api/
|
||||
boinc_api.cpp
|
||||
client/
|
||||
hostinfo_win.cpp
|
||||
app_start.cpp
|
||||
lib/
|
||||
app_ipc.cpp,h
|
||||
proc_control.cpp
|
||||
|
|
|
@ -218,6 +218,17 @@ void ACTIVE_TASK::init_app_init_data(APP_INIT_DATA& aid) {
|
|||
relative_to_absolute("", aid.boinc_dir);
|
||||
strcpy(aid.authenticator, wup->project->authenticator);
|
||||
aid.slot = slot;
|
||||
#ifdef _WIN32
|
||||
if (strstr(gstate.hostinfo.os_name, "Windows 2000")) {
|
||||
// Win2K immediately reuses PIDs, so can't use this mechanism
|
||||
//
|
||||
aid.client_pid = 0;
|
||||
} else {
|
||||
aid.client_pid = GetCurrentProcessId();
|
||||
}
|
||||
#else
|
||||
aid.client_pid = getpid();
|
||||
#endif
|
||||
strcpy(aid.wu_name, wup->name);
|
||||
strcpy(aid.result_name, result->name);
|
||||
aid.user_total_credit = wup->project->user_total_credit;
|
||||
|
|
|
@ -478,8 +478,7 @@ int get_os_information(
|
|||
strcat(os_name, "Windows 2000");
|
||||
}
|
||||
|
||||
if ( osvi.dwMajorVersion <= 4 )
|
||||
{
|
||||
if ( osvi.dwMajorVersion <= 4 ) {
|
||||
strcat(os_name, "Windows NT");
|
||||
}
|
||||
|
||||
|
|
|
@ -516,16 +516,16 @@ array('scr0.9.dmg',
|
|||
);
|
||||
|
||||
$browser = array(
|
||||
array('http://setihometoolbar.ourtoolbar.com/',
|
||||
'SETI@home Toolbar',
|
||||
'',
|
||||
'Toolbar for Firefox and IE (Windows);
|
||||
includes search, links, radio, RSS',
|
||||
'http://setihometoolbar.ourtoolbar.com/',
|
||||
'',
|
||||
'',
|
||||
1162833635
|
||||
),
|
||||
//array('http://setihometoolbar.ourtoolbar.com/',
|
||||
// 'SETI@home Toolbar',
|
||||
// '',
|
||||
// 'Toolbar for Firefox and IE (Windows);
|
||||
// includes search, links, radio, RSS',
|
||||
// 'http://setihometoolbar.ourtoolbar.com/',
|
||||
// '',
|
||||
// '',
|
||||
// 1162833635
|
||||
//),
|
||||
array('http://widgets.yahoo.com/gallery/view.php?widget=41595',
|
||||
'BOINC Statistics',
|
||||
'',
|
||||
|
|
|
@ -100,6 +100,7 @@ void APP_INIT_DATA::copy(const APP_INIT_DATA& a) {
|
|||
teamid = a.teamid;
|
||||
hostid = a.hostid;
|
||||
slot = a.slot;
|
||||
client_pid = a.client_pid;
|
||||
user_total_credit = a.user_total_credit;
|
||||
user_expavg_credit = a.user_expavg_credit;
|
||||
host_total_credit = a.host_total_credit;
|
||||
|
@ -193,6 +194,7 @@ int write_init_data_file(FILE* f, APP_INIT_DATA& ai) {
|
|||
#endif
|
||||
fprintf(f,
|
||||
"<slot>%d</slot>\n"
|
||||
"<client_pid>%d</client_pid>\n"
|
||||
"<wu_cpu_time>%f</wu_cpu_time>\n"
|
||||
"<starting_elapsed_time>%f</starting_elapsed_time>\n"
|
||||
"<using_sandbox>%d</using_sandbox>\n"
|
||||
|
@ -215,6 +217,7 @@ int write_init_data_file(FILE* f, APP_INIT_DATA& ai) {
|
|||
"<computation_deadline>%f</computation_deadline>\n"
|
||||
"<vbox_window>%d</vbox_window>\n",
|
||||
ai.slot,
|
||||
ai.client_pid,
|
||||
ai.wu_cpu_time,
|
||||
ai.starting_elapsed_time,
|
||||
ai.using_sandbox?1:0,
|
||||
|
@ -266,6 +269,7 @@ void APP_INIT_DATA::clear() {
|
|||
strcpy(result_name, "");
|
||||
strcpy(authenticator, "");
|
||||
slot = 0;
|
||||
client_pid = 0;
|
||||
user_total_credit = 0;
|
||||
user_expavg_credit = 0;
|
||||
host_total_credit = 0;
|
||||
|
@ -371,6 +375,7 @@ int parse_init_data_file(FILE* f, APP_INIT_DATA& ai) {
|
|||
if (xp.parse_int("shm_key", ai.shmem_seg_name)) continue;
|
||||
#endif
|
||||
if (xp.parse_int("slot", ai.slot)) continue;
|
||||
if (xp.parse_int("client_pid", ai.client_pid)) continue;
|
||||
if (xp.parse_double("user_total_credit", ai.user_total_credit)) continue;
|
||||
if (xp.parse_double("user_expavg_credit", ai.user_expavg_credit)) continue;
|
||||
if (xp.parse_double("host_total_credit", ai.host_total_credit)) continue;
|
||||
|
|
|
@ -169,6 +169,7 @@ struct APP_INIT_DATA {
|
|||
char result_name[256];
|
||||
char authenticator[256];
|
||||
int slot;
|
||||
int client_pid;
|
||||
double user_total_credit;
|
||||
double user_expavg_credit;
|
||||
double host_total_credit;
|
||||
|
|
|
@ -166,7 +166,8 @@ void kill_descendants() {
|
|||
kill_all(descendants);
|
||||
}
|
||||
#else
|
||||
// Same, but if child_pid is nonzero, give it a chance to exit gracefully on Unix
|
||||
// Same, but if child_pid is nonzero,
|
||||
// give it a chance to exit gracefully on Unix
|
||||
//
|
||||
void kill_descendants(int child_pid) {
|
||||
vector<int> descendants;
|
||||
|
|
Loading…
Reference in New Issue