From 7d40c46923bc16c9b3716332b0d1ef34f1da2a80 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Thu, 11 Oct 2012 16:41:31 +0000 Subject: [PATCH] - client and API: improve the way an app checks for the death of the client Old: heartbeat mechanism Problem: if the client is blocked for > 30 secs (e.g. because it takes a long time to write the state file, of because it's stopped in a debugger) then apps exit. This is bad is the app doesn't checkpoint and has been running for a long time. New: the client passes its PID to the app. The app periodically (10 sec) checks that the process still exists. Notes: - For backward compatibility (e.g. new API w/ old client, or vice versa) the client still sends heartbeats, and the API checks heartbeats if the client doesn't pass a PID. - The new mechanism works only if the client's PID isn't assigned to a new process within 10 secs of the client exiting. Windows 2000 reuses PIDs immediately, so check for Win2K and don't use this mechanism if so. TODO: For Unix multithread apps, critical sections aren't currently being enforced. Need to fix this by masking signals. svn path=/trunk/boinc/; revision=26147 --- api/boinc_api.cpp | 52 ++++++++++++++++++++++++++++++++++------- checkin_notes | 33 ++++++++++++++++++++++++++ client/app_start.cpp | 11 +++++++++ client/hostinfo_win.cpp | 3 +-- doc/addon_data.php | 20 ++++++++-------- lib/app_ipc.cpp | 5 ++++ lib/app_ipc.h | 1 + lib/proc_control.cpp | 3 ++- 8 files changed, 106 insertions(+), 22 deletions(-) diff --git a/api/boinc_api.cpp b/api/boinc_api.cpp index 6462f6a133..5c4092bb4c 100644 --- a/api/boinc_api.cpp +++ b/api/boinc_api.cpp @@ -39,6 +39,8 @@ // new process call boinc_init_options() with flags to // send status messages and handle checkpoint stuff, // and returns from boinc_init_parallel() +// NOTE: THIS DOESN'T RESPECT CRITICAL SECTIONS. +// NEED TO MASK SIGNALS IN CHILD DURING CRITICAL SECTIONS // Win: // like sequential case, except suspend/resume must enumerate // all threads (except timer) and suspend/resume them all @@ -48,6 +50,13 @@ // // 3) For compatibility with C, we use int instead of bool various places // +// 4) We must periodically check that the client is still alive and exit if not. +// Originally this was done using heartbeat msgs from client. +// This is unreliable, e.g. if the client is blocked for a long time. +// As of Oct 11 2012 we use a different mechanism: +// the client passes its PID and we periodically check whether it exists. +// But we need to support the heartbeat mechanism also for compatibility. +// // Terminology: // The processing of a result can be divided // into multiple "episodes" (executions of the app), @@ -127,7 +136,7 @@ static volatile double initial_wu_cpu_time; static volatile bool have_new_trickle_up = false; static volatile bool have_trickle_down = true; // on first call, scan slot dir for msgs -static volatile int heartbeat_giveup_time; +static volatile int heartbeat_giveup_count; // interrupt count value at which to give up on core client #ifdef _WIN32 static volatile int nrunning_ticks = 0; @@ -156,12 +165,13 @@ int app_min_checkpoint_period = 0; #define TIMER_PERIOD 0.1 // period of worker-thread timer interrupts. - // Determines rate of handlling messages from client. + // Determines rate of handling messages from client. #define TIMERS_PER_SEC 10 // This determines the resolution of fraction done and CPU time reporting // to the core client, and of checkpoint enabling. // It doesn't influence graphics, so 1 sec is enough. -#define HEARTBEAT_GIVEUP_COUNT ((int)(30/TIMER_PERIOD)) +#define HEARTBEAT_GIVEUP_SECS 30 +#define HEARTBEAT_GIVEUP_COUNT ((int)(HEARTBEAT_GIVEUP_SECS/TIMER_PERIOD)) // quit if no heartbeat from core in this #interrupts #define LOCKFILE_TIMEOUT_PERIOD 35 // quit if we cannot aquire slot lock file in this #secs after startup @@ -354,7 +364,7 @@ static void handle_heartbeat_msg() { if (app_client_shm->shm->heartbeat.get_msg(buf)) { boinc_status.network_suspended = false; if (match_tag(buf, "")) { - heartbeat_giveup_time = interrupt_count + HEARTBEAT_GIVEUP_COUNT; + heartbeat_giveup_count = interrupt_count + HEARTBEAT_GIVEUP_COUNT; } if (parse_double(buf, "", dtemp)) { boinc_status.working_set_size = dtemp; @@ -368,6 +378,30 @@ static void handle_heartbeat_msg() { } } +static bool client_dead() { + if (aid.client_pid) { + // check every 10 sec + // + if (interrupt_count%(TIMERS_PER_SEC*10)) return false; +#ifdef _WIN32 + // Windows doesn't have waitpid() :-( + // + DWORD pids[4096], nb; + BOOL r = EnumProcesses(pids, sizeof(pids), nb); + if (!r) return false; + int n = nb/sizeof(DWORD); + for (int i=0; i heartbeat_giveup_count); + } +} + #ifndef _WIN32 // For multithread apps on Unix, the main process executes the following. // @@ -393,7 +427,7 @@ static void parallel_master(int child_pid) { } } - if (heartbeat_giveup_time < interrupt_count) { + if (client_dead()) { kill(child_pid, SIGKILL); exit(0); } @@ -551,7 +585,7 @@ int boinc_init_options_general(BOINC_OPTIONS& opt) { if (standalone) { options.check_heartbeat = false; } - heartbeat_giveup_time = interrupt_count + HEARTBEAT_GIVEUP_COUNT; + heartbeat_giveup_count = interrupt_count + HEARTBEAT_GIVEUP_COUNT; return 0; } @@ -1171,10 +1205,10 @@ static void timer_handler() { // (unless we're in a critical section) // if (in_critical_section==0 && options.check_heartbeat) { - if (heartbeat_giveup_time < interrupt_count) { + if (client_dead()) { boinc_msg_prefix(buf, sizeof(buf)); - fputs(buf, stderr); - fputs(" No heartbeat from core client for 30 sec - exiting\n", stderr); + fputs(buf, stderr); // don't use fprintf() here + fputs(" No heartbeat from client for 30 sec - exiting\n", stderr); if (options.direct_process_action) { exit_from_timer_thread(0); } else { diff --git a/checkin_notes b/checkin_notes index 3e353be4ec..bd19f20d11 100644 --- a/checkin_notes +++ b/checkin_notes @@ -6096,3 +6096,36 @@ David 10 Oct 2012 vda/ sched_vda.cpp vda_lib2.cpp + +David 11 Oct 2012 + - client and API: improve the way an app checks for the death of the client + Old: heartbeat mechanism + Problem: if the client is blocked for > 30 secs + (e.g. because it takes a long time to write the state file, + of because it's stopped in a debugger) + then apps exit. + This is bad is the app doesn't checkpoint and has been + running for a long time. + New: the client passes its PID to the app. + The app periodically (10 sec) checks that the process still exists. + Notes: + - For backward compatibility (e.g. new API w/ old client, + or vice versa) the client still sends heartbeats, + and the API checks heartbeats if the client doesn't pass a PID. + - The new mechanism works only if the client's PID isn't assigned + to a new process within 10 secs of the client exiting. + Windows 2000 reuses PIDs immediately, so check for Win2K + and don't use this mechanism if so. + + TODO: For Unix multithread apps, + critical sections aren't currently being enforced. + Need to fix this by masking signals. + + api/ + boinc_api.cpp + client/ + hostinfo_win.cpp + app_start.cpp + lib/ + app_ipc.cpp,h + proc_control.cpp diff --git a/client/app_start.cpp b/client/app_start.cpp index 245dcbcaea..c876c5df0c 100644 --- a/client/app_start.cpp +++ b/client/app_start.cpp @@ -218,6 +218,17 @@ void ACTIVE_TASK::init_app_init_data(APP_INIT_DATA& aid) { relative_to_absolute("", aid.boinc_dir); strcpy(aid.authenticator, wup->project->authenticator); aid.slot = slot; +#ifdef _WIN32 + if (strstr(gstate.hostinfo.os_name, "Windows 2000")) { + // Win2K immediately reuses PIDs, so can't use this mechanism + // + aid.client_pid = 0; + } else { + aid.client_pid = GetCurrentProcessId(); + } +#else + aid.client_pid = getpid(); +#endif strcpy(aid.wu_name, wup->name); strcpy(aid.result_name, result->name); aid.user_total_credit = wup->project->user_total_credit; diff --git a/client/hostinfo_win.cpp b/client/hostinfo_win.cpp index 57b0fea115..1bd4452cba 100644 --- a/client/hostinfo_win.cpp +++ b/client/hostinfo_win.cpp @@ -478,8 +478,7 @@ int get_os_information( strcat(os_name, "Windows 2000"); } - if ( osvi.dwMajorVersion <= 4 ) - { + if ( osvi.dwMajorVersion <= 4 ) { strcat(os_name, "Windows NT"); } diff --git a/doc/addon_data.php b/doc/addon_data.php index 2523b72e47..99036819a9 100644 --- a/doc/addon_data.php +++ b/doc/addon_data.php @@ -516,16 +516,16 @@ array('scr0.9.dmg', ); $browser = array( -array('http://setihometoolbar.ourtoolbar.com/', - 'SETI@home Toolbar', - '', - 'Toolbar for Firefox and IE (Windows); - includes search, links, radio, RSS', - 'http://setihometoolbar.ourtoolbar.com/', - '', - '', - 1162833635 -), +//array('http://setihometoolbar.ourtoolbar.com/', +// 'SETI@home Toolbar', +// '', +// 'Toolbar for Firefox and IE (Windows); +// includes search, links, radio, RSS', +// 'http://setihometoolbar.ourtoolbar.com/', +// '', +// '', +// 1162833635 +//), array('http://widgets.yahoo.com/gallery/view.php?widget=41595', 'BOINC Statistics', '', diff --git a/lib/app_ipc.cpp b/lib/app_ipc.cpp index c7f0a7280d..8f55b7c9ce 100644 --- a/lib/app_ipc.cpp +++ b/lib/app_ipc.cpp @@ -100,6 +100,7 @@ void APP_INIT_DATA::copy(const APP_INIT_DATA& a) { teamid = a.teamid; hostid = a.hostid; slot = a.slot; + client_pid = a.client_pid; user_total_credit = a.user_total_credit; user_expavg_credit = a.user_expavg_credit; host_total_credit = a.host_total_credit; @@ -193,6 +194,7 @@ int write_init_data_file(FILE* f, APP_INIT_DATA& ai) { #endif fprintf(f, "%d\n" + "%d\n" "%f\n" "%f\n" "%d\n" @@ -215,6 +217,7 @@ int write_init_data_file(FILE* f, APP_INIT_DATA& ai) { "%f\n" "%d\n", ai.slot, + ai.client_pid, ai.wu_cpu_time, ai.starting_elapsed_time, ai.using_sandbox?1:0, @@ -266,6 +269,7 @@ void APP_INIT_DATA::clear() { strcpy(result_name, ""); strcpy(authenticator, ""); slot = 0; + client_pid = 0; user_total_credit = 0; user_expavg_credit = 0; host_total_credit = 0; @@ -371,6 +375,7 @@ int parse_init_data_file(FILE* f, APP_INIT_DATA& ai) { if (xp.parse_int("shm_key", ai.shmem_seg_name)) continue; #endif if (xp.parse_int("slot", ai.slot)) continue; + if (xp.parse_int("client_pid", ai.client_pid)) continue; if (xp.parse_double("user_total_credit", ai.user_total_credit)) continue; if (xp.parse_double("user_expavg_credit", ai.user_expavg_credit)) continue; if (xp.parse_double("host_total_credit", ai.host_total_credit)) continue; diff --git a/lib/app_ipc.h b/lib/app_ipc.h index 80684c628c..ccf1038dab 100644 --- a/lib/app_ipc.h +++ b/lib/app_ipc.h @@ -169,6 +169,7 @@ struct APP_INIT_DATA { char result_name[256]; char authenticator[256]; int slot; + int client_pid; double user_total_credit; double user_expavg_credit; double host_total_credit; diff --git a/lib/proc_control.cpp b/lib/proc_control.cpp index ac7777189e..83ca3d2c43 100644 --- a/lib/proc_control.cpp +++ b/lib/proc_control.cpp @@ -166,7 +166,8 @@ void kill_descendants() { kill_all(descendants); } #else -// Same, but if child_pid is nonzero, give it a chance to exit gracefully on Unix +// Same, but if child_pid is nonzero, +// give it a chance to exit gracefully on Unix // void kill_descendants(int child_pid) { vector descendants;