From 7d96f35ba383f34f789d0e2a087ed2b09a20c649 Mon Sep 17 00:00:00 2001 From: Bruce Allen Date: Sat, 8 Jan 2005 01:17:51 +0000 Subject: [PATCH] Reinhard has done a substantial amount of work in the past couple of weeks to fix a number of bugs and problems in the X-windows and/or Mac graphics event loop, and with lockfile handling problems seen on a number of Unix file systems. A brief summary of the changes is: lockfile: replace calls to exit() by boinc_finish() + make boinc_finish() remove the lockfile graphics-eventloop: some re-structuring and simplification to make handling of glut-exits/abort-signals more robust. Eric, if you could test this under Solaris we'd be very grateful. svn path=/trunk/boinc/; revision=5025 --- api/boinc_api.C | 33 +++- api/graphics_impl.C | 9 +- api/x_opengl.C | 382 +++++++++++++++++++++++++++++--------------- checkin_notes | 22 +++ lib/diagnostics.C | 4 +- lib/filesys.C | 38 ++++- 6 files changed, 343 insertions(+), 145 deletions(-) diff --git a/api/boinc_api.C b/api/boinc_api.C index fb148c0d90..2c8e19366c 100644 --- a/api/boinc_api.C +++ b/api/boinc_api.C @@ -138,7 +138,7 @@ int boinc_init_options_general(BOINC_OPTIONS& opt) { } if (retval) { fprintf(stderr, "Can't acquire lockfile - exiting\n"); - exit(0); + boinc_finish(0); // not un-recoverable ==> status=0 } } @@ -162,7 +162,7 @@ int boinc_init_options_general(BOINC_OPTIONS& opt) { fprintf(stderr, "Core client has wrong major version: wanted %d, got %d\n", BOINC_MAJOR_VERSION, aid.core_version/100 ); - exit(ERR_MAJOR_VERSION); + boinc_finish(ERR_MAJOR_VERSION); // un-recoverable==> exit with nonzero status } retval = setup_shared_mem(); if (retval) { @@ -203,7 +203,16 @@ static void send_trickle_up_msg() { } } + +// NOTE: a non-zero status tells a running client that we're exiting with +// and "unrecoverable error", which will be reported back to server. +// Therefore, on "recoverable errors", use exit-status == 0 ! int boinc_finish(int status) { + + // remove the lockfile + if ( boinc_delete_file (LOCKFILE) != 0) + perror ("boinc_finish(): failed to remove lockfile"); + if (options.send_status_msgs) { boinc_calling_thread_cpu_time(last_checkpoint_cpu_time); last_checkpoint_cpu_time += aid.wu_cpu_time; @@ -225,7 +234,21 @@ int boinc_finish(int status) { aid.wu_cpu_time = last_checkpoint_cpu_time; boinc_write_init_data_file(); } + + // on Linux < 2.6, probably due to non-POSIX LinuxThreads, _Exit() fails to + // shut down the graphics-thread properly, while exit() does the job and does _NOT_ + // seem to get tangled in exit-atexit loops... +#ifdef __linux__ exit(status); +#else + // on Mac (and supposedly other systems with POSIX-complian thread-implementations?), + // calling exit() can lead to infinite exit-atexit loops, while _Exit() seems to behave nicely + // This is not pretty but unless someone finds a cleaner solution, we handle the two cases + // separately based on these observations + _Exit(status); +#endif + + fprintf(stderr, "..exit() or _Exit() returned... this is totally weird!!\n"); return 0; } @@ -400,7 +423,7 @@ static void handle_process_control_msg() { break; } if (match_tag(buf, "")) { - exit(0); + boinc_finish(0); // NOTE: exit-status = 0 ! } } boinc_sleep(1.0); @@ -422,7 +445,7 @@ static void handle_process_control_msg() { if (match_tag(buf, "")) { boinc_status.quit_request = true; if (options.direct_process_action) { - exit(0); + boinc_finish(0); // NOTE: exit-status == 0! } } } @@ -467,7 +490,7 @@ static void worker_timer(int a) { now - (heartbeat_giveup_time - HEARTBEAT_GIVEUP_PERIOD) ); if (options.direct_process_action) { - exit(0); + boinc_finish(0); // NOTE: exit-status == 0! (recoverable error) } else { boinc_status.no_heartbeat = true; } diff --git a/api/graphics_impl.C b/api/graphics_impl.C index a2238e88c9..f02f42ebf7 100755 --- a/api/graphics_impl.C +++ b/api/graphics_impl.C @@ -61,6 +61,11 @@ HANDLE hQuitEvent; BOINC_MAIN_STATE* g_bmsp; static WORKER_FUNC_PTR worker_main; +#ifndef _WIN32 +pthread_t worker_thread; +pthread_t graphics_thread; +#endif + #ifdef _WIN32 DWORD WINAPI foobar(LPVOID) { #else @@ -117,7 +122,6 @@ int start_worker_thread(WORKER_FUNC_PTR _worker_main) { #else - pthread_t worker_thread; pthread_attr_t worker_thread_attr; sched_param param; int retval, currentpolicy, minpriority; @@ -145,6 +149,9 @@ int start_worker_thread(WORKER_FUNC_PTR _worker_main) { retval = pthread_attr_setschedparam(&worker_thread_attr, ¶m); if (retval) return ERR_THREAD; + // initialze thread-id of calling thread (which is the graphics-thread!) + graphics_thread = pthread_self(); + retval = pthread_create(&worker_thread, &worker_thread_attr, foobar, 0); if (retval) return ERR_THREAD; pthread_attr_destroy( &worker_thread_attr ); diff --git a/api/x_opengl.C b/api/x_opengl.C index bae6f1d32b..4aca40087e 100644 --- a/api/x_opengl.C +++ b/api/x_opengl.C @@ -8,6 +8,7 @@ #include "app_ipc.h" #include "util.h" +#include "filesys.h" #include "boinc_gl.h" #include "graphics_api.h" @@ -23,9 +24,21 @@ static int acked_graphics_mode; static int xpos = 100, ypos = 100; static int clicked_button; static int win=0; +static bool glut_is_initialized = false; +static jmp_buf jbuf; // longjump/setjump for exit/signal handler +static struct sigaction original_signal_handler; // store previous ABRT signal-handler static void set_mode(int mode); +static void restart(void); +static void boinc_glut_init(void); + static APP_INIT_DATA aid; +extern pthread_t graphics_thread; // thread info +extern pthread_t worker_thread; + + +void app_debug_msg (char *fmt, ...); + // This callback is invoked when a user presses a key. // void keyboardD(unsigned char key, int x, int y) { @@ -78,56 +91,81 @@ static void maybe_render() { } } -static void close_func() { - if (boinc_is_standalone()) { - exit(0); - } else { - set_mode(MODE_HIDE_GRAPHICS); - } -} +static void make_new_window(int mode) +{ + if ( (mode != MODE_WINDOW) && (mode != MODE_FULLSCREEN) ) // nothing to be done here + return; -static void make_new_window(int mode){ - if (mode == MODE_WINDOW || mode == MODE_FULLSCREEN){ - glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH); - glutInitWindowPosition(xpos, ypos); - glutInitWindowSize(600, 400); - g_bmsp->boinc_get_init_data_hook(aid); - if (!strlen(aid.app_name)) { - strcpy(aid.app_name, "BOINC Application"); - } - win = glutCreateWindow(aid.app_name); + if ( ! glut_is_initialized ) //initialize glut if necessary + boinc_glut_init(); - glutReshapeFunc(app_graphics_resize); - glutKeyboardFunc(keyboardD); - glutKeyboardUpFunc(keyboardU); - glutMouseFunc(mouse_click); - glutMotionFunc(mouse_click_move); - glutDisplayFunc(maybe_render); - - app_graphics_init(); - glEnable(GL_DEPTH_TEST); - - if (mode == MODE_FULLSCREEN) { - glutFullScreen(); - } - } -} + app_debug_msg("make_new_window(): now calling glutCreateWindow(%s)...\n", aid.app_name); + win = glutCreateWindow(aid.app_name); + app_debug_msg("glutCreateWindow() succeeded. win = %d\n", win); + // installing callbacks for the current window + glutReshapeFunc(app_graphics_resize); + glutKeyboardFunc(keyboardD); + glutKeyboardUpFunc(keyboardU); + glutMouseFunc(mouse_click); + glutMotionFunc(mouse_click_move); + glutDisplayFunc(maybe_render); + glEnable(GL_DEPTH_TEST); + + app_graphics_init(); + + if (mode == MODE_FULLSCREEN) + glutFullScreen(); + + return; +} // make_new_window() + +// initialized glut and screensaver-graphics +// this should only called once, even if restarted by user-exit +static void +boinc_glut_init(void) +{ + char* args[2] = {"screensaver", NULL}; + int one=1; + app_debug_msg("Calling glutInit()... \n"); + glutInit (&one, args); + app_debug_msg("...survived glutInit(). \n"); + + glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGB | GLUT_DEPTH); + glutInitWindowPosition(xpos, ypos); + glutInitWindowSize(600, 400); + g_bmsp->boinc_get_init_data_hook(aid); + if (!strlen(aid.app_name)) + strcpy(aid.app_name, "BOINC Application"); + + glut_is_initialized = true; + + return; + +} // boinc_glut_init() + +// Destroy current window if any void KillWindow() { if (win) { - glutDestroyWindow(win); - win = 0; + int oldwin = win; + win = 0; // set this to 0 FIRST to avoid recursion if the following call fails. + glutDestroyWindow(oldwin); } } void set_mode(int mode) { if (mode == current_graphics_mode) return; + app_debug_msg("set_mode(%d): current_mode = %d. Calling KillWindow(): win = %d\n", mode, current_graphics_mode, win); KillWindow(); + app_debug_msg("...KillWindow() survived.\n"); current_graphics_mode = mode; if (mode != MODE_HIDE_GRAPHICS) { - make_new_window(mode); + app_debug_msg("set_mode(): Calling make_new_window(%d)\n", mode); + make_new_window(mode); + app_debug_msg("...make_new_window() survived.\n"); } -} + return; +} // set_mode() static void wait_for_initial_message() { (*g_bmsp->app_client_shmp)->shm->graphics_reply.send_msg( @@ -140,7 +178,8 @@ static void wait_for_initial_message() { } sleep(1); } -} + return; +} // wait_for_initial_message() static void timer_handler(int) { char buf[MSG_CHANNEL_SIZE]; @@ -178,39 +217,66 @@ static void timer_handler(int) { } maybe_render(); glutTimerFunc(TIMER_INTERVAL_MSEC, timer_handler, 0); -} -static jmp_buf jbuf; -static pthread_t graphics_thread; -static struct sigaction original_signal_handler; -static bool tried_to_install_abort_handler=false; + return; +} // timer_handler() -void restart() { - if (pthread_equal(pthread_self(), graphics_thread)) { - atexit(restart); - longjmp(jbuf, 1); - } -} +// exit handler: really only meant to handle exits() called by GLUT... +void restart() +{ + // don't do anything except when exit is called from the graphics-thread + if ( !pthread_equal(pthread_self(), graphics_thread) ) + app_debug_msg ("exit() was called from worker-thread\n"); + else + { + app_debug_msg ("exit() called from graphics-thread.\n"); -// we will only arrive here if the signal handler was sucessfully -// installed. If we are in the graphics thread, we just return back + // we're standalone, a glut-window was open: user must have pressed 'close', so we exit + if (boinc_is_standalone() && glut_is_initialized && win) + { + app_debug_msg("Open glut-window found... means we're exiting now.\n"); + if ( boinc_delete_file(LOCKFILE) != 0) + perror ("Failed to remove lockfile..\n"); + return; + } + + // re-install the exit-handler to catch glut's notorious exits()... + atexit(restart); + // jump back to entry-point in xwin_graphics_event_loop(); + app_debug_msg( "Jumping back to event_loop.\n"); + longjmp(jbuf, 1); + } // if in graphics-thread + + return; +} // restart() + +// deal with ABORT's, typically raised by GLUT +// If we are in the graphics thread, we just return back // into xwin_graphics_event_loop. If we are in the main thread, then // restore the old signal handler and raise SIGABORT. -void restart_sig(int signal_number /* not used, always == SIGABRT */) { - if (pthread_equal(pthread_self(), graphics_thread)) { - // alternative approach is for the signal hander to call exit(). - fprintf(stderr, "Caught SIGABRT in graphics thread\n"); - longjmp(jbuf, 1); - } - else { - // In non-graphics thread: use original signal handler - fprintf(stderr, "Caught SIGABRT in non-graphics thread\n"); - if (sigaction(SIGABRT, &original_signal_handler, NULL)) { - perror("Unable to restore SIGABRT signal handler in non-graphics thread\n"); - // what to do? call abort(3)?? call exit(nonzero)??? - // boinc_finish()???? Here we just return. +void restart_sig(int signal_number) +{ + if ( pthread_equal(pthread_self(), graphics_thread) ) + { + // alternative approach is for the signal hander to call exit(). + fprintf(stderr, "Caught SIGABRT in graphics thread\n"); + app_debug_msg ("Caught SIGABRT in graphics thread. Jumping back to xwin_graphics_event_loop()...\n"); + // jump back to entry-point in xwin_graphics_event_loop() + longjmp(jbuf, 1); + } // if ABRT raised in graphics-thread + else + { + // In non-graphics thread: use original signal handler + fprintf(stderr, "Caught SIGABRT in non-graphics thread\n"); + app_debug_msg ("Caught SIGABRT in non-graphics thread. Trying to call previous ABRT-handler...\n"); + if (sigaction(SIGABRT, &original_signal_handler, NULL)) + { + perror("Unable to restore SIGABRT signal handler in non-graphics thread\n"); + // what to do? call abort(3)?? call exit(nonzero)??? + // Here we just return. } - else { + else + { // we could conceivably try to call the old signal handler // directly. But this means checking how its flags/masks // are set, making sure it's not NULL (for no action) and so @@ -220,85 +286,139 @@ void restart_sig(int signal_number /* not used, always == SIGABRT */) { // executing? raise(SIGABRT); } - return; - } -} + } // if ABRT raised in non-graphics thread + return; +} // restart_sig() -void xwin_graphics_event_loop() { - char* args[] = {"foobar", 0}; - int one=1; - static bool glut_inited = false; - int restarted; - int retry_interval_sec=32; - struct sigaction sa; - sa.sa_handler = restart_sig; - sa.sa_flags = SA_RESTART; +// graphics thread main-function +// NOTE: this should only be called *ONCE* by an application +void xwin_graphics_event_loop() +{ + int restarted; - graphics_thread = pthread_self(); + app_debug_msg ("Direct call to xwin_graphics_event_loop()\n"); - atexit(restart); + struct sigaction sa; + sa.sa_handler = restart_sig; + sa.sa_flags = SA_RESTART; -try_again: - restarted = setjmp(jbuf); + // install signal handler to catch abort() from GLUT. Note that + // signal handlers are global to ALL threads, so the signal + // handler that we record here in &original_signal_handler is the + // previous one that applied to BOTH threads + if (sigaction(SIGABRT, &sa, &original_signal_handler)) { + perror("unable to install signal handler to catch SIGABORT"); + } - // install signal handler to catch abort() from GLUT. Note that - // signal handlers are global to ALL threads, so the signal - // handler that we record here in &original_signal_handler is the - // previous one that applied to BOTH threads - if (!tried_to_install_abort_handler) { - tried_to_install_abort_handler=true; - if (sigaction(SIGABRT, &sa, &original_signal_handler)) { - perror("unable to install signal handler to catch SIGABORT"); + // handle glut's notorious exits().. + atexit(restart); + + // THIS IS THE re-entry point at exit or ABORT in the graphics-thread + restarted = setjmp(jbuf); + + if (restarted) + app_debug_msg("xwin_graphics_event_loop() restarted at re-entry point\n"); + + if ( boinc_is_standalone() ) + { + if (restarted && !win) // now window was yet opened: just go to sleep to avoid more trouble... + { + app_debug_msg("Graphics exited before glut-window opened... continuing without graphics\n"); + while(1) sleep(1); + return; // should never arrive here + } // if restarted & no open windows + + if ( !restarted ) // first time through here + { + // open the graphics-window + set_mode(MODE_WINDOW); + glutTimerFunc(TIMER_INTERVAL_MSEC, timer_handler, 0); } - } - - if (restarted) { - //fprintf(stderr, "graphics thread restarted\n"); fflush(stderr); - if (glut_inited) { - // here the user must have closed the window, - // which causes GLUT to call exit(). - // + } // if standalone-mode + else // under BOINC: start with graphics hidden and wait for client to tell us otherwise + { + + // Mac's GLUT is sufficiently "broken" that I simply don't see a way of restarting + // a GLUT-window after user exited once (e.g. using M-Q, or "Quit einstein" in menu). + // glut will segfault or similar at the attempt, so we rather don't try this in the first place. + // ==> If user exited glut-window once on mac, we put this thread to sleep + // (until somebody has a better idea / understanding of Mac's glut #ifdef __APPLE_CC__ - glutInit(&one, args); + if (restarted) + { + app_debug_msg("Sorry. I don't know a way of restarting glut on the Mac. Putting graphics to sleep now.\n"); + while(1) sleep(1); + return; + } #endif + // on Linux on the other hand, glut seems to be happily restartable after exit was caught, + // so we simply continue as if it was the first time...: i.e. wait for client-message, then do something. + app_debug_msg ("Switching graphics-mode to MODE_HIDE_GRAPHICS\n"); + set_mode(MODE_HIDE_GRAPHICS); + while ( current_graphics_mode == MODE_HIDE_GRAPHICS ) + { + app_debug_msg ("Graphics-thread now waiting for client-message...\n"); + wait_for_initial_message(); + app_debug_msg ("got a graphics-message from client... \n"); + timer_handler(0); + } + } // ! boinc_is_standalone() + + // ok we should be ready & initialized by now to call glutMainLoop() + app_debug_msg ("now calling glutMainLoop()...\n"); + glutMainLoop(); + app_debug_msg("...glutMainLoop() returned!! This should never happen...\n"); -#if 0 - // NOTE: currently when run standalone, killing the - // graphics window does NOT kill the application. If it's - // desired to kill the application in standalone mode, - // insert something here like this - if (boinc_is_standalone()) - exit(0); -#endif + return; + +} // xwin_graphics_event_loop() - set_mode(MODE_HIDE_GRAPHICS); - } else { - // here glutInit() must have failed and called exit() or - // raised SIGABRT. - // - retry_interval_sec *= 2; - sleep(retry_interval_sec); - goto try_again; - } - } else { - glutInit(&one, args); - glut_inited = true; - if (boinc_is_standalone()) { - set_mode(MODE_WINDOW); - } else { - // don't proceed until we've opened a window; - // otherwise glutMainLoop() will abort - // - while (current_graphics_mode == MODE_HIDE_GRAPHICS) { - wait_for_initial_message(); - timer_handler(0); - } - } + +/* RP's little APP-debug output function: */ +void +app_debug_msg (char *fmt, ...) +{ +#ifndef _DEBUG + return; +#else + va_list args; + char buffer[5000+1]; + static char *boinc_slotdir = NULL; + va_start (args, fmt); + + if (boinc_slotdir == NULL) + { + char *tmp, *ptr; + if ( (tmp = getcwd(NULL, 0)) == NULL) + { + perror ("failed to get working directory using getcwd()"); + boinc_slotdir = (char*)calloc(1, 20); + strcpy( boinc_slotdir, "[unknown]"); + } + else + { + if ( (ptr = strrchr(tmp, '/')) == NULL) + ptr = tmp; + else + ptr ++; + + boinc_slotdir = (char*)calloc(1, strlen(ptr)+1); + strcpy(boinc_slotdir, ptr); + free (tmp); + } /* got cwd */ } - glutTimerFunc(TIMER_INTERVAL_MSEC, timer_handler, 0); - glutMainLoop(); -} + + vsnprintf (buffer, 5000, fmt, args); + fprintf (stdout, "APP '%s' DEBUG: ", boinc_slotdir); + fprintf (stdout, buffer); + fflush (stdout); + + va_end (args); +#endif // defined _DEBUG +} // app_debug_msg() + + const char *BOINC_RCSID_c457a14644 = "$Id$"; diff --git a/checkin_notes b/checkin_notes index 3fbb06a00f..69ac3813c3 100755 --- a/checkin_notes +++ b/checkin_notes @@ -22243,3 +22243,25 @@ David 7 Jan 2005 ops/ db_action.php db_form.ph + +Bruce 7 Jan 2005 + + - Reinhard has done a substantial amount of work in the past + couple of weeks to fix a number of bugs and problems in the + X-windows and/or Mac graphics event loop, and with lockfile + handling problems seen on a number of Unix file systems. A + brief summary of the changes is: + + lockfile: replace calls to exit() by boinc_finish() + make + boinc_finish() remove the lockfile + + graphics-eventloop: some re-structuring and simplification to + make handling of glut-exits/abort-signals more robust. Eric, + if you could test this under Solaris we'd be very grateful. + + api/ + boinc_api.C + graphics_impl.C + x_opengl.C + diagnostics.C + filesys.C diff --git a/lib/diagnostics.C b/lib/diagnostics.C index 16a6c99b98..c91aa8e42f 100644 --- a/lib/diagnostics.C +++ b/lib/diagnostics.C @@ -443,7 +443,7 @@ void boinc_set_signal_handler(int sig, void(*handler)(int)) { sigaction(sig, NULL, &temp); if (temp.sa_handler != SIG_IGN) { temp.sa_handler = handler; - sigemptyset(&temp.sa_mask); + // sigemptyset(&temp.sa_mask); sigaction(sig, &temp, NULL); } #else @@ -463,7 +463,7 @@ void boinc_set_signal_handler_force(int sig, void(*handler)(int)) { struct sigaction temp; sigaction(sig, NULL, &temp); temp.sa_handler = handler; - sigemptyset(&temp.sa_mask); + // sigemptyset(&temp.sa_mask); sigaction(sig, &temp, NULL); #else void (*temp)(int); diff --git a/lib/filesys.C b/lib/filesys.C index 0d2ae32b90..5a2d251d31 100755 --- a/lib/filesys.C +++ b/lib/filesys.C @@ -472,18 +472,44 @@ int lock_file(char* filename) { 0, 0, OPEN_ALWAYS, FILE_ATTRIBUTE_NORMAL, 0 ); if (hfile == INVALID_HANDLE_VALUE) retval = 1; - else retval = 0; + else + // NOTE: contrary to unix, we close the file on windows, so that we can remove it later in boinc_finish(). + // there seems to be no real file-locking done here, so this should be ok... + if (CloseHandle(hfile)) + retval = 0; + else + retval = 1; // some systems have both! #elif defined(HAVE_LOCKF) && !defined(__APPLE__) - int lock = open(filename, O_WRONLY|O_CREAT, 0644); - retval = lockf(lock, F_TLOCK, 1); + int fd = open(filename, O_WRONLY|O_CREAT, 0644); + retval = lockf(fd, F_TLOCK, 0); +#ifndef NDEBUG + if ( retval != 0 ) + { + struct flock lck; + fprintf(stdout, "DEBUG: Failed to get lockf on %s. Error = %d, %s\n", filename, errno, strerror(errno)); + lck.l_whence = 0; + lck.l_start = 0L; + lck.l_len = 0L; + lck.l_type = F_WRLCK; + (void) fcntl(fd, F_GETLK, &lck); + if (lck.l_type != F_UNLCK) + fprintf(stdout, "DEBUG: Lock-info: pid = %d type = %c start = %8ld len = %8ld\n", lck.l_pid, + (lck.l_type == F_WRLCK) ? 'W' : 'R', lck.l_start, lck.l_len); + else + fprintf(stdout, "DEBUG: fcntl() says it's unlocked though....strange.\n"); + + fflush(stdout); + } +#endif // defined NDEBUG + #elif HAVE_FLOCK - int lock = open(filename, O_WRONLY|O_CREAT, 0644); - retval = flock(lock, LOCK_EX|LOCK_NB); + int fd = open(filename, O_WRONLY|O_CREAT, 0644); + retval = flock(fd, LOCK_EX|LOCK_NB); // must leave fd open #else - no file lock mechanism + no file lock mechanism; #endif return retval;