From 7bb1b1a0d49f5fc9c9e290c892f29d0b11dd509b Mon Sep 17 00:00:00 2001 From: Charlie Fenton Date: Mon, 25 Jun 2007 11:46:15 +0000 Subject: [PATCH] CC: Add logic to scheduler, etc. for systems which limit the number of shared memory segments per process and / or system-wide. svn path=/trunk/boinc/; revision=13015 --- checkin_notes | 51 ++++++++++++++++++++++++++++++++++++++ client/app.C | 11 ++++++-- client/app.h | 1 + client/app_control.C | 2 +- client/app_start.C | 15 ++++++++--- client/client_state.C | 8 +++++- client/client_state.h | 1 + client/cpu_sched.C | 18 +++++++++++--- client/log_flags.C | 2 ++ client/log_flags.h | 1 + clientgui/ViewWork.cpp | 2 ++ clientgui/ViewWorkGrid.cpp | 2 ++ lib/gui_rpc_client.h | 1 + lib/gui_rpc_client_ops.C | 2 ++ lib/shmem.C | 22 ++++++++++++++++ lib/shmem.h | 6 +++++ 16 files changed, 133 insertions(+), 12 deletions(-) diff --git a/checkin_notes b/checkin_notes index 7f1caa83ae..4af1913ee3 100755 --- a/checkin_notes +++ b/checkin_notes @@ -6636,3 +6636,54 @@ Charlie 24 June 2007 mac_build/ boinc.xcodeproj/ project.pbxproj + +Charlie 25 June 2007 + CC: Add logic to scheduler, etc. for systems which limit the number + of shared memory segments per process and / or system-wide. + Mac OS X has default limits of 8 shmem segments per process, 32 + max system wide. + - Don't treat failure to allocate & attach to shared memory as a + computation error. + - Destroy shared memory segments associated with all tasks which + exit, not just those with a finished file. This frees up a + segment for use by another task. + - If we run out of shared memory, start a 10 second timer during + which don't try to run tasks unless they are already attached + to shared memory (suspended tasks left in memory qualify.) + - If a task is waiting for shared memory to become available, + display this in the Manager's tasks pane. + - Added a new log flag stress_shmem_debug for testing scheduling + logic when we run out of shared memory. 
+ + QUESTIONS for further development: + - Note that it is quite easy to run out of shared memory segments if + the leave-suspended-tasks-in-memory option is set. (The shared + memory channel must remain active for heartbeats and to tell the + task when to resume.) + - Should we override this option if we run out of shared memory + segments? + - What about tasks left in RAM because they don't checkpoint? + - If BOINC crashes, its shared memory is not removed. In some + cases, when BOINC is relaunched it will use the same key and + so can re-attach to the orphaned segments, but this won't always + happen. This can cause problems with the system-wide limits. + We can't mark the segment for removal when we create it, because + doing so (by calling shmctl with IPC_RMID) would remove its key + so the project application couldn't attach to it. + But we could do this safely after the first successful response + from the project application. Should we make this change? + + client/ + app.C,h + app_control.C + app_start.C + client_state.C,h + cpu_sched.C + log_flags.C,h + clientgui/ + ViewWork.cpp + ViewWorkGrid.cpp + lib/ + shmem.C,h + gui_rpc_client.h + gui_rpc_client_ops.C diff --git a/client/app.C b/client/app.C index 4fd9f6c127..2bdecfc160 100644 --- a/client/app.C +++ b/client/app.C @@ -111,6 +111,7 @@ ACTIVE_TASK::ACTIVE_TASK() { have_trickle_down = false; send_upload_file_status = false; too_large = false; + needs_shmem = false; want_network = 0; memset(&procinfo, 0, sizeof(procinfo)); #ifdef _WIN32 @@ -167,10 +168,10 @@ void ACTIVE_TASK::close_process_handles() { // call this when a process has exited and we're not going to restart it // void ACTIVE_TASK::cleanup_task() { +#ifdef _WIN32 if (gstate.exit_after_finish) { exit(0); } -#ifdef _WIN32 // detach from shared mem. 
// This will destroy shmem seg since we're the last attachment // @@ -196,6 +197,10 @@ void ACTIVE_TASK::cleanup_task() { } app_client_shm.shm = NULL; } + + if (gstate.exit_after_finish) { + exit(0); + } #endif } @@ -459,6 +464,7 @@ int ACTIVE_TASK::write_gui(MIOFILE& fout) { " %f\n" " %f\n" " %f\n" + "%s" "%s", task_state(), app_version->version_num, @@ -470,7 +476,8 @@ int ACTIVE_TASK::write_gui(MIOFILE& fout) { procinfo.working_set_size, procinfo.working_set_size_smoothed, procinfo.page_fault_rate, - too_large?" \n":"" + too_large?" \n":"", + needs_shmem?" \n":"" ); if (strlen(app_version->graphics_exec_path)) { fout.printf( diff --git a/client/app.h b/client/app.h index 1b2c6e476b..b8e3af043c 100644 --- a/client/app.h +++ b/client/app.h @@ -102,6 +102,7 @@ public: bool have_trickle_down; bool send_upload_file_status; bool too_large; // working set too large to run now + bool needs_shmem; // waiting for a free shared memory segment int want_network; // This task wants to do network comm (for F@h) // this is passed via share-memory message (app_status channel) diff --git a/client/app_control.C b/client/app_control.C index 561338c575..be97f2464e 100644 --- a/client/app_control.C +++ b/client/app_control.C @@ -315,8 +315,8 @@ void ACTIVE_TASK::handle_exited_app(int stat) { #endif } + cleanup_task(); // Always release shared memory if (!will_restart) { - cleanup_task(); copy_output_files(); read_stderr_file(); clean_out_dir(slot_dir); diff --git a/client/app_start.C b/client/app_start.C index 6b94878cab..ef87e382f8 100644 --- a/client/app_start.C +++ b/client/app_start.C @@ -516,8 +516,7 @@ int ACTIVE_TASK::start(bool first_time) { shmem_seg_name, sizeof(SHARED_MEM), (void**)&app_client_shm.shm ); if (retval) { - sprintf(buf, "Can't create shared memory: %s", boincerror(retval)); - goto error; + return retval; } } app_client_shm.reset_msgs(); @@ -580,9 +579,14 @@ int ACTIVE_TASK::start(bool first_time) { (void**)&app_client_shm.shm ); if (retval) { - sprintf(buf, 
"Can't create shared memory: %s", boincerror(retval)); - goto error; + needs_shmem = true; + // Assume no additional shared memory is available for next 10 seconds + // (run only tasks which are already attached to shared memory). + gstate.retry_shmem_time = gstate.now + 10.0; + destroy_shmem(shmem_seg_name); // Don't leave an orphan shmem segment + return retval; } + needs_shmem = false; } app_client_shm.reset_msgs(); @@ -710,6 +714,9 @@ int ACTIVE_TASK::resume_or_start(bool first_time) { retval = start(false); str = "Restarting"; } + if ((retval == ERR_SHMGET) || (retval == ERR_SHMAT)) { + return retval; + } if (retval) { set_task_state(PROCESS_COULDNT_START, "resume_or_start1"); return retval; diff --git a/client/client_state.C b/client/client_state.C index 4495066ac4..1cac0c49c1 100644 --- a/client/client_state.C +++ b/client/client_state.C @@ -46,6 +46,7 @@ #include "network.h" #include "http_curl.h" #include "client_msgs.h" +#include "shmem.h" #include "client_state.h" using std::max; @@ -104,6 +105,7 @@ CLIENT_STATE::CLIENT_STATE() { debt_interval_start = 0; total_wall_cpu_time_this_debt_interval = 0; + retry_shmem_time = 0; must_schedule_cpus = true; must_enforce_cpu_schedule = true; no_gui_rpc = false; @@ -359,7 +361,11 @@ int CLIENT_STATE::init() { auto_update.init(); http_ops->cleanup_temp_files(); - + + if (log_flags.stress_shmem_debug) { + stress_shmem(); + } + initialized = true; return 0; } diff --git a/client/client_state.h b/client/client_state.h index 2ce4e313d5..2c5671214e 100644 --- a/client/client_state.h +++ b/client/client_state.h @@ -268,6 +268,7 @@ private: void make_running_task_heap(vector&); void print_deadline_misses(); public: + double retry_shmem_time; inline double work_buf_min() { return global_prefs.work_buf_min_days * 86400; } diff --git a/client/cpu_sched.C b/client/cpu_sched.C index 693fd184ff..e1ec09914e 100644 --- a/client/cpu_sched.C +++ b/client/cpu_sched.C @@ -506,6 +506,14 @@ void CLIENT_STATE::schedule_cpus() { continue; 
} else { atp->too_large = false; + + if (gstate.retry_shmem_time > gstate.now) { /* shmem back-off timer (set in ACTIVE_TASK::start) still running: only schedule tasks already attached to shared memory */ + if (atp->app_client_shm.shm == NULL) { + atp->needs_shmem = true; + continue; + } + atp->needs_shmem = false; + } } ram_left -= atp->procinfo.working_set_size_smoothed; } @@ -902,10 +910,12 @@ bool CLIENT_STATE::enforce_schedule() { atp->scheduler_state == CPU_SCHED_UNINITIALIZED ); if (retval) { - report_result_error( - *(atp->result), "Couldn't start or resume: %d", retval - ); - request_schedule_cpus("start failed"); + if ((retval != ERR_SHMGET) && (retval != ERR_SHMAT)) { + report_result_error( + *(atp->result), "Couldn't start or resume: %d", retval + ); + request_schedule_cpus("start failed"); + } continue; } atp->run_interval_start_wall_time = now; diff --git a/client/log_flags.C b/client/log_flags.C index b37ee6c14b..89bd19555f 100644 --- a/client/log_flags.C +++ b/client/log_flags.C @@ -79,6 +79,7 @@ void LOG_FLAGS::defaults() { mem_usage_debug = false; network_status_debug = false; checkpoint_debug = false; + stress_shmem_debug = false; } // Parse log flag preferences @@ -121,6 +122,7 @@ int LOG_FLAGS::parse(XML_PARSER& xp) { if (xp.parse_bool(tag, "mem_usage_debug", mem_usage_debug)) continue; if (xp.parse_bool(tag, "network_status_debug", network_status_debug)) continue; if (xp.parse_bool(tag, "checkpoint_debug", checkpoint_debug)) continue; + if (xp.parse_bool(tag, "stress_shmem_debug", stress_shmem_debug)) continue; msg_printf(NULL, MSG_USER_ERROR, "Unrecognized tag in %s: <%s>\n", CONFIG_FILE, tag ); diff --git a/client/log_flags.h b/client/log_flags.h index b08df63dd3..631090655d 100644 --- a/client/log_flags.h +++ b/client/log_flags.h @@ -73,6 +73,7 @@ struct LOG_FLAGS { bool mem_usage_debug; // memory usage bool network_status_debug; bool checkpoint_debug; + bool stress_shmem_debug; LOG_FLAGS(); void defaults(); diff --git a/clientgui/ViewWork.cpp b/clientgui/ViewWork.cpp index d926f5f66c..c773daac83 100644 --- a/clientgui/ViewWork.cpp +++ b/clientgui/ViewWork.cpp 
@@ -793,6 +793,8 @@ wxInt32 CViewWork::FormatStatus(wxInt32 item, wxString& strBuffer) const { } else if (result->active_task) { if (result->too_large) { strBuffer = _("Waiting for memory"); + } else if (result->needs_shmem) { + strBuffer = _("Waiting for shared memory"); } else if (result->scheduler_state == CPU_SCHED_SCHEDULED) { strBuffer = _("Running"); } else if (result->scheduler_state == CPU_SCHED_PREEMPTED) { diff --git a/clientgui/ViewWorkGrid.cpp b/clientgui/ViewWorkGrid.cpp index 59bf8dd667..7ca813e776 100644 --- a/clientgui/ViewWorkGrid.cpp +++ b/clientgui/ViewWorkGrid.cpp @@ -712,6 +712,8 @@ wxInt32 CViewWorkGrid::FormatStatus(wxInt32 item, wxString& strBuffer) const { } else if (result->active_task) { if (result->too_large) { strBuffer = _("Waiting for memory"); + } else if (result->needs_shmem) { + strBuffer = _("Waiting for shared memory"); } else if (result->scheduler_state == CPU_SCHED_SCHEDULED) { strBuffer = _("Running"); } else if (result->scheduler_state == CPU_SCHED_PREEMPTED) { diff --git a/lib/gui_rpc_client.h b/lib/gui_rpc_client.h index 1ee6f5fb29..46f9ff5ffd 100644 --- a/lib/gui_rpc_client.h +++ b/lib/gui_rpc_client.h @@ -212,6 +212,7 @@ public: bool supports_graphics; int graphics_mode_acked; bool too_large; + bool needs_shmem; bool edf_scheduled; std::string graphics_exec_path; std::string slot_path; diff --git a/lib/gui_rpc_client_ops.C b/lib/gui_rpc_client_ops.C index 7459468877..736ab556b2 100644 --- a/lib/gui_rpc_client_ops.C +++ b/lib/gui_rpc_client_ops.C @@ -391,6 +391,7 @@ int RESULT::parse(MIOFILE& in) { if (parse_double(buf, "", fraction_done)) continue; if (parse_double(buf, "", estimated_cpu_time_remaining)) continue; if (parse_bool(buf, "too_large", too_large)) continue; + if (parse_bool(buf, "needs_shmem", needs_shmem)) continue; if (parse_bool(buf, "edf_scheduled", edf_scheduled)) continue; if (parse_str(buf, "graphics_exec_path", graphics_exec_path)) continue; if (parse_str(buf, "slot_path", slot_path)) continue; @@ 
-428,6 +429,7 @@ void RESULT::clear() { supports_graphics = false; graphics_mode_acked = 0; too_large = false; + needs_shmem = false; edf_scheduled = false; app = NULL; diff --git a/lib/shmem.C b/lib/shmem.C index 2bdb97bf92..0851a20cdd 100755 --- a/lib/shmem.C +++ b/lib/shmem.C @@ -260,6 +260,28 @@ int print_shmem_info(key_t key) { return 0; } +// For debugging shared memory logic +// For testing on Apple, Linux, UNIX systems with limited number +// of shared memory segments per process and / or system-wide +// Mac OS X has a default limit of 8 segments per process, 32 system-wide +void stress_shmem() { + int retval; + void * shmaddr[8]; + key_t key[] = {'BNC1', 'BNC2', 'BNC3', 'BNC4', 'BNC5', 'BNC6', 'BNC7', 'BNC8' }; + int i, id; + + // Tie up 5 of the 8 shared memory segments each process may have + for (i=0; i<5; i++) { + retval = create_shmem(key[i], 1024, 0, &shmaddr[i]); + if (!retval) { + id = shmget(key[i], 0, 0); + // Mark it for automatic destruction when BOINC exits + if (id >= 0) + retval = shmctl(id, IPC_RMID, 0); + } + } +} + #endif const char *BOINC_RCSID_f835f078de = "$Id$"; diff --git a/lib/shmem.h b/lib/shmem.h index 45fa1a9076..886b4ad2cc 100755 --- a/lib/shmem.h +++ b/lib/shmem.h @@ -64,5 +64,11 @@ extern int detach_shmem(void*); extern int shmem_info(key_t key); +#ifndef __EMX__ +// For testing on Apple, Linux, UNIX systems with limited number +// of shared memory segments per process and / or system-wide +void stress_shmem(void); +#endif + #endif #endif