diff --git a/checkin_notes b/checkin_notes
index 7f1caa83ae..4af1913ee3 100755
--- a/checkin_notes
+++ b/checkin_notes
@@ -6636,3 +6636,54 @@ Charlie 24 June 2007
mac_build/
boinc.xcodeproj/
project.pbxproj
+
+Charlie 25 June 2007
+ CC: Add logic to scheduler, etc. for systems which limit the number
+ of shared memory segments per process and / or system-wide.
+ Mac OS X has default limits of 8 shmem segments per process, 32
+ max system wide.
+ - Don't treat failure to allocate & attach to shared memory as a
+ computation error.
+ - Destroy shared memory segments associated with all tasks which
+ exit, not just those with a finished file. This frees up a
+ segment for use by another task.
+ - If we run out of shared memory, start a 10 second timer during
+ which we don't try to run tasks unless they are already attached
+ to shared memory (suspended tasks left in memory qualify).
+ - If a task is waiting for shared memory to become available,
+ display this in the Manager's tasks pane.
+ - Added a new log flag stress_shmem_debug for testing scheduling
+ logic when we run out of shared memory.
+
+ QUESTIONS for further development:
+ - Note that it is quite easy to run out of shared memory segments if
+ the leave-suspended-tasks-in-memory option is set. (The shared
+ memory channel must remain active for heartbeats and to tell the
+ task when to resume.)
+ - Should we override this option if we run out of shared memory
+ segments?
+ - What about tasks left in RAM because they don't checkpoint?
+ - If BOINC crashes, its shared memory is not removed. In some
+ cases, when BOINC is relaunched it will use the same key and
+ so can re-attach to the orphaned segments, but this won't always
+ happen. This can cause problems with the system-wide limits.
+ We can't mark the segment for removal when we create it, because
+ doing so (by calling shmctl with IPC_RMID) would remove its key
+ so the project application couldn't attach to it.
+ But we could do this safely after the first successful response
+ from the project application. Should we make this change?
+
+ client/
+ app.C,h
+ app_control.C
+ app_start.C
+ client_state.C,h
+ cpu_sched.C
+ log_flags.C,h
+ clientgui/
+ ViewWork.cpp
+ ViewWorkGrid.cpp
+ lib/
+ shmem.C,h
+ gui_rpc_client.h
+ gui_rpc_client_ops.C
diff --git a/client/app.C b/client/app.C
index 4fd9f6c127..2bdecfc160 100644
--- a/client/app.C
+++ b/client/app.C
@@ -111,6 +111,7 @@ ACTIVE_TASK::ACTIVE_TASK() {
have_trickle_down = false;
send_upload_file_status = false;
too_large = false;
+ needs_shmem = false;
want_network = 0;
memset(&procinfo, 0, sizeof(procinfo));
#ifdef _WIN32
@@ -167,10 +168,10 @@ void ACTIVE_TASK::close_process_handles() {
// call this when a process has exited and we're not going to restart it
//
void ACTIVE_TASK::cleanup_task() {
+#ifdef _WIN32
if (gstate.exit_after_finish) {
exit(0);
}
-#ifdef _WIN32
// detach from shared mem.
// This will destroy shmem seg since we're the last attachment
//
@@ -196,6 +197,10 @@ void ACTIVE_TASK::cleanup_task() {
}
app_client_shm.shm = NULL;
}
+
+ if (gstate.exit_after_finish) {
+ exit(0);
+ }
#endif
}
@@ -459,6 +464,7 @@ int ACTIVE_TASK::write_gui(MIOFILE& fout) {
" %f\n"
" %f\n"
" %f\n"
+ "%s"
"%s",
task_state(),
app_version->version_num,
@@ -470,7 +476,8 @@ int ACTIVE_TASK::write_gui(MIOFILE& fout) {
procinfo.working_set_size,
procinfo.working_set_size_smoothed,
procinfo.page_fault_rate,
- too_large?" \n":""
+ too_large?" \n":"",
+ needs_shmem?" \n":""
);
if (strlen(app_version->graphics_exec_path)) {
fout.printf(
diff --git a/client/app.h b/client/app.h
index 1b2c6e476b..b8e3af043c 100644
--- a/client/app.h
+++ b/client/app.h
@@ -102,6 +102,7 @@ public:
bool have_trickle_down;
bool send_upload_file_status;
bool too_large; // working set too large to run now
+ bool needs_shmem; // waiting for a free shared memory segment
int want_network;
// This task wants to do network comm (for F@h)
// this is passed via share-memory message (app_status channel)
diff --git a/client/app_control.C b/client/app_control.C
index 561338c575..be97f2464e 100644
--- a/client/app_control.C
+++ b/client/app_control.C
@@ -315,8 +315,8 @@ void ACTIVE_TASK::handle_exited_app(int stat) {
#endif
}
+ cleanup_task(); // Always release shared memory
if (!will_restart) {
- cleanup_task();
copy_output_files();
read_stderr_file();
clean_out_dir(slot_dir);
diff --git a/client/app_start.C b/client/app_start.C
index 6b94878cab..ef87e382f8 100644
--- a/client/app_start.C
+++ b/client/app_start.C
@@ -516,8 +516,7 @@ int ACTIVE_TASK::start(bool first_time) {
shmem_seg_name, sizeof(SHARED_MEM), (void**)&app_client_shm.shm
);
if (retval) {
- sprintf(buf, "Can't create shared memory: %s", boincerror(retval));
- goto error;
+ return retval;
}
}
app_client_shm.reset_msgs();
@@ -580,9 +579,14 @@ int ACTIVE_TASK::start(bool first_time) {
(void**)&app_client_shm.shm
);
if (retval) {
- sprintf(buf, "Can't create shared memory: %s", boincerror(retval));
- goto error;
+ needs_shmem = true;
+ // Assume no additional shared memory is available for next 10 seconds
+ // (run only tasks which are already attached to shared memory).
+ gstate.retry_shmem_time = gstate.now + 10.0;
+ destroy_shmem(shmem_seg_name); // Don't leave an orphan shmem segment
+ return retval;
}
+ needs_shmem = false;
}
app_client_shm.reset_msgs();
@@ -710,6 +714,9 @@ int ACTIVE_TASK::resume_or_start(bool first_time) {
retval = start(false);
str = "Restarting";
}
+ if ((retval == ERR_SHMGET) || (retval == ERR_SHMAT)) {
+ return retval;
+ }
if (retval) {
set_task_state(PROCESS_COULDNT_START, "resume_or_start1");
return retval;
diff --git a/client/client_state.C b/client/client_state.C
index 4495066ac4..1cac0c49c1 100644
--- a/client/client_state.C
+++ b/client/client_state.C
@@ -46,6 +46,7 @@
#include "network.h"
#include "http_curl.h"
#include "client_msgs.h"
+#include "shmem.h"
#include "client_state.h"
using std::max;
@@ -104,6 +105,7 @@ CLIENT_STATE::CLIENT_STATE() {
debt_interval_start = 0;
total_wall_cpu_time_this_debt_interval = 0;
+ retry_shmem_time = 0;
must_schedule_cpus = true;
must_enforce_cpu_schedule = true;
no_gui_rpc = false;
@@ -359,7 +361,11 @@ int CLIENT_STATE::init() {
auto_update.init();
http_ops->cleanup_temp_files();
-
+
+ if (log_flags.stress_shmem_debug) {
+ stress_shmem();
+ }
+
initialized = true;
return 0;
}
diff --git a/client/client_state.h b/client/client_state.h
index 2ce4e313d5..2c5671214e 100644
--- a/client/client_state.h
+++ b/client/client_state.h
@@ -268,6 +268,7 @@ private:
void make_running_task_heap(vector&);
void print_deadline_misses();
public:
+ double retry_shmem_time;
inline double work_buf_min() {
return global_prefs.work_buf_min_days * 86400;
}
diff --git a/client/cpu_sched.C b/client/cpu_sched.C
index 693fd184ff..e1ec09914e 100644
--- a/client/cpu_sched.C
+++ b/client/cpu_sched.C
@@ -506,6 +506,14 @@ void CLIENT_STATE::schedule_cpus() {
continue;
} else {
atp->too_large = false;
+
+ // A failed shared memory allocation sets retry_shmem_time to a
+ // moment in the future.  While that timer is still running
+ // (retry_shmem_time > now), schedule only tasks that are already
+ // attached to a shared memory segment; others are flagged as
+ // waiting for shared memory and skipped.
+ if (gstate.retry_shmem_time > gstate.now) {
+     if (atp->app_client_shm.shm == NULL) {
+         atp->needs_shmem = true;
+         continue;
+     }
+     atp->needs_shmem = false;
+ }
}
ram_left -= atp->procinfo.working_set_size_smoothed;
}
@@ -902,10 +910,12 @@ bool CLIENT_STATE::enforce_schedule() {
atp->scheduler_state == CPU_SCHED_UNINITIALIZED
);
if (retval) {
- report_result_error(
- *(atp->result), "Couldn't start or resume: %d", retval
- );
- request_schedule_cpus("start failed");
+ if ((retval != ERR_SHMGET) && (retval != ERR_SHMAT)) {
+ report_result_error(
+ *(atp->result), "Couldn't start or resume: %d", retval
+ );
+ request_schedule_cpus("start failed");
+ }
continue;
}
atp->run_interval_start_wall_time = now;
diff --git a/client/log_flags.C b/client/log_flags.C
index b37ee6c14b..89bd19555f 100644
--- a/client/log_flags.C
+++ b/client/log_flags.C
@@ -79,6 +79,7 @@ void LOG_FLAGS::defaults() {
mem_usage_debug = false;
network_status_debug = false;
checkpoint_debug = false;
+ stress_shmem_debug = false;
}
// Parse log flag preferences
@@ -121,6 +122,7 @@ int LOG_FLAGS::parse(XML_PARSER& xp) {
if (xp.parse_bool(tag, "mem_usage_debug", mem_usage_debug)) continue;
if (xp.parse_bool(tag, "network_status_debug", network_status_debug)) continue;
if (xp.parse_bool(tag, "checkpoint_debug", checkpoint_debug)) continue;
+ if (xp.parse_bool(tag, "stress_shmem_debug", stress_shmem_debug)) continue;
msg_printf(NULL, MSG_USER_ERROR, "Unrecognized tag in %s: <%s>\n",
CONFIG_FILE, tag
);
diff --git a/client/log_flags.h b/client/log_flags.h
index b08df63dd3..631090655d 100644
--- a/client/log_flags.h
+++ b/client/log_flags.h
@@ -73,6 +73,7 @@ struct LOG_FLAGS {
bool mem_usage_debug; // memory usage
bool network_status_debug;
bool checkpoint_debug;
+ bool stress_shmem_debug;
LOG_FLAGS();
void defaults();
diff --git a/clientgui/ViewWork.cpp b/clientgui/ViewWork.cpp
index d926f5f66c..c773daac83 100644
--- a/clientgui/ViewWork.cpp
+++ b/clientgui/ViewWork.cpp
@@ -793,6 +793,8 @@ wxInt32 CViewWork::FormatStatus(wxInt32 item, wxString& strBuffer) const {
} else if (result->active_task) {
if (result->too_large) {
strBuffer = _("Waiting for memory");
+ } else if (result->needs_shmem) {
+ strBuffer = _("Waiting for shared memory");
} else if (result->scheduler_state == CPU_SCHED_SCHEDULED) {
strBuffer = _("Running");
} else if (result->scheduler_state == CPU_SCHED_PREEMPTED) {
diff --git a/clientgui/ViewWorkGrid.cpp b/clientgui/ViewWorkGrid.cpp
index 59bf8dd667..7ca813e776 100644
--- a/clientgui/ViewWorkGrid.cpp
+++ b/clientgui/ViewWorkGrid.cpp
@@ -712,6 +712,8 @@ wxInt32 CViewWorkGrid::FormatStatus(wxInt32 item, wxString& strBuffer) const {
} else if (result->active_task) {
if (result->too_large) {
strBuffer = _("Waiting for memory");
+ } else if (result->needs_shmem) {
+ strBuffer = _("Waiting for shared memory");
} else if (result->scheduler_state == CPU_SCHED_SCHEDULED) {
strBuffer = _("Running");
} else if (result->scheduler_state == CPU_SCHED_PREEMPTED) {
diff --git a/lib/gui_rpc_client.h b/lib/gui_rpc_client.h
index 1ee6f5fb29..46f9ff5ffd 100644
--- a/lib/gui_rpc_client.h
+++ b/lib/gui_rpc_client.h
@@ -212,6 +212,7 @@ public:
bool supports_graphics;
int graphics_mode_acked;
bool too_large;
+ bool needs_shmem;
bool edf_scheduled;
std::string graphics_exec_path;
std::string slot_path;
diff --git a/lib/gui_rpc_client_ops.C b/lib/gui_rpc_client_ops.C
index 7459468877..736ab556b2 100644
--- a/lib/gui_rpc_client_ops.C
+++ b/lib/gui_rpc_client_ops.C
@@ -391,6 +391,7 @@ int RESULT::parse(MIOFILE& in) {
if (parse_double(buf, "", fraction_done)) continue;
if (parse_double(buf, "", estimated_cpu_time_remaining)) continue;
if (parse_bool(buf, "too_large", too_large)) continue;
+ if (parse_bool(buf, "needs_shmem", needs_shmem)) continue;
if (parse_bool(buf, "edf_scheduled", edf_scheduled)) continue;
if (parse_str(buf, "graphics_exec_path", graphics_exec_path)) continue;
if (parse_str(buf, "slot_path", slot_path)) continue;
@@ -428,6 +429,7 @@ void RESULT::clear() {
supports_graphics = false;
graphics_mode_acked = 0;
too_large = false;
+ needs_shmem = false;
edf_scheduled = false;
app = NULL;
diff --git a/lib/shmem.C b/lib/shmem.C
index 2bdb97bf92..0851a20cdd 100755
--- a/lib/shmem.C
+++ b/lib/shmem.C
@@ -260,6 +260,28 @@ int print_shmem_info(key_t key) {
return 0;
}
+// For debugging shared memory logic
+// For testing on Apple, Linux, UNIX systems with limited number
+// of shared memory segments per process and / or system-wide
+// Mac OS X has a default limit of 8 segments per process, 32 system-wide
+//
+// Deliberately ties up several segments in this process so that the
+// out-of-shared-memory code paths can be exercised.
+// Failures are ignored: this is a best-effort debugging aid.
+void stress_shmem() {
+    // Tie up 5 of the 8 shared memory segments each process may have
+    const int NUM_STRESS_SEGS = 5;
+
+    // Keys are the ASCII codes of "BNC1".."BNC5".  They are spelled in
+    // hex because the value of a multi-character constant such as
+    // 'BNC1' is implementation-defined (these are the values GCC
+    // assigns) and such constants draw a compiler warning.
+    const key_t key[NUM_STRESS_SEGS] = {
+        0x424E4331, 0x424E4332, 0x424E4333, 0x424E4334, 0x424E4335
+    };
+    void* shmaddr[NUM_STRESS_SEGS];
+
+    for (int i=0; i<NUM_STRESS_SEGS; i++) {
+        // If this segment can't be created, skip it and try the next
+        if (create_shmem(key[i], 1024, 0, &shmaddr[i])) continue;
+
+        int id = shmget(key[i], 0, 0);
+        // Mark it for automatic destruction when BOINC exits
+        if (id >= 0) {
+            shmctl(id, IPC_RMID, 0);
+        }
+    }
+}
+
+
#endif
const char *BOINC_RCSID_f835f078de = "$Id$";
diff --git a/lib/shmem.h b/lib/shmem.h
index 45fa1a9076..886b4ad2cc 100755
--- a/lib/shmem.h
+++ b/lib/shmem.h
@@ -64,5 +64,11 @@ extern int detach_shmem(void*);
extern int shmem_info(key_t key);
+#ifndef __EMX__
+// For testing on Apple, Linux, UNIX systems with limited number
+// of shared memory segments per process and / or system-wide
+void stress_shmem(void);
+#endif
+
#endif
#endif