CC: Add logic to scheduler, etc. for systems which limit the number of shared memory segments per process and / or system-wide.

svn path=/trunk/boinc/; revision=13015
This commit is contained in:
Charlie Fenton 2007-06-25 11:46:15 +00:00
parent 8a0d5fa6c6
commit 7bb1b1a0d4
16 changed files with 133 additions and 12 deletions

View File

@ -6636,3 +6636,54 @@ Charlie 24 June 2007
mac_build/
boinc.xcodeproj/
project.pbxproj
Charlie 25 June 2007
CC: Add logic to scheduler, etc. for systems which limit the number
of shared memory segments per process and / or system-wide.
Mac OS X has default limits of 8 shmem segments per process, 32
max system wide.
- Don't treat failure to allocate & attach to shared memory as a
computation error.
- Destroy shared memory segments associated with all tasks which
exit, not just those with a finished file. This frees up a
segment for use by another task.
- If we run out of shared memory, start a 10 second timer during
which we don't try to run tasks unless they are already attached
to shared memory (suspended tasks left in memory qualify).
- If a task is waiting for shared memory to become available,
display this in the Manager's tasks pane.
- Added a new log flag stress_shmem_debug for testing scheduling
logic when we run out of shared memory.
QUESTIONS for further development:
- Note that it is quite easy to run out of shared memory segments if
the leave-suspended-tasks-in-memory option is set. (The shared
memory channel must remain active for heartbeats and to tell the
task when to resume.)
- Should we override this option if we run out of shared memory
segments?
- What about tasks left in RAM because they don't checkpoint?
- If BOINC crashes, its shared memory is not removed. In some
cases, when BOINC is relaunched it will use the same key and
so can re-attach to the orphaned segments, but this won't always
happen. This can cause problems with the system-wide limits.
We can't mark the segment for removal when we create it, because
doing so (by calling shmctl with IPC_RMID) would remove its key
so the project application couldn't attach to it.
But we could do this safely after the first successful response
from the project application. Should we make this change?
client/
app.C,h
app_control.C
app_start.C
client_state.C,h
cpu_sched.C
log_flags.C,h
clientgui/
ViewWork.cpp
ViewWorkGrid.cpp
lib/
shmem.C,h
gui_rpc_client.h
gui_rpc_client_ops.C

View File

@ -111,6 +111,7 @@ ACTIVE_TASK::ACTIVE_TASK() {
have_trickle_down = false;
send_upload_file_status = false;
too_large = false;
needs_shmem = false;
want_network = 0;
memset(&procinfo, 0, sizeof(procinfo));
#ifdef _WIN32
@ -167,10 +168,10 @@ void ACTIVE_TASK::close_process_handles() {
// call this when a process has exited and we're not going to restart it
//
void ACTIVE_TASK::cleanup_task() {
#ifdef _WIN32
if (gstate.exit_after_finish) {
exit(0);
}
#ifdef _WIN32
// detach from shared mem.
// This will destroy shmem seg since we're the last attachment
//
@ -196,6 +197,10 @@ void ACTIVE_TASK::cleanup_task() {
}
app_client_shm.shm = NULL;
}
if (gstate.exit_after_finish) {
exit(0);
}
#endif
}
@ -459,6 +464,7 @@ int ACTIVE_TASK::write_gui(MIOFILE& fout) {
" <working_set_size>%f</working_set_size>\n"
" <working_set_size_smoothed>%f</working_set_size_smoothed>\n"
" <page_fault_rate>%f</page_fault_rate>\n"
"%s"
"%s",
task_state(),
app_version->version_num,
@ -470,7 +476,8 @@ int ACTIVE_TASK::write_gui(MIOFILE& fout) {
procinfo.working_set_size,
procinfo.working_set_size_smoothed,
procinfo.page_fault_rate,
too_large?" <too_large/>\n":""
too_large?" <too_large/>\n":"",
needs_shmem?" <needs_shmem/>\n":""
);
if (strlen(app_version->graphics_exec_path)) {
fout.printf(

View File

@ -102,6 +102,7 @@ public:
bool have_trickle_down;
bool send_upload_file_status;
bool too_large; // working set too large to run now
bool needs_shmem; // waiting for a free shared memory segment
int want_network;
// This task wants to do network comm (for F@h)
// this is passed via shared-memory message (app_status channel)

View File

@ -315,8 +315,8 @@ void ACTIVE_TASK::handle_exited_app(int stat) {
#endif
}
cleanup_task(); // Always release shared memory
if (!will_restart) {
cleanup_task();
copy_output_files();
read_stderr_file();
clean_out_dir(slot_dir);

View File

@ -516,8 +516,7 @@ int ACTIVE_TASK::start(bool first_time) {
shmem_seg_name, sizeof(SHARED_MEM), (void**)&app_client_shm.shm
);
if (retval) {
sprintf(buf, "Can't create shared memory: %s", boincerror(retval));
goto error;
return retval;
}
}
app_client_shm.reset_msgs();
@ -580,9 +579,14 @@ int ACTIVE_TASK::start(bool first_time) {
(void**)&app_client_shm.shm
);
if (retval) {
sprintf(buf, "Can't create shared memory: %s", boincerror(retval));
goto error;
needs_shmem = true;
// Assume no additional shared memory is available for next 10 seconds
// (run only tasks which are already attached to shared memory).
gstate.retry_shmem_time = gstate.now + 10.0;
destroy_shmem(shmem_seg_name); // Don't leave an orphan shmem segment
return retval;
}
needs_shmem = false;
}
app_client_shm.reset_msgs();
@ -710,6 +714,9 @@ int ACTIVE_TASK::resume_or_start(bool first_time) {
retval = start(false);
str = "Restarting";
}
if ((retval == ERR_SHMGET) || (retval == ERR_SHMAT)) {
return retval;
}
if (retval) {
set_task_state(PROCESS_COULDNT_START, "resume_or_start1");
return retval;

View File

@ -46,6 +46,7 @@
#include "network.h"
#include "http_curl.h"
#include "client_msgs.h"
#include "shmem.h"
#include "client_state.h"
using std::max;
@ -104,6 +105,7 @@ CLIENT_STATE::CLIENT_STATE() {
debt_interval_start = 0;
total_wall_cpu_time_this_debt_interval = 0;
retry_shmem_time = 0;
must_schedule_cpus = true;
must_enforce_cpu_schedule = true;
no_gui_rpc = false;
@ -359,7 +361,11 @@ int CLIENT_STATE::init() {
auto_update.init();
http_ops->cleanup_temp_files();
if (log_flags.stress_shmem_debug) {
stress_shmem();
}
initialized = true;
return 0;
}

View File

@ -268,6 +268,7 @@ private:
void make_running_task_heap(vector<ACTIVE_TASK*>&);
void print_deadline_misses();
public:
double retry_shmem_time;
// Returns the work_buf_min_days preference scaled by 86400
// (i.e. the value in days expressed in seconds).
inline double work_buf_min() {
    const int seconds_per_day = 86400;
    return global_prefs.work_buf_min_days * seconds_per_day;
}

View File

@ -506,6 +506,14 @@ void CLIENT_STATE::schedule_cpus() {
continue;
} else {
atp->too_large = false;
if (gstate.retry_shmem_time < gstate.now) {
if (atp->app_client_shm.shm == NULL) {
atp->needs_shmem = true;
continue;
}
atp->needs_shmem = false;
}
}
ram_left -= atp->procinfo.working_set_size_smoothed;
}
@ -902,10 +910,12 @@ bool CLIENT_STATE::enforce_schedule() {
atp->scheduler_state == CPU_SCHED_UNINITIALIZED
);
if (retval) {
report_result_error(
*(atp->result), "Couldn't start or resume: %d", retval
);
request_schedule_cpus("start failed");
if ((retval != ERR_SHMGET) && (retval != ERR_SHMAT)) {
report_result_error(
*(atp->result), "Couldn't start or resume: %d", retval
);
request_schedule_cpus("start failed");
}
continue;
}
atp->run_interval_start_wall_time = now;

View File

@ -79,6 +79,7 @@ void LOG_FLAGS::defaults() {
mem_usage_debug = false;
network_status_debug = false;
checkpoint_debug = false;
stress_shmem_debug = false;
}
// Parse log flag preferences
@ -121,6 +122,7 @@ int LOG_FLAGS::parse(XML_PARSER& xp) {
if (xp.parse_bool(tag, "mem_usage_debug", mem_usage_debug)) continue;
if (xp.parse_bool(tag, "network_status_debug", network_status_debug)) continue;
if (xp.parse_bool(tag, "checkpoint_debug", checkpoint_debug)) continue;
if (xp.parse_bool(tag, "stress_shmem_debug", stress_shmem_debug)) continue;
msg_printf(NULL, MSG_USER_ERROR, "Unrecognized tag in %s: <%s>\n",
CONFIG_FILE, tag
);

View File

@ -73,6 +73,7 @@ struct LOG_FLAGS {
bool mem_usage_debug; // memory usage
bool network_status_debug;
bool checkpoint_debug;
bool stress_shmem_debug;
LOG_FLAGS();
void defaults();

View File

@ -793,6 +793,8 @@ wxInt32 CViewWork::FormatStatus(wxInt32 item, wxString& strBuffer) const {
} else if (result->active_task) {
if (result->too_large) {
strBuffer = _("Waiting for memory");
} else if (result->needs_shmem) {
strBuffer = _("Waiting for shared memory");
} else if (result->scheduler_state == CPU_SCHED_SCHEDULED) {
strBuffer = _("Running");
} else if (result->scheduler_state == CPU_SCHED_PREEMPTED) {

View File

@ -712,6 +712,8 @@ wxInt32 CViewWorkGrid::FormatStatus(wxInt32 item, wxString& strBuffer) const {
} else if (result->active_task) {
if (result->too_large) {
strBuffer = _("Waiting for memory");
} else if (result->needs_shmem) {
strBuffer = _("Waiting for shared memory");
} else if (result->scheduler_state == CPU_SCHED_SCHEDULED) {
strBuffer = _("Running");
} else if (result->scheduler_state == CPU_SCHED_PREEMPTED) {

View File

@ -212,6 +212,7 @@ public:
bool supports_graphics;
int graphics_mode_acked;
bool too_large;
bool needs_shmem;
bool edf_scheduled;
std::string graphics_exec_path;
std::string slot_path;

View File

@ -391,6 +391,7 @@ int RESULT::parse(MIOFILE& in) {
if (parse_double(buf, "<fraction_done>", fraction_done)) continue;
if (parse_double(buf, "<estimated_cpu_time_remaining>", estimated_cpu_time_remaining)) continue;
if (parse_bool(buf, "too_large", too_large)) continue;
if (parse_bool(buf, "needs_shmem", needs_shmem)) continue;
if (parse_bool(buf, "edf_scheduled", edf_scheduled)) continue;
if (parse_str(buf, "graphics_exec_path", graphics_exec_path)) continue;
if (parse_str(buf, "slot_path", slot_path)) continue;
@ -428,6 +429,7 @@ void RESULT::clear() {
supports_graphics = false;
graphics_mode_acked = 0;
too_large = false;
needs_shmem = false;
edf_scheduled = false;
app = NULL;

View File

@ -260,6 +260,28 @@ int print_shmem_info(key_t key) {
return 0;
}
// Debugging aid for the shared memory logic.
// Intended for testing on Apple, Linux and other UNIX systems that cap
// the number of shared memory segments per process and / or system-wide
// (Mac OS X defaults: 8 segments per process, 32 system-wide).
//
// Creates 5 small (1024-byte) segments under fixed keys so that fewer
// segments remain available to this process, then flags each one with
// IPC_RMID so it is destroyed automatically when BOINC exits.
void stress_shmem() {
    key_t seg_keys[] = {'BNC1', 'BNC2', 'BNC3', 'BNC4', 'BNC5', 'BNC6', 'BNC7', 'BNC8' };
    void* attached[8];

    // Tie up 5 of the 8 shared memory segments each process may have
    for (int n = 0; n < 5; n++) {
        int create_status = create_shmem(seg_keys[n], 1024, 0, &attached[n]);
        if (create_status) {
            // Could not create this segment; move on to the next key
            continue;
        }

        int seg_id = shmget(seg_keys[n], 0, 0);
        if (seg_id < 0) {
            continue;
        }

        // Mark it for automatic destruction when BOINC exits
        // (the result of shmctl is deliberately not checked)
        shmctl(seg_id, IPC_RMID, 0);
    }
}
#endif
const char *BOINC_RCSID_f835f078de = "$Id$";

View File

@ -64,5 +64,11 @@ extern int detach_shmem(void*);
extern int shmem_info(key_t key);
#ifndef __EMX__
// For testing on Apple, Linux, UNIX systems with limited number
// of shared memory segments per process and / or system-wide
void stress_shmem(void);
#endif
#endif
#endif