mirror of https://github.com/BOINC/boinc.git
CC: Add logic to scheduler, etc. for systems which limit the number of shared memory segments per process and / or system-wide.
svn path=/trunk/boinc/; revision=13015
This commit is contained in:
parent
8a0d5fa6c6
commit
7bb1b1a0d4
|
@ -6636,3 +6636,54 @@ Charlie 24 June 2007
|
|||
mac_build/
|
||||
boinc.xcodeproj/
|
||||
project.pbxproj
|
||||
|
||||
Charlie 25 June 2007
|
||||
CC: Add logic to scheduler, etc. for systems which limit the number
|
||||
of shared memory segments per process and / or system-wide.
|
||||
Mac OS X has default limits of 8 shmem segments per process, 32
|
||||
max system-wide.
|
||||
- Don't treat failure to allocate & attach to shared memory as a
|
||||
computation error.
|
||||
- Destroy shared memory segments associated with all tasks which
|
||||
exit, not just those with a finished file. This frees up a
|
||||
segment for use by another task.
|
||||
- If we run out of shared memory, start a 10 second timer during
|
||||
which we don't try to run tasks unless they are already attached
|
||||
to shared memory (suspended tasks left in memory qualify).
|
||||
- If a task is waiting for shared memory to become available,
|
||||
display this in the Manager's tasks pane.
|
||||
- Added a new log flag stress_shmem_debug for testing scheduling
|
||||
logic when we run out of shared memory.
|
||||
|
||||
QUESTIONS for further development:
|
||||
- Note that it is quite easy to run out of shared memory segments if
|
||||
the leave-suspended-tasks-in-memory option is set. (The shared
|
||||
memory channel must remain active for heartbeats and to tell the
|
||||
task when to resume.)
|
||||
- Should we override this option if we run out of shared memory
|
||||
segments?
|
||||
- What about tasks left in RAM because they don't checkpoint?
|
||||
- If BOINC crashes, its shared memory is not removed. In some
|
||||
cases, when BOINC is relaunched it will use the same key and
|
||||
so can re-attach to the orphaned segments, but this won't always
|
||||
happen. This can cause problems with the system-wide limits.
|
||||
We can't mark the segment for removal when we create it, because
|
||||
doing so (by calling shmctl with IPC_RMID) would remove its key
|
||||
so the project application couldn't attach to it.
|
||||
But we could do this safely after the first successful response
|
||||
from the project application. Should we make this change?
|
||||
|
||||
client/
|
||||
app.C,h
|
||||
app_control.C
|
||||
app_start.C
|
||||
client_state.C,h
|
||||
cpu_sched.C
|
||||
log_flags.C,h
|
||||
clientgui/
|
||||
ViewWork.cpp
|
||||
ViewWorkGrid.cpp
|
||||
lib/
|
||||
shmem.C,h
|
||||
gui_rpc_client.h
|
||||
gui_rpc_client_ops.C
|
||||
|
|
11
client/app.C
11
client/app.C
|
@ -111,6 +111,7 @@ ACTIVE_TASK::ACTIVE_TASK() {
|
|||
have_trickle_down = false;
|
||||
send_upload_file_status = false;
|
||||
too_large = false;
|
||||
needs_shmem = false;
|
||||
want_network = 0;
|
||||
memset(&procinfo, 0, sizeof(procinfo));
|
||||
#ifdef _WIN32
|
||||
|
@ -167,10 +168,10 @@ void ACTIVE_TASK::close_process_handles() {
|
|||
// call this when a process has exited and we're not going to restart it
|
||||
//
|
||||
void ACTIVE_TASK::cleanup_task() {
|
||||
#ifdef _WIN32
|
||||
if (gstate.exit_after_finish) {
|
||||
exit(0);
|
||||
}
|
||||
#ifdef _WIN32
|
||||
// detach from shared mem.
|
||||
// This will destroy shmem seg since we're the last attachment
|
||||
//
|
||||
|
@ -196,6 +197,10 @@ void ACTIVE_TASK::cleanup_task() {
|
|||
}
|
||||
app_client_shm.shm = NULL;
|
||||
}
|
||||
|
||||
if (gstate.exit_after_finish) {
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -459,6 +464,7 @@ int ACTIVE_TASK::write_gui(MIOFILE& fout) {
|
|||
" <working_set_size>%f</working_set_size>\n"
|
||||
" <working_set_size_smoothed>%f</working_set_size_smoothed>\n"
|
||||
" <page_fault_rate>%f</page_fault_rate>\n"
|
||||
"%s"
|
||||
"%s",
|
||||
task_state(),
|
||||
app_version->version_num,
|
||||
|
@ -470,7 +476,8 @@ int ACTIVE_TASK::write_gui(MIOFILE& fout) {
|
|||
procinfo.working_set_size,
|
||||
procinfo.working_set_size_smoothed,
|
||||
procinfo.page_fault_rate,
|
||||
too_large?" <too_large/>\n":""
|
||||
too_large?" <too_large/>\n":"",
|
||||
needs_shmem?" <needs_shmem/>\n":""
|
||||
);
|
||||
if (strlen(app_version->graphics_exec_path)) {
|
||||
fout.printf(
|
||||
|
|
|
@ -102,6 +102,7 @@ public:
|
|||
bool have_trickle_down;
|
||||
bool send_upload_file_status;
|
||||
bool too_large; // working set too large to run now
|
||||
bool needs_shmem; // waiting for a free shared memory segment
|
||||
int want_network;
|
||||
// This task wants to do network comm (for F@h)
|
||||
// this is passed via shared-memory message (app_status channel)
|
||||
|
|
|
@ -315,8 +315,8 @@ void ACTIVE_TASK::handle_exited_app(int stat) {
|
|||
#endif
|
||||
}
|
||||
|
||||
cleanup_task(); // Always release shared memory
|
||||
if (!will_restart) {
|
||||
cleanup_task();
|
||||
copy_output_files();
|
||||
read_stderr_file();
|
||||
clean_out_dir(slot_dir);
|
||||
|
|
|
@ -516,8 +516,7 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
shmem_seg_name, sizeof(SHARED_MEM), (void**)&app_client_shm.shm
|
||||
);
|
||||
if (retval) {
|
||||
sprintf(buf, "Can't create shared memory: %s", boincerror(retval));
|
||||
goto error;
|
||||
return retval;
|
||||
}
|
||||
}
|
||||
app_client_shm.reset_msgs();
|
||||
|
@ -580,9 +579,14 @@ int ACTIVE_TASK::start(bool first_time) {
|
|||
(void**)&app_client_shm.shm
|
||||
);
|
||||
if (retval) {
|
||||
sprintf(buf, "Can't create shared memory: %s", boincerror(retval));
|
||||
goto error;
|
||||
needs_shmem = true;
|
||||
// Assume no additional shared memory is available for next 10 seconds
|
||||
// (run only tasks which are already attached to shared memory).
|
||||
gstate.retry_shmem_time = gstate.now + 10.0;
|
||||
destroy_shmem(shmem_seg_name); // Don't leave an orphan shmem segment
|
||||
return retval;
|
||||
}
|
||||
needs_shmem = false;
|
||||
}
|
||||
app_client_shm.reset_msgs();
|
||||
|
||||
|
@ -710,6 +714,9 @@ int ACTIVE_TASK::resume_or_start(bool first_time) {
|
|||
retval = start(false);
|
||||
str = "Restarting";
|
||||
}
|
||||
if ((retval == ERR_SHMGET) || (retval == ERR_SHMAT)) {
|
||||
return retval;
|
||||
}
|
||||
if (retval) {
|
||||
set_task_state(PROCESS_COULDNT_START, "resume_or_start1");
|
||||
return retval;
|
||||
|
|
|
@ -46,6 +46,7 @@
|
|||
#include "network.h"
|
||||
#include "http_curl.h"
|
||||
#include "client_msgs.h"
|
||||
#include "shmem.h"
|
||||
#include "client_state.h"
|
||||
|
||||
using std::max;
|
||||
|
@ -104,6 +105,7 @@ CLIENT_STATE::CLIENT_STATE() {
|
|||
|
||||
debt_interval_start = 0;
|
||||
total_wall_cpu_time_this_debt_interval = 0;
|
||||
retry_shmem_time = 0;
|
||||
must_schedule_cpus = true;
|
||||
must_enforce_cpu_schedule = true;
|
||||
no_gui_rpc = false;
|
||||
|
@ -359,7 +361,11 @@ int CLIENT_STATE::init() {
|
|||
|
||||
auto_update.init();
|
||||
http_ops->cleanup_temp_files();
|
||||
|
||||
|
||||
if (log_flags.stress_shmem_debug) {
|
||||
stress_shmem();
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -268,6 +268,7 @@ private:
|
|||
void make_running_task_heap(vector<ACTIVE_TASK*>&);
|
||||
void print_deadline_misses();
|
||||
public:
|
||||
double retry_shmem_time;
|
||||
inline double work_buf_min() {
|
||||
return global_prefs.work_buf_min_days * 86400;
|
||||
}
|
||||
|
|
|
@ -506,6 +506,14 @@ void CLIENT_STATE::schedule_cpus() {
|
|||
continue;
|
||||
} else {
|
||||
atp->too_large = false;
|
||||
|
||||
if (gstate.retry_shmem_time < gstate.now) {
|
||||
if (atp->app_client_shm.shm == NULL) {
|
||||
atp->needs_shmem = true;
|
||||
continue;
|
||||
}
|
||||
atp->needs_shmem = false;
|
||||
}
|
||||
}
|
||||
ram_left -= atp->procinfo.working_set_size_smoothed;
|
||||
}
|
||||
|
@ -902,10 +910,12 @@ bool CLIENT_STATE::enforce_schedule() {
|
|||
atp->scheduler_state == CPU_SCHED_UNINITIALIZED
|
||||
);
|
||||
if (retval) {
|
||||
report_result_error(
|
||||
*(atp->result), "Couldn't start or resume: %d", retval
|
||||
);
|
||||
request_schedule_cpus("start failed");
|
||||
if ((retval != ERR_SHMGET) && (retval != ERR_SHMAT)) {
|
||||
report_result_error(
|
||||
*(atp->result), "Couldn't start or resume: %d", retval
|
||||
);
|
||||
request_schedule_cpus("start failed");
|
||||
}
|
||||
continue;
|
||||
}
|
||||
atp->run_interval_start_wall_time = now;
|
||||
|
|
|
@ -79,6 +79,7 @@ void LOG_FLAGS::defaults() {
|
|||
mem_usage_debug = false;
|
||||
network_status_debug = false;
|
||||
checkpoint_debug = false;
|
||||
stress_shmem_debug = false;
|
||||
}
|
||||
|
||||
// Parse log flag preferences
|
||||
|
@ -121,6 +122,7 @@ int LOG_FLAGS::parse(XML_PARSER& xp) {
|
|||
if (xp.parse_bool(tag, "mem_usage_debug", mem_usage_debug)) continue;
|
||||
if (xp.parse_bool(tag, "network_status_debug", network_status_debug)) continue;
|
||||
if (xp.parse_bool(tag, "checkpoint_debug", checkpoint_debug)) continue;
|
||||
if (xp.parse_bool(tag, "stress_shmem_debug", stress_shmem_debug)) continue;
|
||||
msg_printf(NULL, MSG_USER_ERROR, "Unrecognized tag in %s: <%s>\n",
|
||||
CONFIG_FILE, tag
|
||||
);
|
||||
|
|
|
@ -73,6 +73,7 @@ struct LOG_FLAGS {
|
|||
bool mem_usage_debug; // memory usage
|
||||
bool network_status_debug;
|
||||
bool checkpoint_debug;
|
||||
bool stress_shmem_debug;
|
||||
|
||||
LOG_FLAGS();
|
||||
void defaults();
|
||||
|
|
|
@ -793,6 +793,8 @@ wxInt32 CViewWork::FormatStatus(wxInt32 item, wxString& strBuffer) const {
|
|||
} else if (result->active_task) {
|
||||
if (result->too_large) {
|
||||
strBuffer = _("Waiting for memory");
|
||||
} else if (result->needs_shmem) {
|
||||
strBuffer = _("Waiting for shared memory");
|
||||
} else if (result->scheduler_state == CPU_SCHED_SCHEDULED) {
|
||||
strBuffer = _("Running");
|
||||
} else if (result->scheduler_state == CPU_SCHED_PREEMPTED) {
|
||||
|
|
|
@ -712,6 +712,8 @@ wxInt32 CViewWorkGrid::FormatStatus(wxInt32 item, wxString& strBuffer) const {
|
|||
} else if (result->active_task) {
|
||||
if (result->too_large) {
|
||||
strBuffer = _("Waiting for memory");
|
||||
} else if (result->needs_shmem) {
|
||||
strBuffer = _("Waiting for shared memory");
|
||||
} else if (result->scheduler_state == CPU_SCHED_SCHEDULED) {
|
||||
strBuffer = _("Running");
|
||||
} else if (result->scheduler_state == CPU_SCHED_PREEMPTED) {
|
||||
|
|
|
@ -212,6 +212,7 @@ public:
|
|||
bool supports_graphics;
|
||||
int graphics_mode_acked;
|
||||
bool too_large;
|
||||
bool needs_shmem;
|
||||
bool edf_scheduled;
|
||||
std::string graphics_exec_path;
|
||||
std::string slot_path;
|
||||
|
|
|
@ -391,6 +391,7 @@ int RESULT::parse(MIOFILE& in) {
|
|||
if (parse_double(buf, "<fraction_done>", fraction_done)) continue;
|
||||
if (parse_double(buf, "<estimated_cpu_time_remaining>", estimated_cpu_time_remaining)) continue;
|
||||
if (parse_bool(buf, "too_large", too_large)) continue;
|
||||
if (parse_bool(buf, "needs_shmem", needs_shmem)) continue;
|
||||
if (parse_bool(buf, "edf_scheduled", edf_scheduled)) continue;
|
||||
if (parse_str(buf, "graphics_exec_path", graphics_exec_path)) continue;
|
||||
if (parse_str(buf, "slot_path", slot_path)) continue;
|
||||
|
@ -428,6 +429,7 @@ void RESULT::clear() {
|
|||
supports_graphics = false;
|
||||
graphics_mode_acked = 0;
|
||||
too_large = false;
|
||||
needs_shmem = false;
|
||||
edf_scheduled = false;
|
||||
|
||||
app = NULL;
|
||||
|
|
22
lib/shmem.C
22
lib/shmem.C
|
@ -260,6 +260,28 @@ int print_shmem_info(key_t key) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
// For debugging shared memory logic
|
||||
// For testing on Apple, Linux, UNIX systems with limited number
|
||||
// of shared memory segments per process and / or system-wide
|
||||
// Mac OS X has a default limit of 8 segments per process, 32 system-wide
|
||||
void stress_shmem() {
|
||||
int retval;
|
||||
void * shmaddr[8];
|
||||
key_t key[] = {'BNC1', 'BNC2', 'BNC3', 'BNC4', 'BNC5', 'BNC6', 'BNC7', 'BNC8' };
|
||||
int i, id;
|
||||
|
||||
// Tie up 5 of the 8 shared memory segments each process may have
|
||||
for (i=0; i<5; i++) {
|
||||
retval = create_shmem(key[i], 1024, 0, &shmaddr[i]);
|
||||
if (!retval) {
|
||||
id = shmget(key[i], 0, 0);
|
||||
// Mark it for automatic destruction when BOINC exits
|
||||
if (id >= 0)
|
||||
retval = shmctl(id, IPC_RMID, 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
const char *BOINC_RCSID_f835f078de = "$Id$";
|
||||
|
|
|
@ -64,5 +64,11 @@ extern int detach_shmem(void*);
|
|||
|
||||
extern int shmem_info(key_t key);
|
||||
|
||||
#ifndef __EMX__
|
||||
// For testing on Apple, Linux, UNIX systems with limited number
|
||||
// of shared memory segments per process and / or system-wide
|
||||
void stress_shmem(void);
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
Loading…
Reference in New Issue