diff --git a/checkin_notes b/checkin_notes index 56c5d620e8..9e161deaab 100644 --- a/checkin_notes +++ b/checkin_notes @@ -2857,3 +2857,16 @@ Rom 6 Mar 2009 clientgui/ AdvancedFrame.cpp + +David 6 Mar 2009 + - client: add a new mechanism for assigning coproc instances to tasks, + and passing them the corresponding --device N cmdline args. + This fixes a bug introduced in 17402 (Feb 26) + that broke the --device feature, + presumably causing problems on systems with multiple GPUs. + + client/ + app_start.cpp + client_types.h + lib/ + coproc.h diff --git a/client/app_start.cpp b/client/app_start.cpp index 0b153b8371..edecd326b4 100644 --- a/client/app_start.cpp +++ b/client/app_start.cpp @@ -106,20 +106,66 @@ static void debug_print_argv(char** argv) { } #endif -// for apps that use CUDA coprocessors, append "--device x" to the command line +// for apps that use coprocessors, reserve the instances, +// and append "--device x" to the command line // -static void cuda_cmdline(ACTIVE_TASK* atp, char* cmdline) { -#if 0 - // TODO: do this another way - char buf[256]; - if (!coproc_cuda) return; - for (int i=0; iowner[i] == atp) { - sprintf(buf, " --device %d", i); - strcat(cmdline, buf); +static void coproc_cmdline( + COPROC* coproc, ACTIVE_TASK* atp, int ninstances, char* cmdline +) { + unsigned int i; + int j, k; + vector tasks_using_coproc; + + // make a list of the executing tasks (other than this) using this coproc + // + for (i=0; itask_state() != PROCESS_EXECUTING) continue; + if (p->app_version->coprocs.lookup(coproc->type)) { + tasks_using_coproc.push_back(p); + } + } + + // scan the coproc's owner array, + // clearing any entries not in the above list + // + for (j=0; jcount; j++) { + if (coproc->owner[j]) { + bool found = false; + for (k=0; kowner[j] == tasks_using_coproc[k]) { + found = true; + break; + } + } + if (!found) { + coproc->owner[j] = NULL; + } + } + } + + // reserve instances for this job + // + char buf[256]; + k = 0; + for (j=0; jcount) { + msg_printf(atp->result->project, MSG_INTERNAL_ERROR, + "Can't find free %s", coproc->type + ); + return; + } + if (coproc->owner[k] == NULL) { + sprintf(buf, " --device %d", k); + strcat(cmdline, buf); + coproc->owner[k++] = atp; + break; + } + k++; } } -#endif } // Make a unique key for core/app shared memory segment. @@ -527,7 +573,9 @@ int ACTIVE_TASK::start(bool first_time) { sprintf(cmdline, "%s %s %s", exec_path, wup->command_line.c_str(), app_version->cmdline ); - cuda_cmdline(this, cmdline); + if (coproc_cuda && app_version->ncudas) { + coproc_cmdline(coproc_cuda, this, app_version->ncudas, cmdline); + } relative_to_absolute(slot_dir, slotdirpath); bool success = false; @@ -819,7 +867,9 @@ int ACTIVE_TASK::start(bool first_time) { } #endif sprintf(cmdline, "%s %s", wup->command_line.c_str(), app_version->cmdline); - cuda_cmdline(this, cmdline); + if (coproc_cuda && app_version->ncudas) { + coproc_cmdline(coproc_cuda, this, app_version->ncudas, cmdline); + } sprintf(buf, "../../%s", exec_path ); if (g_use_sandbox) { char switcher_path[100]; diff --git a/client/client_types.h b/client/client_types.h index a91299d606..6e6d5bbfff 100644 --- a/client/client_types.h +++ b/client/client_types.h @@ -412,7 +412,7 @@ struct APP_VERSION { char api_version[16]; double avg_ncpus; double max_ncpus; - double ncudas; + int ncudas; double flops; /// additional cmdline args char cmdline[256]; diff --git a/lib/coproc.h b/lib/coproc.h index 3a41601e3b..8aac50a26b 100644 --- a/lib/coproc.h +++ b/lib/coproc.h @@ -28,6 +28,8 @@ #include "miofile.h" +#define MAX_COPROC_INSTANCES 64 + struct COPROC { char type[256]; // must be unique int count; // how many are present @@ -39,6 +41,11 @@ struct COPROC { int req_instances; // requesting enough jobs to use this many instances double estimated_delay; // resource will be saturated for this long + // Used in client to keep track of which tasks are using which instances + // The pointers point to ACTIVE_TASK + // + void* owner[MAX_COPROC_INSTANCES]; + #ifndef _USING_FCGI_ virtual void write_xml(MIOFILE&); #endif