scheduler: for BUDA GPU jobs,

put resource usage info in the <workunit> element.
This commit is contained in:
David Anderson 2024-12-13 15:19:41 -08:00
parent b10221b29e
commit 8151445505
4 changed files with 59 additions and 7 deletions

View File

@ -336,7 +336,7 @@ static int possibly_send_result(SCHED_DB_RESULT& result) {
HOST_USAGE hu;
check_buda_plan_class(wu, hu, is_buda, is_ok);
if (is_buda) {
if (!is_ok) ERR_NO_APP_VERSION;
if (!is_ok) return ERR_NO_APP_VERSION;
} else {
hu = bavp->host_usage;
}

View File

@ -570,10 +570,58 @@ static int insert_wu_tags(WORKUNIT& wu, APP& app) {
return insert_after(wu.xml_doc, "<workunit>\n", buf);
}
// add host usage into to WU's xml_doc (for BUDA jobs)
//
static int add_usage_to_wu(WORKUNIT &wu, HOST_USAGE &hu) {
char buf[2048], buf2[2048];
snprintf(buf, sizeof(buf),
" <avg_ncpus>%f</avg_ncpus>\n"
" <flops>%f</flops>\n",
hu.avg_ncpus,
hu.projected_flops
);
if (hu.proc_type != PROC_TYPE_CPU) {
snprintf(buf2, sizeof(buf2),
" <coproc>\n"
" <type>%s</type>\n"
" <count>%f</count>\n"
" </coproc>\n",
proc_type_name_xml(hu.proc_type),
hu.gpu_usage
);
strcat(buf, buf2);
}
if (strlen(hu.cmdline)) {
snprintf(buf2, sizeof(buf2),
" <cmdline>%s</cmdline>\n",
hu.cmdline
);
strcat(buf, buf2);
}
char *p = wu.xml_doc;
if (strlen(p) + strlen(buf) + 10 > sizeof(wu.xml_doc)) {
log_messages.printf(MSG_CRITICAL,
"add_usage_to_wu(): field too small: %ld %ld %ld\n",
strlen(p), strlen(buf), sizeof(wu.xml_doc)
);
return -1;
}
p = strstr(p, "</workunit>");
if (!p) {
log_messages.printf(MSG_CRITICAL, "add_usage_to_wu(): no end tag\n");
return -1;
}
strcpy(p, buf);
strcat(p, "</workunit>");
return 0;
}
// Add the given workunit, app, and app version to a reply.
//
static int add_wu_to_reply(
WORKUNIT& wu, SCHEDULER_REPLY&, APP* app, BEST_APP_VERSION* bavp
WORKUNIT& wu, SCHEDULER_REPLY&, APP* app, BEST_APP_VERSION* bavp,
bool is_buda, HOST_USAGE &hu
) {
int retval;
WORKUNIT wu2, wu3;
@ -626,6 +674,11 @@ static int add_wu_to_reply(
);
return retval;
}
if (is_buda) {
retval = add_usage_to_wu(wu2, hu);
if (retval) return retval;
}
wu3 = wu2;
if (strlen(config.replace_download_url_by_timezone)) {
process_wu_timezone(wu2, wu3);
@ -1015,7 +1068,7 @@ int add_result_to_reply(
// done with DB updates.
//
retval = add_wu_to_reply(wu, *g_reply, app, bavp);
retval = add_wu_to_reply(wu, *g_reply, app, bavp, is_buda, host_usage);
if (retval) return retval;
// Adjust available disk space.

View File

@ -63,13 +63,13 @@ struct USER_MESSAGE {
// The resource usage (CPU, GPU, RAM) of a job,
// and estimates of its speed
// Populated by plan-class functions if have plan class,
// Populated by plan-class functions if there's a plan class,
// else by HOST_USAGE::sequential_app()
//
struct HOST_USAGE {
int proc_type;
double gpu_usage;
double gpu_ram;
double gpu_ram; // not currently used by client
double avg_ncpus;
double mem_usage;
// mem usage if specified by the plan class
@ -439,7 +439,7 @@ struct WORK_REQ_BASE {
req_instances[proc_type] = 0;
}
// older clients send send a single number, the requested duration of jobs
// older clients send a single number, the requested duration of jobs
//
double seconds_to_fill;

View File

@ -982,4 +982,3 @@ BEST_APP_VERSION* get_app_version(
}
return bavp;
}