From 81514455056c92ab5526f1283f0beafd331f11af Mon Sep 17 00:00:00 2001
From: David Anderson
Date: Fri, 13 Dec 2024 15:19:41 -0800
Subject: [PATCH] scheduler: for BUDA GPU jobs, put resource usage info in the element.

---
// Review notes. Text between the "---" line and the first "diff --git" line
// is commentary; it is not part of the commit message or of the changes.
//
// What the hunks do:
//  * sched_locality.cpp, possibly_send_result(): the "!is_ok" branch now
//    returns ERR_NO_APP_VERSION; the removed line only named the constant in
//    an expression statement and did not return it.
//  * sched_send.cpp: adds static add_usage_to_wu(). It formats hu.avg_ncpus
//    and hu.projected_flops, then (when hu.proc_type != PROC_TYPE_CPU)
//    proc_type_name_xml(hu.proc_type) and hu.gpu_usage, then (when non-empty)
//    hu.cmdline, and writes that text into wu.xml_doc at the position that
//    strstr() finds. It logs MSG_CRITICAL and returns -1 when
//    strlen(xml_doc) + strlen(text) + 10 exceeds sizeof(wu.xml_doc) or when
//    strstr() finds no match; otherwise it returns 0.
//    add_wu_to_reply() gains "bool is_buda, HOST_USAGE &hu" and, when is_buda
//    is set, calls add_usage_to_wu(wu2, hu) and returns any nonzero retval.
//    add_result_to_reply() passes is_buda and host_usage to it.
//  * sched_types.h: comment-only edits.
//  * sched_version.cpp: removes one empty line after the last "}" shown.
//
// NOTE(review) -- to confirm before applying:
//  * In this copy several string literals look like they lost angle-bracket
//    text: the format strings contain only " %f\n" / " %s\n" / " \n", and
//    strstr(p, "") / strcat(p, "") use empty strings. The Subject line
//    ("in the element") and "From:" (no address) show the same pattern.
//    Presumably an extraction artifact; compare with commit 81514455056c.
//  * add_usage_to_wu(): both strcat(buf, buf2) calls append from a 2048-byte
//    buffer into a 2048-byte buffer without a length check. Whether this can
//    overflow depends on the size of hu.cmdline, which is not visible here.
//  * The "field too small" message prints strlen()/sizeof values (size_t)
//    with %ld; %zu is the matching printf conversion.
//  * Line breaks and indentation in the hunks below were reconstructed from a
//    whitespace-collapsed copy, so context lines may not match the target
//    files byte-for-byte.
//
 sched/sched_locality.cpp |  2 +-
 sched/sched_send.cpp     | 57 ++++++++++++++++++++++++++++++++++++++--
 sched/sched_types.h      |  6 ++---
 sched/sched_version.cpp  |  1 -
 4 files changed, 59 insertions(+), 7 deletions(-)

diff --git a/sched/sched_locality.cpp b/sched/sched_locality.cpp
index 5869a8d8a2..b8b93bd32d 100644
--- a/sched/sched_locality.cpp
+++ b/sched/sched_locality.cpp
@@ -336,7 +336,7 @@ static int possibly_send_result(SCHED_DB_RESULT& result) {
     HOST_USAGE hu;
     check_buda_plan_class(wu, hu, is_buda, is_ok);
     if (is_buda) {
-        if (!is_ok) ERR_NO_APP_VERSION;
+        if (!is_ok) return ERR_NO_APP_VERSION;
     } else {
         hu = bavp->host_usage;
     }
diff --git a/sched/sched_send.cpp b/sched/sched_send.cpp
index 5fd5f1b066..0ce0f9f88e 100644
--- a/sched/sched_send.cpp
+++ b/sched/sched_send.cpp
@@ -570,10 +570,58 @@ static int insert_wu_tags(WORKUNIT& wu, APP& app) {
     return insert_after(wu.xml_doc, "\n", buf);
 }
 
+// add host usage into to WU's xml_doc (for BUDA jobs)
+//
+static int add_usage_to_wu(WORKUNIT &wu, HOST_USAGE &hu) {
+    char buf[2048], buf2[2048];
+    snprintf(buf, sizeof(buf),
+        " %f\n"
+        " %f\n",
+        hu.avg_ncpus,
+        hu.projected_flops
+    );
+    if (hu.proc_type != PROC_TYPE_CPU) {
+        snprintf(buf2, sizeof(buf2),
+            " \n"
+            " %s\n"
+            " %f\n"
+            " \n",
+            proc_type_name_xml(hu.proc_type),
+            hu.gpu_usage
+        );
+        strcat(buf, buf2);
+    }
+    if (strlen(hu.cmdline)) {
+        snprintf(buf2, sizeof(buf2),
+            " %s\n",
+            hu.cmdline
+        );
+        strcat(buf, buf2);
+    }
+
+    char *p = wu.xml_doc;
+    if (strlen(p) + strlen(buf) + 10 > sizeof(wu.xml_doc)) {
+        log_messages.printf(MSG_CRITICAL,
+            "add_usage_to_wu(): field too small: %ld %ld %ld\n",
+            strlen(p), strlen(buf), sizeof(wu.xml_doc)
+        );
+        return -1;
+    }
+    p = strstr(p, "");
+    if (!p) {
+        log_messages.printf(MSG_CRITICAL, "add_usage_to_wu(): no end tag\n");
+        return -1;
+    }
+    strcpy(p, buf);
+    strcat(p, "");
+    return 0;
+}
+
 // Add the given workunit, app, and app version to a reply.
 //
 static int add_wu_to_reply(
-    WORKUNIT& wu, SCHEDULER_REPLY&, APP* app, BEST_APP_VERSION* bavp
+    WORKUNIT& wu, SCHEDULER_REPLY&, APP* app, BEST_APP_VERSION* bavp,
+    bool is_buda, HOST_USAGE &hu
 ) {
     int retval;
     WORKUNIT wu2, wu3;
@@ -626,6 +674,11 @@ static int add_wu_to_reply(
         );
         return retval;
     }
+
+    if (is_buda) {
+        retval = add_usage_to_wu(wu2, hu);
+        if (retval) return retval;
+    }
     wu3 = wu2;
     if (strlen(config.replace_download_url_by_timezone)) {
         process_wu_timezone(wu2, wu3);
@@ -1015,7 +1068,7 @@ int add_result_to_reply(
 
     // done with DB updates.
     //
-    retval = add_wu_to_reply(wu, *g_reply, app, bavp);
+    retval = add_wu_to_reply(wu, *g_reply, app, bavp, is_buda, host_usage);
     if (retval) return retval;
 
     // Adjust available disk space.
diff --git a/sched/sched_types.h b/sched/sched_types.h
index 919aac69bc..03522db506 100644
--- a/sched/sched_types.h
+++ b/sched/sched_types.h
@@ -63,13 +63,13 @@ struct USER_MESSAGE {
 
 // The resource usage (CPU, GPU, RAM) of a job,
 // and estimates of its speed
-// Populated by plan-class functions if have plan class,
+// Populated by plan-class functions if there's a plan class,
 // else by HOST_USAGE::sequential_app()
 //
 struct HOST_USAGE {
     int proc_type;
     double gpu_usage;
-    double gpu_ram;
+    double gpu_ram; // not currently used by client
     double avg_ncpus;
     double mem_usage;
         // mem usage if specified by the plan class
@@ -439,7 +439,7 @@ struct WORK_REQ_BASE {
         req_instances[proc_type] = 0;
     }
 
-    // older clients send send a single number, the requested duration of jobs
+    // older clients send a single number, the requested duration of jobs
     //
     double seconds_to_fill;
 
diff --git a/sched/sched_version.cpp b/sched/sched_version.cpp
index 59e0043d8f..5396f185d4 100644
--- a/sched/sched_version.cpp
+++ b/sched/sched_version.cpp
@@ -982,4 +982,3 @@ BEST_APP_VERSION* get_app_version(
     }
     return bavp;
 }
-