client/API/docker_wrapper: get CPU and mem usage of Docker apps

Mem usage (WSS):
The easiest way to get the WSS of a Docker container is to ask Docker
using the "docker stats" command.
So I have docker_wrapper do this periodically (10 sec... it's a bit slow).

But how to get this back to the client?
Currently there's no provision for an app to report its own WSS.
So I added one, by adding an optional field to the app status messages
sent from app to client in shared mem.
If this is present, the client uses it instead of procinfo.

CPU time: "docker stats" reports CPU fraction
(averaged over what period?)
We multiply that by the stats poll period and add it to a running total.
Not exactly the same as CPU time, but close enough.
This commit is contained in:
David Anderson 2024-10-28 19:34:30 -07:00
parent 96370db915
commit 8020fde86a
8 changed files with 124 additions and 38 deletions

View File

@ -1052,7 +1052,8 @@ int boinc_report_app_status_aux(
double _fraction_done,
int other_pid,
double _bytes_sent,
double _bytes_received
double _bytes_received,
double wss
) {
char msg_buf[MSG_CHANNEL_SIZE], buf[1024];
if (standalone) return 0;
@ -1081,6 +1082,10 @@ int boinc_report_app_status_aux(
sprintf(buf, "<sporadic_ac>%d</sporadic_ac>\n", ac_state);
strlcat(msg_buf, buf, sizeof(msg_buf));
}
if (wss) {
sprintf(buf, "<wss>%f</wss>\n", wss);
strlcat(msg_buf, buf, sizeof(msg_buf));
}
#ifdef MSGS_FROM_FILE
if (fout) {
fputs(msg_buf, fout);
@ -1100,7 +1105,7 @@ int boinc_report_app_status(
double _fraction_done
){
return boinc_report_app_status_aux(
cpu_time, checkpoint_cpu_time, _fraction_done, 0, 0, 0
cpu_time, checkpoint_cpu_time, _fraction_done, 0, 0, 0, 0
);
}

View File

@ -138,7 +138,8 @@ extern int boinc_upload_status(std::string& name);
extern char* boinc_msg_prefix(char*, int);
extern int boinc_report_app_status_aux(
double cpu_time, double checkpoint_cpu_time, double _fraction_done,
int other_pid, double bytes_sent, double bytes_received
int other_pid, double bytes_sent, double bytes_received,
double wss
);
extern int boinc_temporary_exit(
int delay, const char* reason=NULL, bool is_notice=false

View File

@ -110,6 +110,7 @@ ACTIVE_TASK::ACTIVE_TASK() {
peak_disk_usage = 0;
once_ran_edf = false;
wss_from_app = 0;
fraction_done = 0;
fraction_done_elapsed_time = 0;
first_fraction_done = 0;
@ -420,6 +421,8 @@ void ACTIVE_TASK_SET::get_memory_usage() {
// at least on Windows. Use the VM size instead.
//
pi.working_set_size_smoothed = atp->wup->rsc_memory_bound;
} else if (atp->wss_from_app > 0) {
pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + atp->wss_from_app);
} else {
pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + pi.working_set_size);
}

View File

@ -53,12 +53,14 @@ typedef int PROCESS_ID;
// Represents a job in progress.
// When an active task is created, it is assigned a "slot"
// When a job is started, it is assigned a "slot"
// which determines the directory it runs in.
// This doesn't change over the life of the active task;
// thus the task can use the slot directory for temp files
// This doesn't change over the life of the job;
// so it can use the slot directory for temp files
// that BOINC doesn't know about.
// If you add anything, initialize it in the constructor
//
struct ACTIVE_TASK {
#ifdef _WIN32
HANDLE process_handle, shm_handle;
@ -100,8 +102,12 @@ struct ACTIVE_TASK {
// most recent CPU time reported by app
bool once_ran_edf;
// END OF ITEMS SAVED IN STATE FILE
// END OF ITEMS SAVED IN STATE FILES
double wss_from_app;
// working set size reported by the app
// (e.g. docker_wrapper does this).
// If nonzero, use this instead of procinfo data
double fraction_done;
// App's estimate of how much of the work unit is done.
// Passed from the application via an API call;

View File

@ -1439,8 +1439,23 @@ bool ACTIVE_TASK::get_app_status_msg() {
}
}
}
parse_double(msg_buf, "<current_cpu_time>", current_cpu_time);
parse_double(msg_buf, "<checkpoint_cpu_time>", checkpoint_cpu_time);
if (parse_double(msg_buf, "<current_cpu_time>", current_cpu_time)) {
if (current_cpu_time < 0) {
msg_printf(result->project, MSG_INFO,
"app reporting negative CPU: %f", current_cpu_time
);
current_cpu_time = 0;
}
}
if (parse_double(msg_buf, "<checkpoint_cpu_time>", checkpoint_cpu_time)) {
if (checkpoint_cpu_time < 0) {
msg_printf(result->project, MSG_INFO,
"app reporting negative checkpoint CPU: %f", checkpoint_cpu_time
);
checkpoint_cpu_time = 0;
}
}
parse_double(msg_buf, "<wss>", wss_from_app);
parse_double(msg_buf, "<fpops_per_cpu_sec>", result->fpops_per_cpu_sec);
parse_double(msg_buf, "<fpops_cumulative>", result->fpops_cumulative);
parse_double(msg_buf, "<intops_per_cpu_sec>", result->intops_per_cpu_sec);
@ -1470,18 +1485,6 @@ bool ACTIVE_TASK::get_app_status_msg() {
if (parse_int(msg_buf, "<sporadic_ac>", i)) {
sporadic_ac_state = (SPORADIC_AC_STATE)i;
}
if (current_cpu_time < 0) {
msg_printf(result->project, MSG_INFO,
"app reporting negative CPU: %f", current_cpu_time
);
current_cpu_time = 0;
}
if (checkpoint_cpu_time < 0) {
msg_printf(result->project, MSG_INFO,
"app reporting negative checkpoint CPU: %f", checkpoint_cpu_time
);
checkpoint_cpu_time = 0;
}
return true;
}

View File

@ -79,14 +79,16 @@ using std::string;
using std::vector;
#define POLL_PERIOD 1.0
#define STATUS_PERIOD 10
// reports status this often
enum JOB_STATUS {JOB_IN_PROGRESS, JOB_SUCCESS, JOB_FAIL};
// Resource usage figures for the running container.
//
struct RSC_USAGE {
    double cpu_time;
        // NOTE(review): not written by get_stats(); confirm whether still used
    double cpu_frac;
        // CPU usage as a fraction (percentage reported by "stats" / 100)
    double wss;
        // memory usage reported by "stats", scaled by its unit suffix

    // reset all fields to zero
    //
    void clear() {
        cpu_time = cpu_frac = wss = 0;
    }
};
@ -442,7 +444,10 @@ void poll_client_msgs() {
}
}
JOB_STATUS poll_app(RSC_USAGE &ru) {
// check whether job has exited
// Note: on both Podman and Docker this takes significant CPU time
// (like .03 sec) so do it infrequently (like 5 sec)
JOB_STATUS poll_app() {
char cmd[1024];
vector<string> out;
int retval;
@ -461,6 +466,41 @@ JOB_STATUS poll_app(RSC_USAGE &ru) {
return JOB_FAIL;
}
// Get the container's CPU and memory usage by running
//      <cli_prog> stats --no-stream --format "{{.CPUPerc}} {{.MemUsage}}" <container_name>
// and parsing the last line of its output, which looks like
//      0.00% 420KiB / 503.8GiB
// Only the first memory figure (current usage) is read; the limit is ignored.
//
// On success: store the CPU fraction (percentage/100) in ru.cpu_frac
// and the memory usage (scaled by its unit suffix) in ru.wss; return 0.
// On failure (command too long, command failed, no output,
// or unparseable output): return -1 and leave ru unchanged.
//
// Note: this command is surprisingly slow, so call it infrequently.
//
int get_stats(RSC_USAGE &ru) {
    char cmd[1024];
    vector<string> out;

    // bounded formatting: the program and container names are not
    // under this function's control, so refuse to run a truncated command
    //
    int len = snprintf(cmd, sizeof(cmd),
        "%s stats --no-stream --format \"{{.CPUPerc}} {{.MemUsage}}\" %s",
        cli_prog, container_name
    );
    if (len < 0 || len >= (int)sizeof(cmd)) return -1;

    if (run_docker_command(cmd, out)) return -1;
    if (out.empty()) return -1;

    // the stats are on the last line of output
    //
    const char *buf = out.back().c_str();
    double cpu_pct, mem;
    char mem_unit;
    int nfields = sscanf(buf, "%lf%% %lf%c", &cpu_pct, &mem, &mem_unit);
    if (nfields != 3) return -1;

    // mem_unit is the first letter of the unit suffix.
    // Accept either case: Docker prints e.g. "KiB",
    // but other CLIs (e.g. Podman) may print "kB".
    //
    switch (mem_unit) {
    case 'G':
    case 'g':
        mem *= GIGA;
        break;
    case 'M':
    case 'm':
        mem *= MEGA;
        break;
    case 'K':
    case 'k':
        mem *= KILO;
        break;
    case 'B':
    case 'b':
        break;
    default:
        return -1;
    }

    ru.cpu_frac = cpu_pct/100.;
    ru.wss = mem;
    return 0;
}
#ifdef _WIN32
// find a WSL distro with Docker and set up a command link to it
//
@ -543,6 +583,14 @@ int main(int argc, char** argv) {
}
if (verbose) config.print();
if (sporadic) {
retval = boinc_sporadic_dir(".");
if (retval) {
fprintf(stderr, "can't create sporadic files\n");
boinc_finish(retval);
}
}
#ifdef _WIN32
retval = wsl_init();
if (retval) {
@ -578,18 +626,35 @@ int main(int argc, char** argv) {
boinc_finish(1);
}
running = true;
while (1) {
double cpu_time = 0;
for (int i=0; ; i++) {
poll_client_msgs();
switch(poll_app(ru)) {
case JOB_FAIL:
cleanup();
boinc_finish(1);
break;
case JOB_SUCCESS:
copy_files_from_container();
cleanup();
boinc_finish(0);
break;
if (i%STATUS_PERIOD == 0) {
switch(poll_app()) {
case JOB_FAIL:
cleanup();
boinc_finish(1);
break;
case JOB_SUCCESS:
copy_files_from_container();
cleanup();
boinc_finish(0);
break;
default:
break;
}
retval = get_stats(ru);
if (!retval) {
cpu_time += STATUS_PERIOD*ru.cpu_frac;
boinc_report_app_status_aux(
cpu_time,
0, // checkpoint CPU time
0, // frac done
0, // other PID
0,0, // bytes send/received
ru.wss
);
}
}
boinc_sleep(POLL_PERIOD);
}

View File

@ -316,7 +316,8 @@ void VBOX_VM::report_clean(
fraction_done,
vm_pid,
bytes_sent,
bytes_received
bytes_received,
0
);
}

View File

@ -889,7 +889,8 @@ int main(int argc, char** argv) {
fraction_done,
pVM->vm_pid,
bytes_sent,
bytes_received
bytes_received,
0
);
// Wait for up to 5 minutes for the VM to switch states.
@ -1373,7 +1374,8 @@ int main(int argc, char** argv) {
fraction_done,
pVM->vm_pid,
bytes_sent,
bytes_received
bytes_received,
0
);
if (!retval) {