mirror of https://github.com/BOINC/boinc.git
client/API/docker_wrapper: get CPU and mem usage of Docker apps
Mem usage (WSS): The easiest way to get the WSS of a Docker container is to ask Docker using the "docker stats" command. So I have docker_wrapper do this periodically (10 sec... it's a bit slow). But how to get this back to the client? Currently there's no provision for an app to report its own WSS. So I added one, by adding an optional field to the app status messages sent from app to client in shared mem. If this is present, the client uses it instead of procinfo. CPU time: "docker stats" reports CPU fraction (averaged over what period?). We multiply that by the stats poll period. Not exactly the same as CPU time, but close enough.
This commit is contained in:
parent
96370db915
commit
8020fde86a
|
@ -1052,7 +1052,8 @@ int boinc_report_app_status_aux(
|
|||
double _fraction_done,
|
||||
int other_pid,
|
||||
double _bytes_sent,
|
||||
double _bytes_received
|
||||
double _bytes_received,
|
||||
double wss
|
||||
) {
|
||||
char msg_buf[MSG_CHANNEL_SIZE], buf[1024];
|
||||
if (standalone) return 0;
|
||||
|
@ -1081,6 +1082,10 @@ int boinc_report_app_status_aux(
|
|||
sprintf(buf, "<sporadic_ac>%d</sporadic_ac>\n", ac_state);
|
||||
strlcat(msg_buf, buf, sizeof(msg_buf));
|
||||
}
|
||||
if (wss) {
|
||||
sprintf(buf, "<wss>%f</wss>\n", wss);
|
||||
strlcat(msg_buf, buf, sizeof(msg_buf));
|
||||
}
|
||||
#ifdef MSGS_FROM_FILE
|
||||
if (fout) {
|
||||
fputs(msg_buf, fout);
|
||||
|
@ -1100,7 +1105,7 @@ int boinc_report_app_status(
|
|||
double _fraction_done
|
||||
){
|
||||
return boinc_report_app_status_aux(
|
||||
cpu_time, checkpoint_cpu_time, _fraction_done, 0, 0, 0
|
||||
cpu_time, checkpoint_cpu_time, _fraction_done, 0, 0, 0, 0
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
@ -138,7 +138,8 @@ extern int boinc_upload_status(std::string& name);
|
|||
extern char* boinc_msg_prefix(char*, int);
|
||||
extern int boinc_report_app_status_aux(
|
||||
double cpu_time, double checkpoint_cpu_time, double _fraction_done,
|
||||
int other_pid, double bytes_sent, double bytes_received
|
||||
int other_pid, double bytes_sent, double bytes_received,
|
||||
double wss
|
||||
);
|
||||
extern int boinc_temporary_exit(
|
||||
int delay, const char* reason=NULL, bool is_notice=false
|
||||
|
|
|
@ -110,6 +110,7 @@ ACTIVE_TASK::ACTIVE_TASK() {
|
|||
peak_disk_usage = 0;
|
||||
once_ran_edf = false;
|
||||
|
||||
wss_from_app = 0;
|
||||
fraction_done = 0;
|
||||
fraction_done_elapsed_time = 0;
|
||||
first_fraction_done = 0;
|
||||
|
@ -420,6 +421,8 @@ void ACTIVE_TASK_SET::get_memory_usage() {
|
|||
// at least on Windows. Use the VM size instead.
|
||||
//
|
||||
pi.working_set_size_smoothed = atp->wup->rsc_memory_bound;
|
||||
} else if (atp->wss_from_app > 0) {
|
||||
pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + atp->wss_from_app);
|
||||
} else {
|
||||
pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + pi.working_set_size);
|
||||
}
|
||||
|
|
14
client/app.h
14
client/app.h
|
@ -53,12 +53,14 @@ typedef int PROCESS_ID;
|
|||
|
||||
// Represents a job in progress.
|
||||
|
||||
// When an active task is created, it is assigned a "slot"
|
||||
// When a job is started, it is assigned a "slot"
|
||||
// which determines the directory it runs in.
|
||||
// This doesn't change over the life of the active task;
|
||||
// thus the task can use the slot directory for temp files
|
||||
// This doesn't change over the life of the job;
|
||||
// so it can use the slot directory for temp files
|
||||
// that BOINC doesn't know about.
|
||||
|
||||
// If you add anything, initialize it in the constructor
|
||||
//
|
||||
struct ACTIVE_TASK {
|
||||
#ifdef _WIN32
|
||||
HANDLE process_handle, shm_handle;
|
||||
|
@ -100,8 +102,12 @@ struct ACTIVE_TASK {
|
|||
// most recent CPU time reported by app
|
||||
bool once_ran_edf;
|
||||
|
||||
// END OF ITEMS SAVED IN STATE FILE
|
||||
// END OF ITEMS SAVED IN STATE FILES
|
||||
|
||||
double wss_from_app;
|
||||
// working set size reported by the app
|
||||
// (e.g. docker_wrapper does this).
|
||||
// If nonzero, use this instead of procinfo data
|
||||
double fraction_done;
|
||||
// App's estimate of how much of the work unit is done.
|
||||
// Passed from the application via an API call;
|
||||
|
|
|
@ -1439,8 +1439,23 @@ bool ACTIVE_TASK::get_app_status_msg() {
|
|||
}
|
||||
}
|
||||
}
|
||||
parse_double(msg_buf, "<current_cpu_time>", current_cpu_time);
|
||||
parse_double(msg_buf, "<checkpoint_cpu_time>", checkpoint_cpu_time);
|
||||
if (parse_double(msg_buf, "<current_cpu_time>", current_cpu_time)) {
|
||||
if (current_cpu_time < 0) {
|
||||
msg_printf(result->project, MSG_INFO,
|
||||
"app reporting negative CPU: %f", current_cpu_time
|
||||
);
|
||||
current_cpu_time = 0;
|
||||
}
|
||||
}
|
||||
if (parse_double(msg_buf, "<checkpoint_cpu_time>", checkpoint_cpu_time)) {
|
||||
if (checkpoint_cpu_time < 0) {
|
||||
msg_printf(result->project, MSG_INFO,
|
||||
"app reporting negative checkpoint CPU: %f", checkpoint_cpu_time
|
||||
);
|
||||
checkpoint_cpu_time = 0;
|
||||
}
|
||||
}
|
||||
parse_double(msg_buf, "<wss>", wss_from_app);
|
||||
parse_double(msg_buf, "<fpops_per_cpu_sec>", result->fpops_per_cpu_sec);
|
||||
parse_double(msg_buf, "<fpops_cumulative>", result->fpops_cumulative);
|
||||
parse_double(msg_buf, "<intops_per_cpu_sec>", result->intops_per_cpu_sec);
|
||||
|
@ -1470,18 +1485,6 @@ bool ACTIVE_TASK::get_app_status_msg() {
|
|||
if (parse_int(msg_buf, "<sporadic_ac>", i)) {
|
||||
sporadic_ac_state = (SPORADIC_AC_STATE)i;
|
||||
}
|
||||
if (current_cpu_time < 0) {
|
||||
msg_printf(result->project, MSG_INFO,
|
||||
"app reporting negative CPU: %f", current_cpu_time
|
||||
);
|
||||
current_cpu_time = 0;
|
||||
}
|
||||
if (checkpoint_cpu_time < 0) {
|
||||
msg_printf(result->project, MSG_INFO,
|
||||
"app reporting negative checkpoint CPU: %f", checkpoint_cpu_time
|
||||
);
|
||||
checkpoint_cpu_time = 0;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -79,14 +79,16 @@ using std::string;
|
|||
using std::vector;
|
||||
|
||||
#define POLL_PERIOD 1.0
|
||||
#define STATUS_PERIOD 10
|
||||
// reports status this often
|
||||
|
||||
enum JOB_STATUS {JOB_IN_PROGRESS, JOB_SUCCESS, JOB_FAIL};
|
||||
|
||||
// Resource usage figures for the running job.
//
struct RSC_USAGE {
    double cpu_time;
    double cpu_frac;
        // get_stats() stores (CPU percentage)/100 here
    double wss;
        // get_stats() stores the memory usage here

    // zero all of the usage figures
    //
    void clear() {
        cpu_time = cpu_frac = wss = 0;
    }
};
|
||||
|
@ -442,7 +444,10 @@ void poll_client_msgs() {
|
|||
}
|
||||
}
|
||||
|
||||
JOB_STATUS poll_app(RSC_USAGE &ru) {
|
||||
// check whether job has exited
|
||||
// Note: on both Podman and Docker this takes significant CPU time
|
||||
// (like .03 sec) so do it infrequently (like 5 sec)
|
||||
JOB_STATUS poll_app() {
|
||||
char cmd[1024];
|
||||
vector<string> out;
|
||||
int retval;
|
||||
|
@ -461,6 +466,41 @@ JOB_STATUS poll_app(RSC_USAGE &ru) {
|
|||
return JOB_FAIL;
|
||||
}
|
||||
|
||||
// get CPU and mem usage
|
||||
// This is also surprisingly slow
|
||||
int get_stats(RSC_USAGE &ru) {
|
||||
char cmd[1024];
|
||||
vector<string> out;
|
||||
int retval;
|
||||
unsigned int n;
|
||||
|
||||
sprintf(cmd,
|
||||
"%s stats --no-stream --format \"{{.CPUPerc}} {{.MemUsage}}\" %s",
|
||||
cli_prog, container_name
|
||||
);
|
||||
retval = run_docker_command(cmd, out);
|
||||
if (retval) return -1;
|
||||
n = out.size();
|
||||
if (n == 0) return -1;
|
||||
const char *buf = out[n-1].c_str();
|
||||
// output is like
|
||||
// 0.00% 420KiB / 503.8GiB
|
||||
double cpu_pct, mem;
|
||||
char mem_unit;
|
||||
n = sscanf(buf, "%lf%% %lf%c", &cpu_pct, &mem, &mem_unit);
|
||||
if (n != 3) return -1;
|
||||
switch (mem_unit) {
|
||||
case 'G': mem *= GIGA; break;
|
||||
case 'M': mem *= MEGA; break;
|
||||
case 'K': mem *= KILO; break;
|
||||
case 'B': break;
|
||||
default: return -1;
|
||||
}
|
||||
ru.cpu_frac = cpu_pct/100.;
|
||||
ru.wss = mem;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
// find a WSL distro with Docker and set up a command link to it
|
||||
//
|
||||
|
@ -543,6 +583,14 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
if (verbose) config.print();
|
||||
|
||||
if (sporadic) {
|
||||
retval = boinc_sporadic_dir(".");
|
||||
if (retval) {
|
||||
fprintf(stderr, "can't create sporadic files\n");
|
||||
boinc_finish(retval);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef _WIN32
|
||||
retval = wsl_init();
|
||||
if (retval) {
|
||||
|
@ -578,18 +626,35 @@ int main(int argc, char** argv) {
|
|||
boinc_finish(1);
|
||||
}
|
||||
running = true;
|
||||
while (1) {
|
||||
double cpu_time = 0;
|
||||
for (int i=0; ; i++) {
|
||||
poll_client_msgs();
|
||||
switch(poll_app(ru)) {
|
||||
case JOB_FAIL:
|
||||
cleanup();
|
||||
boinc_finish(1);
|
||||
break;
|
||||
case JOB_SUCCESS:
|
||||
copy_files_from_container();
|
||||
cleanup();
|
||||
boinc_finish(0);
|
||||
break;
|
||||
if (i%STATUS_PERIOD == 0) {
|
||||
switch(poll_app()) {
|
||||
case JOB_FAIL:
|
||||
cleanup();
|
||||
boinc_finish(1);
|
||||
break;
|
||||
case JOB_SUCCESS:
|
||||
copy_files_from_container();
|
||||
cleanup();
|
||||
boinc_finish(0);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
retval = get_stats(ru);
|
||||
if (!retval) {
|
||||
cpu_time += STATUS_PERIOD*ru.cpu_frac;
|
||||
boinc_report_app_status_aux(
|
||||
cpu_time,
|
||||
0, // checkpoint CPU time
|
||||
0, // frac done
|
||||
0, // other PID
|
||||
0,0, // bytes send/received
|
||||
ru.wss
|
||||
);
|
||||
}
|
||||
}
|
||||
boinc_sleep(POLL_PERIOD);
|
||||
}
|
||||
|
|
|
@ -316,7 +316,8 @@ void VBOX_VM::report_clean(
|
|||
fraction_done,
|
||||
vm_pid,
|
||||
bytes_sent,
|
||||
bytes_received
|
||||
bytes_received,
|
||||
0
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
@ -889,7 +889,8 @@ int main(int argc, char** argv) {
|
|||
fraction_done,
|
||||
pVM->vm_pid,
|
||||
bytes_sent,
|
||||
bytes_received
|
||||
bytes_received,
|
||||
0
|
||||
);
|
||||
|
||||
// Wait for up to 5 minutes for the VM to switch states.
|
||||
|
@ -1373,7 +1374,8 @@ int main(int argc, char** argv) {
|
|||
fraction_done,
|
||||
pVM->vm_pid,
|
||||
bytes_sent,
|
||||
bytes_received
|
||||
bytes_received,
|
||||
0
|
||||
);
|
||||
|
||||
if (!retval) {
|
||||
|
|
Loading…
Reference in New Issue