diff --git a/api/boinc_api.cpp b/api/boinc_api.cpp index be7b345d3e..6b1709f6d0 100644 --- a/api/boinc_api.cpp +++ b/api/boinc_api.cpp @@ -1052,7 +1052,8 @@ int boinc_report_app_status_aux( double _fraction_done, int other_pid, double _bytes_sent, - double _bytes_received + double _bytes_received, + double wss ) { char msg_buf[MSG_CHANNEL_SIZE], buf[1024]; if (standalone) return 0; @@ -1081,6 +1082,10 @@ int boinc_report_app_status_aux( sprintf(buf, "%d\n", ac_state); strlcat(msg_buf, buf, sizeof(msg_buf)); } + if (wss) { + sprintf(buf, "%f\n", wss); + strlcat(msg_buf, buf, sizeof(msg_buf)); + } #ifdef MSGS_FROM_FILE if (fout) { fputs(msg_buf, fout); @@ -1100,7 +1105,7 @@ int boinc_report_app_status( double _fraction_done ){ return boinc_report_app_status_aux( - cpu_time, checkpoint_cpu_time, _fraction_done, 0, 0, 0 + cpu_time, checkpoint_cpu_time, _fraction_done, 0, 0, 0, 0 ); } diff --git a/api/boinc_api.h b/api/boinc_api.h index acb65bb845..758cf7e55b 100644 --- a/api/boinc_api.h +++ b/api/boinc_api.h @@ -138,7 +138,8 @@ extern int boinc_upload_status(std::string& name); extern char* boinc_msg_prefix(char*, int); extern int boinc_report_app_status_aux( double cpu_time, double checkpoint_cpu_time, double _fraction_done, - int other_pid, double bytes_sent, double bytes_received + int other_pid, double bytes_sent, double bytes_received, + double wss ); extern int boinc_temporary_exit( int delay, const char* reason=NULL, bool is_notice=false diff --git a/client/app.cpp b/client/app.cpp index eda742eb22..8fbfba2c77 100644 --- a/client/app.cpp +++ b/client/app.cpp @@ -110,6 +110,7 @@ ACTIVE_TASK::ACTIVE_TASK() { peak_disk_usage = 0; once_ran_edf = false; + wss_from_app = 0; fraction_done = 0; fraction_done_elapsed_time = 0; first_fraction_done = 0; @@ -420,6 +421,8 @@ void ACTIVE_TASK_SET::get_memory_usage() { // at least on Windows. Use the VM size instead. // pi.working_set_size_smoothed = atp->wup->rsc_memory_bound; + } else if (atp->wss_from_app > 0) { + pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + atp->wss_from_app); } else { pi.working_set_size_smoothed = .5*(pi.working_set_size_smoothed + pi.working_set_size); } diff --git a/client/app.h b/client/app.h index d7b306bea2..72534512ef 100644 --- a/client/app.h +++ b/client/app.h @@ -53,12 +53,14 @@ typedef int PROCESS_ID; // Represents a job in progress. -// When an active task is created, it is assigned a "slot" +// When a job is started, it is assigned a "slot" // which determines the directory it runs in. -// This doesn't change over the life of the active task; -// thus the task can use the slot directory for temp files +// This doesn't change over the life of the job; +// so it can use the slot directory for temp files // that BOINC doesn't know about. +// If you add anything, initialize it in the constructor +// struct ACTIVE_TASK { #ifdef _WIN32 HANDLE process_handle, shm_handle; @@ -100,8 +102,12 @@ struct ACTIVE_TASK { // most recent CPU time reported by app bool once_ran_edf; - // END OF ITEMS SAVED IN STATE FILE + // END OF ITEMS SAVED IN STATE FILES + double wss_from_app; + // work set size reported by the app + // (e.g. docker_wrapper does this). + // If nonzero, use this instead of procinfo data double fraction_done; // App's estimate of how much of the work unit is done. // Passed from the application via an API call; diff --git a/client/app_control.cpp b/client/app_control.cpp index b03a9875db..14c628461c 100644 --- a/client/app_control.cpp +++ b/client/app_control.cpp @@ -1439,8 +1439,23 @@ bool ACTIVE_TASK::get_app_status_msg() { } } } - parse_double(msg_buf, "", current_cpu_time); - parse_double(msg_buf, "", checkpoint_cpu_time); + if (parse_double(msg_buf, "", current_cpu_time)) { + if (current_cpu_time < 0) { + msg_printf(result->project, MSG_INFO, + "app reporting negative CPU: %f", current_cpu_time + ); + current_cpu_time = 0; + } + } + if (parse_double(msg_buf, "", checkpoint_cpu_time)) { + if (checkpoint_cpu_time < 0) { + msg_printf(result->project, MSG_INFO, + "app reporting negative checkpoint CPU: %f", checkpoint_cpu_time + ); + checkpoint_cpu_time = 0; + } + } + parse_double(msg_buf, "", wss_from_app); parse_double(msg_buf, "", result->fpops_per_cpu_sec); parse_double(msg_buf, "", result->fpops_cumulative); parse_double(msg_buf, "", result->intops_per_cpu_sec); @@ -1470,18 +1485,6 @@ bool ACTIVE_TASK::get_app_status_msg() { if (parse_int(msg_buf, "", i)) { sporadic_ac_state = (SPORADIC_AC_STATE)i; } - if (current_cpu_time < 0) { - msg_printf(result->project, MSG_INFO, - "app reporting negative CPU: %f", current_cpu_time - ); - current_cpu_time = 0; - } - if (checkpoint_cpu_time < 0) { - msg_printf(result->project, MSG_INFO, - "app reporting negative checkpoint CPU: %f", checkpoint_cpu_time - ); - checkpoint_cpu_time = 0; - } return true; } diff --git a/samples/docker_wrapper/docker_wrapper.cpp b/samples/docker_wrapper/docker_wrapper.cpp index 70d9df0039..09458bc445 100644 --- a/samples/docker_wrapper/docker_wrapper.cpp +++ b/samples/docker_wrapper/docker_wrapper.cpp @@ -79,14 +79,16 @@ using std::string; using std::vector; #define POLL_PERIOD 1.0 +#define STATUS_PERIOD 10 + // reports status this often enum JOB_STATUS {JOB_IN_PROGRESS, JOB_SUCCESS, JOB_FAIL}; struct RSC_USAGE { - double cpu_time; + double cpu_frac; double wss; void clear() { - cpu_time = 0; + cpu_frac = 0; wss = 0; } }; @@ -442,7 +444,10 @@ void poll_client_msgs() { } } -JOB_STATUS poll_app(RSC_USAGE &ru) { +// check whether job has exited +// Note: on both Podman and Docker this takes significant CPU time +// (like .03 sec) so do it infrequently (like 5 sec) +JOB_STATUS poll_app() { char cmd[1024]; vector out; int retval; @@ -461,6 +466,41 @@ JOB_STATUS poll_app(RSC_USAGE &ru) { return JOB_FAIL; } +// get CPU and mem usage +// This is also surprisingly slow +int get_stats(RSC_USAGE &ru) { + char cmd[1024]; + vector out; + int retval; + unsigned int n; + + sprintf(cmd, + "%s stats --no-stream --format \"{{.CPUPerc}} {{.MemUsage}}\" %s", + cli_prog, container_name + ); + retval = run_docker_command(cmd, out); + if (retval) return -1; + n = out.size(); + if (n == 0) return -1; + const char *buf = out[n-1].c_str(); + // output is like + // 0.00% 420KiB / 503.8GiB + double cpu_pct, mem; + char mem_unit; + n = sscanf(buf, "%lf%% %lf%c", &cpu_pct, &mem, &mem_unit); + if (n != 3) return -1; + switch (mem_unit) { + case 'G': mem *= GIGA; break; + case 'M': mem *= MEGA; break; + case 'K': mem *= KILO; break; + case 'B': break; + default: return -1; + } + ru.cpu_frac = cpu_pct/100.; + ru.wss = mem; + return 0; +} + #ifdef _WIN32 // find a WSL distro with Docker and set up a command link to it // @@ -543,6 +583,14 @@ int main(int argc, char** argv) { } if (verbose) config.print(); + if (sporadic) { + retval = boinc_sporadic_dir("."); + if (retval) { + fprintf(stderr, "can't create sporadic files\n"); + boinc_finish(retval); + } + } + #ifdef _WIN32 retval = wsl_init(); if (retval) { @@ -578,18 +626,35 @@ int main(int argc, char** argv) { boinc_finish(1); } running = true; - while (1) { + double cpu_time = 0; + for (int i=0; ; i++) { poll_client_msgs(); - switch(poll_app(ru)) { - case JOB_FAIL: - cleanup(); - boinc_finish(1); - break; - case JOB_SUCCESS: - copy_files_from_container(); - cleanup(); - boinc_finish(0); - break; + if (i%STATUS_PERIOD == 0) { + switch(poll_app()) { + case JOB_FAIL: + cleanup(); + boinc_finish(1); + break; + case JOB_SUCCESS: + copy_files_from_container(); + cleanup(); + boinc_finish(0); + break; + default: + break; + } + retval = get_stats(ru); + if (!retval) { + cpu_time += STATUS_PERIOD*ru.cpu_frac; + boinc_report_app_status_aux( + cpu_time, + 0, // checkpoint CPU time + 0, // frac done + 0, // other PID + 0,0, // bytes send/received + ru.wss + ); + } } boinc_sleep(POLL_PERIOD); } diff --git a/samples/vboxwrapper/vbox_common.cpp b/samples/vboxwrapper/vbox_common.cpp index 42664be020..cd42a816ef 100644 --- a/samples/vboxwrapper/vbox_common.cpp +++ b/samples/vboxwrapper/vbox_common.cpp @@ -316,7 +316,8 @@ void VBOX_VM::report_clean( fraction_done, vm_pid, bytes_sent, - bytes_received + bytes_received, + 0 ); } diff --git a/samples/vboxwrapper/vboxwrapper.cpp b/samples/vboxwrapper/vboxwrapper.cpp index 9f15ee68ae..75eec21aa6 100644 --- a/samples/vboxwrapper/vboxwrapper.cpp +++ b/samples/vboxwrapper/vboxwrapper.cpp @@ -889,7 +889,8 @@ int main(int argc, char** argv) { fraction_done, pVM->vm_pid, bytes_sent, - bytes_received + bytes_received, + 0 ); // Wait for up to 5 minutes for the VM to switch states. @@ -1373,7 +1374,8 @@ int main(int argc, char** argv) { fraction_done, pVM->vm_pid, bytes_sent, - bytes_received + bytes_received, + 0 ); if (!retval) {