mirror of https://github.com/BOINC/boinc.git
VBOX: Make the minimum checkpoint interval project configurable.
VBOX: Accumulate CPU Time across restarts of the wrapper.
This commit is contained in:
parent
6b00c99a21
commit
81bbafc5c5
|
@ -81,6 +81,8 @@ VBOX_VM::VBOX_VM() {
|
||||||
image_filename.clear();
|
image_filename.clear();
|
||||||
floppy_image_filename.clear();
|
floppy_image_filename.clear();
|
||||||
job_duration = 0.0;
|
job_duration = 0.0;
|
||||||
|
current_cpu_time = 0.0;
|
||||||
|
minimum_checkpoint_interval = 600.0;
|
||||||
fraction_done_filename.clear();
|
fraction_done_filename.clear();
|
||||||
suspended = false;
|
suspended = false;
|
||||||
network_suspended = false;
|
network_suspended = false;
|
||||||
|
|
|
@ -98,7 +98,10 @@ public:
|
||||||
// maximum amount of wall-clock time this VM is allowed to run before
|
// maximum amount of wall-clock time this VM is allowed to run before
|
||||||
// considering itself done.
|
// considering itself done.
|
||||||
double job_duration;
|
double job_duration;
|
||||||
|
// amount of CPU time consumed by the VM (note: use get_vm_cpu_time())
|
||||||
double current_cpu_time;
|
double current_cpu_time;
|
||||||
|
// minimum amount of time between checkpoints
|
||||||
|
double minimum_checkpoint_interval;
|
||||||
// name of file where app will write its fraction done
|
// name of file where app will write its fraction done
|
||||||
std::string fraction_done_filename;
|
std::string fraction_done_filename;
|
||||||
// is the VM suspended?
|
// is the VM suspended?
|
||||||
|
|
|
@ -158,6 +158,7 @@ int parse_job_file(VBOX_VM& vm, vector<string>& copy_to_shared) {
|
||||||
else if (xp.parse_string("os_name", vm.os_name)) continue;
|
else if (xp.parse_string("os_name", vm.os_name)) continue;
|
||||||
else if (xp.parse_string("memory_size_mb", vm.memory_size_mb)) continue;
|
else if (xp.parse_string("memory_size_mb", vm.memory_size_mb)) continue;
|
||||||
else if (xp.parse_double("job_duration", vm.job_duration)) continue;
|
else if (xp.parse_double("job_duration", vm.job_duration)) continue;
|
||||||
|
else if (xp.parse_double("minimum_checkpoint_interval", vm.minimum_checkpoint_interval)) continue;
|
||||||
else if (xp.parse_string("fraction_done_filename", vm.fraction_done_filename)) continue;
|
else if (xp.parse_string("fraction_done_filename", vm.fraction_done_filename)) continue;
|
||||||
else if (xp.parse_bool("enable_cern_dataformat", vm.enable_cern_dataformat)) continue;
|
else if (xp.parse_bool("enable_cern_dataformat", vm.enable_cern_dataformat)) continue;
|
||||||
else if (xp.parse_bool("enable_network", vm.enable_network)) continue;
|
else if (xp.parse_bool("enable_network", vm.enable_network)) continue;
|
||||||
|
@ -178,25 +179,28 @@ int parse_job_file(VBOX_VM& vm, vector<string>& copy_to_shared) {
|
||||||
return ERR_XML_PARSE;
|
return ERR_XML_PARSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
void write_checkpoint(double cpu, VBOX_VM& vm) {
|
void write_checkpoint(double elapsed, double cpu, VBOX_VM& vm) {
|
||||||
FILE* f = fopen(CHECKPOINT_FILENAME, "w");
|
FILE* f = fopen(CHECKPOINT_FILENAME, "w");
|
||||||
if (!f) return;
|
if (!f) return;
|
||||||
fprintf(f, "%f %d %d\n", cpu, vm.pf_host_port, vm.rd_host_port);
|
fprintf(f, "%f %f %d %d\n", elapsed, cpu, vm.pf_host_port, vm.rd_host_port);
|
||||||
fclose(f);
|
fclose(f);
|
||||||
}
|
}
|
||||||
|
|
||||||
void read_checkpoint(double& cpu, VBOX_VM& vm) {
|
void read_checkpoint(double& elapsed, double& cpu, VBOX_VM& vm) {
|
||||||
double c;
|
double c;
|
||||||
|
double e;
|
||||||
int pf_host;
|
int pf_host;
|
||||||
int rd_host;
|
int rd_host;
|
||||||
|
elapsed = 0.0;
|
||||||
cpu = 0.0;
|
cpu = 0.0;
|
||||||
vm.pf_host_port = 0;
|
vm.pf_host_port = 0;
|
||||||
vm.rd_host_port = 0;
|
vm.rd_host_port = 0;
|
||||||
FILE* f = fopen(CHECKPOINT_FILENAME, "r");
|
FILE* f = fopen(CHECKPOINT_FILENAME, "r");
|
||||||
if (!f) return;
|
if (!f) return;
|
||||||
int n = fscanf(f, "%lf %d %d", &c, &pf_host, &rd_host);
|
int n = fscanf(f, "%lf %lf %d %d", &e, &c, &pf_host, &rd_host);
|
||||||
fclose(f);
|
fclose(f);
|
||||||
if (n != 3) return;
|
if (n != 4) return;
|
||||||
|
elapsed = e;
|
||||||
cpu = c;
|
cpu = c;
|
||||||
vm.pf_host_port = pf_host;
|
vm.pf_host_port = pf_host;
|
||||||
vm.rd_host_port = rd_host;
|
vm.rd_host_port = rd_host;
|
||||||
|
@ -370,6 +374,7 @@ void set_remote_desktop_info(APP_INIT_DATA& /* aid */, VBOX_VM& vm) {
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
int retval;
|
int retval;
|
||||||
|
int loop_iteraction = 0;
|
||||||
BOINC_OPTIONS boinc_options;
|
BOINC_OPTIONS boinc_options;
|
||||||
VBOX_VM vm;
|
VBOX_VM vm;
|
||||||
APP_INIT_DATA aid;
|
APP_INIT_DATA aid;
|
||||||
|
@ -377,8 +382,9 @@ int main(int argc, char** argv) {
|
||||||
double elapsed_time = 0;
|
double elapsed_time = 0;
|
||||||
double trickle_period = 0;
|
double trickle_period = 0;
|
||||||
double fraction_done = 0;
|
double fraction_done = 0;
|
||||||
double checkpoint_cpu_time = 0;
|
|
||||||
double current_cpu_time = 0;
|
double current_cpu_time = 0;
|
||||||
|
double starting_cpu_time = 0;
|
||||||
|
double last_checkpoint_time = 0;
|
||||||
double last_status_report_time = 0;
|
double last_status_report_time = 0;
|
||||||
double last_trickle_report_time = 0;
|
double last_trickle_report_time = 0;
|
||||||
double stopwatch_starttime = 0;
|
double stopwatch_starttime = 0;
|
||||||
|
@ -607,6 +613,18 @@ int main(int argc, char** argv) {
|
||||||
boinc_finish(retval);
|
boinc_finish(retval);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
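// Report the configured minimum checkpoint interval. NOTE(review): this block also sets vm.headless = false, which looks copy-pasted from the start-mode block - confirm it is intended.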
|
||||||
|
//
|
||||||
|
if (vm.minimum_checkpoint_interval) {
|
||||||
|
fprintf(
|
||||||
|
stderr,
|
||||||
|
"%s Detected: minimum checkpoint interval (%f seconds)\n",
|
||||||
|
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||||
|
vm.minimum_checkpoint_interval
|
||||||
|
);
|
||||||
|
vm.headless = false;
|
||||||
|
}
|
||||||
|
|
||||||
// Validate whatever configuration options we can
|
// Validate whatever configuration options we can
|
||||||
//
|
//
|
||||||
if (vm.enable_shared_directory) {
|
if (vm.enable_shared_directory) {
|
||||||
|
@ -701,8 +719,7 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
// Restore from checkpoint
|
// Restore from checkpoint
|
||||||
//
|
//
|
||||||
read_checkpoint(checkpoint_cpu_time, vm);
|
read_checkpoint(elapsed_time, current_cpu_time, vm);
|
||||||
elapsed_time = checkpoint_cpu_time;
|
|
||||||
|
|
||||||
// Should we even try to start things up?
|
// Should we even try to start things up?
|
||||||
//
|
//
|
||||||
|
@ -710,7 +727,7 @@ int main(int argc, char** argv) {
|
||||||
return EXIT_TIME_LIMIT_EXCEEDED;
|
return EXIT_TIME_LIMIT_EXCEEDED;
|
||||||
}
|
}
|
||||||
|
|
||||||
retval = vm.run((elapsed_time > 0));
|
retval = vm.run((current_cpu_time > 0));
|
||||||
if (retval) {
|
if (retval) {
|
||||||
// All 'failure to start' errors are unrecoverable by default
|
// All 'failure to start' errors are unrecoverable by default
|
||||||
bool unrecoverable_error = true;
|
bool unrecoverable_error = true;
|
||||||
|
@ -788,7 +805,7 @@ int main(int argc, char** argv) {
|
||||||
if (!skip_cleanup) {
|
if (!skip_cleanup) {
|
||||||
vm.cleanup();
|
vm.cleanup();
|
||||||
}
|
}
|
||||||
write_checkpoint(elapsed_time, vm);
|
write_checkpoint(elapsed_time, current_cpu_time, vm);
|
||||||
|
|
||||||
if (error_reason.size()) {
|
if (error_reason.size()) {
|
||||||
fprintf(
|
fprintf(
|
||||||
|
@ -811,8 +828,8 @@ int main(int argc, char** argv) {
|
||||||
//
|
//
|
||||||
if (vm.vm_pid) {
|
if (vm.vm_pid) {
|
||||||
retval = boinc_report_app_status_aux(
|
retval = boinc_report_app_status_aux(
|
||||||
elapsed_time,
|
current_cpu_time,
|
||||||
checkpoint_cpu_time,
|
last_checkpoint_time,
|
||||||
fraction_done,
|
fraction_done,
|
||||||
vm.vm_pid,
|
vm.vm_pid,
|
||||||
bytes_sent,
|
bytes_sent,
|
||||||
|
@ -849,8 +866,8 @@ int main(int argc, char** argv) {
|
||||||
buf
|
buf
|
||||||
);
|
);
|
||||||
retval = boinc_report_app_status_aux(
|
retval = boinc_report_app_status_aux(
|
||||||
elapsed_time,
|
current_cpu_time,
|
||||||
checkpoint_cpu_time,
|
last_checkpoint_time,
|
||||||
fraction_done,
|
fraction_done,
|
||||||
vm.vm_pid,
|
vm.vm_pid,
|
||||||
bytes_sent,
|
bytes_sent,
|
||||||
|
@ -897,7 +914,7 @@ int main(int argc, char** argv) {
|
||||||
set_floppy_image(aid, vm);
|
set_floppy_image(aid, vm);
|
||||||
set_port_forwarding_info(aid, vm);
|
set_port_forwarding_info(aid, vm);
|
||||||
set_remote_desktop_info(aid, vm);
|
set_remote_desktop_info(aid, vm);
|
||||||
write_checkpoint(elapsed_time, vm);
|
write_checkpoint(elapsed_time, current_cpu_time, vm);
|
||||||
|
|
||||||
// Force throttling on our first pass through the loop
|
// Force throttling on our first pass through the loop
|
||||||
boinc_status.reread_init_data_file = true;
|
boinc_status.reread_init_data_file = true;
|
||||||
|
@ -905,6 +922,7 @@ int main(int argc, char** argv) {
|
||||||
while (1) {
|
while (1) {
|
||||||
// Begin stopwatch timer
|
// Begin stopwatch timer
|
||||||
stopwatch_starttime = dtime();
|
stopwatch_starttime = dtime();
|
||||||
|
loop_iteraction += 1;
|
||||||
|
|
||||||
// Discover the VM's current state
|
// Discover the VM's current state
|
||||||
vm.poll();
|
vm.poll();
|
||||||
|
@ -1003,8 +1021,8 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
// Basic bookkeeping
|
// Basic bookkeeping
|
||||||
//
|
//
|
||||||
if ((int)elapsed_time % 10) {
|
if (loop_iteraction % 10) {
|
||||||
current_cpu_time = vm.get_vm_cpu_time();
|
current_cpu_time = starting_cpu_time + vm.get_vm_cpu_time();
|
||||||
}
|
}
|
||||||
if (vm.job_duration) {
|
if (vm.job_duration) {
|
||||||
fraction_done = elapsed_time / vm.job_duration;
|
fraction_done = elapsed_time / vm.job_duration;
|
||||||
|
@ -1016,61 +1034,63 @@ int main(int argc, char** argv) {
|
||||||
}
|
}
|
||||||
boinc_report_app_status(
|
boinc_report_app_status(
|
||||||
current_cpu_time,
|
current_cpu_time,
|
||||||
checkpoint_cpu_time,
|
last_checkpoint_time,
|
||||||
fraction_done
|
fraction_done
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Dump a status report at regular intervals
|
||||||
|
//
|
||||||
|
if ((elapsed_time - last_status_report_time) >= 6000.0) {
|
||||||
|
last_status_report_time = elapsed_time;
|
||||||
|
fprintf(
|
||||||
|
stderr,
|
||||||
|
"%s Status Report: CPU Time: '%f'\n",
|
||||||
|
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||||
|
current_cpu_time
|
||||||
|
);
|
||||||
|
if (vm.job_duration) {
|
||||||
|
fprintf(
|
||||||
|
stderr,
|
||||||
|
"%s Status Report: Job Duration: '%f'\n",
|
||||||
|
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||||
|
vm.job_duration
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (elapsed_time) {
|
||||||
|
fprintf(
|
||||||
|
stderr,
|
||||||
|
"%s Status Report: Elapsed Time: '%f'\n",
|
||||||
|
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||||
|
elapsed_time
|
||||||
|
);
|
||||||
|
}
|
||||||
|
if (aid.global_prefs.daily_xfer_limit_mb) {
|
||||||
|
fprintf(
|
||||||
|
stderr,
|
||||||
|
"%s Status Report: Network Bytes Sent (Total): '%f'\n",
|
||||||
|
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||||
|
bytes_sent
|
||||||
|
);
|
||||||
|
fprintf(
|
||||||
|
stderr,
|
||||||
|
"%s Status Report: Network Bytes Received (Total): '%f'\n",
|
||||||
|
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||||
|
bytes_received
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
vm.dumphypervisorstatusreports();
|
||||||
|
}
|
||||||
|
|
||||||
if (boinc_time_to_checkpoint()) {
|
if (boinc_time_to_checkpoint()) {
|
||||||
// Only perform a VM checkpoint every ten minutes or so.
|
// Only perform a VM checkpoint every ten minutes or so.
|
||||||
//
|
//
|
||||||
if (current_cpu_time >= checkpoint_cpu_time + random_checkpoint_factor + 600.0) {
|
if (current_cpu_time >= last_checkpoint_time + vm.minimum_checkpoint_interval + random_checkpoint_factor) {
|
||||||
// Basic interleave factor is only needed once.
|
// Basic interleave factor is only needed once.
|
||||||
if (random_checkpoint_factor > 0) {
|
if (random_checkpoint_factor > 0) {
|
||||||
random_checkpoint_factor = 0.0;
|
random_checkpoint_factor = 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((current_cpu_time - last_status_report_time) >= 6000.0) {
|
|
||||||
last_status_report_time = current_cpu_time;
|
|
||||||
fprintf(
|
|
||||||
stderr,
|
|
||||||
"%s Status Report: CPU Time: '%f'\n",
|
|
||||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
|
||||||
current_cpu_time
|
|
||||||
);
|
|
||||||
if (vm.job_duration) {
|
|
||||||
fprintf(
|
|
||||||
stderr,
|
|
||||||
"%s Status Report: Job Duration: '%f'\n",
|
|
||||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
|
||||||
vm.job_duration
|
|
||||||
);
|
|
||||||
}
|
|
||||||
if (elapsed_time) {
|
|
||||||
fprintf(
|
|
||||||
stderr,
|
|
||||||
"%s Status Report: Elapsed Time: '%f'\n",
|
|
||||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
|
||||||
elapsed_time
|
|
||||||
);
|
|
||||||
}
|
|
||||||
if (aid.global_prefs.daily_xfer_limit_mb) {
|
|
||||||
fprintf(
|
|
||||||
stderr,
|
|
||||||
"%s Status Report: Network Bytes Sent (Total): '%f'\n",
|
|
||||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
|
||||||
bytes_sent
|
|
||||||
);
|
|
||||||
fprintf(
|
|
||||||
stderr,
|
|
||||||
"%s Status Report: Network Bytes Received (Total): '%f'\n",
|
|
||||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
|
||||||
bytes_received
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
vm.dumphypervisorstatusreports();
|
|
||||||
}
|
|
||||||
|
|
||||||
// Checkpoint
|
// Checkpoint
|
||||||
retval = vm.createsnapshot(current_cpu_time);
|
retval = vm.createsnapshot(current_cpu_time);
|
||||||
if (retval) {
|
if (retval) {
|
||||||
|
@ -1088,8 +1108,8 @@ int main(int argc, char** argv) {
|
||||||
} else {
|
} else {
|
||||||
// tell BOINC we've successfully created a checkpoint.
|
// tell BOINC we've successfully created a checkpoint.
|
||||||
//
|
//
|
||||||
checkpoint_cpu_time = current_cpu_time;
|
last_checkpoint_time = current_cpu_time;
|
||||||
write_checkpoint(checkpoint_cpu_time, vm);
|
write_checkpoint(elapsed_time, current_cpu_time, vm);
|
||||||
boinc_checkpoint_completed();
|
boinc_checkpoint_completed();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1097,12 +1117,12 @@ int main(int argc, char** argv) {
|
||||||
|
|
||||||
if (trickle_period) {
|
if (trickle_period) {
|
||||||
if ((elapsed_time - last_trickle_report_time) >= trickle_period) {
|
if ((elapsed_time - last_trickle_report_time) >= trickle_period) {
|
||||||
|
last_trickle_report_time = elapsed_time;
|
||||||
fprintf(
|
fprintf(
|
||||||
stderr,
|
stderr,
|
||||||
"%s Status Report: Trickle-Up Event.\n",
|
"%s Status Report: Trickle-Up Event.\n",
|
||||||
vboxwrapper_msg_prefix(buf, sizeof(buf))
|
vboxwrapper_msg_prefix(buf, sizeof(buf))
|
||||||
);
|
);
|
||||||
last_trickle_report_time = elapsed_time;
|
|
||||||
sprintf(buf,
|
sprintf(buf,
|
||||||
"<cpu_time>%f</cpu_time>", last_trickle_report_time
|
"<cpu_time>%f</cpu_time>", last_trickle_report_time
|
||||||
);
|
);
|
||||||
|
@ -1200,7 +1220,7 @@ int main(int argc, char** argv) {
|
||||||
if (report_net_usage) {
|
if (report_net_usage) {
|
||||||
retval = boinc_report_app_status_aux(
|
retval = boinc_report_app_status_aux(
|
||||||
elapsed_time,
|
elapsed_time,
|
||||||
checkpoint_cpu_time,
|
last_checkpoint_time,
|
||||||
fraction_done,
|
fraction_done,
|
||||||
vm.vm_pid,
|
vm.vm_pid,
|
||||||
bytes_sent,
|
bytes_sent,
|
||||||
|
|
Loading…
Reference in New Issue