mirror of https://github.com/BOINC/boinc.git
VBOX: Make the minimum checkpoint interval project configurable.
VBOX: Accumulate CPU Time across restarts of the wrapper.
This commit is contained in:
parent
6b00c99a21
commit
81bbafc5c5
|
@ -81,6 +81,8 @@ VBOX_VM::VBOX_VM() {
|
|||
image_filename.clear();
|
||||
floppy_image_filename.clear();
|
||||
job_duration = 0.0;
|
||||
current_cpu_time = 0.0;
|
||||
minimum_checkpoint_interval = 600.0;
|
||||
fraction_done_filename.clear();
|
||||
suspended = false;
|
||||
network_suspended = false;
|
||||
|
|
|
@ -98,7 +98,10 @@ public:
|
|||
// maximum amount of wall-clock time this VM is allowed to run before
|
||||
// considering itself done.
|
||||
double job_duration;
|
||||
// amount of CPU time consumed by the VM (note: use get_vm_cpu_time())
|
||||
double current_cpu_time;
|
||||
// minimum amount of time between checkpoints
|
||||
double minimum_checkpoint_interval;
|
||||
// name of file where app will write its fraction done
|
||||
std::string fraction_done_filename;
|
||||
// is the VM suspended?
|
||||
|
|
|
@ -158,6 +158,7 @@ int parse_job_file(VBOX_VM& vm, vector<string>& copy_to_shared) {
|
|||
else if (xp.parse_string("os_name", vm.os_name)) continue;
|
||||
else if (xp.parse_string("memory_size_mb", vm.memory_size_mb)) continue;
|
||||
else if (xp.parse_double("job_duration", vm.job_duration)) continue;
|
||||
else if (xp.parse_double("minimum_checkpoint_interval", vm.minimum_checkpoint_interval)) continue;
|
||||
else if (xp.parse_string("fraction_done_filename", vm.fraction_done_filename)) continue;
|
||||
else if (xp.parse_bool("enable_cern_dataformat", vm.enable_cern_dataformat)) continue;
|
||||
else if (xp.parse_bool("enable_network", vm.enable_network)) continue;
|
||||
|
@ -178,25 +179,28 @@ int parse_job_file(VBOX_VM& vm, vector<string>& copy_to_shared) {
|
|||
return ERR_XML_PARSE;
|
||||
}
|
||||
|
||||
void write_checkpoint(double cpu, VBOX_VM& vm) {
|
||||
void write_checkpoint(double elapsed, double cpu, VBOX_VM& vm) {
|
||||
FILE* f = fopen(CHECKPOINT_FILENAME, "w");
|
||||
if (!f) return;
|
||||
fprintf(f, "%f %d %d\n", cpu, vm.pf_host_port, vm.rd_host_port);
|
||||
fprintf(f, "%f %f %d %d\n", elapsed, cpu, vm.pf_host_port, vm.rd_host_port);
|
||||
fclose(f);
|
||||
}
|
||||
|
||||
void read_checkpoint(double& cpu, VBOX_VM& vm) {
|
||||
void read_checkpoint(double& elapsed, double& cpu, VBOX_VM& vm) {
|
||||
double c;
|
||||
double e;
|
||||
int pf_host;
|
||||
int rd_host;
|
||||
elapsed = 0.0;
|
||||
cpu = 0.0;
|
||||
vm.pf_host_port = 0;
|
||||
vm.rd_host_port = 0;
|
||||
FILE* f = fopen(CHECKPOINT_FILENAME, "r");
|
||||
if (!f) return;
|
||||
int n = fscanf(f, "%lf %d %d", &c, &pf_host, &rd_host);
|
||||
int n = fscanf(f, "%lf %lf %d %d", &e, &c, &pf_host, &rd_host);
|
||||
fclose(f);
|
||||
if (n != 3) return;
|
||||
if (n != 4) return;
|
||||
elapsed = e;
|
||||
cpu = c;
|
||||
vm.pf_host_port = pf_host;
|
||||
vm.rd_host_port = rd_host;
|
||||
|
@ -370,6 +374,7 @@ void set_remote_desktop_info(APP_INIT_DATA& /* aid */, VBOX_VM& vm) {
|
|||
|
||||
int main(int argc, char** argv) {
|
||||
int retval;
|
||||
int loop_iteraction = 0;
|
||||
BOINC_OPTIONS boinc_options;
|
||||
VBOX_VM vm;
|
||||
APP_INIT_DATA aid;
|
||||
|
@ -377,8 +382,9 @@ int main(int argc, char** argv) {
|
|||
double elapsed_time = 0;
|
||||
double trickle_period = 0;
|
||||
double fraction_done = 0;
|
||||
double checkpoint_cpu_time = 0;
|
||||
double current_cpu_time = 0;
|
||||
double starting_cpu_time = 0;
|
||||
double last_checkpoint_time = 0;
|
||||
double last_status_report_time = 0;
|
||||
double last_trickle_report_time = 0;
|
||||
double stopwatch_starttime = 0;
|
||||
|
@ -607,6 +613,18 @@ int main(int argc, char** argv) {
|
|||
boinc_finish(retval);
|
||||
}
|
||||
|
||||
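// Log the configured minimum checkpoint interval. NOTE(review): the block below also sets vm.headless = false, which looks copied from the headless-mode check -- confirm it is intended.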
|
||||
//
|
||||
if (vm.minimum_checkpoint_interval) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Detected: minimum checkpoint interval (%f seconds)\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
vm.minimum_checkpoint_interval
|
||||
);
|
||||
vm.headless = false;
|
||||
}
|
||||
|
||||
// Validate whatever configuration options we can
|
||||
//
|
||||
if (vm.enable_shared_directory) {
|
||||
|
@ -701,8 +719,7 @@ int main(int argc, char** argv) {
|
|||
|
||||
// Restore from checkpoint
|
||||
//
|
||||
read_checkpoint(checkpoint_cpu_time, vm);
|
||||
elapsed_time = checkpoint_cpu_time;
|
||||
read_checkpoint(elapsed_time, current_cpu_time, vm);
|
||||
|
||||
// Should we even try to start things up?
|
||||
//
|
||||
|
@ -710,7 +727,7 @@ int main(int argc, char** argv) {
|
|||
return EXIT_TIME_LIMIT_EXCEEDED;
|
||||
}
|
||||
|
||||
retval = vm.run((elapsed_time > 0));
|
||||
retval = vm.run((current_cpu_time > 0));
|
||||
if (retval) {
|
||||
// All 'failure to start' errors are unrecoverable by default
|
||||
bool unrecoverable_error = true;
|
||||
|
@ -788,7 +805,7 @@ int main(int argc, char** argv) {
|
|||
if (!skip_cleanup) {
|
||||
vm.cleanup();
|
||||
}
|
||||
write_checkpoint(elapsed_time, vm);
|
||||
write_checkpoint(elapsed_time, current_cpu_time, vm);
|
||||
|
||||
if (error_reason.size()) {
|
||||
fprintf(
|
||||
|
@ -811,8 +828,8 @@ int main(int argc, char** argv) {
|
|||
//
|
||||
if (vm.vm_pid) {
|
||||
retval = boinc_report_app_status_aux(
|
||||
elapsed_time,
|
||||
checkpoint_cpu_time,
|
||||
current_cpu_time,
|
||||
last_checkpoint_time,
|
||||
fraction_done,
|
||||
vm.vm_pid,
|
||||
bytes_sent,
|
||||
|
@ -849,8 +866,8 @@ int main(int argc, char** argv) {
|
|||
buf
|
||||
);
|
||||
retval = boinc_report_app_status_aux(
|
||||
elapsed_time,
|
||||
checkpoint_cpu_time,
|
||||
current_cpu_time,
|
||||
last_checkpoint_time,
|
||||
fraction_done,
|
||||
vm.vm_pid,
|
||||
bytes_sent,
|
||||
|
@ -897,7 +914,7 @@ int main(int argc, char** argv) {
|
|||
set_floppy_image(aid, vm);
|
||||
set_port_forwarding_info(aid, vm);
|
||||
set_remote_desktop_info(aid, vm);
|
||||
write_checkpoint(elapsed_time, vm);
|
||||
write_checkpoint(elapsed_time, current_cpu_time, vm);
|
||||
|
||||
// Force throttling on our first pass through the loop
|
||||
boinc_status.reread_init_data_file = true;
|
||||
|
@ -905,6 +922,7 @@ int main(int argc, char** argv) {
|
|||
while (1) {
|
||||
// Begin stopwatch timer
|
||||
stopwatch_starttime = dtime();
|
||||
loop_iteraction += 1;
|
||||
|
||||
// Discover the VM's current state
|
||||
vm.poll();
|
||||
|
@ -1003,8 +1021,8 @@ int main(int argc, char** argv) {
|
|||
|
||||
// Basic bookkeeping
|
||||
//
|
||||
if ((int)elapsed_time % 10) {
|
||||
current_cpu_time = vm.get_vm_cpu_time();
|
||||
if (loop_iteraction % 10) {
|
||||
current_cpu_time = starting_cpu_time + vm.get_vm_cpu_time();
|
||||
}
|
||||
if (vm.job_duration) {
|
||||
fraction_done = elapsed_time / vm.job_duration;
|
||||
|
@ -1016,61 +1034,63 @@ int main(int argc, char** argv) {
|
|||
}
|
||||
boinc_report_app_status(
|
||||
current_cpu_time,
|
||||
checkpoint_cpu_time,
|
||||
last_checkpoint_time,
|
||||
fraction_done
|
||||
);
|
||||
|
||||
// Dump a status report at regular intervals
|
||||
//
|
||||
if ((elapsed_time - last_status_report_time) >= 6000.0) {
|
||||
last_status_report_time = elapsed_time;
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: CPU Time: '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
current_cpu_time
|
||||
);
|
||||
if (vm.job_duration) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: Job Duration: '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
vm.job_duration
|
||||
);
|
||||
}
|
||||
if (elapsed_time) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: Elapsed Time: '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
elapsed_time
|
||||
);
|
||||
}
|
||||
if (aid.global_prefs.daily_xfer_limit_mb) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: Network Bytes Sent (Total): '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
bytes_sent
|
||||
);
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: Network Bytes Received (Total): '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
bytes_received
|
||||
);
|
||||
}
|
||||
|
||||
vm.dumphypervisorstatusreports();
|
||||
}
|
||||
|
||||
if (boinc_time_to_checkpoint()) {
|
||||
// Only perform a VM checkpoint every ten minutes or so.
|
||||
//
|
||||
if (current_cpu_time >= checkpoint_cpu_time + random_checkpoint_factor + 600.0) {
|
||||
if (current_cpu_time >= last_checkpoint_time + vm.minimum_checkpoint_interval + random_checkpoint_factor) {
|
||||
// Basic interleave factor is only needed once.
|
||||
if (random_checkpoint_factor > 0) {
|
||||
random_checkpoint_factor = 0.0;
|
||||
}
|
||||
|
||||
if ((current_cpu_time - last_status_report_time) >= 6000.0) {
|
||||
last_status_report_time = current_cpu_time;
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: CPU Time: '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
current_cpu_time
|
||||
);
|
||||
if (vm.job_duration) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: Job Duration: '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
vm.job_duration
|
||||
);
|
||||
}
|
||||
if (elapsed_time) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: Elapsed Time: '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
elapsed_time
|
||||
);
|
||||
}
|
||||
if (aid.global_prefs.daily_xfer_limit_mb) {
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: Network Bytes Sent (Total): '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
bytes_sent
|
||||
);
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: Network Bytes Received (Total): '%f'\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf)),
|
||||
bytes_received
|
||||
);
|
||||
}
|
||||
|
||||
vm.dumphypervisorstatusreports();
|
||||
}
|
||||
|
||||
// Checkpoint
|
||||
retval = vm.createsnapshot(current_cpu_time);
|
||||
if (retval) {
|
||||
|
@ -1088,8 +1108,8 @@ int main(int argc, char** argv) {
|
|||
} else {
|
||||
// tell BOINC we've successfully created a checkpoint.
|
||||
//
|
||||
checkpoint_cpu_time = current_cpu_time;
|
||||
write_checkpoint(checkpoint_cpu_time, vm);
|
||||
last_checkpoint_time = current_cpu_time;
|
||||
write_checkpoint(elapsed_time, current_cpu_time, vm);
|
||||
boinc_checkpoint_completed();
|
||||
}
|
||||
}
|
||||
|
@ -1097,12 +1117,12 @@ int main(int argc, char** argv) {
|
|||
|
||||
if (trickle_period) {
|
||||
if ((elapsed_time - last_trickle_report_time) >= trickle_period) {
|
||||
last_trickle_report_time = elapsed_time;
|
||||
fprintf(
|
||||
stderr,
|
||||
"%s Status Report: Trickle-Up Event.\n",
|
||||
vboxwrapper_msg_prefix(buf, sizeof(buf))
|
||||
);
|
||||
last_trickle_report_time = elapsed_time;
|
||||
sprintf(buf,
|
||||
"<cpu_time>%f</cpu_time>", last_trickle_report_time
|
||||
);
|
||||
|
@ -1200,7 +1220,7 @@ int main(int argc, char** argv) {
|
|||
if (report_net_usage) {
|
||||
retval = boinc_report_app_status_aux(
|
||||
elapsed_time,
|
||||
checkpoint_cpu_time,
|
||||
last_checkpoint_time,
|
||||
fraction_done,
|
||||
vm.vm_pid,
|
||||
bytes_sent,
|
||||
|
|
Loading…
Reference in New Issue