Merge pull request #3181 from BOINC/dpa_vboxwrapper

vboxwrapper: if control op fails, exit for 5 min instead of 1 day
This commit is contained in:
Adam Radocz 2019-06-20 20:25:04 +02:00 committed by GitHub
commit ea00ec75e3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 130 additions and 100 deletions

View File

@ -267,44 +267,47 @@ void VBOX_BASE::dump_hypervisor_logs(bool include_error_logs) {
if (include_error_logs) {
dump_screenshot();
fprintf(
stderr,
"\n"
" Hypervisor System Log:\n\n"
"%s\n"
" VM Execution Log:\n\n"
"%s\n"
" VM Startup Log:\n\n"
"%s\n"
" VM Trace Log:\n\n"
"%s",
local_system_log.c_str(),
local_vm_log.c_str(),
local_startup_log.c_str(),
local_trace_log.c_str()
);
stderr,
"\n"
" Hypervisor System Log:\n\n"
"%s\n"
" VM Execution Log:\n\n"
"%s\n"
" VM Startup Log:\n\n"
"%s\n"
" VM Trace Log:\n\n"
"%s",
local_system_log.c_str(),
local_vm_log.c_str(),
local_startup_log.c_str(),
local_trace_log.c_str()
);
}
if (vm_exit_code) {
fprintf(
stderr,
"\n"
" VM Exit Code: %d (0x%x)\n\n",
(unsigned int)vm_exit_code,
(unsigned int)vm_exit_code
);
fprintf(stderr,
"\n"
" VM Exit Code: %d (0x%x)\n\n",
(unsigned int)vm_exit_code,
(unsigned int)vm_exit_code
);
}
}
void VBOX_BASE::report_clean(bool unrecoverable_error, bool skip_cleanup, bool do_dump_hypervisor_logs,
int retval, std::string error_reason,
int vm_pid, int temp_delay, std::string temp_reason,
double current_cpu_time,
double last_checkpoint_cpu_time,
double fraction_done,
double bytes_sent,
double bytes_received) {
void VBOX_BASE::report_clean(
bool unrecoverable_error,
bool skip_cleanup,
bool do_dump_hypervisor_logs,
int retval,
string error_reason,
int temp_delay,
string temp_reason,
double total_cpu_time,
double last_checkpoint_cpu_time,
double fraction_done,
double bytes_sent,
double bytes_received
) {
if (unrecoverable_error) {
// Attempt to cleanup the VM and exit.
@ -321,21 +324,20 @@ void VBOX_BASE::report_clean(bool unrecoverable_error, bool skip_cleanup, bool d
}
boinc_finish(retval);
}
else {
// if the VM is already running notify BOINC about the process ID so it can
// clean up the environment. We should be safe to run after that.
} else {
// if the VM is already running notify BOINC about the process ID
// so it can clean up the environment.
// We should be safe to run after that.
//
if (vm_pid) {
retval = boinc_report_app_status_aux(
current_cpu_time,
last_checkpoint_cpu_time,
fraction_done,
vm_pid,
bytes_sent,
bytes_received
);
total_cpu_time,
last_checkpoint_cpu_time,
fraction_done,
vm_pid,
bytes_sent,
bytes_received
);
}
// Give the BOINC API time to report the pid to BOINC.
@ -404,7 +406,7 @@ string VBOX_BASE::get_error(int num){
"VM Hypervisor failed to enter an online state in a timely fashion."
};
std::vector<std::string> v(args, args + 13);
std::vector<string> v(args, args + 13);
return v[num];
}
@ -645,7 +647,7 @@ int VBOX_BASE::get_scratch_directory(string& dir) {
APP_INIT_DATA aid;
boinc_get_init_data_p(&aid);
dir = aid.project_dir + std::string("/scratch");
dir = aid.project_dir + string("/scratch");
if (!dir.empty()) {
return 1;
@ -816,7 +818,7 @@ int VBOX_BASE::get_startup_log(string& log, bool tail_only, unsigned int buffer_
return retval;
}
int VBOX_BASE::read_floppy(std::string& data) {
int VBOX_BASE::read_floppy(string& data) {
if (enable_floppyio && pFloppy) {
data = pFloppy->receive();
return 0;
@ -824,7 +826,7 @@ int VBOX_BASE::read_floppy(std::string& data) {
return 1;
}
int VBOX_BASE::write_floppy(std::string& data) {
int VBOX_BASE::write_floppy(string& data) {
if (enable_floppyio && pFloppy) {
pFloppy->send(data);
return 0;
@ -832,7 +834,7 @@ int VBOX_BASE::write_floppy(std::string& data) {
return 1;
}
void VBOX_BASE::sanitize_format(std::string& output) {
void VBOX_BASE::sanitize_format(string& output) {
// Check for special characters used by printf and render them harmless
string::iterator iter = output.begin();
while (iter != output.end()) {
@ -850,7 +852,7 @@ void VBOX_BASE::sanitize_format(std::string& output) {
}
#ifdef _WIN32
void VBOX_BASE::sanitize_output(std::string& output) {
void VBOX_BASE::sanitize_output(string& output) {
// Remove \r from the log spew
string::iterator iter = output.begin();
while (iter != output.end()) {
@ -862,7 +864,7 @@ void VBOX_BASE::sanitize_output(std::string& output) {
}
}
#else
void VBOX_BASE::sanitize_output(std::string& ) {}
void VBOX_BASE::sanitize_output(string& ) {}
#endif
// Launch VboxSVC.exe before going any further. If we don't, it'll be launched by
@ -969,7 +971,7 @@ int VBOX_BASE::launch_vboxvm() {
char cmdline[1024];
char* argv[5];
int argc;
std::string output;
string output;
int retval = ERR_EXEC;
// Construct the command line parameters
@ -1429,7 +1431,7 @@ CLEANUP:
return retval;
}
void VBOX_BASE::vbm_replay(std::string& command) {
void VBOX_BASE::vbm_replay(string& command) {
FILE* f = fopen(REPLAYLOG_FILENAME, "a");
if (f) {
fprintf(f, "%s\n", command.c_str());
@ -1437,7 +1439,7 @@ void VBOX_BASE::vbm_replay(std::string& command) {
}
}
void VBOX_BASE::vbm_trace(std::string& command, std::string& output, int retval) {
void VBOX_BASE::vbm_trace(string& command, string& output, int retval) {
char buf[256];
int pid;
struct tm tm;

View File

@ -270,14 +270,20 @@ class VBOX_BASE : public VBOX_JOB {
virtual std::string get_error(int choice);
virtual void report_clean(bool unrecoverable_error, bool skip_cleanup, bool do_dump_hypervisor_logs,
int retval, std::string error_reason,
int vm_pid, int temp_delay, std::string temp_reason,
double current_cpu_time,
double last_checkpoint_cpu_time,
double fraction_done,
double bytes_sent,
double bytes_received);
virtual void report_clean(
bool unrecoverable_error,
bool skip_cleanup,
bool do_dump_hypervisor_logs,
int retval,
std::string error_reason,
int temp_delay,
std::string temp_reason,
double current_cpu_time,
double last_checkpoint_cpu_time,
double fraction_done,
double bytes_sent,
double bytes_received
);
};
class VBOX_VM : public VBOX_BASE {

View File

@ -48,6 +48,10 @@
// Daniel Lombraña González <teleyinex AT gmail DOT com>
// Marius Millea <mariusmillea AT gmail DOT com>
#define RESTART_DELAY 300
// if a VM operation (suspend, resume, snapshot) fails,
// exit and restart after this delay (in seconds).
#ifdef _WIN32
#include "boinc_win.h"
#include "win_util.h"
@ -392,7 +396,8 @@ int main(int argc, char** argv) {
double elapsed_time = 0;
double fraction_done = 0;
double trickle_period = 0;
double current_cpu_time = 0;
double total_cpu_time = 0;
// CPU time of the job, including that of previous episodes
double starting_cpu_time = 0;
double last_heartbeat_elapsed_time = 0;
double last_checkpoint_cpu_time = 0;
@ -768,7 +773,7 @@ int main(int argc, char** argv) {
pVM->rd_host_port = checkpoint.remote_desktop_port;
elapsed_time = checkpoint.elapsed_time;
starting_cpu_time = checkpoint.cpu_time;
current_cpu_time = starting_cpu_time;
total_cpu_time = starting_cpu_time;
last_checkpoint_elapsed_time = elapsed_time;
last_heartbeat_elapsed_time = elapsed_time;
last_checkpoint_cpu_time = starting_cpu_time;
@ -798,7 +803,7 @@ int main(int argc, char** argv) {
return EXIT_TIME_LIMIT_EXCEEDED;
}
retval = pVM->run(current_cpu_time > 0);
retval = pVM->run(total_cpu_time > 0);
if (retval) {
// All 'failure to start' errors are unrecoverable by default
vboxlog_msg("ERROR: VM failed to start");
@ -840,28 +845,29 @@ int main(int argc, char** argv) {
if (unrecoverable_error) {
if (pVM->online) pVM->capture_screenshot();
checkpoint.update(elapsed_time, current_cpu_time);
checkpoint.update(elapsed_time, total_cpu_time);
}
pVM->report_clean(unrecoverable_error, skip_cleanup, do_dump_hypervisor_logs,
retval, error_reason, pVM->vm_pid, temp_delay, temp_reason,
current_cpu_time, last_checkpoint_cpu_time, fraction_done,
bytes_sent, bytes_received);
pVM->report_clean(
unrecoverable_error, skip_cleanup, do_dump_hypervisor_logs,
retval, error_reason, temp_delay, temp_reason,
total_cpu_time, last_checkpoint_cpu_time, fraction_done,
bytes_sent, bytes_received
);
}
// Report the VM pid to BOINC so BOINC can deal with it when needed.
//
vboxlog_msg("Reporting VM Process ID to BOINC.");
retval = boinc_report_app_status_aux(
current_cpu_time,
last_checkpoint_cpu_time,
fraction_done,
pVM->vm_pid,
bytes_sent,
bytes_received
);
total_cpu_time,
last_checkpoint_cpu_time,
fraction_done,
pVM->vm_pid,
bytes_sent,
bytes_received
);
// Wait for up to 5 minutes for the VM to switch states.
// A system under load can take a while.
@ -912,14 +918,16 @@ int main(int argc, char** argv) {
temp_delay = 86400;
}
if (unrecoverable_error) checkpoint.update(elapsed_time, current_cpu_time);
pVM->report_clean(unrecoverable_error, skip_cleanup, do_dump_hypervisor_logs,
retval, error_reason, pVM->vm_pid, temp_delay, temp_reason,
current_cpu_time, last_checkpoint_cpu_time, fraction_done,
bytes_sent, bytes_received);
if (unrecoverable_error) {
checkpoint.update(elapsed_time, total_cpu_time);
}
pVM->report_clean(
unrecoverable_error, skip_cleanup, do_dump_hypervisor_logs,
retval, error_reason, temp_delay, temp_reason,
total_cpu_time, last_checkpoint_cpu_time, fraction_done,
bytes_sent, bytes_received
);
}
set_floppy_image(aid, *pVM);
@ -927,7 +935,7 @@ int main(int argc, char** argv) {
report_remote_desktop_info(*pVM);
checkpoint.webapi_port = pVM->pf_host_port;
checkpoint.remote_desktop_port = pVM->rd_host_port;
checkpoint.update(elapsed_time, current_cpu_time);
checkpoint.update(elapsed_time, total_cpu_time);
// Force throttling on our first pass through the loop
boinc_status.reread_init_data_file = true;
@ -992,7 +1000,9 @@ int main(int argc, char** argv) {
// Write updates for the graphics application's use
if (pVM->enable_graphics_support) {
boinc_write_graphics_status(current_cpu_time, elapsed_time, fraction_done);
boinc_write_graphics_status(
total_cpu_time, elapsed_time, fraction_done
);
}
if (boinc_status.no_heartbeat || boinc_status.quit_request) {
@ -1000,7 +1010,7 @@ int main(int argc, char** argv) {
if (pVM->enable_vm_savestate_usage) {
retval = pVM->create_snapshot(elapsed_time);
if (!retval) {
checkpoint.update(elapsed_time, current_cpu_time);
checkpoint.update(elapsed_time, total_cpu_time);
boinc_checkpoint_completed();
}
pVM->stop();
@ -1084,7 +1094,7 @@ int main(int argc, char** argv) {
pVM->reset_vm_process_priority();
retval = pVM->create_snapshot(elapsed_time);
if (!retval) {
checkpoint.update(elapsed_time, current_cpu_time);
checkpoint.update(elapsed_time, total_cpu_time);
boinc_checkpoint_completed();
}
pVM->poweroff();
@ -1136,7 +1146,11 @@ int main(int argc, char** argv) {
if ((unsigned)retval == VBOX_E_INVALID_OBJECT_STATE) {
vboxlog_msg("ERROR: VM task failed to pause, rescheduling task for a later time.");
pVM->poweroff();
boinc_temporary_exit(86400, "VM job unmanageable, restarting later.");
sprintf(buf,
"VM suspend failed. Will exit and restart in %d sec.",
RESTART_DELAY
);
boinc_temporary_exit(RESTART_DELAY, buf);
}
}
} else {
@ -1145,14 +1159,18 @@ int main(int argc, char** argv) {
if ((unsigned)retval == VBOX_E_INVALID_OBJECT_STATE) {
vboxlog_msg("ERROR: VM task failed to resume, rescheduling task for a later time.");
pVM->poweroff();
boinc_temporary_exit(86400, "VM job unmanageable, restarting later.");
sprintf(buf,
"VM resume failed. Will exit and restart in %d sec.",
RESTART_DELAY
);
boinc_temporary_exit(RESTART_DELAY, buf);
}
}
// stuff to do every 10 secs (everything else is 1/sec)
//
if ((loop_iteration % 10) == 0) {
current_cpu_time = starting_cpu_time + pVM->get_vm_cpu_time();
total_cpu_time = starting_cpu_time + pVM->get_vm_cpu_time();
check_trickle_triggers(*pVM);
check_intermediate_uploads(*pVM);
}
@ -1172,10 +1190,10 @@ int main(int argc, char** argv) {
fraction_done = 1.0;
}
boinc_report_app_status(
current_cpu_time,
last_checkpoint_cpu_time,
fraction_done
);
total_cpu_time,
last_checkpoint_cpu_time,
fraction_done
);
// write status report to stderr at regular intervals
//
@ -1187,7 +1205,7 @@ int main(int argc, char** argv) {
if (elapsed_time) {
vboxlog_msg("Status Report: Elapsed Time: '%f'", elapsed_time);
}
vboxlog_msg("Status Report: CPU Time: '%f'", current_cpu_time);
vboxlog_msg("Status Report: CPU Time: '%f'", total_cpu_time);
if (aid.global_prefs.daily_xfer_limit_mb) {
vboxlog_msg("Status Report: Network Bytes Sent (Total): '%f'", bytes_sent);
vboxlog_msg("Status Report: Network Bytes Received (Total): '%f'", bytes_received);
@ -1220,13 +1238,17 @@ int main(int argc, char** argv) {
//
vboxlog_msg("ERROR: Checkpoint maintenance failed, rescheduling task for a later time. (%d)", retval);
pVM->poweroff();
boinc_temporary_exit(86400, "VM job unmanageable, restarting later.");
sprintf(buf,
"VM snapshot failed. Will exit and restart in %d sec.",
RESTART_DELAY
);
boinc_temporary_exit(RESTART_DELAY, buf);
} else {
// tell BOINC we've successfully created a checkpoint.
//
checkpoint.update(elapsed_time, current_cpu_time);
checkpoint.update(elapsed_time, total_cpu_time);
last_checkpoint_elapsed_time = elapsed_time;
last_checkpoint_cpu_time = current_cpu_time;
last_checkpoint_cpu_time = total_cpu_time;
boinc_checkpoint_completed();
}
}