- VBOX: Per tester feedback, at an hourly status report to track

network usage.
    - VBOX: For errors where an error code is specified in the output,
        use that value when returning from vbm_open().  Make it easier
        to track failure rates by problem type in the ops pages.
        (Windows Only)
    - VBOX: Try making the process of extracting the hypervisor system
        log more robust, make a temp copy of it in the slot directory.
        Avoid exclusive file lock issues since VboxSvc will still be
        logging to it.
    - VBOX: Change the session lock text to make it clearer that the
        use of other VirtualBox management tools can cause a problem.

        
    samples/vboxwrapper/
        vbox.cpp, .h
        vboxwrapper.cpp

svn path=/trunk/boinc/; revision=24990
This commit is contained in:
Rom Walton 2012-01-03 19:06:39 +00:00
parent 21aed83249
commit d2784ffc4b
4 changed files with 209 additions and 80 deletions

View File

@ -124,4 +124,23 @@ Rom 3 Jan 2012
They didn't wait for the next OpenCL spec change.
client/
coproc_detect.cpp
coproc_detect.cpp
Rom 3 Jan 2012
- VBOX: Per tester feedback, at an hourly status report to track
network usage.
- VBOX: For errors where an error code is specified in the output,
use that value when returning from vbm_open(). Make it easier
to track failure rates by problem type in the ops pages.
(Windows Only)
- VBOX: Try making the process of extracting the hypervisor system
log more robust, make a temp copy of it in the slot directory.
Avoid exclusive file lock issues since VboxSvc will still be
logging to it.
- VBOX: Change the session lock text to make it clearer that the
use of other VirtualBox management tools can cause a problem.
samples/vboxwrapper/
vbox.cpp, .h
vboxwrapper.cpp

View File

@ -63,7 +63,6 @@ VBOX_VM::VBOX_VM() {
job_duration = 0.0;
suspended = false;
network_suspended = false;
startup_completed = false;
online = false;
crashed = false;
enable_cern_dataformat = false;
@ -119,10 +118,7 @@ int VBOX_VM::run() {
boinc_sleep(1.0);
} while (timeout <= dtime());
// From here on out we don't need to report the system log when a vboxmanage
// command fails.
startup_completed = true;
if (!online) return ERR_EXEC;
return 0;
}
@ -163,17 +159,19 @@ int VBOX_VM::vbm_popen(string& arguments, string& output, const char* item, bool
if ((output.find("VBOX_E_INVALID_OBJECT_STATE") != string::npos) &&
(output.find("already locked") != string::npos))
{
if (retry_notes.find("already locked") == string::npos) {
retry_notes += "Virtual Machine session already locked.\n";
if (retry_notes.find("Another VirtualBox management") == string::npos) {
retry_notes += "Another VirtualBox management application has locked the session for\n";
retry_notes += "this virtual machine. BOINC cannot properly monitor this virtual machine\n";
retry_notes += "and so this job will be aborted.\n\n";
}
}
// Timeout?
if (retry_count >= 6) break;
// Retry?
if (!retry_failures) break;
// Timeout?
if (retry_count >= 6) break;
retry_count++;
boinc_sleep(5.0);
}
@ -196,12 +194,6 @@ int VBOX_VM::vbm_popen(string& arguments, string& output, const char* item, bool
output += "\nNotes:\n\n" + retry_notes;
}
if (!startup_completed) {
string system_log;
get_system_log(system_log);
output += "\nHypervisor System Log:\n\n" + system_log;
}
fprintf(
stderr,
"%s Error in %s for VM: %d\nArguments:\n%s\nOutput:\n%s\n",
@ -240,6 +232,9 @@ int VBOX_VM::vbm_popen_raw(string& arguments, string& output) {
DWORD dwCount = 0;
unsigned long ulExitCode = 0;
unsigned long ulExitTimeout = 0;
size_t errcode_start;
size_t errcode_end;
string errcode;
memset(&si, 0, sizeof(si));
memset(&pi, 0, sizeof(pi));
@ -335,7 +330,17 @@ CLEANUP:
if (hWritePipe) CloseHandle(hWritePipe);
if ((ulExitCode != 0) || (!pi.hProcess)) {
retval = ERR_FOPEN;
// Determine the real error code by parsing the output
errcode_start = output.find("(0x");
errcode_start += 1;
errcode_end = output.find(")", errcode_start);
errcode = output.substr(errcode_start, errcode_end - errcode_start);
retval = atol(errcode.c_str());
// If something couldn't be found, just return ERR_FOPEN
if (!retval) retval = ERR_FOPEN;
}
#else
@ -1292,7 +1297,10 @@ int VBOX_VM::get_network_bytes_sent(double& sent) {
int VBOX_VM::get_system_log(string& log) {
string virtualbox_user_home;
string virtualbox_system_log;
string slot_directory;
string virtualbox_system_log_src;
string virtualbox_system_log_dst;
char buf[256];
int retval = 0;
// Where is VirtualBox storing its configuration files?
@ -1309,12 +1317,47 @@ int VBOX_VM::get_system_log(string& log) {
virtualbox_user_home += "/.VirtualBox";
}
// Where should we copy temp files to?
get_slot_directory(slot_directory);
// Locate and read log file
virtualbox_system_log = virtualbox_user_home + "/VBoxSVC.log";
if (boinc_file_exists(virtualbox_system_log.c_str())) {
virtualbox_system_log_src = virtualbox_user_home + "/VBoxSVC.log";
virtualbox_system_log_dst = slot_directory + "/VBoxSVC.log";
if (boinc_file_exists(virtualbox_system_log_src.c_str())) {
// Skip having to deal with various forms of file locks by just making a temp
// copy of the log file.
boinc_copy(virtualbox_system_log_src.c_str(), virtualbox_system_log_dst.c_str());
// Keep only the last 16k if it is larger than that.
retval = read_file_string(virtualbox_system_log.c_str(), log, 16384, true);
read_file_string(virtualbox_system_log_dst.c_str(), log, 16384, true);
#ifdef _WIN32
// Remove \r from the log spew
for (string::iterator iter = log.begin(); iter != log.end(); ++iter) {
if (*iter == '\r') {
log.erase(iter);
}
}
#endif
if (log.size() >= 16384) {
// Look for the next whole line of text.
for (string::iterator iter = log.begin(); iter != log.end(); ++iter) {
if (*iter == '\n') {
log.erase(iter);
break;
}
log.erase(iter);
}
}
} else {
fprintf(
stderr,
"%s Could not find the system log at '%s'.\n",
boinc_msg_prefix(buf, sizeof(buf)),
virtualbox_system_log_src.c_str()
);
retval = ERR_NOT_FOUND;
}
@ -1336,6 +1379,26 @@ int VBOX_VM::get_vm_log(string& log) {
size_t size = output.size();
if (size > 16384) {
log = output.substr(size - 16384, size);
#ifdef _WIN32
// Remove \r from the log spew
for (string::iterator iter = log.begin(); iter != log.end(); ++iter) {
if (*iter == '\r') {
log.erase(iter);
}
}
#endif
if (log.size() >= 16384) {
// Look for the next whole line of text.
for (string::iterator iter = log.begin(); iter != log.end(); ++iter) {
if (*iter == '\n') {
log.erase(iter);
break;
}
log.erase(iter);
}
}
} else {
log = output;
}

View File

@ -51,8 +51,6 @@ struct VBOX_VM {
bool suspended;
// is network access temporarily suspended?
bool network_suspended;
// has the VM successfully completed the startup code
bool startup_completed;
// is VM even online?
bool online;
// Has the VM crashed?

View File

@ -119,17 +119,15 @@ int parse_job_file(VBOX_VM& vm) {
}
void write_checkpoint(double cpu) {
boinc_begin_critical_section();
FILE* f = fopen(CHECKPOINT_FILENAME, "w");
if (!f) return;
fprintf(f, "%f\n", cpu);
fclose(f);
boinc_checkpoint_completed();
}
void read_checkpoint(double& cpu) {
double c;
cpu = 0;
cpu = 0.0;
FILE* f = fopen(CHECKPOINT_FILENAME, "r");
if (!f) return;
int n = fscanf(f, "%lf", &c);
@ -204,18 +202,55 @@ void set_floppy_image(APP_INIT_DATA& aid, VBOX_VM& vm) {
}
}
// set port forwarding information if needed
//
void set_port_forwarding_info(APP_INIT_DATA& /* aid */, VBOX_VM& vm) {
char buf[256];
if (vm.pf_guest_port && vm.pf_host_port) {
fprintf(
stderr,
"%s port forwarding enabled on port '%d'.\n",
boinc_msg_prefix(buf, sizeof(buf)), vm.pf_host_port
);
// Write info to disk
//
MIOFILE mf;
FILE* f = boinc_fopen(PORTFORWARD_FILENAME, "w");
mf.init_file(f);
mf.printf(
"<port_forwarding>\n"
" <rule>\n"
" <host_port>%d</host_port>\n"
" <guest_port>%d</guest_port>\n"
" </rule>\n"
"</port_forwarding>\n",
vm.pf_host_port,
vm.pf_guest_port
);
fclose(f);
sprintf(buf, "http://localhost:%d", vm.pf_host_port);
boinc_web_graphics_url(buf);
}
}
int main(int argc, char** argv) {
int retval;
BOINC_OPTIONS boinc_options;
VBOX_VM vm;
APP_INIT_DATA aid;
double elapsed_time = 0.0;
double checkpoint_cpu_time = 0.0;
double trickle_period = 0.0;
double trickle_cpu_time = 0.0;
double fraction_done = 0.0;
double bytes_sent = 0.0;
double bytes_received = 0.0;
double checkpoint_cpu_time = 0.0;
double last_status_report_time = 0.0;
bool report_vm_pid = false;
double bytes_sent=0, bytes_received=0;
bool report_net_usage = false;
int vm_pid=0;
int vm_max_cpus=0;
@ -328,47 +363,32 @@ int main(int argc, char** argv) {
elapsed_time = aid.starting_elapsed_time;
read_checkpoint(checkpoint_cpu_time);
// Should we even try to start things up?
if (vm.job_duration && (elapsed_time > vm.job_duration)) {
return ERR_RSC_LIMIT_EXCEEDED;
}
retval = vm.run();
if (retval) {
vm.cleanup();
write_checkpoint(checkpoint_cpu_time);
write_checkpoint(elapsed_time);
vm.get_system_log(system_log);
fprintf(
stderr,
"%s Hypervisor System Log:\n\n"
"%s\n",
boinc_msg_prefix(buf, sizeof(buf)),
system_log.c_str()
);
boinc_finish(retval);
}
set_floppy_image(aid, vm);
set_port_forwarding_info(aid, vm);
set_throttles(aid, vm);
// write port-forwarding info if relevant
//
if (vm.pf_guest_port && vm.pf_host_port) {
fprintf(
stderr,
"%s port forwarding enabled on port '%d'.\n",
boinc_msg_prefix(buf, sizeof(buf)), vm.pf_host_port
);
// Write info to disk
//
MIOFILE mf;
FILE* f = boinc_fopen(PORTFORWARD_FILENAME, "w");
mf.init_file(f);
mf.printf(
"<port_forwarding>\n"
" <rule>\n"
" <host_port>%d</host_port>\n"
" <guest_port>%d</guest_port>\n"
" </rule>\n"
"</port_forwarding>\n",
vm.pf_host_port,
vm.pf_guest_port
);
fclose(f);
sprintf(buf, "http://localhost:%d", vm.pf_host_port);
boinc_web_graphics_url(buf);
}
while (1) {
// Discover the VM's current state
vm.poll();
@ -380,7 +400,7 @@ int main(int argc, char** argv) {
}
if (boinc_status.abort_request) {
vm.cleanup();
write_checkpoint(checkpoint_cpu_time);
write_checkpoint(elapsed_time);
boinc_finish(EXIT_ABORTED_BY_CLIENT);
}
if (!vm.online) {
@ -389,7 +409,7 @@ int main(int argc, char** argv) {
vm.get_vm_log(vm_log);
}
vm.cleanup();
write_checkpoint(checkpoint_cpu_time);
write_checkpoint(elapsed_time);
if (vm.crashed || (elapsed_time < vm.job_duration)) {
fprintf(
@ -478,11 +498,14 @@ int main(int argc, char** argv) {
if (vm.suspended) {
vm.resume();
}
elapsed_time += POLL_PERIOD;
if (!vm_pid) {
vm.get_process_id(vm_pid);
report_vm_pid = true;
}
if (boinc_time_to_checkpoint()) {
checkpoint_cpu_time = elapsed_time;
write_checkpoint(checkpoint_cpu_time);
@ -492,19 +515,40 @@ int main(int argc, char** argv) {
fraction_done = 1.0;
}
}
if (report_vm_pid || report_net_usage) {
retval = boinc_report_app_status_aux(
elapsed_time,
checkpoint_cpu_time,
fraction_done,
vm_pid,
bytes_sent,
bytes_received
);
if (!retval) {
report_vm_pid = false;
report_net_usage = false;
}
} else {
boinc_report_app_status(
elapsed_time,
checkpoint_cpu_time,
fraction_done
);
}
if ((elapsed_time - last_status_report_time) >= 6000.0) {
last_status_report_time = elapsed_time;
fprintf(
stderr,
"%s Status Report: Elapsed Time: '%f', Network Bytes Sent (Total): '%f', Network Bytes Received (Total): '%f'\n",
boinc_msg_prefix(buf, sizeof(buf)),
elapsed_time,
bytes_sent,
bytes_received
);
}
boinc_checkpoint_completed();
}
if (report_vm_pid || report_net_usage) {
retval = boinc_report_app_status_aux(
elapsed_time, checkpoint_cpu_time, fraction_done, vm_pid, bytes_sent, bytes_received
);
if (!retval) {
report_vm_pid = false;
report_net_usage = false;
}
} else {
boinc_report_app_status(elapsed_time, checkpoint_cpu_time, fraction_done);
}
if (trickle_period) {
trickle_cpu_time += POLL_PERIOD;
if (trickle_cpu_time >= trickle_period) {
@ -517,12 +561,10 @@ int main(int argc, char** argv) {
// if the VM has a maximum amount of time it is allowed to run,
// shut it down gacefully and exit.
//
if (vm.job_duration > 0.0) {
if (elapsed_time > vm.job_duration) {
vm.cleanup();
write_checkpoint(checkpoint_cpu_time);
boinc_finish(0);
}
if (vm.job_duration && (elapsed_time > vm.job_duration)) {
vm.cleanup();
write_checkpoint(elapsed_time);
boinc_finish(0);
}
}
if (vm.enable_network) {
@ -538,6 +580,13 @@ int main(int argc, char** argv) {
}
if (boinc_status.reread_init_data_file) {
boinc_status.reread_init_data_file = false;
fprintf(
stderr,
"%s Preference change detected\n",
boinc_msg_prefix(buf, sizeof(buf))
);
boinc_parse_init_data_file();
boinc_get_init_data_p(&aid);
set_throttles(aid, vm);