*** empty log message ***

svn path=/trunk/boinc/; revision=10258
This commit is contained in:
David Anderson 2006-06-06 18:45:40 +00:00
parent 50e72f72f9
commit 923075d005
11 changed files with 241 additions and 45 deletions

View File

@ -5468,3 +5468,28 @@ Charlie 6 June 2006
mac_build/
boinc.xcodeproj/
project.pbxproj
David 6 June 2006
- scheduler: if we think a host has detached
(because of zero host ID but existing CPID)
mark its in-progress results as NO_REPLY, not CLIENT_ERROR
- scheduler: remove special handling for reported results
that are OVER/CLIENT_ERROR
(with above change, this shouldn't happen)
- scheduler: if we can't attach to shared memory,
return "Project encountered internal error" to client,
not misleading "Project is temporarily shut down for maintenance"
- shmem-related functions: call perror() on error
- make_project: <show_results>, not <show_result>
db/
boinc_db.h
html/ops/
watchdog.php
lib/
shmem.C
py/Boinc/
setup_project.py
sched/
handle_request.C
main.C

View File

@ -362,10 +362,8 @@ struct WORKUNIT {
// especially for a project already running -
// the database will become inconsistent
#define RESULT_SERVER_STATE_INACTIVE 1
//#define RESULT_SERVER_STATE_INACTIVE 1
#define RESULT_SERVER_STATE_UNSENT 2
#define RESULT_SERVER_STATE_UNSENT_SEQ 3
// unsent, part of a work sequence
#define RESULT_SERVER_STATE_IN_PROGRESS 4
#define RESULT_SERVER_STATE_OVER 5
// we received a reply, timed out, or decided not to send.

View File

@ -23,8 +23,16 @@ echo "
<li><a href=\"#Symbol Stores\">Symbol Stores</a>
<ul>
<li><a href=\"#SymIntroduction\">Introduction</a>
<li><a href=\"#SymRequirements\">Requirements</a>
<li><a href=\"#SymProject\">Project Symbol Store</a>
<li><a href=\"#SymAdd\">Adding symbols to the symbol store</a>
<li><a href=\"#SymUpload\">Uploading symbols to the symbol store</a>
</ul>
<li><a href=\"#Common Issues\">Common Issues</a>
<ul>
<li><a href=\"#CommonIntroduction\">Introduction</a>
<li><a href=\"#Common0xc0000096\">Privileged Instruction (0xc0000096)</a>
</ul>
</ul>
<h3><a name=\"Anatomy of a Windows stack trace\">Anatomy of a Windows stack trace</a></h3>
@ -94,7 +102,7 @@ callstack which you can use to diagnose problems.
<p>
Export symbols usually only appear on DLLs since DLLs
can export function pointers via the export table.
When you see this in the module list you'll only see functions which
When you see this in the module list you'll only see functions which
are listed in the export table in the callstack.
<p>
No symbols means that the runtime debugger could not determine a way to give you any
@ -283,11 +291,136 @@ This feature will probably be removed in the future.
<h3><a name=\"Symbol Stores\">Symbol Stores</a></h3>
<h4><a name=\"SymIntroduction\">Introduction</a></h4>
<p>
In order to obtain useful diagnostic information in the event of an application crash,
it is necessary to dump a callstack and any other relevant information about what was
going on at the time of the crash. Symbols are only needed during a crash event,
therefore they are stripped from most applications to cut down on the binary size and
bandwidth requirements to deploy a new release.
<p>
Without symbols, callstacks tend to be nothing more than a list of function pointers
in memory. A developer has to load the un-stripped executable in memory using the
same operating system and similar processor to jump to that memory address in order
to determine the function name and parameters. This is very labor intensive and
generally not a very fun job.
<p>
Microsoft created a technology called a 'Symbol Store' to use with their debugger
technology which allows Windows debuggers to locate and download compressed symbol
files to diagnose problems and convert function pointers into human readable text.
This greatly speeds up the process of diagnosing and fixing bugs.
<p>
With the BOINC Runtime Debugger for Windows framework a project can publish their
symbol files and only have to distribute the application to each of the BOINC
clients. When a crash event occurs the runtime framework will download the symbol
file from the symbol store and then proceed to dump as much diagnostic information
as possible to help projects diagnose the failure.
<p>
<h4><a name=\"SymRequirements\">Requirements</a></h4>
<p>
You'll need the latest stable release of the
<a href='http://www.microsoft.com/whdc/devtools/debugging/default.mspx'>
Debugging Tools for Windows.
</a>
<p>
Verify that your executable is set up to generate PDB debugging symbols for a release
build.
<p>
Verify that the advanced linker option to generate a checksum is enabled for a release
build.
<p>
You'll need to explicitly name both your EXE and PDB before compilation since
the debugger bases the name of the PDB file off of information that is stored in the
executable header.
<p>
<h4><a name=\"SymProject\">Project Symbol Store</a></h4>
<p>
Specifying a project wide symbol store is as easy as adding the symstore element
to your config.xml file for the project.
<p>
Below is an XML shred with an example symstore element.
<p>
". html_text("
<boinc>
<config>
<symstore>http://sample.example.com/symstore</symstore>
</config>
</boinc>
")."
<p>
<h4><a name=\"SymAdd\">Adding symbols to the symbol store</a></h4>
<p>
<a href='http://msdn.microsoft.com/library/default.asp?url=/library/en-us/debug/base/using_symstore.asp'>Symstore</a>
is a utility to manage symbol stores. You'll want to create a local symbol store on
your Windows build machine in which you'll initially add new symbol files with each
revision of your application.
<p>
Symstore will compress the symbol file and then copy it into your local symbol store.
<p>
Below is an example command which you can run from the Windows command line or
cygwin command line.
<p>
"; block_start(); echo "
symstore.exe add /l /f c:\SampleSrc\*.pdb /s c:\symstore /compress /t \"Sample\" /v \"5.02\" /o /c \"Application Release\"
"; block_end(); echo "
<p>
<h4><a name=\"SymUpload\">Uploading symbols to the symbol store</a></h4>
<p>
Most projects tend to use scp to copy files between Windows machines and their project
server.
<p>
The example below copies the entire symstore to the target location. After the copy
operation you can delete all the subdirectories except '000Admin' to save time uploading
for future application symbols.
<p>
"; block_start(); echo "
pscp.exe -r -C -batch c:\symstore sample@project.example.com:projects/sample/html/user/symstore
"; block_end(); echo "
<p>
<h3><a name=\"Common Issues\">Common Issues</a></h3>
<h4><a name=\"CommonIntroduction\">Introduction</a></h4>
<p>
<p>
<h4><a name=\"Common0xc0000096\">Privileged Instruction (0xc0000096)</a></h4>
"; block_start(); echo "
- Unhandled Exception Record -
Reason: Privileged Instruction (0xc0000096) at address 0x008E9808
- Registers -
eax=00000400 ebx=00000000 ecx=00002922 edx=00b0c650 esi=01e1f7ec edi=027e2abc
eip=008e9808 esp=01e1f778 ebp=ffffffff
cs=001b ss=0023 ds=0023 es=0023 fs=003b gs=0000 efl=00010202
- Callstack -
ChildEBP RetAddr Args to Child
01e1f7b4 008ea16b 3f4bcaf9 3f827d51 01e1f7ec 01e1f7fc rosetta_beta_5.19_windows_intel!spherical+0x1 (rosetta++\structure.cc:1436)
01e1f83c 008ec11f 00b38180 00000003 00000009 01e1f974 rosetta_beta_5.19_windows_intel!HSpair_score+0x0 (rosetta++\structure.cc:367)
01e1f854 008b6d18 00b38058 00b3805c 00b38180 00b381b0 rosetta_beta_5.19_windows_intel!evaluate_ss+0x6 (rosetta++\structure.cc:102)
01e1f974 00937bf6 a8af5c9d 0001c3f9 00001473 00000100 rosetta_beta_5.19_windows_intel!scorefxn+0x25 (rosetta++\score.cc:190)
01e1f9b0 005e435f 00000009 008b7960 0001c3f9 a8af5cd5 rosetta_beta_5.19_windows_intel!main_frag_trial+0x4 (rosetta++\torsion_bbmove_trials.cc:446)
01e1fb74 006f1c01 a8af5e9d 3030302e 3c303030 00000000 rosetta_beta_5.19_windows_intel!fold_abinitio+0xc (rosetta++\fold_abinitio.cc:270)
01e1ffb0 006363c0 7c80b50b 00000000 3030302e 3c303030 rosetta_beta_5.19_windows_intel!main_rosetta+0x5 (rosetta++\main.cc:343)
01e1ffb4 7c80b50b 00000000 3030302e 3c303030 00000000 rosetta_beta_5.19_windows_intel!foobar+0x0 (boinc\api\graphics_impl.c:75)
01e1ffec 00000000 006363b0 00000000 00000000 00000000 kernel32!_BaseThreadStart@8+0x0 (boinc\api\graphics_impl.c:75)
"; block_end(); echo "
<p>
In this example it appears the processor took exception to the fact that a user mode
process attempted to push a kernel mode address onto the stack without first switching
to kernel mode.
<p>
Look at the EBP register: 'ffffffff' when converted into a signed int is equal to '-1'
and when converted to an unsigned int it is equal to 4GB. On Windows anything above 2GB
is considered a kernel mode address. If the Windows machine supports PAE and the /3GB
boot option is specified in BOOT.INI then kernel addresses will start at 3GB instead.
<p>
What has probably happened here is that a function was about to be called and a 'push EBP'
instruction was executed to push a new address onto the stack; the CPU threw the exception
since the address was outside user mode land. EBP should have had a similar progression
as all the other stack frames' ChildEBP values.
<p>
If EBP had some random kernel mode address it would be pretty easy to dismiss this as
a CPU overheating. 'ffffffff' raises the question: is the stack being overwritten by an
error result from another function?
<p>
Investigation of this issue is still ongoing.
";
page_tail();

View File

@ -133,8 +133,11 @@ list_item("server_state",
<ul>
<li> Initially UNSENT
<li> Set by scheduler to IN_PROGRESS when it sends the result
<li> Set by scheduler to OVER when get reply from client
<li> Set by transitioner to OVER if now > result.report_deadline
<li> Set by scheduler to OVER when result is reported
in request message from client.
<li> Set by scheduler to OVER when it thinks
host has detached project.
<li> Set by transitioner to OVER if now &gt; result.report_deadline
<li> Set by transitioner to OVER if WU has error condition
and result.server_state=UNSENT
<li> Set by validator to OVER if WU has canonical result
@ -143,22 +146,29 @@ list_item("server_state",
"
);
list_item("outcome",
"Values: SUCCESS, COULDNT_SEND, CLIENT_ERROR, NO_REPLY, DIDNT_NEED.
"Values: SUCCESS, COULDNT_SEND, CLIENT_ERROR, NO_REPLY, DIDNT_NEED,
VALIDATE_ERROR.
<br>Defined iff result.server_state=OVER
<ul>
<li> Set by scheduler to SUCCESS if get reply and no client error
<li> Set by scheduler to CLIENT_ERROR if get reply and client error
<li> Set by scheduler to NO_REPLY if it thinks host has detached project.
<li> Set by transitioner to NO_REPLY if server_state=IN_PROGRESS
and now<report_deadline
and now &gt; report_deadline
<li> Set by transitioner to DIDNT_NEED if WU has error condition
and result.server_state=UNSENT
<li> Set by validator to DIDNT_NEED if WU has canonical result
and result.server_state=UNSENT
<li> Set by validator to VALIDATE_ERROR if outcome was initially
SUCCESS, but the validator had a permanent error reading a result file,
or a file had a syntax error.
Prevents the validator from trying again.
</ul>
"
);
list_item("client_state",
"Records the client state (upload, process, or download)
"Records the client state (DOWNLOADING, DOWNLOADED,
COMPUTE_ERROR, UPLOADING, UPLOADED, ABORTED)
where an error occurred.
Defined if outcome is CLIENT_ERROR.
"
@ -172,7 +182,7 @@ list_item("file_delete_state",
and file_delete_state=INIT,
and wu.assimilate_state=DONE,
and all the results have server_state=OVER,
and all all the results with outcome=SUCCESS have validate_state<>INIT
and all the results with outcome=SUCCESS have validate_state&lt;&gt;INIT
<li> Set by transitioner to READY if wu.assimilate_state=DONE
and result.outcome=CLIENT_ERROR
or result.validate_state!=INIT
@ -187,11 +197,16 @@ list_item("validate_state",
<li> Initially INIT
<li> Set by validator to VALID if outcome=SUCCESS and matches canonical result
<li> Set by validator to INVALID if outcome=SUCCESS and doesn't match canonical result
<li> Set by transitioner to NO_CHECK if the WU had an error;
this avoids showing claimed credit as 'pending'.
<li> Set by validator to ERROR if outcome=SUCCESS and
had a permanent error trying to read an output file,
or an output file had a syntax error.
<li> Set by validator to INCONCLUSIVE if check_set()
didn't find a consensus in a set of results containing this one.
<li> Set by scheduler to TOO_LATE if the result was reported
after the canonical result's files were deleted.
</ul>
"
);

View File

@ -146,7 +146,6 @@ foreach ($projects as $p) {
echo "
</ul>
<p>
BOINC is an open-source software platform for volunteer computing.
You can participate in several projects, ensuring that
your computer will be kept busy even when one project has no work.
<p>

View File

@ -296,14 +296,14 @@ Let X be the set of scheduled results that are not currently running,
let Y be the set of running results that are not scheduled,
and let T be the time the scheduler last ran.
The enforcement policy is as follows:
<ul>
<ol>
<li> If deadline_missed(R) for some R in X,
then preempt a result in Y, and run R
(preempt the result with the least CPU wall time since checkpoint).
Repeat as needed.
<li> If there is a result R in Y that checkpointed more recently than T,
then preempt R and run a result in X.
</ul>
</ol>
@ -362,6 +362,10 @@ if total_shortfall > 0
and are proportional to P.resource_share
</pre>
<p>
For non-CPU-intensive projects,
P.work_request_size is set to 1 if P has no nearly-runnable result,
otherwise 0.
<p>
The scheduler RPC mechanism may select a project to contact
because of a user request, an outstanding trickle-up message,

View File

@ -15,8 +15,7 @@
function sound_alarm($x) {
//echo "alarm: $x\n";
mail("davea@ssl.berkeley.edu", "BOINC problem", $x);
mail("eheien@ssl.berkeley.edu", "BOINC problem", $x);
mail(SYS_ADMIN_EMAIL, "BOINC problem", $x);
}
function check_log_file($file, $last_time) {

View File

@ -166,6 +166,7 @@ int create_shmem(key_t key, int size, void** pp) {
id = shmget(key, size, IPC_CREAT|SHM_R|SHM_W);
}
if (id < 0) {
perror("shmget");
return ERR_SHMGET;
}
return attach_shmem(key, pp);
@ -179,9 +180,13 @@ int destroy_shmem(key_t key){
id = shmget(key, 0, 0);
if (id < 0) return 0; // assume it doesn't exist
retval = shmctl(id, IPC_STAT, &buf);
if (retval) return ERR_SHMCTL;
if (retval) {
perror("shmctl STAT");
return ERR_SHMCTL;
}
retval = shmctl(id, IPC_RMID, 0);
if (retval) {
perror("shmctl RMID");
return ERR_SHMCTL;
}
return 0;
@ -193,10 +198,12 @@ int attach_shmem(key_t key, void** pp){
id = shmget(key, 0, 0);
if (id < 0) {
perror("shmget");
return ERR_SHMGET;
}
p = shmat(id, 0, 0);
if ((long)p == -1) {
perror("shmat");
return ERR_SHMAT;
}
*pp = p;

View File

@ -385,7 +385,7 @@ class Project:
config.max_wus_to_send = 50
config.daily_result_quota = 500
config.disable_account_creation = 1
config.show_result = 1
config.show_results = 1
config.master_url = master_url or os.path.join(options.html_url , self.short_name , '')
config.download_url = os.path.join(config.master_url, 'download')

View File

@ -125,10 +125,15 @@ static bool find_host_by_cpid(DB_USER& user, char* host_cpid, DB_HOST& host) {
return false;
}
// scan in-progress results for the given host,
// and mark them as done, client error
// Called when there's evidence that the host has detached.
// Mark in-progress results for the given host
// as server state OVER, outcome NO_REPLY.
// This serves two purposes:
// 1) make sure we don't resend these results to the host
// (they may be the reason the user detached)
// 2) trigger the generation of new results for these WUs
//
static void mark_results_aborted(DB_HOST& host) {
static void mark_results_over(DB_HOST& host) {
char buf[256], buf2[256];
DB_RESULT result;
sprintf(buf, "where hostid=%d and server_state=%d",
@ -139,7 +144,7 @@ static void mark_results_aborted(DB_HOST& host) {
sprintf(buf2,
"server_state=%d, outcome=%d",
RESULT_SERVER_STATE_OVER,
RESULT_OUTCOME_CLIENT_ERROR
RESULT_OUTCOME_NO_REPLY
);
result.update_field(buf2);
@ -305,7 +310,7 @@ lookup_user_and_make_new_host:
"[HOST#%d] [USER#%d] User has another host with same CPID.\n",
host.id, host.userid
);
mark_results_aborted(host);
mark_results_over(host);
goto got_host;
}
}
@ -617,7 +622,9 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
// which suppresses the DB update later on
//
// If result is ALREADY over, do we replace it??
// If result has server_state OVER
// if outcome NO_REPLY accept it (it's just late).
// else ignore it
//
if (srip->server_state == RESULT_SERVER_STATE_OVER) {
char *dont_replace_result = NULL;
@ -635,14 +642,8 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
dont_replace_result = "this work could NOT be sent";
break;
case RESULT_OUTCOME_CLIENT_ERROR:
// result was previously cancelled on server side.
// keep this new, real result ONLY if validator has
// not already been invoked.
if (srip->validate_state != VALIDATE_STATE_INIT) {
dont_replace_result = "result ALREADY reported as error, or canceled on server";
} else if (srip->file_delete_state != FILE_DELETE_INIT) {
dont_replace_result = "result ALREADY reported as error or canceled on server, and deleted";
}
// should never happen!
dont_replace_result = "result ALREADY reported as error";
break;
case RESULT_OUTCOME_NO_REPLY:
// result is late in arriving, but keep it anyhow

View File

@ -76,7 +76,7 @@ GUI_URLS gui_urls;
key_t sema_key;
int g_pid;
static bool db_opened=false;
bool project_stopped = false;
bool shmem_failed = false;
void send_message(const char* msg, int delay, bool send_header) {
if (send_header) {
@ -198,9 +198,10 @@ SCHED_SHMEM* attach_to_feeder_shmem() {
retval = attach_shmem(config.shmem_key, &p);
if (retval || p==0) {
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
"Can't attach shmem (feeder not running?)\n"
"Can't attach shmem: %d (feeder not running?)\n",
retval
);
project_stopped = true;
shmem_failed = true;
} else {
ssp = (SCHED_SHMEM*)p;
retval = ssp->verify();
@ -214,12 +215,16 @@ SCHED_SHMEM* attach_to_feeder_shmem() {
for (i=0; i<10; i++) {
if (ssp->ready) break;
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG, "waiting for ready flag\n");
log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
"waiting for ready flag\n"
);
sleep(1);
}
if (!ssp->ready) {
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "feeder doesn't seem to be running\n");
send_message("Server has software problem", 3600, true);
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
"feeder doesn't seem to be running\n"
);
send_message("Project encountered internal error: feeder not running", 3600, true);
exit(0);
}
}
@ -286,7 +291,9 @@ int main(int argc, char** argv) {
retval = config.parse_file("..");
if (retval) {
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "Can't parse config file\n");
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
"Can't parse config file\n"
);
send_message("Server can't parse configuration file", 3600, true);
exit(0);
}
@ -311,8 +318,8 @@ int main(int argc, char** argv) {
while(FCGI_Accept() >= 0) {
counter++;
#endif
if (project_stopped) {
send_message("Project is temporarily shut down for maintenance", 3600, true);
if (shmem_failed) {
send_message("Project encountered internal error: shared memory", 3600, true);
goto done;
}
log_request_info(length);
@ -330,7 +337,9 @@ int main(int argc, char** argv) {
sprintf(reply_path, "%s%d_%u", REPLY_FILE_PREFIX, g_pid, counter);
fout = fopen(req_path, "w");
if (!fout) {
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "can't write request file\n");
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
"can't write request file\n"
);
exit(1);
}
copy_stream(stdin, fout);
@ -345,12 +354,16 @@ int main(int argc, char** argv) {
fin = fopen(req_path, "r");
if (!fin) {
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "can't read request file\n");
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
"can't read request file\n"
);
exit(1);
}
fout = fopen(reply_path, "w");
if (!fout) {
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "can't write reply file\n");
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
"can't write reply file\n"
);
exit(1);
}
@ -359,7 +372,9 @@ int main(int argc, char** argv) {
fclose(fout);
fin = fopen(reply_path, "r");
if (!fin) {
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "can't read reply file\n");
log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
"can't read reply file\n"
);
exit(1);
}
copy_stream(fin, stdout);