*** empty log message ***

svn path=/trunk/boinc/; revision=10258
2006-06-06 18:45:40 +00:00 · 2006-06-06 18:45:40 +00:00 · 923075d005
parent 50e72f72f9
commit 923075d005
11 changed files with 241 additions and 45 deletions
--- a/25
+++ b/25
@ -5468,3 +5468,28 @@ Charlie 6 June 2006
    mac_build/
        boinc.xcodeproj/
            project.pbxproj
+
+David  6 June 2006
+    - scheduler: if we think a host has detached
+        (because of zero host ID but existing CPID)
+        mark its in-progress results as NO_REPLY, not CLIENT_ERROR
+    - scheduler: remove special handling for reported results
+        that are OVER/CLIENT_ERROR
+        (with above change, this shouldn't happen)
+    - scheduler: if we can't attach to shared memory,
+        return "Project encountered internal error" to client,
+        not misleading "Project is temporarily shut down for maintenance"
+    - shmem-related functions: call perror() on error
+    - make_project: <show_results>, not <show_result>
+
+    db/
+        boinc_db.h
+    html/ops/
+        watchdogs.php
+    lib/
+        shmem.C
+    py/Boinc/
+        setup_project.py
+    sched/
+        handle_request.C
+        main.C
--- a/db/boinc_db.h
+++ b/db/boinc_db.h
@ -362,10 +362,8 @@ struct WORKUNIT {
 // especially for a project already running -
 // the database will become inconsistent

-#define RESULT_SERVER_STATE_INACTIVE       1
+//#define RESULT_SERVER_STATE_INACTIVE       1
 #define RESULT_SERVER_STATE_UNSENT         2
-#define RESULT_SERVER_STATE_UNSENT_SEQ     3
-    // unsent, part of a work sequence
 #define RESULT_SERVER_STATE_IN_PROGRESS    4
 #define RESULT_SERVER_STATE_OVER           5
    // we received a reply, timed out, or decided not to send.
--- a/doc/app_debug_win.php
+++ b/doc/app_debug_win.php
@ -23,8 +23,16 @@ echo "
    <li><a href=\"#Symbol Stores\">Symbol Stores</a>
        <ul>
            <li><a href=\"#SymIntroduction\">Introduction</a>
+            <li><a href=\"#SymRequirements\">Requirements</a>
+            <li><a href=\"#SymProject\">Project Symbol Store</a>
+            <li><a href=\"#SymAdd\">Adding symbols to the symbol store</a>
+            <li><a href=\"#SymUpload\">Uploading symbols to the symbol store</a>
        </ul>
    <li><a href=\"#Common Issues\">Common Issues</a>
+        <ul>
+            <li><a href=\"#CommonIntroduction\">Introduction</a>
+            <li><a href=\"#Common0xc0000096\">Privileged Instruction (0xc0000096)</a>
+        </ul>
 </ul>

 <h3><a name=\"Anatomy of a Windows stack trace\">Anatomy of a Windows stack trace</a></h3>
@ -94,7 +102,7 @@ callstack which you can use to diagnose problems.
 <p>
 Export symbols usually only appear on DLLs since DLLs
 can export function pointers via the export table.
-When you see this in the module list you'll only see functions which 
+When you see this in the module list you'll only see functions which 
 are listed in the export table in the callstack.
 <p>
 No symbols means that the runtime debugger could not determine a way to give you any 
@ -283,11 +291,136 @@ This feature will probably be removed in the future.
 <h3><a name=\"Symbol Stores\">Symbol Stores</a></h3>
 <h4><a name=\"SymIntroduction\">Introduction</a></h4>
 <p>
-
+In order to obtain useful diagnostic information in the event of an application crash, 
+it is necessary to dump a callstack and any other relevant information about what was 
+going on at the time of the crash.  Symbols are only needed during a crash event, 
+therefore they are stripped from most applications to cut down on the binary size and 
+bandwidth requirements to deploy a new release.
+<p>
+Without symbols, callstacks tend to be nothing more than a list of function pointers 
+in memory.  A developer has to load the un-stripped executable in memory using the 
+same operating system and similar processor to jump to that memory address in order 
+to determine the function name and parameters.  This is very labor intensive and 
+generally not a very fun job.
+<p>
+Microsoft created a technology called a 'Symbol Store' to use with their debugger 
+technology which allows Windows debuggers to locate and download compressed symbol 
+files to diagnose problems and convert function pointers into human readable text. 
+This greatly speeds up the process of diagnosing and fixing bugs.
+<p>
+With the BOINC Runtime Debugger for Windows framework a project can publish their 
+symbol files and only have to distribute the application to each of the BOINC 
+clients.  When a crash event occurs the runtime framework will download the symbol 
+file from the symbol store and then proceed to dump as much diagnostic information 
+as possible to help projects diagnose the failure.
+<p>
+<h4><a name=\"SymRequirements\">Requirements</a></h4>
+<p>
+You'll need the latest stable release of the 
+<a href='http://www.microsoft.com/whdc/devtools/debugging/default.mspx'>
+  Debugging Tools for Windows.
+</a>
+<p>
+Verify that your executable is set up to generate PDB debugging symbols for a release
+build.  
+<p>
+Verify that the advanced linker option to generate a checksum is enabled for a release
+build.
+<p>
+You'll need to explicitly name both your EXE and PDB before compilation since
+the debugger bases the name of the PDB file off of information that is stored in the
+executable header.  
+<p>
+<h4><a name=\"SymProject\">Project Symbol Store</a></h4>
+<p>
+Specifying a project wide symbol store is as easy as adding the symstore element
+to your config.xml file for the project.
+<p>
+Below is an XML shred with an example symstore element.
+<p>
+". html_text("
+<boinc>
+    <config>
+        <symstore>http://sample.example.com/symstore</symstore>
+    </config>
+</boinc>
+")."
+<p>
+<h4><a name=\"SymAdd\">Adding symbols to the symbol store</a></h4>
+<p>
+<a href='http://msdn.microsoft.com/library/default.asp?url=/library/en-us/debug/base/using_symstore.asp'>Symstore</a>
+is a utility to manage symbol stores.  You'll want to create a local symbol store on
+your Windows build machine in which you'll initially add new symbol files with each
+revision of your application.
+<p>
+Symstore will compress the symbol file and then copy it into your local symbol store.
+<p>
+Below is an example command which you can run from the Windows command line or
+cygwin command line.
+<p>
+"; block_start(); echo "
+symstore.exe add /l /f c:\SampleSrc\*.pdb /s c:\symstore /compress /t \"Sample\" /v \"5.02\" /o /c \"Application Release\"
+"; block_end(); echo "
+<p>
+<h4><a name=\"SymUpload\">Uploading symbols to the symbol store</a></h4>
+<p>
+Most projects tend to use scp to copy files between Windows machines and their project
+server.
+<p>
+The example below copies the entire symstore to the target location.  After the copy
+operation you can delete all the subdirectories except '000Admin' to save time when
+uploading future application symbols.
+<p>
+"; block_start(); echo "
+pscp.exe -r -C -batch c:\symstore sample@project.example.com:projects/sample/html/user/symstore
+"; block_end(); echo "
 <p>
 <h3><a name=\"Common Issues\">Common Issues</a></h3>
+<h4><a name=\"CommonIntroduction\">Introduction</a></h4>
 <p>
 <p>
+<h4><a name=\"Common0xc0000096\">Privileged Instruction (0xc0000096)</a></h4>
+"; block_start(); echo "
+- Unhandled Exception Record -
+Reason: Privileged Instruction (0xc0000096) at address 0x008E9808
+
+- Registers -
+eax=00000400 ebx=00000000 ecx=00002922 edx=00b0c650 esi=01e1f7ec edi=027e2abc
+eip=008e9808 esp=01e1f778 ebp=ffffffff
+cs=001b  ss=0023  ds=0023  es=0023  fs=003b  gs=0000             efl=00010202
+
+- Callstack -
+ChildEBP RetAddr  Args to Child
+01e1f7b4 008ea16b 3f4bcaf9 3f827d51 01e1f7ec 01e1f7fc rosetta_beta_5.19_windows_intel!spherical+0x1 (rosetta++\structure.cc:1436) 
+01e1f83c 008ec11f 00b38180 00000003 00000009 01e1f974 rosetta_beta_5.19_windows_intel!HSpair_score+0x0 (rosetta++\structure.cc:367) 
+01e1f854 008b6d18 00b38058 00b3805c 00b38180 00b381b0 rosetta_beta_5.19_windows_intel!evaluate_ss+0x6 (rosetta++\structure.cc:102) 
+01e1f974 00937bf6 a8af5c9d 0001c3f9 00001473 00000100 rosetta_beta_5.19_windows_intel!scorefxn+0x25 (rosetta++\score.cc:190) 
+01e1f9b0 005e435f 00000009 008b7960 0001c3f9 a8af5cd5 rosetta_beta_5.19_windows_intel!main_frag_trial+0x4 (rosetta++\torsion_bbmove_trials.cc:446) 
+01e1fb74 006f1c01 a8af5e9d 3030302e 3c303030 00000000 rosetta_beta_5.19_windows_intel!fold_abinitio+0xc (rosetta++\fold_abinitio.cc:270) 
+01e1ffb0 006363c0 7c80b50b 00000000 3030302e 3c303030 rosetta_beta_5.19_windows_intel!main_rosetta+0x5 (rosetta++\main.cc:343) 
+01e1ffb4 7c80b50b 00000000 3030302e 3c303030 00000000 rosetta_beta_5.19_windows_intel!foobar+0x0 (boinc\api\graphics_impl.c:75) 
+01e1ffec 00000000 006363b0 00000000 00000000 00000000 kernel32!_BaseThreadStart@8+0x0 (boinc\api\graphics_impl.c:75) 
+"; block_end(); echo "
+<p>
+In this example it appears the processor took exception to the fact that a user mode
+process attempted to push a kernel mode address onto the stack without first switching
+to kernel mode.
+<p>
+Look at the EBP register, 'ffffffff' when converted into a signed int is equal to '-1' 
+and when converted to an unsigned int it is equal to 4GB.  On Windows anything above 2GB 
+is considered a kernel mode address.  If the Windows machine supports PAE and the /3GB
+boot option is specified in BOOT.INI then kernel addresses will start at 3GB instead.
+<p>
+What has probably happened here is that a function was about to be called and a 'push EBP'
+instruction was executed to push a new address onto the stack; the CPU threw the exception
+since the address was outside user mode land. EBP should have had a similar progression
+as all the other stack frames' ChildEBP values.
+<p>
+If EBP had some random kernel mode address it would be pretty easy to dismiss this as
+a CPU overheating.  'ffffffff' raises the question: is the stack being overwritten by an
+error result from another function?
+<p>
+Investigation of this issue is still ongoing.
 ";

 page_tail();
--- a/doc/backend_state.php
+++ b/doc/backend_state.php
@ -133,8 +133,11 @@ list_item("server_state",
    <ul>
    <li> Initially UNSENT
    <li> Set by scheduler to IN_PROGRESS when send result
-    <li> Set by scheduler to OVER when get reply from client
-    <li> Set by transitioner to OVER if now > result.report_deadline
+    <li> Set by scheduler to OVER when result is reported
+        in request message from client.
+    <li> Set by scheduler to OVER when it thinks
+        host has detached project.
+    <li> Set by transitioner to OVER if now &gt; result.report_deadline
    <li> Set by transitioner to OVER if WU has error condition
        and result.server_state=UNSENT
    <li> Set by validator to OVER if WU has canonical result
@ -143,22 +146,29 @@ list_item("server_state",
    "
 );
 list_item("outcome",
-    "Values: SUCCESS, COULDNT_SEND, CLIENT_ERROR, NO_REPLY, DIDNT_NEED.
+    "Values: SUCCESS, COULDNT_SEND, CLIENT_ERROR, NO_REPLY, DIDNT_NEED,
+    VALIDATE_ERROR.
    <br>Defined iff result.server_state=OVER
    <ul>
    <li> Set by scheduler to SUCCESS if get reply and no client error
    <li> Set by scheduler to CLIENT_ERROR if get reply and client error
+    <li> Set by scheduler to NO_REPLY if it thinks host has detached project.
    <li> Set by transitioner to NO_REPLY if server_state=IN_PROGRESS
-        and now<report_deadline
+        and now &gt; report_deadline
    <li> Set by transitioner to DIDNT_NEED if WU has error condition
        and result.server_state=UNSENT
    <li> Set by validator to DIDNT_NEED if WU has canonical result
        and result.server_state=UNSENT
+    <li> Set by validator to VALIDATE_ERROR if outcome was initially
+        SUCCESS, but the validator had a permanent error reading a result file,
+        or a file had a syntax error.
+        Prevents the validator from trying again.
    </ul>
    "
 );
 list_item("client_state",
-    "Records the client state (upload, process, or download)
+    "Records the client state (DOWNLOADING, DOWNLOADED,
+    COMPUTE_ERROR, UPLOADING, UPLOADED, ABORTED)
    where an error occurred.
    Defined if outcome is CLIENT_ERROR.
    "
@ -172,7 +182,7 @@ list_item("file_delete_state",
        and file_delete_state=INIT,
        and wu.assimilate_state=DONE,
        and all the results have server_state=OVER,
-        and all all the results with outcome=SUCCESS have validate_state<>INIT 
+        and all the results with outcome=SUCCESS have validate_state&lt;&gt;INIT
    <li> Set by transitioner to READY if wu.assimilate_state=DONE
        and result.outcome=CLIENT_ERROR
        or result.validate_state!=INIT
@ -187,11 +197,16 @@ list_item("validate_state",
    <li> Initially INIT
    <li> Set by validator to VALID if outcome=SUCCESS and matches canonical result
    <li> Set by validator to INVALID if outcome=SUCCESS and doesn't match canonical result
+    <li> Set by transitioner to NO_CHECK if the WU had an error;
+    this avoids showing claimed credit as 'pending'.
    <li> Set by validator to ERROR if outcome=SUCCESS and
        had a permanent error trying to read an output file,
        or an output file had a syntax error.
    <li> Set by validator to INCONCLUSIVE if check_set()
        didn't find a consensus in a set of results containing this one.
+    <li> Set by scheduler to TOO_LATE if the result was reported
+        after the canonical result's files were deleted.
+        
    </ul>
    "
 );
--- a/doc/index.php
+++ b/doc/index.php
@ -146,7 +146,6 @@ foreach ($projects as $p) {
 echo "
    </ul>
    <p>
-    BOINC is an open-source software platform for volunteer computing.
    You can participate in several projects, ensuring that
    your computer will be kept busy even when one project has no work.
    <p>
--- a/doc/sched.php
+++ b/doc/sched.php
@ -296,14 +296,14 @@ Let X be the set of scheduled results that are not currently running,
 let Y be the set of running results that are not scheduled,
 and let T be the time the scheduler last ran.
 The enforcement policy is as follows:
-<ul>
+<ol>
 <li> If deadline_missed(R) for some R in X,
 then preempt a result in Y, and run R
 (preempt the result with the least CPU wall time since checkpoint).
 Repeat as needed.
 <li> If there is a result R in Y that checkpointed more recently than T,
 then preempt R and run a result in X.
-</ul>
+</ol>



@ -362,6 +362,10 @@ if total_shortfall > 0
        and are proportional to P.resource_share
 </pre>

+<p>
+For non-CPU-intensive projects,
+P.work_request_size is set to 1 if P has no nearly-runnable result,
+otherwise 0.
 <p>
 The scheduler RPC mechanism may select a project to contact
 because of a user request, an outstanding trickle-up message,
--- a/html/ops/watchdogs.php
+++ b/html/ops/watchdogs.php
@ -15,8 +15,7 @@

 function sound_alarm($x) {
    //echo "alarm: $x\n";
-    mail("davea@ssl.berkeley.edu", "BOINC problem", $x);
-    mail("eheien@ssl.berkeley.edu", "BOINC problem", $x);
+    mail(SYS_ADMIN_EMAIL, "BOINC problem", $x);
 }

 function check_log_file($file, $last_time) {
--- a/lib/shmem.C
+++ b/lib/shmem.C
@ -166,6 +166,7 @@ int create_shmem(key_t key, int size, void** pp) {
        id = shmget(key, size, IPC_CREAT|SHM_R|SHM_W);
    }
    if (id < 0) {
+        perror("shmget");
        return ERR_SHMGET;
    }
    return attach_shmem(key, pp);
@ -179,9 +180,13 @@ int destroy_shmem(key_t key){
    id = shmget(key, 0, 0);
    if (id < 0) return 0;           // assume it doesn't exist
    retval = shmctl(id, IPC_STAT, &buf);
-    if (retval) return ERR_SHMCTL;
+    if (retval) {
+        perror("shmctl STAT");
+        return ERR_SHMCTL;
+    }
    retval = shmctl(id, IPC_RMID, 0);
    if (retval) {
+        perror("shmctl RMID");
        return ERR_SHMCTL;
    }
    return 0;
@ -193,10 +198,12 @@ int attach_shmem(key_t key, void** pp){

    id = shmget(key, 0, 0);
    if (id < 0) {
+        perror("shmget");
        return ERR_SHMGET;
    }
    p = shmat(id, 0, 0);
    if ((long)p == -1) {
+        perror("shmat");
        return ERR_SHMAT;
    }
    *pp = p;
--- a/py/Boinc/setup_project.py
+++ b/py/Boinc/setup_project.py
@ -385,7 +385,7 @@ class Project:
        config.max_wus_to_send = 50
        config.daily_result_quota = 500
        config.disable_account_creation = 1
-        config.show_result = 1
+        config.show_results = 1

        config.master_url    = master_url or os.path.join(options.html_url , self.short_name , '')
        config.download_url  = os.path.join(config.master_url, 'download')
--- a/sched/handle_request.C
+++ b/sched/handle_request.C
@ -125,10 +125,15 @@ static bool find_host_by_cpid(DB_USER& user, char* host_cpid, DB_HOST& host) {
    return false;
 }

-// scan in-progress results for the given host,
-// and mark them as done, client error
+// Called when there's evidence that the host has detached.
+// Mark in-progress results for the given host
+// as server state OVER, outcome NO_REPLY.
+// This serves two purposes:
+// 1) make sure we don't resend these results to the host
+//    (they may be the reason the user detached)
+// 2) trigger the generation of new results for these WUs
 //
-static void mark_results_aborted(DB_HOST& host) {
+static void mark_results_over(DB_HOST& host) {
    char buf[256], buf2[256];
    DB_RESULT result;
    sprintf(buf, "where hostid=%d and server_state=%d",
@ -139,7 +144,7 @@ static void mark_results_aborted(DB_HOST& host) {
        sprintf(buf2,
            "server_state=%d, outcome=%d",
            RESULT_SERVER_STATE_OVER,
-            RESULT_OUTCOME_CLIENT_ERROR
+            RESULT_OUTCOME_NO_REPLY
        );
        result.update_field(buf2);

@ -305,7 +310,7 @@ lookup_user_and_make_new_host:
                    "[HOST#%d] [USER#%d] User has another host with same CPID.\n",
                    host.id, host.userid
                );
-                mark_results_aborted(host);
+                mark_results_over(host);
                goto got_host;
            }
        }
@ -617,7 +622,9 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
        // which suppresses the DB update later on
        //

-        // If result is ALREADY over, do we replace it??
+        // If result has server_state OVER
+        //   if outcome NO_REPLY accept it (it's just late).
+        //   else ignore it
        //
        if (srip->server_state == RESULT_SERVER_STATE_OVER) {
            char *dont_replace_result = NULL;
@ -635,14 +642,8 @@ int handle_results(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
                    dont_replace_result = "this work could NOT be sent";
                    break;
                case RESULT_OUTCOME_CLIENT_ERROR:
-                    // result was previously cancelled on server side.
-                    // keep this new, real result ONLY if validator has
-                    // not already been invoked.
-                    if (srip->validate_state != VALIDATE_STATE_INIT) {
-                        dont_replace_result = "result ALREADY reported as error, or canceled on server";
-                    } else if (srip->file_delete_state != FILE_DELETE_INIT) {
-                        dont_replace_result = "result ALREADY reported as error or canceled on server, and deleted";
-                    }
+                    // should never happen!
+                    dont_replace_result = "result ALREADY reported as error";
                    break;
                case RESULT_OUTCOME_NO_REPLY:
                    // result is late in arriving, but keep it anyhow
--- a/sched/main.C
+++ b/sched/main.C
@ -76,7 +76,7 @@ GUI_URLS gui_urls;
 key_t sema_key;
 int g_pid;
 static bool db_opened=false;
-bool project_stopped = false;
+bool shmem_failed = false;

 void send_message(const char* msg, int delay, bool send_header) {
    if (send_header) {
@ -198,9 +198,10 @@ SCHED_SHMEM* attach_to_feeder_shmem() {
    retval = attach_shmem(config.shmem_key, &p);
    if (retval || p==0) {
        log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
-            "Can't attach shmem (feeder not running?)\n"
+            "Can't attach shmem: %d (feeder not running?)\n",
+            retval
        );
-        project_stopped = true;
+        shmem_failed = true;
    } else {
        ssp = (SCHED_SHMEM*)p;
        retval = ssp->verify();
@ -214,12 +215,16 @@ SCHED_SHMEM* attach_to_feeder_shmem() {

        for (i=0; i<10; i++) {
            if (ssp->ready) break;
-            log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG, "waiting for ready flag\n");
+            log_messages.printf(SCHED_MSG_LOG::MSG_DEBUG,
+                "waiting for ready flag\n"
+            );
            sleep(1);
        }
        if (!ssp->ready) {
-            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "feeder doesn't seem to be running\n");
-            send_message("Server has software problem", 3600, true);
+            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
+                "feeder doesn't seem to be running\n"
+            );
+            send_message("Project encountered internal error: feeder not running", 3600, true);
            exit(0);
        }
    }
@ -286,7 +291,9 @@ int main(int argc, char** argv) {

    retval = config.parse_file("..");
    if (retval) {
-        log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "Can't parse config file\n");
+        log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
+            "Can't parse config file\n"
+        );
        send_message("Server can't parse configuration file", 3600, true);
        exit(0);
    }
@ -311,8 +318,8 @@ int main(int argc, char** argv) {
    while(FCGI_Accept() >= 0) {
    counter++;
 #endif
-    if (project_stopped) {
-        send_message("Project is temporarily shut down for maintenance", 3600, true);
+    if (shmem_failed) {
+        send_message("Project encountered internal error: shared memory", 3600, true);
        goto done;
    }
    log_request_info(length);
@ -330,7 +337,9 @@ int main(int argc, char** argv) {
        sprintf(reply_path, "%s%d_%u", REPLY_FILE_PREFIX, g_pid, counter);
        fout = fopen(req_path, "w");
        if (!fout) {
-            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "can't write request file\n");
+            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
+                "can't write request file\n"
+            );
            exit(1);
        }
        copy_stream(stdin, fout);
@ -345,12 +354,16 @@ int main(int argc, char** argv) {

        fin = fopen(req_path, "r");
        if (!fin) {
-            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "can't read request file\n");
+            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
+                "can't read request file\n"
+            );
            exit(1);
        }
        fout = fopen(reply_path, "w");
        if (!fout) {
-            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "can't write reply file\n");
+            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
+                "can't write reply file\n"
+            );
            exit(1);
        }

@ -359,7 +372,9 @@ int main(int argc, char** argv) {
        fclose(fout);
        fin = fopen(reply_path, "r");
        if (!fin) {
-            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL, "can't read reply file\n");
+            log_messages.printf(SCHED_MSG_LOG::MSG_CRITICAL,
+                "can't read reply file\n"
+            );
            exit(1);
        }
        copy_stream(fin, stdout);