mirror of https://github.com/BOINC/boinc.git
*** empty log message ***
svn path=/trunk/boinc/; revision=2344
This commit is contained in:
parent
1938fc43ee
commit
dd4d6d2594
|
@ -6314,3 +6314,22 @@ David Sept 21 2003
|
|||
feeder.C
|
||||
handle_request.C
|
||||
sched_shmem.h
|
||||
|
||||
David Sept 21 2003
|
||||
- Partly debugged the above changes:
|
||||
- The feeder's result enumeration should have no limit;
|
||||
otherwise it may enumerate the same results over and over
|
||||
- If the infeasible_count of a result exceeds a threshold (MAX_INFEASIBLE_COUNT),
|
||||
the feeder marks it as couldnt_send.
|
||||
This prevents a situation where results sit in the work array
|
||||
forever, their infeasible_count approaching infinity
|
||||
- Added show_shmem to sched/ Makefile, and fleshed it out a bit
|
||||
- Show hostID correctly when we get an RPC from the wrong major version
|
||||
|
||||
NOTE: still haven't exercised the case where a result is marked COULDNT_SEND
|
||||
|
||||
sched/
|
||||
Makefile.am
|
||||
feeder.C
|
||||
handle_request.C
|
||||
show_shmem.C
|
||||
|
|
|
@ -5,7 +5,7 @@ include $(top_srcdir)/Makefile.incl
|
|||
noinst_PROGRAMS = \
|
||||
cgi feeder show_shmem file_upload_handler \
|
||||
validate_test validate_trivial make_work transitioner file_deleter \
|
||||
assimilator db_dump update_stats
|
||||
assimilator db_dump update_stats show_shmem
|
||||
|
||||
noinst_LIBRARIES = libsched.a
|
||||
|
||||
|
|
153
sched/feeder.C
153
sched/feeder.C
|
@ -17,14 +17,45 @@
|
|||
// Contributor(s):
|
||||
//
|
||||
|
||||
// -------------------------------
|
||||
//
|
||||
// feeder [-asynch] [-d debug_level]
|
||||
// -asynch fork and run in a separate process
|
||||
//
|
||||
// Creates a shared memory segment containing DB info,
|
||||
// including results/workunits to send.
|
||||
// including the work array (results/workunits to send).
|
||||
// This means that the scheduler CGI program doesn't have to
|
||||
// access the DB to get this info.
|
||||
//
|
||||
// -asynch fork and run in a separate process
|
||||
// Try to keep the work array filled.
|
||||
// This is a little tricky.
|
||||
// We use an enumerator.
|
||||
// The inner loop scans the wu_result table,
|
||||
// looking for empty slots and trying to fill them in.
|
||||
// When the enumerator reaches the end, it is restarted;
|
||||
// hopefully there will be some new workunits.
|
||||
// There are two complications:
|
||||
// - An enumeration may return results already in the array.
|
||||
// So, for each result, we scan the entire array to make sure
|
||||
// it's not there already. Can this be streamlined?
|
||||
// - We must avoid excessive re-enumeration,
|
||||
// especially when the number of results is less than the array size.
|
||||
// Crude approach: if a "collision" (as above) occurred on
|
||||
// a pass through the array, wait a long time (5 sec)
|
||||
//
|
||||
// Checking for infeasible results (i.e. can't be sent to any host):
|
||||
// - the "infeasible_count" field of WU_RESULT keeps track of
|
||||
// how many times the WU_RESULT was infeasible for a host
|
||||
// - the scheduler gives priority to results that have infeasible_count > 0
|
||||
// - If the infeasible_count of any result exceeds MAX_INFEASIBLE_COUNT,
|
||||
// the feeder flags the result as OVER with outcome COULDNT_SEND,
|
||||
// and flags the WU for the transitioner.
|
||||
// - the feeder tries to ensure that the number of WU_RESULTs
|
||||
// with infeasible_count > 0 doesn't exceed MAX_INFEASIBLE
|
||||
// (compiled into feeder).
|
||||
// If it does, then the feeder picks the WU_RESULT with
|
||||
// the largest infeasible_count, marks it COULDNT_SEND as above,
|
||||
// and repeats this until the infeasible count is low enough again
|
||||
|
||||
// Trigger files:
|
||||
// The feeder program periodically checks for two trigger files:
|
||||
|
@ -55,7 +86,8 @@
|
|||
#include "sched_shmem.h"
|
||||
#include "sched_util.h"
|
||||
|
||||
#define RESULTS_PER_ENUM 100
|
||||
#define MAX_INFEASIBLE_COUNT 50
|
||||
|
||||
#define REREAD_DB_FILENAME "reread_db"
|
||||
#define LOCKFILE "feeder.out"
|
||||
#define PIDFILE "feeder.pid"
|
||||
|
@ -82,34 +114,46 @@ int check_reread_trigger() {
|
|||
return 0;
|
||||
}
|
||||
|
||||
// Try to keep the work array filled.
|
||||
// This is a little tricky.
|
||||
// We use an enumerator.
|
||||
// The inner loop scans the wu_result table,
|
||||
// looking for empty slots and trying to fill them in.
|
||||
// When the enumerator reaches the end, it is restarted;
|
||||
// hopefully there will be some new workunits.
|
||||
// There are two complications:
|
||||
// - An enumeration may return results already in the array.
|
||||
// So, for each result, we scan the entire array to make sure
|
||||
// it's not there already. Can this be streamlined?
|
||||
// - We must avoid excessive re-enumeration,
|
||||
// especially when the number of results is less than the array size.
|
||||
// Crude approach: if a "collision" (as above) occurred on
|
||||
// a pass through the array, wait a long time (5 sec)
|
||||
//
|
||||
// Checking for infeasible results (i.e. can't be sent to any host):
|
||||
// - the "infeasible_count" field of WU_RESULT keeps track of
|
||||
// how many times the WU_RESULT was infeasible for a host
|
||||
// - the scheduler gives priority to results that have infeasible_count > 0
|
||||
// - the feeder tries to ensure that the number of WU_RESULTs
|
||||
// with infeasible_count > 0 doesn't exceed MAX_INFEASIBLE
|
||||
// (compiled into feeder).
|
||||
// If it does, then the feeder picks the WU_RESULT with
|
||||
// the largest infeasible_count,
|
||||
// flags the result as OVER with outcome COULDNT_SEND,
|
||||
// flags the WU for the transitioner,
|
||||
// and repeats this until the infeasible count is low enough again
|
||||
static int remove_infeasible(int i) {
|
||||
int retval;
|
||||
DB_RESULT result;
|
||||
DB_WORKUNIT wu;
|
||||
|
||||
WU_RESULT& wu_result = ssp->wu_results[i];
|
||||
wu_result.present = false; // mark as absent
|
||||
result = wu_result.result;
|
||||
wu = wu_result.workunit;
|
||||
|
||||
log_messages.printf(
|
||||
SchedMessages::NORMAL,
|
||||
"[%s] declaring result as unsendable\n",
|
||||
result.name
|
||||
);
|
||||
|
||||
result.server_state = RESULT_SERVER_STATE_OVER;
|
||||
result.outcome = RESULT_OUTCOME_COULDNT_SEND;
|
||||
retval = result.update();
|
||||
if (retval) {
|
||||
log_messages.printf(
|
||||
SchedMessages::CRITICAL,
|
||||
"[%s]: can't update: %d\n",
|
||||
result.name, retval
|
||||
);
|
||||
return retval;
|
||||
}
|
||||
wu.transition_time = time(0);
|
||||
retval = wu.update();
|
||||
if (retval) {
|
||||
log_messages.printf(
|
||||
SchedMessages::CRITICAL,
|
||||
"[%s]: can't update: %d\n",
|
||||
wu.name, retval
|
||||
);
|
||||
return retval;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void scan_work_array(
|
||||
DB_RESULT& result, char* clause,
|
||||
|
@ -123,7 +167,9 @@ static void scan_work_array(
|
|||
for (i=0; i<ssp->nwu_results; i++) {
|
||||
WU_RESULT& wu_result = ssp->wu_results[i];
|
||||
if (wu_result.present) {
|
||||
if (wu_result.infeasible_count > 0) {
|
||||
if (wu_result.infeasible_count > MAX_INFEASIBLE_COUNT) {
|
||||
remove_infeasible(i);
|
||||
} else if (wu_result.infeasible_count > 0) {
|
||||
ninfeasible++;
|
||||
}
|
||||
} else {
|
||||
|
@ -135,7 +181,9 @@ try_again:
|
|||
// there's no point in doing it again.
|
||||
//
|
||||
if (restarted_enum) {
|
||||
log_messages.printf(SchedMessages::DEBUG, "already restarted enum on this array scan\n");
|
||||
log_messages.printf(SchedMessages::DEBUG,
|
||||
"already restarted enum on this array scan\n"
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -143,9 +191,13 @@ try_again:
|
|||
//
|
||||
restarted_enum = true;
|
||||
retval = result.enumerate(clause);
|
||||
log_messages.printf(SchedMessages::DEBUG, "restarting enumeration\n");
|
||||
log_messages.printf(SchedMessages::DEBUG,
|
||||
"restarting enumeration\n"
|
||||
);
|
||||
if (retval) {
|
||||
log_messages.printf(SchedMessages::DEBUG, "enumeration restart returned nothing\n");
|
||||
log_messages.printf(SchedMessages::DEBUG,
|
||||
"enumeration restart returned nothing\n"
|
||||
);
|
||||
no_wus = true;
|
||||
break;
|
||||
}
|
||||
|
@ -205,10 +257,8 @@ try_again:
|
|||
}
|
||||
}
|
||||
|
||||
int remove_most_infeasible() {
|
||||
int i, max, imax=-1, retval;
|
||||
DB_RESULT result;
|
||||
DB_WORKUNIT wu;
|
||||
static int remove_most_infeasible() {
|
||||
int i, max, imax=-1;
|
||||
|
||||
max = 0;
|
||||
for (i=0; i<ssp->nwu_results; i++) {
|
||||
|
@ -220,26 +270,7 @@ int remove_most_infeasible() {
|
|||
}
|
||||
if (max == 0) return -1; // nothing is infeasible
|
||||
|
||||
WU_RESULT& wu_result = ssp->wu_results[imax];
|
||||
wu_result.present = false; // mark as absent
|
||||
result = wu_result.result;
|
||||
wu = wu_result.workunit;
|
||||
|
||||
log_messages.printf(
|
||||
SchedMessages::NORMAL,
|
||||
"[%s] declaring result as unsendable\n",
|
||||
result.name
|
||||
);
|
||||
|
||||
result.server_state = RESULT_SERVER_STATE_OVER;
|
||||
result.outcome = RESULT_OUTCOME_COULDNT_SEND;
|
||||
retval = result.update();
|
||||
if (retval) return retval;
|
||||
wu.transition_time = time(0);
|
||||
retval = wu.update();
|
||||
if (retval) return retval;
|
||||
|
||||
return 0;
|
||||
return remove_infeasible(imax);
|
||||
}
|
||||
|
||||
void feeder_loop() {
|
||||
|
@ -248,8 +279,8 @@ void feeder_loop() {
|
|||
bool no_wus;
|
||||
char clause[256];
|
||||
|
||||
sprintf(clause, "where server_state=%d order by random limit %d",
|
||||
RESULT_SERVER_STATE_UNSENT, RESULTS_PER_ENUM
|
||||
sprintf(clause, "where server_state=%d order by random",
|
||||
RESULT_SERVER_STATE_UNSENT
|
||||
);
|
||||
|
||||
while (1) {
|
||||
|
|
|
@ -691,10 +691,14 @@ static void scan_work_array(
|
|||
}
|
||||
|
||||
result = wu_result.result;
|
||||
wu_result.present = false;
|
||||
|
||||
retval = add_wu_to_reply(wu, reply, platform, ss);
|
||||
if (retval) continue;
|
||||
if (retval) {
|
||||
wu_result.infeasible_count++;
|
||||
continue;
|
||||
}
|
||||
|
||||
wu_result.present = false;
|
||||
|
||||
log_messages.printf(
|
||||
SchedMessages::NORMAL,
|
||||
|
@ -849,8 +853,8 @@ bool wrong_major_version(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
|
|||
strcpy(reply.message_priority, "low");
|
||||
log_messages.printf(
|
||||
SchedMessages::NORMAL,
|
||||
"[HOST#%d] Wrong major version from user: wanted %d, got %d\n",
|
||||
reply.host.id,
|
||||
"[HOST#%d] [auth %s] Wrong major version from user: wanted %d, got %d\n",
|
||||
sreq.hostid, sreq.authenticator,
|
||||
MAJOR_VERSION, sreq.core_client_major_version
|
||||
);
|
||||
return true;
|
||||
|
@ -858,8 +862,7 @@ bool wrong_major_version(SCHEDULER_REQUEST& sreq, SCHEDULER_REPLY& reply) {
|
|||
return false;
|
||||
}
|
||||
|
||||
inline static const char* get_remote_addr()
|
||||
{
|
||||
inline static const char* get_remote_addr() {
|
||||
const char * r = getenv("REMOTE_ADDR");
|
||||
return r ? r : "?.?.?.?";
|
||||
}
|
||||
|
|
|
@ -46,7 +46,16 @@ int main() {
|
|||
ssp = (SCHED_SHMEM*)p;
|
||||
retval = ssp->verify();
|
||||
printf("ready: %d\n", ssp->ready);
|
||||
printf("nwu_results: %d\n", ssp->nwu_results);
|
||||
printf("max_wu_results: %d\n", ssp->max_wu_results);
|
||||
for (i=0; i<ssp->max_wu_results; i++) {
|
||||
printf("%d. %s\n", i, ssp->wu_results[i].present?"present":"absent");
|
||||
WU_RESULT& wu_result = ssp->wu_results[i];
|
||||
if (wu_result.present) {
|
||||
printf("%d: present; infeasible_count %d; result %s\n",
|
||||
i, wu_result.infeasible_count, wu_result.result.name
|
||||
);
|
||||
} else {
|
||||
printf("%d: absent\n", i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue