boinc/sched/validate_util2.cpp

275 lines
8.2 KiB
C++

// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
// Simple validator framework:
// Lets you create a custom validator by supplying three functions.
// See https://github.com/BOINC/boinc/wiki/ValidationSimple
//
#include "config.h"
#include <vector>
#include <cstdlib>
#include <string>
#include "boinc_db.h"
#include "error_numbers.h"
#include "sched_config.h"
#include "sched_msgs.h"
#include "validator.h"
#include "validate_util.h"
#include "validate_util2.h"
using std::vector;
// Given a set of results:
// 1) call init_result() for each one;
// this detects results with bad or missing output files
// 2) if # of good results is >= wu.min_quorum,
// check for a canonical result,
// i.e. a set of at least min_quorum/2+1 results for which
// that are equivalent according to check_pair().
//
// input invariants:
// for each result:
// result.outcome == SUCCESS
// result.validate_state == INIT
//
// Outputs:
// canonicalid: the ID of canonical result, if any
// result.outcome, result.validate_state
// modified; caller must update DB
// retry: set to true if some result had a transient failure
// (i.e. there was a broken NFS mount).
// Should call this again after a while.
//
// always return zero
//
int check_set(
vector<RESULT>& results, WORKUNIT& wu,
DB_ID_TYPE& canonicalid, double&, bool& retry
) {
vector<void*> data;
vector<bool> had_error;
int i, j, neq = 0, n, retval=0;
int min_valid = wu.min_quorum/2+1;
retry = false;
n = results.size();
data.resize(n);
had_error.resize(n);
// Initialize results
// For each one we potentially allocate data,
// so always exit via goto cleanup:
// to free this mem
for (i=0; i<n; i++) {
data[i] = NULL;
had_error[i] = false;
}
int good_results = 0;
int suspicious_results = 0;
for (i=0; i<n; i++) {
retval = init_result(results[i], data[i]);
switch (retval) {
case 0:
good_results++;
break;
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
log_messages.printf(MSG_CRITICAL,
"check_set: init_result([RESULT#%lu %s]) transient failure\n",
results[i].id, results[i].name
);
retry = true;
had_error[i] = true;
break;
case VAL_RESULT_SUSPICIOUS:
log_messages.printf(MSG_NORMAL,
"[RESULT#%lu %s] considered to be suspicious\n",
results[i].id, results[i].name
);
suspicious_results++;
break;
default:
log_messages.printf(MSG_CRITICAL,
"check_set: init_result([RESULT#%lu %s]) failed: %s\n",
results[i].id, results[i].name, boincerror(retval)
);
results[i].outcome = RESULT_OUTCOME_VALIDATE_ERROR;
results[i].validate_state = VALIDATE_STATE_INVALID;
had_error[i] = true;
break;
}
}
// don't count a single "suspicious" result as "good",
// but do if there are more results to compare it with
//
if (suspicious_results > 1 || good_results > 0) {
good_results += suspicious_results;
}
// if there are "suspicious" results and min_quorum < g_app->target_nresults
// (i.e. adaptive replication), raise min_quorum to g_app->target_nresults
// the abs() is there for Einstein@Home-specific use of app->target_nresults,
// please leave it in there.
//
if (suspicious_results && wu.min_quorum < abs(g_app->target_nresults)) {
log_messages.printf(MSG_NORMAL, "suspicious result - raising quorum\n");
wu.min_quorum = abs(g_app->target_nresults);
}
if (good_results < wu.min_quorum) goto cleanup;
// for each result, count how many it matches (including itself)
// If this is at least min_valid, it's the canonical result
for (i=0; i<n; i++) {
if (had_error[i]) continue;
vector<bool> matches;
matches.resize(n);
neq = 0;
for (j=0; j<n; j++) {
if (had_error[j]) continue;
if (i == j) {
++neq;
matches[j] = true;
continue;
}
bool match = false;
retval = compare_results(
results[i], data[i], results[j], data[j], match
);
switch (retval) {
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
retry = true;
goto cleanup;
case 0:
if (match) {
++neq;
matches[j] = true;
}
break;
default:
log_messages.printf(MSG_CRITICAL,
"check_set(): compare_results([RESULT#%lu %s], [RESULT#%lu %s]) failed\n",
results[i].id, results[i].name, results[j].id, results[j].name
);
}
}
if (neq >= min_valid) {
// set validate state for each result
//
for (j=0; j!=n; j++) {
if (had_error[j]) continue;
results[j].validate_state = matches[j] ? VALIDATE_STATE_VALID : VALIDATE_STATE_INVALID;
}
canonicalid = results[i].id;
break;
}
}
cleanup:
for (i=0; i<n; i++) {
cleanup_result(results[i], data[i]);
}
return 0;
}
// a straggler instance has arrived after the WU is already validated.
// r1 is the new result; r2 is canonical result
//
void check_pair(RESULT& r1, RESULT& r2, bool& retry) {
void* data1;
void* data2;
int retval;
bool match;
retry = false;
retval = init_result(r1, data1);
switch (retval) {
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
log_messages.printf(MSG_CRITICAL,
"check_pair: init_result([RESULT#%lu %s]) transient failure 1\n",
r1.id, r1.name
);
retry = true;
return;
case 0:
break;
default:
log_messages.printf(MSG_CRITICAL,
"check_pair: init_result([RESULT#%lu %s]) perm failure 1\n",
r1.id, r1.name
);
r1.outcome = RESULT_OUTCOME_VALIDATE_ERROR;
r1.validate_state = VALIDATE_STATE_INVALID;
return;
}
retval = init_result(r2, data2);
switch (retval) {
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
log_messages.printf(MSG_CRITICAL,
"check_pair: init_result([RESULT#%lu %s]) transient failure 2\n",
r2.id, r2.name
);
cleanup_result(r1, data1);
retry = true;
return;
case 0:
break;
default:
log_messages.printf(MSG_CRITICAL,
"check_pair: init_result([RESULT#%lu %s]) perm failure2\n",
r2.id, r2.name
);
cleanup_result(r1, data1);
r1.outcome = RESULT_OUTCOME_VALIDATE_ERROR;
r1.validate_state = VALIDATE_STATE_INVALID;
return;
}
retval = compare_results(r1, data1, r2, data2, match);
switch (retval) {
case ERR_OPENDIR:
case VAL_RESULT_TRANSIENT_ERROR:
retry = true;
break;
case 0:
r1.validate_state = match?VALIDATE_STATE_VALID:VALIDATE_STATE_INVALID;
break;
default:
log_messages.printf(MSG_CRITICAL,
"check_pair: compare_results([RESULT#%lu RESULT#%lu]) perm failure\n",
r1.id, r2.id
);
r1.validate_state = VALIDATE_STATE_INVALID;
break;
}
cleanup_result(r1, data1);
cleanup_result(r2, data2);
}