From f44c9910e704f3e932fdcdbd7a1f9359d36e2be3 Mon Sep 17 00:00:00 2001 From: David Anderson Date: Tue, 12 Jul 2011 20:44:28 +0000 Subject: [PATCH] - validator: if job FLOPs estimates are accurate, PFC values should be around 1. If they differ from 1 by a factor of > 1e4, ignore them, and put an error message into the validator log - validator: if get_pfc() fails because an app version is missing from the DB (i.e. the project deleted it) keep going so we don't reprocess the WU forever svn path=/trunk/boinc/; revision=23837 --- checkin_notes | 12 ++++++++++++ sched/credit.cpp | 33 +++++++++++++++++++++++++-------- 2 files changed, 37 insertions(+), 8 deletions(-) diff --git a/checkin_notes b/checkin_notes index 1542094805..af0fd6a523 100644 --- a/checkin_notes +++ b/checkin_notes @@ -4066,3 +4066,15 @@ David 12 July 2011 app_details.php inc/ boinc_db.inc + +David 12 July 2011 + - validator: if job FLOPs estimates are accurate, + PFC values should be around 1. + If they differ from 1 by a factor of > 1e4, ignore them, + and put an error message into the validator log + - validator: if get_pfc() fails because an app version is + missing from the DB (i.e. the project deleted it) + keep going so we don't reprocess the WU forever + + sched/ + credit.cpp diff --git a/sched/credit.cpp b/sched/credit.cpp index 16d2810b31..48171871a6 100644 --- a/sched/credit.cpp +++ b/sched/credit.cpp @@ -337,6 +337,17 @@ inline double wu_estimated_credit(WORKUNIT& wu, DB_APP& app) { return wu_estimated_pfc(wu, app)*COBBLESTONE_SCALE; } +inline bool is_pfc_sane(double x, WORKUNIT& wu, DB_APP& app) { + if (x > 1e4 || x < 1e-4) { + log_messages.printf(MSG_CRITICAL, + "Bad FLOP ratio (%f): check workunit.rsc_fpops_est for %s (app %s)\n", + x, wu.name, app.name + ); + return false; + } + return true; +} + // Compute or estimate "claimed peak FLOP count". // Possibly update host_app_version records and write to DB. // Possibly update app_version records in memory and let caller write to DB, @@ -638,7 +649,10 @@ int get_pfc( raw_pfc, wu.rsc_fpops_est ); } - avp->pfc_samples.push_back(raw_pfc/wu.rsc_fpops_est); + double x = raw_pfc / wu.rsc_fpops_est; + if (is_pfc_sane(x, wu, app)) { + avp->pfc_samples.push_back(x); + } } if (config.debug_credit) { @@ -649,11 +663,11 @@ int get_pfc( (r.received_time - r.sent_time) ); } - - hav.pfc.update( - raw_pfc / wu.rsc_fpops_est, - HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT - ); + + double x = raw_pfc / wu.rsc_fpops_est; + if (is_pfc_sane(x, wu, app)) { + hav.pfc.update(x, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT); + } hav.et.update_var( r.elapsed_time / wu.rsc_fpops_est, HAV_AVG_THRESH, HAV_AVG_WEIGHT, HAV_AVG_LIMIT @@ -727,7 +741,7 @@ int assign_credit_set( log_messages.printf(MSG_CRITICAL, "get_pfc() error: %s\n", boincerror(retval) ); - return retval; + continue; } else { if (config.debug_credit) { log_messages.printf(MSG_NORMAL, @@ -766,9 +780,12 @@ int assign_credit_set( double x; if (normal.size()) { x = low_average(normal); - } else { + } else if (approx.size()) { x = vec_min(approx); + } else { + x = 0; } + x *= COBBLESTONE_SCALE; if (config.debug_credit) { log_messages.printf(MSG_NORMAL,