// boinc/client/app_config.cpp
// (184 lines, 6.1 KiB, C++)

// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2012 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
#include "filesys.h"
#include "client_msgs.h"
#include "client_state.h"
#include "client_types.h"
#include "project.h"
#include "result.h"
#include "cc_config.h"
#include "app_config.h"
// Raise a user alert saying that the project's app_config.xml names
// an application ('name') that wasn't found,
// followed by the string returned by app_list_string(p).
//
static void show_warning(PROJECT* p, char* name) {
    const string known_apps = app_list_string(p);
    msg_printf(
        p,
        MSG_USER_ALERT,
        "Your app_config.xml file refers to an unknown application '%s'. Known applications: %s",
        name,
        known_apps.c_str()
    );
}
// Having parsed a project's app_config.xml, put the config into effect.
//
// Pass 1 (entries in app_configs): copy max_concurrent,
// fraction_done_exact and report_results_immediately to the matching APP.
// If both gpu_gpu_usage and gpu_cpu_usage are nonzero,
// also override usage/avg_ncpus of each of that app's versions
// whose gpu_usage.rsc_type is nonzero.
//
// Pass 2 (entries in app_version_configs): for every version of the
// named app with the same plan class, override cmdline (if non-empty),
// avg_ncpus (if nonzero) and GPU usage (if ngpus is nonzero).
// An entry matching no app version always produces a user alert.
//
// An unknown app name produces a user alert only if show_warnings is set.
// Returns ERR_XML_PARSE if any such alert was shown, else 0.
//
int APP_CONFIGS::config_app_versions(PROJECT* p, bool show_warnings) {
    bool warned = false;

    // pass 1: per-app settings
    //
    for (unsigned int ai = 0; ai < app_configs.size(); ai++) {
        APP_CONFIG& cfg = app_configs[ai];
        APP* target = gstate.lookup_app(p, cfg.name);
        if (!target) {
            if (show_warnings) {
                show_warning(p, cfg.name);
                warned = true;
            }
            continue;
        }

        target->max_concurrent = cfg.max_concurrent;
        target->fraction_done_exact = cfg.fraction_done_exact;
        target->report_results_immediately = cfg.report_results_immediately;

        // the usage overrides apply only when both values are nonzero
        //
        const bool have_usage = cfg.gpu_gpu_usage && cfg.gpu_cpu_usage;
        if (!have_usage) continue;

        for (unsigned int vi = 0; vi < gstate.app_versions.size(); vi++) {
            APP_VERSION* ver = gstate.app_versions[vi];
            if (ver->app == target && ver->gpu_usage.rsc_type) {
                ver->gpu_usage.usage = cfg.gpu_gpu_usage;
                ver->avg_ncpus = cfg.gpu_cpu_usage;
            }
        }
    }

    // pass 2: per-app-version settings
    //
    for (unsigned int ci = 0; ci < app_version_configs.size(); ci++) {
        APP_VERSION_CONFIG& vcfg = app_version_configs[ci];
        APP* target = gstate.lookup_app(p, vcfg.app_name);
        if (!target) {
            if (show_warnings) {
                show_warning(p, vcfg.app_name);
                warned = true;
            }
            continue;
        }

        const bool have_cmdline = strlen(vcfg.cmdline) > 0;
        unsigned int n_matched = 0;
        for (unsigned int vi = 0; vi < gstate.app_versions.size(); vi++) {
            APP_VERSION* ver = gstate.app_versions[vi];
            if (ver->app != target || strcmp(ver->plan_class, vcfg.plan_class) != 0) {
                continue;
            }
            n_matched++;
            if (have_cmdline) {
                safe_strcpy(ver->cmdline, vcfg.cmdline);
            }
            if (vcfg.avg_ncpus) {
                ver->avg_ncpus = vcfg.avg_ncpus;
            }
            if (vcfg.ngpus) {
                ver->gpu_usage.usage = vcfg.ngpus;
            }
        }
        if (n_matched == 0) {
            msg_printf(p, MSG_USER_ALERT,
                "Entry in app_config.xml for app '%s', plan class '%s' doesn't match any app versions",
                vcfg.app_name, vcfg.plan_class
            );
        }
    }

    return warned ? ERR_XML_PARSE : 0;
}
void max_concurrent_init() {
for (unsigned int i=0; i<gstate.apps.size(); i++) {
gstate.apps[i]->app_n_concurrent = 0;
}
for (unsigned int i=0; i<gstate.projects.size(); i++) {
gstate.projects[i]->proj_n_concurrent = 0;
}
}
// Undo the effects of an app_config.xml that no longer exists.
// Clears the project's parsed app configs and its
// report_results_immediately flag, then resets max_concurrent and
// report_results_immediately on each of the project's apps.
// NOTE: that's all we can do here;
// we can't restore device usage info because we don't have it.
// It will be restored on next scheduler RPC.
//
static void clear_app_config(PROJECT* p) {
    p->app_configs.clear();
    p->report_results_immediately = false;

    for (size_t k = 0; k < gstate.apps.size(); ++k) {
        APP* candidate = gstate.apps[k];
        if (candidate->project == p) {
            candidate->max_concurrent = 0;
            candidate->report_results_immediately = false;
        }
    }
}
static void print_msgs(vector<string> msgs, PROJECT* p) {
for (unsigned int i=0; i<msgs.size(); i++) {
2017-05-12 23:09:31 +00:00
msg_printf_notice(p, false, NULL, "%s", msgs[i].c_str());
}
}
// check for app_config.xml files, and parse them.
// Called at startup and on read_cc_config() RPC
//
void check_app_config(const char* prefix) {
char path[MAXPATHLEN];
FILE* f;
for (unsigned int i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
snprintf(path, sizeof(path), "%s%s/%s",
prefix, p->project_dir(), APP_CONFIG_FILE_NAME
);
f = boinc_fopen(path, "r");
if (!f) {
clear_app_config(p);
continue;
}
msg_printf(p, MSG_INFO, "Found %s", APP_CONFIG_FILE_NAME);
vector<string> msgs;
int retval = p->app_configs.parse_file(f, msgs, log_flags);
print_msgs(msgs, p);
if (!retval) {
p->report_results_immediately = p->app_configs.report_results_immediately;
retval = p->app_configs.config_app_versions(p, true);
if (!retval) {
notices.remove_notices(p, REMOVE_APP_CONFIG_MSG);
}
}
fclose(f);
}
}
// Change-history note captured from version control
// (2018-12-28 20:55:05 +00:00); it describes a revision,
// not the function that follows:
//
// Avoid starvation when max_concurrent is used, and related fixes.
//
// Synopsis: max concurrent was being enforced in the last stage of
// CPU sched, but not in earlier stages, or in work fetch.
// This caused starvation in some cases.
// Fix this by modeling max concurrent in RR sim and make_run_list().
//
// - CPU sched: model and enforce max concurrent limits in building
//   run list for CPU jobs;
//   otherwise the list has jobs we can't actually run
// - RR simulation: model and enforce max concurrent limits
// - RR sim: fix bug in calculation of # idle instances
// - RR sim: model unavailability of GPUs
//   e.g. if we can't run GPU jobs we can potentially run more CPU jobs
// - work fetch: if a project is at a max concurrent limit,
//   don't fetch work from it.
//   The jobs we get (possibly) wouldn't be runnable.
//   NOTE: we currently provide max concurrent limits
//   at both project and app level.
//   The problem with app level is that apps can have versions that
//   use different resources.
//   It would be better to have limits at the resource level instead.
// - In many cases (e.g. job completion) CPU sched and work fetch
//   are both done back to back. Each of them does RR simulation.
//   Only need to do this once (efficiency).
// - Show max concurrent settings in startup messages
// - Make max runnable jobs (1000) into a #define
// - Fix removal of "can't fetch work" notices
// - Make "can't fetch work" notices resource-specific;
//   the reasons may differ between resources
// - Get rid of WF_DEBUG macro;
//   just print everything if log_flags.work_fetch_debug is set.
// - Change project- and resource-level work-fetch reason codes
//   (DONT_FETCH_PREFS etc.) from #defines to enums,
//   and give them prefixes RSC_REASON and PROJECT_REASON
// - Fix bug where the return of compute_project_reason() wasn't
//   actually being stored in project.work_fetch.
// - Add work-fetch reason MAX_CONCURRENT
//   (project is at max concurrent limit)
//
void show_app_config() {
if (!have_max_concurrent) return;
for (unsigned int i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
if (p->app_configs.project_max_concurrent) {
msg_printf(p, MSG_INFO,
"Max %d concurrent jobs", p->app_configs.project_max_concurrent
);
}
}
for (unsigned int i=0; i<gstate.apps.size(); i++) {
APP* app = gstate.apps[i];
if (app->max_concurrent) {
msg_printf(app->project, MSG_INFO,
"%s: Max %d concurrent jobs", app->name, app->max_concurrent
);
}
}
}