// boinc/client/work_fetch.cpp — 475 lines, 14 KiB, C++
// This file is part of BOINC.
// http://boinc.berkeley.edu
// Copyright (C) 2008 University of California
//
// BOINC is free software; you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License
// as published by the Free Software Foundation,
// either version 3 of the License, or (at your option) any later version.
//
// BOINC is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
// See the GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public License
// along with BOINC. If not, see <http://www.gnu.org/licenses/>.
#include "client_types.h"
#include "client_msgs.h"
#ifdef SIM
#include "sim.h"
#else
#include "client_state.h"
#endif
#include "work_fetch.h"
using std::vector;
// Global work-fetch state: one RSC_WORK_FETCH per resource type,
// plus the overall WORK_FETCH coordinator.
RSC_WORK_FETCH cuda_work_fetch;
RSC_WORK_FETCH cpu_work_fetch;
WORK_FETCH work_fetch;
// Return the per-project work-fetch state for this resource type.
//
RSC_PROJECT_WORK_FETCH& RSC_WORK_FETCH::project_state(PROJECT* p) {
    switch(rsc_type) {
    case RSC_TYPE_CPU: return p->cpu_pwf;
    case RSC_TYPE_CUDA: return p->cuda_pwf;
    default:
        // rsc_type should always be one of the above; fall back to the
        // CPU state rather than flowing off the end of a value-returning
        // function, which is undefined behavior.
        return p->cpu_pwf;
    }
}
// A project may have work for this resource unless it's
// currently backed off for it.
//
bool RSC_WORK_FETCH::may_have_work(PROJECT* p) {
    RSC_PROJECT_WORK_FETCH& st = project_state(p);
    return gstate.now > st.backoff_time;
}
// Reset per-resource accumulators before a round-robin simulation pass.
//
void RSC_WORK_FETCH::rr_init() {
    shortfall = 0;
    nidle_now = 0;
    runnable_resource_share = 0;
    total_resource_share = 0;
}
// Reset state for all resources at the start of an RR simulation.
//
void WORK_FETCH::rr_init() {
    estimated_delay = 0;
    cpu_work_fetch.rr_init();
    if (coproc_cuda) cuda_work_fetch.rr_init();
}
// Add to the resource-wide shortfall for an interval of length d_time
// during which only nused of our instances are in use.
//
void RSC_WORK_FETCH::accumulate_shortfall(double d_time, double nused) {
    double idle = ninstances - nused;
    if (idle <= 0) return;
    shortfall += idle*d_time;
}
// Accumulate this project's shortfall for an interval of length d_time
// in which it used nused instances of the given resource.
//
void RSC_PROJECT_WORK_FETCH::accumulate_shortfall(
    RSC_WORK_FETCH& rwf,
    PROJECT* p,
    double d_time,
    double nused
) {
    // The project's fair share of instances, based on resource share.
    double frac = 1;
    if (rwf.total_resource_share) {
        frac = p->resource_share/rwf.total_resource_share;
    }
    double unfilled = rwf.ninstances*frac - nused;
    if (unfilled > 0) {
        shortfall += unfilled*d_time;
    }
}
// Pick the project with the highest overall debt among those that
// can currently be asked for work for this resource.
// Ties go to the later project in the list, as before.
//
PROJECT* RSC_WORK_FETCH::choose_project() {
    PROJECT* best = NULL;
    for (unsigned i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
        if (p->non_cpu_intensive) continue;
        if (!p->can_request_work()) continue;
        if (!may_have_work(p)) continue;
        if (!best || p->pwf.overall_debt >= best->pwf.overall_debt) {
            best = p;
        }
    }
    return best;
}
// Compute each project's overall debt: its CPU debt plus
// its CUDA debt weighted by relative GPU speed.
//
void WORK_FETCH::set_overall_debts() {
    for (unsigned i=0; i<gstate.projects.size(); i++) {
        PROJECT* p = gstate.projects[i];
        double debt = p->cpu_pwf.debt;
        if (coproc_cuda) {
            debt += cuda_work_fetch.speed*p->cuda_pwf.debt;
        }
        p->pwf.overall_debt = debt;
    }
}
void RSC_WORK_FETCH::print_state(char* name) {
msg_printf(0, MSG_INFO,
"[wfd] %s: shortfall %.2f nidle %.2f total RS %.2f runnable RS %.2f",
name,
shortfall, nidle_now,
total_resource_share, runnable_resource_share
);
for (unsigned int i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
RSC_PROJECT_WORK_FETCH& pwf = project_state(p);
msg_printf(p, MSG_INFO,
"[wfd] %s: shortfall %.2f nidle %.2f",
name, pwf.shortfall, pwf.nidle_now
);
}
}
void WORK_FETCH::print_state() {
msg_printf(0, MSG_INFO, "[wfd] ------- start work fetch state -------");
cpu_work_fetch.print_state("CPU");
if (coproc_cuda) {
cuda_work_fetch.print_state("CUDA");
}
for (unsigned int i=0; i<gstate.projects.size(); i++) {
PROJECT* p = gstate.projects[i];
msg_printf(p, MSG_INFO, "[wfd] overall_debt %f", p->pwf.overall_debt);
}
msg_printf(0, MSG_INFO, "[wfd] ------- end work fetch state -------");
}
// Log the work request we're about to make to project p.
//
static void print_req(PROJECT* p) {
    RSC_PROJECT_WORK_FETCH& cpu = p->cpu_pwf;
    RSC_PROJECT_WORK_FETCH& cuda = p->cuda_pwf;
    msg_printf(p, MSG_INFO,
        "[wfd] request: CPU (%.2f sec, %.2f) CUDA (%.2f sec, %.2f)",
        cpu.shortfall, cpu.nidle_now,
        cuda.shortfall, cuda.nidle_now
    );
}
// choose a project to fetch work from.
// Priority order: an idle GPU, then an idle CPU, then a GPU
// shortfall, then a CPU shortfall.
//
PROJECT* WORK_FETCH::choose_project() {
PROJECT* p = 0;
// refresh shortfall/idle counts and per-project debts first
gstate.rr_simulation();
set_overall_debts();
// if a resource is currently idle, get work for it;
// give GPU priority over CPU
//
if (coproc_cuda && cuda_work_fetch.nidle_now) {
p = cuda_work_fetch.choose_project();
if (p) {
// GPU-only request: zero the CPU part so we don't ask for CPU seconds
p->cpu_pwf.shortfall = 0;
}
}
if (!p && cpu_work_fetch.nidle_now) {
p = cpu_work_fetch.choose_project();
if (p) {
// CPU-only request: zero the GPU part
p->cuda_pwf.shortfall = 0;
}
}
// no idle resource: fetch for whichever resource has a shortfall,
// GPU first
if (!p && coproc_cuda && cuda_work_fetch.shortfall) {
p = cuda_work_fetch.choose_project();
}
if (!p && cpu_work_fetch.shortfall) {
p = cpu_work_fetch.choose_project();
}
if (log_flags.work_fetch_debug) {
print_state();
if (p) {
print_req(p);
} else {
msg_printf(0, MSG_INFO, "No project chosen for work fetch");
}
}
return p;
}
// Charge dt seconds of an active task's resource usage to its
// project's and the global per-resource debt-interval accumulators.
//
void WORK_FETCH::accumulate_inst_sec(ACTIVE_TASK* atp, double dt) {
    RESULT* rp = atp->result;
    PROJECT* proj = rp->project;
    double cpu_secs = dt*rp->avp->avg_ncpus;
    proj->cpu_pwf.secs_this_debt_interval += cpu_secs;
    cpu_work_fetch.secs_this_debt_interval += cpu_secs;
    if (coproc_cuda) {
        double cuda_secs = dt*coproc_cuda->used;
        proj->cuda_pwf.secs_this_debt_interval += cuda_secs;
        cuda_work_fetch.secs_this_debt_interval += cuda_secs;
    }
}
// Update long-term debts for this resource at the end of a
// debt accounting interval.  A debt-eligible project's debt grows by
// its resource-share fraction of the total work done, and shrinks by
// the work it actually got.  Debts are then normalized to zero mean
// across all projects.
//
void RSC_WORK_FETCH::update_debts() {
    unsigned int i;
    int nprojects = 0;
    double ders = 0;
    PROJECT* p;

    // sum resource shares of debt-eligible projects
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
        RSC_PROJECT_WORK_FETCH& w = project_state(p);
        if (!w.debt_eligible(p)) continue;
        ders += p->resource_share;
    }
    double total_debt = 0;
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
        RSC_PROJECT_WORK_FETCH& w = project_state(p);
        // guard ders: all eligible projects could have zero resource share,
        // which would otherwise make share_frac NaN
        if (ders && w.debt_eligible(p)) {
            double share_frac = p->resource_share/ders;
            w.debt += share_frac*secs_this_debt_interval - w.secs_this_debt_interval;
        }
        total_debt += w.debt;
        nprojects++;
    }
    if (!nprojects) return;     // no projects: nothing to normalize (avoid /0)
    // normalize so mean is zero,
    //
    double avg_debt = total_debt / nprojects;
    for (i=0; i<gstate.projects.size(); i++) {
        p = gstate.projects[i];
        RSC_PROJECT_WORK_FETCH& w = project_state(p);
        w.debt -= avg_debt;
    }
}
// A project accrues debt for this resource unless it's currently
// backed off for it or suspended by the user.
//
bool RSC_PROJECT_WORK_FETCH::debt_eligible(PROJECT* p) {
    return !(backoff_interval > 0 || p->suspended_via_gui);
}
// Write the work-request portion of a scheduler request message.
// Shortfalls in (0,1) are rounded up to 1 so a nonzero request
// isn't truncated to zero by the server.
//
void WORK_FETCH::write_request(PROJECT* p, FILE* f) {
    if (p->cpu_pwf.shortfall > 0 && p->cpu_pwf.shortfall < 1) {
        p->cpu_pwf.shortfall = 1;
    }
    double work_req_seconds = p->cpu_pwf.shortfall;
    fprintf(f,
        " <cpu_req_seconds>%f</cpu_req_seconds>\n"
        " <cpu_ninstances>%f</cpu_ninstances>\n",
        p->cpu_pwf.shortfall,
        p->cpu_pwf.nidle_now
    );
    if (coproc_cuda) {
        // clamp before comparing, so <work_req_seconds> agrees with the
        // clamped <cuda_req_seconds> we print (the CPU path clamps first too)
        if (p->cuda_pwf.shortfall > 0 && p->cuda_pwf.shortfall < 1) {
            p->cuda_pwf.shortfall = 1;
        }
        if (p->cuda_pwf.shortfall > work_req_seconds) {
            work_req_seconds = p->cuda_pwf.shortfall;
        }
        fprintf(f,
            " <cuda_req_seconds>%f</cuda_req_seconds>\n"
            " <cuda_ninstances>%f</cuda_ninstances>\n",
            p->cuda_pwf.shortfall,
            p->cuda_pwf.nidle_now
        );
    }
    fprintf(f,
        " <work_req_seconds>%f</work_req_seconds>\n",
        work_req_seconds
    );
}
// we just got a scheduler reply with the given jobs.
// Placeholder: no per-result bookkeeping is done yet.
// (The original body looped over new_results with an unused local,
// generating a compiler warning but doing no work.)
//
void WORK_FETCH::handle_reply(PROJECT* p, vector<RESULT*> new_results) {
    // parameters kept for interface stability; silence unused warnings
    (void)p;
    (void)new_results;
}
// Request a token amount of work for each resource the first
// time we contact a project.
//
void WORK_FETCH::set_initial_work_request(PROJECT* p) {
    p->cuda_pwf.shortfall = 1;
    p->cpu_pwf.shortfall = 1;
}
// One-time initialization of per-resource work-fetch state.
//
void WORK_FETCH::init() {
    cpu_work_fetch.rsc_type = RSC_TYPE_CPU;
    cpu_work_fetch.ninstances = gstate.ncpus;
    if (!coproc_cuda) return;
    cuda_work_fetch.rsc_type = RSC_TYPE_CUDA;
    cuda_work_fetch.ninstances = coproc_cuda->count;
    // GPU speed relative to the host CPU, used to weight CUDA debt
    cuda_work_fetch.speed = coproc_cuda->flops_estimate()/gstate.host_info.p_fpops;
}
////////////////////////
// Count, per project, how many results are currently uploading files.
//
void CLIENT_STATE::compute_nuploading_results() {
    for (unsigned int i=0; i<projects.size(); i++) {
        projects[i]->nuploading_results = 0;
    }
    for (unsigned int i=0; i<results.size(); i++) {
        RESULT* rp = results[i];
        if (rp->state() != RESULT_FILES_UPLOADING) continue;
        rp->project->nuploading_results++;
    }
}
// A project is runnable if it isn't user-suspended and
// any of its results is runnable.
//
bool PROJECT::runnable() {
    if (suspended_via_gui) return false;
    for (unsigned int i=0; i<gstate.results.size(); i++) {
        RESULT* rp = gstate.results[i];
        if (rp->project == this && rp->runnable()) return true;
    }
    return false;
}
// A project is downloading if it isn't user-suspended and
// any of its results is downloading.
//
bool PROJECT::downloading() {
    if (suspended_via_gui) return false;
    for (unsigned int i=0; i<gstate.results.size(); i++) {
        RESULT* rp = gstate.results[i];
        if (rp->project == this && rp->downloading()) return true;
    }
    return false;
}
// True if any of this project's results is suspended by the user.
//
bool PROJECT::some_result_suspended() {
    for (unsigned int i=0; i<gstate.results.size(); i++) {
        RESULT* rp = gstate.results[i];
        if (rp->project == this && rp->suspended_via_gui) return true;
    }
    return false;
}
// Can we ask this project for work right now?
// False if user-suspended, a master-URL fetch is pending,
// "no more work" is set, or we're within an RPC backoff.
//
bool PROJECT::can_request_work() {
    if (suspended_via_gui) return false;
    if (master_url_fetch_pending) return false;
    if (dont_request_more_work) return false;
    return min_rpc_time <= gstate.now;
}
// True if the project is running, could fetch work, or is
// downloading files that will soon make it runnable.
//
bool PROJECT::potentially_runnable() {
    return runnable() || can_request_work() || downloading();
}
// True if the project is runnable now or will be once
// in-progress downloads finish.
//
bool PROJECT::nearly_runnable() {
    return runnable() || downloading();
}
// A project is "overworked" for a resource if its debt is more
// than one CPU scheduling period in the red.
//
bool RSC_PROJECT_WORK_FETCH::overworked() {
    double limit = -gstate.global_prefs.cpu_scheduling_period();
    return debt < limit;
}
// A result is runnable if neither it nor its project is
// user-suspended and its input files are all downloaded.
//
bool RESULT::runnable() {
    if (suspended_via_gui || project->suspended_via_gui) return false;
    return state() == RESULT_FILES_DOWNLOADED;
}
// Runnable now, or will be once downloads complete.
//
bool RESULT::nearly_runnable() {
    if (runnable()) return true;
    return downloading();
}
// Return true if the result is waiting for its files to download,
// and nothing prevents this from happening soon
//
bool RESULT::downloading() {
    if (suspended_via_gui || project->suspended_via_gui) return false;
    return state() <= RESULT_FILES_DOWNLOADING;
}
// Raw duration estimate: the workunit's FLOP estimate divided
// by the app version's speed.
//
double RESULT::estimated_duration_uncorrected() {
    double est_flops = wup->rsc_fpops_est;
    return est_flops / avp->flops;
}
// estimate how long a result will take on this host
//
#ifdef SIM
// Simulator build: when estimating for work fetch, optionally scale
// by the project's mean completions ratio instead of its DCF.
double RESULT::estimated_duration(bool for_work_fetch) {
SIM_PROJECT* spp = (SIM_PROJECT*)project;
if (dual_dcf && for_work_fetch && spp->completions_ratio_mean) {
return estimated_duration_uncorrected()*spp->completions_ratio_mean;
}
return estimated_duration_uncorrected()*project->duration_correction_factor;
}
#else
// Client build: scale the raw estimate by the project's
// duration correction factor.
double RESULT::estimated_duration(bool) {
return estimated_duration_uncorrected()*project->duration_correction_factor;
}
#endif
// Estimated wall time remaining for this result; 0 if done.
// Prefer the active task's progress-based estimate if it's running.
//
double RESULT::estimated_time_remaining(bool for_work_fetch) {
    if (computing_done()) return 0;
    ACTIVE_TASK* atp = gstate.lookup_active_task_by_result(this);
    if (!atp) return estimated_duration(for_work_fetch);
    return atp->est_time_to_completion(for_work_fetch);
}
// Returns the estimated CPU time to completion (in seconds) of this task.
// Compute this as a weighted average of estimates based on
// 1) the workunit's flops count
// 2) the current reported CPU time and fraction done
//
double ACTIVE_TASK::est_time_to_completion(bool for_work_fetch) {
    if (fraction_done >= 1) return 0;
    double wu_est = result->estimated_duration(for_work_fetch);
    if (fraction_done <= 0) return wu_est;
    // estimate from the progress rate observed so far
    double frac_est = elapsed_time/fraction_done - elapsed_time;
    double fraction_left = 1 - fraction_done;
    // weight the static estimate by the square of the fraction left:
    // trust measured progress more as the task nears completion
    double wu_weight = fraction_left*fraction_left;
    double fd_weight = 1 - wu_weight;
    return fd_weight*frac_est + wu_weight*fraction_left*wu_est;
}
// the fraction of time a given CPU is working for BOINC,
// clamped to [0.01, 1].
//
double CLIENT_STATE::overall_cpu_frac() {
    double frac = time_stats.on_frac * time_stats.active_frac;
    if (frac < 0.01) return 0.01;
    if (frac > 1) return 1;
    return frac;
}
// called when benchmarks change:
// rescale every project's duration correction factor.
//
void CLIENT_STATE::scale_duration_correction_factors(double factor) {
    if (factor <= 0) return;    // ignore nonsense scale factors
    for (unsigned int i=0; i<projects.size(); i++) {
        projects[i]->duration_correction_factor *= factor;
    }
    if (log_flags.cpu_sched_debug) {
        msg_printf(NULL, MSG_INFO,
            "[cpu_sched_debug] scaling duration correction factors by %f",
            factor
        );
    }
}
// Choose a new host CPID.
// If using account manager, do scheduler RPCs
// to all acct-mgr-attached projects to propagate the CPID
//
void CLIENT_STATE::generate_new_host_cpid() {
host_info.generate_host_cpid();
for (unsigned int i=0; i<projects.size(); i++) {
if (projects[i]->attached_via_acct_mgr) {
projects[i]->sched_rpc_pending = RPC_REASON_ACCT_MGR_REQ;
projects[i]->set_min_rpc_time(now + 15, "Sending new host CPID");
}
}
}