#!/usr/bin/env python3 # Based on https://github.com/pythonspeed/cachegrind-benchmarking/blob/main/cachegrind.py """ Run a program under Cachegrind, combining various metrics into one single performance metric. License: https://opensource.org/licenses/MIT ## Features * Disables ASLR. * Sets consistent cache sizes. * Calculates a combined performance metric. For more information see the detailed write up at: https://pythonspeed.com/articles/consistent-benchmarking-in-ci/ ## Usage $ python3 cachegrind.py ./yourprogram --yourparam=yourvalues If you're benchmarking Python, make sure to set PYTHONHASHSEED to a fixed value (e.g. `export PYTHONHASHSEED=1234`). Other languages may have similar requirements to reduce variability. The last line printed will be a combined performance metric, but you can tweak the script to extract more info, or use it as a library. Copyright © 2020, Hyphenated Enterprises LLC. """ from __future__ import annotations import sys import typing as t from subprocess import DEVNULL from subprocess import check_call from subprocess import check_output from subprocess import run from tempfile import NamedTemporaryFile try: check_call(['setarch', '-h'], stdout=DEVNULL, stderr=DEVNULL) check_call(['valgrind', '-h'], stdout=DEVNULL, stderr=DEVNULL) except FileNotFoundError as exc: # e.g. macOS raise SystemExit(f'Command not found: {exc.filename}') from None ARCH = check_output(['uname', '-m'], text=True).strip() DISABLE_ASLR_CMD = ['setarch', ARCH, '-R'] def run_with_cachegrind(args_list: list[str]) -> dict[str, int]: """ Run the the given program and arguments under Cachegrind, parse the Cachegrind specs. For now we just ignore program output, and in general this is not robust. """ temp_file = NamedTemporaryFile('r+') run([ *DISABLE_ASLR_CMD, 'valgrind', '--tool=cachegrind', # Set some reasonable L1 and LL values, based on Haswell. # Feel free to update, important part is that they are consistent across runs, # instead of the default of copying from the current machine. '--I1=32768,8,64', '--D1=32768,8,64', '--LL=8388608,16,64', '--cachegrind-out-file=' + temp_file.name, *args_list, ]) # Don't fail if the program fails (to support e.g. `pytest --benchmark-compare-fail=...`) return parse_cachegrind_output(temp_file) def parse_cachegrind_output(temp_file: t.IO[str]) -> dict[str, int]: header = summary = '' for line in temp_file: if line.startswith('events: '): header = line[len('events: ') :].strip() elif line.startswith('summary: '): summary = line[len('summary:') :].strip() assert header assert summary return dict(zip(header.split(), (int(i) for i in summary.split()))) def get_counts(cg_results: dict[str, int]) -> dict[str, int]: """ Given the result of run_with_cachegrind(), figure out the parameters we will use for final estimate. We pretend there's no L2 since Cachegrind doesn't currently support it. Caveats: we're not including time to process instructions, only time to access instruction cache(s), so we're assuming time to fetch and run_with_cachegrind instruction is the same as time to retrieve data if they're both to L1 cache. """ result = {} d = cg_results ram_hits = d['DLmr'] + d['DLmw'] + d['ILmr'] l3_hits = d['I1mr'] + d['D1mw'] + d['D1mr'] - ram_hits total_memory_rw = d['Ir'] + d['Dr'] + d['Dw'] l1_hits = total_memory_rw - l3_hits - ram_hits assert total_memory_rw == l1_hits + l3_hits + ram_hits result['l1'] = l1_hits result['l3'] = l3_hits result['ram'] = ram_hits return result def combined_instruction_estimate(counts: dict[str, int]) -> int: """ Given the result of run_with_cachegrind(), return estimate of total time to run_with_cachegrind. Multipliers were determined empirically, but some research suggests they're a reasonable approximation for cache time ratios. L3 is probably too low, but then we're not simulating L2... """ return counts['l1'] + (5 * counts['l3']) + (35 * counts['ram']) def main() -> None: results = run_with_cachegrind(sys.argv[1:]) counts = get_counts(results) estimate = combined_instruction_estimate(counts) print(f'{"*" * 80}\nCombined instruction estimate: {estimate:,}') # noqa: T201 if __name__ == '__main__': main()