bidict/cachegrind.py

140 lines
4.4 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# Based on https://github.com/pythonspeed/cachegrind-benchmarking/blob/main/cachegrind.py
"""
Run a program under Cachegrind, combining various metrics into one single performance metric.
License: https://opensource.org/licenses/MIT
## Features
* Disables ASLR.
* Sets consistent cache sizes.
* Calculates a combined performance metric.
For more information see the detailed write up at:
https://pythonspeed.com/articles/consistent-benchmarking-in-ci/
## Usage
$ python3 cachegrind.py ./yourprogram --yourparam=yourvalues
If you're benchmarking Python, make sure to set PYTHONHASHSEED to a fixed value
(e.g. `export PYTHONHASHSEED=1234`). Other languages may have similar
requirements to reduce variability.
The last line printed will be a combined performance metric, but you can tweak
the script to extract more info, or use it as a library.
Copyright © 2020, Hyphenated Enterprises LLC.
"""
from __future__ import annotations
import sys
import typing as t
from subprocess import DEVNULL
from subprocess import check_call
from subprocess import check_output
from subprocess import run
from tempfile import NamedTemporaryFile
# Verify up front that the external tools this script shells out to are installed,
# so a missing tool produces a short message instead of a traceback later on.
try:
    for _tool in ('setarch', 'valgrind'):
        check_call([_tool, '-h'], stdout=DEVNULL, stderr=DEVNULL)
except FileNotFoundError as exc:  # e.g. macOS
    raise SystemExit(f'Command not found: {exc.filename}') from None

# Machine hardware name as printed by `uname -m`; setarch takes it as its first argument.
ARCH = check_output(['uname', '-m'], text=True).strip()

# Command prefix (`setarch <arch> -R`) that run_with_cachegrind() puts in front of valgrind.
DISABLE_ASLR_CMD = ['setarch', ARCH, '-R']
def run_with_cachegrind(args_list: list[str]) -> dict[str, int]:
    """
    Run the given program and arguments under Cachegrind and parse the
    Cachegrind specs.

    The command is ``DISABLE_ASLR_CMD`` followed by ``valgrind --tool=cachegrind``
    with fixed I1/D1/LL cache parameters. Cachegrind writes its results to a
    temporary file, which is then passed to parse_cachegrind_output().

    :param args_list: the program to run followed by its arguments.
    :return: whatever parse_cachegrind_output() extracts from the results file.

    The program's own output and exit status are not inspected, and in general
    this is not robust.
    """
    # The context manager closes the temporary file on every exit path (including
    # when launching valgrind or parsing raises) instead of leaving the cleanup to
    # garbage collection.
    with NamedTemporaryFile('r+') as temp_file:
        run(
            [
                *DISABLE_ASLR_CMD,
                'valgrind',
                '--tool=cachegrind',
                # Set some reasonable L1 and LL values, based on Haswell.
                # Feel free to update, important part is that they are consistent across runs,
                # instead of the default of copying from the current machine.
                '--I1=32768,8,64',
                '--D1=32768,8,64',
                '--LL=8388608,16,64',
                '--cachegrind-out-file=' + temp_file.name,
                *args_list,
            ],
            # Don't fail if the program fails (to support e.g. `pytest --benchmark-compare-fail=...`)
            check=False,
        )
        # Parse while the file is still open; it is gone once the `with` block exits.
        return parse_cachegrind_output(temp_file)
def parse_cachegrind_output(temp_file: t.IO[str]) -> dict[str, int]:
    """
    Parse a Cachegrind output file into a mapping of event name to count.

    Only the line starting with ``events: `` (whitespace-separated names) and the
    line starting with ``summary: `` (whitespace-separated integers) are used;
    all other lines are ignored. If either line occurs more than once, the last
    occurrence wins. Names and values are paired positionally.

    :param temp_file: text stream that is iterated line by line.
    :return: dict pairing each event name with its integer summary value.
    :raises AssertionError: if no non-empty ``events: `` or ``summary: `` line is found.
    :raises ValueError: if a summary value cannot be converted to ``int``.
    """
    events_prefix = 'events: '
    summary_prefix = 'summary: '
    header = summary = ''
    for line in temp_file:
        if line.startswith(events_prefix):
            header = line[len(events_prefix):].strip()
        elif line.startswith(summary_prefix):
            summary = line[len(summary_prefix):].strip()
    # Raise explicitly instead of using `assert`: assert statements are stripped
    # under `python -O`, which would let an empty result through silently.
    # AssertionError is kept so the exception type callers see does not change.
    if not header:
        raise AssertionError("no 'events: ' line found in Cachegrind output")
    if not summary:
        raise AssertionError("no 'summary: ' line found in Cachegrind output")
    return dict(zip(header.split(), (int(i) for i in summary.split())))
def get_counts(cg_results: dict[str, int]) -> dict[str, int]:
    """
    Reduce the event totals from run_with_cachegrind() to the three counts
    used for the final estimate: ``l1``, ``l3`` and ``ram``.

    We pretend there's no L2 since Cachegrind doesn't currently support it.

    Caveats: we're not including time to process instructions, only time to
    access instruction cache(s), so we're assuming time to fetch and run an
    instruction is the same as time to retrieve data if they're both to L1
    cache.

    :param cg_results: mapping that must contain the keys ``Ir``, ``Dr``, ``Dw``,
        ``I1mr``, ``D1mr``, ``D1mw``, ``ILmr``, ``DLmr`` and ``DLmw``.
    :return: dict with the keys ``l1``, ``l3`` and ``ram`` (in that order).
    """
    # Sum of the last-level miss counters: these accesses are counted as RAM hits.
    from_ram = cg_results['DLmr'] + cg_results['DLmw'] + cg_results['ILmr']
    # First-level misses that are not also last-level misses are counted as L3 hits.
    from_l3 = cg_results['I1mr'] + cg_results['D1mw'] + cg_results['D1mr'] - from_ram
    # Every instruction read, data read and data write is one memory access.
    all_accesses = cg_results['Ir'] + cg_results['Dr'] + cg_results['Dw']
    # Whatever is left over was served from L1.
    from_l1 = all_accesses - from_l3 - from_ram
    assert all_accesses == from_l1 + from_l3 + from_ram
    return {'l1': from_l1, 'l3': from_l3, 'ram': from_ram}
def combined_instruction_estimate(counts: dict[str, int]) -> int:
    """
    Given the result of get_counts(), return a single weighted estimate of the
    total time to run.

    Multipliers were determined empirically, but some research suggests they're
    a reasonable approximation for cache time ratios. L3 is probably too low,
    but then we're not simulating L2...

    :param counts: mapping with the keys ``l1``, ``l3`` and ``ram``.
    :return: ``l1 + 5 * l3 + 35 * ram``.
    """
    l3_weight = 5
    ram_weight = 35
    l1_cost = counts['l1']
    l3_cost = l3_weight * counts['l3']
    ram_cost = ram_weight * counts['ram']
    return l1_cost + l3_cost + ram_cost
def main() -> None:
    """
    Run the command given on the command line under Cachegrind and print the
    combined instruction estimate as the last line of output.
    """
    # Everything after the script name is the program to benchmark plus its arguments.
    target_cmd = sys.argv[1:]
    estimate = combined_instruction_estimate(get_counts(run_with_cachegrind(target_cmd)))
    print(f'{"*" * 80}\nCombined instruction estimate: {estimate:,}')  # noqa: T201


if __name__ == '__main__':
    main()