mirror of https://github.com/jab/bidict.git
140 lines
4.4 KiB
Python
140 lines
4.4 KiB
Python
|
#!/usr/bin/env python3
|
||
|
|
||
|
# Based on https://github.com/pythonspeed/cachegrind-benchmarking/blob/main/cachegrind.py
|
||
|
"""
|
||
|
Run a program under Cachegrind, combining various metrics into one single performance metric.
|
||
|
|
||
|
License: https://opensource.org/licenses/MIT
|
||
|
|
||
|
## Features
|
||
|
|
||
|
* Disables ASLR.
|
||
|
* Sets consistent cache sizes.
|
||
|
* Calculates a combined performance metric.
|
||
|
|
||
|
For more information, see the detailed write-up at:
|
||
|
|
||
|
https://pythonspeed.com/articles/consistent-benchmarking-in-ci/
|
||
|
|
||
|
## Usage
|
||
|
|
||
|
$ python3 cachegrind.py ./yourprogram --yourparam=yourvalues
|
||
|
|
||
|
If you're benchmarking Python, make sure to set PYTHONHASHSEED to a fixed value
|
||
|
(e.g. `export PYTHONHASHSEED=1234`). Other languages may have similar
|
||
|
requirements to reduce variability.
|
||
|
|
||
|
The last line printed will be a combined performance metric, but you can tweak
|
||
|
the script to extract more info, or use it as a library.
|
||
|
|
||
|
Copyright © 2020, Hyphenated Enterprises LLC.
|
||
|
"""
|
||
|
|
||
|
from __future__ import annotations
|
||
|
|
||
|
import sys
|
||
|
import typing as t
|
||
|
from subprocess import DEVNULL
|
||
|
from subprocess import check_call
|
||
|
from subprocess import check_output
|
||
|
from subprocess import run
|
||
|
from tempfile import NamedTemporaryFile
|
||
|
|
||
|
|
||
|
# Probe for the external tools up front so that a missing one is reported
# with a short message instead of a traceback later on.
try:
    for _required_tool in ('setarch', 'valgrind'):
        check_call([_required_tool, '-h'], stdout=DEVNULL, stderr=DEVNULL)
except FileNotFoundError as exc:  # e.g. macOS
    raise SystemExit(f'Command not found: {exc.filename}') from None
del _required_tool

# Command prefix used to launch the benchmarked program with ASLR disabled.
ARCH = check_output(['uname', '-m'], text=True).strip()
DISABLE_ASLR_CMD = ['setarch', ARCH, '-R']
|
||
|
|
||
|
|
||
|
def run_with_cachegrind(args_list: list[str]) -> dict[str, int]:
    """
    Run the given program and arguments under Cachegrind, parse the
    Cachegrind specs.

    The command is prefixed with DISABLE_ASLR_CMD and Cachegrind is given
    fixed cache sizes, so results do not depend on the host machine's caches.

    For now we just ignore program output, and in general this is not robust.

    :param args_list: The program to run, followed by its arguments.
    :return: The event totals as returned by parse_cachegrind_output().
    """
    # The context manager guarantees the temp file is closed (and so removed)
    # even if launching Valgrind or parsing its output raises.
    with NamedTemporaryFile('r+') as temp_file:
        run(
            [
                *DISABLE_ASLR_CMD,
                'valgrind',
                '--tool=cachegrind',
                # Ask for cache simulation explicitly: newer Valgrind releases
                # no longer enable it by default, and without it only `Ir` is
                # recorded, which is not enough for get_counts().
                '--cache-sim=yes',
                # Set some reasonable L1 and LL values, based on Haswell.
                # Feel free to update, important part is that they are consistent across runs,
                # instead of the default of copying from the current machine.
                '--I1=32768,8,64',
                '--D1=32768,8,64',
                '--LL=8388608,16,64',
                '--cachegrind-out-file=' + temp_file.name,
                *args_list,
            ],
            # Don't fail if the program fails (to support e.g. `pytest --benchmark-compare-fail=...`)
            check=False,
        )
        # Parse while the file is still open; it is gone once the block exits.
        return parse_cachegrind_output(temp_file)
|
||
|
|
||
|
|
||
|
def parse_cachegrind_output(temp_file: t.IO[str]) -> dict[str, int]:
    """
    Extract the event totals from a Cachegrind output file.

    The file is read line by line. The names on the ``events:`` line are
    paired positionally with the integers on the ``summary:`` line. If either
    line occurs more than once, the last occurrence is used.

    :param temp_file: Readable text stream containing Cachegrind output.
    :return: Mapping of event name to its summary total.
    :raises AssertionError: If no non-empty ``events:`` or ``summary:`` line
        is found.
    :raises ValueError: If a summary field is not a valid integer.
    """
    events_prefix = 'events: '
    summary_prefix = 'summary: '
    header = summary = ''
    for line in temp_file:
        if line.startswith(events_prefix):
            header = line[len(events_prefix):].strip()
        elif line.startswith(summary_prefix):
            summary = line[len(summary_prefix):].strip()
    # Raise explicitly rather than via `assert`: assert statements are removed
    # under `python -O`, which would let an empty result slip through.
    # AssertionError is kept so the exception type callers see is unchanged.
    if not header:
        raise AssertionError("no 'events:' line found in Cachegrind output")
    if not summary:
        raise AssertionError("no 'summary:' line found in Cachegrind output")
    return dict(zip(header.split(), (int(i) for i in summary.split())))
|
||
|
|
||
|
|
||
|
def get_counts(cg_results: dict[str, int]) -> dict[str, int]:
    """
    Given the result of run_with_cachegrind(), figure out the parameters we will use for final
    estimate.

    We pretend there's no L2 since Cachegrind doesn't currently support it.

    Caveats: we're not including time to process instructions, only time to
    access instruction cache(s), so we're assuming time to fetch and run an
    instruction is the same as time to retrieve data if they're both to L1
    cache.

    :param cg_results: Mapping of Cachegrind event name to total count. Must
        contain Ir, Dr, Dw, I1mr, D1mr, D1mw, ILmr, DLmr and DLmw.
    :return: Dict with keys ``'l1'``, ``'l3'`` and ``'ram'`` giving the number
        of accesses attributed to each level.
    :raises KeyError: If any required event is missing, e.g. because the
        results were recorded without cache simulation.
    """
    required = ('Ir', 'Dr', 'Dw', 'I1mr', 'D1mr', 'D1mw', 'ILmr', 'DLmr', 'DLmw')
    missing = [name for name in required if name not in cg_results]
    if missing:
        # Same exception type as a plain failed lookup, but names every
        # missing event at once.
        raise KeyError(f'Cachegrind results are missing events: {", ".join(missing)}')

    d = cg_results

    # Last-level cache misses are counted as RAM accesses.
    ram_hits = d['DLmr'] + d['DLmw'] + d['ILmr']

    # L1 misses that were not also last-level misses are counted as L3 hits.
    l3_hits = d['I1mr'] + d['D1mw'] + d['D1mr'] - ram_hits

    # Every remaining instruction read / data read / data write is an L1 hit.
    total_memory_rw = d['Ir'] + d['Dr'] + d['Dw']
    l1_hits = total_memory_rw - l3_hits - ram_hits

    return {'l1': l1_hits, 'l3': l3_hits, 'ram': ram_hits}
|
||
|
|
||
|
|
||
|
def combined_instruction_estimate(
    counts: dict[str, int],
    *,
    l3_cost: int = 5,
    ram_cost: int = 35,
) -> int:
    """
    Given the result of get_counts(), return estimate of total time to run.

    Multipliers were determined empirically, but some research suggests they're
    a reasonable approximation for cache time ratios. L3 is probably too low,
    but then we're not simulating L2...

    :param counts: Mapping with ``'l1'``, ``'l3'`` and ``'ram'`` access counts.
    :param l3_cost: Weight of one L3 access relative to one L1 access.
    :param ram_cost: Weight of one RAM access relative to one L1 access.
    :return: Weighted sum of the three counts.
    :raises KeyError: If ``counts`` lacks one of the three keys.
    """
    return counts['l1'] + (l3_cost * counts['l3']) + (ram_cost * counts['ram'])
|
||
|
|
||
|
|
||
|
def main() -> None:
    """
    Benchmark the program named on the command line and print the estimate.

    Everything after the script name in ``sys.argv`` is treated as the program
    to run under Cachegrind plus its arguments. The last line printed is the
    combined instruction estimate.

    :raises SystemExit: With a usage message if no program was given.
    """
    program_args = sys.argv[1:]
    if not program_args:
        # Without a program Valgrind has nothing to run and parsing its
        # (empty) output would fail with an unhelpful traceback.
        raise SystemExit(f'Usage: {sys.argv[0]} PROGRAM [ARGS...]')
    results = run_with_cachegrind(program_args)
    counts = get_counts(results)
    estimate = combined_instruction_estimate(counts)
    print(f'{"*" * 80}\nCombined instruction estimate: {estimate:,}')  # noqa: T201


if __name__ == '__main__':
    main()
|