From 6d791a97364b68d5f9c3514a0470aac487fc538d Mon Sep 17 00:00:00 2001 From: Pablo Galindo Salgado Date: Tue, 30 Aug 2022 18:11:18 +0100 Subject: [PATCH] gh-96143: Allow Linux perf profiler to see Python calls (GH-96123) :warning: :warning: Note for reviewers, hackers and fellow systems/low-level/compiler engineers :warning: :warning: If you have a lot of experience with this kind of shenanigans and want to improve the **first** version, **please make a PR against my branch** or **reach out by email** or **suggest code changes directly on GitHub**. If you have any **refinements or optimizations** please, wait until the first version is merged before starting hacking or proposing those so we can keep this PR productive. --- Doc/c-api/init_config.rst | 14 + Doc/howto/index.rst | 1 + Doc/howto/perf_profiling.rst | 200 +++++++ Doc/using/cmdline.rst | 13 + Include/cpython/initconfig.h | 1 + Include/internal/pycore_ceval.h | 21 + Lib/test/test_embed.py | 5 + Lib/test/test_perf_profiler.py | 348 ++++++++++++ Makefile.pre.in | 7 +- ...2-08-20-18-36-40.gh-issue-96143.nh3GFM.rst | 7 + Modules/posixmodule.c | 5 + Objects/asm_trampoline.S | 28 + Objects/perf_trampoline.c | 501 ++++++++++++++++++ PCbuild/_freeze_module.vcxproj | 1 + PCbuild/_freeze_module.vcxproj.filters | 3 + PCbuild/pythoncore.vcxproj | 1 + PCbuild/pythoncore.vcxproj.filters | 3 + Python/clinic/sysmodule.c.h | 75 ++- Python/initconfig.c | 39 ++ Python/pylifecycle.c | 11 + Python/sysmodule.c | 77 +++ configure | 30 ++ configure.ac | 20 + pyconfig.h.in | 3 + 24 files changed, 1412 insertions(+), 2 deletions(-) create mode 100644 Doc/howto/perf_profiling.rst create mode 100644 Lib/test/test_perf_profiler.py create mode 100644 Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst create mode 100644 Objects/asm_trampoline.S create mode 100644 Objects/perf_trampoline.c diff --git a/Doc/c-api/init_config.rst b/Doc/c-api/init_config.rst index 2074ec4e0e8..c4a342ee811 100644 --- a/Doc/c-api/init_config.rst +++ b/Doc/c-api/init_config.rst @@ -1155,6 +1155,20 @@ PyConfig Default: ``-1`` in Python mode, ``0`` in isolated mode. + .. c:member:: int perf_profiling + + Enable compatibility mode with the perf profiler? + + If non-zero, initialize the perf trampoline. See :ref:`perf_profiling` + for more information. + + Set by :option:`-X perf <-X>` command line option and by the + :envvar:`PYTHONPERFSUPPORT` environment variable. + + Default: ``-1``. + + .. versionadded:: 3.12 + .. c:member:: int use_environment Use :ref:`environment variables `? diff --git a/Doc/howto/index.rst b/Doc/howto/index.rst index 8a378e6659e..f521276a5a8 100644 --- a/Doc/howto/index.rst +++ b/Doc/howto/index.rst @@ -30,6 +30,7 @@ Currently, the HOWTOs are: ipaddress.rst clinic.rst instrumentation.rst + perf_profiling.rst annotations.rst isolating-extensions.rst diff --git a/Doc/howto/perf_profiling.rst b/Doc/howto/perf_profiling.rst new file mode 100644 index 00000000000..2e1bb48af8c --- /dev/null +++ b/Doc/howto/perf_profiling.rst @@ -0,0 +1,200 @@ +.. highlight:: shell-session + +.. _perf_profiling: + +============================================== +Python support for the Linux ``perf`` profiler +============================================== + +:author: Pablo Galindo + +The Linux ``perf`` profiler is a very powerful tool that allows you to profile and +obtain information about the performance of your application. ``perf`` also has +a very vibrant ecosystem of tools that aid with the analysis of the data that it +produces. + +The main problem with using the ``perf`` profiler with Python applications is that +``perf`` only allows to get information about native symbols, this is, the names of +the functions and procedures written in C. This means that the names and file names +of the Python functions in your code will not appear in the output of the ``perf``. + +Since Python 3.12, the interpreter can run in a special mode that allows Python +functions to appear in the output of the ``perf`` profiler. When this mode is +enabled, the interpreter will interpose a small piece of code compiled on the +fly before the execution of every Python function and it will teach ``perf`` the +relationship between this piece of code and the associated Python function using +`perf map files`_. + +.. warning:: + + Support for the ``perf`` profiler is only currently available for Linux on + selected architectures. Check the output of the configure build step or + check the output of ``python -m sysconfig | grep HAVE_PERF_TRAMPOLINE`` + to see if your system is supported. + +For example, consider the following script: + +.. code-block:: python + + def foo(n): + result = 0 + for _ in range(n): + result += 1 + return result + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + if __name__ == "__main__": + baz(1000000) + +We can run perf to sample CPU stack traces at 9999 Hertz: + + $ perf record -F 9999 -g -o perf.data python my_script.py + +Then we can use perf report to analyze the data: + +.. code-block:: shell-session + + $ perf report --stdio -n -g + + # Children Self Samples Command Shared Object Symbol + # ........ ........ ............ .......... .................. .......................................... + # + 91.08% 0.00% 0 python.exe python.exe [.] _start + | + ---_start + | + --90.71%--__libc_start_main + Py_BytesMain + | + |--56.88%--pymain_run_python.constprop.0 + | | + | |--56.13%--_PyRun_AnyFileObject + | | _PyRun_SimpleFileObject + | | | + | | |--55.02%--run_mod + | | | | + | | | --54.65%--PyEval_EvalCode + | | | _PyEval_EvalFrameDefault + | | | PyObject_Vectorcall + | | | _PyEval_Vector + | | | _PyEval_EvalFrameDefault + | | | PyObject_Vectorcall + | | | _PyEval_Vector + | | | _PyEval_EvalFrameDefault + | | | PyObject_Vectorcall + | | | _PyEval_Vector + | | | | + | | | |--51.67%--_PyEval_EvalFrameDefault + | | | | | + | | | | |--11.52%--_PyLong_Add + | | | | | | + | | | | | |--2.97%--_PyObject_Malloc + ... + +As you can see here, the Python functions are not shown in the output, only ``_Py_Eval_EvalFrameDefault`` appears +(the function that evaluates the Python bytecode) shows up. Unfortunately that's not very useful because all Python +functions use the same C function to evaluate bytecode so we cannot know which Python function corresponds to which +bytecode-evaluating function. + +Instead, if we run the same experiment with perf support activated we get: + +.. code-block:: shell-session + + $ perf report --stdio -n -g + + # Children Self Samples Command Shared Object Symbol + # ........ ........ ............ .......... .................. ..................................................................... + # + 90.58% 0.36% 1 python.exe python.exe [.] _start + | + ---_start + | + --89.86%--__libc_start_main + Py_BytesMain + | + |--55.43%--pymain_run_python.constprop.0 + | | + | |--54.71%--_PyRun_AnyFileObject + | | _PyRun_SimpleFileObject + | | | + | | |--53.62%--run_mod + | | | | + | | | --53.26%--PyEval_EvalCode + | | | py:::/src/script.py + | | | _PyEval_EvalFrameDefault + | | | PyObject_Vectorcall + | | | _PyEval_Vector + | | | py::baz:/src/script.py + | | | _PyEval_EvalFrameDefault + | | | PyObject_Vectorcall + | | | _PyEval_Vector + | | | py::bar:/src/script.py + | | | _PyEval_EvalFrameDefault + | | | PyObject_Vectorcall + | | | _PyEval_Vector + | | | py::foo:/src/script.py + | | | | + | | | |--51.81%--_PyEval_EvalFrameDefault + | | | | | + | | | | |--13.77%--_PyLong_Add + | | | | | | + | | | | | |--3.26%--_PyObject_Malloc + + + +Enabling perf profiling mode +---------------------------- + +There are two main ways to activate the perf profiling mode. If you want it to be +active since the start of the Python interpreter, you can use the `-Xperf` option: + + $ python -Xperf my_script.py + +There is also support for dynamically activating and deactivating the perf +profiling mode by using the APIs in the :mod:`sys` module: + +.. code-block:: python + + import sys + sys.activate_stack_trampoline("perf") + + # Run some code with Perf profiling active + + sys.deactivate_stack_trampoline() + + # Perf profiling is not active anymore + +These APIs can be handy if you want to activate/deactivate profiling mode in +response to a signal or other communication mechanism with your process. + + + +Now we can analyze the data with ``perf report``: + + $ perf report -g -i perf.data + + +How to obtain the best results +------------------------------- + +For the best results, Python should be compiled with +``CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"`` as this allows +profilers to unwind using only the frame pointer and not on DWARF debug +information. This is because as the code that is interposed to allow perf +support is dynamically generated it doesn't have any DWARF debugging information +available. + +You can check if you system has been compiled with this flag by running: + + $ python -m sysconfig | grep 'no-omit-frame-pointer' + +If you don't see any output it means that your interpreter has not been compiled with +frame pointers and therefore it may not be able to show Python functions in the output +of ``perf``. + +.. _perf map files: https://github.com/torvalds/linux/blob/0513e464f9007b70b96740271a948ca5ab6e7dd7/tools/perf/Documentation/jit-interface.txt diff --git a/Doc/using/cmdline.rst b/Doc/using/cmdline.rst index 6678d476fa8..5ecc882d818 100644 --- a/Doc/using/cmdline.rst +++ b/Doc/using/cmdline.rst @@ -535,6 +535,12 @@ Miscellaneous options development (running from the source tree) then the default is "off". Note that the "importlib_bootstrap" and "importlib_bootstrap_external" frozen modules are always used, even if this flag is set to "off". + * ``-X perf`` to activate compatibility mode with the ``perf`` profiler. + When this option is activated, the Linux ``perf`` profiler will be able to + report Python calls. This option is only available on some platforms and + will do nothing if is not supported on the current system. The default value + is "off". See also :envvar:`PYTHONPERFSUPPORT` and :ref:`perf_profiling` + for more information. It also allows passing arbitrary values and retrieving them through the :data:`sys._xoptions` dictionary. @@ -1025,6 +1031,13 @@ conflict. .. versionadded:: 3.11 +.. envvar:: PYTHONPERFSUPPORT + + If this variable is set to a nonzero value, it activates compatibility mode + with the ``perf`` profiler so Python calls can be detected by it. See the + :ref:`perf_profiling` section for more information. + + .. versionadded:: 3.12 Debug-mode variables diff --git a/Include/cpython/initconfig.h b/Include/cpython/initconfig.h index 3b6d59389f2..c6057a4c3ed 100644 --- a/Include/cpython/initconfig.h +++ b/Include/cpython/initconfig.h @@ -142,6 +142,7 @@ typedef struct PyConfig { unsigned long hash_seed; int faulthandler; int tracemalloc; + int perf_profiling; int import_time; int code_debug_ranges; int show_ref_count; diff --git a/Include/internal/pycore_ceval.h b/Include/internal/pycore_ceval.h index 2fcdaad358b..4914948c6ca 100644 --- a/Include/internal/pycore_ceval.h +++ b/Include/internal/pycore_ceval.h @@ -65,6 +65,27 @@ extern PyObject* _PyEval_BuiltinsFromGlobals( PyThreadState *tstate, PyObject *globals); +// Trampoline API + +typedef struct { + // Callback to initialize the trampoline state + void* (*init_state)(void); + // Callback to register every trampoline being created + void (*write_state)(void* state, const void *code_addr, + unsigned int code_size, PyCodeObject* code); + // Callback to free the trampoline state + int (*free_state)(void* state); +} _PyPerf_Callbacks; + +extern int _PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *); +extern void _PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *); +extern int _PyPerfTrampoline_Init(int activate); +extern int _PyPerfTrampoline_Fini(void); +extern int _PyIsPerfTrampolineActive(void); +extern PyStatus _PyPerfTrampoline_AfterFork_Child(void); +#ifdef PY_HAVE_PERF_TRAMPOLINE +extern _PyPerf_Callbacks _Py_perfmap_callbacks; +#endif static inline PyObject* _PyEval_EvalFrame(PyThreadState *tstate, struct _PyInterpreterFrame *frame, int throwflag) diff --git a/Lib/test/test_embed.py b/Lib/test/test_embed.py index c546bb08e29..70d7367ea9e 100644 --- a/Lib/test/test_embed.py +++ b/Lib/test/test_embed.py @@ -436,6 +436,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): 'hash_seed': 0, 'faulthandler': 0, 'tracemalloc': 0, + 'perf_profiling': 0, 'import_time': 0, 'code_debug_ranges': 1, 'show_ref_count': 0, @@ -520,6 +521,7 @@ class InitConfigTests(EmbeddingTestsMixin, unittest.TestCase): use_hash_seed=0, faulthandler=0, tracemalloc=0, + perf_profiling=0, pathconfig_warnings=0, ) if MS_WINDOWS: @@ -828,6 +830,7 @@ def test_init_from_config(self): 'use_hash_seed': 1, 'hash_seed': 123, 'tracemalloc': 2, + 'perf_profiling': 0, 'import_time': 1, 'code_debug_ranges': 0, 'show_ref_count': 1, @@ -890,6 +893,7 @@ def test_init_compat_env(self): 'use_hash_seed': 1, 'hash_seed': 42, 'tracemalloc': 2, + 'perf_profiling': 0, 'import_time': 1, 'code_debug_ranges': 0, 'malloc_stats': 1, @@ -921,6 +925,7 @@ def test_init_python_env(self): 'use_hash_seed': 1, 'hash_seed': 42, 'tracemalloc': 2, + 'perf_profiling': 0, 'import_time': 1, 'code_debug_ranges': 0, 'malloc_stats': 1, diff --git a/Lib/test/test_perf_profiler.py b/Lib/test/test_perf_profiler.py new file mode 100644 index 00000000000..c2aad85b652 --- /dev/null +++ b/Lib/test/test_perf_profiler.py @@ -0,0 +1,348 @@ +import unittest +import subprocess +import sys +import sysconfig +import os +import pathlib +from test import support +from test.support.script_helper import ( + make_script, + assert_python_failure, + assert_python_ok, +) +from test.support.os_helper import temp_dir + + +if not support.has_subprocess_support: + raise unittest.SkipTest("test module requires subprocess") + + +def supports_trampoline_profiling(): + perf_trampoline = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE") + if not perf_trampoline: + return False + return int(perf_trampoline) == 1 + + +if not supports_trampoline_profiling(): + raise unittest.SkipTest("perf trampoline profiling not supported") + + +class TestPerfTrampoline(unittest.TestCase): + def setUp(self): + super().setUp() + self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map")) + + def tearDown(self) -> None: + super().tearDown() + files_to_delete = ( + set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files + ) + for file in files_to_delete: + file.unlink() + + def test_trampoline_works(self): + code = """if 1: + def foo(): + pass + + def bar(): + foo() + + def baz(): + bar() + + baz() + """ + with temp_dir() as script_dir: + script = make_script(script_dir, "perftest", code) + with subprocess.Popen( + [sys.executable, "-Xperf", script], + universal_newlines=True, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) as process: + stdout, stderr = process.communicate() + + self.assertEqual(stderr, "") + self.assertEqual(stdout, "") + + perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map") + self.assertTrue(perf_file.exists()) + perf_file_contents = perf_file.read_text() + self.assertIn(f"py::foo:{script}", perf_file_contents) + self.assertIn(f"py::bar:{script}", perf_file_contents) + self.assertIn(f"py::baz:{script}", perf_file_contents) + + def test_trampoline_works_with_forks(self): + code = """if 1: + import os, sys + + def foo_fork(): + pass + + def bar_fork(): + foo_fork() + + def baz_fork(): + bar_fork() + + def foo(): + pid = os.fork() + if pid == 0: + print(os.getpid()) + baz_fork() + else: + _, status = os.waitpid(-1, 0) + sys.exit(status) + + def bar(): + foo() + + def baz(): + bar() + + baz() + """ + with temp_dir() as script_dir: + script = make_script(script_dir, "perftest", code) + with subprocess.Popen( + [sys.executable, "-Xperf", script], + universal_newlines=True, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) as process: + stdout, stderr = process.communicate() + + self.assertEqual(process.returncode, 0) + self.assertEqual(stderr, "") + child_pid = int(stdout.strip()) + perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map") + perf_child_file = pathlib.Path(f"/tmp/perf-{child_pid}.map") + self.assertTrue(perf_file.exists()) + self.assertTrue(perf_child_file.exists()) + + perf_file_contents = perf_file.read_text() + self.assertIn(f"py::foo:{script}", perf_file_contents) + self.assertIn(f"py::bar:{script}", perf_file_contents) + self.assertIn(f"py::baz:{script}", perf_file_contents) + + child_perf_file_contents = perf_child_file.read_text() + self.assertIn(f"py::foo_fork:{script}", child_perf_file_contents) + self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents) + self.assertIn(f"py::baz_fork:{script}", child_perf_file_contents) + + def test_sys_api(self): + code = """if 1: + import sys + def foo(): + pass + + def spam(): + pass + + def bar(): + sys.deactivate_stack_trampoline() + foo() + sys.activate_stack_trampoline("perf") + spam() + + def baz(): + bar() + + sys.activate_stack_trampoline("perf") + baz() + """ + with temp_dir() as script_dir: + script = make_script(script_dir, "perftest", code) + with subprocess.Popen( + [sys.executable, script], + universal_newlines=True, + stderr=subprocess.PIPE, + stdout=subprocess.PIPE, + ) as process: + stdout, stderr = process.communicate() + + self.assertEqual(stderr, "") + self.assertEqual(stdout, "") + + perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map") + self.assertTrue(perf_file.exists()) + perf_file_contents = perf_file.read_text() + self.assertNotIn(f"py::foo:{script}", perf_file_contents) + self.assertIn(f"py::spam:{script}", perf_file_contents) + self.assertIn(f"py::bar:{script}", perf_file_contents) + self.assertIn(f"py::baz:{script}", perf_file_contents) + + def test_sys_api_with_existing_trampoline(self): + code = """if 1: + import sys + sys.activate_stack_trampoline("perf") + sys.activate_stack_trampoline("perf") + """ + assert_python_ok("-c", code) + + def test_sys_api_with_invalid_trampoline(self): + code = """if 1: + import sys + sys.activate_stack_trampoline("invalid") + """ + rc, out, err = assert_python_failure("-c", code) + self.assertIn("invalid backend: invalid", err.decode()) + + def test_sys_api_get_status(self): + code = """if 1: + import sys + sys.activate_stack_trampoline("perf") + assert sys.is_stack_trampoline_active() is True + sys.deactivate_stack_trampoline() + assert sys.is_stack_trampoline_active() is False + """ + assert_python_ok("-c", code) + + +def is_unwinding_reliable(): + cflags = sysconfig.get_config_var("PY_CORE_CFLAGS") + if not cflags: + return False + return "no-omit-frame-pointer" in cflags + + +def perf_command_works(): + try: + cmd = ["perf", "--help"] + stdout = subprocess.check_output(cmd, universal_newlines=True) + except (subprocess.SubprocessError, OSError): + return False + + # perf version does not return a version number on Fedora. Use presence + # of "perf.data" in help as indicator that it's perf from Linux tools. + if "perf.data" not in stdout: + return False + + # Check that we can run a simple perf run + with temp_dir() as script_dir: + try: + output_file = script_dir + "/perf_output.perf" + cmd = ( + "perf", + "record", + "-g", + "--call-graph=fp", + "-o", + output_file, + "--", + sys.executable, + "-c", + 'print("hello")', + ) + stdout = subprocess.check_output( + cmd, cwd=script_dir, universal_newlines=True, stderr=subprocess.STDOUT + ) + except (subprocess.SubprocessError, OSError): + return False + + if "hello" not in stdout: + return False + + return True + + +def run_perf(cwd, *args, **env_vars): + if env_vars: + env = os.environ.copy() + env.update(env_vars) + else: + env = None + output_file = cwd + "/perf_output.perf" + base_cmd = ("perf", "record", "-g", "--call-graph=fp", "-o", output_file, "--") + proc = subprocess.run( + base_cmd + args, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + ) + if proc.returncode: + print(proc.stderr) + raise ValueError(f"Perf failed with return code {proc.returncode}") + + base_cmd = ("perf", "script") + proc = subprocess.run( + ("perf", "script", "-i", output_file), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env, + check=True, + ) + return proc.stdout.decode("utf-8", "replace"), proc.stderr.decode( + "utf-8", "replace" + ) + + +@unittest.skipUnless(perf_command_works(), "perf command doesn't work") +@unittest.skipUnless(is_unwinding_reliable(), "Unwinding is unreliable") +@support.skip_if_sanitizer(address=True, memory=True, ub=True) +class TestPerfProfiler(unittest.TestCase): + def setUp(self): + super().setUp() + self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map")) + + def tearDown(self) -> None: + super().tearDown() + files_to_delete = ( + set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files + ) + for file in files_to_delete: + file.unlink() + + def test_python_calls_appear_in_the_stack_if_perf_activated(self): + with temp_dir() as script_dir: + code = """if 1: + def foo(n): + x = 0 + for i in range(n): + x += i + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + baz(10000000) + """ + script = make_script(script_dir, "perftest", code) + stdout, stderr = run_perf(script_dir, sys.executable, "-Xperf", script) + self.assertEqual(stderr, "") + + self.assertIn(f"py::foo:{script}", stdout) + self.assertIn(f"py::bar:{script}", stdout) + self.assertIn(f"py::baz:{script}", stdout) + + def test_python_calls_do_not_appear_in_the_stack_if_perf_activated(self): + with temp_dir() as script_dir: + code = """if 1: + def foo(n): + x = 0 + for i in range(n): + x += i + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + baz(10000000) + """ + script = make_script(script_dir, "perftest", code) + stdout, stderr = run_perf(script_dir, sys.executable, script) + self.assertEqual(stderr, "") + + self.assertNotIn(f"py::foo:{script}", stdout) + self.assertNotIn(f"py::bar:{script}", stdout) + self.assertNotIn(f"py::baz:{script}", stdout) + + +if __name__ == "__main__": + unittest.main() diff --git a/Makefile.pre.in b/Makefile.pre.in index 94ddfa4b1be..107a7075ebf 100644 --- a/Makefile.pre.in +++ b/Makefile.pre.in @@ -478,7 +478,9 @@ OBJECT_OBJS= \ Objects/unicodeobject.o \ Objects/unicodectype.o \ Objects/unionobject.o \ - Objects/weakrefobject.o + Objects/weakrefobject.o \ + Objects/perf_trampoline.o \ + @PERF_TRAMPOLINE_OBJ@ DEEPFREEZE_OBJS = Python/deepfreeze/deepfreeze.o @@ -2358,6 +2360,9 @@ config.status: $(srcdir)/configure .PRECIOUS: config.status $(BUILDPYTHON) Makefile Makefile.pre +Objects/asm_trampoline.o: $(srcdir)/Objects/asm_trampoline.S + $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< + # Some make's put the object file in the current directory .c.o: $(CC) -c $(PY_CORE_CFLAGS) -o $@ $< diff --git a/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst new file mode 100644 index 00000000000..30f44fd453a --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2022-08-20-18-36-40.gh-issue-96143.nh3GFM.rst @@ -0,0 +1,7 @@ +Add a new ``-X perf`` Python command line option as well as +:func:`sys.activate_stack_trampoline` and :func:`sys.deactivate_stack_trampoline` +function in the :mod:`sys` module that allows to set/unset the interpreter in a +way that the Linux ``perf`` profiler can detect Python calls. The new +:func:`sys.is_stack_trampoline_active` function allows to query the state of the +perf trampoline. Design by Pablo Galindo. Patch by Pablo Galindo and Christian Heimes +with contributions from Gregory P. Smith [Google] and Mark Shannon. diff --git a/Modules/posixmodule.c b/Modules/posixmodule.c index d45fa231ae5..3810bc87c1f 100644 --- a/Modules/posixmodule.c +++ b/Modules/posixmodule.c @@ -606,6 +606,11 @@ PyOS_AfterFork_Child(void) } assert(_PyThreadState_GET() == tstate); + status = _PyPerfTrampoline_AfterFork_Child(); + if (_PyStatus_EXCEPTION(status)) { + goto fatal_error; + } + run_at_forkers(tstate->interp->after_forkers_child, 0); return; diff --git a/Objects/asm_trampoline.S b/Objects/asm_trampoline.S new file mode 100644 index 00000000000..460707717df --- /dev/null +++ b/Objects/asm_trampoline.S @@ -0,0 +1,28 @@ + .text + .globl _Py_trampoline_func_start +# The following assembly is equivalent to: +# PyObject * +# trampoline(PyThreadState *ts, _PyInterpreterFrame *f, +# int throwflag, py_evaluator evaluator) +# { +# return evaluator(ts, f, throwflag); +# } +_Py_trampoline_func_start: +#ifdef __x86_64__ + sub $8, %rsp + call *%rcx + add $8, %rsp + ret +#endif // __x86_64__ +#if defined(__aarch64__) && defined(__AARCH64EL__) && !defined(__ILP32__) + // ARM64 little endian, 64bit ABI + // generate with aarch64-linux-gnu-gcc 12.1 + stp x29, x30, [sp, -16]! + mov x29, sp + blr x3 + ldp x29, x30, [sp], 16 + ret +#endif + .globl _Py_trampoline_func_end +_Py_trampoline_func_end: + .section .note.GNU-stack,"",@progbits diff --git a/Objects/perf_trampoline.c b/Objects/perf_trampoline.c new file mode 100644 index 00000000000..02206b2786c --- /dev/null +++ b/Objects/perf_trampoline.c @@ -0,0 +1,501 @@ +/* + +Perf trampoline instrumentation +=============================== + +This file contains instrumentation to allow to associate +calls to the CPython eval loop back to the names of the Python +functions and filename being executed. + +Many native performance profilers like the Linux perf tools are +only available to 'see' the C stack when sampling from the profiled +process. This means that if we have the following python code: + + import time + def foo(n): + # Some CPU intensive code + + def bar(n): + foo(n) + + def baz(n): + bar(n) + + baz(10000000) + +A performance profiler that is only able to see native frames will +produce the following backtrace when sampling from foo(): + + _PyEval_EvalFrameDefault -----> Evaluation frame of foo() + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault ------> Evaluation frame of bar() + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault -------> Evaluation frame of baz() + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + ... + + Py_RunMain + +Because the profiler is only able to see the native frames and the native +function that runs the evaluation loop is the same (_PyEval_EvalFrameDefault) +then the profiler and any reporter generated by it will not be able to +associate the names of the Python functions and the filenames associated with +those calls, rendering the results useless in the Python world. + +To fix this problem, we introduce the concept of a trampoline frame. A +trampoline frame is a piece of code that is unique per Python code object that +is executed before entering the CPython eval loop. This piece of code just +calls the original Python evaluation function (_PyEval_EvalFrameDefault) and +forwards all the arguments received. In this way, when a profiler samples +frames from the previous example it will see; + + _PyEval_EvalFrameDefault -----> Evaluation frame of foo() + [Jit compiled code 3] + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault ------> Evaluation frame of bar() + [Jit compiled code 2] + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + _PyEval_EvalFrameDefault -------> Evaluation frame of baz() + [Jit compiled code 1] + _PyEval_EvalFrame + _PyEval_Vector + _PyFunction_Vectorcall + PyObject_Vectorcall + call_function + + ... + + Py_RunMain + +When we generate every unique copy of the trampoline (what here we called "[Jit +compiled code N]") we write the relationship between the compiled code and the +Python function that is associated with it. Every profiler requires this +information in a different format. For example, the Linux "perf" profiler +requires a file in "/tmp/perf-PID.map" (name and location not configurable) +with the following format: + + + +If this file is available when "perf" generates reports, it will automatically +associate every trampoline with the Python function that it is associated with +allowing it to generate reports that include Python information. These reports +then can also be filtered in a way that *only* Python information appears. + +Notice that for this to work, there must be a unique copied of the trampoline +per Python code object even if the code in the trampoline is the same. To +achieve this we have a assembly template in Objects/asm_trampiline.S that is +compiled into the Python executable/shared library. This template generates a +symbol that maps the start of the assembly code and another that marks the end +of the assembly code for the trampoline. Then, every time we need a unique +trampoline for a Python code object, we copy the assembly code into a mmaped +area that has executable permissions and we return the start of that area as +our trampoline function. + +Asking for a mmap-ed memory area for trampoline is very wasteful so we +allocate big arenas of memory in a single mmap call, we populate the entire +arena with copies of the trampoline (this allows us to now have to invalidate +the icache for the instructions in the page) and then we return the next +available chunk every time someone asks for a new trampoline. We keep a linked +list of arenas in case the current memory arena is exhausted and another one is +needed. + +For the best results, Python should be compiled with +CFLAGS="-fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" as this allows +profilers to unwind using only the frame pointer and not on DWARF debug +information (note that as trampilines are dynamically generated there won't be +any DWARF information available for them). +*/ + +#include "Python.h" +#include "pycore_ceval.h" +#include "pycore_frame.h" +#include "pycore_interp.h" + +typedef enum { + PERF_STATUS_FAILED = -1, // Perf trampoline is in an invalid state + PERF_STATUS_NO_INIT = 0, // Perf trampoline is not initialized + PERF_STATUS_OK = 1, // Perf trampoline is ready to be executed +} perf_status_t; + +#ifdef PY_HAVE_PERF_TRAMPOLINE + +#include +#include +#include +#include +#include +#include + +/* The function pointer is passed as last argument. The other three arguments + * are passed in the same order as the function requires. This results in + * shorter, more efficient ASM code for trampoline. + */ +typedef PyObject *(*py_evaluator)(PyThreadState *, _PyInterpreterFrame *, + int throwflag); +typedef PyObject *(*py_trampoline)(PyThreadState *, _PyInterpreterFrame *, int, + py_evaluator); + +extern void *_Py_trampoline_func_start; // Start of the template of the + // assembly trampoline +extern void * + _Py_trampoline_func_end; // End of the template of the assembly trampoline + +struct code_arena_st { + char *start_addr; // Start of the memory arena + char *current_addr; // Address of the current trampoline within the arena + size_t size; // Size of the memory arena + size_t size_left; // Remaining size of the memory arena + size_t code_size; // Size of the code of every trampoline in the arena + struct code_arena_st + *prev; // Pointer to the arena or NULL if this is the first arena. +}; + +typedef struct code_arena_st code_arena_t; + +struct trampoline_api_st { + void* (*init_state)(void); + void (*write_state)(void* state, const void *code_addr, + unsigned int code_size, PyCodeObject* code); + int (*free_state)(void* state); + void *state; +}; + +typedef struct trampoline_api_st trampoline_api_t; + +static perf_status_t perf_status = PERF_STATUS_NO_INIT; +static Py_ssize_t extra_code_index = -1; +static code_arena_t *code_arena; +static trampoline_api_t trampoline_api; + +static FILE *perf_map_file; + +static void * +perf_map_get_file(void) +{ + if (perf_map_file) { + return perf_map_file; + } + char filename[100]; + pid_t pid = getpid(); + // Location and file name of perf map is hard-coded in perf tool. + // Use exclusive create flag wit nofollow to prevent symlink attacks. + int flags = O_WRONLY | O_CREAT | O_EXCL | O_NOFOLLOW | O_CLOEXEC; + snprintf(filename, sizeof(filename) - 1, "/tmp/perf-%jd.map", + (intmax_t)pid); + int fd = open(filename, flags, 0600); + if (fd == -1) { + perf_status = PERF_STATUS_FAILED; + PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename); + return NULL; + } + perf_map_file = fdopen(fd, "w"); + if (!perf_map_file) { + perf_status = PERF_STATUS_FAILED; + PyErr_SetFromErrnoWithFilename(PyExc_OSError, filename); + close(fd); + return NULL; + } + return perf_map_file; +} + +static int +perf_map_close(void *state) +{ + FILE *fp = (FILE *)state; + int ret = 0; + if (fp) { + ret = fclose(fp); + } + perf_map_file = NULL; + perf_status = PERF_STATUS_NO_INIT; + return ret; +} + +static void +perf_map_write_entry(void *state, const void *code_addr, + unsigned int code_size, PyCodeObject *co) +{ + assert(state != NULL); + FILE *method_file = (FILE *)state; + const char *entry = PyUnicode_AsUTF8(co->co_qualname); + if (entry == NULL) { + _PyErr_WriteUnraisableMsg("Failed to get qualname from code object", + NULL); + return; + } + const char *filename = PyUnicode_AsUTF8(co->co_filename); + if (filename == NULL) { + _PyErr_WriteUnraisableMsg("Failed to get filename from code object", + NULL); + return; + } + fprintf(method_file, "%p %x py::%s:%s\n", code_addr, code_size, entry, + filename); + fflush(method_file); +} + +_PyPerf_Callbacks _Py_perfmap_callbacks = { + &perf_map_get_file, + &perf_map_write_entry, + &perf_map_close +}; + +static int +new_code_arena(void) +{ + // non-trivial programs typically need 64 to 256 kiB. + size_t mem_size = 4096 * 16; + assert(mem_size % sysconf(_SC_PAGESIZE) == 0); + char *memory = + mmap(NULL, // address + mem_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, // fd (not used here) + 0); // offset (not used here) + if (!memory) { + PyErr_SetFromErrno(PyExc_OSError); + _PyErr_WriteUnraisableMsg( + "Failed to create new mmap for perf trampoline", NULL); + perf_status = PERF_STATUS_FAILED; + return -1; + } + void *start = &_Py_trampoline_func_start; + void *end = &_Py_trampoline_func_end; + size_t code_size = end - start; + + size_t n_copies = mem_size / code_size; + for (size_t i = 0; i < n_copies; i++) { + memcpy(memory + i * code_size, start, code_size * sizeof(char)); + } + // Some systems may prevent us from creating executable code on the fly. + int res = mprotect(memory, mem_size, PROT_READ | PROT_EXEC); + if (res == -1) { + PyErr_SetFromErrno(PyExc_OSError); + munmap(memory, mem_size); + _PyErr_WriteUnraisableMsg( + "Failed to set mmap for perf trampoline to PROT_READ | PROT_EXEC", + NULL); + return -1; + } + + code_arena_t *new_arena = PyMem_RawCalloc(1, sizeof(code_arena_t)); + if (new_arena == NULL) { + PyErr_NoMemory(); + munmap(memory, mem_size); + _PyErr_WriteUnraisableMsg("Failed to allocate new code arena struct", + NULL); + return -1; + } + + new_arena->start_addr = memory; + new_arena->current_addr = memory; + new_arena->size = mem_size; + new_arena->size_left = mem_size; + new_arena->code_size = code_size; + new_arena->prev = code_arena; + code_arena = new_arena; + return 0; +} + +static void +free_code_arenas(void) +{ + code_arena_t *cur = code_arena; + code_arena_t *prev; + code_arena = NULL; // invalid static pointer + while (cur) { + munmap(cur->start_addr, cur->size); + prev = cur->prev; + PyMem_RawFree(cur); + cur = prev; + } +} + +static inline py_trampoline +code_arena_new_code(code_arena_t *code_arena) +{ + py_trampoline trampoline = (py_trampoline)code_arena->current_addr; + code_arena->size_left -= code_arena->code_size; + code_arena->current_addr += code_arena->code_size; + return trampoline; +} + +static inline py_trampoline +compile_trampoline(void) +{ + if ((code_arena == NULL) || + (code_arena->size_left <= code_arena->code_size)) { + if (new_code_arena() < 0) { + return NULL; + } + } + assert(code_arena->size_left <= code_arena->size); + return code_arena_new_code(code_arena); +} + +static PyObject * +py_trampoline_evaluator(PyThreadState *ts, _PyInterpreterFrame *frame, + int throw) +{ + if (perf_status == PERF_STATUS_FAILED || + perf_status == PERF_STATUS_NO_INIT) { + goto default_eval; + } + PyCodeObject *co = frame->f_code; + py_trampoline f = NULL; + assert(extra_code_index != -1); + int ret = _PyCode_GetExtra((PyObject *)co, extra_code_index, (void **)&f); + if (ret != 0 || f == NULL) { + // This is the first time we see this code object so we need + // to compile a trampoline for it. + py_trampoline new_trampoline = compile_trampoline(); + if (new_trampoline == NULL) { + goto default_eval; + } + trampoline_api.write_state(trampoline_api.state, new_trampoline, + code_arena->code_size, co); + _PyCode_SetExtra((PyObject *)co, extra_code_index, + (void *)new_trampoline); + f = new_trampoline; + } + assert(f != NULL); + return f(ts, frame, throw, _PyEval_EvalFrameDefault); +default_eval: + // Something failed, fall back to the default evaluator. + return _PyEval_EvalFrameDefault(ts, frame, throw); +} +#endif // PY_HAVE_PERF_TRAMPOLINE + +int +_PyIsPerfTrampolineActive(void) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + PyThreadState *tstate = _PyThreadState_GET(); + return tstate->interp->eval_frame == py_trampoline_evaluator; +#endif + return 0; +} + +void +_PyPerfTrampoline_GetCallbacks(_PyPerf_Callbacks *callbacks) +{ + if (callbacks == NULL) { + return; + } +#ifdef PY_HAVE_PERF_TRAMPOLINE + callbacks->init_state = trampoline_api.init_state; + callbacks->write_state = trampoline_api.write_state; + callbacks->free_state = trampoline_api.free_state; +#endif + return; +} + +int +_PyPerfTrampoline_SetCallbacks(_PyPerf_Callbacks *callbacks) +{ + if (callbacks == NULL) { + return -1; + } +#ifdef PY_HAVE_PERF_TRAMPOLINE + if (trampoline_api.state) { + _PyPerfTrampoline_Fini(); + } + trampoline_api.init_state = callbacks->init_state; + trampoline_api.write_state = callbacks->write_state; + trampoline_api.free_state = callbacks->free_state; + trampoline_api.state = NULL; + perf_status = PERF_STATUS_OK; +#endif + return 0; +} + +int +_PyPerfTrampoline_Init(int activate) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate->interp->eval_frame && + tstate->interp->eval_frame != py_trampoline_evaluator) { + PyErr_SetString(PyExc_RuntimeError, + "Trampoline cannot be initialized as a custom eval " + "frame is already present"); + return -1; + } + if (!activate) { + tstate->interp->eval_frame = NULL; + } + else { + tstate->interp->eval_frame = py_trampoline_evaluator; + if (new_code_arena() < 0) { + return -1; + } + if (trampoline_api.state == NULL) { + void *state = trampoline_api.init_state(); + if (state == NULL) { + return -1; + } + trampoline_api.state = state; + } + extra_code_index = _PyEval_RequestCodeExtraIndex(NULL); + if (extra_code_index == -1) { + return -1; + } + perf_status = PERF_STATUS_OK; + } +#endif + return 0; +} + +int +_PyPerfTrampoline_Fini(void) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + PyThreadState *tstate = _PyThreadState_GET(); + if (tstate->interp->eval_frame == py_trampoline_evaluator) { + tstate->interp->eval_frame = NULL; + } + free_code_arenas(); + if (trampoline_api.state != NULL) { + trampoline_api.free_state(trampoline_api.state); + trampoline_api.state = NULL; + } + extra_code_index = -1; +#endif + return 0; +} + +PyStatus +_PyPerfTrampoline_AfterFork_Child(void) +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + // Restart trampoline in file in child. + int was_active = _PyIsPerfTrampolineActive(); + _PyPerfTrampoline_Fini(); + if (was_active) { + _PyPerfTrampoline_Init(1); + } +#endif + return PyStatus_Ok(); +} diff --git a/PCbuild/_freeze_module.vcxproj b/PCbuild/_freeze_module.vcxproj index 4c007297112..0e446fe46eb 100644 --- a/PCbuild/_freeze_module.vcxproj +++ b/PCbuild/_freeze_module.vcxproj @@ -129,6 +129,7 @@ + diff --git a/PCbuild/_freeze_module.vcxproj.filters b/PCbuild/_freeze_module.vcxproj.filters index 5c984999c0c..96ab2f2a4aa 100644 --- a/PCbuild/_freeze_module.vcxproj.filters +++ b/PCbuild/_freeze_module.vcxproj.filters @@ -85,6 +85,9 @@ Source Files + + Source Files + Source Files diff --git a/PCbuild/pythoncore.vcxproj b/PCbuild/pythoncore.vcxproj index 45e5013e61f..ff17304032c 100644 --- a/PCbuild/pythoncore.vcxproj +++ b/PCbuild/pythoncore.vcxproj @@ -429,6 +429,7 @@ + diff --git a/PCbuild/pythoncore.vcxproj.filters b/PCbuild/pythoncore.vcxproj.filters index 581ea6e3c58..7d7fe7267c8 100644 --- a/PCbuild/pythoncore.vcxproj.filters +++ b/PCbuild/pythoncore.vcxproj.filters @@ -923,6 +923,9 @@ Objects + + Objects + Objects diff --git a/Python/clinic/sysmodule.c.h b/Python/clinic/sysmodule.c.h index 0f9636690a4..ddf01a7ccdd 100644 --- a/Python/clinic/sysmodule.c.h +++ b/Python/clinic/sysmodule.c.h @@ -1151,6 +1151,79 @@ sys_getandroidapilevel(PyObject *module, PyObject *Py_UNUSED(ignored)) #endif /* defined(ANDROID_API_LEVEL) */ +PyDoc_STRVAR(sys_activate_stack_trampoline__doc__, +"activate_stack_trampoline($module, backend, /)\n" +"--\n" +"\n" +"Activate the perf profiler trampoline."); + +#define SYS_ACTIVATE_STACK_TRAMPOLINE_METHODDEF \ + {"activate_stack_trampoline", (PyCFunction)sys_activate_stack_trampoline, METH_O, sys_activate_stack_trampoline__doc__}, + +static PyObject * +sys_activate_stack_trampoline_impl(PyObject *module, const char *backend); + +static PyObject * +sys_activate_stack_trampoline(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + const char *backend; + + if (!PyUnicode_Check(arg)) { + _PyArg_BadArgument("activate_stack_trampoline", "argument", "str", arg); + goto exit; + } + Py_ssize_t backend_length; + backend = PyUnicode_AsUTF8AndSize(arg, &backend_length); + if (backend == NULL) { + goto exit; + } + if (strlen(backend) != (size_t)backend_length) { + PyErr_SetString(PyExc_ValueError, "embedded null character"); + goto exit; + } + return_value = sys_activate_stack_trampoline_impl(module, backend); + +exit: + return return_value; +} + +PyDoc_STRVAR(sys_deactivate_stack_trampoline__doc__, +"deactivate_stack_trampoline($module, /)\n" +"--\n" +"\n" +"Dectivate the perf profiler trampoline."); + +#define SYS_DEACTIVATE_STACK_TRAMPOLINE_METHODDEF \ + {"deactivate_stack_trampoline", (PyCFunction)sys_deactivate_stack_trampoline, METH_NOARGS, sys_deactivate_stack_trampoline__doc__}, + +static PyObject * +sys_deactivate_stack_trampoline_impl(PyObject *module); + +static PyObject * +sys_deactivate_stack_trampoline(PyObject *module, PyObject *Py_UNUSED(ignored)) +{ + return sys_deactivate_stack_trampoline_impl(module); +} + +PyDoc_STRVAR(sys_is_stack_trampoline_active__doc__, +"is_stack_trampoline_active($module, /)\n" +"--\n" +"\n" +"Returns *True* if the perf profiler trampoline is active."); + +#define SYS_IS_STACK_TRAMPOLINE_ACTIVE_METHODDEF \ + {"is_stack_trampoline_active", (PyCFunction)sys_is_stack_trampoline_active, METH_NOARGS, sys_is_stack_trampoline_active__doc__}, + +static PyObject * +sys_is_stack_trampoline_active_impl(PyObject *module); + +static PyObject * +sys_is_stack_trampoline_active(PyObject *module, PyObject *Py_UNUSED(ignored)) +{ + return sys_is_stack_trampoline_active_impl(module); +} + #ifndef SYS_GETWINDOWSVERSION_METHODDEF #define SYS_GETWINDOWSVERSION_METHODDEF #endif /* !defined(SYS_GETWINDOWSVERSION_METHODDEF) */ @@ -1194,4 +1267,4 @@ sys_getandroidapilevel(PyObject *module, PyObject *Py_UNUSED(ignored)) #ifndef SYS_GETANDROIDAPILEVEL_METHODDEF #define SYS_GETANDROIDAPILEVEL_METHODDEF #endif /* !defined(SYS_GETANDROIDAPILEVEL_METHODDEF) */ -/*[clinic end generated code: output=322fb0409e376ad4 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=43b44240211afe95 input=a9049054013a1b77]*/ diff --git a/Python/initconfig.c b/Python/initconfig.c index 70f0363297f..33a8f276b19 100644 --- a/Python/initconfig.c +++ b/Python/initconfig.c @@ -118,6 +118,11 @@ The following implementation-specific options are available:\n\ files are desired as well as suppressing the extra visual location indicators \n\ when the interpreter displays tracebacks.\n\ \n\ +-X perf: activate support for the Linux \"perf\" profiler by activating the \"perf\"\n\ + trampoline. When this option is activated, the Linux \"perf\" profiler will be \n\ + able to report Python calls. This option is only available on some platforms and will \n\ + do nothing if is not supported on the current system. The default value is \"off\".\n\ +\n\ -X frozen_modules=[on|off]: whether or not frozen modules should be used.\n\ The default is \"on\" (or \"off\" if you are running a local build)."; @@ -745,6 +750,7 @@ _PyConfig_InitCompatConfig(PyConfig *config) config->use_hash_seed = -1; config->faulthandler = -1; config->tracemalloc = -1; + config->perf_profiling = -1; config->module_search_paths_set = 0; config->parse_argv = 0; config->site_import = -1; @@ -829,6 +835,7 @@ PyConfig_InitIsolatedConfig(PyConfig *config) config->use_hash_seed = 0; config->faulthandler = 0; config->tracemalloc = 0; + config->perf_profiling = 0; config->safe_path = 1; config->pathconfig_warnings = 0; #ifdef MS_WINDOWS @@ -940,6 +947,7 @@ _PyConfig_Copy(PyConfig *config, const PyConfig *config2) COPY_ATTR(_install_importlib); COPY_ATTR(faulthandler); COPY_ATTR(tracemalloc); + COPY_ATTR(perf_profiling); COPY_ATTR(import_time); COPY_ATTR(code_debug_ranges); COPY_ATTR(show_ref_count); @@ -1050,6 +1058,7 @@ _PyConfig_AsDict(const PyConfig *config) SET_ITEM_UINT(hash_seed); SET_ITEM_INT(faulthandler); SET_ITEM_INT(tracemalloc); + SET_ITEM_INT(perf_profiling); SET_ITEM_INT(import_time); SET_ITEM_INT(code_debug_ranges); SET_ITEM_INT(show_ref_count); @@ -1331,6 +1340,7 @@ _PyConfig_FromDict(PyConfig *config, PyObject *dict) CHECK_VALUE("hash_seed", config->hash_seed <= MAX_HASH_SEED); GET_UINT(faulthandler); GET_UINT(tracemalloc); + GET_UINT(perf_profiling); GET_UINT(import_time); GET_UINT(code_debug_ranges); GET_UINT(show_ref_count); @@ -1687,6 +1697,26 @@ config_read_env_vars(PyConfig *config) return _PyStatus_OK(); } +static PyStatus +config_init_perf_profiling(PyConfig *config) +{ + int active = 0; + const char *env = config_get_env(config, "PYTHONPERFSUPPORT"); + if (env) { + if (_Py_str_to_int(env, &active) != 0) { + active = 0; + } + if (active) { + config->perf_profiling = 1; + } + } + const wchar_t *xoption = config_get_xoption(config, L"perf"); + if (xoption) { + config->perf_profiling = 1; + } + return _PyStatus_OK(); + +} static PyStatus config_init_tracemalloc(PyConfig *config) @@ -1788,6 +1818,12 @@ config_read_complex_options(PyConfig *config) return status; } } + if (config->perf_profiling < 0) { + status = config_init_perf_profiling(config); + if (_PyStatus_EXCEPTION(status)) { + return status; + } + } if (config->pycache_prefix == NULL) { status = config_init_pycache_prefix(config); @@ -2104,6 +2140,9 @@ config_read(PyConfig *config, int compute_path_config) if (config->tracemalloc < 0) { config->tracemalloc = 0; } + if (config->perf_profiling < 0) { + config->perf_profiling = 0; + } if (config->use_hash_seed < 0) { config->use_hash_seed = 0; config->hash_seed = 0; diff --git a/Python/pylifecycle.c b/Python/pylifecycle.c index bb646f1a0ee..8ce6d71651c 100644 --- a/Python/pylifecycle.c +++ b/Python/pylifecycle.c @@ -1149,6 +1149,16 @@ init_interp_main(PyThreadState *tstate) if (_PyTraceMalloc_Init(config->tracemalloc) < 0) { return _PyStatus_ERR("can't initialize tracemalloc"); } + + +#ifdef PY_HAVE_PERF_TRAMPOLINE + if (config->perf_profiling) { + if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_callbacks) < 0 || + _PyPerfTrampoline_Init(config->perf_profiling) < 0) { + return _PyStatus_ERR("can't initialize the perf trampoline"); + } + } +#endif } status = init_sys_streams(tstate); @@ -1723,6 +1733,7 @@ finalize_interp_clear(PyThreadState *tstate) _PyArg_Fini(); _Py_ClearFileSystemEncoding(); _Py_Deepfreeze_Fini(); + _PyPerfTrampoline_Fini(); } finalize_interp_types(tstate->interp); diff --git a/Python/sysmodule.c b/Python/sysmodule.c index c2864387941..75e64553d88 100644 --- a/Python/sysmodule.c +++ b/Python/sysmodule.c @@ -2053,6 +2053,80 @@ sys_getandroidapilevel_impl(PyObject *module) } #endif /* ANDROID_API_LEVEL */ +/*[clinic input] +sys.activate_stack_trampoline + + backend: str + / + +Activate the perf profiler trampoline. +[clinic start generated code]*/ + +static PyObject * +sys_activate_stack_trampoline_impl(PyObject *module, const char *backend) +/*[clinic end generated code: output=5783cdeb51874b43 input=b09020e3a17c78c5]*/ +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + if (strcmp(backend, "perf") == 0) { + _PyPerf_Callbacks cur_cb; + _PyPerfTrampoline_GetCallbacks(&cur_cb); + if (cur_cb.init_state != _Py_perfmap_callbacks.init_state) { + if (_PyPerfTrampoline_SetCallbacks(&_Py_perfmap_callbacks) < 0 ) { + PyErr_SetString(PyExc_ValueError, "can't activate perf trampoline"); + return NULL; + } + } + } + else { + PyErr_Format(PyExc_ValueError, "invalid backend: %s", backend); + return NULL; + } + if (_PyPerfTrampoline_Init(1) < 0) { + return NULL; + } + Py_RETURN_NONE; +#else + PyErr_SetString(PyExc_ValueError, "perf trampoline not available"); + return NULL; +#endif +} + + +/*[clinic input] +sys.deactivate_stack_trampoline + +Dectivate the perf profiler trampoline. +[clinic start generated code]*/ + +static PyObject * +sys_deactivate_stack_trampoline_impl(PyObject *module) +/*[clinic end generated code: output=b50da25465df0ef1 input=491f4fc1ed615736]*/ +{ + if (_PyPerfTrampoline_Init(0) < 0) { + return NULL; + } + Py_RETURN_NONE; +} + +/*[clinic input] +sys.is_stack_trampoline_active + +Returns *True* if the perf profiler trampoline is active. +[clinic start generated code]*/ + +static PyObject * +sys_is_stack_trampoline_active_impl(PyObject *module) +/*[clinic end generated code: output=ab2746de0ad9d293 input=061fa5776ac9dd59]*/ +{ +#ifdef PY_HAVE_PERF_TRAMPOLINE + if (_PyIsPerfTrampolineActive()) { + Py_RETURN_TRUE; + } +#endif + Py_RETURN_FALSE; +} + + static PyMethodDef sys_methods[] = { /* Might as well keep this in alphabetic order */ SYS_ADDAUDITHOOK_METHODDEF @@ -2108,6 +2182,9 @@ static PyMethodDef sys_methods[] = { METH_VARARGS | METH_KEYWORDS, set_asyncgen_hooks_doc}, SYS_GET_ASYNCGEN_HOOKS_METHODDEF SYS_GETANDROIDAPILEVEL_METHODDEF + SYS_ACTIVATE_STACK_TRAMPOLINE_METHODDEF + SYS_DEACTIVATE_STACK_TRAMPOLINE_METHODDEF + SYS_IS_STACK_TRAMPOLINE_ACTIVE_METHODDEF SYS_UNRAISABLEHOOK_METHODDEF #ifdef Py_STATS SYS__STATS_ON_METHODDEF diff --git a/configure b/configure index fc7f7fadedb..9522977c8c7 100755 --- a/configure +++ b/configure @@ -861,6 +861,7 @@ LIBEXPAT_CFLAGS TZPATH LIBUUID_LIBS LIBUUID_CFLAGS +PERF_TRAMPOLINE_OBJ SHLIBS CFLAGSFORSHARED LINKFORSHARED @@ -11498,6 +11499,35 @@ esac { $as_echo "$as_me:${as_lineno-$LINENO}: result: $SHLIBS" >&5 $as_echo "$SHLIBS" >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking perf trampoline" >&5 +$as_echo_n "checking perf trampoline... " >&6; } +case $PLATFORM_TRIPLET in #( + x86_64-linux-gnu) : + perf_trampoline=yes ;; #( + aarch64-linux-gnu) : + perf_trampoline=yes ;; #( + *) : + perf_trampoline=no + ;; +esac +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $perf_trampoline" >&5 +$as_echo "$perf_trampoline" >&6; } + +if test "x$perf_trampoline" = xyes; then : + + +$as_echo "#define PY_HAVE_PERF_TRAMPOLINE 1" >>confdefs.h + + PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o + + if test "x$Py_DEBUG" = xtrue; then : + + as_fn_append BASECFLAGS " -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer" + +fi + +fi + # checks for libraries { $as_echo "$as_me:${as_lineno-$LINENO}: checking for sendfile in -lsendfile" >&5 diff --git a/configure.ac b/configure.ac index 2b927cd961f..3a009bb5042 100644 --- a/configure.ac +++ b/configure.ac @@ -3452,6 +3452,26 @@ case "$ac_sys_system" in esac AC_MSG_RESULT($SHLIBS) +dnl perf trampoline is Linux specific and requires an arch-specific +dnl trampoline in asssembly. +AC_MSG_CHECKING([perf trampoline]) +AS_CASE([$PLATFORM_TRIPLET], + [x86_64-linux-gnu], [perf_trampoline=yes], + [aarch64-linux-gnu], [perf_trampoline=yes], + [perf_trampoline=no] +) +AC_MSG_RESULT([$perf_trampoline]) + +AS_VAR_IF([perf_trampoline], [yes], [ + AC_DEFINE([PY_HAVE_PERF_TRAMPOLINE], [1], [Define to 1 if you have the perf trampoline.]) + PERF_TRAMPOLINE_OBJ=Objects/asm_trampoline.o + + dnl perf needs frame pointers for unwinding, include compiler option in debug builds + AS_VAR_IF([Py_DEBUG], [true], [ + AS_VAR_APPEND([BASECFLAGS], [" -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer"]) + ]) +]) +AC_SUBST([PERF_TRAMPOLINE_OBJ]) # checks for libraries AC_CHECK_LIB(sendfile, sendfile) diff --git a/pyconfig.h.in b/pyconfig.h.in index 10e7ad12fa9..1ce09855f55 100644 --- a/pyconfig.h.in +++ b/pyconfig.h.in @@ -1568,6 +1568,9 @@ /* Define if you want to coerce the C locale to a UTF-8 based locale */ #undef PY_COERCE_C_LOCALE +/* Define to 1 if you have the perf trampoline. */ +#undef PY_HAVE_PERF_TRAMPOLINE + /* Define to 1 to build the sqlite module with loadable extensions support. */ #undef PY_SQLITE_ENABLE_LOAD_EXTENSION