gh-136459: Add perf trampoline support for macOS (#136461)

2025-11-24 12:20:42 +00:00 · 2025-07-22 17:47:24 +02:00 · 2025-07-22 17:47:24 +02:00 · a667800558
commit a667800558
parent b6d3242244
10 changed files with 351 additions and 27 deletions
--- a/Doc/c-api/perfmaps.rst
+++ b/Doc/c-api/perfmaps.rst
@ -5,11 +5,12 @@
 Support for Perf Maps
 ----------------------

-On supported platforms (as of this writing, only Linux), the runtime can take
+On supported platforms (Linux and macOS), the runtime can take
 advantage of *perf map files* to make Python functions visible to an external
-profiling tool (such as `perf <https://perf.wiki.kernel.org/index.php/Main_Page>`_).
-A running process may create a file in the ``/tmp`` directory, which contains entries
-that can map a section of executable code to a name. This interface is described in the
+profiling tool (such as `perf <https://perf.wiki.kernel.org/index.php/Main_Page>`_ or
+`samply <https://github.com/mstange/samply/>`_). A running process may create a
+file in the ``/tmp`` directory, which contains entries that can map a section
+of executable code to a name. This interface is described in the
 `documentation of the Linux Perf tool <https://git.kernel.org/pub/scm/linux/
 kernel/git/torvalds/linux.git/tree/tools/perf/Documentation/jit-interface.txt>`_.

--- a/Doc/howto/perf_profiling.rst
+++ b/Doc/howto/perf_profiling.rst
@ -2,34 +2,35 @@

 .. _perf_profiling:

-==============================================
-Python support for the Linux ``perf`` profiler
-==============================================
+========================================================
+Python support for the ``perf map`` compatible profilers
+========================================================

 :author: Pablo Galindo

-`The Linux perf profiler <https://perf.wiki.kernel.org>`_
-is a very powerful tool that allows you to profile and obtain
-information about the performance of your application.
-``perf`` also has a very vibrant ecosystem of tools
-that aid with the analysis of the data that it produces.
+`The Linux perf profiler <https://perf.wiki.kernel.org>`_ and
+`samply <https://github.com/mstange/samply>`_ are powerful tools that allow you to
+profile and obtain information about the performance of your application.
+Both tools have vibrant ecosystems that aid with the analysis of the data they produce.

-The main problem with using the ``perf`` profiler with Python applications is that
-``perf`` only gets information about native symbols, that is, the names of
+The main problem with using these profilers with Python applications is that
+they only get information about native symbols, that is, the names of
 functions and procedures written in C. This means that the names and file names
-of Python functions in your code will not appear in the output of ``perf``.
+of Python functions in your code will not appear in the profiler output.

 Since Python 3.12, the interpreter can run in a special mode that allows Python
-functions to appear in the output of the ``perf`` profiler. When this mode is
+functions to appear in the output of compatible profilers. When this mode is
 enabled, the interpreter will interpose a small piece of code compiled on the
-fly before the execution of every Python function and it will teach ``perf`` the
+fly before the execution of every Python function and it will teach the profiler the
 relationship between this piece of code and the associated Python function using
 :doc:`perf map files <../c-api/perfmaps>`.

 .. note::

-    Support for the ``perf`` profiler is currently only available for Linux on
-    select architectures. Check the output of the ``configure`` build step or
+    Support for profiling is available on Linux and macOS on select architectures.
+    Perf is available on Linux, while samply can be used on both Linux and macOS.
+    samply support on macOS is available starting from Python 3.15.
+    Check the output of the ``configure`` build step or
    check the output of ``python -m sysconfig | grep HAVE_PERF_TRAMPOLINE``
    to see if your system is supported.

@ -148,6 +149,31 @@ Instead, if we run the same experiment with ``perf`` support enabled we get:



+Using the samply profiler
+-------------------------
+
+samply is a modern profiler that can be used as an alternative to perf.
+It uses the same perf map files that Python generates, making it compatible
+with Python's profiling support. samply is particularly useful on macOS
+where perf is not available.
+
+To use samply with Python, first install it following the instructions at
+https://github.com/mstange/samply, then run::
+
+    $ samply record PYTHONPERFSUPPORT=1 python my_script.py
+
+This will open a web interface where you can analyze the profiling data
+interactively. The advantage of samply is that it provides a modern
+web-based interface for analyzing profiling data and works on both Linux
+and macOS.
+
+On macOS, samply support requires Python 3.15 or later. Also on macOS, samply
+can't profile signed Python executables due to restrictions by macOS. You can
+profile with Python binaries that you've compiled yourself, or which are
+unsigned or locally-signed (such as anything installed by Homebrew). In
+order to attach to running processes on macOS, run ``samply setup`` once (and
+every time samply is updated) to self-sign the samply binary.
+
 How to enable ``perf`` profiling support
 ----------------------------------------

--- a/Lib/test/test_perfmaps.py
+++ b/Lib/test/test_perfmaps.py
@ -1,5 +1,5 @@
 import os
-import sys
+import sysconfig
 import unittest

 try:
@ -7,10 +7,14 @@ try:
 except ImportError:
    raise unittest.SkipTest("requires _testinternalcapi")

+def supports_trampoline_profiling():
+    """Return True if this build was configured with the perf trampoline."""
+    flag = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE")
+    # An unset, empty or zero config value means the trampoline is unavailable.
+    return bool(flag) and int(flag) == 1

-if sys.platform != 'linux':
-    raise unittest.SkipTest('Linux only')
-
+if not supports_trampoline_profiling():
+    raise unittest.SkipTest("perf trampoline profiling not supported")

 class TestPerfMapWriting(unittest.TestCase):
    def test_write_perf_map_entry(self):
--- a/Lib/test/test_samply_profiler.py
+++ b/Lib/test/test_samply_profiler.py
@ -0,0 +1,244 @@
+import unittest
+import subprocess
+import sys
+import sysconfig
+import os
+import pathlib
+from test import support
+from test.support.script_helper import (
+    make_script,
+)
+from test.support.os_helper import temp_dir
+
+
+if not support.has_subprocess_support:
+    raise unittest.SkipTest("test module requires subprocess")
+
+if support.check_sanitizer(address=True, memory=True, ub=True, function=True):
+    # gh-109580: Skip the test because it does crash randomly if Python is
+    # built with ASAN.
+    raise unittest.SkipTest("test crash randomly on ASAN/MSAN/UBSAN build")
+
+
+def supports_trampoline_profiling():
+    """Tell whether sysconfig reports PY_HAVE_PERF_TRAMPOLINE as 1."""
+    configured = sysconfig.get_config_var("PY_HAVE_PERF_TRAMPOLINE")
+    # A missing or falsy value short-circuits before the int() conversion.
+    return bool(configured) and int(configured) == 1
+
+
+if not supports_trampoline_profiling():
+    raise unittest.SkipTest("perf trampoline profiling not supported")
+
+
+def samply_command_works():
+    """Return True if ``samply`` is installed and can record a profile.
+
+    Any failure to launch or run samply is treated as "not available" so
+    that the tests are skipped instead of erroring out.
+    """
+    try:
+        # Cheap availability probe: fails fast when samply is not installed.
+        subprocess.check_output(
+            ["samply", "--help"], text=True, stderr=subprocess.STDOUT
+        )
+    except (subprocess.SubprocessError, OSError):
+        return False
+
+    # Check that we can run a simple samply run
+    with temp_dir() as script_dir:
+        try:
+            output_file = script_dir + "/profile.json.gz"
+            cmd = (
+                "samply",
+                "record",
+                "--save-only",
+                "--output",
+                output_file,
+                sys.executable,
+                "-c",
+                'print("hello")',
+            )
+            env = {**os.environ, "PYTHON_JIT": "0"}
+            stdout = subprocess.check_output(
+                cmd, cwd=script_dir, text=True, stderr=subprocess.STDOUT, env=env
+            )
+        except (subprocess.SubprocessError, OSError):
+            return False
+
+        # The profiled interpreter's own output must make it through samply.
+        if "hello" not in stdout:
+            return False
+
+    return True
+
+
+def run_samply(cwd, *args, **env_vars):
+    """Profile a command under ``samply record`` and return the profile text.
+
+    The gzipped profile is written to *cwd*; *args* is the command to run and
+    *env_vars* are extra environment variables. Raises ValueError when samply
+    exits with a non-zero return code.
+    """
+    # PYTHON_JIT=0 is applied last so that callers cannot override it.
+    env = {**os.environ, **env_vars, "PYTHON_JIT": "0"}
+    output_file = os.path.join(cwd, "profile.json.gz")
+    base_cmd = ("samply", "record", "--save-only", "-o", output_file)
+    proc = subprocess.run(
+        base_cmd + args,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        env=env,
+    )
+    if proc.returncode:
+        # Decode before printing: printing raw bytes shows a b'...' repr and
+        # raises BytesWarning when Python runs with -bb.
+        print(proc.stderr.decode("utf-8", "replace"), file=sys.stderr)
+        raise ValueError(f"Samply failed with return code {proc.returncode}")
+
+    import gzip
+    with gzip.open(output_file, mode="rt", encoding="utf-8") as f:
+        return f.read()
+
+
+@unittest.skipUnless(samply_command_works(), "samply command doesn't work")
+class TestSamplyProfilerMixin:
+    # Shared samply tests. Concrete test classes must provide run_samply().
+
+    def run_samply(self, script_dir, script, activate_trampoline=True):
+        # Hook for subclasses: profile *script* under samply and return the
+        # profile as text. The signature matches the calls made by the tests
+        # in this mixin.
+        raise NotImplementedError()
+
+    def test_python_calls_appear_in_the_stack_if_perf_activated(self):
+        with temp_dir() as script_dir:
+            code = """if 1:
+                def foo(n):
+                    x = 0
+                    for i in range(n):
+                        x += i
+
+                def bar(n):
+                    foo(n)
+
+                def baz(n):
+                    bar(n)
+
+                baz(10000000)
+                """
+            script = make_script(script_dir, "perftest", code)
+            output = self.run_samply(script_dir, script)
+
+            self.assertIn(f"py::foo:{script}", output)
+            self.assertIn(f"py::bar:{script}", output)
+            self.assertIn(f"py::baz:{script}", output)
+
+    def test_python_calls_do_not_appear_in_the_stack_if_perf_deactivated(self):
+        with temp_dir() as script_dir:
+            code = """if 1:
+                def foo(n):
+                    x = 0
+                    for i in range(n):
+                        x += i
+
+                def bar(n):
+                    foo(n)
+
+                def baz(n):
+                    bar(n)
+
+                baz(10000000)
+                """
+            script = make_script(script_dir, "perftest", code)
+            output = self.run_samply(
+                script_dir, script, activate_trampoline=False
+            )
+
+            self.assertNotIn(f"py::foo:{script}", output)
+            self.assertNotIn(f"py::bar:{script}", output)
+            self.assertNotIn(f"py::baz:{script}", output)
+
+
+# Concrete samply test case: runs the shared mixin tests against
+# sys.executable and removes the perf map files each test leaves in /tmp.
+@unittest.skipUnless(samply_command_works(), "samply command doesn't work")
+class TestSamplyProfiler(unittest.TestCase, TestSamplyProfilerMixin):
+    # Profile *script* with samply; "-Xperf" is passed to the interpreter
+    # only when activate_trampoline is true.
+    def run_samply(self, script_dir, script, activate_trampoline=True):
+        if activate_trampoline:
+            return run_samply(script_dir, sys.executable, "-Xperf", script)
+        return run_samply(script_dir, sys.executable, script)
+
+    def setUp(self):
+        super().setUp()
+        # Snapshot the perf map files that already exist so that tearDown()
+        # deletes only the ones that appeared during the test.
+        self.perf_files = set(pathlib.Path("/tmp/").glob("perf-*.map"))
+
+    def tearDown(self) -> None:
+        super().tearDown()
+        # Everything matching perf-*.map that was not present at setUp().
+        files_to_delete = (
+            set(pathlib.Path("/tmp/").glob("perf-*.map")) - self.perf_files
+        )
+        for file in files_to_delete:
+            file.unlink()
+
+    # Trampolines compiled before os.fork() must be listed in the perf map
+    # files of both the parent and the forked child.
+    def test_pre_fork_compile(self):
+        # The script enables persist-after-fork, compiles a trampoline for
+        # every global function, then forks; the child prints its own pid.
+        code = """if 1:
+                import sys
+                import os
+                import sysconfig
+                from _testinternalcapi import (
+                    compile_perf_trampoline_entry,
+                    perf_trampoline_set_persist_after_fork,
+                )
+
+                def foo_fork():
+                    pass
+
+                def bar_fork():
+                    foo_fork()
+
+                def foo():
+                    import time; time.sleep(1)
+
+                def bar():
+                    foo()
+
+                def compile_trampolines_for_all_functions():
+                    perf_trampoline_set_persist_after_fork(1)
+                    for _, obj in globals().items():
+                        if callable(obj) and hasattr(obj, '__code__'):
+                            compile_perf_trampoline_entry(obj.__code__)
+
+                if __name__ == "__main__":
+                    compile_trampolines_for_all_functions()
+                    pid = os.fork()
+                    if pid == 0:
+                        print(os.getpid())
+                        bar_fork()
+                    else:
+                        bar()
+                """
+
+        with temp_dir() as script_dir:
+            script = make_script(script_dir, "perftest", code)
+            env = {**os.environ, "PYTHON_JIT": "0"}
+            with subprocess.Popen(
+                [sys.executable, "-Xperf", script],
+                universal_newlines=True,
+                stderr=subprocess.PIPE,
+                stdout=subprocess.PIPE,
+                env=env,
+            ) as process:
+                stdout, stderr = process.communicate()
+
+        self.assertEqual(process.returncode, 0)
+        self.assertNotIn("Error:", stderr)
+        # stdout holds only the child's pid, printed by the forked branch.
+        child_pid = int(stdout.strip())
+        perf_file = pathlib.Path(f"/tmp/perf-{process.pid}.map")
+        perf_child_file = pathlib.Path(f"/tmp/perf-{child_pid}.map")
+        self.assertTrue(perf_file.exists())
+        self.assertTrue(perf_child_file.exists())
+
+        # The parent's map must list all four pre-compiled functions.
+        perf_file_contents = perf_file.read_text()
+        self.assertIn(f"py::foo:{script}", perf_file_contents)
+        self.assertIn(f"py::bar:{script}", perf_file_contents)
+        self.assertIn(f"py::foo_fork:{script}", perf_file_contents)
+        self.assertIn(f"py::bar_fork:{script}", perf_file_contents)
+
+        # The child's map is only checked for the *_fork functions.
+        child_perf_file_contents = perf_child_file.read_text()
+        self.assertIn(f"py::foo_fork:{script}", child_perf_file_contents)
+        self.assertIn(f"py::bar_fork:{script}", child_perf_file_contents)
+
+        # Pre-compiled perf-map entries of a forked process must be
+        # identical in both the parent and child perf-map files.
+        perf_file_lines = perf_file_contents.split("\n")
+        for line in perf_file_lines:
+            if f"py::foo_fork:{script}" in line or f"py::bar_fork:{script}" in line:
+                self.assertIn(line, child_perf_file_contents)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/Misc/ACKS
+++ b/Misc/ACKS
@ -43,6 +43,7 @@ Ray Allen
 Billy G. Allie
 Jamiel Almeida
 Kevin Altis
+Nazım Can Altınova
 Samy Lahfa
 Skyler Leigh Amador
 Joe Amenta
--- a/Misc/NEWS.d/next/Core_and_Builtins/2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst
+++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-07-09-11-15-42.gh-issue-136459.m4Udh8.rst
@ -0,0 +1,3 @@
+Add support for perf trampoline on macOS, to allow profilers with JIT map
+support to read Python calls. While profiling, ``PYTHONPERFSUPPORT=1`` can
+be appended to enable the trampoline.
--- a/Python/asm_trampoline.S
+++ b/Python/asm_trampoline.S
@ -1,5 +1,9 @@
    .text
+#if defined(__APPLE__)
+    .globl	__Py_trampoline_func_start
+#else
    .globl	_Py_trampoline_func_start
+#endif
 # The following assembly is equivalent to:
 # PyObject *
 # trampoline(PyThreadState *ts, _PyInterpreterFrame *f,
@ -7,7 +11,11 @@
 # {
 #     return evaluator(ts, f, throwflag);
 # }
+#if defined(__APPLE__)
+__Py_trampoline_func_start:
+#else
 _Py_trampoline_func_start:
+#endif
 #ifdef __x86_64__
 #if defined(__CET__) && (__CET__ & 1)
    endbr64
@ -35,9 +43,14 @@ _Py_trampoline_func_start:
    addi    sp,sp,16
    jr      ra
 #endif
+#if defined(__APPLE__)
+    .globl	__Py_trampoline_func_end
+__Py_trampoline_func_end:
+#else
    .globl	_Py_trampoline_func_end
 _Py_trampoline_func_end:
    .section        .note.GNU-stack,"",@progbits
+#endif
 # Note for indicating the assembly code supports CET
 #if defined(__x86_64__) && defined(__CET__) && (__CET__ & 1)
    .section    .note.gnu.property,"a"
--- a/Python/perf_jit_trampoline.c
+++ b/Python/perf_jit_trampoline.c
@ -66,7 +66,9 @@
 #ifdef PY_HAVE_PERF_TRAMPOLINE

 /* Standard library includes for perf jitdump implementation */
-#include <elf.h>                  // ELF architecture constants
+#if defined(__linux__)
+#  include <elf.h>                // ELF architecture constants
+#endif
 #include <fcntl.h>                // File control operations
 #include <stdio.h>                // Standard I/O operations
 #include <stdlib.h>               // Standard library functions
@ -74,7 +76,9 @@
 #include <sys/types.h>            // System data types
 #include <unistd.h>               // System calls (sysconf, getpid)
 #include <sys/time.h>             // Time functions (gettimeofday)
-#include <sys/syscall.h>          // System call interface
+#if defined(__linux__)
+#  include <sys/syscall.h>        // System call interface
+#endif

 // =============================================================================
 //                           CONSTANTS AND CONFIGURATION
@ -101,6 +105,22 @@
 * based on the actual unwind information requirements.
 */

+
+/* These constants are defined inside <elf.h>, which we can't use outside of Linux. */
+#if !defined(__linux__)
+#  if defined(__i386__) || defined(_M_IX86)
+#    define EM_386      3
+#  elif defined(__arm__) || defined(_M_ARM)
+#    define EM_ARM      40
+#  elif defined(__x86_64__) || defined(_M_X64)
+#    define EM_X86_64   62
+#  elif defined(__aarch64__)
+#    define EM_AARCH64  183
+#  elif defined(__riscv)
+#    define EM_RISCV    243
+#  endif
+#endif
+
 /* Convenient access to the global trampoline API state */
 #define trampoline_api _PyRuntime.ceval.perf.trampoline_api

@ -194,7 +214,7 @@ struct BaseEvent {
 typedef struct {
    struct BaseEvent base;   // Common event header
    uint32_t process_id;     // Process ID where code was generated
-    uint32_t thread_id;      // Thread ID where code was generated
+    uint64_t thread_id;      // Thread ID where code was generated
    uint64_t vma;            // Virtual memory address where code is loaded
    uint64_t code_address;   // Address of the actual machine code
    uint64_t code_size;      // Size of the machine code in bytes
@ -1035,6 +1055,10 @@ static void* perf_map_jit_init(void) {
        return NULL;  // Failed to get page size
    }

+#if defined(__APPLE__)
+    // On macOS, samply uses a preload to find jitdumps and this mmap can be slow.
+    perf_jit_map_state.mapped_buffer = NULL;
+#else
    /*
     * Map the first page of the jitdump file
     *
@ -1057,6 +1081,7 @@ static void* perf_map_jit_init(void) {
        close(fd);
        return NULL;  // Memory mapping failed
    }
+#endif

    perf_jit_map_state.mapped_size = page_size;

@ -1263,7 +1288,11 @@ static void perf_map_jit_write_entry(void *state, const void *code_addr,
    ev.base.size = sizeof(ev) + (name_length+1) + size;
    ev.base.time_stamp = get_current_monotonic_ticks();
    ev.process_id = getpid();
+#if defined(__APPLE__)
+    pthread_threadid_np(NULL, &ev.thread_id);
+#else
    ev.thread_id = syscall(SYS_gettid);  // Get thread ID via system call
+#endif
    ev.vma = base;                       // Virtual memory address
    ev.code_address = base;              // Same as VMA for our use case
    ev.code_size = size;
--- a/2
+++ b/2
@ -13816,6 +13816,8 @@ case $PLATFORM_TRIPLET in #(
    perf_trampoline=yes ;; #(
  aarch64-linux-gnu) :
    perf_trampoline=yes ;; #(
+  darwin) :
+    perf_trampoline=yes ;; #(
  *) :
    perf_trampoline=no
 ;;
--- a/configure.ac
+++ b/configure.ac
@ -3692,12 +3692,13 @@ case "$ac_sys_system" in
 esac
 AC_MSG_RESULT([$SHLIBS])

-dnl perf trampoline is Linux specific and requires an arch-specific
+dnl perf trampoline is Linux and macOS specific and requires an arch-specific
 dnl trampoline in assembly.
 AC_MSG_CHECKING([perf trampoline])
 AS_CASE([$PLATFORM_TRIPLET],
  [x86_64-linux-gnu], [perf_trampoline=yes],
  [aarch64-linux-gnu], [perf_trampoline=yes],
+  [darwin], [perf_trampoline=yes],
  [perf_trampoline=no]
 )
 AC_MSG_RESULT([$perf_trampoline])