[3.14] gh-136541: Fix several problems of perf trampolines in x86_64 and aarch64 (GH-136500) (#136544)

gh-136541: Fix several problems of perf trampolines in x86_64 and aarch64 (GH-136500) This commit fixes the following problems: * The x86_64 trampolines are not preserving frame pointers * The hardcoded offsets to the code segment from the FDE only worked properly for x86_64 * The CIE data was not following conventions of aarch64 * The eh_frame for aarch64 was not fully correct (cherry picked from commit 236f733d8f) Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
2025-07-24 11:44:31 +00:00 · 2025-07-11 16:06:19 +02:00 · 2025-07-11 16:06:19 +02:00 · 5535482d2a
commit 5535482d2a
parent a464c4e2e8
4 changed files with 147 additions and 41 deletions
--- a/Python/perf_trampoline.c
+++ b/Python/perf_trampoline.c
@ -162,6 +162,8 @@ static void invalidate_icache(char* begin, char*end) {
 }
 #endif

+#define CODE_ALIGNMENT 32
+
 /* The function pointer is passed as last argument. The other three arguments
 * are passed in the same order as the function requires. This results in
 * shorter, more efficient ASM code for trampoline.
@ -291,7 +293,9 @@ new_code_arena(void)
    void *start = &_Py_trampoline_func_start;
    void *end = &_Py_trampoline_func_end;
    size_t code_size = end - start;
-    size_t chunk_size = round_up(code_size + trampoline_api.code_padding, 16);
+    size_t unaligned_size = code_size + trampoline_api.code_padding;
+    size_t chunk_size = round_up(unaligned_size, CODE_ALIGNMENT);
+    assert(chunk_size % CODE_ALIGNMENT == 0);
    // TODO: Check the effect of alignment of the code chunks. Initial investigation
    // showed that this has no effect on performance in x86-64 or aarch64 and the current
    // version has the advantage that the unwinder in GDB can unwind across JIT-ed code.
@ -356,7 +360,9 @@ static inline py_trampoline
 code_arena_new_code(code_arena_t *code_arena)
 {
    py_trampoline trampoline = (py_trampoline)code_arena->current_addr;
-    size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding, 16);
+    size_t total_code_size = round_up(code_arena->code_size + trampoline_api.code_padding,
+                                  CODE_ALIGNMENT);
+    assert(total_code_size % CODE_ALIGNMENT == 0);
    code_arena->size_left -= total_code_size;
    code_arena->current_addr += total_code_size;
    return trampoline;
@ -489,9 +495,6 @@ _PyPerfTrampoline_Init(int activate)
    }
    else {
        _PyInterpreterState_SetEvalFrameFunc(tstate->interp, py_trampoline_evaluator);
-        if (new_code_arena() < 0) {
-            return -1;
-        }
        extra_code_index = _PyEval_RequestCodeExtraIndex(NULL);
        if (extra_code_index == -1) {
            return -1;
@ -499,6 +502,9 @@ _PyPerfTrampoline_Init(int activate)
        if (trampoline_api.state == NULL && trampoline_api.init_state != NULL) {
            trampoline_api.state = trampoline_api.init_state();
        }
+        if (new_code_arena() < 0) {
+            return -1;
+        }
        perf_status = PERF_STATUS_OK;
    }
 #endif