mirror of
https://github.com/python/cpython.git
synced 2025-09-14 20:56:06 +00:00
GH-119726: Deduplicate AArch64 trampolines within a trace (GH-123872)
parent 7a178b7605
commit b85923a0fe
5 changed files with 146 additions and 58 deletions
Python/jit.c | 92
@@ -3,6 +3,7 @@
 #include "Python.h"
 
 #include "pycore_abstract.h"
+#include "pycore_bitutils.h"
 #include "pycore_call.h"
 #include "pycore_ceval.h"
 #include "pycore_critical_section.h"
@@ -113,6 +114,21 @@ mark_executable(unsigned char *memory, size_t size)
 
 // JIT compiler stuff: /////////////////////////////////////////////////////////
 
+#define SYMBOL_MASK_WORDS 4
+
+typedef uint32_t symbol_mask[SYMBOL_MASK_WORDS];
+
+typedef struct {
+    unsigned char *mem;
+    symbol_mask mask;
+    size_t size;
+} trampoline_state;
+
+typedef struct {
+    trampoline_state trampolines;
+    uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH];
+} jit_state;
+
 // Warning! AArch64 requires you to get your hands dirty. These are your gloves:
 
 // value[value_start : value_start + len]
@@ -390,66 +406,126 @@ patch_x86_64_32rx(unsigned char *location, uint64_t value)
     patch_32r(location, value);
 }
 
+void patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state);
+
 #include "jit_stencils.h"
 
+#if defined(__aarch64__) || defined(_M_ARM64)
+    #define TRAMPOLINE_SIZE 16
+#else
+    #define TRAMPOLINE_SIZE 0
+#endif
+
+// Generate and patch AArch64 trampolines. The symbols to jump to are stored
+// in the jit_stencils.h in the symbols_map.
+void
+patch_aarch64_trampoline(unsigned char *location, int ordinal, jit_state *state)
+{
+    // Masking is done modulo 32 as the mask is stored as an array of uint32_t
+    const uint32_t symbol_mask = 1 << (ordinal % 32);
+    const uint32_t trampoline_mask = state->trampolines.mask[ordinal / 32];
+    assert(symbol_mask & trampoline_mask);
+
+    // Count the number of set bits in the trampoline mask lower than ordinal,
+    // this gives the index into the array of trampolines.
+    int index = _Py_popcount32(trampoline_mask & (symbol_mask - 1));
+    for (int i = 0; i < ordinal / 32; i++) {
+        index += _Py_popcount32(state->trampolines.mask[i]);
+    }
+
+    uint32_t *p = (uint32_t*)(state->trampolines.mem + index * TRAMPOLINE_SIZE);
+    assert((size_t)(index + 1) * TRAMPOLINE_SIZE <= state->trampolines.size);
+
+    uint64_t value = (uintptr_t)symbols_map[ordinal];
+
+    /* Generate the trampoline
+       0: 58000048      ldr     x8, 8
+       4: d61f0100      br      x8
+       8: 00000000      // The next two words contain the 64-bit address to jump to.
+       c: 00000000
+    */
+    p[0] = 0x58000048;
+    p[1] = 0xD61F0100;
+    p[2] = value & 0xffffffff;
+    p[3] = value >> 32;
+
+    patch_aarch64_26r(location, (uintptr_t)p);
+}
+
+static void
+combine_symbol_mask(const symbol_mask src, symbol_mask dest)
+{
+    // Calculate the union of the trampolines required by each StencilGroup
+    for (size_t i = 0; i < SYMBOL_MASK_WORDS; i++) {
+        dest[i] |= src[i];
+    }
+}
+
 // Compiles executor in-place. Don't forget to call _PyJIT_Free later!
 int
 _PyJIT_Compile(_PyExecutorObject *executor, const _PyUOpInstruction trace[], size_t length)
 {
     const StencilGroup *group;
     // Loop once to find the total compiled size:
-    uintptr_t instruction_starts[UOP_MAX_TRACE_LENGTH];
     size_t code_size = 0;
     size_t data_size = 0;
+    jit_state state = {};
     group = &trampoline;
     code_size += group->code_size;
     data_size += group->data_size;
     for (size_t i = 0; i < length; i++) {
         const _PyUOpInstruction *instruction = &trace[i];
         group = &stencil_groups[instruction->opcode];
-        instruction_starts[i] = code_size;
+        state.instruction_starts[i] = code_size;
         code_size += group->code_size;
         data_size += group->data_size;
+        combine_symbol_mask(group->trampoline_mask, state.trampolines.mask);
     }
     group = &stencil_groups[_FATAL_ERROR];
     code_size += group->code_size;
     data_size += group->data_size;
+    combine_symbol_mask(group->trampoline_mask, state.trampolines.mask);
+    // Calculate the size of the trampolines required by the whole trace
+    for (size_t i = 0; i < Py_ARRAY_LENGTH(state.trampolines.mask); i++) {
+        state.trampolines.size += _Py_popcount32(state.trampolines.mask[i]) * TRAMPOLINE_SIZE;
+    }
     // Round up to the nearest page:
     size_t page_size = get_page_size();
     assert((page_size & (page_size - 1)) == 0);
-    size_t padding = page_size - ((code_size + data_size) & (page_size - 1));
-    size_t total_size = code_size + data_size + padding;
+    size_t padding = page_size - ((code_size + data_size + state.trampolines.size) & (page_size - 1));
+    size_t total_size = code_size + data_size + state.trampolines.size + padding;
     unsigned char *memory = jit_alloc(total_size);
     if (memory == NULL) {
         return -1;
     }
     // Update the offsets of each instruction:
     for (size_t i = 0; i < length; i++) {
-        instruction_starts[i] += (uintptr_t)memory;
+        state.instruction_starts[i] += (uintptr_t)memory;
     }
     // Loop again to emit the code:
     unsigned char *code = memory;
     unsigned char *data = memory + code_size;
+    state.trampolines.mem = memory + code_size + data_size;
     // Compile the trampoline, which handles converting between the native
     // calling convention and the calling convention used by jitted code
     // (which may be different for efficiency reasons). On platforms where
     // we don't change calling conventions, the trampoline is empty and
     // nothing is emitted here:
     group = &trampoline;
-    group->emit(code, data, executor, NULL, instruction_starts);
+    group->emit(code, data, executor, NULL, &state);
     code += group->code_size;
     data += group->data_size;
     assert(trace[0].opcode == _START_EXECUTOR);
     for (size_t i = 0; i < length; i++) {
         const _PyUOpInstruction *instruction = &trace[i];
         group = &stencil_groups[instruction->opcode];
-        group->emit(code, data, executor, instruction, instruction_starts);
+        group->emit(code, data, executor, instruction, &state);
         code += group->code_size;
         data += group->data_size;
     }
     // Protect against accidental buffer overrun into data:
     group = &stencil_groups[_FATAL_ERROR];
-    group->emit(code, data, executor, NULL, instruction_starts);
+    group->emit(code, data, executor, NULL, &state);
     code += group->code_size;
     data += group->data_size;
     assert(code == memory + code_size);
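
For reference, the deduplication bookkeeping in this diff reduces to a bitmask union plus popcount indexing: each StencilGroup marks the symbols it calls, the masks are OR-ed over the trace, and a symbol's trampoline slot is the number of lower set bits. The standalone C sketch below is illustrative only and is not part of the commit; the names MASK_WORDS, required, popcount32, and trampoline_index are made up to stand in for SYMBOL_MASK_WORDS, the state mask, _Py_popcount32, and the index computation.

// Illustrative sketch of the mask-and-popcount slot lookup (hypothetical names).
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MASK_WORDS 4                      // room for 128 symbol ordinals

static int popcount32(uint32_t x)         // stand-in for _Py_popcount32
{
    int n = 0;
    while (x) {
        x &= x - 1;
        n++;
    }
    return n;
}

// Index of `ordinal` among the set bits of `required`, in ascending order.
static int trampoline_index(const uint32_t required[MASK_WORDS], int ordinal)
{
    uint32_t bit = UINT32_C(1) << (ordinal % 32);
    assert(required[ordinal / 32] & bit);             // the symbol must be marked
    int index = popcount32(required[ordinal / 32] & (bit - 1));
    for (int i = 0; i < ordinal / 32; i++) {
        index += popcount32(required[i]);             // set bits in lower words
    }
    return index;
}

int main(void)
{
    uint32_t required[MASK_WORDS] = {0};
    int needed[] = {3, 7, 7, 40, 3};      // symbols used by a trace, with repeats
    for (size_t i = 0; i < sizeof(needed) / sizeof(needed[0]); i++) {
        required[needed[i] / 32] |= UINT32_C(1) << (needed[i] % 32);
    }
    int total = 0;
    for (int i = 0; i < MASK_WORDS; i++) {
        total += popcount32(required[i]); // 3 distinct symbols -> 3 trampolines
    }
    printf("trampolines needed: %d\n", total);
    printf("slot for ordinal 3:  %d\n", trampoline_index(required, 3));   // 0
    printf("slot for ordinal 7:  %d\n", trampoline_index(required, 7));   // 1
    printf("slot for ordinal 40: %d\n", trampoline_index(required, 40));  // 2
    return 0;
}

Because repeated uses of a symbol set the same bit, a trace that calls the same helper many times still pays for only one 16-byte trampoline, which is the point of the change.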
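The 16-byte trampoline layout quoted in the diff (ldr x8, 8; br x8; 64-bit literal) can also be sketched outside the JIT. The snippet below is illustrative only: write_trampoline and the target address are hypothetical, and a real trampoline must be written into executable memory and reached via a patched 26-bit branch, which this sketch does not do.

// Illustrative sketch: filling one 16-byte trampoline slot with the pattern
// used above. The literal following the two instructions holds the target.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void write_trampoline(unsigned char *slot, uint64_t target)
{
    uint32_t words[4] = {
        0x58000048u,                        // ldr x8, 8  (load the literal 8 bytes ahead)
        0xD61F0100u,                        // br  x8
        (uint32_t)(target & 0xffffffffu),   // low 32 bits of the target address
        (uint32_t)(target >> 32),           // high 32 bits of the target address
    };
    memcpy(slot, words, sizeof(words));
}

int main(void)
{
    unsigned char slot[16];
    write_trampoline(slot, 0x0000700012345678u);   // made-up target address
    for (int i = 0; i < 4; i++) {
        uint32_t w;
        memcpy(&w, slot + i * 4, sizeof(w));
        printf("%x: %08x\n", i * 4, w);
    }
    return 0;
}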