gh-115999: Implement thread-local bytecode and enable specialization for BINARY_OP (#123926)

Each thread specializes a thread-local copy of the bytecode, created on the first RESUME, in free-threaded builds. All copies of the bytecode for a code object are stored in the co_tlbc array on the code object. Threads reserve a globally unique index identifying their copy of the bytecode in all co_tlbc arrays at thread creation and release the index at thread destruction. The first entry in every co_tlbc array always points to the "main" copy of the bytecode that is stored at the end of the code object. This ensures that no bytecode is copied for programs that do not use threads.

Thread-local bytecode can be disabled at runtime by providing either -X tlbc=0 or PYTHON_TLBC=0. Disabling thread-local bytecode also disables specialization.

Concurrent modifications to the bytecode made by the specializing interpreter and instrumentation use atomics, with specialization taking care not to overwrite an instruction that was instrumented concurrently.
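To make the lookup idea concrete, here is a minimal, self-contained sketch of the scheme described above: slot 0 of a per-code array aliases the shared "main" bytecode, and a thread holding a nonzero index lazily receives its own copy on first use. This is not the CPython implementation (it omits the index pool, atomics, and critical sections entirely), and names such as code_array and get_thread_bytecode are hypothetical.

/* Simplified sketch of per-thread bytecode lookup; see the real code below. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct {
    size_t size;                 /* number of slots in entries[] */
    unsigned char *entries[8];   /* entries[0] aliases the shared main bytecode */
} code_array;

/* Return the calling thread's bytecode, copying the main bytecode on first use. */
static unsigned char *
get_thread_bytecode(code_array *arr, size_t thread_index, size_t nbytes)
{
    if (thread_index == 0 || thread_index >= arr->size) {
        return arr->entries[0];          /* share the main copy */
    }
    if (arr->entries[thread_index] == NULL) {
        unsigned char *copy = malloc(nbytes);
        if (copy == NULL) {
            return arr->entries[0];      /* degrade to the shared copy on OOM */
        }
        memcpy(copy, arr->entries[0], nbytes);
        arr->entries[thread_index] = copy;
    }
    return arr->entries[thread_index];
}

int main(void)
{
    unsigned char main_bc[4] = {1, 2, 3, 4};
    code_array arr = { .size = 8, .entries = { main_bc } };

    unsigned char *bc = get_thread_bytecode(&arr, 1, sizeof(main_bc));
    printf("thread 1 uses %s bytecode\n", bc == main_bc ? "the shared" : "its own");

    free(arr.entries[1]);
    return 0;
}

Keeping slot 0 as the shared copy mirrors the property stated above: single-threaded programs never pay for an extra copy of the bytecode.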
parent e5a4b402ae
commit 2e95c5ba3b

44 changed files with 1510 additions and 255 deletions
Objects/codeobject.c
@@ -6,17 +6,22 @@
 #include "pycore_code.h"          // _PyCodeConstructor
 #include "pycore_frame.h"         // FRAME_SPECIALS_SIZE
 #include "pycore_hashtable.h"     // _Py_hashtable_t
+#include "pycore_index_pool.h"    // _PyIndexPool
 #include "pycore_initconfig.h"    // _PyStatus_OK()
 #include "pycore_interp.h"        // PyInterpreterState.co_extra_freefuncs
 #include "pycore_object.h"        // _PyObject_SetDeferredRefcount
+#include "pycore_object_stack.h"
 #include "pycore_opcode_metadata.h" // _PyOpcode_Deopt, _PyOpcode_Caches
 #include "pycore_opcode_utils.h"  // RESUME_AT_FUNC_START
+#include "pycore_pymem.h"         // _PyMem_FreeDelayed
 #include "pycore_pystate.h"       // _PyInterpreterState_GET()
 #include "pycore_setobject.h"     // _PySet_NextEntry()
 #include "pycore_tuple.h"         // _PyTuple_ITEMS()
 #include "pycore_uniqueid.h"      // _PyObject_AssignUniqueId()
 #include "clinic/codeobject.c.h"

+#define INITIAL_SPECIALIZED_CODE_SIZE 16
+
 static const char *
 code_event_name(PyCodeEvent event) {
     switch (event) {
@@ -440,9 +445,15 @@ _PyCode_Validate(struct _PyCodeConstructor *con)
     return 0;
 }

-extern void _PyCode_Quicken(PyCodeObject *code);
+extern void
+_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts,
+                int enable_counters);

-static void
+#ifdef Py_GIL_DISABLED
+static _PyCodeArray * _PyCodeArray_New(Py_ssize_t size);
+#endif
+
+static int
 init_code(PyCodeObject *co, struct _PyCodeConstructor *con)
 {
     int nlocalsplus = (int)PyTuple_GET_SIZE(con->localsplusnames);
@@ -505,14 +516,27 @@ init_code(PyCodeObject *co, struct _PyCodeConstructor *con)

     memcpy(_PyCode_CODE(co), PyBytes_AS_STRING(con->code),
            PyBytes_GET_SIZE(con->code));
+#ifdef Py_GIL_DISABLED
+    co->co_tlbc = _PyCodeArray_New(INITIAL_SPECIALIZED_CODE_SIZE);
+    if (co->co_tlbc == NULL) {
+        return -1;
+    }
+    co->co_tlbc->entries[0] = co->co_code_adaptive;
+#endif
     int entry_point = 0;
     while (entry_point < Py_SIZE(co) &&
            _PyCode_CODE(co)[entry_point].op.code != RESUME) {
         entry_point++;
     }
     co->_co_firsttraceable = entry_point;
-    _PyCode_Quicken(co);
+#ifdef Py_GIL_DISABLED
+    _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts,
+                    interp->config.tlbc_enabled);
+#else
+    _PyCode_Quicken(_PyCode_CODE(co), Py_SIZE(co), co->co_consts, 1);
+#endif
     notify_code_watchers(PY_CODE_EVENT_CREATE, co);
+    return 0;
 }

 static int
@@ -667,7 +691,12 @@ _PyCode_New(struct _PyCodeConstructor *con)
         PyErr_NoMemory();
         return NULL;
     }
-    init_code(co, con);
+
+    if (init_code(co, con) < 0) {
+        Py_DECREF(co);
+        return NULL;
+    }
+
 #ifdef Py_GIL_DISABLED
     co->_co_unique_id = _PyObject_AssignUniqueId((PyObject *)co);
     _PyObject_GC_TRACK(co);
@@ -1871,6 +1900,17 @@ code_dealloc(PyCodeObject *co)
         PyObject_ClearWeakRefs((PyObject*)co);
     }
     free_monitoring_data(co->_co_monitoring);
+#ifdef Py_GIL_DISABLED
+    // The first element always points to the mutable bytecode at the end of
+    // the code object, which will be freed when the code object is freed.
+    for (Py_ssize_t i = 1; i < co->co_tlbc->size; i++) {
+        char *entry = co->co_tlbc->entries[i];
+        if (entry != NULL) {
+            PyMem_Free(entry);
+        }
+    }
+    PyMem_Free(co->co_tlbc);
+#endif
     PyObject_Free(co);
 }

@@ -2646,5 +2686,270 @@ _PyCode_Fini(PyInterpreterState *interp)
         _Py_hashtable_destroy(state->constants);
         state->constants = NULL;
     }
+    _PyIndexPool_Fini(&interp->tlbc_indices);
 #endif
 }
+
+#ifdef Py_GIL_DISABLED
+
+// Thread-local bytecode (TLBC)
+//
+// Each thread specializes a thread-local copy of the bytecode, created on the
+// first RESUME, in free-threaded builds. All copies of the bytecode for a code
+// object are stored in the `co_tlbc` array. Threads reserve a globally unique
+// index identifying its copy of the bytecode in all `co_tlbc` arrays at thread
+// creation and release the index at thread destruction. The first entry in
+// every `co_tlbc` array always points to the "main" copy of the bytecode that
+// is stored at the end of the code object. This ensures that no bytecode is
+// copied for programs that do not use threads.
+//
+// Thread-local bytecode can be disabled at runtime by providing either `-X
+// tlbc=0` or `PYTHON_TLBC=0`. Disabling thread-local bytecode also disables
+// specialization. All threads share the main copy of the bytecode when
+// thread-local bytecode is disabled.
+//
+// Concurrent modifications to the bytecode made by the specializing
+// interpreter and instrumentation use atomics, with specialization taking care
+// not to overwrite an instruction that was instrumented concurrently.
+
+int32_t
+_Py_ReserveTLBCIndex(PyInterpreterState *interp)
+{
+    if (interp->config.tlbc_enabled) {
+        return _PyIndexPool_AllocIndex(&interp->tlbc_indices);
+    }
+    // All threads share the main copy of the bytecode when TLBC is disabled
+    return 0;
+}
+
+void
+_Py_ClearTLBCIndex(_PyThreadStateImpl *tstate)
+{
+    PyInterpreterState *interp = ((PyThreadState *)tstate)->interp;
+    if (interp->config.tlbc_enabled) {
+        _PyIndexPool_FreeIndex(&interp->tlbc_indices, tstate->tlbc_index);
+    }
+}
+
+static _PyCodeArray *
+_PyCodeArray_New(Py_ssize_t size)
+{
+    _PyCodeArray *arr = PyMem_Calloc(
+        1, offsetof(_PyCodeArray, entries) + sizeof(void *) * size);
+    if (arr == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    arr->size = size;
+    return arr;
+}
+
+static void
+copy_code(_Py_CODEUNIT *dst, PyCodeObject *co)
+{
+    int code_len = (int) Py_SIZE(co);
+    for (int i = 0; i < code_len; i += _PyInstruction_GetLength(co, i)) {
+        dst[i] = _Py_GetBaseCodeUnit(co, i);
+    }
+    _PyCode_Quicken(dst, code_len, co->co_consts, 1);
+}
+
+static Py_ssize_t
+get_pow2_greater(Py_ssize_t initial, Py_ssize_t limit)
+{
+    // initial must be a power of two
+    assert(!(initial & (initial - 1)));
+    Py_ssize_t res = initial;
+    while (res && res < limit) {
+        res <<= 1;
+    }
+    return res;
+}
+
+static _Py_CODEUNIT *
+create_tlbc_lock_held(PyCodeObject *co, Py_ssize_t idx)
+{
+    _PyCodeArray *tlbc = co->co_tlbc;
+    if (idx >= tlbc->size) {
+        Py_ssize_t new_size = get_pow2_greater(tlbc->size, idx + 1);
+        if (!new_size) {
+            PyErr_NoMemory();
+            return NULL;
+        }
+        _PyCodeArray *new_tlbc = _PyCodeArray_New(new_size);
+        if (new_tlbc == NULL) {
+            return NULL;
+        }
+        memcpy(new_tlbc->entries, tlbc->entries, tlbc->size * sizeof(void *));
+        _Py_atomic_store_ptr_release(&co->co_tlbc, new_tlbc);
+        _PyMem_FreeDelayed(tlbc);
+        tlbc = new_tlbc;
+    }
+    char *bc = PyMem_Calloc(1, _PyCode_NBYTES(co));
+    if (bc == NULL) {
+        PyErr_NoMemory();
+        return NULL;
+    }
+    copy_code((_Py_CODEUNIT *) bc, co);
+    assert(tlbc->entries[idx] == NULL);
+    tlbc->entries[idx] = bc;
+    return (_Py_CODEUNIT *) bc;
+}
+
+static _Py_CODEUNIT *
+get_tlbc_lock_held(PyCodeObject *co)
+{
+    _PyCodeArray *tlbc = co->co_tlbc;
+    _PyThreadStateImpl *tstate = (_PyThreadStateImpl *)PyThreadState_GET();
+    int32_t idx = tstate->tlbc_index;
+    if (idx < tlbc->size && tlbc->entries[idx] != NULL) {
+        return (_Py_CODEUNIT *)tlbc->entries[idx];
+    }
+    return create_tlbc_lock_held(co, idx);
+}
+
+_Py_CODEUNIT *
+_PyCode_GetTLBC(PyCodeObject *co)
+{
+    _Py_CODEUNIT *result;
+    Py_BEGIN_CRITICAL_SECTION(co);
+    result = get_tlbc_lock_held(co);
+    Py_END_CRITICAL_SECTION();
+    return result;
+}
+
+// My kingdom for a bitset
+struct flag_set {
+    uint8_t *flags;
+    Py_ssize_t size;
+};
+
+static inline int
+flag_is_set(struct flag_set *flags, Py_ssize_t idx)
+{
+    assert(idx >= 0);
+    return (idx < flags->size) && flags->flags[idx];
+}
+
+// Set the flag for each tlbc index in use
+static int
+get_indices_in_use(PyInterpreterState *interp, struct flag_set *in_use)
+{
+    assert(interp->stoptheworld.world_stopped);
+    assert(in_use->flags == NULL);
+    int32_t max_index = 0;
+    for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) {
+        int32_t idx = ((_PyThreadStateImpl *) p)->tlbc_index;
+        if (idx > max_index) {
+            max_index = idx;
+        }
+    }
+    in_use->size = (size_t) max_index + 1;
+    in_use->flags = PyMem_Calloc(in_use->size, sizeof(*in_use->flags));
+    if (in_use->flags == NULL) {
+        return -1;
+    }
+    for (PyThreadState *p = interp->threads.head; p != NULL; p = p->next) {
+        in_use->flags[((_PyThreadStateImpl *) p)->tlbc_index] = 1;
+    }
+    return 0;
+}
+
+struct get_code_args {
+    _PyObjectStack code_objs;
+    struct flag_set indices_in_use;
+    int err;
+};
+
+static void
+clear_get_code_args(struct get_code_args *args)
+{
+    if (args->indices_in_use.flags != NULL) {
+        PyMem_Free(args->indices_in_use.flags);
+        args->indices_in_use.flags = NULL;
+    }
+    _PyObjectStack_Clear(&args->code_objs);
+}
+
+static inline int
+is_bytecode_unused(_PyCodeArray *tlbc, Py_ssize_t idx,
+                   struct flag_set *indices_in_use)
+{
+    assert(idx > 0 && idx < tlbc->size);
+    return tlbc->entries[idx] != NULL && !flag_is_set(indices_in_use, idx);
+}
+
+static int
+get_code_with_unused_tlbc(PyObject *obj, struct get_code_args *args)
+{
+    if (!PyCode_Check(obj)) {
+        return 1;
+    }
+    PyCodeObject *co = (PyCodeObject *) obj;
+    _PyCodeArray *tlbc = co->co_tlbc;
+    // The first index always points at the main copy of the bytecode embedded
+    // in the code object.
+    for (Py_ssize_t i = 1; i < tlbc->size; i++) {
+        if (is_bytecode_unused(tlbc, i, &args->indices_in_use)) {
+            if (_PyObjectStack_Push(&args->code_objs, obj) < 0) {
+                args->err = -1;
+                return 0;
+            }
+            return 1;
+        }
+    }
+    return 1;
+}
+
+static void
+free_unused_bytecode(PyCodeObject *co, struct flag_set *indices_in_use)
+{
+    _PyCodeArray *tlbc = co->co_tlbc;
+    // The first index always points at the main copy of the bytecode embedded
+    // in the code object.
+    for (Py_ssize_t i = 1; i < tlbc->size; i++) {
+        if (is_bytecode_unused(tlbc, i, indices_in_use)) {
+            PyMem_Free(tlbc->entries[i]);
+            tlbc->entries[i] = NULL;
+        }
+    }
+}
+
+int
+_Py_ClearUnusedTLBC(PyInterpreterState *interp)
+{
+    struct get_code_args args = {
+        .code_objs = {NULL},
+        .indices_in_use = {NULL, 0},
+        .err = 0,
+    };
+    _PyEval_StopTheWorld(interp);
+    // Collect in-use tlbc indices
+    if (get_indices_in_use(interp, &args.indices_in_use) < 0) {
+        goto err;
+    }
+    // Collect code objects that have bytecode not in use by any thread
+    _PyGC_VisitObjectsWorldStopped(
+        interp, (gcvisitobjects_t)get_code_with_unused_tlbc, &args);
+    if (args.err < 0) {
+        goto err;
+    }
+    // Free unused bytecode. This must happen outside of gc_visit_heaps; it is
+    // unsafe to allocate or free any mimalloc managed memory when it's
+    // running.
+    PyObject *obj;
+    while ((obj = _PyObjectStack_Pop(&args.code_objs)) != NULL) {
+        free_unused_bytecode((PyCodeObject*) obj, &args.indices_in_use);
+    }
+    _PyEval_StartTheWorld(interp);
+    clear_get_code_args(&args);
+    return 0;
+
+err:
+    _PyEval_StartTheWorld(interp);
+    clear_get_code_args(&args);
+    PyErr_NoMemory();
+    return -1;
+}
+
+#endif