gh-115999: Implement thread-local bytecode and enable specialization for BINARY_OP (#123926)

Each thread specializes a thread-local copy of the bytecode, created on the first RESUME, in free-threaded builds. All copies of the bytecode for a code object are stored in the co_tlbc array on the code object. Each thread reserves a globally unique index identifying its copy of the bytecode in all co_tlbc arrays at thread creation and releases the index at thread destruction. The first entry in every co_tlbc array always points to the "main" copy of the bytecode that is stored at the end of the code object. This ensures that no bytecode is copied for programs that do not use threads.

Thread-local bytecode can be disabled at runtime by providing either -X tlbc=0 or PYTHON_TLBC=0. Disabling thread-local bytecode also disables specialization.

Concurrent modifications to the bytecode made by the specializing interpreter and instrumentation use atomics, with specialization taking care not to overwrite an instruction that was instrumented concurrently.
This commit is contained in:
mpage 2024-11-04 11:13:32 -08:00 committed by GitHub
parent e5a4b402ae
commit 2e95c5ba3b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
44 changed files with 1510 additions and 255 deletions

View file

@ -11,6 +11,7 @@ extern "C" {
#include "pycore_stackref.h" // _PyStackRef
#include "pycore_lock.h" // PyMutex
#include "pycore_backoff.h" // _Py_BackoffCounter
#include "pycore_tstate.h" // _PyThreadStateImpl
/* Each instruction in a code object is a fixed-width value,
@ -313,11 +314,17 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);
/** API for executors */
extern void _PyCode_Clear_Executors(PyCodeObject *code);
#ifdef Py_GIL_DISABLED
// ENABLE_SPECIALIZATION is defined as 0 when Py_GIL_DISABLED is set.
// gh-115999 tracks progress on lifting that restriction.
#define ENABLE_SPECIALIZATION 0
// Use this to enable specialization families once they are thread-safe. All
// uses will be replaced with ENABLE_SPECIALIZATION once all families are
// thread-safe.
#define ENABLE_SPECIALIZATION_FT 1
#else
// Without Py_GIL_DISABLED both macros evaluate to 1: ENABLE_SPECIALIZATION_FT
// is simply an alias for ENABLE_SPECIALIZATION.
#define ENABLE_SPECIALIZATION 1
#define ENABLE_SPECIALIZATION_FT ENABLE_SPECIALIZATION
#endif
/* Specialization functions */
@ -600,6 +607,40 @@ struct _PyCode8 _PyCode_DEF(8);
PyAPI_DATA(const struct _PyCode8) _Py_InitCleanup;
#ifdef Py_GIL_DISABLED
// Look up the bytecode copy that belongs to the thread described by `tstate`
// in the co_tlbc array of `co`, without creating anything.
//
// `tstate` is treated as a _PyThreadStateImpl and its tlbc_index selects the
// slot to read. Returns the pointer stored in that slot, or NULL when the
// index lies at or past the array's size or the slot itself holds NULL.
static inline _Py_CODEUNIT *
_PyCode_GetTLBCFast(PyThreadState *tstate, PyCodeObject *co)
{
    // Acquire-load the array pointer before reading its size and entries.
    _PyCodeArray *tlbc = _Py_atomic_load_ptr_acquire(&co->co_tlbc);
    int32_t slot = ((_PyThreadStateImpl*) tstate)->tlbc_index;
    if (slot >= tlbc->size) {
        // The array has no slot for this thread's index.
        return NULL;
    }
    // An empty slot holds NULL, which is passed through unchanged by the cast.
    return (_Py_CODEUNIT *) tlbc->entries[slot];
}
// Return a pointer to the thread-local bytecode for the current thread,
// creating it if necessary.
// NOTE(review): how failure to create the copy is reported is not visible
// from this declaration -- confirm against the definition.
extern _Py_CODEUNIT *_PyCode_GetTLBC(PyCodeObject *co);
// Reserve an index for the current thread into the thread-local bytecode
// arrays of `interp`.
//
// Returns the reserved index or -1 on error.
extern int32_t _Py_ReserveTLBCIndex(PyInterpreterState *interp);
// Release the index into thread-local bytecode arrays held by the thread
// described by `tstate`.
extern void _Py_ClearTLBCIndex(_PyThreadStateImpl *tstate);
// Free all TLBC (thread-local bytecode) copies not associated with live
// threads.
//
// Returns 0 on success or -1 on error.
extern int _Py_ClearUnusedTLBC(PyInterpreterState *interp);
#endif
#ifdef __cplusplus
}
#endif