gh-115999: Implement thread-local bytecode and enable specialization for BINARY_OP (#123926)

In free-threaded builds, each thread specializes a thread-local copy of the bytecode, created on the first RESUME. All copies of the bytecode for a code object are stored in the co_tlbc array on the code object. At thread creation, each thread reserves a globally unique index that identifies its copy of the bytecode in every co_tlbc array; the index is released at thread destruction. The first entry in every co_tlbc array always points to the "main" copy of the bytecode, which is stored at the end of the code object. This ensures that no bytecode is copied for programs that do not use threads.
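Conceptually, the per-thread lookup is a two-level index: the thread's reserved index selects an entry in the code object's co_tlbc array. A minimal sketch of that lookup, assuming the _PyCodeArray layout (a size plus an entries array of raw pointers) visible in the diff below; the real entry point is _PyEval_GetExecutableCode, which also creates the copy on first use:

    /* Sketch only -- the real lookup is _PyEval_GetExecutableCode. */
    static _Py_CODEUNIT *
    lookup_tlbc(_PyThreadStateImpl *tstate, PyCodeObject *co)
    {
        int32_t idx = tstate->tlbc_index;   // reserved at thread creation
        _PyCodeArray *tlbc = co->co_tlbc;   // entry 0 is the "main" copy
        if (idx < tlbc->size && tlbc->entries[idx] != NULL) {
            return (_Py_CODEUNIT *)tlbc->entries[idx];
        }
        return NULL;  // caller must copy the bytecode and grow the array
    }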

Thread-local bytecode can be disabled at runtime by passing -X tlbc=0 on the command line (e.g. python -X tlbc=0 script.py) or by setting PYTHON_TLBC=0 in the environment. Disabling thread-local bytecode also disables specialization.

Concurrent modifications to the bytecode by the specializing interpreter and by instrumentation are performed atomically, with specialization taking care not to overwrite an instruction that was instrumented concurrently.
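The race between specialization and instrumentation is resolved with a compare-exchange on the opcode byte; the pattern appears below as the SET_OPCODE_OR_RETURN macro in Python/specialize.c and boils down to:

    /* Sketch: abandon specialization if instrumentation got there first. */
    uint8_t old_op = _Py_atomic_load_uint8_relaxed(&instr->op.code);
    if (old_op >= MIN_INSTRUMENTED_OPCODE ||
        !_Py_atomic_compare_exchange_uint8(&instr->op.code, &old_op, new_op)) {
        return;  // lost the race; leave the instrumented opcode in place
    }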
Author: mpage, 2024-11-04 11:13:32 -08:00, committed via GitHub
Commit 2e95c5ba3b (parent e5a4b402ae)
44 changed files with 1510 additions and 255 deletions


@ -168,11 +168,11 @@ dummy_func(
}
op(_QUICKEN_RESUME, (--)) {
#if ENABLE_SPECIALIZATION
#if ENABLE_SPECIALIZATION_FT
if (tstate->tracing == 0 && this_instr->op.code == RESUME) {
FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK);
}
#endif /* ENABLE_SPECIALIZATION */
#endif /* ENABLE_SPECIALIZATION_FT */
}
tier1 op(_MAYBE_INSTRUMENT, (--)) {
@ -190,7 +190,26 @@ dummy_func(
}
}
op(_LOAD_BYTECODE, (--)) {
#ifdef Py_GIL_DISABLED
if (frame->tlbc_index !=
((_PyThreadStateImpl *)tstate)->tlbc_index) {
_Py_CODEUNIT *bytecode =
_PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame));
ERROR_IF(bytecode == NULL, error);
int off = this_instr - _PyFrame_GetBytecode(frame);
frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
frame->instr_ptr = bytecode + off;
// Make sure this_instr gets reset correctly for any uops that
// follow
next_instr = frame->instr_ptr;
DISPATCH();
}
#endif
}
macro(RESUME) =
_LOAD_BYTECODE +
_MAYBE_INSTRUMENT +
_QUICKEN_RESUME +
_CHECK_PERIODIC_IF_NOT_YIELD_FROM;
@ -204,6 +223,10 @@ dummy_func(
uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version);
assert((version & _PY_EVAL_EVENTS_MASK) == 0);
DEOPT_IF(eval_breaker != version);
#ifdef Py_GIL_DISABLED
DEOPT_IF(frame->tlbc_index !=
((_PyThreadStateImpl *)tstate)->tlbc_index);
#endif
}
op(_MONITOR_RESUME, (--)) {
@ -217,6 +240,7 @@ dummy_func(
}
macro(INSTRUMENTED_RESUME) =
_LOAD_BYTECODE +
_MAYBE_INSTRUMENT +
_CHECK_PERIODIC_IF_NOT_YIELD_FROM +
_MONITOR_RESUME;
@ -682,8 +706,8 @@ dummy_func(
};
specializing op(_SPECIALIZE_BINARY_SUBSCR, (counter/1, container, sub -- container, sub)) {
assert(frame->stackpointer == NULL);
#if ENABLE_SPECIALIZATION
assert(frame->stackpointer == NULL);
if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
next_instr = this_instr;
_Py_Specialize_BinarySubscr(container, sub, next_instr);
@ -1236,7 +1260,7 @@ dummy_func(
if (oparg) {
PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]);
if (PyLong_Check(lasti)) {
frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti);
frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti);
assert(!_PyErr_Occurred(tstate));
}
else {
@ -2671,9 +2695,7 @@ dummy_func(
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
DEAD(cond);
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}
@ -2681,9 +2703,7 @@ dummy_func(
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
DEAD(cond);
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}
@ -3697,7 +3717,7 @@ dummy_func(
op(_CREATE_INIT_FRAME, (init[1], self[1], args[oparg] -- init_frame: _PyInterpreterFrame *)) {
_PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked(
tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame);
assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK);
assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK);
/* Push self onto stack of shim */
shim->localsplus[0] = PyStackRef_DUP(self[0]);
DEAD(init);
@ -4593,7 +4613,7 @@ dummy_func(
}
specializing op(_SPECIALIZE_BINARY_OP, (counter/1, lhs, rhs -- lhs, rhs)) {
#if ENABLE_SPECIALIZATION
#if ENABLE_SPECIALIZATION_FT
if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
next_instr = this_instr;
_Py_Specialize_BinaryOp(lhs, rhs, next_instr, oparg, LOCALS_ARRAY);
@ -4601,7 +4621,7 @@ dummy_func(
}
OPCODE_DEFERRED_INC(BINARY_OP);
ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter);
#endif /* ENABLE_SPECIALIZATION */
#endif /* ENABLE_SPECIALIZATION_FT */
assert(NB_ADD <= oparg);
assert(oparg <= NB_INPLACE_XOR);
}
@ -4632,7 +4652,7 @@ dummy_func(
int original_opcode = 0;
if (tstate->tracing) {
PyCodeObject *code = _PyFrame_GetCode(frame);
original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode;
original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode;
next_instr = this_instr;
} else {
original_opcode = _Py_call_instrumentation_line(
@ -4687,9 +4707,7 @@ dummy_func(
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
int offset = flag * oparg;
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
}
@ -4698,9 +4716,7 @@ dummy_func(
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
int offset = flag * oparg;
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
}
@ -4715,9 +4731,7 @@ dummy_func(
PyStackRef_CLOSE(value_stackref);
offset = 0;
}
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
}
@ -4815,7 +4829,7 @@ dummy_func(
tier2 op(_EXIT_TRACE, (exit_p/4 --)) {
_PyExitData *exit = (_PyExitData *)exit_p;
PyCodeObject *code = _PyFrame_GetCode(frame);
_Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target;
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
#if defined(Py_DEBUG) && !defined(_Py_JIT)
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
if (lltrace >= 2) {
@ -4823,7 +4837,7 @@ dummy_func(
_PyUOpPrint(&next_uop[-1]);
printf(", exit %u, temp %d, target %d -> %s]\n",
exit - current_executor->exits, exit->temperature.value_and_backoff,
(int)(target - _PyCode_CODE(code)),
(int)(target - _PyFrame_GetBytecode(frame)),
_PyOpcode_OpName[target->op.code]);
}
#endif
@ -4933,7 +4947,7 @@ dummy_func(
_PyUOpPrint(&next_uop[-1]);
printf(", exit %u, temp %d, target %d -> %s]\n",
exit - current_executor->exits, exit->temperature.value_and_backoff,
(int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))),
(int)(target - _PyFrame_GetBytecode(frame)),
_PyOpcode_OpName[target->op.code]);
}
#endif
@ -4995,7 +5009,7 @@ dummy_func(
}
tier2 op(_ERROR_POP_N, (target/2, unused[oparg] --)) {
frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target;
frame->instr_ptr = _PyFrame_GetBytecode(frame) + target;
SYNC_SP();
GOTO_UNWIND();
}


@ -189,7 +189,7 @@ lltrace_instruction(_PyInterpreterFrame *frame,
dump_stack(frame, stack_pointer);
const char *opname = _PyOpcode_OpName[opcode];
assert(opname != NULL);
int offset = (int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame)));
int offset = (int)(next_instr - _PyFrame_GetBytecode(frame));
if (OPCODE_HAS_ARG((int)_PyOpcode_Deopt[opcode])) {
printf("%d: %s %d\n", offset * 2, opname, oparg);
}
@ -841,6 +841,19 @@ _PyEval_EvalFrameDefault(PyThreadState *tstate, _PyInterpreterFrame *frame, int
}
/* Because this avoids the RESUME,
* we need to update instrumentation */
#ifdef Py_GIL_DISABLED
/* Load thread-local bytecode */
if (frame->tlbc_index != ((_PyThreadStateImpl *)tstate)->tlbc_index) {
_Py_CODEUNIT *bytecode =
_PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame));
if (bytecode == NULL) {
goto error;
}
ptrdiff_t off = frame->instr_ptr - _PyFrame_GetBytecode(frame);
frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
frame->instr_ptr = bytecode + off;
}
#endif
_Py_Instrument(_PyFrame_GetCode(frame), tstate->interp);
monitor_throw(tstate, frame, frame->instr_ptr);
/* TO DO -- Monitor throw entry. */
@ -983,7 +996,7 @@ exception_unwind:
Python main loop. */
PyObject *exc = _PyErr_GetRaisedException(tstate);
PUSH(PyStackRef_FromPyObjectSteal(exc));
next_instr = _PyCode_CODE(_PyFrame_GetCode(frame)) + handler;
next_instr = _PyFrame_GetBytecode(frame) + handler;
if (monitor_handled(tstate, frame, next_instr, exc) < 0) {
goto exception_unwind;
@ -1045,6 +1058,8 @@ enter_tier_two:
#undef ENABLE_SPECIALIZATION
#define ENABLE_SPECIALIZATION 0
#undef ENABLE_SPECIALIZATION_FT
#define ENABLE_SPECIALIZATION_FT 0
#ifdef Py_DEBUG
#define DPRINTF(level, ...) \
@ -1139,7 +1154,7 @@ exit_to_tier1_dynamic:
goto goto_to_tier1;
exit_to_tier1:
assert(next_uop[-1].format == UOP_FORMAT_TARGET);
next_instr = next_uop[-1].target + _PyCode_CODE(_PyFrame_GetCode(frame));
next_instr = next_uop[-1].target + _PyFrame_GetBytecode(frame);
goto_to_tier1:
#ifdef Py_DEBUG
if (lltrace >= 2) {
@ -1764,7 +1779,7 @@ _PyEvalFramePushAndInit(PyThreadState *tstate, _PyStackRef func,
if (frame == NULL) {
goto fail;
}
_PyFrame_Initialize(frame, func, locals, code, 0, previous);
_PyFrame_Initialize(tstate, frame, func, locals, code, 0, previous);
if (initialize_locals(tstate, func_obj, frame->localsplus, args, argcount, kwnames)) {
assert(frame->owner == FRAME_OWNED_BY_THREAD);
clear_thread_frame(tstate, frame);


@ -151,7 +151,7 @@ GETITEM(PyObject *v, Py_ssize_t i) {
/* Code access macros */
/* The integer overflow is checked by an assertion below. */
#define INSTR_OFFSET() ((int)(next_instr - _PyCode_CODE(_PyFrame_GetCode(frame))))
#define INSTR_OFFSET() ((int)(next_instr - _PyFrame_GetBytecode(frame)))
#define NEXTOPARG() do { \
_Py_CODEUNIT word = {.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t*)next_instr)}; \
opcode = word.op.code; \
@ -301,14 +301,6 @@ GETITEM(PyObject *v, Py_ssize_t i) {
#define ADAPTIVE_COUNTER_TRIGGERS(COUNTER) \
backoff_counter_triggers(forge_backoff_counter((COUNTER)))
#ifdef Py_GIL_DISABLED
#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
do { \
/* gh-115999 tracks progress on addressing this. */ \
static_assert(0, "The specializing interpreter is not yet thread-safe"); \
} while (0);
#define PAUSE_ADAPTIVE_COUNTER(COUNTER) ((void)COUNTER)
#else
#define ADVANCE_ADAPTIVE_COUNTER(COUNTER) \
do { \
(COUNTER) = advance_backoff_counter((COUNTER)); \
@ -318,6 +310,18 @@ GETITEM(PyObject *v, Py_ssize_t i) {
do { \
(COUNTER) = pause_backoff_counter((COUNTER)); \
} while (0);
#if ENABLE_SPECIALIZATION_FT
/* Multiple threads may execute these concurrently if thread-local bytecode is
* disabled and they all execute the main copy of the bytecode. Specialization
* is disabled in that case so the value is unused, but the RMW cycle should be
* free of data races.
*/
#define RECORD_BRANCH_TAKEN(bitset, flag) \
FT_ATOMIC_STORE_UINT16_RELAXED( \
bitset, (FT_ATOMIC_LOAD_UINT16_RELAXED(bitset) << 1) | (flag))
#else
#define RECORD_BRANCH_TAKEN(bitset, flag)
#endif
#define UNBOUNDLOCAL_ERROR_MSG \


@ -41,6 +41,8 @@
/* _QUICKEN_RESUME is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */
/* _LOAD_BYTECODE is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */
case _RESUME_CHECK: {
#if defined(__EMSCRIPTEN__)
if (_Py_emscripten_signal_clock == 0) {
@ -56,6 +58,13 @@
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
}
#ifdef Py_GIL_DISABLED
if (frame->tlbc_index !=
((_PyThreadStateImpl *)tstate)->tlbc_index) {
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
}
#endif
break;
}
@ -4480,8 +4489,8 @@
_PyFrame_SetStackPointer(frame, stack_pointer);
_PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked(
tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame);
assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK);
stack_pointer = _PyFrame_GetStackPointer(frame);
assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK);
/* Push self onto stack of shim */
shim->localsplus[0] = PyStackRef_DUP(self[0]);
_PyFrame_SetStackPointer(frame, stack_pointer);
@ -5683,7 +5692,9 @@
PyObject *exit_p = (PyObject *)CURRENT_OPERAND();
_PyExitData *exit = (_PyExitData *)exit_p;
PyCodeObject *code = _PyFrame_GetCode(frame);
_Py_CODEUNIT *target = _PyCode_CODE(code) + exit->target;
_PyFrame_SetStackPointer(frame, stack_pointer);
_Py_CODEUNIT *target = _PyFrame_GetBytecode(frame) + exit->target;
stack_pointer = _PyFrame_GetStackPointer(frame);
#if defined(Py_DEBUG) && !defined(_Py_JIT)
OPT_HIST(trace_uop_execution_counter, trace_run_length_hist);
if (lltrace >= 2) {
@ -5692,7 +5703,7 @@
_PyUOpPrint(&next_uop[-1]);
printf(", exit %u, temp %d, target %d -> %s]\n",
exit - current_executor->exits, exit->temperature.value_and_backoff,
(int)(target - _PyCode_CODE(code)),
(int)(target - _PyFrame_GetBytecode(frame)),
_PyOpcode_OpName[target->op.code]);
stack_pointer = _PyFrame_GetStackPointer(frame);
}
@ -5878,7 +5889,7 @@
_PyUOpPrint(&next_uop[-1]);
printf(", exit %u, temp %d, target %d -> %s]\n",
exit - current_executor->exits, exit->temperature.value_and_backoff,
(int)(target - _PyCode_CODE(_PyFrame_GetCode(frame))),
(int)(target - _PyFrame_GetBytecode(frame)),
_PyOpcode_OpName[target->op.code]);
stack_pointer = _PyFrame_GetStackPointer(frame);
}
@ -5956,9 +5967,11 @@
case _ERROR_POP_N: {
oparg = CURRENT_OPARG();
uint32_t target = (uint32_t)CURRENT_OPERAND();
frame->instr_ptr = ((_Py_CODEUNIT *)_PyFrame_GetCode(frame)->co_code_adaptive) + target;
stack_pointer += -oparg;
assert(WITHIN_STACK_BOUNDS());
_PyFrame_SetStackPointer(frame, stack_pointer);
frame->instr_ptr = _PyFrame_GetBytecode(frame) + target;
stack_pointer = _PyFrame_GetStackPointer(frame);
GOTO_UNWIND();
break;
}


@ -63,7 +63,8 @@ take_ownership(PyFrameObject *f, _PyInterpreterFrame *frame)
// This may be a newly-created generator or coroutine frame. Since it's
// dead anyways, just pretend that the first RESUME ran:
PyCodeObject *code = _PyFrame_GetCode(frame);
frame->instr_ptr = _PyCode_CODE(code) + code->_co_firsttraceable + 1;
frame->instr_ptr =
_PyFrame_GetBytecode(frame) + code->_co_firsttraceable + 1;
}
assert(!_PyFrame_IsIncomplete(frame));
assert(f->f_back == NULL);


@ -1953,16 +1953,22 @@ custom_visitor_wrapper(const mi_heap_t *heap, const mi_heap_area_t *area,
}
void
PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg)
_PyGC_VisitObjectsWorldStopped(PyInterpreterState *interp,
gcvisitobjects_t callback, void *arg)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
struct custom_visitor_args wrapper = {
.callback = callback,
.arg = arg,
};
_PyEval_StopTheWorld(interp);
gc_visit_heaps(interp, &custom_visitor_wrapper, &wrapper.base);
}
void
PyUnstable_GC_VisitObjects(gcvisitobjects_t callback, void *arg)
{
PyInterpreterState *interp = _PyInterpreterState_GET();
_PyEval_StopTheWorld(interp);
_PyGC_VisitObjectsWorldStopped(interp, callback, arg);
_PyEval_StartTheWorld(interp);
}


@ -25,7 +25,7 @@
lhs = stack_pointer[-2];
uint16_t counter = read_u16(&this_instr[1].cache);
(void)counter;
#if ENABLE_SPECIALIZATION
#if ENABLE_SPECIALIZATION_FT
if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
next_instr = this_instr;
_PyFrame_SetStackPointer(frame, stack_pointer);
@ -35,7 +35,7 @@
}
OPCODE_DEFERRED_INC(BINARY_OP);
ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter);
#endif /* ENABLE_SPECIALIZATION */
#endif /* ENABLE_SPECIALIZATION_FT */
assert(NB_ADD <= oparg);
assert(oparg <= NB_INPLACE_XOR);
}
@ -435,8 +435,8 @@
container = stack_pointer[-2];
uint16_t counter = read_u16(&this_instr[1].cache);
(void)counter;
assert(frame->stackpointer == NULL);
#if ENABLE_SPECIALIZATION
assert(frame->stackpointer == NULL);
if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
next_instr = this_instr;
_PyFrame_SetStackPointer(frame, stack_pointer);
@ -1066,8 +1066,8 @@
_PyFrame_SetStackPointer(frame, stack_pointer);
_PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked(
tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame);
assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK);
stack_pointer = _PyFrame_GetStackPointer(frame);
assert(_PyCode_CODE(_PyFrame_GetCode(shim))[0].op.code == EXIT_INIT_CHECK);
/* Push self onto stack of shim */
shim->localsplus[0] = PyStackRef_DUP(self[0]);
_PyFrame_SetStackPointer(frame, stack_pointer);
@ -4711,7 +4711,9 @@
int original_opcode = 0;
if (tstate->tracing) {
PyCodeObject *code = _PyFrame_GetCode(frame);
original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyCode_CODE(code))].original_opcode;
_PyFrame_SetStackPointer(frame, stack_pointer);
original_opcode = code->_co_monitoring->lines[(int)(this_instr - _PyFrame_GetBytecode(frame))].original_opcode;
stack_pointer = _PyFrame_GetStackPointer(frame);
next_instr = this_instr;
} else {
_PyFrame_SetStackPointer(frame, stack_pointer);
@ -4759,9 +4761,7 @@
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
int offset = flag * oparg;
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
DISPATCH();
}
@ -4782,9 +4782,7 @@
PyStackRef_CLOSE(value_stackref);
offset = 0;
}
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
DISPATCH();
}
@ -4822,9 +4820,7 @@
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
int offset = flag * oparg;
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
INSTRUMENTED_JUMP(this_instr, next_instr + offset, PY_MONITORING_EVENT_BRANCH);
DISPATCH();
}
@ -4834,6 +4830,28 @@
(void)this_instr;
next_instr += 1;
INSTRUCTION_STATS(INSTRUMENTED_RESUME);
// _LOAD_BYTECODE
{
#ifdef Py_GIL_DISABLED
if (frame->tlbc_index !=
((_PyThreadStateImpl *)tstate)->tlbc_index) {
_PyFrame_SetStackPointer(frame, stack_pointer);
_Py_CODEUNIT *bytecode =
_PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame));
stack_pointer = _PyFrame_GetStackPointer(frame);
if (bytecode == NULL) goto error;
_PyFrame_SetStackPointer(frame, stack_pointer);
int off = this_instr - _PyFrame_GetBytecode(frame);
stack_pointer = _PyFrame_GetStackPointer(frame);
frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
frame->instr_ptr = bytecode + off;
// Make sure this_instr gets reset correctly for any uops that
// follow
next_instr = frame->instr_ptr;
DISPATCH();
}
#endif
}
// _MAYBE_INSTRUMENT
{
if (tstate->tracing == 0) {
@ -6646,9 +6664,7 @@
cond = stack_pointer[-1];
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
stack_pointer += -1;
assert(WITHIN_STACK_BOUNDS());
@ -6680,9 +6696,7 @@
cond = b;
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}
stack_pointer += -1;
@ -6715,9 +6729,7 @@
cond = b;
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_False);
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
}
stack_pointer += -1;
@ -6735,9 +6747,7 @@
cond = stack_pointer[-1];
assert(PyStackRef_BoolCheck(cond));
int flag = PyStackRef_Is(cond, PyStackRef_True);
#if ENABLE_SPECIALIZATION
this_instr[1].cache = (this_instr[1].cache << 1) | flag;
#endif
RECORD_BRANCH_TAKEN(this_instr[1].cache, flag);
JUMPBY(oparg * flag);
stack_pointer += -1;
assert(WITHIN_STACK_BOUNDS());
@ -6832,7 +6842,11 @@
if (oparg) {
PyObject *lasti = PyStackRef_AsPyObjectBorrow(values[0]);
if (PyLong_Check(lasti)) {
frame->instr_ptr = _PyCode_CODE(_PyFrame_GetCode(frame)) + PyLong_AsLong(lasti);
stack_pointer += -1;
assert(WITHIN_STACK_BOUNDS());
_PyFrame_SetStackPointer(frame, stack_pointer);
frame->instr_ptr = _PyFrame_GetBytecode(frame) + PyLong_AsLong(lasti);
stack_pointer = _PyFrame_GetStackPointer(frame);
assert(!_PyErr_Occurred(tstate));
}
else {
@ -6844,6 +6858,8 @@
Py_DECREF(exc);
goto error;
}
stack_pointer += 1;
assert(WITHIN_STACK_BOUNDS());
}
assert(exc && PyExceptionInstance_Check(exc));
stack_pointer += -1;
@ -6871,6 +6887,28 @@
PREDICTED(RESUME);
_Py_CODEUNIT* const this_instr = next_instr - 1;
(void)this_instr;
// _LOAD_BYTECODE
{
#ifdef Py_GIL_DISABLED
if (frame->tlbc_index !=
((_PyThreadStateImpl *)tstate)->tlbc_index) {
_PyFrame_SetStackPointer(frame, stack_pointer);
_Py_CODEUNIT *bytecode =
_PyEval_GetExecutableCode(tstate, _PyFrame_GetCode(frame));
stack_pointer = _PyFrame_GetStackPointer(frame);
if (bytecode == NULL) goto error;
_PyFrame_SetStackPointer(frame, stack_pointer);
int off = this_instr - _PyFrame_GetBytecode(frame);
stack_pointer = _PyFrame_GetStackPointer(frame);
frame->tlbc_index = ((_PyThreadStateImpl *)tstate)->tlbc_index;
frame->instr_ptr = bytecode + off;
// Make sure this_instr gets reset correctly for any uops that
// follow
next_instr = frame->instr_ptr;
DISPATCH();
}
#endif
}
// _MAYBE_INSTRUMENT
{
if (tstate->tracing == 0) {
@ -6890,11 +6928,11 @@
}
// _QUICKEN_RESUME
{
#if ENABLE_SPECIALIZATION
#if ENABLE_SPECIALIZATION_FT
if (tstate->tracing == 0 && this_instr->op.code == RESUME) {
FT_ATOMIC_STORE_UINT8_RELAXED(this_instr->op.code, RESUME_CHECK);
}
#endif /* ENABLE_SPECIALIZATION */
#endif /* ENABLE_SPECIALIZATION_FT */
}
// _CHECK_PERIODIC_IF_NOT_YIELD_FROM
{
@ -6925,6 +6963,10 @@
uintptr_t version = FT_ATOMIC_LOAD_UINTPTR_ACQUIRE(_PyFrame_GetCode(frame)->_co_instrumentation_version);
assert((version & _PY_EVAL_EVENTS_MASK) == 0);
DEOPT_IF(eval_breaker != version, RESUME);
#ifdef Py_GIL_DISABLED
DEOPT_IF(frame->tlbc_index !=
((_PyThreadStateImpl *)tstate)->tlbc_index, RESUME);
#endif
DISPATCH();
}

Python/index_pool.c (new file, 193 lines)

@ -0,0 +1,193 @@
#include <stdbool.h>
#include "Python.h"
#include "pycore_index_pool.h"
#include "pycore_lock.h"
#ifdef Py_GIL_DISABLED
static inline void
swap(int32_t *values, Py_ssize_t i, Py_ssize_t j)
{
int32_t tmp = values[i];
values[i] = values[j];
values[j] = tmp;
}
static bool
heap_try_swap(_PyIndexHeap *heap, Py_ssize_t i, Py_ssize_t j)
{
if (i < 0 || i >= heap->size) {
return 0;
}
if (j < 0 || j >= heap->size) {
return 0;
}
if (i <= j) {
if (heap->values[i] <= heap->values[j]) {
return 0;
}
}
else if (heap->values[j] <= heap->values[i]) {
return 0;
}
swap(heap->values, i, j);
return 1;
}
static inline Py_ssize_t
parent(Py_ssize_t i)
{
return (i - 1) / 2;
}
static inline Py_ssize_t
left_child(Py_ssize_t i)
{
return 2 * i + 1;
}
static inline Py_ssize_t
right_child(Py_ssize_t i)
{
return 2 * i + 2;
}
static void
heap_add(_PyIndexHeap *heap, int32_t val)
{
assert(heap->size < heap->capacity);
// Add val to end
heap->values[heap->size] = val;
heap->size++;
// Sift up
for (Py_ssize_t cur = heap->size - 1; cur > 0; cur = parent(cur)) {
if (!heap_try_swap(heap, cur, parent(cur))) {
break;
}
}
}
static Py_ssize_t
heap_min_child(_PyIndexHeap *heap, Py_ssize_t i)
{
if (left_child(i) < heap->size) {
if (right_child(i) < heap->size) {
Py_ssize_t lval = heap->values[left_child(i)];
Py_ssize_t rval = heap->values[right_child(i)];
return lval < rval ? left_child(i) : right_child(i);
}
return left_child(i);
}
else if (right_child(i) < heap->size) {
return right_child(i);
}
return -1;
}
static int32_t
heap_pop(_PyIndexHeap *heap)
{
assert(heap->size > 0);
// Pop smallest and replace with the last element
int32_t result = heap->values[0];
heap->values[0] = heap->values[heap->size - 1];
heap->size--;
// Sift down
for (Py_ssize_t cur = 0; cur < heap->size;) {
Py_ssize_t min_child = heap_min_child(heap, cur);
if (min_child > -1 && heap_try_swap(heap, cur, min_child)) {
cur = min_child;
}
else {
break;
}
}
return result;
}
static int
heap_ensure_capacity(_PyIndexHeap *heap, Py_ssize_t limit)
{
assert(limit > 0);
if (heap->capacity > limit) {
return 0;
}
Py_ssize_t new_capacity = heap->capacity ? heap->capacity : 1024;
while (new_capacity && new_capacity < limit) {
new_capacity <<= 1;
}
if (!new_capacity) {
return -1;
}
int32_t *new_values = PyMem_RawCalloc(new_capacity, sizeof(int32_t));
if (new_values == NULL) {
return -1;
}
if (heap->values != NULL) {
memcpy(new_values, heap->values, heap->capacity * sizeof(int32_t));
PyMem_RawFree(heap->values);
}
heap->values = new_values;
heap->capacity = new_capacity;
return 0;
}
static void
heap_fini(_PyIndexHeap *heap)
{
if (heap->values != NULL) {
PyMem_RawFree(heap->values);
heap->values = NULL;
}
heap->size = -1;
heap->capacity = -1;
}
#define LOCK_POOL(pool) PyMutex_LockFlags(&pool->mutex, _Py_LOCK_DONT_DETACH)
#define UNLOCK_POOL(pool) PyMutex_Unlock(&pool->mutex)
int32_t
_PyIndexPool_AllocIndex(_PyIndexPool *pool)
{
LOCK_POOL(pool);
int32_t index;
_PyIndexHeap *free_indices = &pool->free_indices;
if (free_indices->size == 0) {
// No free indices. Make sure the heap can always store all of the
// indices that have been allocated to avoid having to allocate memory
// (which can fail) when freeing an index. Freeing indices happens when
// threads are being destroyed, which makes error handling awkward /
// impossible. This arrangement shifts handling of allocation failures
// to when indices are allocated, which happens at thread creation,
// where we are better equipped to deal with failure.
if (heap_ensure_capacity(free_indices, pool->next_index + 1) < 0) {
UNLOCK_POOL(pool);
PyErr_NoMemory();
return -1;
}
index = pool->next_index++;
}
else {
index = heap_pop(free_indices);
}
UNLOCK_POOL(pool);
return index;
}
void
_PyIndexPool_FreeIndex(_PyIndexPool *pool, int32_t index)
{
LOCK_POOL(pool);
heap_add(&pool->free_indices, index);
UNLOCK_POOL(pool);
}
void
_PyIndexPool_Fini(_PyIndexPool *pool)
{
heap_fini(&pool->free_indices);
}
#endif // Py_GIL_DISABLED
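The pool always hands out the smallest available index (free_indices is a min-heap), which keeps co_tlbc arrays dense. A minimal usage sketch, assuming a zero-initialized _PyIndexPool; the real callers are _Py_ReserveTLBCIndex at thread creation and _Py_ClearTLBCIndex at thread destruction, which appear in the pystate.c hunks below:

    /* Sketch only: reserve an index for a new thread, release it on exit. */
    static _PyIndexPool tlbc_pool;  // zero-initialized: empty heap, next_index == 0

    int32_t idx = _PyIndexPool_AllocIndex(&tlbc_pool);  // returns 0 on first call
    if (idx < 0) {
        /* allocation failed; PyErr_NoMemory() has been set */
    }
    /* ... thread runs, using idx to address its slot in every co_tlbc ... */
    _PyIndexPool_FreeIndex(&tlbc_pool, idx);  // freed indices are reused lowest-first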


@ -134,6 +134,7 @@ static const PyConfigSpec PYCONFIG_SPEC[] = {
SPEC(dump_refs_file, WSTR_OPT, READ_ONLY, NO_SYS),
#ifdef Py_GIL_DISABLED
SPEC(enable_gil, INT, READ_ONLY, NO_SYS),
SPEC(tlbc_enabled, INT, READ_ONLY, NO_SYS),
#endif
SPEC(faulthandler, BOOL, READ_ONLY, NO_SYS),
SPEC(filesystem_encoding, WSTR, READ_ONLY, NO_SYS),
@ -315,8 +316,13 @@ The following implementation-specific options are available:\n\
"\
-X showrefcount: output the total reference count and number of used\n\
memory blocks when the program finishes or after each statement in\n\
the interactive interpreter; only works on debug builds\n\
-X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n\
the interactive interpreter; only works on debug builds\n"
#ifdef Py_GIL_DISABLED
"-X tlbc=[0|1]: enable (1) or disable (0) thread-local bytecode. Also\n\
PYTHON_TLBC\n"
#endif
"\
-X tracemalloc[=N]: trace Python memory allocations; N sets a traceback limit\n \
of N frames (default: 1); also PYTHONTRACEMALLOC=N\n\
-X utf8[=0|1]: enable (1) or disable (0) UTF-8 mode; also PYTHONUTF8\n\
-X warn_default_encoding: enable opt-in EncodingWarning for 'encoding=None';\n\
@ -400,6 +406,9 @@ static const char usage_envvars[] =
#ifdef Py_STATS
"PYTHONSTATS : turns on statistics gathering (-X pystats)\n"
#endif
#ifdef Py_GIL_DISABLED
"PYTHON_TLBC : when set to 0, disables thread-local bytecode (-X tlbc)\n"
#endif
"PYTHONTRACEMALLOC: trace Python memory allocations (-X tracemalloc)\n"
"PYTHONUNBUFFERED: disable stdout/stderr buffering (-u)\n"
"PYTHONUTF8 : control the UTF-8 mode (-X utf8)\n"
@ -979,6 +988,7 @@ _PyConfig_InitCompatConfig(PyConfig *config)
config->cpu_count = -1;
#ifdef Py_GIL_DISABLED
config->enable_gil = _PyConfig_GIL_DEFAULT;
config->tlbc_enabled = 1;
#endif
}
@ -1862,6 +1872,36 @@ error:
"n must be greater than 0");
}
static PyStatus
config_init_tlbc(PyConfig *config)
{
#ifdef Py_GIL_DISABLED
const char *env = config_get_env(config, "PYTHON_TLBC");
if (env) {
int enabled;
if (_Py_str_to_int(env, &enabled) < 0 || (enabled < 0) || (enabled > 1)) {
return _PyStatus_ERR(
"PYTHON_TLBC=N: N is missing or invalid");
}
config->tlbc_enabled = enabled;
}
const wchar_t *xoption = config_get_xoption(config, L"tlbc");
if (xoption) {
int enabled;
const wchar_t *sep = wcschr(xoption, L'=');
if (!sep || (config_wstr_to_int(sep + 1, &enabled) < 0) || (enabled < 0) || (enabled > 1)) {
return _PyStatus_ERR(
"-X tlbc=n: n is missing or invalid");
}
config->tlbc_enabled = enabled;
}
return _PyStatus_OK();
#else
return _PyStatus_OK();
#endif
}
static PyStatus
config_init_perf_profiling(PyConfig *config)
{
@ -2111,6 +2151,11 @@ config_read_complex_options(PyConfig *config)
}
#endif
status = config_init_tlbc(config);
if (_PyStatus_EXCEPTION(status)) {
return status;
}
return _PyStatus_OK();
}


@ -44,10 +44,24 @@
#define UNLOCK_CODE() Py_END_CRITICAL_SECTION()
#define MODIFY_BYTECODE(code, func, ...) \
do { \
PyCodeObject *co = (code); \
for (Py_ssize_t i = 0; i < code->co_tlbc->size; i++) { \
char *bc = co->co_tlbc->entries[i]; \
if (bc == NULL) { \
continue; \
} \
(func)((_Py_CODEUNIT *)bc, __VA_ARGS__); \
} \
} while (0)
#else
#define LOCK_CODE(code)
#define UNLOCK_CODE()
#define MODIFY_BYTECODE(code, func, ...) \
(func)(_PyCode_CODE(code), __VA_ARGS__)
#endif
@ -309,7 +323,8 @@ _PyInstruction_GetLength(PyCodeObject *code, int offset)
{
ASSERT_WORLD_STOPPED_OR_LOCKED(code);
int opcode = _PyCode_CODE(code)[offset].op.code;
int opcode =
FT_ATOMIC_LOAD_UINT8_RELAXED(_PyCode_CODE(code)[offset].op.code);
assert(opcode != 0);
assert(opcode != RESERVED);
if (opcode == INSTRUMENTED_LINE) {
@ -578,7 +593,9 @@ sanity_check_instrumentation(PyCodeObject *code)
_Py_CODEUNIT
_Py_GetBaseCodeUnit(PyCodeObject *code, int i)
{
_Py_CODEUNIT inst = _PyCode_CODE(code)[i];
_Py_CODEUNIT *src_instr = _PyCode_CODE(code) + i;
_Py_CODEUNIT inst = {
.cache = FT_ATOMIC_LOAD_UINT16_RELAXED(*(uint16_t *)src_instr)};
int opcode = inst.op.code;
if (opcode < MIN_INSTRUMENTED_OPCODE) {
inst.op.code = _PyOpcode_Deopt[opcode];
@ -614,21 +631,22 @@ _Py_GetBaseCodeUnit(PyCodeObject *code, int i)
}
static void
de_instrument(PyCodeObject *code, int i, int event)
de_instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i,
int event)
{
assert(event != PY_MONITORING_EVENT_INSTRUCTION);
assert(event != PY_MONITORING_EVENT_LINE);
_Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
_Py_CODEUNIT *instr = &bytecode[i];
uint8_t *opcode_ptr = &instr->op.code;
int opcode = *opcode_ptr;
assert(opcode != ENTER_EXECUTOR);
if (opcode == INSTRUMENTED_LINE) {
opcode_ptr = &code->_co_monitoring->lines[i].original_opcode;
opcode_ptr = &monitoring->lines[i].original_opcode;
opcode = *opcode_ptr;
}
if (opcode == INSTRUMENTED_INSTRUCTION) {
opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i];
opcode_ptr = &monitoring->per_instruction_opcodes[i];
opcode = *opcode_ptr;
}
int deinstrumented = DE_INSTRUMENT[opcode];
@ -644,65 +662,68 @@ de_instrument(PyCodeObject *code, int i, int event)
}
static void
de_instrument_line(PyCodeObject *code, int i)
de_instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring,
int i)
{
_Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
_Py_CODEUNIT *instr = &bytecode[i];
int opcode = instr->op.code;
if (opcode != INSTRUMENTED_LINE) {
return;
}
_PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i];
_PyCoLineInstrumentationData *lines = &monitoring->lines[i];
int original_opcode = lines->original_opcode;
if (original_opcode == INSTRUMENTED_INSTRUCTION) {
lines->original_opcode = code->_co_monitoring->per_instruction_opcodes[i];
lines->original_opcode = monitoring->per_instruction_opcodes[i];
}
CHECK(original_opcode != 0);
CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]);
instr->op.code = original_opcode;
FT_ATOMIC_STORE_UINT8(instr->op.code, original_opcode);
if (_PyOpcode_Caches[original_opcode]) {
instr[1].counter = adaptive_counter_warmup();
FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff,
adaptive_counter_warmup().value_and_backoff);
}
assert(instr->op.code != INSTRUMENTED_LINE);
}
static void
de_instrument_per_instruction(PyCodeObject *code, int i)
de_instrument_per_instruction(_Py_CODEUNIT *bytecode,
_PyCoMonitoringData *monitoring, int i)
{
_Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
_Py_CODEUNIT *instr = &bytecode[i];
uint8_t *opcode_ptr = &instr->op.code;
int opcode = *opcode_ptr;
if (opcode == INSTRUMENTED_LINE) {
opcode_ptr = &code->_co_monitoring->lines[i].original_opcode;
opcode_ptr = &monitoring->lines[i].original_opcode;
opcode = *opcode_ptr;
}
if (opcode != INSTRUMENTED_INSTRUCTION) {
return;
}
int original_opcode = code->_co_monitoring->per_instruction_opcodes[i];
int original_opcode = monitoring->per_instruction_opcodes[i];
CHECK(original_opcode != 0);
CHECK(original_opcode == _PyOpcode_Deopt[original_opcode]);
*opcode_ptr = original_opcode;
FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, original_opcode);
if (_PyOpcode_Caches[original_opcode]) {
instr[1].counter = adaptive_counter_warmup();
FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff,
adaptive_counter_warmup().value_and_backoff);
}
assert(*opcode_ptr != INSTRUMENTED_INSTRUCTION);
assert(instr->op.code != INSTRUMENTED_INSTRUCTION);
}
static void
instrument(PyCodeObject *code, int i)
instrument(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i)
{
_Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
_Py_CODEUNIT *instr = &bytecode[i];
uint8_t *opcode_ptr = &instr->op.code;
int opcode = *opcode_ptr;
if (opcode == INSTRUMENTED_LINE) {
_PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i];
_PyCoLineInstrumentationData *lines = &monitoring->lines[i];
opcode_ptr = &lines->original_opcode;
opcode = *opcode_ptr;
}
if (opcode == INSTRUMENTED_INSTRUCTION) {
opcode_ptr = &code->_co_monitoring->per_instruction_opcodes[i];
opcode_ptr = &monitoring->per_instruction_opcodes[i];
opcode = *opcode_ptr;
CHECK(opcode != INSTRUMENTED_INSTRUCTION && opcode != INSTRUMENTED_LINE);
CHECK(opcode == _PyOpcode_Deopt[opcode]);
@ -716,52 +737,52 @@ instrument(PyCodeObject *code, int i)
if (_PyOpcode_Caches[deopt]) {
FT_ATOMIC_STORE_UINT16_RELAXED(instr[1].counter.value_and_backoff,
adaptive_counter_warmup().value_and_backoff);
instr[1].counter = adaptive_counter_warmup();
}
}
}
static void
instrument_line(PyCodeObject *code, int i)
instrument_line(_Py_CODEUNIT *bytecode, _PyCoMonitoringData *monitoring, int i)
{
uint8_t *opcode_ptr = &_PyCode_CODE(code)[i].op.code;
uint8_t *opcode_ptr = &bytecode[i].op.code;
int opcode = *opcode_ptr;
if (opcode == INSTRUMENTED_LINE) {
return;
}
_PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i];
_PyCoLineInstrumentationData *lines = &monitoring->lines[i];
lines->original_opcode = _PyOpcode_Deopt[opcode];
CHECK(lines->original_opcode > 0);
*opcode_ptr = INSTRUMENTED_LINE;
FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_LINE);
}
static void
instrument_per_instruction(PyCodeObject *code, int i)
instrument_per_instruction(_Py_CODEUNIT *bytecode,
_PyCoMonitoringData *monitoring, int i)
{
_Py_CODEUNIT *instr = &_PyCode_CODE(code)[i];
_Py_CODEUNIT *instr = &bytecode[i];
uint8_t *opcode_ptr = &instr->op.code;
int opcode = *opcode_ptr;
if (opcode == INSTRUMENTED_LINE) {
_PyCoLineInstrumentationData *lines = &code->_co_monitoring->lines[i];
_PyCoLineInstrumentationData *lines = &monitoring->lines[i];
opcode_ptr = &lines->original_opcode;
opcode = *opcode_ptr;
}
if (opcode == INSTRUMENTED_INSTRUCTION) {
assert(code->_co_monitoring->per_instruction_opcodes[i] > 0);
assert(monitoring->per_instruction_opcodes[i] > 0);
return;
}
CHECK(opcode != 0);
if (is_instrumented(opcode)) {
code->_co_monitoring->per_instruction_opcodes[i] = opcode;
monitoring->per_instruction_opcodes[i] = opcode;
}
else {
assert(opcode != 0);
assert(_PyOpcode_Deopt[opcode] != 0);
assert(_PyOpcode_Deopt[opcode] != RESUME);
code->_co_monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode];
monitoring->per_instruction_opcodes[i] = _PyOpcode_Deopt[opcode];
}
assert(code->_co_monitoring->per_instruction_opcodes[i] > 0);
*opcode_ptr = INSTRUMENTED_INSTRUCTION;
assert(monitoring->per_instruction_opcodes[i] > 0);
FT_ATOMIC_STORE_UINT8_RELAXED(*opcode_ptr, INSTRUMENTED_INSTRUCTION);
}
static void
@ -773,19 +794,19 @@ remove_tools(PyCodeObject * code, int offset, int event, int tools)
assert(PY_MONITORING_IS_INSTRUMENTED_EVENT(event));
assert(opcode_has_event(_Py_GetBaseCodeUnit(code, offset).op.code));
_PyCoMonitoringData *monitoring = code->_co_monitoring;
bool should_de_instrument;
if (monitoring && monitoring->tools) {
monitoring->tools[offset] &= ~tools;
if (monitoring->tools[offset] == 0) {
de_instrument(code, offset, event);
}
should_de_instrument = (monitoring->tools[offset] == 0);
}
else {
/* Single tool */
uint8_t single_tool = code->_co_monitoring->active_monitors.tools[event];
assert(_Py_popcount32(single_tool) <= 1);
if (((single_tool & tools) == single_tool)) {
de_instrument(code, offset, event);
}
should_de_instrument = ((single_tool & tools) == single_tool);
}
if (should_de_instrument) {
MODIFY_BYTECODE(code, de_instrument, monitoring, offset, event);
}
}
@ -804,22 +825,23 @@ remove_line_tools(PyCodeObject * code, int offset, int tools)
{
ASSERT_WORLD_STOPPED_OR_LOCKED(code);
assert(code->_co_monitoring);
if (code->_co_monitoring->line_tools)
_PyCoMonitoringData *monitoring = code->_co_monitoring;
assert(monitoring);
bool should_de_instrument;
if (monitoring->line_tools)
{
uint8_t *toolsptr = &code->_co_monitoring->line_tools[offset];
uint8_t *toolsptr = &monitoring->line_tools[offset];
*toolsptr &= ~tools;
if (*toolsptr == 0 ) {
de_instrument_line(code, offset);
}
should_de_instrument = (*toolsptr == 0);
}
else {
/* Single tool */
uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE];
uint8_t single_tool = monitoring->active_monitors.tools[PY_MONITORING_EVENT_LINE];
assert(_Py_popcount32(single_tool) <= 1);
if (((single_tool & tools) == single_tool)) {
de_instrument_line(code, offset);
}
should_de_instrument = ((single_tool & tools) == single_tool);
}
if (should_de_instrument) {
MODIFY_BYTECODE(code, de_instrument_line, monitoring, offset);
}
}
@ -841,7 +863,7 @@ add_tools(PyCodeObject * code, int offset, int event, int tools)
assert(_Py_popcount32(tools) == 1);
assert(tools_is_subset_for_event(code, event, tools));
}
instrument(code, offset);
MODIFY_BYTECODE(code, instrument, code->_co_monitoring, offset);
}
static void
@ -858,7 +880,7 @@ add_line_tools(PyCodeObject * code, int offset, int tools)
/* Single tool */
assert(_Py_popcount32(tools) == 1);
}
instrument_line(code, offset);
MODIFY_BYTECODE(code, instrument_line, code->_co_monitoring, offset);
}
@ -876,7 +898,7 @@ add_per_instruction_tools(PyCodeObject * code, int offset, int tools)
/* Single tool */
assert(_Py_popcount32(tools) == 1);
}
instrument_per_instruction(code, offset);
MODIFY_BYTECODE(code, instrument_per_instruction, code->_co_monitoring, offset);
}
@ -885,21 +907,22 @@ remove_per_instruction_tools(PyCodeObject * code, int offset, int tools)
{
ASSERT_WORLD_STOPPED_OR_LOCKED(code);
_PyCoMonitoringData *monitoring = code->_co_monitoring;
assert(code->_co_monitoring);
bool should_de_instrument;
if (code->_co_monitoring->per_instruction_tools) {
uint8_t *toolsptr = &code->_co_monitoring->per_instruction_tools[offset];
*toolsptr &= ~tools;
if (*toolsptr == 0) {
de_instrument_per_instruction(code, offset);
}
should_de_instrument = (*toolsptr == 0);
}
else {
/* Single tool */
uint8_t single_tool = code->_co_monitoring->active_monitors.tools[PY_MONITORING_EVENT_INSTRUCTION];
assert(_Py_popcount32(single_tool) <= 1);
if (((single_tool & tools) == single_tool)) {
de_instrument_per_instruction(code, offset);
}
should_de_instrument = ((single_tool & tools) == single_tool);
}
if (should_de_instrument) {
MODIFY_BYTECODE(code, de_instrument_per_instruction, monitoring, offset);
}
}
@ -1087,7 +1110,7 @@ call_instrumentation_vector(
PyCodeObject *code = _PyFrame_GetCode(frame);
assert(args[1] == NULL);
args[1] = (PyObject *)code;
int offset = (int)(instr - _PyCode_CODE(code));
int offset = (int)(instr - _PyFrame_GetBytecode(frame));
/* Offset visible to user should be the offset in bytes, as that is the
* convention for APIs involving code offsets. */
int bytes_offset = offset * (int)sizeof(_Py_CODEUNIT);
@ -1173,8 +1196,7 @@ _Py_call_instrumentation_jump(
assert(event == PY_MONITORING_EVENT_JUMP ||
event == PY_MONITORING_EVENT_BRANCH);
assert(frame->instr_ptr == instr);
PyCodeObject *code = _PyFrame_GetCode(frame);
int to = (int)(target - _PyCode_CODE(code));
int to = (int)(target - _PyFrame_GetBytecode(frame));
PyObject *to_obj = PyLong_FromLong(to * (int)sizeof(_Py_CODEUNIT));
if (to_obj == NULL) {
return NULL;
@ -1240,7 +1262,8 @@ _Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame,
PyCodeObject *code = _PyFrame_GetCode(frame);
assert(tstate->tracing == 0);
assert(debug_check_sanity(tstate->interp, code));
int i = (int)(instr - _PyCode_CODE(code));
_Py_CODEUNIT *bytecode = _PyFrame_GetBytecode(frame);
int i = (int)(instr - bytecode);
_PyCoMonitoringData *monitoring = code->_co_monitoring;
_PyCoLineInstrumentationData *line_data = &monitoring->lines[i];
@ -1256,10 +1279,10 @@ _Py_call_instrumentation_line(PyThreadState *tstate, _PyInterpreterFrame* frame,
line = compute_line(code, i, line_delta);
assert(line >= 0);
assert(prev != NULL);
int prev_index = (int)(prev - _PyCode_CODE(code));
int prev_index = (int)(prev - bytecode);
int prev_line = _Py_Instrumentation_GetLine(code, prev_index);
if (prev_line == line) {
int prev_opcode = _PyCode_CODE(code)[prev_index].op.code;
int prev_opcode = bytecode[prev_index].op.code;
/* RESUME and INSTRUMENTED_RESUME are needed for the operation of
* instrumentation, so must never be hidden by an INSTRUMENTED_LINE.
*/
@ -1359,7 +1382,7 @@ int
_Py_call_instrumentation_instruction(PyThreadState *tstate, _PyInterpreterFrame* frame, _Py_CODEUNIT *instr)
{
PyCodeObject *code = _PyFrame_GetCode(frame);
int offset = (int)(instr - _PyCode_CODE(code));
int offset = (int)(instr - _PyFrame_GetBytecode(frame));
_PyCoMonitoringData *instrumentation_data = code->_co_monitoring;
assert(instrumentation_data->per_instruction_opcodes);
int next_opcode = instrumentation_data->per_instruction_opcodes[offset];


@ -17,6 +17,8 @@
/* _QUICKEN_RESUME is not a viable micro-op for tier 2 */
/* _LOAD_BYTECODE is not a viable micro-op for tier 2 */
case _RESUME_CHECK: {
break;
}


@ -1513,6 +1513,11 @@ new_threadstate(PyInterpreterState *interp, int whence)
PyMem_RawFree(new_tstate);
return NULL;
}
int32_t tlbc_idx = _Py_ReserveTLBCIndex(interp);
if (tlbc_idx < 0) {
PyMem_RawFree(new_tstate);
return NULL;
}
#endif
/* We serialize concurrent creation to protect global state. */
@ -1555,6 +1560,7 @@ new_threadstate(PyInterpreterState *interp, int whence)
#ifdef Py_GIL_DISABLED
// Must be called with lock unlocked to avoid lock ordering deadlocks.
_Py_qsbr_register(tstate, interp, qsbr_idx);
tstate->tlbc_index = tlbc_idx;
#endif
return (PyThreadState *)tstate;
@ -1706,6 +1712,10 @@ PyThreadState_Clear(PyThreadState *tstate)
// Remove ourself from the biased reference counting table of threads.
_Py_brc_remove_thread(tstate);
// Release our thread-local copies of the bytecode for reuse by another
// thread
_Py_ClearTLBCIndex((_PyThreadStateImpl *)tstate);
#endif
// Merge our queue of pointers to be freed into the interpreter queue.


@ -24,6 +24,25 @@ extern const char *_PyUOpName(int index);
* ./adaptive.md
*/
#ifdef Py_GIL_DISABLED
#define SET_OPCODE_OR_RETURN(instr, opcode) \
do { \
uint8_t old_op = _Py_atomic_load_uint8_relaxed(&(instr)->op.code); \
if (old_op >= MIN_INSTRUMENTED_OPCODE) { \
/* Lost race with instrumentation */ \
return; \
} \
if (!_Py_atomic_compare_exchange_uint8(&(instr)->op.code, &old_op, \
(opcode))) { \
/* Lost race with instrumentation */ \
assert(old_op >= MIN_INSTRUMENTED_OPCODE); \
return; \
} \
} while (0)
#else
#define SET_OPCODE_OR_RETURN(instr, opcode) (instr)->op.code = (opcode)
#endif
#ifdef Py_STATS
GCStats _py_gc_stats[NUM_GENERATIONS] = { 0 };
static PyStats _Py_stats_struct = { .gc_stats = _py_gc_stats };
@ -436,16 +455,25 @@ do { \
# define SPECIALIZATION_FAIL(opcode, kind) ((void)0)
#endif
// Initialize warmup counters and insert superinstructions. This cannot fail.
// Initialize warmup counters and optimize instructions. This cannot fail.
void
_PyCode_Quicken(PyCodeObject *code)
_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts,
int enable_counters)
{
#if ENABLE_SPECIALIZATION
#if ENABLE_SPECIALIZATION_FT
_Py_BackoffCounter jump_counter, adaptive_counter;
if (enable_counters) {
jump_counter = initial_jump_backoff_counter();
adaptive_counter = adaptive_counter_warmup();
}
else {
jump_counter = initial_unreachable_backoff_counter();
adaptive_counter = initial_unreachable_backoff_counter();
}
int opcode = 0;
int oparg = 0;
_Py_CODEUNIT *instructions = _PyCode_CODE(code);
/* The last code unit cannot have a cache, so we don't need to check it */
for (int i = 0; i < Py_SIZE(code)-1; i++) {
for (Py_ssize_t i = 0; i < size-1; i++) {
opcode = instructions[i].op.code;
int caches = _PyOpcode_Caches[opcode];
oparg = (oparg << 8) | instructions[i].op.arg;
@ -453,7 +481,7 @@ _PyCode_Quicken(PyCodeObject *code)
// The initial value depends on the opcode
switch (opcode) {
case JUMP_BACKWARD:
instructions[i + 1].counter = initial_jump_backoff_counter();
instructions[i + 1].counter = jump_counter;
break;
case POP_JUMP_IF_FALSE:
case POP_JUMP_IF_TRUE:
@ -462,7 +490,7 @@ _PyCode_Quicken(PyCodeObject *code)
instructions[i + 1].cache = 0x5555; // Alternating 0, 1 bits
break;
default:
instructions[i + 1].counter = adaptive_counter_warmup();
instructions[i + 1].counter = adaptive_counter;
break;
}
i += caches;
@ -471,7 +499,7 @@ _PyCode_Quicken(PyCodeObject *code)
/* We can't do this in the bytecode compiler as
* marshalling can intern strings and make them immortal. */
PyObject *obj = PyTuple_GET_ITEM(code->co_consts, oparg);
PyObject *obj = PyTuple_GET_ITEM(consts, oparg);
if (_Py_IsImmortal(obj)) {
instructions[i].op.code = LOAD_CONST_IMMORTAL;
}
@ -480,7 +508,7 @@ _PyCode_Quicken(PyCodeObject *code)
oparg = 0;
}
}
#endif /* ENABLE_SPECIALIZATION */
#endif /* ENABLE_SPECIALIZATION_FT */
}
#define SIMPLE_FUNCTION 0
@ -2243,9 +2271,10 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
{
PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st);
PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st);
assert(ENABLE_SPECIALIZATION);
assert(ENABLE_SPECIALIZATION_FT);
assert(_PyOpcode_Caches[BINARY_OP] == INLINE_CACHE_ENTRIES_BINARY_OP);
_PyBinaryOpCache *cache = (_PyBinaryOpCache *)(instr + 1);
uint8_t specialized_op;
switch (oparg) {
case NB_ADD:
case NB_INPLACE_ADD:
@ -2256,18 +2285,18 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
_Py_CODEUNIT next = instr[INLINE_CACHE_ENTRIES_BINARY_OP + 1];
bool to_store = (next.op.code == STORE_FAST);
if (to_store && PyStackRef_AsPyObjectBorrow(locals[next.op.arg]) == lhs) {
instr->op.code = BINARY_OP_INPLACE_ADD_UNICODE;
specialized_op = BINARY_OP_INPLACE_ADD_UNICODE;
goto success;
}
instr->op.code = BINARY_OP_ADD_UNICODE;
specialized_op = BINARY_OP_ADD_UNICODE;
goto success;
}
if (PyLong_CheckExact(lhs)) {
instr->op.code = BINARY_OP_ADD_INT;
specialized_op = BINARY_OP_ADD_INT;
goto success;
}
if (PyFloat_CheckExact(lhs)) {
instr->op.code = BINARY_OP_ADD_FLOAT;
specialized_op = BINARY_OP_ADD_FLOAT;
goto success;
}
break;
@ -2277,11 +2306,11 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
break;
}
if (PyLong_CheckExact(lhs)) {
instr->op.code = BINARY_OP_MULTIPLY_INT;
specialized_op = BINARY_OP_MULTIPLY_INT;
goto success;
}
if (PyFloat_CheckExact(lhs)) {
instr->op.code = BINARY_OP_MULTIPLY_FLOAT;
specialized_op = BINARY_OP_MULTIPLY_FLOAT;
goto success;
}
break;
@ -2291,22 +2320,23 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
break;
}
if (PyLong_CheckExact(lhs)) {
instr->op.code = BINARY_OP_SUBTRACT_INT;
specialized_op = BINARY_OP_SUBTRACT_INT;
goto success;
}
if (PyFloat_CheckExact(lhs)) {
instr->op.code = BINARY_OP_SUBTRACT_FLOAT;
specialized_op = BINARY_OP_SUBTRACT_FLOAT;
goto success;
}
break;
}
SPECIALIZATION_FAIL(BINARY_OP, binary_op_fail_kind(oparg, lhs, rhs));
STAT_INC(BINARY_OP, failure);
instr->op.code = BINARY_OP;
SET_OPCODE_OR_RETURN(instr, BINARY_OP);
cache->counter = adaptive_counter_backoff(cache->counter);
return;
success:
STAT_INC(BINARY_OP, success);
SET_OPCODE_OR_RETURN(instr, specialized_op);
cache->counter = adaptive_counter_cooldown();
}


@ -2174,6 +2174,11 @@ sys__clear_internal_caches_impl(PyObject *module)
#ifdef _Py_TIER2
PyInterpreterState *interp = _PyInterpreterState_GET();
_Py_Executors_InvalidateAll(interp, 0);
#endif
#ifdef Py_GIL_DISABLED
if (_Py_ClearUnusedTLBC(_PyInterpreterState_GET()) < 0) {
return NULL;
}
#endif
PyType_ClearCache();
Py_RETURN_NONE;