gh-115999: Enable specialization of CALL instructions in free-threaded builds (#127123)

The CALL family of instructions were mostly thread-safe already and only required a small number of changes, which are documented below.

A few changes were needed to make CALL_ALLOC_AND_ENTER_INIT thread-safe:

Added _PyType_LookupRefAndVersion, which returns the type version corresponding to the returned ref.

Added _PyType_CacheInitForSpecialization, which takes an init method and the corresponding type version and only populates the specialization cache if the current type version matches the supplied version. This prevents potentially caching a stale value in free-threaded builds if we race with an update to __init__.

Only cache __init__ functions that are deferred in free-threaded builds. This ensures that the reference to __init__ that is stored in the specialization cache is valid if the type version guard in _CHECK_AND_ALLOCATE_OBJECT passes.
Fix a bug in _CREATE_INIT_FRAME where the frame is pushed to the stack on failure.

A few other miscellaneous changes were also needed:

Use {LOCK,UNLOCK}_OBJECT in LIST_APPEND. This ensures that the list's per-object lock is held while we are appending to it.

Add missing co_tlbc for _Py_InitCleanup.

Stop/start the world around setting the eval frame hook. This allows us to read interp->eval_frame non-atomically and preserves the behavior of _CHECK_PEP_523 documented below.
This commit is contained in:
mpage 2024-12-03 11:20:20 -08:00 committed by GitHub
parent fc5a0dc224
commit dabcecfd6d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 220 additions and 92 deletions

View file

@ -880,7 +880,7 @@
callable = &stack_pointer[-2 - oparg];
uint16_t counter = read_u16(&this_instr[1].cache);
(void)counter;
#if ENABLE_SPECIALIZATION
#if ENABLE_SPECIALIZATION_FT
if (ADAPTIVE_COUNTER_TRIGGERS(counter)) {
next_instr = this_instr;
_PyFrame_SetStackPointer(frame, stack_pointer);
@ -890,7 +890,7 @@
}
OPCODE_DEFERRED_INC(CALL);
ADVANCE_ADAPTIVE_COUNTER(this_instr[1].counter);
#endif /* ENABLE_SPECIALIZATION */
#endif /* ENABLE_SPECIALIZATION_FT */
}
/* Skip 2 cache entries */
// _MAYBE_EXPAND_METHOD
@ -1048,10 +1048,10 @@
DEOPT_IF(!PyStackRef_IsNull(null[0]), CALL);
DEOPT_IF(!PyType_Check(callable_o), CALL);
PyTypeObject *tp = (PyTypeObject *)callable_o;
DEOPT_IF(tp->tp_version_tag != type_version, CALL);
DEOPT_IF(FT_ATOMIC_LOAD_UINT32_RELAXED(tp->tp_version_tag) != type_version, CALL);
assert(tp->tp_flags & Py_TPFLAGS_INLINE_VALUES);
PyHeapTypeObject *cls = (PyHeapTypeObject *)callable_o;
PyFunctionObject *init_func = (PyFunctionObject *)cls->_spec_cache.init;
PyFunctionObject *init_func = (PyFunctionObject *)FT_ATOMIC_LOAD_PTR_ACQUIRE(cls->_spec_cache.init);
PyCodeObject *code = (PyCodeObject *)init_func->func_code;
DEOPT_IF(!_PyThreadState_HasStackSpace(tstate, code->co_framesize + _Py_InitCleanup.co_framesize), CALL);
STAT_INC(CALL, hit);
@ -1073,20 +1073,21 @@
_PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked(
tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame);
assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK);
assert(_PyFrame_GetBytecode(shim)[1].op.code == RETURN_VALUE);
stack_pointer = _PyFrame_GetStackPointer(frame);
/* Push self onto stack of shim */
shim->localsplus[0] = PyStackRef_DUP(self[0]);
_PyFrame_SetStackPointer(frame, stack_pointer);
init_frame = _PyEvalFramePushAndInit(
_PyInterpreterFrame *temp = _PyEvalFramePushAndInit(
tstate, init[0], NULL, args-1, oparg+1, NULL, shim);
stack_pointer = _PyFrame_GetStackPointer(frame);
stack_pointer[-2 - oparg].bits = (uintptr_t)init_frame;
stack_pointer += -1 - oparg;
stack_pointer += -2 - oparg;
assert(WITHIN_STACK_BOUNDS());
if (init_frame == NULL) {
if (temp == NULL) {
_PyEval_FrameClearAndPop(tstate, shim);
goto error;
}
init_frame = temp;
frame->return_offset = 1 + INLINE_CACHE_ENTRIES_CALL;
/* Account for pushing the extra frame.
* We don't check recursion depth here,
@ -1100,8 +1101,6 @@
// Eventually this should be the only occurrence of this code.
assert(tstate->interp->eval_frame == NULL);
_PyInterpreterFrame *temp = new_frame;
stack_pointer += -1;
assert(WITHIN_STACK_BOUNDS());
_PyFrame_SetStackPointer(frame, stack_pointer);
assert(new_frame->previous == frame || new_frame->previous->previous == frame);
CALL_STAT_INC(inlined_py_calls);
@ -2383,8 +2382,10 @@
DEOPT_IF(callable_o != interp->callable_cache.list_append, CALL);
assert(self_o != NULL);
DEOPT_IF(!PyList_Check(self_o), CALL);
DEOPT_IF(!LOCK_OBJECT(self_o), CALL);
STAT_INC(CALL, hit);
int err = _PyList_AppendTakeRef((PyListObject *)self_o, PyStackRef_AsPyObjectSteal(arg));
UNLOCK_OBJECT(self_o);
PyStackRef_CLOSE(self);
PyStackRef_CLOSE(callable);
if (err) goto pop_3_error;