gh-115999: Enable specialization of CALL instructions in free-threaded builds (#127123)

The CALL family of instructions was already mostly thread-safe and required only a small number of changes, which are documented below.

A few changes were needed to make CALL_ALLOC_AND_ENTER_INIT thread-safe:

- Added _PyType_LookupRefAndVersion, which returns the type version that corresponds to the returned reference.

- Added _PyType_CacheInitForSpecialization, which takes an __init__ method and the corresponding type version and populates the specialization cache only if the type's current version still matches the supplied one. This prevents caching a stale value in free-threaded builds if we race with an update to __init__ (see the sketch after this list).

- Only cache __init__ functions that use deferred reference counting in free-threaded builds. This ensures that the reference to __init__ stored in the specialization cache is still valid when the type version guard in _CHECK_AND_ALLOCATE_OBJECT passes.

- Fixed a bug in _CREATE_INIT_FRAME where the frame was pushed to the stack on failure.
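
A minimal sketch of how the two helpers above might fit together when specializing a class call into CALL_ALLOC_AND_ENTER_INIT; the helper signatures are assumed from the descriptions above, and the wrapper cache_init_for_class_call is hypothetical rather than the actual specialize.c code:

    /* cache_init_for_class_call() is a hypothetical wrapper illustrating the
     * intended protocol; it is not the code in specialize.c. */
    static int
    cache_init_for_class_call(PyTypeObject *tp)
    {
        unsigned int tp_version = 0;
        /* Assumed behavior: returns a new reference to tp.__init__ (or NULL)
         * and writes the type version that was current during the lookup. */
        PyObject *init = _PyType_LookupRefAndVersion(tp, &_Py_ID(__init__),
                                                     &tp_version);
        if (init == NULL) {
            return -1;
        }
        /* Assumed behavior: populates the per-type specialization cache only
         * if tp's version still equals tp_version (and, in free-threaded
         * builds, only if init uses deferred reference counting), so a racing
         * update to __init__ cannot leave a stale pointer in the cache. */
        int cached = _PyType_CacheInitForSpecialization(
            (PyHeapTypeObject *)tp, init, tp_version);
        Py_DECREF(init);  /* release the reference returned by the lookup */
        return cached ? 0 : -1;
    }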

A few miscellaneous changes were also needed:

- Use {LOCK,UNLOCK}_OBJECT in LIST_APPEND. This ensures that the list's per-object lock is held while we are appending to it.

- Add the missing co_tlbc (thread-local bytecode) for _Py_InitCleanup.

- Stop/start the world around setting the eval frame hook (a sketch follows this list). This allows us to read interp->eval_frame non-atomically and preserves the documented behavior of _CHECK_PEP_523.
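
Below is a minimal sketch of the stop-the-world change around the eval frame hook, assuming the internal _PyEval_StopTheWorld/_PyEval_StartTheWorld helpers; the body of _PyInterpreterState_SetEvalFrameFunc shown here is illustrative and may differ from the actual implementation.

    void
    _PyInterpreterState_SetEvalFrameFunc(PyInterpreterState *interp,
                                         _PyFrameEvalFunction eval_frame)
    {
        if (eval_frame == interp->eval_frame) {
            return;
        }
    #ifdef Py_GIL_DISABLED
        /* Pause all other threads so none of them can be reading
         * interp->eval_frame while it changes; this is what allows
         * _CHECK_PEP_523 to keep reading the field non-atomically. */
        _PyEval_StopTheWorld(interp);
    #endif
        interp->eval_frame = eval_frame;
    #ifdef Py_GIL_DISABLED
        _PyEval_StartTheWorld(interp);
    #endif
    }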

Authored by mpage on 2024-12-03 11:20:20 -08:00, committed by GitHub
commit dabcecfd6d (parent fc5a0dc224)
11 changed files with 220 additions and 92 deletions

@@ -4500,13 +4500,13 @@
                 JUMP_TO_JUMP_TARGET();
             }
             PyTypeObject *tp = (PyTypeObject *)callable_o;
-            if (tp->tp_version_tag != type_version) {
+            if (FT_ATOMIC_LOAD_UINT32_RELAXED(tp->tp_version_tag) != type_version) {
                 UOP_STAT_INC(uopcode, miss);
                 JUMP_TO_JUMP_TARGET();
             }
             assert(tp->tp_flags & Py_TPFLAGS_INLINE_VALUES);
             PyHeapTypeObject *cls = (PyHeapTypeObject *)callable_o;
-            PyFunctionObject *init_func = (PyFunctionObject *)cls->_spec_cache.init;
+            PyFunctionObject *init_func = (PyFunctionObject *)FT_ATOMIC_LOAD_PTR_ACQUIRE(cls->_spec_cache.init);
             PyCodeObject *code = (PyCodeObject *)init_func->func_code;
             if (!_PyThreadState_HasStackSpace(tstate, code->co_framesize + _Py_InitCleanup.co_framesize)) {
                 UOP_STAT_INC(uopcode, miss);
@@ -4537,25 +4537,29 @@
             _PyInterpreterFrame *shim = _PyFrame_PushTrampolineUnchecked(
                 tstate, (PyCodeObject *)&_Py_InitCleanup, 1, frame);
             assert(_PyFrame_GetBytecode(shim)[0].op.code == EXIT_INIT_CHECK);
             assert(_PyFrame_GetBytecode(shim)[1].op.code == RETURN_VALUE);
             stack_pointer = _PyFrame_GetStackPointer(frame);
             /* Push self onto stack of shim */
             shim->localsplus[0] = PyStackRef_DUP(self[0]);
             _PyFrame_SetStackPointer(frame, stack_pointer);
-            init_frame = _PyEvalFramePushAndInit(
+            _PyInterpreterFrame *temp = _PyEvalFramePushAndInit(
                 tstate, init[0], NULL, args-1, oparg+1, NULL, shim);
             stack_pointer = _PyFrame_GetStackPointer(frame);
-            stack_pointer[-2 - oparg].bits = (uintptr_t)init_frame;
-            stack_pointer += -1 - oparg;
+            stack_pointer += -2 - oparg;
             assert(WITHIN_STACK_BOUNDS());
-            if (init_frame == NULL) {
+            if (temp == NULL) {
                 _PyEval_FrameClearAndPop(tstate, shim);
                 JUMP_TO_ERROR();
             }
+            init_frame = temp;
             frame->return_offset = 1 + INLINE_CACHE_ENTRIES_CALL;
             /* Account for pushing the extra frame.
              * We don't check recursion depth here,
              * as it will be checked after start_frame */
             tstate->py_recursion_remaining--;
+            stack_pointer[0].bits = (uintptr_t)init_frame;
+            stack_pointer += 1;
+            assert(WITHIN_STACK_BOUNDS());
             break;
         }
@@ -4908,8 +4912,13 @@
                 UOP_STAT_INC(uopcode, miss);
                 JUMP_TO_JUMP_TARGET();
             }
+            if (!LOCK_OBJECT(self_o)) {
+                UOP_STAT_INC(uopcode, miss);
+                JUMP_TO_JUMP_TARGET();
+            }
             STAT_INC(CALL, hit);
             int err = _PyList_AppendTakeRef((PyListObject *)self_o, PyStackRef_AsPyObjectSteal(arg));
+            UNLOCK_OBJECT(self_o);
             PyStackRef_CLOSE(self);
             PyStackRef_CLOSE(callable);
             if (err) JUMP_TO_ERROR();