gh-115999: Implement thread-local bytecode and enable specialization for BINARY_OP (#123926)

In free-threaded builds, each thread specializes a thread-local copy of the bytecode, created on the first RESUME. All copies of the bytecode for a code object are stored in the co_tlbc array on the code object. At thread creation, each thread reserves a globally unique index that identifies its copy of the bytecode in every co_tlbc array, and it releases the index at thread destruction. The first entry in every co_tlbc array always points to the "main" copy of the bytecode, which is stored at the end of the code object. This ensures that no bytecode is copied for programs that do not use threads.
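A rough sketch of the per-thread lookup, using simplified stand-ins for CPython's internal structures (the real definitions live in the internal headers; the names here are illustrative only):

    #include <stddef.h>

    /* Illustrative only: simplified stand-ins, not CPython's real types. */
    typedef struct {
        size_t size;           /* number of slots in entries */
        char *entries[1];      /* entries[0] is the "main" bytecode copy */
    } TLBCArray;

    /* A thread resolves its bytecode through the globally unique index it
     * reserved at creation time. Entry 0 is the main copy stored on the
     * code object, so single-threaded programs never pay for a copy. */
    static char *
    lookup_tlbc(TLBCArray *co_tlbc, size_t tlbc_index)
    {
        if (tlbc_index < co_tlbc->size && co_tlbc->entries[tlbc_index] != NULL) {
            return co_tlbc->entries[tlbc_index];
        }
        return NULL;  /* no copy for this thread yet; created on first RESUME */
    }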

Thread-local bytecode can be disabled at runtime by passing -X tlbc=0 on the command line or by setting PYTHON_TLBC=0 in the environment. Disabling thread-local bytecode also disables specialization.
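For example, either invocation below runs with thread-local bytecode (and therefore specialization) turned off:

    python -X tlbc=0 script.py
    PYTHON_TLBC=0 python script.py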

Concurrent modifications to the bytecode by the specializing interpreter and by instrumentation are made with atomics; specialization takes care not to overwrite an instruction that was instrumented in the meantime.
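A minimal standalone sketch of that pattern, using C11 atomics in place of CPython's _Py_atomic wrappers (the threshold constant is a placeholder for MIN_INSTRUMENTED_OPCODE):

    #include <stdatomic.h>
    #include <stdint.h>

    #define MIN_INSTRUMENTED 237  /* placeholder threshold, illustrative */

    /* Returns 1 if the new opcode was published, 0 if instrumentation got
     * there first and the specialization must be abandoned. */
    static int
    try_set_opcode(_Atomic uint8_t *code, uint8_t opcode)
    {
        uint8_t old = atomic_load_explicit(code, memory_order_relaxed);
        if (old >= MIN_INSTRUMENTED) {
            return 0;  /* instruction already instrumented */
        }
        /* The compare-exchange fails if another thread rewrote the opcode
         * between the load above and this store. */
        return atomic_compare_exchange_strong(code, &old, opcode);
    }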
mpage 2024-11-04 11:13:32 -08:00 committed by GitHub
parent e5a4b402ae
commit 2e95c5ba3b
44 changed files with 1510 additions and 255 deletions


@@ -24,6 +24,25 @@ extern const char *_PyUOpName(int index);
  * ./adaptive.md
  */
 
+#ifdef Py_GIL_DISABLED
+#define SET_OPCODE_OR_RETURN(instr, opcode)                                \
+    do {                                                                   \
+        uint8_t old_op = _Py_atomic_load_uint8_relaxed(&(instr)->op.code); \
+        if (old_op >= MIN_INSTRUMENTED_OPCODE) {                           \
+            /* Lost race with instrumentation */                           \
+            return;                                                        \
+        }                                                                  \
+        if (!_Py_atomic_compare_exchange_uint8(&(instr)->op.code, &old_op, \
+                                               (opcode))) {                \
+            /* Lost race with instrumentation */                           \
+            assert(old_op >= MIN_INSTRUMENTED_OPCODE);                     \
+            return;                                                        \
+        }                                                                  \
+    } while (0)
+#else
+#define SET_OPCODE_OR_RETURN(instr, opcode) (instr)->op.code = (opcode)
+#endif
+
 #ifdef Py_STATS
 GCStats _py_gc_stats[NUM_GENERATIONS] = { 0 };
 static PyStats _Py_stats_struct = { .gc_stats = _py_gc_stats };
@@ -436,16 +455,25 @@ do { \
 # define SPECIALIZATION_FAIL(opcode, kind) ((void)0)
 #endif
 
-// Initialize warmup counters and insert superinstructions. This cannot fail.
+// Initialize warmup counters and optimize instructions. This cannot fail.
 void
-_PyCode_Quicken(PyCodeObject *code)
+_PyCode_Quicken(_Py_CODEUNIT *instructions, Py_ssize_t size, PyObject *consts,
+                int enable_counters)
 {
-    #if ENABLE_SPECIALIZATION
+    #if ENABLE_SPECIALIZATION_FT
+    _Py_BackoffCounter jump_counter, adaptive_counter;
+    if (enable_counters) {
+        jump_counter = initial_jump_backoff_counter();
+        adaptive_counter = adaptive_counter_warmup();
+    }
+    else {
+        jump_counter = initial_unreachable_backoff_counter();
+        adaptive_counter = initial_unreachable_backoff_counter();
+    }
     int opcode = 0;
     int oparg = 0;
-    _Py_CODEUNIT *instructions = _PyCode_CODE(code);
     /* The last code unit cannot have a cache, so we don't need to check it */
-    for (int i = 0; i < Py_SIZE(code)-1; i++) {
+    for (Py_ssize_t i = 0; i < size-1; i++) {
         opcode = instructions[i].op.code;
         int caches = _PyOpcode_Caches[opcode];
         oparg = (oparg << 8) | instructions[i].op.arg;
@@ -453,7 +481,7 @@ _PyCode_Quicken(PyCodeObject *code)
             // The initial value depends on the opcode
             switch (opcode) {
                 case JUMP_BACKWARD:
-                    instructions[i + 1].counter = initial_jump_backoff_counter();
+                    instructions[i + 1].counter = jump_counter;
                     break;
                 case POP_JUMP_IF_FALSE:
                 case POP_JUMP_IF_TRUE:
@@ -462,7 +490,7 @@ _PyCode_Quicken(PyCodeObject *code)
                     instructions[i + 1].cache = 0x5555;  // Alternating 0, 1 bits
                     break;
                 default:
-                    instructions[i + 1].counter = adaptive_counter_warmup();
+                    instructions[i + 1].counter = adaptive_counter;
                     break;
             }
             i += caches;
@@ -471,7 +499,7 @@ _PyCode_Quicken(PyCodeObject *code)
             /* We can't do this in the bytecode compiler as
              * marshalling can intern strings and make them immortal. */
-            PyObject *obj = PyTuple_GET_ITEM(code->co_consts, oparg);
+            PyObject *obj = PyTuple_GET_ITEM(consts, oparg);
             if (_Py_IsImmortal(obj)) {
                 instructions[i].op.code = LOAD_CONST_IMMORTAL;
             }
@@ -480,7 +508,7 @@ _PyCode_Quicken(PyCodeObject *code)
             oparg = 0;
         }
     }
-    #endif /* ENABLE_SPECIALIZATION */
+    #endif /* ENABLE_SPECIALIZATION_FT */
 }
 
 #define SIMPLE_FUNCTION 0
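The enable_counters split above lets callers hand out counters that never fire, e.g. for bytecode that must not specialize when thread-local bytecode is disabled. A simplified illustration of the idea (CPython's real _Py_BackoffCounter uses an exponential-backoff encoding, not a plain integer):

    #include <stdint.h>

    typedef struct { uint16_t value; } Counter;  /* illustrative only */

    #define UNREACHABLE UINT16_MAX

    /* The interpreter attempts specialization only when a counter hits
     * zero; an "unreachable" counter keeps the adaptive path disabled. */
    static int
    counter_triggers(Counter *c)
    {
        if (c->value == UNREACHABLE) {
            return 0;             /* never warms up */
        }
        if (c->value == 0) {
            return 1;             /* warmed up: try to specialize */
        }
        c->value--;
        return 0;
    }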
@@ -2243,9 +2271,10 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
 {
     PyObject *lhs = PyStackRef_AsPyObjectBorrow(lhs_st);
     PyObject *rhs = PyStackRef_AsPyObjectBorrow(rhs_st);
-    assert(ENABLE_SPECIALIZATION);
+    assert(ENABLE_SPECIALIZATION_FT);
     assert(_PyOpcode_Caches[BINARY_OP] == INLINE_CACHE_ENTRIES_BINARY_OP);
     _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(instr + 1);
+    uint8_t specialized_op;
     switch (oparg) {
         case NB_ADD:
         case NB_INPLACE_ADD:
@@ -2256,18 +2285,18 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
                 _Py_CODEUNIT next = instr[INLINE_CACHE_ENTRIES_BINARY_OP + 1];
                 bool to_store = (next.op.code == STORE_FAST);
                 if (to_store && PyStackRef_AsPyObjectBorrow(locals[next.op.arg]) == lhs) {
-                    instr->op.code = BINARY_OP_INPLACE_ADD_UNICODE;
+                    specialized_op = BINARY_OP_INPLACE_ADD_UNICODE;
                     goto success;
                 }
-                instr->op.code = BINARY_OP_ADD_UNICODE;
+                specialized_op = BINARY_OP_ADD_UNICODE;
                 goto success;
             }
             if (PyLong_CheckExact(lhs)) {
-                instr->op.code = BINARY_OP_ADD_INT;
+                specialized_op = BINARY_OP_ADD_INT;
                 goto success;
             }
             if (PyFloat_CheckExact(lhs)) {
-                instr->op.code = BINARY_OP_ADD_FLOAT;
+                specialized_op = BINARY_OP_ADD_FLOAT;
                 goto success;
             }
             break;
@@ -2277,11 +2306,11 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
                 break;
             }
             if (PyLong_CheckExact(lhs)) {
-                instr->op.code = BINARY_OP_MULTIPLY_INT;
+                specialized_op = BINARY_OP_MULTIPLY_INT;
                 goto success;
             }
             if (PyFloat_CheckExact(lhs)) {
-                instr->op.code = BINARY_OP_MULTIPLY_FLOAT;
+                specialized_op = BINARY_OP_MULTIPLY_FLOAT;
                 goto success;
             }
             break;
@@ -2291,22 +2320,23 @@ _Py_Specialize_BinaryOp(_PyStackRef lhs_st, _PyStackRef rhs_st, _Py_CODEUNIT *in
                 break;
             }
             if (PyLong_CheckExact(lhs)) {
-                instr->op.code = BINARY_OP_SUBTRACT_INT;
+                specialized_op = BINARY_OP_SUBTRACT_INT;
                 goto success;
             }
             if (PyFloat_CheckExact(lhs)) {
-                instr->op.code = BINARY_OP_SUBTRACT_FLOAT;
+                specialized_op = BINARY_OP_SUBTRACT_FLOAT;
                 goto success;
             }
             break;
     }
     SPECIALIZATION_FAIL(BINARY_OP, binary_op_fail_kind(oparg, lhs, rhs));
     STAT_INC(BINARY_OP, failure);
-    instr->op.code = BINARY_OP;
+    SET_OPCODE_OR_RETURN(instr, BINARY_OP);
     cache->counter = adaptive_counter_backoff(cache->counter);
     return;
 success:
     STAT_INC(BINARY_OP, success);
+    SET_OPCODE_OR_RETURN(instr, specialized_op);
     cache->counter = adaptive_counter_cooldown();
 }