gh-129201: Use prefetch in GC mark alive phase. (gh-129203)

For the free-threaded version of the cyclic GC, restructure the "mark alive" phase to use software prefetch instructions.  This gives a speedup in most cases when the number of objects is large enough.  The prefetching is enabled conditionally based on the number of long-lived objects the GC finds.
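The core of the change is a software pipeline: a pointer has a prefetch issued for it when it enters a small FIFO buffer and is only dereferenced when it later leaves, giving the cache line time to arrive. Below is a minimal sketch of that pattern, using hypothetical names (prefetch_queue_t, pq_enqueue, pq_dequeue) rather than the names in the diff that follows:

/* Sketch only: a power-of-two ring buffer where enqueue prefetches and
 * dequeue consumes. Names here are illustrative, not from the patch. */
#define QUEUE_SIZE 256               /* must be a power of two */
#define QUEUE_MASK (QUEUE_SIZE - 1)

typedef struct {
    unsigned int in;                 /* free-running enqueue counter */
    unsigned int out;                /* free-running dequeue counter */
    void *items[QUEUE_SIZE];
} prefetch_queue_t;

static inline void
pq_enqueue(prefetch_queue_t *q, void *ptr)
{
#if defined(__GNUC__) || defined(__clang__)
    __builtin_prefetch(ptr, 0, 2);   /* start fetching the cache line now */
#endif
    q->items[q->in++ & QUEUE_MASK] = ptr;
}

static inline void *
pq_dequeue(prefetch_queue_t *q)
{
    /* the line is (hopefully) already cached by the time we get here */
    return q->items[q->out++ & QUEUE_MASK];
}

The patch applies this pattern to the GC's tp_traverse walk, falling back to a plain object stack when the buffer is full or when prefetching is judged not worthwhile.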
Neil Schemenauer 2025-02-05 11:38:30 -08:00 committed by GitHub
parent 5fb019fc29
commit cdcacec79f
2 changed files with 434 additions and 41 deletions

Python/gc_free_threading.c

@@ -21,6 +21,9 @@
// enable the "mark alive" pass of GC
#define GC_ENABLE_MARK_ALIVE 1
// if true, enable the use of "prefetch" CPU instructions
#define GC_ENABLE_PREFETCH_INSTRUCTIONS 1
// include additional roots in "mark alive" pass
#define GC_MARK_ALIVE_EXTRA_ROOTS 1
@@ -472,13 +475,193 @@ gc_maybe_untrack(PyObject *op)
}
#ifdef GC_ENABLE_MARK_ALIVE
// prefetch buffer and stack //////////////////////////////////
// The buffer is a circular FIFO queue of PyObject pointers. We take
// care to not dereference these pointers until they are taken out of
// the buffer. A prefetch CPU instruction is issued when a pointer is
// put into the buffer. If all is working as expected, there will be
// enough time between the enqueue and dequeue so that the needed memory
// for the object, most importantly ob_gc_bits and ob_type words, will
// already be in the CPU cache.
#define BUFFER_SIZE 256
#define BUFFER_HI 16
#define BUFFER_LO 8
#define BUFFER_MASK (BUFFER_SIZE - 1)
// the buffer size must be an exact power of two
static_assert(BUFFER_SIZE > 0 && !(BUFFER_SIZE & BUFFER_MASK),
"Invalid BUFFER_SIZE, must be power of 2");
// the code below assumes these relationships are true
static_assert(BUFFER_HI < BUFFER_SIZE &&
BUFFER_LO < BUFFER_HI &&
BUFFER_LO > 0,
"Invalid prefetch buffer level settings.");
// Prefetch instructions will fetch the line of data from memory that
// contains the byte specified with the source operand to a location in
// the cache hierarchy specified by a locality hint. The instruction
// is only a hint and the CPU is free to ignore it. Instructions and
// behaviour are CPU specific but the definitions of locality hints
// below are mostly consistent.
//
// * T0 (temporal data) prefetch data into all levels of the cache hierarchy.
//
// * T1 (temporal data with respect to first level cache) prefetch data into
// level 2 cache and higher.
//
// * T2 (temporal data with respect to second level cache) prefetch data into
// level 3 cache and higher, or an implementation-specific choice.
//
// * NTA (non-temporal data with respect to all cache levels) prefetch data into
// non-temporal cache structure and into a location close to the processor,
// minimizing cache pollution.
#if defined(__GNUC__) || defined(__clang__)
#define PREFETCH_T0(ptr) __builtin_prefetch(ptr, 0, 3)
#define PREFETCH_T1(ptr) __builtin_prefetch(ptr, 0, 2)
#define PREFETCH_T2(ptr) __builtin_prefetch(ptr, 0, 1)
#define PREFETCH_NTA(ptr) __builtin_prefetch(ptr, 0, 0)
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) && !defined(_M_ARM64EC)
#include <xmmintrin.h>
#define PREFETCH_T0(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#define PREFETCH_T1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
#define PREFETCH_T2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T2)
#define PREFETCH_NTA(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_NTA)
#elif defined (__aarch64__)
#define PREFETCH_T0(ptr) \
do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
#define PREFETCH_T1(ptr) \
do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
#define PREFETCH_T2(ptr) \
do { __asm__ __volatile__("prfm pldl3keep, %0" ::"Q"(*(ptr))); } while (0)
#define PREFETCH_NTA(ptr) \
do { __asm__ __volatile__("prfm pldl1strm, %0" ::"Q"(*(ptr))); } while (0)
#else
#define PREFETCH_T0(ptr) do { (void)(ptr); } while (0) /* disabled */
#define PREFETCH_T1(ptr) do { (void)(ptr); } while (0) /* disabled */
#define PREFETCH_T2(ptr) do { (void)(ptr); } while (0) /* disabled */
#define PREFETCH_NTA(ptr) do { (void)(ptr); } while (0) /* disabled */
#endif
#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
#define prefetch(ptr) PREFETCH_T1(ptr)
#else
#define prefetch(ptr)
#endif
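// Note: T1 is presumably preferred here over T0 so that streamed object
// headers land in L2 without displacing the L1 working set, and over NTA
// because ob_gc_bits and ob_type are touched again soon after the fetch;
// since prefetching is purely advisory, any hint would be functionally
// correct.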
// a contiguous sequence of PyObject pointers; may contain NULLs
typedef struct {
PyObject **start;
PyObject **end;
} gc_span_t;
typedef struct {
Py_ssize_t size;
Py_ssize_t capacity;
gc_span_t *stack;
} gc_span_stack_t;
typedef struct {
unsigned int in;
unsigned int out;
_PyObjectStack stack;
gc_span_stack_t spans;
PyObject *buffer[BUFFER_SIZE];
bool use_prefetch;
} gc_mark_args_t;
// Returns number of entries in buffer
static inline unsigned int
gc_mark_buffer_len(gc_mark_args_t *args)
{
return args->in - args->out;
}
// Returns number of free entry slots in buffer
static inline unsigned int
gc_mark_buffer_avail(gc_mark_args_t *args)
{
return BUFFER_SIZE - gc_mark_buffer_len(args);
}
static inline bool
gc_mark_buffer_is_empty(gc_mark_args_t *args)
{
return args->in == args->out;
}
static inline bool
gc_mark_buffer_is_full(gc_mark_args_t *args)
{
return gc_mark_buffer_len(args) == BUFFER_SIZE;
}
static inline PyObject *
gc_mark_buffer_pop(gc_mark_args_t *args)
{
assert(!gc_mark_buffer_is_empty(args));
PyObject *op = args->buffer[args->out & BUFFER_MASK];
args->out++;
return op;
}
// Called when there is space in the buffer for the object. Issue the
// prefetch instruction and add it to the end of the buffer.
static inline void
gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
{
assert(!gc_mark_buffer_is_full(args));
prefetch(op);
args->buffer[args->in & BUFFER_MASK] = op;
args->in++;
}
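// Worked example of the free-running counters (illustrative): with in == 260
// and out == 252, gc_mark_buffer_len() is 260 - 252 == 8 and the next push
// stores to buffer[260 & 255] == buffer[4]. Unsigned wraparound keeps the
// subtraction correct even after the counters overflow, as long as
// BUFFER_SIZE is a power of two and the length never exceeds it.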
// Called when we run out of space in the buffer or if the prefetching
// is disabled. The object will be pushed on the gc_mark_args.stack.
static int
-mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
+gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
{
if (_PyObjectStack_Push(ms, op) < 0) {
return -1;
}
return 0;
}
static int
gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
{
if (start == end) {
return 0;
}
if (ss->size >= ss->capacity) {
if (ss->capacity == 0) {
ss->capacity = 256;
}
else {
ss->capacity *= 2;
}
gc_span_t *new_stack = (gc_span_t *)PyMem_Realloc(ss->stack, ss->capacity * sizeof(gc_span_t));
if (new_stack == NULL) {
// keep the old block so gc_abort_mark_alive() can still free it
return -1;
}
ss->stack = new_stack;
}
assert(end > start);
ss->stack[ss->size].start = start;
ss->stack[ss->size].end = end;
ss->size++;
return 0;
}
static int
gc_mark_enqueue_no_buffer(PyObject *op, gc_mark_args_t *args)
{
if (op == NULL) {
return 0;
}
-if (!_PyObject_GC_IS_TRACKED(op)) {
+if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
return 0;
}
if (gc_is_alive(op)) {
@@ -491,12 +674,68 @@ mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
// Need to call tp_traverse on this object. Add to stack and mark it
// alive so we don't traverse it a second time.
gc_set_alive(op);
-if (_PyObjectStack_Push(stack, op) < 0) {
+if (_PyObjectStack_Push(&args->stack, op) < 0) {
return -1;
}
return 0;
}
static int
gc_mark_enqueue_buffer(PyObject *op, gc_mark_args_t *args)
{
assert(op != NULL);
if (!gc_mark_buffer_is_full(args)) {
gc_mark_buffer_push(op, args);
return 0;
}
else {
return gc_mark_stack_push(&args->stack, op);
}
}
// Called when we find an object that needs to be marked alive (either from a
// root or from calling tp_traverse).
static int
gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
{
if (args->use_prefetch) {
return gc_mark_enqueue_buffer(op, args);
}
else {
return gc_mark_enqueue_no_buffer(op, args);
}
}
// Called when we have a contiguous sequence of PyObject pointers, from
// either a tuple or a list object. This adds the items to the buffer if
// there is space for them all; otherwise, it pushes a new "span" on the
// span stack. Using spans has the advantage of not creating a deep
// _PyObjectStack stack when dealing with long sequences. Those sequences
// will be processed in smaller chunks by the gc_prime_from_spans()
// function.
static int
gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args)
{
Py_ssize_t used = gc_mark_buffer_len(args);
Py_ssize_t free = BUFFER_SIZE - used;
if (free >= size) {
for (Py_ssize_t i = 0; i < size; i++) {
PyObject *op = item[i];
if (op == NULL) {
continue;
}
gc_mark_buffer_push(op, args);
}
}
else {
assert(size > 0);
PyObject **end = &item[size];
if (gc_mark_span_push(&args->spans, item, end) < 0) {
return -1;
}
}
return 0;
}
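// Illustrative example: enqueueing a 10,000-element list does not copy
// 10,000 pointers anywhere, since at most BUFFER_SIZE slots are ever free;
// it records a single {start, end} span. gc_prime_from_spans() later drains
// it at most BUFFER_HI entries at a time, so neither the buffer nor the
// object stack ever holds the whole sequence at once.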
static bool
gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
void *block, size_t block_size, void *args)
@@ -511,25 +750,56 @@ gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
return true;
}
static int
gc_mark_traverse_list(PyObject *self, void *args)
{
PyListObject *list = (PyListObject *)self;
if (list->ob_item == NULL) {
return 0;
}
if (gc_mark_enqueue_span(list->ob_item, PyList_GET_SIZE(list), args) < 0) {
return -1;
}
return 0;
}
static int
gc_mark_traverse_tuple(PyObject *self, void *args)
{
_PyTuple_MaybeUntrack(self);
if (!gc_has_bit(self, _PyGC_BITS_TRACKED)) {
gc_clear_alive(self);
return 0;
}
PyTupleObject *tuple = _PyTuple_CAST(self);
if (gc_mark_enqueue_span(tuple->ob_item, Py_SIZE(tuple), args) < 0) {
return -1;
}
return 0;
}
static void
gc_abort_mark_alive(PyInterpreterState *interp,
struct collection_state *state,
-_PyObjectStack *stack)
+gc_mark_args_t *args)
{
-// We failed to allocate memory for "stack" while doing the "mark
-// alive" phase. In that case, free the object stack and make sure
-// that no objects have the alive bit set.
-_PyObjectStack_Clear(stack);
+// We failed to allocate memory while doing the "mark alive" phase.
+// In that case, free the memory used for marking state and make
+// sure that no objects have the alive bit set.
+_PyObjectStack_Clear(&args->stack);
if (args->spans.stack != NULL) {
PyMem_Free(args->spans.stack);
}
gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
}
#ifdef GC_MARK_ALIVE_STACKS
static int
-gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
+gc_visit_stackref_mark_alive(gc_mark_args_t *args, _PyStackRef stackref)
{
if (!PyStackRef_IsNull(stackref)) {
PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
-if (mark_alive_stack_push(op, stack) < 0) {
+if (gc_mark_enqueue(op, args) < 0) {
return -1;
}
}
@@ -537,7 +807,7 @@ gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
}
static int
-gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack)
+gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, gc_mark_args_t *args)
{
int err = 0;
_Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
@@ -554,13 +824,13 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st
}
_PyStackRef *top = f->stackpointer;
-if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) {
+if (gc_visit_stackref_mark_alive(args, f->f_executable) < 0) {
err = -1;
goto exit;
}
while (top != f->localsplus) {
--top;
-if (gc_visit_stackref_mark_alive(stack, *top) < 0) {
+if (gc_visit_stackref_mark_alive(args, *top) < 0) {
err = -1;
goto exit;
}
@@ -904,22 +1174,124 @@ static int
move_legacy_finalizer_reachable(struct collection_state *state);
#ifdef GC_ENABLE_MARK_ALIVE
static void
gc_prime_from_spans(gc_mark_args_t *args)
{
Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
// there should always be at least this amount of space
assert(space <= gc_mark_buffer_avail(args));
assert(space > 0);
gc_span_t entry = args->spans.stack[--args->spans.size];
// spans on the stack should always have one or more elements
assert(entry.start < entry.end);
do {
PyObject *op = *entry.start;
entry.start++;
if (op != NULL) {
gc_mark_buffer_push(op, args);
space--;
if (space == 0) {
// buffer is as full as we want and not done with span
gc_mark_span_push(&args->spans, entry.start, entry.end);
return;
}
}
} while (entry.start < entry.end);
}
static void
gc_prime_buffer(gc_mark_args_t *args)
{
if (args->spans.size > 0) {
gc_prime_from_spans(args);
}
else {
// When priming, don't fill the buffer completely, since that would
// likely cause the stack to be used shortly afterwards when it
// fills up. We want to use the buffer as much as possible, so we
// only fill to BUFFER_HI, not BUFFER_SIZE.
Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
assert(space > 0);
do {
PyObject *op = _PyObjectStack_Pop(&args->stack);
if (op == NULL) {
return;
}
gc_mark_buffer_push(op, args);
space--;
} while (space > 0);
}
}
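// Illustrative watermark arithmetic: priming fills the buffer up to
// BUFFER_HI (16) entries and a refill is triggered once it drains to
// BUFFER_LO (8), so a prefetched pointer normally has several pops' worth
// of tp_traverse work between its prefetch and its first dereference,
// which is plausibly enough to hide a DRAM miss on typical hardware.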
static int
-propagate_alive_bits(_PyObjectStack *stack)
+gc_propagate_alive_prefetch(gc_mark_args_t *args)
{
for (;;) {
-PyObject *op = _PyObjectStack_Pop(stack);
-if (op == NULL) {
-break;
Py_ssize_t buf_used = gc_mark_buffer_len(args);
if (buf_used <= BUFFER_LO) {
// The mark buffer is getting empty. If it's too empty
// then there will not be enough delay between issuing
// the prefetch and when the object is actually accessed.
// Prime the buffer with object pointers from the stack or
// from the spans, if there are any available.
gc_prime_buffer(args);
if (gc_mark_buffer_is_empty(args)) {
return 0;
}
}
-assert(_PyObject_GC_IS_TRACKED(op));
-assert(gc_is_alive(op));
PyObject *op = gc_mark_buffer_pop(args);
if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
continue;
}
if (gc_is_alive(op)) {
continue; // already visited this object
}
// Need to call tp_traverse on this object. Mark it alive so we
// don't traverse it a second time.
gc_set_alive(op);
traverseproc traverse = Py_TYPE(op)->tp_traverse;
-if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) {
+if (traverse == PyList_Type.tp_traverse) {
if (gc_mark_traverse_list(op, args) < 0) {
return -1;
}
}
else if (traverse == PyTuple_Type.tp_traverse) {
if (gc_mark_traverse_tuple(op, args) < 0) {
return -1;
}
}
else if (traverse(op, (visitproc)&gc_mark_enqueue_buffer, args) < 0) {
return -1;
}
}
return 0;
}
static int
gc_propagate_alive(gc_mark_args_t *args)
{
if (args->use_prefetch) {
return gc_propagate_alive_prefetch(args);
}
else {
for (;;) {
PyObject *op = _PyObjectStack_Pop(&args->stack);
if (op == NULL) {
break;
}
assert(_PyObject_GC_IS_TRACKED(op));
assert(gc_is_alive(op));
traverseproc traverse = Py_TYPE(op)->tp_traverse;
if (traverse(op, (visitproc)&gc_mark_enqueue_no_buffer, args) < 0) {
return -1;
}
}
return 0;
}
}
// Using tp_traverse, mark everything reachable from known root objects
@@ -939,48 +1311,64 @@ propagate_alive_bits(_PyObjectStack *stack)
//
// Returns -1 on failure (out of memory).
static int
-mark_alive_from_roots(PyInterpreterState *interp,
-struct collection_state *state)
+gc_mark_alive_from_roots(PyInterpreterState *interp,
+struct collection_state *state)
{
#ifdef GC_DEBUG
// Check that all objects don't have alive bit set
gc_visit_heaps(interp, &validate_alive_bits, &state->base);
#endif
-_PyObjectStack stack = { NULL };
+gc_mark_args_t mark_args = { 0 };
-#define STACK_PUSH(op) \
-if (mark_alive_stack_push(op, &stack) < 0) { \
-gc_abort_mark_alive(interp, state, &stack); \
-return -1; \
+// Using prefetch instructions is only a win if the set of objects being
+// examined by the GC does not fit into CPU caches. Otherwise, using the
+// buffer and prefetch instructions is just overhead. The long-lived
+// object count seems a good estimate of whether things will fit in the
+// cache. On 64-bit platforms, the minimum object size is 32 bytes, so a
+// 4MB L2 cache would hold about 130k objects.
+mark_args.use_prefetch = interp->gc.long_lived_total > 200000;
+#define MARK_ENQUEUE(op) \
+if (op != NULL) { \
+if (gc_mark_enqueue(op, &mark_args) < 0) { \
+gc_abort_mark_alive(interp, state, &mark_args); \
+return -1; \
+} \
}
-STACK_PUSH(interp->sysdict);
+MARK_ENQUEUE(interp->sysdict);
#ifdef GC_MARK_ALIVE_EXTRA_ROOTS
-STACK_PUSH(interp->builtins);
-STACK_PUSH(interp->dict);
+MARK_ENQUEUE(interp->builtins);
+MARK_ENQUEUE(interp->dict);
struct types_state *types = &interp->types;
for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) {
-STACK_PUSH(types->builtins.initialized[i].tp_dict);
-STACK_PUSH(types->builtins.initialized[i].tp_subclasses);
+MARK_ENQUEUE(types->builtins.initialized[i].tp_dict);
+MARK_ENQUEUE(types->builtins.initialized[i].tp_subclasses);
}
for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) {
-STACK_PUSH(types->for_extensions.initialized[i].tp_dict);
-STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses);
+MARK_ENQUEUE(types->for_extensions.initialized[i].tp_dict);
+MARK_ENQUEUE(types->for_extensions.initialized[i].tp_subclasses);
}
#endif
#ifdef GC_MARK_ALIVE_STACKS
-if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) {
-gc_abort_mark_alive(interp, state, &stack);
+if (gc_visit_thread_stacks_mark_alive(interp, &mark_args) < 0) {
+gc_abort_mark_alive(interp, state, &mark_args);
return -1;
}
#endif
-#undef STACK_PUSH
+#undef MARK_ENQUEUE
// Use tp_traverse to find everything reachable from roots.
-if (propagate_alive_bits(&stack) < 0) {
-gc_abort_mark_alive(interp, state, &stack);
+if (gc_propagate_alive(&mark_args) < 0) {
+gc_abort_mark_alive(interp, state, &mark_args);
return -1;
}
assert(mark_args.spans.size == 0);
if (mark_args.spans.stack != NULL) {
PyMem_Free(mark_args.spans.stack);
}
assert(mark_args.stack.head == NULL);
return 0;
}
#endif // GC_ENABLE_MARK_ALIVE
@@ -1559,7 +1947,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
if (!state->gcstate->freeze_active) {
// Mark objects reachable from known roots as "alive". These will
// be ignored for rest of the GC pass.
-int err = mark_alive_from_roots(interp, state);
+int err = gc_mark_alive_from_roots(interp, state);
if (err < 0) {
_PyEval_StartTheWorld(interp);
PyErr_NoMemory();