gh-129201: Use prefetch in GC mark alive phase. (gh-129203)

For the free-threaded version of the cyclic GC, restructure the "mark alive" phase to use software prefetch instructions.  This gives a speedup in most cases when the number of objects is large enough.  The prefetching is enabled conditionally based on the number of long-lived objects the GC finds.
Neil Schemenauer 2025-02-05 11:38:30 -08:00 committed by GitHub
parent 5fb019fc29
commit cdcacec79f
2 changed files with 434 additions and 41 deletions


@@ -0,0 +1,5 @@
The free-threaded version of the cyclic garbage collector has been optimized to
conditionally use CPU prefetch instructions during the collection. This can
reduce collection times by making it more likely that data is in the CPU cache
when it is needed. The prefetch instructions are enabled if the number of
long-lived objects (objects surviving a full collection) exceeds a threshold.
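
To make the mechanism concrete, here is a minimal, self-contained sketch of the same idea: a circular prefetch FIFO backed by a spill stack, written against a toy node type. The names (workqueue_t, enqueue, mark_all), the tree-shaped object graph, and the use of GCC/Clang's __builtin_prefetch are illustrative assumptions, and error handling is omitted; this is a sketch of the technique, not the patch itself, which operates on PyObject graphs via tp_traverse.

    /* Illustrative sketch only -- not part of the patch. */
    #include <stdlib.h>

    #define RING_SIZE 256                  /* power of two: wrap-around is a cheap mask */
    #define RING_MASK (RING_SIZE - 1)
    #define RING_LO   8                    /* refill the ring when it gets this empty   */

    typedef struct node { struct node *left, *right; int marked; } node_t;

    typedef struct {
        node_t *ring[RING_SIZE];           /* circular FIFO of prefetched pointers      */
        unsigned int in, out;              /* unsigned: "in - out" is the fill level    */
        node_t **spill;                    /* overflow stack, grown with realloc()      */
        size_t spill_len, spill_cap;
    } workqueue_t;

    static void
    enqueue(workqueue_t *q, node_t *n)
    {
        if (n == NULL) {
            return;
        }
        if (q->in - q->out < RING_SIZE) {  /* room in the ring: prefetch on entry       */
            __builtin_prefetch(n, 0, 2);
            q->ring[q->in++ & RING_MASK] = n;
        }
        else {                             /* ring full: spill without prefetching      */
            if (q->spill_len == q->spill_cap) {
                q->spill_cap = q->spill_cap ? q->spill_cap * 2 : 256;
                q->spill = realloc(q->spill, q->spill_cap * sizeof(node_t *));
            }
            q->spill[q->spill_len++] = n;
        }
    }

    static void
    mark_all(workqueue_t *q, node_t *root)
    {
        enqueue(q, root);
        while (q->in != q->out || q->spill_len > 0) {
            /* When the ring runs low, prime it from the spill stack so the
               prefetch issued in enqueue() has time to finish before the pop. */
            while (q->in - q->out <= RING_LO && q->spill_len > 0) {
                node_t *n = q->spill[--q->spill_len];
                __builtin_prefetch(n, 0, 2);
                q->ring[q->in++ & RING_MASK] = n;
            }
            node_t *n = q->ring[q->out++ & RING_MASK];
            if (n->marked) {
                continue;
            }
            n->marked = 1;                 /* by now the cache line is likely resident  */
            enqueue(q, n->left);
            enqueue(q, n->right);
        }
        free(q->spill);
    }

A caller would zero-initialize the queue (workqueue_t q = {0}; mark_all(&q, root);). The actual change below applies the same shape to PyObject graphs: gc_mark_buffer_push() plays the role of the ring insert, _PyObjectStack the spill stack, and gc_prime_buffer() the refill step.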


@@ -21,6 +21,9 @@
// enable the "mark alive" pass of GC
#define GC_ENABLE_MARK_ALIVE 1
// if true, enable the use of "prefetch" CPU instructions
#define GC_ENABLE_PREFETCH_INSTRUCTIONS 1
// include additional roots in "mark alive" pass
#define GC_MARK_ALIVE_EXTRA_ROOTS 1
@@ -472,13 +475,193 @@ gc_maybe_untrack(PyObject *op)
}
#ifdef GC_ENABLE_MARK_ALIVE
// prefetch buffer and stack //////////////////////////////////
// The buffer is a circular FIFO queue of PyObject pointers. We take
// care to not dereference these pointers until they are taken out of
// the buffer. A prefetch CPU instruction is issued when a pointer is
// put into the buffer. If all is working as expected, there will be
// enough time between the enqueue and dequeue so that the needed memory
// for the object, most importantly ob_gc_bits and ob_type words, will
// already be in the CPU cache.
#define BUFFER_SIZE 256
#define BUFFER_HI 16
#define BUFFER_LO 8
#define BUFFER_MASK (BUFFER_SIZE - 1)
// the buffer size must be an exact power of two
static_assert(BUFFER_SIZE > 0 && !(BUFFER_SIZE & BUFFER_MASK),
"Invalid BUFFER_SIZE, must be power of 2");
// the code below assumes these relationships are true
static_assert(BUFFER_HI < BUFFER_SIZE &&
BUFFER_LO < BUFFER_HI &&
BUFFER_LO > 0,
"Invalid prefetch buffer level settings.");
// Prefetch instructions will fetch the line of data from memory that
// contains the byte specified with the source operand to a location in
// the cache hierarchy specified by a locality hint. The instruction
// is only a hint and the CPU is free to ignore it. Instructions and
// behaviour are CPU specific but the definitions of locality hints
// below are mostly consistent.
//
// * T0 (temporal data) prefetch data into all levels of the cache hierarchy.
//
// * T1 (temporal data with respect to first level cache) prefetch data into
// level 2 cache and higher.
//
// * T2 (temporal data with respect to second level cache) prefetch data into
// level 3 cache and higher, or an implementation-specific choice.
//
// * NTA (non-temporal data with respect to all cache levels) prefetch data into
// non-temporal cache structure and into a location close to the processor,
// minimizing cache pollution.
#if defined(__GNUC__) || defined(__clang__)
#define PREFETCH_T0(ptr) __builtin_prefetch(ptr, 0, 3)
#define PREFETCH_T1(ptr) __builtin_prefetch(ptr, 0, 2)
#define PREFETCH_T2(ptr) __builtin_prefetch(ptr, 0, 1)
#define PREFETCH_NTA(ptr) __builtin_prefetch(ptr, 0, 0)
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) && !defined(_M_ARM64EC)
#include <mmintrin.h>
#define PREFETCH_T0(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
#define PREFETCH_T1(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
#define PREFETCH_T2(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T2)
#define PREFETCH_NTA(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_NTA)
#elif defined (__aarch64__)
#define PREFETCH_T0(ptr) \
do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0)
#define PREFETCH_T1(ptr) \
do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0)
#define PREFETCH_T2(ptr) \
do { __asm__ __volatile__("prfm pldl3keep, %0" ::"Q"(*(ptr))); } while (0)
#define PREFETCH_NTA(ptr) \
do { __asm__ __volatile__("prfm pldl1strm, %0" ::"Q"(*(ptr))); } while (0)
#else
#define PREFETCH_T0(ptr) do { (void)(ptr); } while (0) /* disabled */
#define PREFETCH_T1(ptr) do { (void)(ptr); } while (0) /* disabled */
#define PREFETCH_T2(ptr) do { (void)(ptr); } while (0) /* disabled */
#define PREFETCH_NTA(ptr) do { (void)(ptr); } while (0) /* disabled */
#endif
#ifdef GC_ENABLE_PREFETCH_INSTRUCTIONS
#define prefetch(ptr) PREFETCH_T1(ptr)
#else
#define prefetch(ptr)
#endif
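// Illustrative usage note (not part of the patch): a prefetch is only a hint
// and never faults, so it can be issued early, even for data that ends up
// not being read.  For example, when walking an array of pointers:
//
//     for (Py_ssize_t i = 0; i < n; i++) {
//         if (i + 16 < n) {
//             prefetch(items[i + 16]);   // warm the line ~16 objects ahead
//         }
//         visit(items[i]);               // "visit" is a hypothetical callback
//     }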
// a contiguous sequence of PyObject pointers, can contain NULLs
typedef struct {
PyObject **start;
PyObject **end;
} gc_span_t;
typedef struct {
Py_ssize_t size;
Py_ssize_t capacity;
gc_span_t *stack;
} gc_span_stack_t;
typedef struct {
unsigned int in;
unsigned int out;
_PyObjectStack stack;
gc_span_stack_t spans;
PyObject *buffer[BUFFER_SIZE];
bool use_prefetch;
} gc_mark_args_t;
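// Descriptive note: "buffer" is the circular prefetch FIFO indexed by
// "in"/"out", "stack" holds objects that do not fit in the buffer (or every
// object when use_prefetch is false), and "spans" records contiguous arrays
// of object pointers (list/tuple items) that are fed into the buffer a chunk
// at a time by gc_prime_from_spans().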
// Returns number of entries in buffer
static inline unsigned int
gc_mark_buffer_len(gc_mark_args_t *args)
{
return args->in - args->out;
}
// Returns number of free entry slots in buffer
static inline unsigned int
gc_mark_buffer_avail(gc_mark_args_t *args)
{
return BUFFER_SIZE - gc_mark_buffer_len(args);
}
static inline bool
gc_mark_buffer_is_empty(gc_mark_args_t *args)
{
return args->in == args->out;
}
static inline bool
gc_mark_buffer_is_full(gc_mark_args_t *args)
{
return gc_mark_buffer_len(args) == BUFFER_SIZE;
}
static inline PyObject *
gc_mark_buffer_pop(gc_mark_args_t *args)
{
assert(!gc_mark_buffer_is_empty(args));
PyObject *op = args->buffer[args->out & BUFFER_MASK];
args->out++;
return op;
}
// Called when there is space in the buffer for the object. Issue the
// prefetch instruction and add it to the end of the buffer.
static inline void
gc_mark_buffer_push(PyObject *op, gc_mark_args_t *args)
{
assert(!gc_mark_buffer_is_full(args));
prefetch(op);
args->buffer[args->in & BUFFER_MASK] = op;
args->in++;
}
// Called when we run out of space in the buffer or if the prefetching
// is disabled. The object will be pushed on the gc_mark_args.stack.
static int
-mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
+gc_mark_stack_push(_PyObjectStack *ms, PyObject *op)
{
if (_PyObjectStack_Push(ms, op) < 0) {
return -1;
}
return 0;
}
static int
gc_mark_span_push(gc_span_stack_t *ss, PyObject **start, PyObject **end)
{
if (start == end) {
return 0;
}
if (ss->size >= ss->capacity) {
if (ss->capacity == 0) {
ss->capacity = 256;
}
else {
ss->capacity *= 2;
}
ss->stack = (gc_span_t *)PyMem_Realloc(ss->stack, ss->capacity * sizeof(gc_span_t));
if (ss->stack == NULL) {
return -1;
}
}
assert(end > start);
ss->stack[ss->size].start = start;
ss->stack[ss->size].end = end;
ss->size++;
return 0;
}
static int
gc_mark_enqueue_no_buffer(PyObject *op, gc_mark_args_t *args)
{
if (op == NULL) {
return 0;
}
-if (!_PyObject_GC_IS_TRACKED(op)) {
+if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
return 0;
}
if (gc_is_alive(op)) {
@@ -491,12 +674,68 @@ mark_alive_stack_push(PyObject *op, _PyObjectStack *stack)
// Need to call tp_traverse on this object. Add to stack and mark it
// alive so we don't traverse it a second time.
gc_set_alive(op);
-if (_PyObjectStack_Push(stack, op) < 0) {
+if (_PyObjectStack_Push(&args->stack, op) < 0) {
return -1;
}
return 0;
}
static int
gc_mark_enqueue_buffer(PyObject *op, gc_mark_args_t *args)
{
assert(op != NULL);
if (!gc_mark_buffer_is_full(args)) {
gc_mark_buffer_push(op, args);
return 0;
}
else {
return gc_mark_stack_push(&args->stack, op);
}
}
// Called when we find an object that needs to be marked alive (either from a
// root or from calling tp_traverse).
static int
gc_mark_enqueue(PyObject *op, gc_mark_args_t *args)
{
if (args->use_prefetch) {
return gc_mark_enqueue_buffer(op, args);
}
else {
return gc_mark_enqueue_no_buffer(op, args);
}
}
// Called when we have a contiguous sequence of PyObject pointers, either
// a tuple or list object. This will add the items to the buffer if there
// is space for them all; otherwise it pushes a new "span" on the span
// stack. Using spans has the advantage of not creating a deep
// _PyObjectStack stack when dealing with long sequences. Those sequences
// will be processed in smaller chunks by the gc_prime_from_spans() function.
static int
gc_mark_enqueue_span(PyObject **item, Py_ssize_t size, gc_mark_args_t *args)
{
Py_ssize_t used = gc_mark_buffer_len(args);
Py_ssize_t free = BUFFER_SIZE - used;
if (free >= size) {
for (Py_ssize_t i = 0; i < size; i++) {
PyObject *op = item[i];
if (op == NULL) {
continue;
}
gc_mark_buffer_push(op, args);
}
}
else {
assert(size > 0);
PyObject **end = &item[size];
if (gc_mark_span_push(&args->spans, item, end) < 0) {
return -1;
}
}
return 0;
}
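// Example (illustrative): with 40 entries already buffered, a 100-element
// tuple still fits (free == 216) and is pushed item by item, while a
// 5000-element list is recorded as a single span and drained incrementally
// by gc_prime_from_spans().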
static bool
gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
void *block, size_t block_size, void *args)
@@ -511,25 +750,56 @@ gc_clear_alive_bits(const mi_heap_t *heap, const mi_heap_area_t *area,
return true;
}
static int
gc_mark_traverse_list(PyObject *self, void *args)
{
PyListObject *list = (PyListObject *)self;
if (list->ob_item == NULL) {
return 0;
}
if (gc_mark_enqueue_span(list->ob_item, PyList_GET_SIZE(list), args) < 0) {
return -1;
}
return 0;
}
static int
gc_mark_traverse_tuple(PyObject *self, void *args)
{
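// _PyTuple_MaybeUntrack() untracks tuples whose items cannot create
// reference cycles.  If the tuple got untracked here, it no longer needs
// the alive bit or a traversal.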
_PyTuple_MaybeUntrack(self);
if (!gc_has_bit(self, _PyGC_BITS_TRACKED)) {
gc_clear_alive(self);
return 0;
}
PyTupleObject *tuple = _PyTuple_CAST(self);
if (gc_mark_enqueue_span(tuple->ob_item, Py_SIZE(tuple), args) < 0) {
return -1;
}
return 0;
}
static void
gc_abort_mark_alive(PyInterpreterState *interp,
struct collection_state *state,
-_PyObjectStack *stack)
+gc_mark_args_t *args)
{
-// We failed to allocate memory for "stack" while doing the "mark
-// alive" phase. In that case, free the object stack and make sure
-// that no objects have the alive bit set.
-_PyObjectStack_Clear(stack);
+// We failed to allocate memory while doing the "mark alive" phase.
+// In that case, free the memory used for marking state and make
+// sure that no objects have the alive bit set.
+_PyObjectStack_Clear(&args->stack);
+if (args->spans.stack != NULL) {
+PyMem_Free(args->spans.stack);
+}
gc_visit_heaps(interp, &gc_clear_alive_bits, &state->base);
}
#ifdef GC_MARK_ALIVE_STACKS
static int
-gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
+gc_visit_stackref_mark_alive(gc_mark_args_t *args, _PyStackRef stackref)
{
if (!PyStackRef_IsNull(stackref)) {
PyObject *op = PyStackRef_AsPyObjectBorrow(stackref);
-if (mark_alive_stack_push(op, stack) < 0) {
+if (gc_mark_enqueue(op, args) < 0) {
return -1;
}
}
@@ -537,7 +807,7 @@ gc_visit_stackref_mark_alive(_PyObjectStack *stack, _PyStackRef stackref)
}
static int
-gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *stack)
+gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, gc_mark_args_t *args)
{
int err = 0;
_Py_FOR_EACH_TSTATE_BEGIN(interp, p) {
@@ -554,13 +824,13 @@ gc_visit_thread_stacks_mark_alive(PyInterpreterState *interp, _PyObjectStack *st
}
_PyStackRef *top = f->stackpointer;
-if (gc_visit_stackref_mark_alive(stack, f->f_executable) < 0) {
+if (gc_visit_stackref_mark_alive(args, f->f_executable) < 0) {
err = -1;
goto exit;
}
while (top != f->localsplus) {
--top;
-if (gc_visit_stackref_mark_alive(stack, *top) < 0) {
+if (gc_visit_stackref_mark_alive(args, *top) < 0) {
err = -1;
goto exit;
}
@@ -904,22 +1174,124 @@ static int
move_legacy_finalizer_reachable(struct collection_state *state);
#ifdef GC_ENABLE_MARK_ALIVE
static void
gc_prime_from_spans(gc_mark_args_t *args)
{
Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
// there should always be at least this amount of space
assert(space <= gc_mark_buffer_avail(args));
assert(space > 0);
gc_span_t entry = args->spans.stack[--args->spans.size];
// spans on the stack should always have one or more elements
assert(entry.start < entry.end);
do {
PyObject *op = *entry.start;
entry.start++;
if (op != NULL) {
gc_mark_buffer_push(op, args);
space--;
if (space == 0) {
// buffer is as full as we want and not done with span
gc_mark_span_push(&args->spans, entry.start, entry.end);
return;
}
}
} while (entry.start < entry.end);
}
static void
gc_prime_buffer(gc_mark_args_t *args)
{
if (args->spans.size > 0) {
gc_prime_from_spans(args);
}
else {
// When priming, don't fill the buffer completely; otherwise traversal
// would likely overflow it and spill work onto the stack soon after.
// We want to keep work flowing through the buffer as much as possible,
// so we only fill to BUFFER_HI, not BUFFER_SIZE.
Py_ssize_t space = BUFFER_HI - gc_mark_buffer_len(args);
assert(space > 0);
do {
PyObject *op = _PyObjectStack_Pop(&args->stack);
if (op == NULL) {
return;
}
gc_mark_buffer_push(op, args);
space--;
} while (space > 0);
}
}
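// Illustrative walk-through (not part of the patch): with BUFFER_HI == 16 and
// BUFFER_LO == 8, priming tops the buffer up to about 16 pointers.  When
// traversal yields few new references, the buffer drains to 8 before the next
// priming, so each prefetched pointer waits roughly 8-16 pops before it is
// dereferenced; when traversal yields many references, the buffer fills
// further (up to BUFFER_SIZE) and the wait is longer.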
static int
-propagate_alive_bits(_PyObjectStack *stack)
+gc_propagate_alive_prefetch(gc_mark_args_t *args)
{
for (;;) {
-PyObject *op = _PyObjectStack_Pop(stack);
-if (op == NULL) {
-break;
+Py_ssize_t buf_used = gc_mark_buffer_len(args);
+if (buf_used <= BUFFER_LO) {
+// The mark buffer is getting empty. If it's too empty
+// then there will not be enough delay between issuing
+// the prefetch and when the object is actually accessed.
+// Prime the buffer with object pointers from the stack or
+// from the spans, if there are any available.
+gc_prime_buffer(args);
+if (gc_mark_buffer_is_empty(args)) {
+return 0;
+}
}
-assert(_PyObject_GC_IS_TRACKED(op));
-assert(gc_is_alive(op));
+PyObject *op = gc_mark_buffer_pop(args);
+if (!gc_has_bit(op, _PyGC_BITS_TRACKED)) {
+continue;
+}
+if (gc_is_alive(op)) {
+continue; // already visited this object
+}
+// Need to call tp_traverse on this object. Mark it alive so we
+// don't traverse it a second time.
+gc_set_alive(op);
traverseproc traverse = Py_TYPE(op)->tp_traverse;
-if (traverse(op, (visitproc)&mark_alive_stack_push, stack) < 0) {
+if (traverse == PyList_Type.tp_traverse) {
+if (gc_mark_traverse_list(op, args) < 0) {
+return -1;
+}
+}
+else if (traverse == PyTuple_Type.tp_traverse) {
+if (gc_mark_traverse_tuple(op, args) < 0) {
+return -1;
+}
+}
+else if (traverse(op, (visitproc)&gc_mark_enqueue_buffer, args) < 0) {
return -1;
}
}
return 0;
}
static int
gc_propagate_alive(gc_mark_args_t *args)
{
if (args->use_prefetch) {
return gc_propagate_alive_prefetch(args);
}
else {
for (;;) {
PyObject *op = _PyObjectStack_Pop(&args->stack);
if (op == NULL) {
break;
}
assert(_PyObject_GC_IS_TRACKED(op));
assert(gc_is_alive(op));
traverseproc traverse = Py_TYPE(op)->tp_traverse;
if (traverse(op, (visitproc)&gc_mark_enqueue_no_buffer, args) < 0) {
return -1;
}
}
return 0;
}
}
// Using tp_traverse, mark everything reachable from known root objects
@@ -939,48 +1311,64 @@ propagate_alive_bits(_PyObjectStack *stack)
//
// Returns -1 on failure (out of memory).
static int
-mark_alive_from_roots(PyInterpreterState *interp,
-struct collection_state *state)
+gc_mark_alive_from_roots(PyInterpreterState *interp,
+struct collection_state *state)
{
#ifdef GC_DEBUG
// Check that all objects don't have alive bit set
gc_visit_heaps(interp, &validate_alive_bits, &state->base);
#endif
-_PyObjectStack stack = { NULL };
+gc_mark_args_t mark_args = { 0 };
-#define STACK_PUSH(op) \
-if (mark_alive_stack_push(op, &stack) < 0) { \
-gc_abort_mark_alive(interp, state, &stack); \
-return -1; \
+// Using prefetch instructions is only a win if the set of objects being
+// examined by the GC does not fit into CPU caches. Otherwise, using the
+// buffer and prefetch instructions is just overhead. The long-lived
+// object count seems a good estimate of whether things will fit in the cache.
+// On 64-bit platforms, the minimum object size is 32 bytes. A 4MB L2 cache
+// would hold about 130k objects.
+mark_args.use_prefetch = interp->gc.long_lived_total > 200000;
+#define MARK_ENQUEUE(op) \
+if (op != NULL) { \
+if (gc_mark_enqueue(op, &mark_args) < 0) { \
+gc_abort_mark_alive(interp, state, &mark_args); \
+return -1; \
+} \
}
-STACK_PUSH(interp->sysdict);
+MARK_ENQUEUE(interp->sysdict);
#ifdef GC_MARK_ALIVE_EXTRA_ROOTS
-STACK_PUSH(interp->builtins);
-STACK_PUSH(interp->dict);
+MARK_ENQUEUE(interp->builtins);
+MARK_ENQUEUE(interp->dict);
struct types_state *types = &interp->types;
for (int i = 0; i < _Py_MAX_MANAGED_STATIC_BUILTIN_TYPES; i++) {
-STACK_PUSH(types->builtins.initialized[i].tp_dict);
-STACK_PUSH(types->builtins.initialized[i].tp_subclasses);
+MARK_ENQUEUE(types->builtins.initialized[i].tp_dict);
+MARK_ENQUEUE(types->builtins.initialized[i].tp_subclasses);
}
for (int i = 0; i < _Py_MAX_MANAGED_STATIC_EXT_TYPES; i++) {
-STACK_PUSH(types->for_extensions.initialized[i].tp_dict);
-STACK_PUSH(types->for_extensions.initialized[i].tp_subclasses);
+MARK_ENQUEUE(types->for_extensions.initialized[i].tp_dict);
+MARK_ENQUEUE(types->for_extensions.initialized[i].tp_subclasses);
}
#endif
#ifdef GC_MARK_ALIVE_STACKS
-if (gc_visit_thread_stacks_mark_alive(interp, &stack) < 0) {
-gc_abort_mark_alive(interp, state, &stack);
+if (gc_visit_thread_stacks_mark_alive(interp, &mark_args) < 0) {
+gc_abort_mark_alive(interp, state, &mark_args);
return -1;
}
#endif
-#undef STACK_PUSH
+#undef MARK_ENQUEUE
// Use tp_traverse to find everything reachable from roots.
-if (propagate_alive_bits(&stack) < 0) {
-gc_abort_mark_alive(interp, state, &stack);
+if (gc_propagate_alive(&mark_args) < 0) {
+gc_abort_mark_alive(interp, state, &mark_args);
return -1;
}
assert(mark_args.spans.size == 0);
if (mark_args.spans.stack != NULL) {
PyMem_Free(mark_args.spans.stack);
}
assert(mark_args.stack.head == NULL);
return 0;
}
#endif // GC_ENABLE_MARK_ALIVE
@@ -1559,7 +1947,7 @@ gc_collect_internal(PyInterpreterState *interp, struct collection_state *state,
if (!state->gcstate->freeze_active) {
// Mark objects reachable from known roots as "alive". These will
// be ignored for rest of the GC pass.
-int err = mark_alive_from_roots(interp, state);
+int err = gc_mark_alive_from_roots(interp, state);
if (err < 0) {
_PyEval_StartTheWorld(interp);
PyErr_NoMemory();