bpo-37543: optimize pymalloc (#14674)

PyObject_Malloc() and PyObject_Free() partially inline pymalloc_alloc and
pymalloc_free. But when PGO is not used, the compiler does not know which
parts of pymalloc_alloc and pymalloc_free are hot.
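The fix is to split the rarely taken paths out of the fast path and annotate the branches, so the hot code stays small enough to inline well even without profile data. A minimal sketch of that pattern, with illustrative names only (bump_alloc and arena_refill_alloc are not from this patch):

    #include <stdlib.h>

    /* Cold path: a separate helper, left as a candidate to stay out of line. */
    static void *arena_refill_alloc(size_t n)
    {
        return malloc(n);   /* stand-in for the real slow path */
    }

    /* Hot path: small and branch-light, so the compiler can inline it into
     * callers even when no PGO data says which branch is usually taken. */
    static inline void *bump_alloc(char **cur, char *end, size_t n)
    {
        if ((size_t)(end - *cur) >= n) {    /* common case */
            void *p = *cur;
            *cur += n;
            return p;
        }
        return arena_refill_alloc(n);       /* rare case */
    }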
Inada Naoki, 2019-07-17 21:23:57 +09:00 (committed by GitHub)
parent 7036e1de3a
commit fb26504d14
2 changed files with 227 additions and 218 deletions


@@ -0,0 +1 @@
+Optimized pymalloc for non-PGO build.

Objects/obmalloc.c

@@ -710,19 +710,21 @@ PyObject_Free(void *ptr)
 }
 
-#ifdef WITH_PYMALLOC
-
-#ifdef WITH_VALGRIND
-#include <valgrind/valgrind.h>
-
 /* If we're using GCC, use __builtin_expect() to reduce overhead of
    the valgrind checks */
 #if defined(__GNUC__) && (__GNUC__ > 2) && defined(__OPTIMIZE__)
 #  define UNLIKELY(value) __builtin_expect((value), 0)
+#  define LIKELY(value) __builtin_expect((value), 1)
 #else
 #  define UNLIKELY(value) (value)
+#  define LIKELY(value) (value)
 #endif
 
+#ifdef WITH_PYMALLOC
+
+#ifdef WITH_VALGRIND
+#include <valgrind/valgrind.h>
+
 /* -1 indicates that we haven't checked that we're running on valgrind yet. */
 static int running_on_valgrind = -1;
 #endif
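For reference, __builtin_expect(expr, c) evaluates to expr while telling GCC and Clang that expr is expected to equal c, so the expected branch becomes the straight-line fall-through. A small usage sketch of the UNLIKELY macro defined above (illustrative code, not from this commit):

    /* UNLIKELY(x) expands to __builtin_expect((x), 0): x is expected false. */
    static int checked_index(const int *arr, int len, int i)
    {
        if (UNLIKELY(i < 0 || i >= len)) {
            return -1;          /* error path, laid out off the hot path */
        }
        return arr[i];          /* expected path */
    }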
@@ -1424,96 +1426,48 @@ address_in_range(void *p, poolp pool)
 /*==========================================================================*/
 
-/* pymalloc allocator
-
-   The basic blocks are ordered by decreasing execution frequency,
-   which minimizes the number of jumps in the most common cases,
-   improves branching prediction and instruction scheduling (small
-   block allocations typically result in a couple of instructions).
-   Unless the optimizer reorders everything, being too smart...
-
-   Return 1 if pymalloc allocated memory and wrote the pointer into *ptr_p.
-
-   Return 0 if pymalloc failed to allocate the memory block: on bigger
-   requests, on error in the code below (as a last chance to serve the request)
-   or when the max memory limit has been reached. */
-static int
-pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
+// Called when freelist is exhausted.  Extend the freelist if there is
+// space for a block.  Otherwise, remove this pool from usedpools.
+static void
+pymalloc_pool_extend(poolp pool, uint size)
 {
-    block *bp;
-    poolp pool;
-    poolp next;
-    uint size;
-
-#ifdef WITH_VALGRIND
-    if (UNLIKELY(running_on_valgrind == -1)) {
-        running_on_valgrind = RUNNING_ON_VALGRIND;
-    }
-    if (UNLIKELY(running_on_valgrind)) {
-        return 0;
-    }
-#endif
-
-    if (nbytes == 0) {
-        return 0;
-    }
-    if (nbytes > SMALL_REQUEST_THRESHOLD) {
-        return 0;
-    }
-
-    /*
-     * Most frequent paths first
-     */
-    size = (uint)(nbytes - 1) >> ALIGNMENT_SHIFT;
-    pool = usedpools[size + size];
-    if (pool != pool->nextpool) {
-        /*
-         * There is a used pool for this size class.
-         * Pick up the head block of its free list.
-         */
-        ++pool->ref.count;
-        bp = pool->freeblock;
-        assert(bp != NULL);
-        if ((pool->freeblock = *(block **)bp) != NULL) {
-            goto success;
-        }
-
-        /*
-         * Reached the end of the free list, try to extend it.
-         */
-        if (pool->nextoffset <= pool->maxnextoffset) {
-            /* There is room for another block. */
-            pool->freeblock = (block*)pool +
-                              pool->nextoffset;
-            pool->nextoffset += INDEX2SIZE(size);
-            *(block **)(pool->freeblock) = NULL;
-            goto success;
-        }
-
-        /* Pool is full, unlink from used pools. */
-        next = pool->nextpool;
-        pool = pool->prevpool;
-        next->prevpool = pool;
-        pool->nextpool = next;
-        goto success;
+    if (UNLIKELY(pool->nextoffset <= pool->maxnextoffset)) {
+        /* There is room for another block. */
+        pool->freeblock = (block*)pool + pool->nextoffset;
+        pool->nextoffset += INDEX2SIZE(size);
+        *(block **)(pool->freeblock) = NULL;
+        return;
     }
 
+    /* Pool is full, unlink from used pools. */
+    poolp next;
+    next = pool->nextpool;
+    pool = pool->prevpool;
+    next->prevpool = pool;
+    pool->nextpool = next;
+}
+
+/* called when pymalloc_alloc can not allocate a block from usedpool.
+ * This function takes new pool and allocate a block from it.
+ */
+static void*
+allocate_from_new_pool(uint size)
+{
     /* There isn't a pool of the right size class immediately
      * available:  use a free pool.
      */
-    if (usable_arenas == NULL) {
+    if (UNLIKELY(usable_arenas == NULL)) {
         /* No arena has a free pool:  allocate a new arena. */
 #ifdef WITH_MEMORY_LIMITS
         if (narenas_currently_allocated >= MAX_ARENAS) {
-            goto failed;
+            return NULL;
         }
 #endif
         usable_arenas = new_arena();
         if (usable_arenas == NULL) {
-            goto failed;
+            return NULL;
        }
-        usable_arenas->nextarena =
-            usable_arenas->prevarena = NULL;
+        usable_arenas->nextarena = usable_arenas->prevarena = NULL;
        assert(nfp2lasta[usable_arenas->nfreepools] == NULL);
        nfp2lasta[usable_arenas->nfreepools] = usable_arenas;
    }
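pymalloc threads each pool's free list through the free blocks themselves: the first word of a free block stores the pointer to the next free block, and pymalloc_pool_extend grows the list lazily, one block at a time, from the pool's untouched tail (nextoffset). A simplified, self-contained sketch of that kind of intrusive free list (toy types and sizes, not the pymalloc structures):

    #include <stddef.h>
    #include <string.h>

    enum { TOY_BLOCK = 32, TOY_POOL_BYTES = 4096 };

    typedef struct {
        char  *mem;         /* the pool's memory                     */
        char  *freehead;    /* head of the intrusive free list       */
        size_t nextoffset;  /* offset of the first never-used block  */
    } toy_pool;

    static void *toy_alloc(toy_pool *p)
    {
        if (p->freehead != NULL) {               /* pop from the free list */
            char *blk = p->freehead;
            memcpy(&p->freehead, blk, sizeof(char *));
            return blk;
        }
        if (p->nextoffset + TOY_BLOCK <= TOY_POOL_BYTES) {
            char *blk = p->mem + p->nextoffset;  /* lazily carve a new block */
            p->nextoffset += TOY_BLOCK;
            return blk;
        }
        return NULL;                             /* pool exhausted */
    }

    static void toy_free(toy_pool *p, void *blk)
    {
        memcpy(blk, &p->freehead, sizeof(char *));  /* link old head into blk */
        p->freehead = blk;
    }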
@@ -1536,12 +1490,12 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
     }
 
     /* Try to get a cached free pool. */
-    pool = usable_arenas->freepools;
-    if (pool != NULL) {
+    poolp pool = usable_arenas->freepools;
+    if (LIKELY(pool != NULL)) {
         /* Unlink from cached pools. */
         usable_arenas->freepools = pool->nextpool;
-        --usable_arenas->nfreepools;
-        if (usable_arenas->nfreepools == 0) {
+        usable_arenas->nfreepools--;
+        if (UNLIKELY(usable_arenas->nfreepools == 0)) {
             /* Wholly allocated:  remove. */
             assert(usable_arenas->freepools == NULL);
             assert(usable_arenas->nextarena == NULL ||
@@ -1564,40 +1518,8 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
                    (block*)usable_arenas->address +
                        ARENA_SIZE - POOL_SIZE);
         }
 
-    init_pool:
-        /* Frontlink to used pools. */
-        next = usedpools[size + size]; /* == prev */
-        pool->nextpool = next;
-        pool->prevpool = next;
-        next->nextpool = pool;
-        next->prevpool = pool;
-        pool->ref.count = 1;
-        if (pool->szidx == size) {
-            /* Luckily, this pool last contained blocks
-             * of the same size class, so its header
-             * and free list are already initialized.
-             */
-            bp = pool->freeblock;
-            assert(bp != NULL);
-            pool->freeblock = *(block **)bp;
-            goto success;
-        }
-        /* else {
-         * Initialize the pool header, set up the free list to
-         * contain just the second block, and return the first
-         * block.
-         */
-        pool->szidx = size;
-        size = INDEX2SIZE(size);
-        bp = (block *)pool + POOL_OVERHEAD;
-        pool->nextoffset = POOL_OVERHEAD + (size << 1);
-        pool->maxnextoffset = POOL_SIZE - size;
-        pool->freeblock = bp + size;
-        *(block **)(pool->freeblock) = NULL;
-        goto success;
-    }
-
     /* Carve off a new pool. */
     assert(usable_arenas->nfreepools > 0);
     assert(usable_arenas->freepools == NULL);
@@ -1621,16 +1543,98 @@ pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
             assert(usable_arenas->address != 0);
         }
     }
+    }
 
-    goto init_pool;
-
-success:
+    /* Frontlink to used pools. */
+    block *bp;
+    poolp next = usedpools[size + size]; /* == prev */
+    pool->nextpool = next;
+    pool->prevpool = next;
+    next->nextpool = pool;
+    next->prevpool = pool;
+    pool->ref.count = 1;
+    if (pool->szidx == size) {
+        /* Luckily, this pool last contained blocks
+         * of the same size class, so its header
+         * and free list are already initialized.
+         */
+        bp = pool->freeblock;
+        assert(bp != NULL);
+        pool->freeblock = *(block **)bp;
+        return bp;
+    }
+    /*
+     * Initialize the pool header, set up the free list to
+     * contain just the second block, and return the first
+     * block.
+     */
+    pool->szidx = size;
+    size = INDEX2SIZE(size);
+    bp = (block *)pool + POOL_OVERHEAD;
+    pool->nextoffset = POOL_OVERHEAD + (size << 1);
+    pool->maxnextoffset = POOL_SIZE - size;
+    pool->freeblock = bp + size;
+    *(block **)(pool->freeblock) = NULL;
+    return bp;
+}
+
+/* pymalloc allocator
+
+   Return 1 if pymalloc allocated memory and wrote the pointer into *ptr_p.
+
+   Return 0 if pymalloc failed to allocate the memory block: on bigger
+   requests, on error in the code below (as a last chance to serve the request)
+   or when the max memory limit has been reached.
+ */
+static inline int
+pymalloc_alloc(void *ctx, void **ptr_p, size_t nbytes)
+{
+#ifdef WITH_VALGRIND
+    if (UNLIKELY(running_on_valgrind == -1)) {
+        running_on_valgrind = RUNNING_ON_VALGRIND;
+    }
+    if (UNLIKELY(running_on_valgrind)) {
+        return 0;
+    }
+#endif
+
+    if (UNLIKELY(nbytes == 0)) {
+        return 0;
+    }
+    if (UNLIKELY(nbytes > SMALL_REQUEST_THRESHOLD)) {
+        return 0;
+    }
+
+    uint size = (uint)(nbytes - 1) >> ALIGNMENT_SHIFT;
+    poolp pool = usedpools[size + size];
+    block *bp;
+
+    if (LIKELY(pool != pool->nextpool)) {
+        /*
+         * There is a used pool for this size class.
+         * Pick up the head block of its free list.
+         */
+        ++pool->ref.count;
+        bp = pool->freeblock;
+        if (UNLIKELY((pool->freeblock = *(block **)bp) == NULL)) {
+            // Reached the end of the free list, try to extend it.
+            pymalloc_pool_extend(pool, size);
+        }
+    }
+    else {
+        /* There isn't a pool of the right size class immediately
+         * available:  use a free pool.
+         */
+        bp = allocate_from_new_pool(size);
+        if (UNLIKELY(bp == NULL)) {
+            return 0;
+        }
+    }
+
     assert(bp != NULL);
     *ptr_p = (void *)bp;
     return 1;
-
-failed:
-    return 0;
 }
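In the rebuilt fast path the size class comes from a single shift, size = (nbytes - 1) >> ALIGNMENT_SHIFT, and usedpools[size + size] selects the matching pool-header slot (each class occupies a two-pointer dummy header). A worked example, assuming the historical values ALIGNMENT = 8 and ALIGNMENT_SHIFT = 3 (both are build-dependent):

    #include <stdio.h>

    #define ALIGNMENT        8   /* assumed; some builds use 16 */
    #define ALIGNMENT_SHIFT  3

    int main(void)
    {
        for (size_t nbytes = 1; nbytes <= 24; nbytes++) {
            unsigned size = (unsigned)(nbytes - 1) >> ALIGNMENT_SHIFT;
            printf("request %2zu -> class %u (block size %u)\n",
                   nbytes, size, (size + 1) << ALIGNMENT_SHIFT);
        }
        /* 1..8 bytes -> class 0 (8-byte blocks), 9..16 -> class 1 (16 bytes),
         * 17..24 -> class 2 (24 bytes), up to SMALL_REQUEST_THRESHOLD. */
        return 0;
    }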
@@ -1638,7 +1642,7 @@ static void *
 _PyObject_Malloc(void *ctx, size_t nbytes)
 {
     void* ptr;
-    if (pymalloc_alloc(ctx, &ptr, nbytes)) {
+    if (LIKELY(pymalloc_alloc(ctx, &ptr, nbytes))) {
         return ptr;
     }
@@ -1658,7 +1662,7 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize)
     assert(elsize == 0 || nelem <= (size_t)PY_SSIZE_T_MAX / elsize);
     size_t nbytes = nelem * elsize;
 
-    if (pymalloc_alloc(ctx, &ptr, nbytes)) {
+    if (LIKELY(pymalloc_alloc(ctx, &ptr, nbytes))) {
         memset(ptr, 0, nbytes);
         return ptr;
     }
@@ -1671,88 +1675,37 @@ _PyObject_Calloc(void *ctx, size_t nelem, size_t elsize)
 }
 
-/* Free a memory block allocated by pymalloc_alloc().
-   Return 1 if it was freed.
-   Return 0 if the block was not allocated by pymalloc_alloc(). */
-static int
-pymalloc_free(void *ctx, void *p)
+static void
+insert_to_usedpool(poolp pool)
 {
-    poolp pool;
-    block *lastfree;
-    poolp next, prev;
-    uint size;
-
-    assert(p != NULL);
-
-#ifdef WITH_VALGRIND
-    if (UNLIKELY(running_on_valgrind > 0)) {
-        return 0;
-    }
-#endif
-
-    pool = POOL_ADDR(p);
-    if (!address_in_range(p, pool)) {
-        return 0;
-    }
-    /* We allocated this address. */
-
-    /* Link p to the start of the pool's freeblock list.  Since
-     * the pool had at least the p block outstanding, the pool
-     * wasn't empty (so it's already in a usedpools[] list, or
-     * was full and is in no list -- it's not in the freeblocks
-     * list in any case).
-     */
-    assert(pool->ref.count > 0);            /* else it was empty */
-    *(block **)p = lastfree = pool->freeblock;
-    pool->freeblock = (block *)p;
-    if (!lastfree) {
-        /* Pool was full, so doesn't currently live in any list:
-         * link it to the front of the appropriate usedpools[] list.
-         * This mimics LRU pool usage for new allocations and
-         * targets optimal filling when several pools contain
-         * blocks of the same size class.
-         */
-        --pool->ref.count;
-        assert(pool->ref.count > 0);        /* else the pool is empty */
-        size = pool->szidx;
-        next = usedpools[size + size];
-        prev = next->prevpool;
+    assert(pool->ref.count > 0);            /* else the pool is empty */
 
-        /* insert pool before next:   prev <-> pool <-> next */
-        pool->nextpool = next;
-        pool->prevpool = prev;
-        next->prevpool = pool;
-        prev->nextpool = pool;
-        goto success;
-    }
+    uint size = pool->szidx;
+    poolp next = usedpools[size + size];
+    poolp prev = next->prevpool;
 
-    struct arena_object* ao;
-    uint nf;  /* ao->nfreepools */
+    /* insert pool before next:   prev <-> pool <-> next */
+    pool->nextpool = next;
+    pool->prevpool = prev;
+    next->prevpool = pool;
+    prev->nextpool = pool;
+}
 
-    /* freeblock wasn't NULL, so the pool wasn't full,
-     * and the pool is in a usedpools[] list.
-     */
-    if (--pool->ref.count != 0) {
-        /* pool isn't empty:  leave it in usedpools */
-        goto success;
-    }
-    /* Pool is now empty:  unlink from usedpools, and
-     * link to the front of freepools.  This ensures that
-     * previously freed pools will be allocated later
-     * (being not referenced, they are perhaps paged out).
-     */
-    next = pool->nextpool;
-    prev = pool->prevpool;
+static void
+insert_to_freepool(poolp pool)
+{
+    poolp next = pool->nextpool;
+    poolp prev = pool->prevpool;
     next->prevpool = prev;
     prev->nextpool = next;
 
     /* Link the pool to freepools.  This is a singly-linked
      * list, and pool->prevpool isn't used there.
      */
-    ao = &arenas[pool->arenaindex];
+    struct arena_object *ao = &arenas[pool->arenaindex];
     pool->nextpool = ao->freepools;
     ao->freepools = pool;
-    nf = ao->nfreepools;
+    uint nf = ao->nfreepools;
 
     /* If this is the rightmost arena with this number of free pools,
      * nfp2lasta[nf] needs to change.  Caution:  if nf is 0, there
     * are no arenas in usable_arenas with that value.
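insert_to_usedpool splices the pool in front of its size class's circular list with four pointer writes, and insert_to_freepool performs the matching two-write unlink. A generic sketch of those list operations on a toy node type (not the pymalloc pool header):

    typedef struct node {
        struct node *next;
        struct node *prev;
    } node;

    /* Insert n immediately before pos:  prev <-> n <-> pos */
    static void list_insert_before(node *pos, node *n)
    {
        node *prev = pos->prev;
        n->next = pos;
        n->prev = prev;
        pos->prev = n;
        prev->next = n;
    }

    /* Unlink n; with a circular list and a dummy head there is no
     * empty-list special case. */
    static void list_remove(node *n)
    {
        n->next->prev = n->prev;
        n->prev->next = n->next;
    }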
@@ -1826,7 +1779,7 @@ pymalloc_free(void *ctx, void *p)
         ao->address = 0;                        /* mark unassociated */
         --narenas_currently_allocated;
 
-        goto success;
+        return;
     }
 
     if (nf == 1) {
@@ -1845,7 +1798,7 @@ pymalloc_free(void *ctx, void *p)
             nfp2lasta[1] = ao;
         }
 
-        goto success;
+        return;
     }
 
     /* If this arena is now out of order, we need to keep
@@ -1862,7 +1815,7 @@ pymalloc_free(void *ctx, void *p)
     /* If this was the rightmost of the old size, it remains in place. */
     if (ao == lastnf) {
         /* Case 4.  Nothing to do. */
-        goto success;
+        return;
     }
     /* If ao were the only arena in the list, the last block would have
      * gotten us out.
@@ -1898,10 +1851,65 @@ pymalloc_free(void *ctx, void *p)
     assert(ao->nextarena == NULL || ao->nextarena->prevarena == ao);
     assert((usable_arenas == ao && ao->prevarena == NULL)
            || ao->prevarena->nextarena == ao);
-    }
-
-    goto success;
-
-success:
+}
+
+/* Free a memory block allocated by pymalloc_alloc().
+   Return 1 if it was freed.
+   Return 0 if the block was not allocated by pymalloc_alloc(). */
+static inline int
+pymalloc_free(void *ctx, void *p)
+{
+    assert(p != NULL);
+
+#ifdef WITH_VALGRIND
+    if (UNLIKELY(running_on_valgrind > 0)) {
+        return 0;
+    }
+#endif
+
+    poolp pool = POOL_ADDR(p);
+    if (UNLIKELY(!address_in_range(p, pool))) {
+        return 0;
+    }
+    /* We allocated this address. */
+
+    /* Link p to the start of the pool's freeblock list.  Since
+     * the pool had at least the p block outstanding, the pool
+     * wasn't empty (so it's already in a usedpools[] list, or
+     * was full and is in no list -- it's not in the freeblocks
+     * list in any case).
+     */
+    assert(pool->ref.count > 0);            /* else it was empty */
+    block *lastfree = pool->freeblock;
+    *(block **)p = lastfree;
+    pool->freeblock = (block *)p;
+    pool->ref.count--;
+
+    if (UNLIKELY(lastfree == NULL)) {
+        /* Pool was full, so doesn't currently live in any list:
+         * link it to the front of the appropriate usedpools[] list.
+         * This mimics LRU pool usage for new allocations and
+         * targets optimal filling when several pools contain
+         * blocks of the same size class.
+         */
+        insert_to_usedpool(pool);
+        return 1;
+    }
+
+    /* freeblock wasn't NULL, so the pool wasn't full,
+     * and the pool is in a usedpools[] list.
+     */
+    if (LIKELY(pool->ref.count != 0)) {
+        /* pool isn't empty:  leave it in usedpools */
+        return 1;
+    }
+
+    /* Pool is now empty:  unlink from usedpools, and
+     * link to the front of freepools.  This ensures that
+     * previously freed pools will be allocated later
+     * (being not referenced, they are perhaps paged out).
+     */
+    insert_to_freepool(pool);
     return 1;
 }
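pymalloc_free first recovers the pool header by masking the pointer down to its pool boundary (POOL_ADDR), then address_in_range decides whether that pool really belongs to pymalloc; if not, the caller falls back to PyMem_RawFree, as the hunk below shows. A sketch of the masking step, assuming 4 KiB pools as pymalloc uses (POOL_SIZE is configuration-dependent):

    #include <stdint.h>

    #define TOY_POOL_SIZE 4096u   /* assumed pool size and alignment */

    /* Round a block pointer down to the start of its pool. */
    static inline void *toy_pool_addr(void *p)
    {
        return (void *)((uintptr_t)p & ~(uintptr_t)(TOY_POOL_SIZE - 1));
    }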
@@ -1914,7 +1922,7 @@ _PyObject_Free(void *ctx, void *p)
         return;
     }
 
-    if (!pymalloc_free(ctx, p)) {
+    if (UNLIKELY(!pymalloc_free(ctx, p))) {
         /* pymalloc didn't allocate this address */
         PyMem_RawFree(p);
         raw_allocated_blocks--;