bpo-46841: Use inline caching for calls (GH-31709)

This commit is contained in:
Brandt Bucher 2022-03-07 11:45:00 -08:00 committed by GitHub
parent 105b9ac001
commit f193631387
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
16 changed files with 494 additions and 735 deletions

View file

@@ -8,50 +8,10 @@ extern "C" {
* Specialization and quickening structs and helper functions
*/
typedef struct {
int32_t cache_count;
int32_t _; /* Force 8 byte size */
} _PyEntryZero;
typedef struct {
uint8_t original_oparg;
uint8_t counter;
uint16_t index;
uint32_t version;
} _PyAdaptiveEntry;
typedef struct {
/* Borrowed ref */
PyObject *obj;
} _PyObjectCache;
typedef struct {
uint32_t func_version;
uint16_t min_args;
uint16_t defaults_len;
} _PyCallCache;
/* Add specialized versions of entries to this union.
*
* Do not break the invariant: sizeof(SpecializedCacheEntry) == 8
* Preserving this invariant is necessary because:
- If any one form uses more space, then all must, and on 64 bit machines
this is likely to double the memory consumption of caches
- The function for calculating the offset of caches assumes a 4:1
cache:instruction size ratio. Changing that would need careful
analysis to choose a new function.
*/
typedef union {
_PyEntryZero zero;
_PyAdaptiveEntry adaptive;
_PyObjectCache obj;
_PyCallCache call;
} SpecializedCacheEntry;
#define INSTRUCTIONS_PER_ENTRY (sizeof(SpecializedCacheEntry)/sizeof(_Py_CODEUNIT))
/* Inline caches */
// Inline caches. If you change the number of cache entries for an instruction,
// you must *also* update the number of cache entries in Lib/opcode.py and bump
// the magic number in Lib/importlib/_bootstrap_external.py!
#define CACHE_ENTRIES(cache) (sizeof(cache)/sizeof(_Py_CODEUNIT))
@@ -112,73 +72,22 @@ typedef struct {
#define INLINE_CACHE_ENTRIES_LOAD_METHOD CACHE_ENTRIES(_PyLoadMethodCache)
typedef struct {
_Py_CODEUNIT counter;
_Py_CODEUNIT func_version[2];
_Py_CODEUNIT min_args;
} _PyCallCache;
#define INLINE_CACHE_ENTRIES_CALL CACHE_ENTRIES(_PyCallCache)
typedef struct {
_Py_CODEUNIT counter;
} _PyPrecallCache;
#define INLINE_CACHE_ENTRIES_PRECALL CACHE_ENTRIES(_PyPrecallCache)
/* Maximum size of code to quicken, in code units. */
#define MAX_SIZE_TO_QUICKEN 5000
typedef union _cache_or_instruction {
_Py_CODEUNIT code[1];
SpecializedCacheEntry entry;
} SpecializedCacheOrInstruction;
/* Get pointer to the nth cache entry, from the first instruction and n.
* Cache entries are indexed backwards, with [count-1] first in memory, and [0] last.
* The zeroth entry immediately precedes the instructions.
*/
static inline SpecializedCacheEntry *
_GetSpecializedCacheEntry(const _Py_CODEUNIT *first_instr, Py_ssize_t n)
{
SpecializedCacheOrInstruction *last_cache_plus_one = (SpecializedCacheOrInstruction *)first_instr;
assert(&last_cache_plus_one->code[0] == first_instr);
return &last_cache_plus_one[-1-n].entry;
}
/* Following two functions form a pair.
*
* oparg_from_offset_and_nexti() is used to compute the oparg
* when quickening, so that offset_from_oparg_and_nexti()
* can be used at runtime to compute the offset.
*
* The relationship between the three values is currently
* offset == (index>>1) + oparg
* This relation is chosen based on the following observations:
* 1. typically 1 in 4 instructions need a cache
* 2. instructions that need a cache typically use 2 entries
* These observations imply: offset ≈ index/2
* We use the oparg to fine tune the relation to avoid wasting space
* and allow consecutive instructions to use caches.
*
* If the number of cache entries < number of instructions/2 we will waste
* some small amount of space.
* If the number of cache entries > (number of instructions/2) + 255, then
* some instructions will not be able to use a cache.
* In practice, we expect some small amount of wasted space in shorter functions
* and only functions exceeding 1000 lines or more not to have enough cache space.
*
*/
static inline int
oparg_from_offset_and_nexti(int offset, int nexti)
{
return offset-(nexti>>1);
}
static inline int
offset_from_oparg_and_nexti(int oparg, int nexti)
{
return (nexti>>1)+oparg;
}
/* Get pointer to the cache entry associated with an instruction.
* nexti is the index of the instruction plus one.
* nexti is used as it corresponds to the instruction pointer in the interpreter.
* This doesn't check that an entry has been allocated for that instruction. */
static inline SpecializedCacheEntry *
_GetSpecializedCacheEntryForInstruction(const _Py_CODEUNIT *first_instr, int nexti, int oparg)
{
return _GetSpecializedCacheEntry(
first_instr,
offset_from_oparg_and_nexti(oparg, nexti)
);
}
#define MAX_SIZE_TO_QUICKEN 10000
#define QUICKENING_WARMUP_DELAY 8
@@ -205,6 +114,13 @@ _Py_IncrementCountAndMaybeQuicken(PyCodeObject *code)
extern Py_ssize_t _Py_QuickenedCount;
// Borrowed references to common callables:
struct callable_cache {
PyObject *isinstance;
PyObject *len;
PyObject *list_append;
};
/* "Locals plus" for a code object is the set of locals + cell vars +
* free vars. This relates to variable names as well as offsets into
* the "fast locals" storage array of execution frames. The compiler
@@ -332,11 +248,6 @@ extern int _PyLineTable_PreviousAddressRange(PyCodeAddressRange *range);
#define ADAPTIVE_CACHE_BACKOFF 64
static inline void
cache_backoff(_PyAdaptiveEntry *entry) {
entry->counter = ADAPTIVE_CACHE_BACKOFF;
}
/* Specialization functions */
extern int _Py_Specialize_LoadAttr(PyObject *owner, _Py_CODEUNIT *instr,
@@ -348,10 +259,10 @@ extern int _Py_Specialize_LoadMethod(PyObject *owner, _Py_CODEUNIT *instr,
PyObject *name);
extern int _Py_Specialize_BinarySubscr(PyObject *sub, PyObject *container, _Py_CODEUNIT *instr);
extern int _Py_Specialize_StoreSubscr(PyObject *container, PyObject *sub, _Py_CODEUNIT *instr);
extern int _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr, int nargs,
PyObject *kwnames, SpecializedCacheEntry *cache);
extern int _Py_Specialize_Precall(PyObject *callable, _Py_CODEUNIT *instr, int nargs,
PyObject *kwnames, SpecializedCacheEntry *cache, PyObject *builtins);
extern int _Py_Specialize_Call(PyObject *callable, _Py_CODEUNIT *instr,
int nargs, PyObject *kwnames);
extern int _Py_Specialize_Precall(PyObject *callable, _Py_CODEUNIT *instr,
int nargs, PyObject *kwnames, int oparg);
extern void _Py_Specialize_BinaryOp(PyObject *lhs, PyObject *rhs, _Py_CODEUNIT *instr,
int oparg);
extern void _Py_Specialize_CompareOp(PyObject *lhs, PyObject *rhs,

View file

@@ -269,6 +269,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(inf)
STRUCT_FOR_ID(intersection)
STRUCT_FOR_ID(isatty)
STRUCT_FOR_ID(isinstance)
STRUCT_FOR_ID(items)
STRUCT_FOR_ID(iter)
STRUCT_FOR_ID(join)
@@ -278,6 +279,7 @@ struct _Py_global_strings {
STRUCT_FOR_ID(last_type)
STRUCT_FOR_ID(last_value)
STRUCT_FOR_ID(latin1)
STRUCT_FOR_ID(len)
STRUCT_FOR_ID(line)
STRUCT_FOR_ID(lineno)
STRUCT_FOR_ID(listcomp)

View file

@@ -12,6 +12,7 @@ extern "C" {
#include "pycore_atomic.h" // _Py_atomic_address
#include "pycore_ast_state.h" // struct ast_state
#include "pycore_code.h" // struct callable_cache
#include "pycore_context.h" // struct _Py_context_state
#include "pycore_dict.h" // struct _Py_dict_state
#include "pycore_exceptions.h" // struct _Py_exc_state
@@ -176,6 +177,7 @@ struct _is {
struct ast_state ast;
struct type_cache type_cache;
struct callable_cache callable_cache;
/* The following fields are here to avoid allocation during init.
The data is exposed through PyInterpreterState pointer fields.

View file

@@ -884,6 +884,7 @@ extern "C" {
INIT_ID(inf), \
INIT_ID(intersection), \
INIT_ID(isatty), \
INIT_ID(isinstance), \
INIT_ID(items), \
INIT_ID(iter), \
INIT_ID(join), \
@@ -893,6 +894,7 @@ extern "C" {
INIT_ID(last_type), \
INIT_ID(last_value), \
INIT_ID(latin1), \
INIT_ID(len), \
INIT_ID(line), \
INIT_ID(lineno), \
INIT_ID(listcomp), \