From 082dbf77884264a8470b1ea132cc458d0b5bf08a Mon Sep 17 00:00:00 2001 From: Irit Katriel <1055913+iritkatriel@users.noreply.github.com> Date: Mon, 5 May 2025 17:46:56 +0100 Subject: [PATCH] gh-133395: add option for extension modules to specialize BINARY_OP/SUBSCR, apply to arrays (#133396) --- Include/cpython/object.h | 12 ++++ Include/internal/pycore_code.h | 11 ++- Include/internal/pycore_opcode_metadata.h | 3 +- Include/internal/pycore_uop_metadata.h | 6 +- Include/typeslots.h | 1 + Lib/test/test_sys.py | 2 +- ...-05-04-19-32-47.gh-issue-133395.VhWWEP.rst | 2 + Modules/arraymodule.c | 72 +++++++++++++++++++ Objects/typeslots.inc | 1 + Python/bytecodes.c | 15 +++- Python/executor_cases.c.h | 24 ++----- Python/generated_cases.c.h | 29 ++++++-- Python/optimizer_cases.c.h | 4 +- Python/specialize.c | 35 +++++++-- 14 files changed, 172 insertions(+), 45 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2025-05-04-19-32-47.gh-issue-133395.VhWWEP.rst diff --git a/Include/cpython/object.h b/Include/cpython/object.h index 3a4d65f7712..818fc7d0560 100644 --- a/Include/cpython/object.h +++ b/Include/cpython/object.h @@ -143,6 +143,11 @@ typedef struct { * backwards-compatibility */ typedef Py_ssize_t printfunc; +/* Specialize a binary op by setting the descriptor pointer */ +struct _PyBinopSpecializationDescr; +typedef int (*binop_specialize_func)(PyObject *v, PyObject *w, int oparg, + struct _PyBinopSpecializationDescr **descr); + // If this structure is modified, Doc/includes/typestruct.h should be updated // as well. struct _typeobject { @@ -233,6 +238,13 @@ struct _typeobject { /* bitset of which type-watchers care about this type */ unsigned char tp_watched; + /* callback that may specialize BINARY_OP + * this is an experimental API based on the ideas in the paper + * Cross Module Quickening - The Curious Case of C Extensions + * by Felix Berlakovich and Stefan Brunthaler. + */ + binop_specialize_func tp_binop_specialize; + /* Number of tp_version_tag values used. * Set to _Py_ATTR_CACHE_UNUSED if the attribute cache is * disabled for this type (e.g. due to custom MRO entries). diff --git a/Include/internal/pycore_code.h b/Include/internal/pycore_code.h index 635d2b24f4b..1c77730e417 100644 --- a/Include/internal/pycore_code.h +++ b/Include/internal/pycore_code.h @@ -482,13 +482,18 @@ adaptive_counter_backoff(_Py_BackoffCounter counter) { /* Specialization Extensions */ /* callbacks for an external specialization */ -typedef int (*binaryopguardfunc)(PyObject *lhs, PyObject *rhs); -typedef PyObject *(*binaryopactionfunc)(PyObject *lhs, PyObject *rhs); -typedef struct { +struct _PyBinopSpecializationDescr; + +typedef int (*binaryopguardfunc)(PyObject *lhs, PyObject *rhs); +typedef PyObject* (*binaryopactionfunc)(PyObject *lhs, PyObject *rhs); +typedef void (*binaryopfreefunc)(struct _PyBinopSpecializationDescr *descr); + +typedef struct _PyBinopSpecializationDescr { int oparg; binaryopguardfunc guard; binaryopactionfunc action; + binaryopfreefunc free; } _PyBinaryOpSpecializationDescr; /* Comparison bit masks. */ diff --git a/Include/internal/pycore_opcode_metadata.h b/Include/internal/pycore_opcode_metadata.h index dc7ecc998c4..fbb696e1755 100644 --- a/Include/internal/pycore_opcode_metadata.h +++ b/Include/internal/pycore_opcode_metadata.h @@ -1082,7 +1082,7 @@ const struct opcode_metadata _PyOpcode_opcode_metadata[267] = { [BINARY_OP_ADD_FLOAT] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG | HAS_ERROR_FLAG }, [BINARY_OP_ADD_INT] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [BINARY_OP_ADD_UNICODE] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG | HAS_ERROR_FLAG }, - [BINARY_OP_EXTEND] = { true, INSTR_FMT_IXC0000, HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG }, + [BINARY_OP_EXTEND] = { true, INSTR_FMT_IXC0000, HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [BINARY_OP_INPLACE_ADD_UNICODE] = { true, INSTR_FMT_IXC0000, HAS_LOCAL_FLAG | HAS_DEOPT_FLAG | HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, [BINARY_OP_MULTIPLY_FLOAT] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG | HAS_ERROR_FLAG }, [BINARY_OP_MULTIPLY_INT] = { true, INSTR_FMT_IXC0000, HAS_EXIT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG }, @@ -1333,7 +1333,6 @@ _PyOpcode_macro_expansion[256] = { [BINARY_OP_ADD_FLOAT] = { .nuops = 3, .uops = { { _GUARD_TOS_FLOAT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_FLOAT, OPARG_SIMPLE, 0 }, { _BINARY_OP_ADD_FLOAT, OPARG_SIMPLE, 5 } } }, [BINARY_OP_ADD_INT] = { .nuops = 3, .uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_INT, OPARG_SIMPLE, 0 }, { _BINARY_OP_ADD_INT, OPARG_SIMPLE, 5 } } }, [BINARY_OP_ADD_UNICODE] = { .nuops = 3, .uops = { { _GUARD_TOS_UNICODE, OPARG_SIMPLE, 0 }, { _GUARD_NOS_UNICODE, OPARG_SIMPLE, 0 }, { _BINARY_OP_ADD_UNICODE, OPARG_SIMPLE, 5 } } }, - [BINARY_OP_EXTEND] = { .nuops = 2, .uops = { { _GUARD_BINARY_OP_EXTEND, 4, 1 }, { _BINARY_OP_EXTEND, 4, 1 } } }, [BINARY_OP_INPLACE_ADD_UNICODE] = { .nuops = 3, .uops = { { _GUARD_TOS_UNICODE, OPARG_SIMPLE, 0 }, { _GUARD_NOS_UNICODE, OPARG_SIMPLE, 0 }, { _BINARY_OP_INPLACE_ADD_UNICODE, OPARG_SIMPLE, 5 } } }, [BINARY_OP_MULTIPLY_FLOAT] = { .nuops = 3, .uops = { { _GUARD_TOS_FLOAT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_FLOAT, OPARG_SIMPLE, 0 }, { _BINARY_OP_MULTIPLY_FLOAT, OPARG_SIMPLE, 5 } } }, [BINARY_OP_MULTIPLY_INT] = { .nuops = 3, .uops = { { _GUARD_TOS_INT, OPARG_SIMPLE, 0 }, { _GUARD_NOS_INT, OPARG_SIMPLE, 0 }, { _BINARY_OP_MULTIPLY_INT, OPARG_SIMPLE, 5 } } }, diff --git a/Include/internal/pycore_uop_metadata.h b/Include/internal/pycore_uop_metadata.h index 9b88763da07..922d8a6cadf 100644 --- a/Include/internal/pycore_uop_metadata.h +++ b/Include/internal/pycore_uop_metadata.h @@ -94,8 +94,7 @@ const uint16_t _PyUop_Flags[MAX_UOP_ID+1] = { [_BINARY_OP_SUBTRACT_FLOAT] = HAS_ERROR_FLAG | HAS_PURE_FLAG, [_BINARY_OP_ADD_UNICODE] = HAS_ERROR_FLAG | HAS_PURE_FLAG, [_BINARY_OP_INPLACE_ADD_UNICODE] = HAS_LOCAL_FLAG | HAS_DEOPT_FLAG | HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, - [_GUARD_BINARY_OP_EXTEND] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, - [_BINARY_OP_EXTEND] = HAS_ESCAPES_FLAG | HAS_PURE_FLAG, + [_BINARY_OP_EXTEND] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG | HAS_PURE_FLAG, [_BINARY_SLICE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_STORE_SLICE] = HAS_ERROR_FLAG | HAS_ESCAPES_FLAG, [_BINARY_OP_SUBSCR_LIST_INT] = HAS_DEOPT_FLAG | HAS_ESCAPES_FLAG, @@ -423,7 +422,6 @@ const char *const _PyOpcode_uop_name[MAX_UOP_ID+1] = { [_GET_ITER] = "_GET_ITER", [_GET_LEN] = "_GET_LEN", [_GET_YIELD_FROM_ITER] = "_GET_YIELD_FROM_ITER", - [_GUARD_BINARY_OP_EXTEND] = "_GUARD_BINARY_OP_EXTEND", [_GUARD_CALLABLE_STR_1] = "_GUARD_CALLABLE_STR_1", [_GUARD_CALLABLE_TUPLE_1] = "_GUARD_CALLABLE_TUPLE_1", [_GUARD_CALLABLE_TYPE_1] = "_GUARD_CALLABLE_TYPE_1", @@ -760,8 +758,6 @@ int _PyUop_num_popped(int opcode, int oparg) return 2; case _BINARY_OP_INPLACE_ADD_UNICODE: return 2; - case _GUARD_BINARY_OP_EXTEND: - return 0; case _BINARY_OP_EXTEND: return 2; case _BINARY_SLICE: diff --git a/Include/typeslots.h b/Include/typeslots.h index a7f3017ec02..980e714714e 100644 --- a/Include/typeslots.h +++ b/Include/typeslots.h @@ -93,4 +93,5 @@ #if !defined(Py_LIMITED_API) || Py_LIMITED_API+0 >= 0x030E0000 /* New in 3.14 */ #define Py_tp_token 83 +#define Py_tp_binop_specialize 84 #endif diff --git a/Lib/test/test_sys.py b/Lib/test/test_sys.py index 5f7171d02c5..aeca3720cfa 100644 --- a/Lib/test/test_sys.py +++ b/Lib/test/test_sys.py @@ -1776,7 +1776,7 @@ class SizeofTest(unittest.TestCase): check((1,2,3), vsize('') + self.P + 3*self.P) # type # static type: PyTypeObject - fmt = 'P2nPI13Pl4Pn9Pn12PIPc' + fmt = 'P2nPI13Pl4Pn9Pn12PI3Pc' s = vsize(fmt) check(int, s) typeid = 'n' if support.Py_GIL_DISABLED else '' diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2025-05-04-19-32-47.gh-issue-133395.VhWWEP.rst b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-04-19-32-47.gh-issue-133395.VhWWEP.rst new file mode 100644 index 00000000000..a391ce16339 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2025-05-04-19-32-47.gh-issue-133395.VhWWEP.rst @@ -0,0 +1,2 @@ +Add option for extension modules to specialize ``BINARY_OP`` instructions. +Applied to ``array`` objects. diff --git a/Modules/arraymodule.c b/Modules/arraymodule.c index 401a3a7072b..4d2ff32cabe 100644 --- a/Modules/arraymodule.c +++ b/Modules/arraymodule.c @@ -14,6 +14,8 @@ #include "pycore_modsupport.h" // _PyArg_NoKeywords() #include "pycore_moduleobject.h" // _PyModule_GetState() +#include "opcode.h" // binary op opargs (NB_*) + #include // offsetof() #include @@ -848,6 +850,10 @@ array_richcompare(PyObject *v, PyObject *w, int op) return res; } +static int +array_binop_specialize(PyObject *v, PyObject *w, int oparg, + _PyBinaryOpSpecializationDescr **descr); + static Py_ssize_t array_length(PyObject *op) { @@ -2963,6 +2969,8 @@ static PyType_Slot array_slots[] = { {Py_tp_alloc, PyType_GenericAlloc}, {Py_tp_new, array_new}, {Py_tp_traverse, array_tp_traverse}, + {Py_tp_token, Py_TP_USE_SPEC}, + {Py_tp_binop_specialize, array_binop_specialize}, /* as sequence */ {Py_sq_length, array_length}, @@ -2995,6 +3003,70 @@ static PyType_Spec array_spec = { .slots = array_slots, }; +static inline int +array_subscr_guard(PyObject *lhs, PyObject *rhs) +{ + PyObject *exc = PyErr_GetRaisedException(); + int ret = PyType_GetBaseByToken(Py_TYPE(lhs), &array_spec, NULL); + if (ret < 0) { + if (PyErr_ExceptionMatches(PyExc_TypeError)) { + PyErr_Clear(); + ret = 0; + } + } + _PyErr_ChainExceptions1(exc); + return ret; +} + +static PyObject * +array_subscr_action(PyObject *lhs, PyObject *rhs) +{ + return array_subscr(lhs, rhs); +} + +static void +array_subscr_free(_PyBinaryOpSpecializationDescr* descr) +{ + if (descr != NULL) { + PyMem_Free(descr); + } +} + +static int +array_binop_specialize(PyObject *v, PyObject *w, int oparg, + _PyBinaryOpSpecializationDescr **descr) +{ + array_state *state = find_array_state_by_type(Py_TYPE(v)); + + if (!array_Check(v, state)) { + return 0; + } + + *descr = NULL; + switch(oparg) { + case NB_SUBSCR: + if (array_subscr_guard(v, w)) { + *descr = (_PyBinaryOpSpecializationDescr*)PyMem_Malloc( + sizeof(_PyBinaryOpSpecializationDescr)); + if (*descr == NULL) { + PyErr_NoMemory(); + return -1; + } + **descr = (_PyBinaryOpSpecializationDescr) { + .oparg = oparg, + .guard = array_subscr_guard, + .action = array_subscr_action, + .free = array_subscr_free, + }; + return 1; + } + break; + } + + return 0; +} + + /*********************** Array Iterator **************************/ /*[clinic input] diff --git a/Objects/typeslots.inc b/Objects/typeslots.inc index 642160fe0bd..f197c3f5023 100644 --- a/Objects/typeslots.inc +++ b/Objects/typeslots.inc @@ -82,3 +82,4 @@ {offsetof(PyAsyncMethods, am_send), offsetof(PyTypeObject, tp_as_async)}, {-1, offsetof(PyTypeObject, tp_vectorcall)}, {-1, offsetof(PyHeapTypeObject, ht_token)}, +{-1, offsetof(PyTypeObject, tp_binop_specialize)}, diff --git a/Python/bytecodes.c b/Python/bytecodes.c index 5a52efaaec8..cc47e57175d 100644 --- a/Python/bytecodes.c +++ b/Python/bytecodes.c @@ -801,9 +801,19 @@ dummy_func( PyObject *right_o = PyStackRef_AsPyObjectBorrow(right); _PyBinaryOpSpecializationDescr *d = (_PyBinaryOpSpecializationDescr*)descr; assert(INLINE_CACHE_ENTRIES_BINARY_OP == 5); - assert(d && d->guard); + assert(d); + assert(d->guard); int res = d->guard(left_o, right_o); - DEOPT_IF(!res); + ERROR_IF(res < 0); + if (res == 0) { + if (d->free) { + d->free(d); + } + _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(this_instr+1); + write_ptr(cache->external_cache, NULL); + this_instr->op.code = BINARY_OP; + DEOPT_IF(true); + } } pure op(_BINARY_OP_EXTEND, (descr/4, left, right -- res)) { @@ -816,6 +826,7 @@ dummy_func( PyObject *res_o = d->action(left_o, right_o); DECREF_INPUTS(); + ERROR_IF(res_o == NULL); res = PyStackRef_FromPyObjectSteal(res_o); } diff --git a/Python/executor_cases.c.h b/Python/executor_cases.c.h index 7f3c3141ad0..662e050c5c4 100644 --- a/Python/executor_cases.c.h +++ b/Python/executor_cases.c.h @@ -1147,26 +1147,7 @@ break; } - case _GUARD_BINARY_OP_EXTEND: { - _PyStackRef right; - _PyStackRef left; - right = stack_pointer[-1]; - left = stack_pointer[-2]; - PyObject *descr = (PyObject *)CURRENT_OPERAND0(); - PyObject *left_o = PyStackRef_AsPyObjectBorrow(left); - PyObject *right_o = PyStackRef_AsPyObjectBorrow(right); - _PyBinaryOpSpecializationDescr *d = (_PyBinaryOpSpecializationDescr*)descr; - assert(INLINE_CACHE_ENTRIES_BINARY_OP == 5); - assert(d && d->guard); - _PyFrame_SetStackPointer(frame, stack_pointer); - int res = d->guard(left_o, right_o); - stack_pointer = _PyFrame_GetStackPointer(frame); - if (!res) { - UOP_STAT_INC(uopcode, miss); - JUMP_TO_JUMP_TARGET(); - } - break; - } + /* _GUARD_BINARY_OP_EXTEND is not a viable micro-op for tier 2 because it uses the 'this_instr' variable */ case _BINARY_OP_EXTEND: { _PyStackRef right; @@ -1193,6 +1174,9 @@ stack_pointer = _PyFrame_GetStackPointer(frame); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); + if (res_o == NULL) { + JUMP_TO_ERROR(); + } res = PyStackRef_FromPyObjectSteal(res_o); stack_pointer[0] = res; stack_pointer += 1; diff --git a/Python/generated_cases.c.h b/Python/generated_cases.c.h index ee54b385b70..072951d2a5f 100644 --- a/Python/generated_cases.c.h +++ b/Python/generated_cases.c.h @@ -283,14 +283,30 @@ PyObject *right_o = PyStackRef_AsPyObjectBorrow(right); _PyBinaryOpSpecializationDescr *d = (_PyBinaryOpSpecializationDescr*)descr; assert(INLINE_CACHE_ENTRIES_BINARY_OP == 5); - assert(d && d->guard); + assert(d); + assert(d->guard); _PyFrame_SetStackPointer(frame, stack_pointer); int res = d->guard(left_o, right_o); stack_pointer = _PyFrame_GetStackPointer(frame); - if (!res) { - UPDATE_MISS_STATS(BINARY_OP); - assert(_PyOpcode_Deopt[opcode] == (BINARY_OP)); - JUMP_TO_PREDICTED(BINARY_OP); + if (res < 0) { + JUMP_TO_LABEL(error); + } + if (res == 0) { + if (d->free) { + _PyFrame_SetStackPointer(frame, stack_pointer); + d->free(d); + stack_pointer = _PyFrame_GetStackPointer(frame); + } + _PyBinaryOpCache *cache = (_PyBinaryOpCache *)(this_instr+1); + _PyFrame_SetStackPointer(frame, stack_pointer); + write_ptr(cache->external_cache, NULL); + stack_pointer = _PyFrame_GetStackPointer(frame); + this_instr->op.code = BINARY_OP; + if (true) { + UPDATE_MISS_STATS(BINARY_OP); + assert(_PyOpcode_Deopt[opcode] == (BINARY_OP)); + JUMP_TO_PREDICTED(BINARY_OP); + } } } /* Skip -4 cache entry */ @@ -315,6 +331,9 @@ stack_pointer = _PyFrame_GetStackPointer(frame); stack_pointer += -2; assert(WITHIN_STACK_BOUNDS()); + if (res_o == NULL) { + JUMP_TO_LABEL(error); + } res = PyStackRef_FromPyObjectSteal(res_o); } stack_pointer[0] = res; diff --git a/Python/optimizer_cases.c.h b/Python/optimizer_cases.c.h index 3f91f7eefc7..8e8b2ecfa5b 100644 --- a/Python/optimizer_cases.c.h +++ b/Python/optimizer_cases.c.h @@ -554,9 +554,7 @@ break; } - case _GUARD_BINARY_OP_EXTEND: { - break; - } + /* _GUARD_BINARY_OP_EXTEND is not a viable micro-op for tier 2 */ case _BINARY_OP_EXTEND: { JitOptSymbol *res; diff --git a/Python/specialize.c b/Python/specialize.c index 59ec9a4cad6..fe4a65ee5f8 100644 --- a/Python/specialize.c +++ b/Python/specialize.c @@ -2534,7 +2534,7 @@ LONG_FLOAT_ACTION(compactlong_float_multiply, *) LONG_FLOAT_ACTION(compactlong_float_true_div, /) #undef LONG_FLOAT_ACTION -static _PyBinaryOpSpecializationDescr binaryop_extend_descrs[] = { +static const _PyBinaryOpSpecializationDescr binaryop_extend_builtins[] = { /* long-long arithmetic */ {NB_OR, compactlongs_guard, compactlongs_or}, {NB_AND, compactlongs_guard, compactlongs_and}, @@ -2560,14 +2560,41 @@ static int binary_op_extended_specialization(PyObject *lhs, PyObject *rhs, int oparg, _PyBinaryOpSpecializationDescr **descr) { - size_t n = sizeof(binaryop_extend_descrs)/sizeof(_PyBinaryOpSpecializationDescr); - for (size_t i = 0; i < n; i++) { - _PyBinaryOpSpecializationDescr *d = &binaryop_extend_descrs[i]; + /* We are currently using this only for NB_SUBSCR, which is not + * commutative. Will need to revisit this function when we use + * this for operators which are. + */ + + typedef _PyBinaryOpSpecializationDescr descr_type; + size_t size = Py_ARRAY_LENGTH(binaryop_extend_builtins); + for (size_t i = 0; i < size; i++) { + descr_type *d = (descr_type *)&binaryop_extend_builtins[i]; + assert(d != NULL); + assert(d->guard != NULL); if (d->oparg == oparg && d->guard(lhs, rhs)) { *descr = d; return 1; } } + + PyTypeObject *lhs_type = Py_TYPE(lhs); + if (lhs_type->tp_binop_specialize != NULL) { + int ret = lhs_type->tp_binop_specialize(lhs, rhs, oparg, descr); + if (ret < 0) { + return -1; + } + if (ret == 1) { + if (*descr == NULL) { + PyErr_Format( + PyExc_ValueError, + "tp_binop_specialize of '%T' returned 1 with *descr == NULL", + lhs); + return -1; + } + (*descr)->oparg = oparg; + } + return ret; + } return 0; }