gh-115999: Refactor LOAD_GLOBAL specializations to avoid reloading {globals, builtins} keys (gh-124953)

Each of the `LOAD_GLOBAL` specializations is implemented roughly as:

1. Load keys version.
2. Load cached keys version.
3. Deopt if (1) and (2) don't match.
4. Load keys.
5. Load cached index into keys.
6. Load object from (4) at offset from (5).

This is not thread-safe in free-threaded builds; the keys object may be replaced
in between steps (3) and (4).

This change refactors the specializations to avoid reloading the keys object and
instead pass the keys object from guards to be consumed by downstream uops.
This commit is contained in:
mpage 2024-10-09 08:18:25 -07:00 committed by GitHub
parent b9a8ca0a6a
commit f978fb4f8d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 379 additions and 162 deletions

View file

@ -1569,17 +1569,29 @@ dummy_func(
assert(DK_IS_UNICODE(dict->ma_keys));
}
op(_GUARD_BUILTINS_VERSION, (version/1 --)) {
op(_GUARD_GLOBALS_VERSION_PUSH_KEYS, (version / 1 -- globals_keys: PyDictKeysObject *))
{
PyDictObject *dict = (PyDictObject *)GLOBALS();
DEOPT_IF(!PyDict_CheckExact(dict));
DEOPT_IF(dict->ma_keys->dk_version != version);
globals_keys = dict->ma_keys;
assert(DK_IS_UNICODE(globals_keys));
}
op(_GUARD_BUILTINS_VERSION_PUSH_KEYS, (version / 1 -- builtins_keys: PyDictKeysObject *))
{
PyDictObject *dict = (PyDictObject *)BUILTINS();
DEOPT_IF(!PyDict_CheckExact(dict));
DEOPT_IF(dict->ma_keys->dk_version != version);
assert(DK_IS_UNICODE(dict->ma_keys));
builtins_keys = dict->ma_keys;
assert(DK_IS_UNICODE(builtins_keys));
}
op(_LOAD_GLOBAL_MODULE, (index/1 -- res, null if (oparg & 1))) {
PyDictObject *dict = (PyDictObject *)GLOBALS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys);
op(_LOAD_GLOBAL_MODULE_FROM_KEYS, (index/1, globals_keys: PyDictKeysObject* -- res, null if (oparg & 1))) {
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(globals_keys);
PyObject *res_o = entries[index].me_value;
DEAD(globals_keys);
SYNC_SP();
DEOPT_IF(res_o == NULL);
Py_INCREF(res_o);
STAT_INC(LOAD_GLOBAL, hit);
@ -1587,10 +1599,11 @@ dummy_func(
res = PyStackRef_FromPyObjectSteal(res_o);
}
op(_LOAD_GLOBAL_BUILTINS, (index/1 -- res, null if (oparg & 1))) {
PyDictObject *bdict = (PyDictObject *)BUILTINS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(bdict->ma_keys);
op(_LOAD_GLOBAL_BUILTINS_FROM_KEYS, (index/1, builtins_keys: PyDictKeysObject* -- res, null if (oparg & 1))) {
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(builtins_keys);
PyObject *res_o = entries[index].me_value;
DEAD(builtins_keys);
SYNC_SP();
DEOPT_IF(res_o == NULL);
Py_INCREF(res_o);
STAT_INC(LOAD_GLOBAL, hit);
@ -1600,15 +1613,15 @@ dummy_func(
macro(LOAD_GLOBAL_MODULE) =
unused/1 + // Skip over the counter
_GUARD_GLOBALS_VERSION +
_GUARD_GLOBALS_VERSION_PUSH_KEYS +
unused/1 + // Skip over the builtins version
_LOAD_GLOBAL_MODULE;
_LOAD_GLOBAL_MODULE_FROM_KEYS;
macro(LOAD_GLOBAL_BUILTIN) =
unused/1 + // Skip over the counter
_GUARD_GLOBALS_VERSION +
_GUARD_BUILTINS_VERSION +
_LOAD_GLOBAL_BUILTINS;
_GUARD_BUILTINS_VERSION_PUSH_KEYS +
_LOAD_GLOBAL_BUILTINS_FROM_KEYS;
inst(DELETE_FAST, (--)) {
_PyStackRef v = GETLOCAL(oparg);
@ -4871,6 +4884,26 @@ dummy_func(
DEOPT_IF(func->func_version != func_version);
}
tier2 op(_LOAD_GLOBAL_MODULE, (index/1 -- res, null if (oparg & 1))) {
PyDictObject *dict = (PyDictObject *)GLOBALS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys);
PyObject *res_o = entries[index].me_value;
DEOPT_IF(res_o == NULL);
Py_INCREF(res_o);
res = PyStackRef_FromPyObjectSteal(res_o);
null = PyStackRef_NULL;
}
tier2 op(_LOAD_GLOBAL_BUILTINS, (index/1 -- res, null if (oparg & 1))) {
PyDictObject *dict = (PyDictObject *)BUILTINS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys);
PyObject *res_o = entries[index].me_value;
DEOPT_IF(res_o == NULL);
Py_INCREF(res_o);
res = PyStackRef_FromPyObjectSteal(res_o);
null = PyStackRef_NULL;
}
/* Internal -- for testing executors */
op(_INTERNAL_INCREMENT_OPT_COUNTER, (opt --)) {
_PyCounterOptimizerObject *exe = (_PyCounterOptimizerObject *)PyStackRef_AsPyObjectBorrow(opt);

View file

@ -1791,7 +1791,28 @@
break;
}
case _GUARD_BUILTINS_VERSION: {
case _GUARD_GLOBALS_VERSION_PUSH_KEYS: {
PyDictKeysObject *globals_keys;
uint16_t version = (uint16_t)CURRENT_OPERAND();
PyDictObject *dict = (PyDictObject *)GLOBALS();
if (!PyDict_CheckExact(dict)) {
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
}
if (dict->ma_keys->dk_version != version) {
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
}
globals_keys = dict->ma_keys;
assert(DK_IS_UNICODE(globals_keys));
stack_pointer[0].bits = (uintptr_t)globals_keys;
stack_pointer += 1;
assert(WITHIN_STACK_BOUNDS());
break;
}
case _GUARD_BUILTINS_VERSION_PUSH_KEYS: {
PyDictKeysObject *builtins_keys;
uint16_t version = (uint16_t)CURRENT_OPERAND();
PyDictObject *dict = (PyDictObject *)BUILTINS();
if (!PyDict_CheckExact(dict)) {
@ -1802,18 +1823,25 @@
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
}
assert(DK_IS_UNICODE(dict->ma_keys));
builtins_keys = dict->ma_keys;
assert(DK_IS_UNICODE(builtins_keys));
stack_pointer[0].bits = (uintptr_t)builtins_keys;
stack_pointer += 1;
assert(WITHIN_STACK_BOUNDS());
break;
}
case _LOAD_GLOBAL_MODULE: {
case _LOAD_GLOBAL_MODULE_FROM_KEYS: {
PyDictKeysObject *globals_keys;
_PyStackRef res;
_PyStackRef null = PyStackRef_NULL;
oparg = CURRENT_OPARG();
globals_keys = (PyDictKeysObject *)stack_pointer[-1].bits;
uint16_t index = (uint16_t)CURRENT_OPERAND();
PyDictObject *dict = (PyDictObject *)GLOBALS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys);
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(globals_keys);
PyObject *res_o = entries[index].me_value;
stack_pointer += -1;
assert(WITHIN_STACK_BOUNDS());
if (res_o == NULL) {
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
@ -1829,14 +1857,17 @@
break;
}
case _LOAD_GLOBAL_BUILTINS: {
case _LOAD_GLOBAL_BUILTINS_FROM_KEYS: {
PyDictKeysObject *builtins_keys;
_PyStackRef res;
_PyStackRef null = PyStackRef_NULL;
oparg = CURRENT_OPARG();
builtins_keys = (PyDictKeysObject *)stack_pointer[-1].bits;
uint16_t index = (uint16_t)CURRENT_OPERAND();
PyDictObject *bdict = (PyDictObject *)BUILTINS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(bdict->ma_keys);
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(builtins_keys);
PyObject *res_o = entries[index].me_value;
stack_pointer += -1;
assert(WITHIN_STACK_BOUNDS());
if (res_o == NULL) {
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
@ -5698,6 +5729,50 @@
break;
}
case _LOAD_GLOBAL_MODULE: {
_PyStackRef res;
_PyStackRef null = PyStackRef_NULL;
oparg = CURRENT_OPARG();
uint16_t index = (uint16_t)CURRENT_OPERAND();
PyDictObject *dict = (PyDictObject *)GLOBALS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys);
PyObject *res_o = entries[index].me_value;
if (res_o == NULL) {
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
}
Py_INCREF(res_o);
res = PyStackRef_FromPyObjectSteal(res_o);
null = PyStackRef_NULL;
stack_pointer[0] = res;
if (oparg & 1) stack_pointer[1] = null;
stack_pointer += 1 + (oparg & 1);
assert(WITHIN_STACK_BOUNDS());
break;
}
case _LOAD_GLOBAL_BUILTINS: {
_PyStackRef res;
_PyStackRef null = PyStackRef_NULL;
oparg = CURRENT_OPARG();
uint16_t index = (uint16_t)CURRENT_OPERAND();
PyDictObject *dict = (PyDictObject *)BUILTINS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys);
PyObject *res_o = entries[index].me_value;
if (res_o == NULL) {
UOP_STAT_INC(uopcode, miss);
JUMP_TO_JUMP_TARGET();
}
Py_INCREF(res_o);
res = PyStackRef_FromPyObjectSteal(res_o);
null = PyStackRef_NULL;
stack_pointer[0] = res;
if (oparg & 1) stack_pointer[1] = null;
stack_pointer += 1 + (oparg & 1);
assert(WITHIN_STACK_BOUNDS());
break;
}
case _INTERNAL_INCREMENT_OPT_COUNTER: {
_PyStackRef opt;
opt = stack_pointer[-1];

View file

@ -6146,6 +6146,7 @@
next_instr += 5;
INSTRUCTION_STATS(LOAD_GLOBAL_BUILTIN);
static_assert(INLINE_CACHE_ENTRIES_LOAD_GLOBAL == 4, "incorrect cache size");
PyDictKeysObject *builtins_keys;
_PyStackRef res;
_PyStackRef null = PyStackRef_NULL;
/* Skip 1 cache entry */
@ -6157,19 +6158,19 @@
DEOPT_IF(dict->ma_keys->dk_version != version, LOAD_GLOBAL);
assert(DK_IS_UNICODE(dict->ma_keys));
}
// _GUARD_BUILTINS_VERSION
// _GUARD_BUILTINS_VERSION_PUSH_KEYS
{
uint16_t version = read_u16(&this_instr[3].cache);
PyDictObject *dict = (PyDictObject *)BUILTINS();
DEOPT_IF(!PyDict_CheckExact(dict), LOAD_GLOBAL);
DEOPT_IF(dict->ma_keys->dk_version != version, LOAD_GLOBAL);
assert(DK_IS_UNICODE(dict->ma_keys));
builtins_keys = dict->ma_keys;
assert(DK_IS_UNICODE(builtins_keys));
}
// _LOAD_GLOBAL_BUILTINS
// _LOAD_GLOBAL_BUILTINS_FROM_KEYS
{
uint16_t index = read_u16(&this_instr[4].cache);
PyDictObject *bdict = (PyDictObject *)BUILTINS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(bdict->ma_keys);
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(builtins_keys);
PyObject *res_o = entries[index].me_value;
DEOPT_IF(res_o == NULL, LOAD_GLOBAL);
Py_INCREF(res_o);
@ -6189,23 +6190,24 @@
next_instr += 5;
INSTRUCTION_STATS(LOAD_GLOBAL_MODULE);
static_assert(INLINE_CACHE_ENTRIES_LOAD_GLOBAL == 4, "incorrect cache size");
PyDictKeysObject *globals_keys;
_PyStackRef res;
_PyStackRef null = PyStackRef_NULL;
/* Skip 1 cache entry */
// _GUARD_GLOBALS_VERSION
// _GUARD_GLOBALS_VERSION_PUSH_KEYS
{
uint16_t version = read_u16(&this_instr[2].cache);
PyDictObject *dict = (PyDictObject *)GLOBALS();
DEOPT_IF(!PyDict_CheckExact(dict), LOAD_GLOBAL);
DEOPT_IF(dict->ma_keys->dk_version != version, LOAD_GLOBAL);
assert(DK_IS_UNICODE(dict->ma_keys));
globals_keys = dict->ma_keys;
assert(DK_IS_UNICODE(globals_keys));
}
/* Skip 1 cache entry */
// _LOAD_GLOBAL_MODULE
// _LOAD_GLOBAL_MODULE_FROM_KEYS
{
uint16_t index = read_u16(&this_instr[4].cache);
PyDictObject *dict = (PyDictObject *)GLOBALS();
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(dict->ma_keys);
PyDictUnicodeEntry *entries = DK_UNICODE_ENTRIES(globals_keys);
PyObject *res_o = entries[index].me_value;
DEOPT_IF(res_o == NULL, LOAD_GLOBAL);
Py_INCREF(res_o);

View file

@ -131,6 +131,26 @@ incorrect_keys(_PyUOpInstruction *inst, PyObject *obj)
return 0;
}
static int
check_next_uop(_PyUOpInstruction *buffer, int size, int pc, uint16_t expected)
{
if (pc + 1 >= size) {
DPRINTF(1, "Cannot rewrite %s at pc %d: buffer too small\n",
_PyOpcode_uop_name[buffer[pc].opcode], pc);
return 0;
}
uint16_t next_opcode = buffer[pc + 1].opcode;
if (next_opcode != expected) {
DPRINTF(1,
"Cannot rewrite %s at pc %d: unexpected next opcode %s, "
"expected %s\n",
_PyOpcode_uop_name[buffer[pc].opcode], pc,
_PyOpcode_uop_name[next_opcode], _PyOpcode_uop_name[expected]);
return 0;
}
return 1;
}
/* Returns 1 if successfully optimized
* 0 if the trace is not suitable for optimization (yet)
* -1 if there was an error. */
@ -174,7 +194,7 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer,
_PyUOpInstruction *inst = &buffer[pc];
int opcode = inst->opcode;
switch(opcode) {
case _GUARD_BUILTINS_VERSION:
case _GUARD_BUILTINS_VERSION_PUSH_KEYS:
if (incorrect_keys(inst, builtins)) {
OPT_STAT_INC(remove_globals_incorrect_keys);
return 0;
@ -182,6 +202,10 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer,
if (interp->rare_events.builtin_dict >= _Py_MAX_ALLOWED_BUILTINS_MODIFICATIONS) {
continue;
}
if (!check_next_uop(buffer, buffer_size, pc,
_LOAD_GLOBAL_BUILTINS_FROM_KEYS)) {
continue;
}
if ((builtins_watched & 1) == 0) {
PyDict_Watch(BUILTINS_WATCHER_ID, builtins);
builtins_watched |= 1;
@ -194,8 +218,13 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer,
buffer[pc].operand = function_version;
function_checked |= 1;
}
// We're no longer pushing the builtins keys; rewrite the
// instruction that consumed the keys to load them from the
// frame.
buffer[pc + 1].opcode = _LOAD_GLOBAL_BUILTINS;
break;
case _GUARD_GLOBALS_VERSION:
case _GUARD_GLOBALS_VERSION_PUSH_KEYS:
if (incorrect_keys(inst, globals)) {
OPT_STAT_INC(remove_globals_incorrect_keys);
return 0;
@ -204,6 +233,11 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer,
if (watched_mutations >= _Py_MAX_ALLOWED_GLOBALS_MODIFICATIONS) {
continue;
}
if (opcode == _GUARD_GLOBALS_VERSION_PUSH_KEYS &&
!check_next_uop(buffer, buffer_size, pc,
_LOAD_GLOBAL_MODULE_FROM_KEYS)) {
continue;
}
if ((globals_watched & 1) == 0) {
PyDict_Watch(GLOBALS_WATCHER_ID, globals);
_Py_BloomFilter_Add(dependencies, globals);
@ -217,6 +251,12 @@ remove_globals(_PyInterpreterFrame *frame, _PyUOpInstruction *buffer,
buffer[pc].operand = function_version;
function_checked |= 1;
}
if (opcode == _GUARD_GLOBALS_VERSION_PUSH_KEYS) {
// We're no longer pushing the globals keys; rewrite the
// instruction that consumed the keys to load them from the
// frame.
buffer[pc + 1].opcode = _LOAD_GLOBAL_MODULE;
}
break;
case _LOAD_GLOBAL_BUILTINS:
if (function_checked & globals_watched & builtins_watched & 1) {

View file

@ -836,6 +836,16 @@ dummy_func(void) {
ctx->done = true;
}
op(_GUARD_GLOBALS_VERSION_PUSH_KEYS, (version/1 -- globals_keys)) {
globals_keys = sym_new_unknown(ctx);
(void)version;
}
op(_GUARD_BUILTINS_VERSION_PUSH_KEYS, (version/1 -- builtins_keys)) {
builtins_keys = sym_new_unknown(ctx);
(void)version;
}
// END BYTECODES //
}

View file

@ -844,30 +844,48 @@
break;
}
case _GUARD_BUILTINS_VERSION: {
break;
}
case _LOAD_GLOBAL_MODULE: {
_Py_UopsSymbol *res;
_Py_UopsSymbol *null = NULL;
res = sym_new_not_null(ctx);
null = sym_new_null(ctx);
stack_pointer[0] = res;
if (oparg & 1) stack_pointer[1] = null;
stack_pointer += 1 + (oparg & 1);
case _GUARD_GLOBALS_VERSION_PUSH_KEYS: {
_Py_UopsSymbol *globals_keys;
uint16_t version = (uint16_t)this_instr->operand;
globals_keys = sym_new_unknown(ctx);
(void)version;
stack_pointer[0] = globals_keys;
stack_pointer += 1;
assert(WITHIN_STACK_BOUNDS());
break;
}
case _LOAD_GLOBAL_BUILTINS: {
case _GUARD_BUILTINS_VERSION_PUSH_KEYS: {
_Py_UopsSymbol *builtins_keys;
uint16_t version = (uint16_t)this_instr->operand;
builtins_keys = sym_new_unknown(ctx);
(void)version;
stack_pointer[0] = builtins_keys;
stack_pointer += 1;
assert(WITHIN_STACK_BOUNDS());
break;
}
case _LOAD_GLOBAL_MODULE_FROM_KEYS: {
_Py_UopsSymbol *res;
_Py_UopsSymbol *null = NULL;
res = sym_new_not_null(ctx);
null = sym_new_null(ctx);
stack_pointer[0] = res;
if (oparg & 1) stack_pointer[1] = null;
stack_pointer += 1 + (oparg & 1);
stack_pointer[-1] = res;
if (oparg & 1) stack_pointer[0] = null;
stack_pointer += (oparg & 1);
assert(WITHIN_STACK_BOUNDS());
break;
}
case _LOAD_GLOBAL_BUILTINS_FROM_KEYS: {
_Py_UopsSymbol *res;
_Py_UopsSymbol *null = NULL;
res = sym_new_not_null(ctx);
null = sym_new_null(ctx);
stack_pointer[-1] = res;
if (oparg & 1) stack_pointer[0] = null;
stack_pointer += (oparg & 1);
assert(WITHIN_STACK_BOUNDS());
break;
}
@ -2419,6 +2437,30 @@
break;
}
case _LOAD_GLOBAL_MODULE: {
_Py_UopsSymbol *res;
_Py_UopsSymbol *null = NULL;
res = sym_new_not_null(ctx);
null = sym_new_null(ctx);
stack_pointer[0] = res;
if (oparg & 1) stack_pointer[1] = null;
stack_pointer += 1 + (oparg & 1);
assert(WITHIN_STACK_BOUNDS());
break;
}
case _LOAD_GLOBAL_BUILTINS: {
_Py_UopsSymbol *res;
_Py_UopsSymbol *null = NULL;
res = sym_new_not_null(ctx);
null = sym_new_null(ctx);
stack_pointer[0] = res;
if (oparg & 1) stack_pointer[1] = null;
stack_pointer += 1 + (oparg & 1);
assert(WITHIN_STACK_BOUNDS());
break;
}
case _INTERNAL_INCREMENT_OPT_COUNTER: {
stack_pointer += -1;
assert(WITHIN_STACK_BOUNDS());