gh-104584: Emit macro expansions to opcode_metadata.h (#106163)

This produces longer traces (superblocks?). It also improves the debug output: uop names are now printed instead of numeric opcodes. (This would be simpler if the numeric opcode values were generated by generate_cases.py, but that's another project.) Finally, some code in generate_cases.py is refactored so that the essential algorithm for cache effects (deciding which effects are used and what the total cache size is, regardless of what's used) is only run once.
2025-09-18 22:50:26 +00:00 · 2023-06-28 11:28:07 -07:00 · 2023-06-28 11:28:07 -07:00 · 11731434df
commit 11731434df
parent c283a0cff5
4 changed files with 153 additions and 78 deletions
--- a/Python/ceval.c
+++ b/Python/ceval.c
@ -2817,10 +2817,10 @@ _PyUopExecute(_PyExecutorObject *executor, _PyInterpreterFrame *frame, PyObject
        oparg = (int)operand;
 #ifdef LLTRACE
        if (lltrace >= 3) {
-            const char *opname = opcode < 256 ? _PyOpcode_OpName[opcode] : "";
+            const char *opname = opcode < 256 ? _PyOpcode_OpName[opcode] : _PyOpcode_uop_name[opcode];
            int stack_level = (int)(stack_pointer - _PyFrame_Stackbase(frame));
-            fprintf(stderr, "  uop %s %d, operand %" PRIu64 ", stack_level %d\n",
+            fprintf(stderr, "  uop %s, operand %" PRIu64 ", stack_level %d\n",
-                    opname, opcode, operand, stack_level);
+                    opname, operand, stack_level);
        }
 #endif
        pc++;
--- a/Python/opcode_metadata.h
+++ b/Python/opcode_metadata.h
@ -913,6 +913,9 @@ struct opcode_macro_expansion {
 #ifndef NEED_OPCODE_METADATA
 extern const struct opcode_metadata _PyOpcode_opcode_metadata[512];
 extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256];
 #ifdef Py_DEBUG
 extern const char * const _PyOpcode_uop_name[512];
 #endif
 #else
 const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {
    [NOP] = { true, INSTR_FMT_IX, 0 },
@ -1131,10 +1134,18 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
    [STORE_FAST] = { .nuops = 1, .uops = { { STORE_FAST, 0, 0 } } },
    [POP_TOP] = { .nuops = 1, .uops = { { POP_TOP, 0, 0 } } },
    [PUSH_NULL] = { .nuops = 1, .uops = { { PUSH_NULL, 0, 0 } } },
    [END_FOR] = { .nuops = 2, .uops = { { POP_TOP, 0, 0 }, { POP_TOP, 0, 0 } } },
    [END_SEND] = { .nuops = 1, .uops = { { END_SEND, 0, 0 } } },
    [UNARY_NEGATIVE] = { .nuops = 1, .uops = { { UNARY_NEGATIVE, 0, 0 } } },
    [UNARY_NOT] = { .nuops = 1, .uops = { { UNARY_NOT, 0, 0 } } },
    [UNARY_INVERT] = { .nuops = 1, .uops = { { UNARY_INVERT, 0, 0 } } },
    [BINARY_OP_MULTIPLY_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_MULTIPLY_INT, 0, 0 } } },
    [BINARY_OP_ADD_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_ADD_INT, 0, 0 } } },
    [BINARY_OP_SUBTRACT_INT] = { .nuops = 2, .uops = { { _GUARD_BOTH_INT, 0, 0 }, { _BINARY_OP_SUBTRACT_INT, 0, 0 } } },
    [BINARY_OP_MULTIPLY_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_MULTIPLY_FLOAT, 0, 0 } } },
    [BINARY_OP_ADD_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_ADD_FLOAT, 0, 0 } } },
    [BINARY_OP_SUBTRACT_FLOAT] = { .nuops = 2, .uops = { { _GUARD_BOTH_FLOAT, 0, 0 }, { _BINARY_OP_SUBTRACT_FLOAT, 0, 0 } } },
    [BINARY_OP_ADD_UNICODE] = { .nuops = 2, .uops = { { _GUARD_BOTH_UNICODE, 0, 0 }, { _BINARY_OP_ADD_UNICODE, 0, 0 } } },
    [BINARY_SLICE] = { .nuops = 1, .uops = { { BINARY_SLICE, 0, 0 } } },
    [STORE_SLICE] = { .nuops = 1, .uops = { { STORE_SLICE, 0, 0 } } },
    [BINARY_SUBSCR_LIST_INT] = { .nuops = 1, .uops = { { BINARY_SUBSCR_LIST_INT, 0, 0 } } },
@ -1162,6 +1173,9 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
    [DELETE_ATTR] = { .nuops = 1, .uops = { { DELETE_ATTR, 0, 0 } } },
    [STORE_GLOBAL] = { .nuops = 1, .uops = { { STORE_GLOBAL, 0, 0 } } },
    [DELETE_GLOBAL] = { .nuops = 1, .uops = { { DELETE_GLOBAL, 0, 0 } } },
    [LOAD_LOCALS] = { .nuops = 1, .uops = { { _LOAD_LOCALS, 0, 0 } } },
    [LOAD_NAME] = { .nuops = 2, .uops = { { _LOAD_LOCALS, 0, 0 }, { _LOAD_FROM_DICT_OR_GLOBALS, 0, 0 } } },
    [LOAD_FROM_DICT_OR_GLOBALS] = { .nuops = 1, .uops = { { _LOAD_FROM_DICT_OR_GLOBALS, 0, 0 } } },
    [DELETE_DEREF] = { .nuops = 1, .uops = { { DELETE_DEREF, 0, 0 } } },
    [LOAD_FROM_DICT_OR_DEREF] = { .nuops = 1, .uops = { { LOAD_FROM_DICT_OR_DEREF, 0, 0 } } },
    [LOAD_DEREF] = { .nuops = 1, .uops = { { LOAD_DEREF, 0, 0 } } },
@ -1207,4 +1221,22 @@ const struct opcode_macro_expansion _PyOpcode_macro_expansion[256] = {
    [COPY] = { .nuops = 1, .uops = { { COPY, 0, 0 } } },
    [SWAP] = { .nuops = 1, .uops = { { SWAP, 0, 0 } } },
 };
 #ifdef Py_DEBUG
 // Debug-only table mapping uop numbers to their printable names.
 // Only the indices listed below are populated; every other slot is
 // implicitly NULL, so callers must index it only with a valid uop number.
 // NOTE(review): the lltrace printing code indexes this table for any
 // opcode >= 256; indices 256..299 have no entry here -- confirm those
 // values can never reach that code.
 const char * const _PyOpcode_uop_name[512] = {
    [300] = "EXIT_TRACE",
    [301] = "SET_IP",
    [302] = "_GUARD_BOTH_INT",
    [303] = "_BINARY_OP_MULTIPLY_INT",
    [304] = "_BINARY_OP_ADD_INT",
    [305] = "_BINARY_OP_SUBTRACT_INT",
    [306] = "_GUARD_BOTH_FLOAT",
    [307] = "_BINARY_OP_MULTIPLY_FLOAT",
    [308] = "_BINARY_OP_ADD_FLOAT",
    [309] = "_BINARY_OP_SUBTRACT_FLOAT",
    [310] = "_GUARD_BOTH_UNICODE",
    [311] = "_BINARY_OP_ADD_UNICODE",
    [312] = "_LOAD_LOCALS",
    [313] = "_LOAD_FROM_DICT_OR_GLOBALS",
 };
 #endif
 #endif
--- a/Python/optimizer.c
+++ b/Python/optimizer.c
@ -325,8 +325,8 @@ translate_bytecode_to_trace(
    }
 #define ADD_TO_TRACE(OPCODE, OPERAND) \
        if (lltrace >= 2) { \
-            const char *opname = (OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : ""; \
+            const char *opname = (OPCODE) < 256 ? _PyOpcode_OpName[(OPCODE)] : _PyOpcode_uop_name[(OPCODE)]; \
-            fprintf(stderr, "  ADD_TO_TRACE(%s %d, %" PRIu64 ")\n", opname, (OPCODE), (uint64_t)(OPERAND)); \
+            fprintf(stderr, "  ADD_TO_TRACE(%s, %" PRIu64 ")\n", opname, (uint64_t)(OPERAND)); \
        } \
        trace[trace_length].opcode = (OPCODE); \
        trace[trace_length].operand = (OPERAND); \
@ -474,6 +474,8 @@ PyUnstable_Optimizer_NewUOpOptimizer(void)
    }
    opt->optimize = uop_optimize;
    opt->resume_threshold = UINT16_MAX;
-    opt->backedge_threshold = 0;
+    // Need at least 3 iterations to settle specializations.
    // A few lower bits of the counter are reserved for other flags.
    opt->backedge_threshold = 3 << OPTIMIZER_BITS_IN_COUNTER;
    return (PyObject *)opt;
 }
--- a/Tools/cases_generator/generate_cases.py
+++ b/Tools/cases_generator/generate_cases.py
@ -300,6 +300,13 @@ class InstructionFlags:
                f"(_PyOpcode_opcode_metadata[(OP)].flags & ({name}))")
@dataclasses.dataclass
 class ActiveCacheEffect:
    """Wraps a CacheEffect that is actually used, in context."""
    effect: parser.CacheEffect
    offset: int
 FORBIDDEN_NAMES_IN_UOPS = (
    "resume_with_error",  # Proxy for "goto", which isn't an IDENTIFIER
    "unbound_local_error",
@ -344,6 +351,7 @@ class Instruction:
    unmoved_names: frozenset[str]
    instr_fmt: str
    instr_flags: InstructionFlags
    active_caches: list[ActiveCacheEffect]
    # Set later
    family: parser.Family | None = None
@ -375,15 +383,19 @@ class Instruction:
        self.instr_flags = InstructionFlags.fromInstruction(inst)
        self.active_caches = []
        offset = 0
        for effect in self.cache_effects:
            if effect.name != UNUSED:
                self.active_caches.append(ActiveCacheEffect(effect, offset))
            offset += effect.size
        if self.instr_flags.HAS_ARG_FLAG:
            fmt = "IB"
        else:
            fmt = "IX"
-        cache = "C"
+        if offset:
-        for ce in self.cache_effects:
+            fmt += "C" + "0"*(offset-1)
            for _ in range(ce.size):
                fmt += cache
                cache = "0"
        self.instr_fmt = fmt
    def is_viable_uop(self) -> bool:
@ -392,18 +404,11 @@ class Instruction:
            return False
        if self.instr_flags.HAS_ARG_FLAG:
            # If the instruction uses oparg, it cannot use any caches
-            for c in self.cache_effects:
+            if self.active_caches:
-                if c.name != UNUSED:
+                return False
                    return False
        else:
            # If it doesn't use oparg, it can have one cache entry
-            caches: list[parser.CacheEffect] = []
+            if len(self.active_caches) > 1:
            cache_offset = 0
            for c in self.cache_effects:
                if c.name != UNUSED:
                    caches.append(c)
                cache_offset += c.size
            if len(caches) > 1:
                return False
        for forbidden in FORBIDDEN_NAMES_IN_UOPS:
            # TODO: Don't check in '#ifdef ENABLE_SPECIALIZATION' regions
@ -458,7 +463,7 @@ class Instruction:
        # out.emit(f"next_instr += OPSIZE({self.inst.name}) - 1;")
-        self.write_body(out, 0, tier=tier)
+        self.write_body(out, 0, self.active_caches, tier=tier)
        # Skip the rest if the block always exits
        if self.always_exits:
@ -492,33 +497,30 @@ class Instruction:
            self,
            out: Formatter,
            dedent: int,
-            cache_adjust: int = 0,
+            active_caches: list[ActiveCacheEffect],
            tier: Tiers = TIER_ONE,
        ) -> None:
        """Write the instruction body."""
        # Write cache effect variable declarations and initializations
-        cache_offset = cache_adjust
+        for active in active_caches:
-        for ceffect in self.cache_effects:
+            ceffect = active.effect
-            if ceffect.name != UNUSED:
+            bits = ceffect.size * BITS_PER_CODE_UNIT
-                bits = ceffect.size * BITS_PER_CODE_UNIT
+            if bits == 64:
-                if bits == 64:
+                # NOTE: We assume that 64-bit data in the cache
-                    # NOTE: We assume that 64-bit data in the cache
+                # is always an object pointer.
-                    # is always an object pointer.
+                # If this becomes false, we need a way to specify
-                    # If this becomes false, we need a way to specify
+                # syntactically what type the cache data is.
-                    # syntactically what type the cache data is.
+                typ = "PyObject *"
-                    typ = "PyObject *"
+                func = "read_obj"
-                    func = "read_obj"
+            else:
-                else:
+                typ = f"uint{bits}_t "
-                    typ = f"uint{bits}_t "
+                func = f"read_u{bits}"
-                    func = f"read_u{bits}"
+            if tier == TIER_ONE:
-                if tier == TIER_ONE:
+                out.emit(
-                    out.emit(
+                    f"{typ}{ceffect.name} = {func}(&next_instr[{active.offset}].cache);"
-                        f"{typ}{ceffect.name} = {func}(&next_instr[{cache_offset}].cache);"
+                )
-                    )
+            else:
-                else:
+                out.emit(f"{typ}{ceffect.name} = operand;")
                    out.emit(f"{typ}{ceffect.name} = operand;")
            cache_offset += ceffect.size
        assert cache_offset == self.cache_offset + cache_adjust
        # Write the body, substituting a goto for ERROR_IF() and other stuff
        assert dedent <= 0
@ -583,8 +585,9 @@ class Component:
    instr: Instruction
    input_mapping: StackEffectMapping
    output_mapping: StackEffectMapping
    active_caches: list[ActiveCacheEffect]
-    def write_body(self, out: Formatter, cache_adjust: int) -> None:
+    def write_body(self, out: Formatter) -> None:
        with out.block(""):
            input_names = {ieffect.name for _, ieffect in self.input_mapping}
            for var, ieffect in self.input_mapping:
@ -593,7 +596,7 @@ class Component:
                if oeffect.name not in input_names:
                    out.declare(oeffect, None)
-            self.instr.write_body(out, dedent=-4, cache_adjust=cache_adjust)
+            self.instr.write_body(out, -4, self.active_caches)
            for var, oeffect in self.output_mapping:
                out.assign(var, oeffect)
@ -611,6 +614,7 @@ class MacroInstruction:
    instr_flags: InstructionFlags
    macro: parser.Macro
    parts: list[Component | parser.CacheEffect]
    cache_offset: int
    predicted: bool = False
@ -873,11 +877,11 @@ class Analyzer:
            cache = instr.cache_offset
            input = len(instr.input_effects)
            output = len(instr.output_effects)
-        elif macro := self.macro_instrs.get(name):
+        elif mac := self.macro_instrs.get(name):
-            cache, input, output = 0, 0, 0
+            cache = mac.cache_offset
-            for part in macro.parts:
+            input, output = 0, 0
            for part in mac.parts:
                if isinstance(part, Component):
                    cache += part.instr.cache_offset
                    # A component may pop what the previous component pushed,
                    # so we offset the input/output counts by that.
                    delta_i = len(part.instr.input_effects)
@ -885,9 +889,6 @@ class Analyzer:
                    offset = min(delta_i, output)
                    input += delta_i - offset
                    output += delta_o - offset
                else:
                    assert isinstance(part, parser.CacheEffect), part
                    cache += part.size
        else:
            assert False, f"Unknown instruction {name!r}"
        return cache, input, output
@ -906,29 +907,25 @@ class Analyzer:
        stack, initial_sp = self.stack_analysis(components)
        sp = initial_sp
        parts: list[Component | parser.CacheEffect] = []
        format = "IB"
        flags = InstructionFlags.newEmpty()
-        cache = "C"
+        offset = 0
        for component in components:
            match component:
                case parser.CacheEffect() as ceffect:
                    parts.append(ceffect)
-                    for _ in range(ceffect.size):
+                    offset += ceffect.size
                        format += cache
                        cache = "0"
                case Instruction() as instr:
-                    part, sp = self.analyze_instruction(instr, stack, sp)
+                    part, sp, offset = self.analyze_instruction(instr, stack, sp, offset)
                    parts.append(part)
                    for ce in instr.cache_effects:
                        for _ in range(ce.size):
                            format += cache
                            cache = "0"
                    flags.add(instr.instr_flags)
                case _:
                    typing.assert_never(component)
        final_sp = sp
        format = "IB"
        if offset:
            format += "C" + "0"*(offset-1)
        return MacroInstruction(
-            macro.name, stack, initial_sp, final_sp, format, flags, macro, parts
+            macro.name, stack, initial_sp, final_sp, format, flags, macro, parts, offset
        )
    def analyze_pseudo(self, pseudo: parser.Pseudo) -> PseudoInstruction:
@ -941,8 +938,8 @@ class Analyzer:
        return PseudoInstruction(pseudo.name, targets, fmts[0], targets[0].instr_flags)
    def analyze_instruction(
-        self, instr: Instruction, stack: list[StackEffect], sp: int
+        self, instr: Instruction, stack: list[StackEffect], sp: int, offset: int
-    ) -> tuple[Component, int]:
+    ) -> tuple[Component, int, int]:
        input_mapping: StackEffectMapping = []
        for ieffect in reversed(instr.input_effects):
            sp -= 1
@ -951,7 +948,12 @@ class Analyzer:
        for oeffect in instr.output_effects:
            output_mapping.append((stack[sp], oeffect))
            sp += 1
-        return Component(instr, input_mapping, output_mapping), sp
+        active_effects: list[ActiveCacheEffect] = []
        for ceffect in instr.cache_effects:
            if ceffect.name != UNUSED:
                active_effects.append(ActiveCacheEffect(ceffect, offset))
            offset += ceffect.size
        return Component(instr, input_mapping, output_mapping, active_effects), sp, offset
    def check_macro_components(
        self, macro: parser.Macro
@ -1030,7 +1032,7 @@ class Analyzer:
    def get_stack_effect_info(
        self, thing: parser.InstDef | parser.Macro | parser.Pseudo
-    ) -> tuple[AnyInstruction | None, str, str]:
+    ) -> tuple[AnyInstruction | None, str | None, str | None]:
        def effect_str(effects: list[StackEffect]) -> str:
            n_effect, sym_effect = list_effect_size(effects)
            if sym_effect:
@ -1108,6 +1110,7 @@ class Analyzer:
                continue
            instr, popped, pushed = self.get_stack_effect_info(thing)
            if instr is not None:
                assert popped is not None and pushed is not None
                popped_data.append((instr, popped))
                pushed_data.append((instr, pushed))
@ -1182,7 +1185,8 @@ class Analyzer:
            self.write_pseudo_instrs()
-            self.write_uop_defines()
+            self.out.emit("")
            self.write_uop_items(lambda name, counter: f"#define {name} {counter}")
            self.write_stack_effect_functions()
@ -1213,6 +1217,9 @@ class Analyzer:
            self.out.emit("#ifndef NEED_OPCODE_METADATA")
            self.out.emit("extern const struct opcode_metadata _PyOpcode_opcode_metadata[512];")
            self.out.emit("extern const struct opcode_macro_expansion _PyOpcode_macro_expansion[256];")
            self.out.emit("#ifdef Py_DEBUG")
            self.out.emit("extern const char * const _PyOpcode_uop_name[512];")
            self.out.emit("#endif")
            self.out.emit("#else")
            self.out.emit("const struct opcode_metadata _PyOpcode_opcode_metadata[512] = {")
@ -1246,19 +1253,27 @@ class Analyzer:
                            pass
                        case parser.InstDef(name=name):
                            instr = self.instrs[name]
                            # Since an 'op' is not a bytecode, it has no expansion
                            if instr.kind != "op" and instr.is_viable_uop():
                                # Double check there aren't any used cache effects.
                                # If this fails, see write_macro_expansions().
                                assert not instr.active_caches, (instr.name, instr.cache_effects)
                                self.out.emit(
                                    f"[{name}] = "
                                    f"{{ .nuops = 1, .uops = {{ {{ {name}, 0, 0 }} }} }},"
                                )
                        case parser.Macro():
-                            # TODO: emit expansion if all parts are viable uops
+                            self.write_macro_expansions(self.macro_instrs[thing.name])
                            pass
                        case parser.Pseudo():
                            pass
                        case _:
                            typing.assert_never(thing)
            self.out.emit("#ifdef Py_DEBUG")
            with self.out.block("const char * const _PyOpcode_uop_name[512] =", ";"):
                self.write_uop_items(lambda name, counter: f"[{counter}] = \"{name}\",")
            self.out.emit("#endif")
            self.out.emit("#endif")
        with open(self.pymetadata_filename, "w") as f:
@ -1300,13 +1315,12 @@ class Analyzer:
            self.out.emit(f"    ((OP) == {op}) || \\")
        self.out.emit(f"    0")
-    def write_uop_defines(self) -> None:
+    def write_uop_items(self, make_text: typing.Callable[[str, int], str]) -> None:
        """Write '#define XXX NNN' for each uop"""
-        self.out.emit("")
+        counter = 300  # TODO: Avoid collision with pseudo instructions
        counter = 300
        def add(name: str) -> None:
            nonlocal counter
-            self.out.emit(f"#define {name} {counter}")
+            self.out.emit(make_text(name, counter))
            counter += 1
        add("EXIT_TRACE")
        add("SET_IP")
@ -1314,6 +1328,32 @@ class Analyzer:
            if instr.kind == "op" and instr.is_viable_uop():
                add(instr.name)
    def write_macro_expansions(self, mac: MacroInstruction) -> None:
        """Write the macro expansion for a macro-instruction.

        Emits one initializer line of the form
        ``[NAME] = { .nuops = N, .uops = { { uop, size, offset }, ... } },``
        containing one ``{ uop, size, offset }`` triple per Component part
        of ``mac``.  Parts that are not Components (bare cache effects)
        contribute no triple.

        If any component instruction is not a viable uop, a NOTE is printed
        and nothing is emitted for this macro.
        """
        # TODO: Refactor to share code with write_body(), is_viable_uop(), etc.
        offset = 0  # Cache effect offset
        expansions: list[tuple[str, int, int]] = []  # [(name, size, offset), ...]
        for part in mac.parts:
            if isinstance(part, Component):
                # All component instructions must be viable uops
                if not part.instr.is_viable_uop():
                    print(f"NOTE: Part {part.instr.name} of {mac.name} is not a viable uop")
                    return
                # A uop that uses oparg, or that has no active cache effect,
                # is recorded with size 0 and offset 0.
                if part.instr.instr_flags.HAS_ARG_FLAG or not part.active_caches:
                    size, offset = 0, 0
                else:
                    # If this assert triggers, is_viable_uop() lied
                    assert len(part.active_caches) == 1, (mac.name, part.instr.name)
                    # Use the size and offset of the single active cache
                    # effect recorded on this Component.
                    cache = part.active_caches[0]
                    size, offset = cache.effect.size, cache.offset
                expansions.append((part.instr.name, size, offset))
        assert len(expansions) > 0, f"Macro {mac.name} has empty expansion?!"
        # Render each triple as a C aggregate initializer.
        pieces = [f"{{ {name}, {size}, {offset} }}" for name, size, offset in expansions]
        self.out.emit(
            f"[{mac.name}] = "
            f"{{ .nuops = {len(expansions)}, .uops = {{ {', '.join(pieces)} }} }},"
        )
    def emit_metadata_entry(
        self, name: str, fmt: str, flags: InstructionFlags
    ) -> None:
@ -1379,6 +1419,7 @@ class Analyzer:
            for thing in self.everything:
                match thing:
                    case OverriddenInstructionPlaceHolder():
                        # TODO: Is this helpful?
                        self.write_overridden_instr_place_holder(thing)
                    case parser.InstDef():
                        instr = self.instrs[thing.name]
@ -1388,7 +1429,7 @@ class Analyzer:
                                instr.write(self.out, tier=TIER_TWO)
                                self.out.emit("break;")
                    case parser.Macro():
-                        pass  # TODO
+                        pass
                    case parser.Pseudo():
                        pass
                    case _:
@ -1429,7 +1470,7 @@ class Analyzer:
                        cache_adjust += size
                    case Component() as comp:
                        last_instr = comp.instr
-                        comp.write_body(self.out, cache_adjust)
+                        comp.write_body(self.out)
                        cache_adjust += comp.instr.cache_offset
            if cache_adjust: