Thoroughly refactor the cases generator (#107151)

This mostly extracts a whole bunch of stuff out of generate_cases.py into separate files, but there are a few other things going on here. - analysis.py: `Analyzer` etc. - instructions.py: `Instruction` etc. - flags.py: `InstructionFlags`, `variable_used`, `variable_used_unspecialized` - formatting.py: `Formatter` etc. - Rename parser.py to parsing.py, to avoid conflict with stdlib parser.py - Blackify most things - Fix most mypy errors - Remove output filenames from Generator state, add them to `write_instructions()` etc. - Fix unit tests
2025-08-31 14:07:50 +00:00 · 2023-07-24 09:38:23 -07:00 · 2023-07-24 09:38:23 -07:00 · 032f480909
commit 032f480909
parent ff5f94b72c
7 changed files with 1304 additions and 1169 deletions
--- a/Tools/cases_generator/analysis.py
+++ b/Tools/cases_generator/analysis.py
@ -0,0 +1,412 @@
+import re
+import sys
+import typing
+
+from flags import InstructionFlags, variable_used
+from formatting import prettify_filename, UNUSED
+from instructions import (
+    ActiveCacheEffect,
+    Component,
+    Instruction,
+    InstructionOrCacheEffect,
+    MacroInstruction,
+    MacroParts,
+    OverriddenInstructionPlaceHolder,
+    PseudoInstruction,
+    StackEffectMapping,
+)
+import parsing
+from parsing import StackEffect
+
+BEGIN_MARKER = "// BEGIN BYTECODES //"
+END_MARKER = "// END BYTECODES //"
+
+RESERVED_WORDS = {
+    "co_consts": "Use FRAME_CO_CONSTS.",
+    "co_names": "Use FRAME_CO_NAMES.",
+}
+
+RE_PREDICTED = r"^\s*(?:GO_TO_INSTRUCTION\(|DEOPT_IF\(.*?,\s*)(\w+)\);\s*(?://.*)?$"
+
+
+class Analyzer:
+    """Parse input, analyze it, and write to output."""
+
+    input_filenames: list[str]
+    errors: int = 0
+
+    def __init__(self, input_filenames: list[str]):
+        self.input_filenames = input_filenames
+
+    def error(self, msg: str, node: parsing.Node) -> None:
+        lineno = 0
+        filename = "<unknown file>"
+        if context := node.context:
+            filename = context.owner.filename
+            # Use line number of first non-comment in the node
+            for token in context.owner.tokens[context.begin : context.end]:
+                lineno = token.line
+                if token.kind != "COMMENT":
+                    break
+        print(f"{filename}:{lineno}: {msg}", file=sys.stderr)
+        self.errors += 1
+
+    everything: list[
+        parsing.InstDef
+        | parsing.Macro
+        | parsing.Pseudo
+        | OverriddenInstructionPlaceHolder
+    ]
+    instrs: dict[str, Instruction]  # Includes ops
+    macros: dict[str, parsing.Macro]
+    macro_instrs: dict[str, MacroInstruction]
+    families: dict[str, parsing.Family]
+    pseudos: dict[str, parsing.Pseudo]
+    pseudo_instrs: dict[str, PseudoInstruction]
+
+    def parse(self) -> None:
+        """Parse the source text.
+
+        We only want the parser to see the stuff between the
+        begin and end markers.
+        """
+
+        self.everything = []
+        self.instrs = {}
+        self.macros = {}
+        self.families = {}
+        self.pseudos = {}
+
+        instrs_idx: dict[str, int] = dict()
+
+        for filename in self.input_filenames:
+            self.parse_file(filename, instrs_idx)
+
+        files = " + ".join(self.input_filenames)
+        print(
+            f"Read {len(self.instrs)} instructions/ops, "
+            f"{len(self.macros)} macros, {len(self.pseudos)} pseudos, "
+            f"and {len(self.families)} families from {files}",
+            file=sys.stderr,
+        )
+
+    def parse_file(self, filename: str, instrs_idx: dict[str, int]) -> None:
+        with open(filename) as file:
+            src = file.read()
+
+        psr = parsing.Parser(src, filename=prettify_filename(filename))
+
+        # Skip until begin marker
+        while tkn := psr.next(raw=True):
+            if tkn.text == BEGIN_MARKER:
+                break
+        else:
+            raise psr.make_syntax_error(
+                f"Couldn't find {BEGIN_MARKER!r} in {psr.filename}"
+            )
+        start = psr.getpos()
+
+        # Find end marker, then delete everything after it
+        while tkn := psr.next(raw=True):
+            if tkn.text == END_MARKER:
+                break
+        del psr.tokens[psr.getpos() - 1 :]
+
+        # Parse from start
+        psr.setpos(start)
+        thing: parsing.Node | None
+        thing_first_token = psr.peek()
+        while thing := psr.definition():
+            thing = typing.cast(
+                parsing.InstDef | parsing.Macro | parsing.Pseudo | parsing.Family, thing
+            )
+            if ws := [w for w in RESERVED_WORDS if variable_used(thing, w)]:
+                self.error(
+                    f"'{ws[0]}' is a reserved word. {RESERVED_WORDS[ws[0]]}", thing
+                )
+
+            match thing:
+                case parsing.InstDef(name=name):
+                    if name in self.instrs:
+                        if not thing.override:
+                            raise psr.make_syntax_error(
+                                f"Duplicate definition of '{name}' @ {thing.context} "
+                                f"previous definition @ {self.instrs[name].inst.context}",
+                                thing_first_token,
+                            )
+                        self.everything[
+                            instrs_idx[name]
+                        ] = OverriddenInstructionPlaceHolder(name=name)
+                    if name not in self.instrs and thing.override:
+                        raise psr.make_syntax_error(
+                            f"Definition of '{name}' @ {thing.context} is supposed to be "
+                            "an override but no previous definition exists.",
+                            thing_first_token,
+                        )
+                    self.instrs[name] = Instruction(thing)
+                    instrs_idx[name] = len(self.everything)
+                    self.everything.append(thing)
+                case parsing.Macro(name):
+                    self.macros[name] = thing
+                    self.everything.append(thing)
+                case parsing.Family(name):
+                    self.families[name] = thing
+                case parsing.Pseudo(name):
+                    self.pseudos[name] = thing
+                    self.everything.append(thing)
+                case _:
+                    typing.assert_never(thing)
+        if not psr.eof():
+            raise psr.make_syntax_error(f"Extra stuff at the end of {filename}")
+
+    def analyze(self) -> None:
+        """Analyze the inputs.
+
+        Raises SystemExit if there is an error.
+        """
+        self.analyze_macros_and_pseudos()
+        self.find_predictions()
+        self.map_families()
+        self.check_families()
+
+    def find_predictions(self) -> None:
+        """Find the instructions that need PREDICTED() labels."""
+        for instr in self.instrs.values():
+            targets: set[str] = set()
+            for line in instr.block_text:
+                if m := re.match(RE_PREDICTED, line):
+                    targets.add(m.group(1))
+            for target in targets:
+                if target_instr := self.instrs.get(target):
+                    target_instr.predicted = True
+                elif target_macro := self.macro_instrs.get(target):
+                    target_macro.predicted = True
+                else:
+                    self.error(
+                        f"Unknown instruction {target!r} predicted in {instr.name!r}",
+                        instr.inst,  # TODO: Use better location
+                    )
+
+    def map_families(self) -> None:
+        """Link instruction names back to their family, if they have one."""
+        for family in self.families.values():
+            for member in [family.name] + family.members:
+                if member_instr := self.instrs.get(member):
+                    if (
+                        member_instr.family is not family
+                        and member_instr.family is not None
+                    ):
+                        self.error(
+                            f"Instruction {member} is a member of multiple families "
+                            f"({member_instr.family.name}, {family.name}).",
+                            family,
+                        )
+                    else:
+                        member_instr.family = family
+                elif not self.macro_instrs.get(member):
+                    self.error(
+                        f"Unknown instruction {member!r} referenced in family {family.name!r}",
+                        family,
+                    )
+
+    def check_families(self) -> None:
+        """Check each family:
+
+        - Must have at least 2 members (including head)
+        - Head and all members must be known instructions
+        - Head and all members must have the same cache, input and output effects
+        """
+        for family in self.families.values():
+            if family.name not in self.macro_instrs and family.name not in self.instrs:
+                self.error(
+                    f"Family {family.name!r} has unknown instruction {family.name!r}",
+                    family,
+                )
+            members = [
+                member
+                for member in family.members
+                if member in self.instrs or member in self.macro_instrs
+            ]
+            if members != family.members:
+                unknown = set(family.members) - set(members)
+                self.error(
+                    f"Family {family.name!r} has unknown members: {unknown}", family
+                )
+            expected_effects = self.effect_counts(family.name)
+            for member in members:
+                member_effects = self.effect_counts(member)
+                if member_effects != expected_effects:
+                    self.error(
+                        f"Family {family.name!r} has inconsistent "
+                        f"(cache, input, output) effects:\n"
+                        f"  {family.name} = {expected_effects}; "
+                        f"{member} = {member_effects}",
+                        family,
+                    )
+
+    def effect_counts(self, name: str) -> tuple[int, int, int]:
+        if instr := self.instrs.get(name):
+            cache = instr.cache_offset
+            input = len(instr.input_effects)
+            output = len(instr.output_effects)
+        elif mac := self.macro_instrs.get(name):
+            cache = mac.cache_offset
+            input, output = 0, 0
+            for part in mac.parts:
+                if isinstance(part, Component):
+                    # A component may pop what the previous component pushed,
+                    # so we offset the input/output counts by that.
+                    delta_i = len(part.instr.input_effects)
+                    delta_o = len(part.instr.output_effects)
+                    offset = min(delta_i, output)
+                    input += delta_i - offset
+                    output += delta_o - offset
+        else:
+            assert False, f"Unknown instruction {name!r}"
+        return cache, input, output
+
+    def analyze_macros_and_pseudos(self) -> None:
+        """Analyze each macro and pseudo instruction."""
+        self.macro_instrs = {}
+        self.pseudo_instrs = {}
+        for name, macro in self.macros.items():
+            self.macro_instrs[name] = self.analyze_macro(macro)
+        for name, pseudo in self.pseudos.items():
+            self.pseudo_instrs[name] = self.analyze_pseudo(pseudo)
+
+    def analyze_macro(self, macro: parsing.Macro) -> MacroInstruction:
+        components = self.check_macro_components(macro)
+        stack, initial_sp = self.stack_analysis(components)
+        sp = initial_sp
+        parts: MacroParts = []
+        flags = InstructionFlags.newEmpty()
+        offset = 0
+        for component in components:
+            match component:
+                case parsing.CacheEffect() as ceffect:
+                    parts.append(ceffect)
+                    offset += ceffect.size
+                case Instruction() as instr:
+                    part, sp, offset = self.analyze_instruction(
+                        instr, stack, sp, offset
+                    )
+                    parts.append(part)
+                    flags.add(instr.instr_flags)
+                case _:
+                    typing.assert_never(component)
+        final_sp = sp
+        format = "IB"
+        if offset:
+            format += "C" + "0" * (offset - 1)
+        return MacroInstruction(
+            macro.name, stack, initial_sp, final_sp, format, flags, macro, parts, offset
+        )
+
+    def analyze_pseudo(self, pseudo: parsing.Pseudo) -> PseudoInstruction:
+        targets = [self.instrs[target] for target in pseudo.targets]
+        assert targets
+        # Make sure the targets have the same fmt
+        fmts = list(set([t.instr_fmt for t in targets]))
+        assert len(fmts) == 1
+        assert len(list(set([t.instr_flags.bitmap() for t in targets]))) == 1
+        return PseudoInstruction(pseudo.name, targets, fmts[0], targets[0].instr_flags)
+
+    def analyze_instruction(
+        self, instr: Instruction, stack: list[StackEffect], sp: int, offset: int
+    ) -> tuple[Component, int, int]:
+        input_mapping: StackEffectMapping = []
+        for ieffect in reversed(instr.input_effects):
+            sp -= 1
+            input_mapping.append((stack[sp], ieffect))
+        output_mapping: StackEffectMapping = []
+        for oeffect in instr.output_effects:
+            output_mapping.append((stack[sp], oeffect))
+            sp += 1
+        active_effects: list[ActiveCacheEffect] = []
+        for ceffect in instr.cache_effects:
+            if ceffect.name != UNUSED:
+                active_effects.append(ActiveCacheEffect(ceffect, offset))
+            offset += ceffect.size
+        return (
+            Component(instr, input_mapping, output_mapping, active_effects),
+            sp,
+            offset,
+        )
+
+    def check_macro_components(
+        self, macro: parsing.Macro
+    ) -> list[InstructionOrCacheEffect]:
+        components: list[InstructionOrCacheEffect] = []
+        for uop in macro.uops:
+            match uop:
+                case parsing.OpName(name):
+                    if name not in self.instrs:
+                        self.error(f"Unknown instruction {name!r}", macro)
+                    components.append(self.instrs[name])
+                case parsing.CacheEffect():
+                    components.append(uop)
+                case _:
+                    typing.assert_never(uop)
+        return components
+
+    def stack_analysis(
+        self, components: typing.Iterable[InstructionOrCacheEffect]
+    ) -> tuple[list[StackEffect], int]:
+        """Analyze a macro.
+
+        Ignore cache effects.
+
+        Return the list of variables (as StackEffects) and the initial stack pointer.
+        """
+        lowest = current = highest = 0
+        conditions: dict[int, str] = {}  # Indexed by 'current'.
+        last_instr: Instruction | None = None
+        for thing in components:
+            if isinstance(thing, Instruction):
+                last_instr = thing
+        for thing in components:
+            match thing:
+                case Instruction() as instr:
+                    if any(
+                        eff.size for eff in instr.input_effects + instr.output_effects
+                    ):
+                        # TODO: Eventually this will be needed, at least for macros.
+                        self.error(
+                            f"Instruction {instr.name!r} has variable-sized stack effect, "
+                            "which are not supported in macro instructions",
+                            instr.inst,  # TODO: Pass name+location of macro
+                        )
+                    if any(eff.cond for eff in instr.input_effects):
+                        self.error(
+                            f"Instruction {instr.name!r} has conditional input stack effect, "
+                            "which are not supported in macro instructions",
+                            instr.inst,  # TODO: Pass name+location of macro
+                        )
+                    if (
+                        any(eff.cond for eff in instr.output_effects)
+                        and instr is not last_instr
+                    ):
+                        self.error(
+                            f"Instruction {instr.name!r} has conditional output stack effect, "
+                            "but is not the last instruction in a macro",
+                            instr.inst,  # TODO: Pass name+location of macro
+                        )
+                    current -= len(instr.input_effects)
+                    lowest = min(lowest, current)
+                    for eff in instr.output_effects:
+                        if eff.cond:
+                            conditions[current] = eff.cond
+                        current += 1
+                    highest = max(highest, current)
+                case parsing.CacheEffect():
+                    pass
+                case _:
+                    typing.assert_never(thing)
+        # At this point, 'current' is the net stack effect,
+        # and 'lowest' and 'highest' are the extremes.
+        # Note that 'lowest' may be negative.
+        stack = [
+            StackEffect(f"_tmp_{i}", "", conditions.get(highest - i, ""))
+            for i in reversed(range(1, highest - lowest + 1))
+        ]
+        return stack, -lowest