gh-127833: Use productionlist nodes to implement the grammar-snippet directive (#130376)

Co-authored-by: Petr Viktorin <encukou@gmail.com>
Co-authored-by: Blaise Pabon <blaise@gmail.com>
Adam Turner 2025-03-20 15:35:20 +00:00 committed by GitHub
parent 86d5fa95cf
commit 443c0cd17c
2 changed files with 491 additions and 422 deletions
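
In short, the grammar-snippet directive now builds Sphinx's productionlist/production doctree nodes instead of a paragraph wrapping a literal_block. As a rough orientation, here is a hand-built sketch of the node shape the new code aims for, using a hypothetical pass_stmt rule and only the node classes referenced in the diff below (the nodes are assembled incrementally here for simplicity):

    from docutils import nodes
    from sphinx import addnodes

    # One production node per grammar rule; the rule name becomes a
    # literal_strong link target, the rest of the rule stays plain text.
    production = addnodes.production("pass_stmt: 'pass'")
    production += addnodes.literal_strong('pass_stmt', 'pass_stmt')
    production += nodes.Text(": 'pass'\n")

    # The snippet as a whole is a productionlist containing the productions.
    snippet = addnodes.productionlist()
    snippet += production
    snippet['classes'].append('highlight')
    print(snippet.pformat())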


@@ -13,8 +13,8 @@ from sphinx.util.docutils import SphinxDirective
 from sphinx.util.nodes import make_id
 if TYPE_CHECKING:
-    from collections.abc import Sequence
-    from typing import Any
+    from collections.abc import Iterable, Iterator, Sequence
+    from typing import Any, Final
     from docutils.nodes import Node
     from sphinx.application import Sphinx
@@ -41,30 +41,7 @@ class GrammarSnippetBase(SphinxDirective):
     # The option/argument handling is left to the individual classes.
-    def make_grammar_snippet(
-        self, options: dict[str, Any], content: Sequence[str]
-    ) -> list[nodes.paragraph]:
-        """Create a literal block from options & content."""
-        group_name = options['group']
-        # Docutils elements have a `rawsource` attribute that is supposed to be
-        # set to the original ReST source.
-        # Sphinx does the following with it:
-        # - if it's empty, set it to `self.astext()`
-        # - if it matches `self.astext()` when generating the output,
-        #   apply syntax highlighting (which is based on the plain-text content
-        #   and thus discards internal formatting, like references).
-        # To get around this, we set it to this non-empty string:
-        rawsource = 'You should not see this.'
-        literal = nodes.literal_block(
-            rawsource,
-            '',
-            classes=['highlight'],
-        )
-        grammar_re = re.compile(
+    grammar_re: Final = re.compile(
         r"""
             (?P<rule_name>^[a-zA-Z0-9_]+)  # identifier at start of line
             (?=:)                          # ... followed by a colon
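
For orientation, here is a small standalone sketch of how a class-level compiled pattern like grammar_re gets used further down: finditer() walks a grammar line and exactly one named group is non-None per match. Only the rule_name branch is visible in this hunk; the rule_ref and single_quoted branches below are assumed, simplified forms of the elided alternatives:

    import re
    from typing import Final

    # Simplified stand-in for grammar_re; the rule_ref and single_quoted
    # branches are assumptions, not the real patterns.
    GRAMMAR_RE: Final = re.compile(
        r"""
        (?P<rule_name>^[a-zA-Z0-9_]+)(?=:)  # identifier at start of line, before a colon
        | (?P<rule_ref>`[^`\s]+`)           # reference to another rule (assumed form)
        | (?P<single_quoted>'[^']*')        # quoted literal token (assumed form)
        """,
        re.VERBOSE,
    )

    line = "pass_stmt: 'pass'"
    for match in GRAMMAR_RE.finditer(line):
        # Exactly one named group is non-None for each match.
        [(group, text)] = [
            (name, value)
            for name, value in match.groupdict().items()
            if value is not None
        ]
        print(group, text)
    # rule_name pass_stmt
    # single_quoted 'pass'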
@@ -78,61 +55,126 @@ class GrammarSnippetBase(SphinxDirective):
         re.VERBOSE,
     )
-        for line in content:
-            last_pos = 0
-            for match in grammar_re.finditer(line):
-                # Handle text between matches
-                if match.start() > last_pos:
-                    literal += nodes.Text(line[last_pos : match.start()])
-                last_pos = match.end()
+    def make_grammar_snippet(
+        self, options: dict[str, Any], content: Sequence[str]
+    ) -> list[addnodes.productionlist]:
+        """Create a literal block from options & content."""
-                # Handle matches
-                group_dict = {
-                    name: content
-                    for name, content in match.groupdict().items()
-                    if content is not None
-                }
-                match group_dict:
-                    case {'rule_name': name}:
-                        literal += self.make_link_target_for_token(
-                            group_name, name
+        group_name = options['group']
+        node_location = self.get_location()
+        production_nodes = []
+        for rawsource, production_defs in self.production_definitions(content):
+            production = self.make_production(
+                rawsource,
+                production_defs,
+                group_name=group_name,
+                location=node_location,
             )
-                    case {'rule_ref': ref_text}:
-                        literal += token_xrefs(ref_text, group_name)
-                    case {'single_quoted': name} | {'double_quoted': name}:
-                        literal += snippet_string_node('', name)
-                    case _:
-                        raise ValueError('unhandled match')
-            literal += nodes.Text(line[last_pos:] + '\n')
+            production_nodes.append(production)
-        node = nodes.paragraph(
+        node = addnodes.productionlist(
             '',
             '',
-            literal,
+            *production_nodes,
+            support_smartquotes=False,
             classes=['highlight'],
         )
         self.set_source_info(node)
         return [node]
-    def make_link_target_for_token(
-        self, group_name: str, name: str
+    def production_definitions(
+        self, lines: Iterable[str], /
+    ) -> Iterator[tuple[str, list[tuple[str, str]]]]:
+        """Yield pairs of rawsource and production content dicts."""
+        production_lines: list[str] = []
+        production_content: list[tuple[str, str]] = []
+        for line in lines:
+            # If this line is the start of a new rule (text in the column 1),
+            # emit the current production and start a new one.
+            if not line[:1].isspace():
+                rawsource = '\n'.join(production_lines)
+                production_lines.clear()
+                if production_content:
+                    yield rawsource, production_content
+                    production_content = []
+            # Append the current line for the raw source
+            production_lines.append(line)
+            # Parse the line into constituent parts
+            last_pos = 0
+            for match in self.grammar_re.finditer(line):
+                # Handle text between matches
+                if match.start() > last_pos:
+                    unmatched_text = line[last_pos : match.start()]
+                    production_content.append(('text', unmatched_text))
+                last_pos = match.end()
+                # Handle matches.
+                # After filtering None (non-matches), exactly one groupdict()
+                # entry should remain.
+                [(re_group_name, content)] = (
+                    (re_group_name, content)
+                    for re_group_name, content in match.groupdict().items()
+                    if content is not None
+                )
+                production_content.append((re_group_name, content))
+            production_content.append(('text', line[last_pos:] + '\n'))
+        # Emit the final production
+        if production_content:
+            rawsource = '\n'.join(production_lines)
+            yield rawsource, production_content
+    def make_production(
+        self,
+        rawsource: str,
+        production_defs: list[tuple[str, str]],
+        *,
+        group_name: str,
+        location: str,
+    ) -> addnodes.production:
+        """Create a production node from a list of parts."""
+        production_node = addnodes.production(rawsource)
+        for re_group_name, content in production_defs:
+            match re_group_name:
+                case 'rule_name':
+                    production_node += self.make_name_target(
+                        name=content,
+                        production_group=group_name,
+                        location=location,
+                    )
+                case 'rule_ref':
+                    production_node += token_xrefs(content, group_name)
+                case 'single_quoted' | 'double_quoted':
+                    production_node += snippet_string_node('', content)
+                case 'text':
+                    production_node += nodes.Text(content)
+                case _:
+                    raise ValueError(f'unhandled match: {re_group_name!r}')
+        return production_node
+    def make_name_target(
+        self,
+        *,
+        name: str,
+        production_group: str,
+        location: str,
     ) -> addnodes.literal_strong:
-        """Return a literal node which is a link target for the given token."""
-        name_node = addnodes.literal_strong()
+        """Make a link target for the given production."""
         # Cargo-culted magic to make `name_node` a link target
         # similar to Sphinx `production`.
         # This needs to be the same as what Sphinx does
         # to avoid breaking existing links.
-        domain = self.env.domains['std']
-        obj_name = f"{group_name}:{name}"
-        prefix = f'grammar-token-{group_name}'
+        name_node = addnodes.literal_strong(name, name)
+        prefix = f'grammar-token-{production_group}'
         node_id = make_id(self.env, self.state.document, prefix, name)
         name_node['ids'].append(node_id)
         self.state.document.note_implicit_target(name_node, name_node)
-        domain.note_object('token', obj_name, node_id, location=name_node)
-        text_node = nodes.Text(name)
-        name_node += text_node
+        obj_name = f'{production_group}:{name}' if production_group else name
+        std = self.env.domains.standard_domain
+        std.note_object('token', obj_name, node_id, location=location)
         return name_node
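
To make the grouping rule in production_definitions concrete (a line that starts in column 1 begins a new production; indented lines continue the previous one), here is a trimmed-down sketch of just that splitting step, without the grammar_re tokenisation or node building:

    from collections.abc import Iterable, Iterator

    def split_productions(lines: Iterable[str]) -> Iterator[str]:
        """Group lines: a line starting in column 1 opens a new production."""
        current: list[str] = []
        for line in lines:
            if not line[:1].isspace() and current:
                yield '\n'.join(current)
                current = []
            current.append(line)
        if current:
            yield '\n'.join(current)

    demo = [
        "pass_stmt: 'pass'",
        "try_stmt: 'try' ':' `block`",
        "          ('except' `expression` ':' `block`)+",
    ]
    for production in split_productions(demo):
        print(repr(production))
    # "pass_stmt: 'pass'"
    # "try_stmt: 'try' ':' `block`\n          ('except' `expression` ':' `block`)+"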
@@ -168,7 +210,7 @@ class GrammarSnippetDirective(GrammarSnippetBase):
     optional_arguments = 1
     final_argument_whitespace = True
-    def run(self) -> list[nodes.paragraph]:
+    def run(self) -> list[addnodes.productionlist]:
         return self.make_grammar_snippet(self.options, self.content)
@@ -187,7 +229,7 @@ class CompatProductionList(GrammarSnippetBase):
     final_argument_whitespace = True
     option_spec = {}
-    def run(self) -> list[nodes.paragraph]:
+    def run(self) -> list[addnodes.productionlist]:
         # The "content" of a productionlist is actually the first and only
         # argument. The first line is the group; the rest is the content lines.
         lines = self.arguments[0].splitlines()
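
As the comment in this hunk notes, the compatibility directive reuses GrammarSnippetBase by treating its single argument as the whole body: the first line names the grammar group and the remaining lines are the productions. A hedged sketch of that split (the argument value and the options/content names are illustrative, not the directive's real attributes):

    # Hypothetical argument, as reST would pass it to CompatProductionList.
    argument = (
        'python-grammar\n'
        "pass_stmt: 'pass'\n"
        "break_stmt: 'break'"
    )

    lines = argument.splitlines()
    options = {'group': lines[0].strip()}  # first line: the grammar group
    content = lines[1:]                    # remaining lines: the productions
    print(options)
    print(content)
    # {'group': 'python-grammar'}
    # ["pass_stmt: 'pass'", "break_stmt: 'break'"]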

File diff suppressed because it is too large