mirror of
				https://github.com/python/cpython.git
				synced 2025-10-31 10:26:02 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			345 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			345 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import os.path
 | |
| import token
 | |
| from typing import IO, Any, Dict, Optional, Sequence, Set, Text, Tuple
 | |
| 
 | |
| from pegen import grammar
 | |
| from pegen.grammar import (
 | |
|     Alt,
 | |
|     Cut,
 | |
|     Forced,
 | |
|     Gather,
 | |
|     GrammarVisitor,
 | |
|     Group,
 | |
|     Lookahead,
 | |
|     NamedItem,
 | |
|     NameLeaf,
 | |
|     NegativeLookahead,
 | |
|     Opt,
 | |
|     PositiveLookahead,
 | |
|     Repeat0,
 | |
|     Repeat1,
 | |
|     Rhs,
 | |
|     Rule,
 | |
|     StringLeaf,
 | |
| )
 | |
| from pegen.parser_generator import ParserGenerator
 | |
| 
 | |
| MODULE_PREFIX = """\
 | |
| #!/usr/bin/env python3.8
 | |
| # @generated by pegen from {filename}
 | |
| 
 | |
| import ast
 | |
| import sys
 | |
| import tokenize
 | |
| 
 | |
| from typing import Any, Optional
 | |
| 
 | |
| from pegen.parser import memoize, memoize_left_rec, logger, Parser
 | |
| 
 | |
| """
 | |
| MODULE_SUFFIX = """
 | |
| 
 | |
| if __name__ == '__main__':
 | |
|     from pegen.parser import simple_parser_main
 | |
|     simple_parser_main({class_name})
 | |
| """
 | |
| 
 | |
| 
 | |
| class InvalidNodeVisitor(GrammarVisitor):
 | |
|     def visit_NameLeaf(self, node: NameLeaf) -> bool:
 | |
|         name = node.value
 | |
|         return name.startswith("invalid")
 | |
| 
 | |
|     def visit_StringLeaf(self, node: StringLeaf) -> bool:
 | |
|         return False
 | |
| 
 | |
|     def visit_NamedItem(self, node: NamedItem) -> bool:
 | |
|         return self.visit(node.item)
 | |
| 
 | |
|     def visit_Rhs(self, node: Rhs) -> bool:
 | |
|         return any(self.visit(alt) for alt in node.alts)
 | |
| 
 | |
|     def visit_Alt(self, node: Alt) -> bool:
 | |
|         return any(self.visit(item) for item in node.items)
 | |
| 
 | |
|     def lookahead_call_helper(self, node: Lookahead) -> bool:
 | |
|         return self.visit(node.node)
 | |
| 
 | |
|     def visit_PositiveLookahead(self, node: PositiveLookahead) -> bool:
 | |
|         return self.lookahead_call_helper(node)
 | |
| 
 | |
|     def visit_NegativeLookahead(self, node: NegativeLookahead) -> bool:
 | |
|         return self.lookahead_call_helper(node)
 | |
| 
 | |
|     def visit_Opt(self, node: Opt) -> bool:
 | |
|         return self.visit(node.node)
 | |
| 
 | |
|     def visit_Repeat(self, node: Repeat0) -> Tuple[str, str]:
 | |
|         return self.visit(node.node)
 | |
| 
 | |
|     def visit_Gather(self, node: Gather) -> Tuple[str, str]:
 | |
|         return self.visit(node.node)
 | |
| 
 | |
|     def visit_Group(self, node: Group) -> bool:
 | |
|         return self.visit(node.rhs)
 | |
| 
 | |
|     def visit_Cut(self, node: Cut) -> bool:
 | |
|         return False
 | |
| 
 | |
|     def visit_Forced(self, node: Forced) -> bool:
 | |
|         return self.visit(node.node)
 | |
| 
 | |
| 
 | |
| class PythonCallMakerVisitor(GrammarVisitor):
 | |
|     def __init__(self, parser_generator: ParserGenerator):
 | |
|         self.gen = parser_generator
 | |
|         self.cache: Dict[Any, Any] = {}
 | |
| 
 | |
|     def visit_NameLeaf(self, node: NameLeaf) -> Tuple[Optional[str], str]:
 | |
|         name = node.value
 | |
|         if name == "SOFT_KEYWORD":
 | |
|             return "soft_keyword", "self.soft_keyword()"
 | |
|         if name in ("NAME", "NUMBER", "STRING", "OP", "TYPE_COMMENT"):
 | |
|             name = name.lower()
 | |
|             return name, f"self.{name}()"
 | |
|         if name in ("NEWLINE", "DEDENT", "INDENT", "ENDMARKER"):
 | |
|             # Avoid using names that can be Python keywords
 | |
|             return "_" + name.lower(), f"self.expect({name!r})"
 | |
|         return name, f"self.{name}()"
 | |
| 
 | |
|     def visit_StringLeaf(self, node: StringLeaf) -> Tuple[str, str]:
 | |
|         return "literal", f"self.expect({node.value})"
 | |
| 
 | |
|     def visit_Rhs(self, node: Rhs) -> Tuple[Optional[str], str]:
 | |
|         if node in self.cache:
 | |
|             return self.cache[node]
 | |
|         if len(node.alts) == 1 and len(node.alts[0].items) == 1:
 | |
|             self.cache[node] = self.visit(node.alts[0].items[0])
 | |
|         else:
 | |
|             name = self.gen.artificial_rule_from_rhs(node)
 | |
|             self.cache[node] = name, f"self.{name}()"
 | |
|         return self.cache[node]
 | |
| 
 | |
|     def visit_NamedItem(self, node: NamedItem) -> Tuple[Optional[str], str]:
 | |
|         name, call = self.visit(node.item)
 | |
|         if node.name:
 | |
|             name = node.name
 | |
|         return name, call
 | |
| 
 | |
|     def lookahead_call_helper(self, node: Lookahead) -> Tuple[str, str]:
 | |
|         name, call = self.visit(node.node)
 | |
|         head, tail = call.split("(", 1)
 | |
|         assert tail[-1] == ")"
 | |
|         tail = tail[:-1]
 | |
|         return head, tail
 | |
| 
 | |
|     def visit_PositiveLookahead(self, node: PositiveLookahead) -> Tuple[None, str]:
 | |
|         head, tail = self.lookahead_call_helper(node)
 | |
|         return None, f"self.positive_lookahead({head}, {tail})"
 | |
| 
 | |
|     def visit_NegativeLookahead(self, node: NegativeLookahead) -> Tuple[None, str]:
 | |
|         head, tail = self.lookahead_call_helper(node)
 | |
|         return None, f"self.negative_lookahead({head}, {tail})"
 | |
| 
 | |
|     def visit_Opt(self, node: Opt) -> Tuple[str, str]:
 | |
|         name, call = self.visit(node.node)
 | |
|         # Note trailing comma (the call may already have one comma
 | |
|         # at the end, for example when rules have both repeat0 and optional
 | |
|         # markers, e.g: [rule*])
 | |
|         if call.endswith(","):
 | |
|             return "opt", call
 | |
|         else:
 | |
|             return "opt", f"{call},"
 | |
| 
 | |
|     def visit_Repeat0(self, node: Repeat0) -> Tuple[str, str]:
 | |
|         if node in self.cache:
 | |
|             return self.cache[node]
 | |
|         name = self.gen.artificial_rule_from_repeat(node.node, False)
 | |
|         self.cache[node] = name, f"self.{name}(),"  # Also a trailing comma!
 | |
|         return self.cache[node]
 | |
| 
 | |
|     def visit_Repeat1(self, node: Repeat1) -> Tuple[str, str]:
 | |
|         if node in self.cache:
 | |
|             return self.cache[node]
 | |
|         name = self.gen.artificial_rule_from_repeat(node.node, True)
 | |
|         self.cache[node] = name, f"self.{name}()"  # But no trailing comma here!
 | |
|         return self.cache[node]
 | |
| 
 | |
|     def visit_Gather(self, node: Gather) -> Tuple[str, str]:
 | |
|         if node in self.cache:
 | |
|             return self.cache[node]
 | |
|         name = self.gen.artificial_rule_from_gather(node)
 | |
|         self.cache[node] = name, f"self.{name}()"  # No trailing comma here either!
 | |
|         return self.cache[node]
 | |
| 
 | |
|     def visit_Group(self, node: Group) -> Tuple[Optional[str], str]:
 | |
|         return self.visit(node.rhs)
 | |
| 
 | |
|     def visit_Cut(self, node: Cut) -> Tuple[str, str]:
 | |
|         return "cut", "True"
 | |
| 
 | |
|     def visit_Forced(self, node: Forced) -> Tuple[str, str]:
 | |
|         if isinstance(node.node, Group):
 | |
|             _, val = self.visit(node.node.rhs)
 | |
|             return "forced", f"self.expect_forced({val}, '''({node.node.rhs!s})''')"
 | |
|         else:
 | |
|             return (
 | |
|                 "forced",
 | |
|                 f"self.expect_forced(self.expect({node.node.value}), {node.node.value!r})",
 | |
|             )
 | |
| 
 | |
| 
 | |
| class PythonParserGenerator(ParserGenerator, GrammarVisitor):
 | |
|     def __init__(
 | |
|         self,
 | |
|         grammar: grammar.Grammar,
 | |
|         file: Optional[IO[Text]],
 | |
|         tokens: Set[str] = set(token.tok_name.values()),
 | |
|         location_formatting: Optional[str] = None,
 | |
|         unreachable_formatting: Optional[str] = None,
 | |
|     ):
 | |
|         tokens.add("SOFT_KEYWORD")
 | |
|         super().__init__(grammar, tokens, file)
 | |
|         self.callmakervisitor: PythonCallMakerVisitor = PythonCallMakerVisitor(self)
 | |
|         self.invalidvisitor: InvalidNodeVisitor = InvalidNodeVisitor()
 | |
|         self.unreachable_formatting = unreachable_formatting or "None  # pragma: no cover"
 | |
|         self.location_formatting = (
 | |
|             location_formatting
 | |
|             or "lineno=start_lineno, col_offset=start_col_offset, "
 | |
|             "end_lineno=end_lineno, end_col_offset=end_col_offset"
 | |
|         )
 | |
| 
 | |
|     def generate(self, filename: str) -> None:
 | |
|         self.collect_rules()
 | |
|         header = self.grammar.metas.get("header", MODULE_PREFIX)
 | |
|         if header is not None:
 | |
|             basename = os.path.basename(filename)
 | |
|             self.print(header.rstrip("\n").format(filename=basename))
 | |
|         subheader = self.grammar.metas.get("subheader", "")
 | |
|         if subheader:
 | |
|             self.print(subheader)
 | |
|         cls_name = self.grammar.metas.get("class", "GeneratedParser")
 | |
|         self.print("# Keywords and soft keywords are listed at the end of the parser definition.")
 | |
|         self.print(f"class {cls_name}(Parser):")
 | |
|         for rule in self.all_rules.values():
 | |
|             self.print()
 | |
|             with self.indent():
 | |
|                 self.visit(rule)
 | |
| 
 | |
|         self.print()
 | |
|         with self.indent():
 | |
|             self.print(f"KEYWORDS = {tuple(self.keywords)}")
 | |
|             self.print(f"SOFT_KEYWORDS = {tuple(self.soft_keywords)}")
 | |
| 
 | |
|         trailer = self.grammar.metas.get("trailer", MODULE_SUFFIX.format(class_name=cls_name))
 | |
|         if trailer is not None:
 | |
|             self.print(trailer.rstrip("\n"))
 | |
| 
 | |
|     def alts_uses_locations(self, alts: Sequence[Alt]) -> bool:
 | |
|         for alt in alts:
 | |
|             if alt.action and "LOCATIONS" in alt.action:
 | |
|                 return True
 | |
|             for n in alt.items:
 | |
|                 if isinstance(n.item, Group) and self.alts_uses_locations(n.item.rhs.alts):
 | |
|                     return True
 | |
|         return False
 | |
| 
 | |
|     def visit_Rule(self, node: Rule) -> None:
 | |
|         is_loop = node.is_loop()
 | |
|         is_gather = node.is_gather()
 | |
|         rhs = node.flatten()
 | |
|         if node.left_recursive:
 | |
|             if node.leader:
 | |
|                 self.print("@memoize_left_rec")
 | |
|             else:
 | |
|                 # Non-leader rules in a cycle are not memoized,
 | |
|                 # but they must still be logged.
 | |
|                 self.print("@logger")
 | |
|         else:
 | |
|             self.print("@memoize")
 | |
|         node_type = node.type or "Any"
 | |
|         self.print(f"def {node.name}(self) -> Optional[{node_type}]:")
 | |
|         with self.indent():
 | |
|             self.print(f"# {node.name}: {rhs}")
 | |
|             self.print("mark = self._mark()")
 | |
|             if self.alts_uses_locations(node.rhs.alts):
 | |
|                 self.print("tok = self._tokenizer.peek()")
 | |
|                 self.print("start_lineno, start_col_offset = tok.start")
 | |
|             if is_loop:
 | |
|                 self.print("children = []")
 | |
|             self.visit(rhs, is_loop=is_loop, is_gather=is_gather)
 | |
|             if is_loop:
 | |
|                 self.print("return children")
 | |
|             else:
 | |
|                 self.print("return None")
 | |
| 
 | |
|     def visit_NamedItem(self, node: NamedItem) -> None:
 | |
|         name, call = self.callmakervisitor.visit(node.item)
 | |
|         if node.name:
 | |
|             name = node.name
 | |
|         if not name:
 | |
|             self.print(call)
 | |
|         else:
 | |
|             if name != "cut":
 | |
|                 name = self.dedupe(name)
 | |
|             self.print(f"({name} := {call})")
 | |
| 
 | |
|     def visit_Rhs(self, node: Rhs, is_loop: bool = False, is_gather: bool = False) -> None:
 | |
|         if is_loop:
 | |
|             assert len(node.alts) == 1
 | |
|         for alt in node.alts:
 | |
|             self.visit(alt, is_loop=is_loop, is_gather=is_gather)
 | |
| 
 | |
|     def visit_Alt(self, node: Alt, is_loop: bool, is_gather: bool) -> None:
 | |
|         has_cut = any(isinstance(item.item, Cut) for item in node.items)
 | |
|         with self.local_variable_context():
 | |
|             if has_cut:
 | |
|                 self.print("cut = False")
 | |
|             if is_loop:
 | |
|                 self.print("while (")
 | |
|             else:
 | |
|                 self.print("if (")
 | |
|             with self.indent():
 | |
|                 first = True
 | |
|                 for item in node.items:
 | |
|                     if first:
 | |
|                         first = False
 | |
|                     else:
 | |
|                         self.print("and")
 | |
|                     self.visit(item)
 | |
|                     if is_gather:
 | |
|                         self.print("is not None")
 | |
| 
 | |
|             self.print("):")
 | |
|             with self.indent():
 | |
|                 action = node.action
 | |
|                 if not action:
 | |
|                     if is_gather:
 | |
|                         assert len(self.local_variable_names) == 2
 | |
|                         action = (
 | |
|                             f"[{self.local_variable_names[0]}] + {self.local_variable_names[1]}"
 | |
|                         )
 | |
|                     else:
 | |
|                         if self.invalidvisitor.visit(node):
 | |
|                             action = "UNREACHABLE"
 | |
|                         elif len(self.local_variable_names) == 1:
 | |
|                             action = f"{self.local_variable_names[0]}"
 | |
|                         else:
 | |
|                             action = f"[{', '.join(self.local_variable_names)}]"
 | |
|                 elif "LOCATIONS" in action:
 | |
|                     self.print("tok = self._tokenizer.get_last_non_whitespace_token()")
 | |
|                     self.print("end_lineno, end_col_offset = tok.end")
 | |
|                     action = action.replace("LOCATIONS", self.location_formatting)
 | |
| 
 | |
|                 if is_loop:
 | |
|                     self.print(f"children.append({action})")
 | |
|                     self.print(f"mark = self._mark()")
 | |
|                 else:
 | |
|                     if "UNREACHABLE" in action:
 | |
|                         action = action.replace("UNREACHABLE", self.unreachable_formatting)
 | |
|                     self.print(f"return {action}")
 | |
| 
 | |
|             self.print("self._reset(mark)")
 | |
|             # Skip remaining alternatives if a cut was reached.
 | |
|             if has_cut:
 | |
|                 self.print("if cut: return None")
 | 
