| """Python parser generator
 | |
| 
 | |
| 
 | |
| This parser generator transforms a Python grammar file into parsing tables
 | |
| that can be consumed by Python's LL(1) parser written in C.
 | |
| 
 | |
| Concepts
 | |
| --------
 | |
| 
 | |
| * An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a
 | |
|   top-down parser for a subset of context-free languages. It parses the input
 | |
|   from Left to right, performing Leftmost derivation of the sentence, and can
 | |
|   only use 1 token of lookahead when parsing a sentence.
 | |
| 
 | |
| * A parsing table is a collection of data that a generic implementation of the
 | |
|   LL(1) parser consumes to know how to parse a given context-free grammar. In
 | |
|   this case the collection of data involves Deterministic Finite Automatons,
 | |
|   calculated first sets, keywords and transition labels.
 | |
| 
 | |
| * A grammar is defined by production rules (or just 'productions') that specify
 | |
|   which symbols may replace which other symbols; these rules may be used to
 | |
|   generate strings, or to parse them. Each such rule has a head, or left-hand
 | |
|   side, which consists of the string that may be replaced, and a body, or
 | |
|   right-hand side, which consists of a string that may replace it. In the
 | |
|   Python grammar, rules are written in the form
 | |
| 
 | |
|   rule_name: rule_description;
 | |
| 
 | |
|   meaning the rule 'a: b' specifies that a can be replaced by b. A context-free
 | |
|   grammar is a grammar in which the left-hand side of each production rule
 | |
|   consists of only a single nonterminal symbol. Context-free grammars can
 | |
|   always be recognized by a Non-Deterministic Automatons.
 | |
| 
 | |
| * Terminal symbols are literal symbols which may appear in the outputs of the
 | |
|   production rules of the grammar and which cannot be changed using the rules
 | |
|   of the grammar. Applying the rules recursively to a source string of symbols
 | |
|   will usually terminate in a final output string consisting only of terminal
 | |
|   symbols.
 | |
| 
 | |
| * Nonterminal symbols are those symbols which can be replaced. The grammar
 | |
|   includes a start symbol a designated member of the set of nonterminals from
 | |
|   which all the strings in the language may be derived by successive
 | |
|   applications of the production rules.
 | |
| 
 | |
| * The language defined by the grammar is defined as the set of terminal strings
 | |
|   that can be derived using the production rules.
 | |
| 
 | |
| * The first sets of a rule (FIRST(rule)) are defined to be the set of terminals
 | |
|   that can appear in the first position of any string derived from the rule.
 | |
|   This is useful for LL(1) parsers as the parser is only allowed to look at the
 | |
|   next token in the input to know which rule needs to parse. For example, given
 | |
|   this grammar:
 | |
| 
 | |
|   start: '(' A | B ')'
 | |
|   A: 'a' '<'
 | |
|   B: 'b' '<'
 | |
| 
 | |
|   and the input '(b<)' the parser can only look at 'b' to know if it needs
 | |
|   to parse A o B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows
 | |
|   that needs to continue parsing rule B because only that rule can start
 | |
|   with 'b'.
 | |
| 
 | |
| Description
 | |
| -----------
 | |
| 
 | |
| The input for the parser generator is a grammar in extended BNF form (using *
 | |
| for repetition, + for at-least-once repetition, [] for optional parts, | for
 | |
| alternatives and () for grouping).
 | |
| 
 | |
| Each rule in the grammar file is considered as a regular expression in its
 | |
| own right. It is turned into a Non-deterministic Finite Automaton (NFA),
 | |
| which is then turned into a Deterministic Finite Automaton (DFA), which is
 | |
| then optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3,
 | |
| or similar compiler books (this technique is more often used for lexical
 | |
| analyzers).
 | |
| 
 | |
| The DFA's are used by the parser as parsing tables in a special way that's
 | |
| probably unique. Before they are usable, the FIRST sets of all non-terminals
 | |
| are computed so the LL(1) parser consuming the parsing tables can distinguish
 | |
| between different transitions.
 | |
| Reference
 | |
| ---------
 | |
| 
 | |
| [Aho&Ullman 77]
 | |
|     Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
 | |
|     (first edition)
 | |
| """
 | |
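
# For orientation: a single grammar rule such as
#
#     pass_stmt: 'pass'
#
# is treated as a regular expression over grammar symbols, compiled to an
# NFA, converted to a DFA and then minimized (see create_dfas below). The
# rule shown here is only an illustrative example of the input format.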

import collections
import enum
from ast import literal_eval

from . import grammar, token
from .automata import DFA
from .metaparser import GrammarParser


class LabelType(enum.Enum):
    NONTERMINAL = 0
    NAMED_TOKEN = 1
    KEYWORD = 2
    OPERATOR = 3
    NONE = 4


class Label(str):
    def __init__(self, value):
        # str.__new__ has already stored the text; here we only classify it.
        self.type = self._get_type()

    def _get_type(self):
        if self[0].isalpha():
            if self.upper() == self:
                # NAMED tokens (ASYNC, NAME...) are all uppercase by convention
                return LabelType.NAMED_TOKEN
            else:
                # If it is not all uppercase, it must be a nonterminal.
                return LabelType.NONTERMINAL
        else:
            # Keywords and operators are wrapped in quotes
            assert self[0] == self[-1] in ('"', "'"), self
            value = literal_eval(self)
            if value[0].isalpha():
                return LabelType.KEYWORD
            else:
                return LabelType.OPERATOR

    def __repr__(self):
        return "{}({})".format(self.type, super().__repr__())
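

# A minimal sketch of how labels are classified, following the conventions
# documented in Label._get_type (uppercase = named token, lowercase =
# nonterminal, quoted = keyword or operator). Illustrative only; pgen never
# calls it.
def _demo_label_types():
    assert Label("expr").type is LabelType.NONTERMINAL
    assert Label("NAME").type is LabelType.NAMED_TOKEN
    assert Label("'if'").type is LabelType.KEYWORD
    assert Label("'+'").type is LabelType.OPERATOR

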
class ParserGenerator(object):
    def __init__(self, grammar_file, token_file, verbose=False, graph_file=None):
        with open(grammar_file) as f:
            self.grammar = f.read()
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        self.opmap["<>"] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.graph_file = graph_file
        self.dfas, self.startsymbol = self.create_dfas()
        self.first = {}  # map from symbol name to set of tokens
        self.calculate_first_sets()

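    # A minimal usage sketch, with hypothetical file paths (the real driver
    # that feeds the grammar and token files to this class lives outside
    # this module):
    #
    #     gen = ParserGenerator("Grammar/Grammar", "Grammar/Tokens")
    #     g = gen.make_grammar()   # grammar.Grammar holding the parsing tables
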
    def create_dfas(self):
        rule_to_dfas = collections.OrderedDict()
        start_nonterminal = None
        for nfa in GrammarParser(self.grammar).parse():
            if self.verbose:
                print("Dump of NFA for", nfa.name)
                nfa.dump()
            if self.graph_file is not None:
                nfa.dump_graph(self.graph_file.write)
            dfa = DFA.from_nfa(nfa)
            if self.verbose:
                print("Dump of DFA for", dfa.name)
                dfa.dump()
            dfa.simplify()
            if self.graph_file is not None:
                dfa.dump_graph(self.graph_file.write)
            rule_to_dfas[dfa.name] = dfa

            if start_nonterminal is None:
                start_nonterminal = dfa.name

        return rule_to_dfas, start_nonterminal

    def make_grammar(self):
        c = grammar.Grammar()
        c.all_labels = set()
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            # Nonterminal symbols are numbered from 256 up, leaving the lower
            # numbers for token types.
            i = 256 + len(c.symbol2number)
            c.symbol2number[Label(name)] = i
            c.number2symbol[i] = Label(name)
            c.all_labels.add(name)
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    c.all_labels.add(label)
                    arcs.append((self.make_label(c, label), dfa.states.index(next)))
                if state.is_final:
                    # An arc labeled 0 that loops back to the state marks it
                    # as accepting.
                    arcs.append((0, dfa.states.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")

            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c

    def make_first_sets(self, c, name):
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
        label = Label(label)
        ilabel = len(c.labels)

        if label.type == LabelType.NONTERMINAL:
            if label in c.symbol2label:
                return c.symbol2label[label]
            else:
                c.labels.append((c.symbol2number[label], None))
                c.symbol2label[label] = ilabel
                return ilabel
        elif label.type == LabelType.NAMED_TOKEN:
            # A named token (NAME, NUMBER, STRING)
            itoken = self.tokens.get(label, None)
            assert isinstance(itoken, int), label
            assert itoken in self.tokens.values(), label
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        elif label.type == LabelType.KEYWORD:
            # A keyword
            value = literal_eval(label)
            if value in c.keywords:
                return c.keywords[value]
            else:
                c.labels.append((self.tokens["NAME"], value))
                c.keywords[value] = ilabel
                return ilabel
        elif label.type == LabelType.OPERATOR:
            # An operator (any non-numeric token)
            value = literal_eval(label)
            tok_name = self.opmap[value]  # Fails if unknown token
            itoken = self.tokens[tok_name]
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        else:
            raise ValueError("Cannot categorize label {}".format(label))

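    # Note on the label table built above: make_label returns an index into
    # c.labels, whose entries are pairs of (symbol or token number, value).
    # value is None except for keywords, which are stored as
    # (number of the NAME token, keyword string), so the parser consuming
    # these tables can map a lookahead token to a transition label.
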
    def calculate_first_sets(self):
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calculate_first_sets_for_rule(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print("    - {terminal}".format(terminal=item))

    def calculate_first_sets_for_rule(self, name):
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa.states[0]
        totalset = set()
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calculate_first_sets_for_rule(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset.add(label)
                overlapcheck[label] = {label}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError(
                        "rule %s is ambiguous; %s is in the"
                        " first sets of %s as well as %s"
                        % (name, symbol, label, inverse[symbol])
                    )
                inverse[symbol] = label
        self.first[name] = totalset
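

# A worked version of the FIRST-set example from the module docstring,
# reading its start rule as start: '(' (A | B) ')'. This is a standalone,
# illustrative sketch using plain dicts and sets (the generator itself
# computes FIRST sets from DFA arcs above); pgen never calls it.
def _demo_first_sets():
    # Quoted symbols are terminals; bare names are nonterminals.
    rules = {
        "start": [["'('", "A", "')'"], ["'('", "B", "')'"]],
        "A": [["'a'", "'<'"]],
        "B": [["'b'", "'<'"]],
    }

    def first(rule, seen=()):
        if rule in seen:
            raise ValueError("recursion for rule %r" % rule)
        result = set()
        for alternative in rules[rule]:
            lead = alternative[0]
            if lead in rules:
                # Nonterminal: its FIRST set contributes to ours.
                result |= first(lead, seen + (rule,))
            else:
                # Terminal: it is its own FIRST set.
                result.add(lead)
        return result

    assert first("A") == {"'a'"}
    assert first("B") == {"'b'"}
    assert first("start") == {"'('"}
    # The LL(1) decision from the docstring: on input '(b<)', after consuming
    # '(', the lookahead 'b' is in FIRST(B) but not in FIRST(A), so the
    # parser must continue with rule B.
    assert "'b'" in first("B") and "'b'" not in first("A")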