cpython/Parser/pgen/metaparser.py

"""Parser for the Python metagrammar"""

import io
import tokenize  # from stdlib

from .automata import NFA, NFAState


class GrammarParser:
    """Parser for Python grammar files."""

    _translation_table = {
        tokenize.NAME: "NAME",
        tokenize.STRING: "STRING",
        tokenize.NEWLINE: "NEWLINE",
        tokenize.NL: "NL",
        tokenize.OP: "OP",
        tokenize.ENDMARKER: "ENDMARKER",
        tokenize.COMMENT: "COMMENT",
    }

    def __init__(self, grammar):
        self.grammar = grammar
        grammar_adaptor = io.StringIO(grammar)
        self.generator = tokenize.generate_tokens(grammar_adaptor.readline)
        self._gettoken()  # Initialize lookahead
        self._current_rule_name = None

    def parse(self):
        """Turn the grammar into a collection of NFAs"""
        # grammar: (NEWLINE | rule)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self._gettoken()
            # rule: NAME ':' rhs NEWLINE
            self._current_rule_name = self._expect(tokenize.NAME)
            self._expect(tokenize.OP, ":")
            a, z = self._parse_rhs()
            self._expect(tokenize.NEWLINE)

            yield NFA(a, z)

    def _parse_rhs(self):
        # rhs: items ('|' items)*
        a, z = self._parse_items()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState(self._current_rule_name)
            zz = NFAState(self._current_rule_name)
            while True:
                # Allow to transit directly to the previous state and connect the end of the
                # previous state to the end of the current one, effectively allowing to skip
                # the current state.
                aa.add_arc(a)
                z.add_arc(zz)
                if self.value != "|":
                    break

                self._gettoken()
                a, z = self._parse_items()
            return aa, zz

    def _parse_items(self):
        # items: item+
        a, b = self._parse_item()
        while self.type in (tokenize.NAME, tokenize.STRING) or self.value in ("(", "["):
            c, d = self._parse_item()
            # Allow a transition between the end of the previous state
            # and the beginning of the new one, connecting all the items
            # together. In this way we can only reach the end if we visit
            # all the items.
            b.add_arc(c)
            b = d
        return a, b

    def _parse_item(self):
        # item: '[' rhs ']' | atom ['+' | '*']
        if self.value == "[":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, "]")
            # Make a transition from the beginning to the end so it is possible to
            # advance for free to the next state of this item # without consuming
            # anything from the rhs.
            a.add_arc(z)
            return a, z
        else:
            a, z = self._parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self._gettoken()
            z.add_arc(a)
            if value == "+":
                # Create a cycle to the beginning so we go back to the old state in this
                # item and repeat.
                return a, z
            else:
                # The end state is the same as the beginning, so we can cycle arbitrarily
                # and end in the beginning if necessary.
                return a, a

    def _parse_atom(self):
        # atom: '(' rhs ')' | NAME | STRING
        if self.value == "(":
            self._gettoken()
            a, z = self._parse_rhs()
            self._expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState(self._current_rule_name)
            z = NFAState(self._current_rule_name)
            # We can transit to the next state only if we consume the value.
            a.add_arc(z, self.value)
            self._gettoken()
            return a, z
        else:
            self._raise_error(
                "expected (...) or NAME or STRING, got {} ({})",
                self._translation_table.get(self.type, self.type),
                self.value,
            )

    def _expect(self, type_, value=None):
        if self.type != type_:
            self._raise_error(
                "expected {}, got {} ({})",
                self._translation_table.get(type_, type_),
                self._translation_table.get(self.type, self.type),
                self.value,
            )
        if value is not None and self.value != value:
            self._raise_error("expected {}, got {}", value, self.value)
        value = self.value
        self._gettoken()
        return value

    def _gettoken(self):
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup

    def _raise_error(self, msg, *args):
        if args:
            try:
                msg = msg.format(*args)
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        line = self.grammar.splitlines()[self.begin[0] - 1]
        raise SyntaxError(msg, ("<grammar>", self.begin[0], self.begin[1], line))