# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.

# A fork of `parso.parser`.
# https://github.com/davidhalter/parso/blob/v0.3.4/parso/parser.py
#
# The following changes were made:
# - Typing was added.
# - Error recovery is removed.
# - The Jedi-specific _allowed_transition_names_and_token_types API is removed.
# - Improved error messages by using our exceptions module.
# - node_map/leaf_map were removed in favor of just calling convert_*.
# - convert_node/convert_leaf were renamed to convert_nonterminal/convert_terminal.
# - convert_nonterminal is called regardless of the number of children. Parso avoids
#   calling it in some cases to avoid creating extra nodes.
# - The parser is constructed with the tokens to allow us to track a bit more state.
#   As a consequence, the parser may only be used once.
# - Supports our custom Token class, instead of `parso.python.tokenize.Token`.

from dataclasses import dataclass, field
from typing import Generic, Iterable, List, Sequence, TypeVar, Union

from libcst._exceptions import EOFSentinel, ParserSyntaxError, PartialParserSyntaxError
from libcst._parser._parsing_check import get_expected_str
from libcst._parser.parso.pgen2.generator import DFAState, Grammar, ReservedString
from libcst._parser.parso.python.token import TokenType
from libcst._parser.types.token import Token

_NodeT = TypeVar("_NodeT")
_TokenTypeT = TypeVar("_TokenTypeT", bound=TokenType)
_TokenT = TypeVar("_TokenT", bound=Token)


@dataclass(frozen=False)
class StackNode(Generic[_TokenTypeT, _NodeT]):
    dfa: "DFAState[_TokenTypeT]"
    nodes: List[_NodeT] = field(default_factory=list)

    @property
    def nonterminal(self) -> str:
        return self.dfa.from_rule


def _token_to_transition(
    grammar: "Grammar[_TokenTypeT]", type_: _TokenTypeT, value: str
) -> Union[ReservedString, _TokenTypeT]:
    # Map from token to label
    if type_.contains_syntax:
        # Check for reserved words (keywords)
        try:
            return grammar.reserved_syntax_strings[value]
        except KeyError:
            pass

    return type_

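# A hypothetical illustration of the mapping above (`NAME` stands for the NAME
# TokenType and `grammar` for a compiled Grammar; neither is defined here).
# Token types that can contain syntax (e.g. names) are first checked against
# the grammar's reserved strings, so keywords get their own transition label:
#
#   _token_to_transition(grammar, NAME, "if")   # -> ReservedString for "if"
#   _token_to_transition(grammar, NAME, "foo")  # -> NAME (not reserved)

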
# TODO: This should be an ABC, but there's a metaclass conflict between Generic and ABC
# that's fixed in Python 3.7.
class BaseParser(Generic[_TokenT, _TokenTypeT, _NodeT]):
    """Parser engine.

    A Parser instance contains state pertaining to the current token
    sequence, and should not be used concurrently by different threads
    to parse separate token sequences.

    See python/tokenize.py for how to get input tokens from a string.
    """

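    # A hypothetical usage sketch (`ConcreteParser`, `tokenize_lines`, and
    # `grammar` are illustrative stand-ins, not names defined in this module):
    # a subclass implements convert_nonterminal/convert_terminal, then parses
    # exactly once:
    #
    #   parser = ConcreteParser(
    #       tokens=tokenize_lines(lines),
    #       lines=lines,
    #       pgen_grammar=grammar,
    #       start_nonterminal="file_input",
    #   )
    #   tree = parser.parse()  # a second call raises ValueError
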
    tokens: Iterable[_TokenT]
    lines: Sequence[str]  # used when generating parse errors
    _pgen_grammar: "Grammar[_TokenTypeT]"
    stack: List[StackNode[_TokenTypeT, _NodeT]]
    # Keep track of whether parse was called. Because a parser may keep global
    # mutable state, each BaseParser instance should only be used once.
    __was_parse_called: bool

    def __init__(
        self,
        *,
        tokens: Iterable[_TokenT],
        lines: Sequence[str],
        pgen_grammar: "Grammar[_TokenTypeT]",
        start_nonterminal: str,
    ) -> None:
        self.tokens = tokens
        self.lines = lines
        self._pgen_grammar = pgen_grammar
        first_dfa = pgen_grammar.nonterminal_to_dfas[start_nonterminal][0]
        self.stack = [StackNode(first_dfa)]
        self.__was_parse_called = False

    def parse(self) -> _NodeT:
        # Ensure that we don't re-use parsers.
        if self.__was_parse_called:
            raise ValueError("Each parser object may only be used to parse once.")
        self.__was_parse_called = True

        for token in self.tokens:
            self._add_token(token)

        while True:
            tos = self.stack[-1]
            if not tos.dfa.is_final:
                expected_str = get_expected_str(
                    EOFSentinel.EOF, tos.dfa.transitions.keys()
                )
                raise ParserSyntaxError(
                    f"Incomplete input. {expected_str}",
                    lines=self.lines,
                    raw_line=len(self.lines),
                    raw_column=len(self.lines[-1]),
                )

            if len(self.stack) > 1:
                self._pop()
            else:
                return self.convert_nonterminal(tos.nonterminal, tos.nodes)

    def convert_nonterminal(
        self, nonterminal: str, children: Sequence[_NodeT]
    ) -> _NodeT: ...

    def convert_terminal(self, token: _TokenT) -> _NodeT: ...

    def _add_token(self, token: _TokenT) -> None:
        """
        This is the core function for parsing; essentially everything happens
        here. Everything is prepared in advance by the parser generator, so we
        only apply the necessary steps here.
        """
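        # In outline (a sketch of the loop below, not additional behavior):
        # while the DFA on top of the stack has no transition for this token,
        # pop (reduce) finished nonterminals; once a transition is found,
        # follow it, push DFAs for any newly entered nonterminals, and append
        # the converted terminal to the node on top of the stack.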
        grammar = self._pgen_grammar
        stack = self.stack
        # pyre-fixme[6]: Expected `_TokenTypeT` for 2nd param but got `TokenType`.
        transition = _token_to_transition(grammar, token.type, token.string)

        while True:
            try:
                plan = stack[-1].dfa.transitions[transition]
                break
            except KeyError:
                if stack[-1].dfa.is_final:
                    try:
                        self._pop()
                    except PartialParserSyntaxError as ex:
                        # Upconvert the PartialParserSyntaxError to a ParserSyntaxError
                        # by backfilling the line/column information.
                        raise ParserSyntaxError(
                            ex.message,
                            lines=self.lines,
                            raw_line=token.start_pos[0],
                            raw_column=token.start_pos[1],
                        )
                    except Exception as ex:
                        # convert_nonterminal may fail due to a bug in our code. Try to
                        # recover enough to at least tell us where in the file it
                        # failed.
                        raise ParserSyntaxError(
                            f"Internal error: {ex}",
                            lines=self.lines,
                            raw_line=token.start_pos[0],
                            raw_column=token.start_pos[1],
                        )
                else:
                    # We never broke out -- EOF is too soon -- Unfinished statement.
                    #
                    # BUG: The `expected_str` may not be complete because we already
                    # popped the other possibilities off the stack at this point, but
                    # it still seems useful to list some of the possibilities that we
                    # could've expected.
                    expected_str = get_expected_str(
                        token, stack[-1].dfa.transitions.keys()
                    )
                    raise ParserSyntaxError(
                        f"Incomplete input. {expected_str}",
                        lines=self.lines,
                        raw_line=token.start_pos[0],
                        raw_column=token.start_pos[1],
                    )
            except IndexError:
                # I don't think this will ever happen with Python's grammar, because if
                # there are any extra tokens at the end of the input, we'll instead
                # complain that we expected ENDMARKER.
                #
                # However, let's leave it just in case.
                expected_str = get_expected_str(token, EOFSentinel.EOF)
                raise ParserSyntaxError(
                    f"Too much input. {expected_str}",
                    lines=self.lines,
                    raw_line=token.start_pos[0],
                    raw_column=token.start_pos[1],
                )

        # Logically, `plan` is always defined, but pyre can't reasonably determine that.
        stack[-1].dfa = plan.next_dfa

        for push in plan.dfa_pushes:
            stack.append(StackNode(push))

        leaf = self.convert_terminal(token)
        stack[-1].nodes.append(leaf)

    def _pop(self) -> None:
        tos = self.stack.pop()
        # Unlike parso and lib2to3, we call `convert_nonterminal` unconditionally
        # instead of only when we have more than one child. This allows us to create a
        # far more consistent and predictable tree.
        new_node = self.convert_nonterminal(tos.dfa.from_rule, tos.nodes)
        self.stack[-1].nodes.append(new_node)

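    # To illustrate the note above with a hypothetical example (the rule names
    # are illustrative): when a chain of single-child rules like
    # `expr -> term -> factor` matches a lone token, this parser still emits
    # one node per rule via convert_nonterminal, whereas parso and lib2to3
    # would collapse the chain into its single child.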