Mirror of https://github.com/Instagram/LibCST.git
Synced 2025-12-23 10:35:53 +00:00

If the source had more closing braces than opening braces, we used to raise an IndexError due to an empty stack. This fixes that bug.
213 lines · 7.6 KiB · Python

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

"""
Parso's tokenize doesn't give us tokens in the format that we'd ideally like, so this
performs a small number of transformations to the token stream:

- `end_pos` is precomputed as a property, instead of lazily as a method, for more
  efficient access.
- `whitespace_before` and `whitespace_after` have been added. These include the correct
  indentation information.
- `prefix` is removed, since we don't use it anywhere.
- `ERRORTOKEN` and `ERROR_DEDENT` have been removed, because we don't intend to support
  error recovery. If we encounter token errors, we'll raise a ParserSyntaxError instead.

If performance becomes a concern, we can rewrite this later as a fork of the original
tokenize module, instead of as a wrapper.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Generator, List, Optional, Sequence

from parso.python.token import PythonTokenTypes, TokenType
from parso.python.tokenize import (
    Token as OrigToken,
    tokenize_lines as orig_tokenize_lines,
)
from parso.utils import PythonVersionInfo, split_lines

from libcst._add_slots import add_slots
from libcst._exceptions import ParserSyntaxError
from libcst._parser._types.token import Token
from libcst._parser._types.whitespace_state import WhitespaceState


_ERRORTOKEN: TokenType = PythonTokenTypes.ERRORTOKEN
_ERROR_DEDENT: TokenType = PythonTokenTypes.ERROR_DEDENT

_INDENT: TokenType = PythonTokenTypes.INDENT
_DEDENT: TokenType = PythonTokenTypes.DEDENT
_ENDMARKER: TokenType = PythonTokenTypes.ENDMARKER

_FSTRING_START: TokenType = PythonTokenTypes.FSTRING_START
_FSTRING_END: TokenType = PythonTokenTypes.FSTRING_END

_OP: TokenType = PythonTokenTypes.OP
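
# Presumably a micro-optimization: binding these as module-level constants
# avoids repeated attribute lookups on PythonTokenTypes in the hot
# _convert_token path.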


class _ParenthesisOrFStringStackEntry(Enum):
    PARENTHESIS = 0
    FSTRING = 1


_PARENTHESIS_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.PARENTHESIS
)
_FSTRING_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.FSTRING
)
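
# Note: PARENTHESIS and FSTRING must be distinct Enum members. If FSTRING
# shared PARENTHESIS's value, Enum aliasing would make the two compare equal,
# and the is_parenthesized check in _convert_token could never tell an
# f-string apart from a parenthesized context.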


@add_slots
@dataclass(frozen=False)
class _TokenizeState:
    lines: Sequence[str]
    previous_whitespace_state: WhitespaceState = field(
        default_factory=lambda: WhitespaceState(
            line=1, column=0, absolute_indent="", is_parenthesized=False
        )
    )
    indents: List[str] = field(default_factory=lambda: [""])
    parenthesis_or_fstring_stack: List[_ParenthesisOrFStringStackEntry] = field(
        default_factory=list
    )
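
# A sketch of the mutable state: `indents` holds the absolute indentation
# string for each open indentation level (starting at "" for module scope),
# and `parenthesis_or_fstring_stack` mirrors parso's internal bracket/f-string
# nesting so we can tell the whitespace parser when newlines are implicitly
# allowed.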


def tokenize(
    code: str, version_info: PythonVersionInfo
) -> Generator[Token, None, None]:
    lines = split_lines(code, keepends=True)
    return tokenize_lines(lines, version_info)
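

# Illustrative usage, assuming parso's parse_version_string helper for
# building a PythonVersionInfo (this example is a sketch, not part of the
# module):
#
#   from parso.utils import parse_version_string
#
#   for token in tokenize("x = 1\n", parse_version_string("3.7")):
#       print(token.type, repr(token.string), token.start_pos, token.end_pos)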


def tokenize_lines(
    lines: Sequence[str], version_info: PythonVersionInfo
) -> Generator[Token, None, None]:
    state = _TokenizeState(lines)
    orig_tokens_iter = iter(orig_tokenize_lines(lines, version_info))

    # Iterate over the tokens and pass them to _convert_token, providing a one-token
    # lookahead, to enable proper indent handling.
    try:
        curr_token = next(orig_tokens_iter)
    except StopIteration:
        pass  # empty file
    else:
        for next_token in orig_tokens_iter:
            yield _convert_token(state, curr_token, next_token)
            curr_token = next_token
        yield _convert_token(state, curr_token, None)
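

# The one-token lookahead above lets _convert_token update the indent stack
# while handling the token *before* an INDENT or DEDENT, so that token's
# whitespace_after already reflects the new indentation level.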


def _convert_token(  # noqa: C901: too complex
    state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken]
) -> Token:
    ct_type = curr_token.type
    ct_string = curr_token.string
    ct_start_pos = curr_token.start_pos
    if ct_type is _ERRORTOKEN:
        raise ParserSyntaxError(
            f"{ct_string!r} is not a valid token.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )
    if ct_type is _ERROR_DEDENT:
        raise ParserSyntaxError(
            "Inconsistent indentation. Expected a dedent.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )

    # Compute relative indent changes for indent/dedent nodes
    relative_indent: Optional[str] = None
    if ct_type is _INDENT:
        old_indent = "" if len(state.indents) < 2 else state.indents[-2]
        new_indent = state.indents[-1]
        relative_indent = new_indent[len(old_indent) :]

    if next_token is not None:
        nt_type = next_token.type
        if nt_type is _INDENT:
            nt_line, nt_column = next_token.start_pos
            state.indents.append(state.lines[nt_line - 1][:nt_column])
        elif nt_type is _DEDENT:
            state.indents.pop()
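
    # For example (illustrative): stepping from a 4-space block into an 8-space
    # block pushes the 8-space string onto state.indents, and the INDENT
    # token's relative_indent becomes the newly added 4-space suffix.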

    whitespace_before = state.previous_whitespace_state

    if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER:
        # Don't update whitespace state for these dummy tokens. This makes it possible
        # to partially parse whitespace for IndentedBlock footers, and then parse the
        # rest of the whitespace in the following statement's leading_lines.
        # Unfortunately, that means that the indentation is either wrong for the footer
        # comments, or for the next line. We've chosen to allow it to be wrong for the
        # IndentedBlock footer and manually override the state when parsing whitespace
        # in that particular node.
        whitespace_after = whitespace_before
        ct_end_pos = ct_start_pos
    else:
        # Not a dummy token, so update the whitespace state.

        # Compute our own end_pos, since parso's end_pos is wrong for
        # triple-quoted strings.
        lines = split_lines(ct_string)
        if len(lines) > 1:
            ct_end_pos = ct_start_pos[0] + len(lines) - 1, len(lines[-1])
        else:
            ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string))
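
        # For example (illustrative): a token '"""a\nb"""' starting at (1, 0)
        # spans two physical lines, so ct_end_pos is (2, 4): line 2, just past
        # the closing quotes.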

        # Figure out what mode the whitespace parser should use. If we're inside
        # parentheses, certain whitespace (e.g. newlines) is allowed where it would
        # otherwise not be. f-strings override and disable this behavior, however.
        #
        # Parso's tokenizer tracks this internally, but doesn't expose it, so we
        # have to duplicate that logic here.

        pof_stack = state.parenthesis_or_fstring_stack
        try:
            if ct_type is _FSTRING_START:
                pof_stack.append(_FSTRING_STACK_ENTRY)
            elif ct_type is _FSTRING_END:
                pof_stack.pop()
            elif ct_type is _OP:
                if ct_string in "([{":
                    pof_stack.append(_PARENTHESIS_STACK_ENTRY)
                elif ct_string in ")]}":
                    pof_stack.pop()
        except IndexError:
            # pof_stack may be empty by the time we need to pop from it due to
            # mismatched braces, so convert the IndexError into a syntax error.
            raise ParserSyntaxError(
                "Encountered a closing brace without a matching opening brace.",
                lines=state.lines,
                raw_line=ct_start_pos[0],
                raw_column=ct_start_pos[1],
            )
        is_parenthesized = (
            len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY
        )
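
        # For example (illustrative): in (f"a" + b), the OP "(" pushes a
        # PARENTHESIS entry and FSTRING_START pushes an FSTRING entry on top of
        # it, so the f-string's contents are not treated as parenthesized,
        # while the later NAME token b is.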

        whitespace_after = WhitespaceState(
            ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized
        )

    # Hold onto whitespace_after, so we can use it as whitespace_before in the
    # next node.
    state.previous_whitespace_state = whitespace_after

    return Token(
        ct_type,
        ct_string,
        ct_start_pos,
        ct_end_pos,
        whitespace_before,
        whitespace_after,
        relative_indent,
    )