# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Parso's tokenize doesn't give us tokens in the format that we'd ideally like, so this
module performs a small number of transformations to the token stream:

- `end_pos` is precomputed as a property, instead of lazily as a method, for more
  efficient access.
- `whitespace_before` and `whitespace_after` have been added. These include the correct
  indentation information.
- `prefix` is removed, since we don't use it anywhere.
- `ERRORTOKEN` and `ERROR_DEDENT` have been removed, because we don't intend to support
  error recovery. If we encounter token errors, we'll raise a ParserSyntaxError instead.

If performance becomes a concern, we can rewrite this later as a fork of the original
tokenize module, instead of as a wrapper.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import Generator, Iterator, List, Optional, Sequence

from libcst._add_slots import add_slots
from libcst._exceptions import ParserSyntaxError
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.python.tokenize import (
    Token as OrigToken,
    tokenize_lines as orig_tokenize_lines,
)
from libcst._parser.parso.utils import PythonVersionInfo, split_lines
from libcst._parser.types.token import Token
from libcst._parser.types.whitespace_state import WhitespaceState

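# Module-level aliases for the token types used below. Presumably this avoids
# repeated attribute lookups in the hot tokenization loop; the explicit
# TokenType annotations also help type checkers.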
_ERRORTOKEN: TokenType = PythonTokenTypes.ERRORTOKEN
_ERROR_DEDENT: TokenType = PythonTokenTypes.ERROR_DEDENT

_INDENT: TokenType = PythonTokenTypes.INDENT
_DEDENT: TokenType = PythonTokenTypes.DEDENT
_ENDMARKER: TokenType = PythonTokenTypes.ENDMARKER

_FSTRING_START: TokenType = PythonTokenTypes.FSTRING_START
_FSTRING_END: TokenType = PythonTokenTypes.FSTRING_END

_OP: TokenType = PythonTokenTypes.OP


class _ParenthesisOrFStringStackEntry(Enum):
    PARENTHESIS = 0
    # Values must be distinct: with a duplicate value of 0, Enum aliasing
    # would make FSTRING the same member as PARENTHESIS.
    FSTRING = 1


_PARENTHESIS_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.PARENTHESIS
)
_FSTRING_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.FSTRING
)


@add_slots
@dataclass(frozen=False)
class _TokenizeState:
    lines: Sequence[str]
    previous_whitespace_state: WhitespaceState = field(
        default_factory=lambda: WhitespaceState(
            line=1, column=0, absolute_indent="", is_parenthesized=False
        )
    )
    indents: List[str] = field(default_factory=lambda: [""])
    parenthesis_or_fstring_stack: List[_ParenthesisOrFStringStackEntry] = field(
        default_factory=list
    )


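# A minimal usage sketch (illustrative; exact token streams may differ between
# the native and pure-Python tokenizers):
#
#   for tok in tokenize("x = 1\n", PythonVersionInfo(3, 8)):
#       print(tok.type, repr(tok.string), tok.start_pos, tok.end_pos)
#
# Each yielded Token also carries `whitespace_before`/`whitespace_after`
# WhitespaceState objects and, for INDENT tokens, a `relative_indent` string.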
def tokenize(code: str, version_info: PythonVersionInfo) -> Iterator[Token]:
    try:
        from libcst_native import tokenize as native_tokenize

        return native_tokenize.tokenize(code)
    except ImportError:
        lines = split_lines(code, keepends=True)
        return tokenize_lines(code, lines, version_info)


def tokenize_lines(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Iterator[Token]:
    try:
        from libcst_native import tokenize as native_tokenize

        # TODO: pass through version_info
        return native_tokenize.tokenize(code)
    except ImportError:
        return tokenize_lines_py(code, lines, version_info)


def tokenize_lines_py(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Generator[Token, None, None]:
    state = _TokenizeState(lines)
    orig_tokens_iter = iter(orig_tokenize_lines(lines, version_info))

    # Iterate over the tokens and pass them to _convert_token, providing a
    # one-token lookahead, to enable proper indent handling.
    try:
        curr_token = next(orig_tokens_iter)
    except StopIteration:
        pass  # empty file
    else:
        for next_token in orig_tokens_iter:
            yield _convert_token(state, curr_token, next_token)
            curr_token = next_token
        yield _convert_token(state, curr_token, None)


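# Worked example for the indent bookkeeping below (illustrative): tokenizing
# "if x:\n    y\n" produces an INDENT token before `y` whose relative_indent
# is "    " (the new indent minus the enclosing ""), while the matching
# DEDENT's relative_indent stays None.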
def _convert_token(  # noqa: C901: too complex
    state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken]
) -> Token:
    ct_type = curr_token.type
    ct_string = curr_token.string
    ct_start_pos = curr_token.start_pos
    if ct_type is _ERRORTOKEN:
        raise ParserSyntaxError(
            f"{ct_string!r} is not a valid token.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )
    if ct_type is _ERROR_DEDENT:
        raise ParserSyntaxError(
            "Inconsistent indentation. Expected a dedent.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )

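    # Background: parso emits ERRORTOKEN for lexically invalid input (e.g. a
    # stray `$`) and ERROR_DEDENT for a dedent that doesn't line up with any
    # enclosing indentation level; both become hard errors here.
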
    # Compute relative indent changes for indent/dedent nodes
    relative_indent: Optional[str] = None
    if ct_type is _INDENT:
        old_indent = "" if len(state.indents) < 2 else state.indents[-2]
        new_indent = state.indents[-1]
        relative_indent = new_indent[len(old_indent) :]

    if next_token is not None:
        nt_type = next_token.type
        if nt_type is _INDENT:
            nt_line, nt_column = next_token.start_pos
            state.indents.append(state.lines[nt_line - 1][:nt_column])
        elif nt_type is _DEDENT:
            state.indents.pop()

    whitespace_before = state.previous_whitespace_state

    if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER:
        # Don't update whitespace state for these dummy tokens. This makes it
        # possible to partially parse whitespace for IndentedBlock footers, and
        # then parse the rest of the whitespace in the following statement's
        # leading_lines. Unfortunately, that means that the indentation is
        # either wrong for the footer comments, or for the next line. We've
        # chosen to allow it to be wrong for the IndentedBlock footer and
        # manually override the state when parsing whitespace in that
        # particular node.
        whitespace_after = whitespace_before
        ct_end_pos = ct_start_pos
    else:
        # Not a dummy token, so update the whitespace state.

        # Compute our own end_pos, since parso's end_pos is wrong for
        # triple-quoted strings.
        lines = split_lines(ct_string)
        if len(lines) > 1:
            ct_end_pos = (ct_start_pos[0] + len(lines) - 1, len(lines[-1]))
        else:
            ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string))
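        # Worked example (illustrative): a three-line string token starting at
        # (5, 4) with lines ["'''a", "b", "c'''"] ends at
        # (5 + 3 - 1, len("c'''")) == (7, 4).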

        # Figure out what mode the whitespace parser should use. If we're
        # inside parentheses, certain whitespace (e.g. newlines) is allowed
        # where it would otherwise not be. f-strings override and disable this
        # behavior, however.
        #
        # Parso's tokenizer tracks this internally, but doesn't expose it, so
        # we have to duplicate that logic here.

        pof_stack = state.parenthesis_or_fstring_stack
        try:
            if ct_type is _FSTRING_START:
                pof_stack.append(_FSTRING_STACK_ENTRY)
            elif ct_type is _FSTRING_END:
                pof_stack.pop()
            elif ct_type is _OP:
                if ct_string in "([{":
                    pof_stack.append(_PARENTHESIS_STACK_ENTRY)
                elif ct_string in ")]}":
                    pof_stack.pop()
        except IndexError:
            # pof_stack may be empty by the time we need to pop from it, due to
            # mismatched braces in the input.
            raise ParserSyntaxError(
                "Encountered a closing brace without a matching opening brace.",
                lines=state.lines,
                raw_line=ct_start_pos[0],
                raw_column=ct_start_pos[1],
            )
        is_parenthesized = (
            len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY
        )
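        # Illustrative: after the `(` in "(1,\n 2)", the stack top is
        # PARENTHESIS, so the following newline is parsed as parenthesized
        # whitespace; directly inside an f-string (stack top FSTRING) it
        # would not be.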

        whitespace_after = WhitespaceState(
            ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized
        )

    # Hold onto whitespace_after, so we can use it as whitespace_before in the
    # next node.
    state.previous_whitespace_state = whitespace_after

    return Token(
        ct_type,
        ct_string,
        ct_start_pos,
        ct_end_pos,
        whitespace_before,
        whitespace_after,
        relative_indent,
    )