LibCST/libcst/_parser/wrapped_tokenize.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Parso's tokenize doesn't give us tokens in the format that we'd ideally like, so this
performs a small number of transformations to the token stream:
- `end_pos` is precomputed as a property, instead of lazily as a method, for more
efficient access.
- `whitespace_before` and `whitespace_after` have been added. These include the correct
indentation information.
- `prefix` is removed, since we don't use it anywhere.
- `ERRORTOKEN` and `ERROR_DEDENT` have been removed, because we don't intend to support
error recovery. If we encounter token errors, we'll raise a ParserSyntaxError instead.
If performance becomes a concern, we can rewrite this later as a fork of the original
tokenize module, instead of as a wrapper.
"""
from dataclasses import dataclass, field
from enum import Enum
from typing import Generator, Iterator, List, Optional, Sequence

from libcst._add_slots import add_slots
from libcst._exceptions import ParserSyntaxError
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.python.tokenize import (
    Token as OrigToken,
    tokenize_lines as orig_tokenize_lines,
)
from libcst._parser.parso.utils import PythonVersionInfo, split_lines
from libcst._parser.types.token import Token
from libcst._parser.types.whitespace_state import WhitespaceState

_ERRORTOKEN: TokenType = PythonTokenTypes.ERRORTOKEN
_ERROR_DEDENT: TokenType = PythonTokenTypes.ERROR_DEDENT
_INDENT: TokenType = PythonTokenTypes.INDENT
_DEDENT: TokenType = PythonTokenTypes.DEDENT
_ENDMARKER: TokenType = PythonTokenTypes.ENDMARKER
_FSTRING_START: TokenType = PythonTokenTypes.FSTRING_START
_FSTRING_END: TokenType = PythonTokenTypes.FSTRING_END
_OP: TokenType = PythonTokenTypes.OP


class _ParenthesisOrFStringStackEntry(Enum):
    PARENTHESIS = 0
    # This must be a distinct value: if it were also 0, Enum would treat
    # FSTRING as an alias of PARENTHESIS, and the is_parenthesized check in
    # _convert_token would wrongly treat f-strings as parentheses.
    FSTRING = 1


_PARENTHESIS_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.PARENTHESIS
)
_FSTRING_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.FSTRING
)


@add_slots
@dataclass(frozen=False)
class _TokenizeState:
    lines: Sequence[str]
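    # Whitespace parser state carried over from the previous token, seeded at
    # the start of the file (line 1, column 0, no indentation).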
    previous_whitespace_state: WhitespaceState = field(
        default_factory=lambda: WhitespaceState(
            line=1, column=0, absolute_indent="", is_parenthesized=False
        )
    )
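    # Stack of absolute indentation strings; the bottom entry is the
    # module-level (empty) indent.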
    indents: List[str] = field(default_factory=lambda: [""])
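    # Mirrors the parenthesis/f-string nesting that parso's tokenizer tracks
    # internally but doesn't expose (see _convert_token below).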
    parenthesis_or_fstring_stack: List[_ParenthesisOrFStringStackEntry] = field(
        default_factory=list
    )


def tokenize(code: str, version_info: PythonVersionInfo) -> Iterator[Token]:
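    # Prefer the native tokenizer when the libcst_native extension is
    # installed; otherwise fall back to the pure Python implementation.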
    try:
        from libcst_native import tokenize as native_tokenize

        return native_tokenize.tokenize(code)
    except ImportError:
        lines = split_lines(code, keepends=True)
        return tokenize_lines(code, lines, version_info)


def tokenize_lines(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Iterator[Token]:
    try:
        from libcst_native import tokenize as native_tokenize

        # TODO: pass through version_info
        return native_tokenize.tokenize(code)
    except ImportError:
        return tokenize_lines_py(code, lines, version_info)


def tokenize_lines_py(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Generator[Token, None, None]:
    state = _TokenizeState(lines)
    orig_tokens_iter = iter(orig_tokenize_lines(lines, version_info))

    # Iterate over the tokens and pass them to _convert_token, providing a one-token
    # lookahead, to enable proper indent handling.
    try:
        curr_token = next(orig_tokens_iter)
    except StopIteration:
        pass  # empty file
    else:
        for next_token in orig_tokens_iter:
            yield _convert_token(state, curr_token, next_token)
            curr_token = next_token
        yield _convert_token(state, curr_token, None)


def _convert_token(  # noqa: C901: too complex
    state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken]
) -> Token:
    ct_type = curr_token.type
    ct_string = curr_token.string
    ct_start_pos = curr_token.start_pos
    if ct_type is _ERRORTOKEN:
        raise ParserSyntaxError(
            f"{ct_string!r} is not a valid token.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )
    if ct_type is _ERROR_DEDENT:
        raise ParserSyntaxError(
            "Inconsistent indentation. Expected a dedent.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )

    # Compute relative indent changes for indent/dedent nodes
    relative_indent: Optional[str] = None
    if ct_type is _INDENT:
        old_indent = "" if len(state.indents) < 2 else state.indents[-2]
        new_indent = state.indents[-1]
        relative_indent = new_indent[len(old_indent) :]
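    # Use the one-token lookahead to keep the indent stack current: an
    # upcoming INDENT pushes that line's leading whitespace, an upcoming
    # DEDENT pops the innermost indent.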
    if next_token is not None:
        nt_type = next_token.type
        if nt_type is _INDENT:
            nt_line, nt_column = next_token.start_pos
            state.indents.append(state.lines[nt_line - 1][:nt_column])
        elif nt_type is _DEDENT:
            state.indents.pop()

    whitespace_before = state.previous_whitespace_state

    if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER:
        # Don't update whitespace state for these dummy tokens. This makes it possible
        # to partially parse whitespace for IndentedBlock footers, and then parse the
        # rest of the whitespace in the following statement's leading_lines.
        # Unfortunately, that means that the indentation is either wrong for the footer
        # comments, or for the next line. We've chosen to allow it to be wrong for the
        # IndentedBlock footer and manually override the state when parsing whitespace
        # in that particular node.
        whitespace_after = whitespace_before
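        # Dummy tokens are zero-width, so they end where they start.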
        ct_end_pos = ct_start_pos
    else:
        # Not a dummy token, so update the whitespace state.

        # Compute our own end_pos, since parso's end_pos is wrong for triple-strings.
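        # For example (illustrative values): the triple-quoted string
        # '"""a\nb"""' starting at (1, 0) splits into two lines, so its
        # end_pos is (1 + 2 - 1, len('b"""')) == (2, 4).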
        lines = split_lines(ct_string)
        if len(lines) > 1:
            ct_end_pos = (ct_start_pos[0] + len(lines) - 1, len(lines[-1]))
        else:
            ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string))

        # Figure out what mode the whitespace parser should use. If we're inside
        # parentheses, certain whitespace (e.g. newlines) is allowed where it would
        # otherwise not be. f-strings override and disable this behavior, however.
        #
        # Parso's tokenizer tracks this internally, but doesn't expose it, so we have
        # to duplicate that logic here.
        pof_stack = state.parenthesis_or_fstring_stack
        try:
            if ct_type is _FSTRING_START:
                pof_stack.append(_FSTRING_STACK_ENTRY)
            elif ct_type is _FSTRING_END:
                pof_stack.pop()
            elif ct_type is _OP:
                if ct_string in "([{":
                    pof_stack.append(_PARENTHESIS_STACK_ENTRY)
                elif ct_string in ")]}":
                    pof_stack.pop()
        except IndexError:
            # pof_stack may be empty by the time we need to read from it due to
            # mismatched braces.
            raise ParserSyntaxError(
                "Encountered a closing brace without a matching opening brace.",
                lines=state.lines,
                raw_line=ct_start_pos[0],
                raw_column=ct_start_pos[1],
            )
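        # Only the innermost entry counts: an f-string entry on top of the
        # stack masks any outer parentheses.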
        is_parenthesized = (
            len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY
        )
        whitespace_after = WhitespaceState(
            ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized
        )

    # Hold onto whitespace_after, so we can use it as whitespace_before in the
    # next node.
    state.previous_whitespace_state = whitespace_after

    return Token(
        ct_type,
        ct_string,
        ct_start_pos,
        ct_end_pos,
        whitespace_before,
        whitespace_after,
        relative_indent,
    )