LibCST/libcst/_parser/wrapped_tokenize.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Parso's tokenize doesn't give us tokens in the format that we'd ideally like, so this
performs a small number of transformations to the token stream:
- `end_pos` is precomputed as a property, instead of lazily as a method, for more
efficient access.
- `whitespace_before` and `whitespace_after` have been added. These include the correct
indentation information.
- `prefix` is removed, since we don't use it anywhere.
- `ERRORTOKEN` and `ERROR_DEDENT` have been removed, because we don't intend to support
error recovery. If we encounter token errors, we'll raise a ParserSyntaxError instead.
If performance becomes a concern, we can rewrite this later as a fork of the original
tokenize module, instead of as a wrapper.
"""
from dataclasses import dataclass, field
from enum import Enum
from typing import Generator, Iterator, List, Optional, Sequence

from libcst._add_slots import add_slots
from libcst._exceptions import ParserSyntaxError
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.python.tokenize import (
    Token as OrigToken,
    tokenize_lines as orig_tokenize_lines,
)
from libcst._parser.parso.utils import PythonVersionInfo, split_lines
from libcst._parser.types.token import Token
from libcst._parser.types.whitespace_state import WhitespaceState

_ERRORTOKEN: TokenType = PythonTokenTypes.ERRORTOKEN
_ERROR_DEDENT: TokenType = PythonTokenTypes.ERROR_DEDENT
_INDENT: TokenType = PythonTokenTypes.INDENT
_DEDENT: TokenType = PythonTokenTypes.DEDENT
_ENDMARKER: TokenType = PythonTokenTypes.ENDMARKER
_FSTRING_START: TokenType = PythonTokenTypes.FSTRING_START
_FSTRING_END: TokenType = PythonTokenTypes.FSTRING_END
_OP: TokenType = PythonTokenTypes.OP


class _ParenthesisOrFStringStackEntry(Enum):
    PARENTHESIS = 0
    # This must be a distinct value: if it were also 0, Enum would treat
    # FSTRING as an alias of PARENTHESIS, and the is_parenthesized check in
    # _convert_token would wrongly treat f-strings as parentheses.
    FSTRING = 1


_PARENTHESIS_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.PARENTHESIS
)
_FSTRING_STACK_ENTRY: _ParenthesisOrFStringStackEntry = (
    _ParenthesisOrFStringStackEntry.FSTRING
)


@add_slots
@dataclass(frozen=False)
class _TokenizeState:
    lines: Sequence[str]
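    # Whitespace parser state carried over from the previous token, seeded at
    # the start of the file (line 1, column 0, no indentation).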
    previous_whitespace_state: WhitespaceState = field(
        default_factory=lambda: WhitespaceState(
            line=1, column=0, absolute_indent="", is_parenthesized=False
        )
    )
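    # Stack of absolute indentation strings; the bottom entry is the
    # module-level (empty) indent.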
    indents: List[str] = field(default_factory=lambda: [""])
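    # Mirrors the parenthesis/f-string nesting that parso's tokenizer tracks
    # internally but doesn't expose (see _convert_token below).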
    parenthesis_or_fstring_stack: List[_ParenthesisOrFStringStackEntry] = field(
        default_factory=list
    )


def tokenize(code: str, version_info: PythonVersionInfo) -> Iterator[Token]:
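    # Prefer the native tokenizer when the libcst_native extension is
    # installed; otherwise fall back to the pure Python implementation.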
    try:
        from libcst_native import tokenize as native_tokenize

        return native_tokenize.tokenize(code)
    except ImportError:
        lines = split_lines(code, keepends=True)
        return tokenize_lines(code, lines, version_info)


def tokenize_lines(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Iterator[Token]:
    try:
        from libcst_native import tokenize as native_tokenize

        # TODO: pass through version_info
        return native_tokenize.tokenize(code)
    except ImportError:
        return tokenize_lines_py(code, lines, version_info)


def tokenize_lines_py(
    code: str, lines: Sequence[str], version_info: PythonVersionInfo
) -> Generator[Token, None, None]:
    state = _TokenizeState(lines)
    orig_tokens_iter = iter(orig_tokenize_lines(lines, version_info))

    # Iterate over the tokens and pass them to _convert_token, providing a one-token
    # lookahead, to enable proper indent handling.
    try:
        curr_token = next(orig_tokens_iter)
    except StopIteration:
        pass  # empty file
    else:
        for next_token in orig_tokens_iter:
            yield _convert_token(state, curr_token, next_token)
            curr_token = next_token
        yield _convert_token(state, curr_token, None)


def _convert_token(  # noqa: C901: too complex
    state: _TokenizeState, curr_token: OrigToken, next_token: Optional[OrigToken]
) -> Token:
    ct_type = curr_token.type
    ct_string = curr_token.string
    ct_start_pos = curr_token.start_pos
    if ct_type is _ERRORTOKEN:
        raise ParserSyntaxError(
            f"{ct_string!r} is not a valid token.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )
    if ct_type is _ERROR_DEDENT:
        raise ParserSyntaxError(
            "Inconsistent indentation. Expected a dedent.",
            lines=state.lines,
            raw_line=ct_start_pos[0],
            raw_column=ct_start_pos[1],
        )

    # Compute relative indent changes for indent/dedent nodes
    relative_indent: Optional[str] = None
    if ct_type is _INDENT:
        old_indent = "" if len(state.indents) < 2 else state.indents[-2]
        new_indent = state.indents[-1]
        relative_indent = new_indent[len(old_indent) :]
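    # Use the one-token lookahead to keep the indent stack current: an
    # upcoming INDENT pushes that line's leading whitespace, an upcoming
    # DEDENT pops the innermost indent.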
    if next_token is not None:
        nt_type = next_token.type
        if nt_type is _INDENT:
            nt_line, nt_column = next_token.start_pos
            state.indents.append(state.lines[nt_line - 1][:nt_column])
        elif nt_type is _DEDENT:
            state.indents.pop()

    whitespace_before = state.previous_whitespace_state

    if ct_type is _INDENT or ct_type is _DEDENT or ct_type is _ENDMARKER:
        # Don't update whitespace state for these dummy tokens. This makes it possible
        # to partially parse whitespace for IndentedBlock footers, and then parse the
        # rest of the whitespace in the following statement's leading_lines.
        # Unfortunately, that means that the indentation is either wrong for the footer
        # comments, or for the next line. We've chosen to allow it to be wrong for the
        # IndentedBlock footer and manually override the state when parsing whitespace
        # in that particular node.
        whitespace_after = whitespace_before
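        # Dummy tokens are zero-width, so they end where they start.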
        ct_end_pos = ct_start_pos
    else:
        # Not a dummy token, so update the whitespace state.

        # Compute our own end_pos, since parso's end_pos is wrong for triple-strings.
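        # For example (illustrative values): the triple-quoted string
        # '"""a\nb"""' starting at (1, 0) splits into two lines, so its
        # end_pos is (1 + 2 - 1, len('b"""')) == (2, 4).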
        lines = split_lines(ct_string)
        if len(lines) > 1:
            ct_end_pos = (ct_start_pos[0] + len(lines) - 1, len(lines[-1]))
        else:
            ct_end_pos = (ct_start_pos[0], ct_start_pos[1] + len(ct_string))

        # Figure out what mode the whitespace parser should use. If we're inside
        # parentheses, certain whitespace (e.g. newlines) is allowed where it would
        # otherwise not be. f-strings override and disable this behavior, however.
        #
        # Parso's tokenizer tracks this internally, but doesn't expose it, so we have
        # to duplicate that logic here.
        pof_stack = state.parenthesis_or_fstring_stack
        try:
            if ct_type is _FSTRING_START:
                pof_stack.append(_FSTRING_STACK_ENTRY)
            elif ct_type is _FSTRING_END:
                pof_stack.pop()
            elif ct_type is _OP:
                if ct_string in "([{":
                    pof_stack.append(_PARENTHESIS_STACK_ENTRY)
                elif ct_string in ")]}":
                    pof_stack.pop()
        except IndexError:
            # pof_stack may be empty by the time we need to read from it due to
            # mismatched braces.
            raise ParserSyntaxError(
                "Encountered a closing brace without a matching opening brace.",
                lines=state.lines,
                raw_line=ct_start_pos[0],
                raw_column=ct_start_pos[1],
            )
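        # Only the innermost entry counts: an f-string entry on top of the
        # stack masks any outer parentheses.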
        is_parenthesized = (
            len(pof_stack) > 0 and pof_stack[-1] == _PARENTHESIS_STACK_ENTRY
        )
        whitespace_after = WhitespaceState(
            ct_end_pos[0], ct_end_pos[1], state.indents[-1], is_parenthesized
        )

    # Hold onto whitespace_after, so we can use it as whitespace_before in the
    # next node.
    state.previous_whitespace_state = whitespace_after

    return Token(
        ct_type,
        ct_string,
        ct_start_pos,
        ct_end_pos,
        whitespace_before,
        whitespace_after,
        relative_indent,
    )