# LibCST/libcst/_parser/py_whitespace_parser.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from typing import List, Optional, Sequence, Tuple, Union

from libcst._nodes.whitespace import (
    Comment,
    COMMENT_RE,
    EmptyLine,
    Newline,
    NEWLINE_RE,
    ParenthesizedWhitespace,
    SIMPLE_WHITESPACE_RE,
    SimpleWhitespace,
    TrailingWhitespace,
)
from libcst._parser.types.config import BaseWhitespaceParserConfig
from libcst._parser.types.whitespace_state import WhitespaceState as State

# BEGIN PARSER ENTRYPOINTS


def parse_simple_whitespace(
    config: BaseWhitespaceParserConfig, state: State
) -> SimpleWhitespace:
    """
    Consume the whitespace matched by ``SIMPLE_WHITESPACE_RE`` at ``state``.

    If the matched text contains a backslash it is treated as a line
    continuation: matching resumes at column 0 of the next line, and the pieces
    from every visited line are concatenated. ``state.line`` and
    ``state.column`` are advanced in place to just past the consumed text.

    Returns a ``SimpleWhitespace`` wrapping the consumed text.
    """
    lines = config.lines
    # The match never fails because the pattern can match an empty string.
    # pyre-fixme[16]: Optional type has no attribute `group`.
    ws_line = SIMPLE_WHITESPACE_RE.match(lines[state.line - 1], state.column).group(0)

    if "\\" not in ws_line:
        # Common case: no continuation character, so there is nothing to join
        # and no need to build an intermediate list.
        state.column += len(ws_line)
        return SimpleWhitespace(ws_line)

    ws_line_list = [ws_line]
    while "\\" in ws_line:
        # Continuation character: keep collecting from the start of the next line.
        state.line += 1
        state.column = 0
        # pyre-fixme[16]: Optional type has no attribute `group`.
        ws_line = SIMPLE_WHITESPACE_RE.match(lines[state.line - 1], 0).group(0)
        ws_line_list.append(ws_line)

    # The column was reset to 0 on the final line, so only the last piece counts.
    state.column += len(ws_line)
    return SimpleWhitespace("".join(ws_line_list))


def parse_empty_lines(
    config: BaseWhitespaceParserConfig,
    state: State,
    *,
    override_absolute_indent: Optional[str] = None,
) -> Sequence[EmptyLine]:
    """
    Parse as many consecutive empty lines as possible, starting at ``state``.

    When ``override_absolute_indent`` is not None, it is forwarded to the
    per-line parser and only the lines up to and including the last one that is
    indented at that level are kept. Those lines belong to the footer rather
    than to the next line's leading_lines; any unindented lines after the last
    indented one do not belong to this node and are dropped. If no line is
    indented, nothing is kept.

    ``state.line`` and ``state.column`` are moved to the position reached after
    the last line that is kept; if no lines are kept, ``state`` is not changed.
    """
    parsed: List[Tuple[State, EmptyLine]] = []
    cursor = State(
        state.line, state.column, state.absolute_indent, state.is_parenthesized
    )
    while True:
        empty_line = _parse_empty_line(
            config, cursor, override_absolute_indent=override_absolute_indent
        )
        if empty_line is None:
            break

        # Remember the position reached after this line, then continue on a
        # fresh clone so that the stored state is never mutated again.
        parsed.append((cursor, empty_line))
        cursor = State(
            cursor.line,
            cursor.column,
            state.absolute_indent,
            state.is_parenthesized,
        )

    if override_absolute_indent is not None:
        # Walk backwards past the trailing unindented lines; `keep` ends up as
        # the number of entries up to and including the last indented one, or
        # zero if none of the lines is indented.
        keep = len(parsed)
        while keep > 0 and not parsed[keep - 1][1].indent:
            keep -= 1
        del parsed[keep:]

    if not parsed:
        return []

    # Sync the caller's position with the last line actually kept.
    last_state: State = parsed[-1][0]
    state.line = last_state.line
    state.column = last_state.column
    return [line for _, line in parsed]


def parse_trailing_whitespace(
    config: BaseWhitespaceParserConfig, state: State
) -> TrailingWhitespace:
    """
    Parse a mandatory ``TrailingWhitespace`` at ``state``.

    Delegates to ``_parse_trailing_whitespace`` and raises an ``Exception`` if
    that speculative parse yields nothing.
    """
    result = _parse_trailing_whitespace(config, state)
    if result is not None:
        return result
    raise Exception(
        "Internal Error: Failed to parse TrailingWhitespace. This should never "
        + "happen because a TrailingWhitespace is never optional in the grammar, "
        + "so this error should've been caught by parso first."
    )


def parse_parenthesizable_whitespace(
    config: BaseWhitespaceParserConfig, state: State
) -> Union[SimpleWhitespace, ParenthesizedWhitespace]:
    """
    Parse whitespace that may span lines when ``state.is_parenthesized`` is set.

    Returns a ``ParenthesizedWhitespace`` when parenthesized and such whitespace
    is found at ``state``; otherwise falls back to ``parse_simple_whitespace``.
    """
    if not state.is_parenthesized:
        return parse_simple_whitespace(config, state)

    # No speculative copy of the state is needed here: the parenthesized parse
    # either succeeds or returns None without having modified the state.
    parenthesized = _parse_parenthesized_whitespace(config, state)
    if parenthesized is None:
        return parse_simple_whitespace(config, state)
    return parenthesized


# END PARSER ENTRYPOINTS
# BEGIN PARSER INTERNAL PRODUCTIONS


def _parse_empty_line(
    config: BaseWhitespaceParserConfig,
    state: State,
    *,
    override_absolute_indent: Optional[str] = None,
) -> Optional[EmptyLine]:
    """
    Speculatively parse one empty line: indent, whitespace, optional comment,
    and a newline.

    Returns None and leaves ``state`` untouched if the indent parse raises or
    if no newline is found. On success, ``state.line`` and ``state.column`` are
    advanced past the line and the resulting ``EmptyLine`` is returned.
    """
    # Parse against a scratch copy so that a failed attempt has no effect.
    attempt = State(
        state.line, state.column, state.absolute_indent, state.is_parenthesized
    )
    try:
        indent = _parse_indent(
            config, attempt, override_absolute_indent=override_absolute_indent
        )
    except Exception:
        # We aren't at the start of a line, so this can't be an empty line.
        return None

    whitespace = parse_simple_whitespace(config, attempt)
    comment = _parse_comment(config, attempt)
    newline = _parse_newline(config, attempt)
    if newline is not None:
        # Commit the position. absolute_indent/is_parenthesized never change,
        # so they don't need to be copied back.
        state.line, state.column = attempt.line, attempt.column
        return EmptyLine(indent, whitespace, comment, newline)

    # No newline, so this wasn't an empty line after all.
    return None


def _parse_indent(
    config: BaseWhitespaceParserConfig,
    state: State,
    *,
    override_absolute_indent: Optional[str] = None,
) -> bool:
    """
    Returns True if indentation was found, otherwise False.

    The expected indentation is ``override_absolute_indent`` when it is not
    None, otherwise ``state.absolute_indent``. When found, ``state.column`` is
    advanced past it.

    Raises an ``Exception`` if ``state.column`` is non-zero, unless the state
    sits at the very end of the last line, in which case False is returned.
    """
    expected_indent = (
        state.absolute_indent
        if override_absolute_indent is None
        else override_absolute_indent
    )
    current_line = config.lines[state.line - 1]

    if state.column != 0:
        at_eof = state.column == len(current_line) and state.line == len(
            config.lines
        )
        if at_eof:
            # We're at EOF, treat this as a failed speculative parse
            return False
        raise Exception("Internal Error: Column should be 0 when parsing an indent.")

    if not current_line.startswith(expected_indent, state.column):
        return False
    state.column += len(expected_indent)
    return True


def _parse_comment(
    config: BaseWhitespaceParserConfig, state: State
) -> Optional[Comment]:
    """
    Try to consume a comment at ``state``.

    Returns a ``Comment`` and advances ``state.column`` past it when
    ``COMMENT_RE`` matches at the current position; otherwise returns None and
    leaves ``state`` unchanged.
    """
    match = COMMENT_RE.match(config.lines[state.line - 1], state.column)
    if match is not None:
        text = match.group(0)
        state.column += len(text)
        return Comment(text)
    return None


def _parse_newline(
    config: BaseWhitespaceParserConfig, state: State
) -> Optional[Newline]:
    """
    Try to consume a newline at ``state``.

    Returns None, leaving ``state`` unchanged, when ``NEWLINE_RE`` does not
    match at the current position. On a match the column is advanced past it
    and, if another line follows, the state moves to column 0 of that line.

    Raises an ``Exception`` if the matched newline does not end the line.
    """
    current_line = config.lines[state.line - 1]
    match = NEWLINE_RE.match(current_line, state.column)
    if match is None:
        # no newline was found, speculative parsing failed
        return None

    text = match.group(0)
    state.column += len(text)
    if state.column != len(current_line):
        raise Exception("Internal Error: Found a newline, but it wasn't the EOL.")

    if state.line < len(config.lines):
        # This newline ended a line and another line follows, so step onto it.
        state.line += 1
        state.column = 0

    # When the text equals the default newline, build the node without an
    # explicit value so it is inherited from the Module.
    return Newline() if text == config.default_newline else Newline(text)


def _parse_trailing_whitespace(
    config: BaseWhitespaceParserConfig, state: State
) -> Optional[TrailingWhitespace]:
    """
    Speculatively parse whitespace, an optional comment, and a newline.

    Returns None and leaves ``state`` untouched if no newline is found. On
    success, ``state.line`` and ``state.column`` are advanced past the newline
    and the resulting ``TrailingWhitespace`` is returned.
    """
    # Parse against a scratch copy so that a failed attempt has no effect.
    attempt = State(
        state.line, state.column, state.absolute_indent, state.is_parenthesized
    )
    whitespace = parse_simple_whitespace(config, attempt)
    comment = _parse_comment(config, attempt)
    newline = _parse_newline(config, attempt)
    if newline is not None:
        # Commit the position. absolute_indent/is_parenthesized never change,
        # so they don't need to be copied back.
        state.line, state.column = attempt.line, attempt.column
        return TrailingWhitespace(whitespace, comment, newline)

    # Speculative parsing failed
    return None


def _parse_parenthesized_whitespace(
    config: BaseWhitespaceParserConfig, state: State
) -> Optional[ParenthesizedWhitespace]:
    """
    Try to parse whitespace that spans at least one newline.

    The parse consists of a trailing-whitespace first line, any number of empty
    lines, then the indent and simple whitespace of the last line.

    Returns None if the first line cannot be parsed as trailing whitespace; in
    that case ``state`` has not been modified. Otherwise ``state`` is advanced
    in place and a ``ParenthesizedWhitespace`` is returned.
    """
    first_line = _parse_trailing_whitespace(config, state)
    if first_line is None:
        # Speculative parsing failed
        return None

    # Accumulate in a list and convert once at the end: concatenating a tuple
    # on every iteration would copy all previously collected lines each time.
    empty_lines: List[EmptyLine] = []
    while True:
        empty_line = _parse_empty_line(config, state)
        if empty_line is None:
            # This isn't an empty line, so parse it below
            break
        empty_lines.append(empty_line)

    indent = _parse_indent(config, state)
    last_line = parse_simple_whitespace(config, state)
    return ParenthesizedWhitespace(first_line, tuple(empty_lines), indent, last_line)