LibCST/libcst/_parser/detect_config.py
Jennifer Taylor 57860f3d76 Fix trailing newline detection around continuation.
If you have a program such as "pass\\\n", it is technically a program without a trailing newline, because a line continuation is defined as a `\` followed by a newline. We were misdetecting this as having a trailing newline, which made it impossible to parse the continuation. Add some tests to verify this behavior, then fix the problem.

Note that this was found via hypothesis.
2019-10-21 13:24:25 -07:00

157 lines
5.3 KiB
Python

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# pyre-strict
import itertools
import re
from dataclasses import dataclass
from io import BytesIO
from tokenize import detect_encoding as py_tokenize_detect_encoding
from typing import Iterable, Iterator, Pattern, Union
from libcst._nodes.whitespace import NEWLINE_RE
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
from libcst._parser.parso.utils import split_lines
from libcst._parser.types.config import AutoConfig, ParserConfig, PartialParserConfig
from libcst._parser.types.token import Token
from libcst._parser.wrapped_tokenize import tokenize_lines
# Token type that `_detect_indent` looks for (compared by identity) when
# searching the token stream for the first indentation.
_INDENT: TokenType = PythonTokenTypes.INDENT
# Newline used when the source contains no newline to copy, or when newline
# detection is turned off.
_FALLBACK_DEFAULT_NEWLINE = "\n"
# Indent used when the token stream contains no INDENT token.
# NOTE(review): a single-space default is unusual; confirm this literal has not
# had its whitespace collapsed in this copy of the file.
_FALLBACK_DEFAULT_INDENT = " "
# A backslash immediately followed by exactly one newline sequence
# (`\r\n`, `\r`, or `\n`), i.e. an explicit line continuation.
_CONTINUATION_RE: Pattern[str] = re.compile(r"\\(\r\n?|\n)", re.UNICODE)
@dataclass(frozen=True)
class ConfigDetectionResult:
    """
    Frozen dataclass pairing a parser configuration with the token stream that
    was produced for the same source. This is the return type of
    `detect_config`.
    """

    # The config is a set of constant values used by the parser.
    config: ParserConfig
    # The tokens iterator is mutated by the parser.
    tokens: Iterator[Token]
def _detect_encoding(source: Union[str, bytes]) -> str:
"""
Detects the encoding from the presence of a UTF-8 BOM or an encoding cookie as
specified in PEP 263.
If given a string (instead of bytes) the encoding is assumed to be utf-8.
"""
if isinstance(source, str):
return "utf-8"
return py_tokenize_detect_encoding(BytesIO(source).readline)[0]
def _detect_default_newline(source_str: str) -> str:
    """
    Picks the default newline for a module: the text of the earliest
    `NEWLINE_RE` match in `source_str`, or `_FALLBACK_DEFAULT_NEWLINE` when the
    source contains no match at all.
    """
    # NOTE(review): an earlier comment at this spot warned against using
    # `NEWLINE_RE` because it "might match multiple newlines as a single
    # newline", yet the lookup is performed with `NEWLINE_RE`. Confirm that the
    # pattern matches exactly one newline sequence per match.
    first_newline = NEWLINE_RE.search(source_str)
    if first_newline is None:
        return _FALLBACK_DEFAULT_NEWLINE
    return first_newline.group(0)
def _detect_indent(tokens: Iterable[Token]) -> str:
    """
    Scans `tokens` for the earliest INDENT token and returns its
    `relative_indent` as the default indent. If the stream ends without an
    INDENT token, `_FALLBACK_DEFAULT_INDENT` is returned instead.

    The iterable is consumed up to and including the first INDENT token (or
    entirely, when there is none).
    """
    for candidate in tokens:
        if candidate.type is not _INDENT:
            continue
        indent_text = candidate.relative_indent
        assert indent_text is not None, "INDENT tokens must contain a relative_indent"
        return indent_text
    return _FALLBACK_DEFAULT_INDENT
def _detect_trailing_newline(source_str: str) -> bool:
    """
    Reports whether `source_str` ends with a genuine trailing newline.

    An empty source, or one whose final character is not matched by
    `NEWLINE_RE`, has no trailing newline. A final newline that directly
    follows a backslash is part of a line continuation, so it does not count
    as a trailing newline either.
    """
    if not source_str:
        return False
    if not NEWLINE_RE.fullmatch(source_str[-1]):
        return False
    # A continuation is a backslash plus a one- or two-character newline, so it
    # occupies either the last two or the last three characters of the source.
    for suffix_width in (2, 3):
        if _CONTINUATION_RE.fullmatch(source_str[-suffix_width:]):
            return False
    return True
def detect_config(
    source: Union[str, bytes],
    *,
    partial: PartialParserConfig,
    detect_trailing_newline: bool,
    detect_default_newline: bool,
) -> ConfigDetectionResult:
    """
    Builds a complete ParserConfig for `source`, filling in every setting that
    `partial` leaves as `AutoConfig`, and tokenizes the source along the way.

    `source` is the code to parse; bytes are decoded with the configured or
    detected encoding. `partial` supplies any explicitly configured values.
    `detect_trailing_newline` controls whether the end of the source is
    inspected for a trailing newline (and a newline appended if it is missing).
    `detect_default_newline` controls whether the default newline is taken from
    the source rather than from the fallback.

    Returns a ConfigDetectionResult holding the finished config and the token
    iterator for the (possibly newline-extended) source.
    """
    python_version = partial.parsed_python_version

    # Encoding: an explicit setting wins; otherwise inspect the source.
    configured_encoding = partial.encoding
    if isinstance(configured_encoding, AutoConfig):
        encoding = _detect_encoding(source)
    else:
        encoding = configured_encoding

    if isinstance(source, str):
        source_str = source
    else:
        source_str = source.decode(encoding)

    # Default newline: an explicit setting wins; otherwise either copy the
    # source's first newline or use the fallback, as the caller requested.
    configured_newline = partial.default_newline
    if not isinstance(configured_newline, AutoConfig):
        default_newline = configured_newline
    elif detect_default_newline:
        default_newline = _detect_default_newline(source_str)
    else:
        default_newline = _FALLBACK_DEFAULT_NEWLINE

    # HACK: The grammar insists on a trailing newline even though Python itself
    # does not. When the source lacks one, tack a newline onto the end so the
    # parser is satisfied; cst.Module's codegen strips it out again.
    #
    # The original author believed parso leans on error recovery (which is not
    # available here) for this case, and that lib2to3 does not handle it at all.
    has_trailing_newline = detect_trailing_newline and _detect_trailing_newline(
        source_str
    )
    needs_synthetic_newline = detect_trailing_newline and not has_trailing_newline
    if needs_synthetic_newline:
        source_str = source_str + default_newline

    lines = split_lines(source_str, keepends=True)
    tokens = tokenize_lines(lines, python_version)

    # Default indent: an explicit setting wins; otherwise look at the tokens.
    configured_indent = partial.default_indent
    if not isinstance(configured_indent, AutoConfig):
        default_indent = configured_indent
    else:
        # `_detect_indent` advances whatever iterator it is given, so hand it a
        # clone and keep an untouched one for the parser.
        #
        # The original author notes that CPython's `itertools.tee` does not keep
        # buffering items on behalf of `tokens_dup` once `tokens_dup` has been
        # freed at the end of this function (subject to GC/refcounting), so the
        # clone should not leak memory.
        tokens, tokens_dup = itertools.tee(tokens)
        default_indent = _detect_indent(tokens_dup)

    config = ParserConfig(
        lines=lines,
        encoding=encoding,
        default_indent=default_indent,
        default_newline=default_newline,
        has_trailing_newline=has_trailing_newline,
        version=python_version,
    )
    return ConfigDetectionResult(config=config, tokens=tokens)