LibCST/libcst/_parser/_detect_config.py
Jennifer Taylor 95f649af79 Fix incorrect round-tripping of newlines when parsing statements.
Hypothesis found that when we have a statement like `pass\r`, we detect that
`\r` is the default and parse the trailing newline as `Newline(None)`. However, when
we render the statement back out again, since we don't have a module, we construct
a default module which treats `Newline(None)` as a `\n` not a '\r'. So, when we are
parsing statements or expressions, disable auto-inferring the default newline and always
infer the default rendered newline (`\n`) so that rendering a statement/expression back
out behaves as expected.
2019-08-29 11:33:57 -07:00

145 lines
4.8 KiB
Python

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# pyre-strict
import itertools
from dataclasses import dataclass
from io import BytesIO
from tokenize import detect_encoding as py_tokenize_detect_encoding
from typing import Iterable, Iterator, Union
from parso.python.token import PythonTokenTypes, TokenType
from parso.utils import split_lines
from libcst._nodes._whitespace import NEWLINE_RE
from libcst._parser._types.config import AutoConfig, ParserConfig, PartialParserConfig
from libcst._parser._types.token import Token
from libcst._parser._wrapped_tokenize import tokenize_lines
_INDENT: TokenType = PythonTokenTypes.INDENT
_FALLBACK_DEFAULT_NEWLINE = "\n"
_FALLBACK_DEFAULT_INDENT = " "
@dataclass(frozen=True)
class ConfigDetectionResult:
# The config is a set of constant values used by the parser.
config: ParserConfig
# The tokens iterator is mutated by the parser.
tokens: Iterator[Token]
def _detect_encoding(source: Union[str, bytes]) -> str:
"""
Detects the encoding from the presence of a UTF-8 BOM or an encoding cookie as
specified in PEP 263.
If given a string (instead of bytes) the encoding is assumed to be utf-8.
"""
if isinstance(source, str):
return "utf-8"
return py_tokenize_detect_encoding(BytesIO(source).readline)[0]
def _detect_default_newline(source_str: str) -> str:
"""
Finds the first newline, and uses that value as the default newline.
"""
# Don't use `NEWLINE_RE` for this, because it might match multiple newlines as a
# single newline.
match = NEWLINE_RE.search(source_str)
return match.group(0) if match is not None else _FALLBACK_DEFAULT_NEWLINE
def _detect_indent(tokens: Iterable[Token]) -> str:
"""
Finds the first INDENT token, and uses that as the value of the default indent.
"""
try:
first_indent = next(t for t in tokens if t.type is _INDENT)
except StopIteration:
return _FALLBACK_DEFAULT_INDENT
first_indent_str = first_indent.relative_indent
assert first_indent_str is not None, "INDENT tokens must contain a relative_indent"
return first_indent_str
def detect_config(
source: Union[str, bytes],
*,
partial: PartialParserConfig,
detect_trailing_newline: bool,
detect_default_newline: bool,
) -> ConfigDetectionResult:
"""
Computes a ParserConfig given the current source code to be parsed and a partial
config.
"""
python_version = partial.parsed_python_version
partial_encoding = partial.encoding
encoding = (
_detect_encoding(source)
if isinstance(partial_encoding, AutoConfig)
else partial_encoding
)
source_str = source if isinstance(source, str) else source.decode(encoding)
partial_default_newline = partial.default_newline
default_newline = (
(
_detect_default_newline(source_str)
if detect_default_newline
else _FALLBACK_DEFAULT_NEWLINE
)
if isinstance(partial_default_newline, AutoConfig)
else partial_default_newline
)
# HACK: The grammar requires a trailing newline, but python doesn't actually require
# a trailing newline. Add one onto the end to make the parser happy. We'll strip it
# out again during cst.Module's codegen.
#
# I think parso relies on error recovery support to handle this, which we don't
# have. lib2to3 doesn't handle this case at all AFAICT.
has_trailing_newline = detect_trailing_newline and bool(
len(source_str) != 0 and NEWLINE_RE.match(source_str[-1])
)
if detect_trailing_newline and not has_trailing_newline:
source_str += default_newline
lines = split_lines(source_str, keepends=True)
tokens = tokenize_lines(lines, python_version)
partial_default_indent = partial.default_indent
if isinstance(partial_default_indent, AutoConfig):
# We need to clone `tokens` before passing it to `_detect_indent`, because
# `_detect_indent` consumes some tokens, mutating `tokens`.
#
# Implementation detail: CPython's `itertools.tee` uses weakrefs to reduce the
# size of its FIFO, so this doesn't retain items (leak memory) for `tokens_dup`
# once `token_dup` is freed at the end of this method (subject to
# GC/refcounting).
tokens, tokens_dup = itertools.tee(tokens)
default_indent = _detect_indent(tokens_dup)
else:
default_indent = partial_default_indent
return ConfigDetectionResult(
config=ParserConfig(
lines=lines,
encoding=encoding,
default_indent=default_indent,
default_newline=default_newline,
has_trailing_newline=has_trailing_newline,
),
tokens=tokens,
)