mirror of
https://github.com/Instagram/LibCST.git
synced 2025-12-23 10:35:53 +00:00
210 lines
7.1 KiB
Python
210 lines
7.1 KiB
Python
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
#
|
|
# This source code is licensed under the MIT license found in the
|
|
# LICENSE file in the root directory of this source tree.
|
|
|
|
|
|
import itertools
|
|
import re
|
|
from dataclasses import dataclass
|
|
from io import BytesIO
|
|
from tokenize import detect_encoding as py_tokenize_detect_encoding
|
|
from typing import FrozenSet, Iterable, Iterator, Pattern, Set, Tuple, Union
|
|
|
|
from libcst._nodes.whitespace import NEWLINE_RE
|
|
from libcst._parser.parso.python.token import PythonTokenTypes, TokenType
|
|
from libcst._parser.parso.utils import split_lines
|
|
from libcst._parser.types.config import AutoConfig, ParserConfig, PartialParserConfig
|
|
from libcst._parser.types.token import Token
|
|
from libcst._parser.wrapped_tokenize import tokenize_lines
|
|
|
|
_INDENT: TokenType = PythonTokenTypes.INDENT
|
|
_NAME: TokenType = PythonTokenTypes.NAME
|
|
_NEWLINE: TokenType = PythonTokenTypes.NEWLINE
|
|
_STRING: TokenType = PythonTokenTypes.STRING
|
|
|
|
_FALLBACK_DEFAULT_NEWLINE = "\n"
|
|
_FALLBACK_DEFAULT_INDENT = " "
|
|
_CONTINUATION_RE: Pattern[str] = re.compile(r"\\(\r\n?|\n)", re.UNICODE)
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class ConfigDetectionResult:
|
|
# The config is a set of constant values used by the parser.
|
|
config: ParserConfig
|
|
# The tokens iterator is mutated by the parser.
|
|
tokens: Iterator[Token]
|
|
|
|
|
|
def _detect_encoding(source: Union[str, bytes]) -> str:
|
|
"""
|
|
Detects the encoding from the presence of a UTF-8 BOM or an encoding cookie as
|
|
specified in PEP 263.
|
|
|
|
If given a string (instead of bytes) the encoding is assumed to be utf-8.
|
|
"""
|
|
|
|
if isinstance(source, str):
|
|
return "utf-8"
|
|
return py_tokenize_detect_encoding(BytesIO(source).readline)[0]
|
|
|
|
|
|
def _detect_default_newline(source_str: str) -> str:
|
|
"""
|
|
Finds the first newline, and uses that value as the default newline.
|
|
"""
|
|
# Don't use `NEWLINE_RE` for this, because it might match multiple newlines as a
|
|
# single newline.
|
|
match = NEWLINE_RE.search(source_str)
|
|
return match.group(0) if match is not None else _FALLBACK_DEFAULT_NEWLINE
|
|
|
|
|
|
def _detect_indent(tokens: Iterable[Token]) -> str:
|
|
"""
|
|
Finds the first INDENT token, and uses that as the value of the default indent.
|
|
"""
|
|
try:
|
|
first_indent = next(t for t in tokens if t.type is _INDENT)
|
|
except StopIteration:
|
|
return _FALLBACK_DEFAULT_INDENT
|
|
first_indent_str = first_indent.relative_indent
|
|
assert first_indent_str is not None, "INDENT tokens must contain a relative_indent"
|
|
return first_indent_str
|
|
|
|
|
|
def _detect_trailing_newline(source_str: str) -> bool:
|
|
if len(source_str) == 0 or not NEWLINE_RE.fullmatch(source_str[-1]):
|
|
return False
|
|
# Make sure that the last newline wasn't following a continuation
|
|
return not (
|
|
_CONTINUATION_RE.fullmatch(source_str[-2:])
|
|
or _CONTINUATION_RE.fullmatch(source_str[-3:])
|
|
)
|
|
|
|
|
|
def _detect_future_imports(tokens: Iterable[Token]) -> FrozenSet[str]:
|
|
"""
|
|
Finds __future__ imports in their proper locations.
|
|
|
|
See `https://www.python.org/dev/peps/pep-0236/`_
|
|
"""
|
|
future_imports: Set[str] = set()
|
|
state = 0
|
|
for tok in tokens:
|
|
if state == 0 and tok.type in (_STRING, _NEWLINE):
|
|
continue
|
|
elif state == 0 and tok.string == "from":
|
|
state = 1
|
|
elif state == 1 and tok.string == "__future__":
|
|
state = 2
|
|
elif state == 2 and tok.string == "import":
|
|
state = 3
|
|
elif state == 3 and tok.string == "as":
|
|
state = 4
|
|
elif state == 3 and tok.type == _NAME:
|
|
future_imports.add(tok.string)
|
|
elif state == 4 and tok.type == _NAME:
|
|
state = 3
|
|
elif state == 3 and tok.string in "(),":
|
|
continue
|
|
elif state == 3 and tok.type == _NEWLINE:
|
|
state = 0
|
|
else:
|
|
break
|
|
return frozenset(future_imports)
|
|
|
|
|
|
def convert_to_utf8(
|
|
source: Union[str, bytes], *, partial: PartialParserConfig
|
|
) -> Tuple[str, str]:
|
|
"""
|
|
Returns an (original encoding, converted source) tuple.
|
|
"""
|
|
partial_encoding = partial.encoding
|
|
encoding = (
|
|
_detect_encoding(source)
|
|
if isinstance(partial_encoding, AutoConfig)
|
|
else partial_encoding
|
|
)
|
|
|
|
source_str = source if isinstance(source, str) else source.decode(encoding)
|
|
return (encoding, source_str)
|
|
|
|
|
|
def detect_config(
|
|
source: Union[str, bytes],
|
|
*,
|
|
partial: PartialParserConfig,
|
|
detect_trailing_newline: bool,
|
|
detect_default_newline: bool,
|
|
) -> ConfigDetectionResult:
|
|
"""
|
|
Computes a ParserConfig given the current source code to be parsed and a partial
|
|
config.
|
|
"""
|
|
|
|
python_version = partial.parsed_python_version
|
|
|
|
encoding, source_str = convert_to_utf8(source, partial=partial)
|
|
|
|
partial_default_newline = partial.default_newline
|
|
default_newline = (
|
|
(
|
|
_detect_default_newline(source_str)
|
|
if detect_default_newline
|
|
else _FALLBACK_DEFAULT_NEWLINE
|
|
)
|
|
if isinstance(partial_default_newline, AutoConfig)
|
|
else partial_default_newline
|
|
)
|
|
|
|
# HACK: The grammar requires a trailing newline, but python doesn't actually require
|
|
# a trailing newline. Add one onto the end to make the parser happy. We'll strip it
|
|
# out again during cst.Module's codegen.
|
|
#
|
|
# I think parso relies on error recovery support to handle this, which we don't
|
|
# have. lib2to3 doesn't handle this case at all AFAICT.
|
|
has_trailing_newline = detect_trailing_newline and _detect_trailing_newline(
|
|
source_str
|
|
)
|
|
if detect_trailing_newline and not has_trailing_newline:
|
|
source_str += default_newline
|
|
|
|
lines = split_lines(source_str, keepends=True)
|
|
|
|
tokens = tokenize_lines(source_str, lines, python_version)
|
|
|
|
partial_default_indent = partial.default_indent
|
|
if isinstance(partial_default_indent, AutoConfig):
|
|
# We need to clone `tokens` before passing it to `_detect_indent`, because
|
|
# `_detect_indent` consumes some tokens, mutating `tokens`.
|
|
#
|
|
# Implementation detail: CPython's `itertools.tee` uses weakrefs to reduce the
|
|
# size of its FIFO, so this doesn't retain items (leak memory) for `tokens_dup`
|
|
# once `token_dup` is freed at the end of this method (subject to
|
|
# GC/refcounting).
|
|
tokens, tokens_dup = itertools.tee(tokens)
|
|
default_indent = _detect_indent(tokens_dup)
|
|
else:
|
|
default_indent = partial_default_indent
|
|
|
|
partial_future_imports = partial.future_imports
|
|
if isinstance(partial_future_imports, AutoConfig):
|
|
# Same note as above re itertools.tee, we will consume tokens.
|
|
tokens, tokens_dup = itertools.tee(tokens)
|
|
future_imports = _detect_future_imports(tokens_dup)
|
|
else:
|
|
future_imports = partial_future_imports
|
|
|
|
return ConfigDetectionResult(
|
|
config=ParserConfig(
|
|
lines=lines,
|
|
encoding=encoding,
|
|
default_indent=default_indent,
|
|
default_newline=default_newline,
|
|
has_trailing_newline=has_trailing_newline,
|
|
version=python_version,
|
|
future_imports=future_imports,
|
|
),
|
|
tokens=tokens,
|
|
)
|