LibCST/libcst/_parser/parso/utils.py

# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
#
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
#
# A fork of `parso.utils`.
# https://github.com/davidhalter/parso/blob/master/parso/utils.py
#
# The following changes were made:
# - Drop Python 2 compatibility layer
# - Use dataclasses instead of namedtuple
# - Apply type hints directly to files
# - Make PythonVersionInfo directly usable in hashmaps
# - Unroll total ordering because Pyre doesn't understand it


import re
import sys
from ast import literal_eval
from dataclasses import dataclass
from typing import Optional, Sequence, Tuple, Union

# The characters below are treated as line breaks by str.splitlines, but they
# are not line breaks in Python source code. In Python only \r (Carriage
# Return, 0xD) and \n (Line Feed, 0xA) are allowed to split lines.
_NON_LINE_BREAKS = (
    "\v",  # Vertical Tabulation 0xB
    "\f",  # Form Feed 0xC
    "\x1c",  # File Separator
    "\x1d",  # Group Separator
    "\x1e",  # Record Separator
    "\x85",  # Next Line (NEL), equivalent to CR+LF; used to mark
    # end-of-line on some IBM mainframes
    "\u2028",  # Line Separator
    "\u2029",  # Paragraph Separator
)


@dataclass(frozen=True)
class Version:
    """
    Immutable three-part version number made of ``major``, ``minor`` and
    ``micro`` integer components.
    """

    major: int
    minor: int
    micro: int

def split_lines(string: str, keepends: bool = False) -> Sequence[str]:
    r"""
    Split Python source text into lines.

    Unlike :py:meth:`str.splitlines`, only ``\n``, ``\r\n`` and ``\r`` end a
    line; form feeds and the other characters listed in ``_NON_LINE_BREAKS``
    are kept as ordinary text. A further difference is that the result always
    contains an entry for the text after the last line break, so an empty
    string yields ``[""]`` and, with the default ``keepends=False``, ``"a\n"``
    yields ``["a", ""]``.

    :param string: The text to split.
    :param keepends: When true, every line keeps its trailing line break.
    :return: The list of lines.
    """
    if not keepends:
        return re.split(r"\n|\r\n|\r", string)

    lines = []
    # True while the most recently emitted piece was cut by str.splitlines at
    # a character that is not a real line break, meaning the next piece
    # belongs to the same line.
    glue_to_previous = False
    for piece in string.splitlines(True):
        if glue_to_previous:
            lines[-1] += piece
        else:
            lines.append(piece)
        glue_to_previous = piece[-1:] in _NON_LINE_BREAKS

    # str.splitlines does not report the empty line that follows a trailing
    # line break (nor anything at all for an empty string); add it so that
    # both keepends modes produce the same number of lines.
    if string == "" or string.endswith(("\n", "\r")):
        lines.append("")
    return lines


def python_bytes_to_unicode(
    source: Union[str, bytes], encoding: str = "utf-8", errors: str = "strict"
) -> str:
    """
    Decode Python source code to ``str``, honouring a UTF-8 byte-order mark or
    a PEP 263 encoding declaration.

    A ``str`` ``source`` is returned unchanged. For ``bytes`` the encoding is
    picked in this order: UTF-8 when the data starts with the UTF-8 BOM (the
    BOM is not stripped from the result), otherwise the name given by a
    ``coding:``/``coding=`` declaration within the first two lines, otherwise
    ``encoding``.

    :param source: The source code, either already decoded or as raw bytes.
    :param encoding: Encoding to use when the source does not declare one.
        See :py:meth:`bytes.decode` documentation.
    :param errors: See :py:meth:`bytes.decode` documentation. ``errors`` can be
        ``'strict'``, ``'replace'`` or ``'ignore'``.
    :raises LookupError: If the selected encoding is not a known text encoding
        and ``errors`` is ``'strict'``. With any other ``errors`` value the
        source is decoded as UTF-8 instead.
    :raises UnicodeDecodeError: As raised by :py:meth:`bytes.decode`.
    """
    if isinstance(source, str):
        # Only bytes need decoding.
        return source

    def detect_encoding(raw: bytes) -> str:
        """
        For the implementation of encoding definitions in Python, look at:
        - http://www.python.org/dev/peps/pep-0263/
        - http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations
        """
        if raw.startswith(b"\xef\xbb\xbf"):
            # UTF-8 byte-order mark
            return "utf-8"

        # A declaration only counts on the first or second line. The pattern
        # can match the empty string, so a match is always produced.
        head = re.match(rb"(?:[^\n]*\n){0,2}", raw)
        first_two_lines = head.group(0) if head is not None else b""
        declared = re.search(rb"coding[=:]\s*([-\w.]+)", first_two_lines)
        if declared is None:
            # the default if nothing else has been set -> PEP 263
            return encoding
        return declared.group(1).decode("utf-8", "replace")

    actual_encoding = detect_encoding(source)
    try:
        return source.decode(actual_encoding, errors)
    except LookupError:
        if errors == "strict":
            raise
        # The source names an encoding Python cannot decode with (e.g.
        # `# coding: foo-8`). The caller asked for lenient decoding, so fall
        # back to UTF-8 rather than failing on the bogus declaration.
        return source.decode("utf-8", errors)


@dataclass(frozen=True)
class PythonVersionInfo:
    """
    A ``(major, minor)`` Python version that can be compared with other
    instances and with plain 2-tuples, and used as a dictionary key.

    Comparing against a tuple whose length is not 2 raises ``ValueError``.
    Operands that are neither tuples nor objects exposing ``major`` and
    ``minor`` are unsupported: the comparison methods return
    ``NotImplemented`` for them, so ``==`` is false, ``!=`` is true and the
    ordering operators raise ``TypeError``.
    """

    major: int
    minor: int

    @staticmethod
    def _pair_of(other: object) -> Optional[Tuple[int, int]]:
        """
        Return the 2-tuple to compare against for ``other``, or ``None`` when
        ``other`` is not a supported operand.

        :raises ValueError: If ``other`` is a tuple whose length is not 2.
        """
        if isinstance(other, tuple):
            if len(other) != 2:
                raise ValueError("Can only compare to tuples of length 2.")
            return other

        # Anything exposing `major` and `minor` is accepted, not only
        # PythonVersionInfo instances.
        major = getattr(other, "major", None)
        minor = getattr(other, "minor", None)
        if major is None or minor is None:
            return None
        return (major, minor)

    def __gt__(self, other: object) -> bool:
        pair = self._pair_of(other)
        if pair is None:
            return NotImplemented
        return (self.major, self.minor) > pair

    def __ge__(self, other: object) -> bool:
        pair = self._pair_of(other)
        if pair is None:
            return NotImplemented
        return (self.major, self.minor) >= pair

    def __lt__(self, other: object) -> bool:
        pair = self._pair_of(other)
        if pair is None:
            return NotImplemented
        return (self.major, self.minor) < pair

    def __le__(self, other: object) -> bool:
        pair = self._pair_of(other)
        if pair is None:
            return NotImplemented
        return (self.major, self.minor) <= pair

    def __eq__(self, other: object) -> bool:
        pair = self._pair_of(other)
        if pair is None:
            return NotImplemented
        return (self.major, self.minor) == pair

    def __ne__(self, other: object) -> bool:
        equal = self.__eq__(other)
        if equal is NotImplemented:
            # Do not negate NotImplemented; let Python apply its fallback.
            return NotImplemented
        return not equal

    def __hash__(self) -> int:
        # Hash like the equivalent tuple, so an instance and the tuple it
        # compares equal to are interchangeable as dictionary keys.
        return hash((self.major, self.minor))


def _parse_version(version: str) -> PythonVersionInfo:
    """
    Turn a version string such as ``"3"``, ``"3.2"`` or ``"2.7.1"`` into a
    ``PythonVersionInfo``; a micro component is accepted but discarded.

    When only the major version is given, a fixed minor version is assumed:
    ``"2"`` means 2.7 and ``"3"`` means 3.6.

    :raises ValueError: If ``version`` does not have the expected format.
    :raises NotImplementedError: If only a major version is given and it is
        neither 2 nor 3.
    """
    parsed = re.match(r"(\d+)(?:\.(\d+)(?:\.\d+)?)?$", version)
    if parsed is None:
        raise ValueError(
            "The given version is not in the right format. "
            'Use something like "3.2" or "3".'
        )

    major_text, minor_text = parsed.groups()
    major = int(major_text)
    if minor_text is not None:
        return PythonVersionInfo(major, int(minor_text))

    # Minor version assumed for a bare major version.
    default_minor = {2: 7, 3: 6}.get(major)
    if default_minor is None:
        raise NotImplementedError(
            "Sorry, no support yet for those fancy new/old versions."
        )
    return PythonVersionInfo(major, default_minor)


def parse_version_string(version: Optional[str] = None) -> PythonVersionInfo:
    """
    Parse a version string (e.g. `3.2` or `2.7.1` or `3`) into the matching
    major/minor version info.

    When ``version`` is ``None``, the major and minor version of the running
    interpreter (``sys.version_info``) is used.
    """
    if version is not None:
        return _parse_version(version)

    running_major, running_minor = sys.version_info[:2]
    return _parse_version(f"{running_major}.{running_minor}")