# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
#
# Modifications:
# Copyright David Halter and Contributors
# Modifications are dual-licensed: MIT and PSF.
# 99% of the code is different from pgen2, now.
#
# A fork of Parso's tokenize test
# https://github.com/davidhalter/parso/blob/master/test/test_tokenize.py
#
# The following changes were made:
# - Convert base tests to UnitTest
# - Remove grammar-specific tests
# pyre-unsafe
# -*- coding: utf-8 -*-
# This file contains Unicode characters.

from textwrap import dedent
from libcst._parser.parso.python.token import PythonTokenTypes
from libcst._parser.parso.python.tokenize import PythonToken, tokenize
from libcst._parser.parso.utils import parse_version_string, split_lines
from libcst.testing.utils import data_provider, UnitTest

# To make it easier to access some of the token types, just put them here.
NAME = PythonTokenTypes.NAME
NEWLINE = PythonTokenTypes.NEWLINE
STRING = PythonTokenTypes.STRING
NUMBER = PythonTokenTypes.NUMBER
INDENT = PythonTokenTypes.INDENT
DEDENT = PythonTokenTypes.DEDENT
ERRORTOKEN = PythonTokenTypes.ERRORTOKEN
OP = PythonTokenTypes.OP
ENDMARKER = PythonTokenTypes.ENDMARKER
ERROR_DEDENT = PythonTokenTypes.ERROR_DEDENT
FSTRING_START = PythonTokenTypes.FSTRING_START
FSTRING_STRING = PythonTokenTypes.FSTRING_STRING
FSTRING_END = PythonTokenTypes.FSTRING_END


def _get_token_list(string, version=None):
    # Defaults to the current Python version when no version string is given.
    version_info = parse_version_string(version)
return list(tokenize(string, version_info))
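

# Illustrative sketch, not part of the original parso test suite: each token
# yielded by `tokenize` is a `PythonToken` namedtuple of the form
# (type, string, start_pos, prefix), where `prefix` holds the whitespace and
# comments that precede the token. A hypothetical helper for eyeballing the
# tokenizer's output might look like this:
def _debug_print_tokens(string, version=None):
    # Print one line per token: its type, text, start position, and prefix.
    for token in _get_token_list(string, version):
        print(token.type, repr(token.string), token.start_pos, repr(token.prefix))
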
class ParsoTokenizerTest(UnitTest):
def test_simple_no_whitespace(self):
# Test a simple one line string, no preceding whitespace
simple_docstring = '"""simple one line docstring"""'
token_list = _get_token_list(simple_docstring)
_, value, _, prefix = token_list[0]
assert prefix == ""
assert value == '"""simple one line docstring"""'

    def test_simple_with_whitespace(self):
# Test a simple one line string with preceding whitespace and newline
simple_docstring = ' """simple one line docstring""" \r\n'
token_list = _get_token_list(simple_docstring)
assert token_list[0][0] == INDENT
typ, value, start_pos, prefix = token_list[1]
assert prefix == " "
assert value == '"""simple one line docstring"""'
assert typ == STRING
typ, value, start_pos, prefix = token_list[2]
assert prefix == " "
assert typ == NEWLINE

    def test_function_whitespace(self):
# Test function definition whitespace identification
fundef = dedent(
"""
def test_whitespace(*args, **kwargs):
x = 1
if x > 0:
print(True)
"""
)
token_list = _get_token_list(fundef)
for _, value, _, prefix in token_list:
if value == "test_whitespace":
assert prefix == " "
if value == "(":
assert prefix == ""
if value == "*":
assert prefix == ""
if value == "**":
assert prefix == " "
if value == "print":
assert prefix == " "
if value == "if":
assert prefix == " "

    def test_tokenize_multiline_I(self):
        # Make sure a multiline string containing a newline has the end marker
        # on the next line.
fundef = '''""""\n'''
token_list = _get_token_list(fundef)
assert token_list == [
PythonToken(ERRORTOKEN, '""""\n', (1, 0), ""),
PythonToken(ENDMARKER, "", (2, 0), ""),
]

    def test_tokenize_multiline_II(self):
        # Make sure a multiline string with no newlines has the end marker on
        # the same line.
fundef = '''""""'''
token_list = _get_token_list(fundef)
assert token_list == [
PythonToken(ERRORTOKEN, '""""', (1, 0), ""),
PythonToken(ENDMARKER, "", (1, 4), ""),
]

    def test_tokenize_multiline_III(self):
        # Make sure a multiline string containing newlines has the end marker
        # on the next line, even if there are several newlines.
fundef = '''""""\n\n'''
token_list = _get_token_list(fundef)
assert token_list == [
PythonToken(ERRORTOKEN, '""""\n\n', (1, 0), ""),
PythonToken(ENDMARKER, "", (3, 0), ""),
]

    def test_identifier_contains_unicode(self):
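        # Identifiers containing non-ASCII characters should tokenize as NAME.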
fundef = dedent(
"""
def 我あφ():
pass
"""
)
token_list = _get_token_list(fundef)
unicode_token = token_list[1]
assert unicode_token[0] == NAME

    def test_ur_literals(self):
        """
        Decided to parse `u''` literals regardless of the Python version. This
        probably makes sense:

        - Python 3+ doesn't support it, but parsing it anyway doesn't hurt.
          While this is incorrect, it's only incorrect for one "old" and, going
          forward, not very important version.
        - All the other Python versions work very well with it.
        """

        def check(literal, is_literal=True):
token_list = _get_token_list(literal)
typ, result_literal, _, _ = token_list[0]
if is_literal:
if typ != FSTRING_START:
assert typ == STRING
assert result_literal == literal
else:
assert typ == NAME
check('u""')
check('ur""', is_literal=False)
check('Ur""', is_literal=False)
check('UR""', is_literal=False)
check('bR""')
# Starting with Python 3.3 this ordering is also possible.
check('Rb""')
        # Starting with Python 3.6, format strings were introduced.
check('fr""', is_literal=True)
check('rF""', is_literal=True)
check('f""', is_literal=True)
check('F""', is_literal=True)

    def test_error_literal(self):
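        # An unterminated string should surface as an ERRORTOKEN; the tokens
        # around it are still produced normally.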
error_token, newline, endmarker = _get_token_list('"\n')
assert error_token.type == ERRORTOKEN
assert error_token.string == '"'
assert newline.type == NEWLINE
assert endmarker.type == ENDMARKER
assert endmarker.prefix == ""
bracket, error_token, endmarker = _get_token_list('( """')
assert error_token.type == ERRORTOKEN
assert error_token.prefix == " "
assert error_token.string == '"""'
assert endmarker.type == ENDMARKER
assert endmarker.prefix == ""

    def test_endmarker_end_pos(self):
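        # The ENDMARKER's end position should always be the end of the source:
        # (number of lines, length of the last line).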
def check(code):
tokens = _get_token_list(code)
lines = split_lines(code)
assert tokens[-1].end_pos == (len(lines), len(lines[-1]))
check("#c")
check("#c\n")
check("a\n")
check("a")
check(r"a\\n")
check("a\\")

    @data_provider(
(
# Indentation
(" foo", [INDENT, NAME, DEDENT]),
(" foo\n bar", [INDENT, NAME, NEWLINE, ERROR_DEDENT, NAME, DEDENT]),
(
" foo\n bar \n baz",
[
INDENT,
NAME,
NEWLINE,
ERROR_DEDENT,
NAME,
NEWLINE,
ERROR_DEDENT,
NAME,
DEDENT,
],
),
(" foo\nbar", [INDENT, NAME, NEWLINE, DEDENT, NAME]),
# Name stuff
("1foo1", [NUMBER, NAME]),
("மெல்லினம்", [NAME]),
("²", [ERRORTOKEN]),
("ä²ö", [NAME, ERRORTOKEN, NAME]),
("ää²¹öö", [NAME, ERRORTOKEN, NAME]),
)
)
def test_token_types(self, code, types):
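        # Each snippet should tokenize to exactly the given types, followed by
        # a single trailing ENDMARKER.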
actual_types = [t.type for t in _get_token_list(code)]
assert actual_types == types + [ENDMARKER]

    def test_error_string(self):
t1, newline, endmarker = _get_token_list(' "\n')
assert t1.type == ERRORTOKEN
assert t1.prefix == " "
assert t1.string == '"'
assert newline.type == NEWLINE
assert endmarker.prefix == ""
assert endmarker.string == ""

    def test_indent_error_recovery(self):
code = dedent(
"""\
str(
from x import a
def
"""
)
lst = _get_token_list(code)
expected = [
# `str(`
INDENT,
NAME,
OP,
            # `from x`
            NAME,
            NAME,
            # `import a` on the same line as the preceding `from x`
            NAME,
            NAME,
            NEWLINE,
            # A DEDENT happens because there's an import statement now, and the
            # statement "breaks" out of the paren opened on the first line.
            DEDENT,
            # `def`
NAME,
NEWLINE,
ENDMARKER,
]
assert [t.type for t in lst] == expected

    def test_error_token_after_dedent(self):
code = dedent(
"""\
class C:
pass
$foo
"""
)
lst = _get_token_list(code)
expected = [
NAME,
NAME,
OP,
NEWLINE,
INDENT,
NAME,
NEWLINE,
DEDENT,
# $foo\n
ERRORTOKEN,
NAME,
NEWLINE,
ENDMARKER,
]
assert [t.type for t in lst] == expected

    def test_brackets_no_indentation(self):
        """
        There used to be an issue where the parenthesis counting would go below
        zero. This should not happen.
        """
code = dedent(
"""\
}
{
}
"""
)
lst = _get_token_list(code)
assert [t.type for t in lst] == [OP, NEWLINE, OP, OP, NEWLINE, ENDMARKER]

    def test_form_feed(self):
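        # A form feed should be treated as part of the following token's prefix.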
error_token, endmarker = _get_token_list(
dedent(
'''\
\f"""'''
)
)
assert error_token.prefix == "\f"
assert error_token.string == '"""'
assert endmarker.prefix == ""

    def test_carriage_return(self):
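        # A backslash followed by a lone carriage return acts as a line
        # continuation: no NEWLINE token is emitted, but the indentation still
        # drops back to zero before `class`.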
lst = _get_token_list(" =\\\rclass")
assert [t.type for t in lst] == [INDENT, OP, DEDENT, NAME, ENDMARKER]

    def test_backslash(self):
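        # A backslash continuation followed by only a comment produces no real
        # tokens; the entire source ends up in the ENDMARKER's prefix.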
code = "\\\n# 1 \n"
(endmarker,) = _get_token_list(code)
assert endmarker.prefix == code

    @data_provider(
(
('f"', [FSTRING_START], "3.7"),
('f""', [FSTRING_START, FSTRING_END], "3.7"),
('f" {}"', [FSTRING_START, FSTRING_STRING, OP, OP, FSTRING_END], "3.7"),
('f" "{}', [FSTRING_START, FSTRING_STRING, FSTRING_END, OP, OP], "3.7"),
(r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"),
(r'f"\""', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"),
# format spec
(
r'f"Some {x:.2f}{y}"',
[
FSTRING_START,
FSTRING_STRING,
OP,
NAME,
OP,
FSTRING_STRING,
OP,
OP,
NAME,
OP,
FSTRING_END,
],
"3.7",
),
# multiline f-string
('f"""abc\ndef"""', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"),
(
'f"""abc{\n123}def"""',
[
FSTRING_START,
FSTRING_STRING,
OP,
NUMBER,
OP,
FSTRING_STRING,
FSTRING_END,
],
"3.7",
),
# a line continuation inside of an fstring_string
('f"abc\\\ndef"', [FSTRING_START, FSTRING_STRING, FSTRING_END], "3.7"),
(
'f"\\\n{123}\\\n"',
[
FSTRING_START,
FSTRING_STRING,
OP,
NUMBER,
OP,
FSTRING_STRING,
FSTRING_END,
],
"3.7",
),
# a line continuation inside of an fstring_expr
('f"{\\\n123}"', [FSTRING_START, OP, NUMBER, OP, FSTRING_END], "3.7"),
            # a line continuation inside of a format spec
(
'f"{123:.2\\\nf}"',
[FSTRING_START, OP, NUMBER, OP, FSTRING_STRING, OP, FSTRING_END],
"3.7",
),
# a newline without a line continuation inside a single-line string is
# wrong, and will generate an ERRORTOKEN
(
'f"abc\ndef"',
[FSTRING_START, FSTRING_STRING, NEWLINE, NAME, ERRORTOKEN],
"3.7",
),
# a more complex example
(
r'print(f"Some {x:.2f}a{y}")',
[
NAME,
OP,
FSTRING_START,
FSTRING_STRING,
OP,
NAME,
OP,
FSTRING_STRING,
OP,
FSTRING_STRING,
OP,
NAME,
OP,
FSTRING_END,
OP,
],
"3.7",
),
)
)
def test_fstring(self, code, types, py_version):
actual_types = [t.type for t in _get_token_list(code, py_version)]
assert types + [ENDMARKER] == actual_types