[3.12] gh-105564: Don't include artificial newlines in the line attribute of tokens (GH-105565) (#105579)

Co-authored-by: Pablo Galindo Salgado <Pablogsal@gmail.com>
This commit is contained in:
Miss Islington (bot) 2023-06-09 09:58:14 -07:00 committed by GitHub
parent 97d846dc2b
commit 16b1cdc87c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 13 additions and 8 deletions

View file

@ -1229,7 +1229,7 @@ class Test_Tokenize(TestCase):
# skip the initial encoding token and the end tokens
tokens = list(_generate_tokens_from_c_tokenizer(readline().__next__, encoding='utf-8',
extra_tokens=True))[:-2]
expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"\n')]
expected_tokens = [TokenInfo(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
self.assertEqual(tokens, expected_tokens,
"bytes not decoded with encoding")
@ -1638,8 +1638,8 @@ class TestTokenize(TestCase):
TokenInfo(type=token.NUMBER, string='1', start=(1, 4), end=(1, 5), line='b = 1\n'),
TokenInfo(type=token.NEWLINE, string='\n', start=(1, 5), end=(1, 6), line='b = 1\n'),
TokenInfo(type=token.NL, string='\n', start=(2, 0), end=(2, 1), line='\n'),
TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test\n'),
TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test\n'),
TokenInfo(type=token.COMMENT, string='#test', start=(3, 0), end=(3, 5), line='#test'),
TokenInfo(type=token.NL, string='', start=(3, 5), end=(3, 6), line='#test'),
TokenInfo(type=token.ENDMARKER, string='', start=(4, 0), end=(4, 0), line='')
]
@ -1653,7 +1653,7 @@ class TestTokenize(TestCase):
TokenInfo(token.ENCODING, string='utf-8', start=(0, 0), end=(0, 0), line=''),
TokenInfo(token.NAME, string='a', start=(1, 0), end=(1, 1), line='a\n'),
TokenInfo(token.NEWLINE, string='\n', start=(1, 1), end=(1, 2), line='a\n'),
TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' \n'),
TokenInfo(token.NL, string='', start=(2, 1), end=(2, 2), line=' '),
TokenInfo(token.ENDMARKER, string='', start=(3, 0), end=(3, 0), line='')
]
@ -1889,10 +1889,10 @@ class CTokenizeTest(TestCase):
yield "1+1".encode(encoding)
expected = [
TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1\n'),
TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1\n'),
TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1\n'),
TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1\n'),
TokenInfo(type=NUMBER, string='1', start=(1, 0), end=(1, 1), line='1+1'),
TokenInfo(type=OP, string='+', start=(1, 1), end=(1, 2), line='1+1'),
TokenInfo(type=NUMBER, string='1', start=(1, 2), end=(1, 3), line='1+1'),
TokenInfo(type=NEWLINE, string='', start=(1, 3), end=(1, 4), line='1+1'),
TokenInfo(type=ENDMARKER, string='', start=(2, 0), end=(2, 0), line='')
]
for encoding in ["utf-8", "latin-1", "utf-16"]:

View file

@ -0,0 +1,2 @@
Don't include artificial newlines in the ``line`` attribute of tokens in the
APIs of the :mod:`tokenize` module. Patch by Pablo Galindo.

View file

@ -206,6 +206,9 @@ tokenizeriter_next(tokenizeriterobject *it)
line = PyUnicode_FromString("");
} else {
Py_ssize_t size = it->tok->inp - line_start;
if (size >= 1 && it->tok->implicit_newline) {
size -= 1;
}
line = PyUnicode_DecodeUTF8(line_start, size, "replace");
}
if (line == NULL) {