gh-125553: Fix backslash continuation in untokenize (#126010)

This commit is contained in:
Tomas R. 2025-01-21 20:58:44 +01:00 committed by GitHub
parent a4760ef8e5
commit 7ad793e5db
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 49 additions and 6 deletions

View file

@@ -1,4 +1,5 @@
import os
import re
import token
import tokenize
import unittest
@@ -1819,6 +1820,22 @@ class UntokenizeTest(TestCase):
self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
def contains_ambiguous_backslash(source):
    """Return ``True`` if *source* contains a backslash on a line by itself.

    For example::

        a = (1
             \\
        )

    Code like this cannot be untokenized exactly: the tokenizer emits no
    token for the line holding only the backslash, so its indentation is
    unrecoverable.

    Args:
        source: Source code as ``bytes``.

    Returns:
        bool: whether an ambiguous backslash-only line is present.
    """
    # A newline, optional whitespace, a literal backslash, then the next
    # (possibly CRLF) newline — i.e. a line containing only a backslash.
    return re.search(br'\n\s*\\\r?\n', source) is not None
class TestRoundtrip(TestCase):
def check_roundtrip(self, f):
@@ -1829,6 +1846,9 @@ class TestRoundtrip(TestCase):
tokenize.untokenize(), and the latter tokenized again to 2-tuples.
The test fails if the 3 pair tokenizations do not match.
If the source code can be untokenized unambiguously, the
untokenized code must match the original code exactly.
When untokenize bugs are fixed, untokenize with 5-tuples should
reproduce code that does not contain a backslash continuation
following spaces. A proper test should test this.
@@ -1852,6 +1872,13 @@ class TestRoundtrip(TestCase):
tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
self.assertEqual(tokens2_from5, tokens2)
if not contains_ambiguous_backslash(code):
# The BOM does not produce a token so there is no way to preserve it.
code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
readline = iter(code_without_bom.splitlines(keepends=True)).__next__
untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
self.assertEqual(code_without_bom, untokenized_code)
def check_line_extraction(self, f):
if isinstance(f, str):
code = f.encode('utf-8')

View file

@@ -169,6 +169,7 @@ class Untokenizer:
self.prev_row = 1
self.prev_col = 0
self.prev_type = None
self.prev_line = ""
self.encoding = None
def add_whitespace(self, start):
    """Emit the whitespace needed to move the output cursor to *start*.

    Args:
        start: ``(row, col)`` start position of the next token.

    Raises:
        ValueError: if *start* precedes the previously emitted end
            position (tokens must arrive in source order).
    """
    # NOTE(review): the scraped diff lost the unpacking of `start`;
    # reconstructed here — the body clearly reads `row` and `col`.
    row, col = start
    if row < self.prev_row or row == self.prev_row and col < self.prev_col:
        raise ValueError("start ({},{}) precedes previous end ({},{})"
                         .format(row, col, self.prev_row, self.prev_col))
    # Any row advance without a NEWLINE token must be bridged with
    # backslash continuations (this also resets self.prev_col).
    self.add_backslash_continuation(start)
    col_offset = col - self.prev_col
    if col_offset:
        self.tokens.append(" " * col_offset)
def add_backslash_continuation(self, start):
    """Add backslash continuation characters if the row has increased
    without encountering a newline token.

    This also inserts the correct amount of whitespace before the
    backslash, matching the indentation of the previous line, and
    resets ``self.prev_col`` since the cursor moves to a fresh line.

    Args:
        start: ``(row, col)`` start position of the next token.
    """
    row = start[0]
    row_offset = row - self.prev_row
    if row_offset == 0:
        return

    # Preserve the line-ending style of the line being continued.
    newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
    # Drop the previous line's own continuation backslash / newline,
    # then take its trailing whitespace as the indent.
    line = self.prev_line.rstrip('\\\r\n')
    # Fix: slice out the trailing whitespace in source order — the
    # earlier takewhile(reversed(...)) form emitted it reversed, which
    # is wrong when tabs and spaces are mixed.
    ws = line[len(line.rstrip()):]
    self.tokens.append(ws + f"\\{newline}" * row_offset)
    self.prev_col = 0
def escape_brackets(self, token):
characters = []
consume_until_next_bracket = False
@@ -243,8 +258,6 @@
end_line, end_col = end
extra_chars = last_line.count("{{") + last_line.count("}}")
end = (end_line, end_col + extra_chars)
elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
self.tokens.append(" ")
self.add_whitespace(start)
self.tokens.append(token)
@@ -253,6 +266,7 @@
self.prev_row += 1
self.prev_col = 0
self.prev_type = tok_type
self.prev_line = line
return "".join(self.tokens)
def compat(self, token, iterable):

View file

@@ -0,0 +1,2 @@
Fix round-trip invariance for backslash continuations in
:func:`tokenize.untokenize`.