[3.13] gh-125553: Fix backslash continuation in untokenize (GH-126010) (#129153)

gh-125553: Fix backslash continuation in `untokenize` (GH-126010)
(cherry picked from commit 7ad793e5db)

Co-authored-by: Tomas R <tomas.roun8@gmail.com>
Miss Islington (bot), 2025-01-21 22:04:55 +01:00, committed by GitHub
parent 0c7045378f
commit 3048dcd15a
3 changed files with 49 additions and 6 deletions

Lib/test/test_tokenize.py

@@ -1,4 +1,5 @@
 import os
+import re
 import token
 import tokenize
 import unittest
@@ -1819,6 +1820,22 @@ class UntokenizeTest(TestCase):
         self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')


+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+        \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\r?\n')
+    return pattern.search(source) is not None


 class TestRoundtrip(TestCase):

     def check_roundtrip(self, f):
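For illustration, here is a minimal standalone sketch (not part of the commit) of the ambiguity the helper detects; the sample source is made up. The tokenizer emits no token for the backslash-only line, so its indentation never reaches untokenize:

import io
import tokenize

source = b"a = (1\n    \\\n)\n"   # backslash alone on its own line
tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
result = tokenize.untokenize(tokens)
# The four spaces before the backslash are lost; no token recorded them.
print(result)            # typically b'a = (1\n\\\n)\n'
print(result == source)  # False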
@@ -1829,6 +1846,9 @@ class TestRoundtrip(TestCase):
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.

+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
+
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
         following spaces. A proper test should test this.
@@ -1852,6 +1872,13 @@ class TestRoundtrip(TestCase):
         tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)

+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
+            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
+
     def check_line_extraction(self, f):
         if isinstance(f, str):
             code = f.encode('utf-8')
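A sketch of the invariant this new block enforces, assuming an interpreter that includes the fix; the sample source is made up. A continuation backslash that follows non-whitespace text is unambiguous, so the round trip must reproduce the input byte for byte:

import io
import tokenize

code = b"x = 1 \\\n    + 2\n"   # continuation after a space, not ambiguous
tokens = tokenize.tokenize(io.BytesIO(code).readline)
assert tokenize.untokenize(tokens) == code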

Lib/tokenize.py

@@ -169,6 +169,7 @@ class Untokenizer:
         self.prev_row = 1
         self.prev_col = 0
         self.prev_type = None
+        self.prev_line = ""
         self.encoding = None

     def add_whitespace(self, start):
@@ -176,14 +177,28 @@ class Untokenizer:
         if row < self.prev_row or row == self.prev_row and col < self.prev_col:
             raise ValueError("start ({},{}) precedes previous end ({},{})"
                              .format(row, col, self.prev_row, self.prev_col))
-        row_offset = row - self.prev_row
-        if row_offset:
-            self.tokens.append("\\\n" * row_offset)
-            self.prev_col = 0
+        self.add_backslash_continuation(start)
         col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)

+    def add_backslash_continuation(self, start):
+        """Add backslash continuation characters if the row has increased
+        without encountering a newline token.
+
+        This also inserts the correct amount of whitespace before the backslash.
+        """
+        row = start[0]
+        row_offset = row - self.prev_row
+        if row_offset == 0:
+            return
+
+        newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
+        line = self.prev_line.rstrip('\\\r\n')
+        ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
+        self.tokens.append(ws + f"\\{newline}" * row_offset)
+        self.prev_col = 0
+
     def escape_brackets(self, token):
         characters = []
         consume_until_next_bracket = False
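The whitespace recovery in add_backslash_continuation can be restated as a standalone function; the helper name below is invented for illustration. It strips the backslash/newline suffix from the previous physical line, then collects the run of whitespace immediately before it, which is the indent to place before the re-emitted backslash:

import itertools

def trailing_ws_before_backslash(prev_line: str) -> str:
    # Drop the trailing backslash and newline characters, then walk the
    # remainder right-to-left, keeping whitespace until text is reached.
    stripped = prev_line.rstrip('\\\r\n')
    return ''.join(itertools.takewhile(str.isspace, reversed(stripped)))

assert trailing_ws_before_backslash("x = 1  \\\n") == "  "
assert trailing_ws_before_backslash("a = (1\n") == ""

As in the method itself, mixed tabs and spaces come back in reversed order; for homogeneous indentation the two are identical.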
@@ -243,8 +258,6 @@ class Untokenizer:
                     end_line, end_col = end
                     extra_chars = last_line.count("{{") + last_line.count("}}")
                     end = (end_line, end_col + extra_chars)
-            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
-                self.tokens.append(" ")

             self.add_whitespace(start)
             self.tokens.append(token)
@@ -253,6 +266,7 @@ class Untokenizer:
                 self.prev_row += 1
                 self.prev_col = 0
             self.prev_type = tok_type
+            self.prev_line = line
         return "".join(self.tokens)

     def compat(self, token, iterable):

Misc NEWS entry (new file)

@@ -0,0 +1,2 @@
+Fix round-trip invariance for backslash continuations in
+:func:`tokenize.untokenize`.
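Because add_backslash_continuation emits '\r\n' when the previous line ends with it, CRLF sources should also keep their line endings across the round trip. A quick check, with a made-up sample and assuming a build that includes this fix:

import io
import tokenize

code = b"x = 1 \\\r\n    + 2\r\n"   # Windows-style line endings
tokens = tokenize.tokenize(io.BytesIO(code).readline)
assert tokenize.untokenize(tokens) == code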