[3.13] gh-125553: Fix backslash continuation in untokenize (GH-126010) (#129153)

gh-125553: Fix backslash continuation in `untokenize` (GH-126010)
(cherry picked from commit 7ad793e5db)

Co-authored-by: Tomas R <tomas.roun8@gmail.com>

Parent: 0c7045378f
Commit: 3048dcd15a

3 changed files with 49 additions and 6 deletions
Lib/test/test_tokenize.py

@@ -1,4 +1,5 @@
 import os
+import re
 import token
 import tokenize
 import unittest
@@ -1819,6 +1820,22 @@ class UntokenizeTest(TestCase):
         self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')


+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+         \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\r?\n')
+    return pattern.search(source) is not None
+
+
 class TestRoundtrip(TestCase):

     def check_roundtrip(self, f):
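As a rough standalone sketch (not part of the patch), the pattern compiled by this helper flags a backslash that sits alone on a line, while an ordinary end-of-line continuation passes; the variable names below are illustrative only:

    import re

    # Same pattern as contains_ambiguous_backslash above.
    pattern = re.compile(br'\n\s*\\\r?\n')

    ambiguous = b"a = (1\n     \\\n)\n"     # backslash alone on its own line
    ordinary = b"a = 1 + \\\n    2\n"       # backslash ends a line of code

    print(pattern.search(ambiguous) is not None)  # True -> exact round trip not expected
    print(pattern.search(ordinary) is not None)   # False -> exact round trip expected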
@@ -1829,6 +1846,9 @@ class TestRoundtrip(TestCase):
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.

+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
+
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
         following spaces. A proper test should test this.
@@ -1852,6 +1872,13 @@
         tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)

+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
+            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
+
     def check_line_extraction(self, f):
         if isinstance(f, str):
             code = f.encode('utf-8')
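Outside the test harness, the exact-match round trip that the new assertion performs looks roughly like this (a sketch assuming an interpreter that already includes this fix; the source bytes are illustrative):

    import tokenize

    # Ordinary backslash continuation: with the fix, untokenize reproduces
    # the input bytes exactly, including the space before the backslash.
    code = b"x = 1 + \\\n    2\n"
    readline = iter(code.splitlines(keepends=True)).__next__
    assert tokenize.untokenize(tokenize.tokenize(readline)) == code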
Lib/tokenize.py

@@ -169,6 +169,7 @@ class Untokenizer:
         self.prev_row = 1
         self.prev_col = 0
         self.prev_type = None
+        self.prev_line = ""
         self.encoding = None

     def add_whitespace(self, start):
@@ -176,14 +177,28 @@ class Untokenizer:
         if row < self.prev_row or row == self.prev_row and col < self.prev_col:
             raise ValueError("start ({},{}) precedes previous end ({},{})"
                              .format(row, col, self.prev_row, self.prev_col))
-        row_offset = row - self.prev_row
-        if row_offset:
-            self.tokens.append("\\\n" * row_offset)
-            self.prev_col = 0
+        self.add_backslash_continuation(start)
         col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)

+    def add_backslash_continuation(self, start):
+        """Add backslash continuation characters if the row has increased
+        without encountering a newline token.
+
+        This also inserts the correct amount of whitespace before the backslash.
+        """
+        row = start[0]
+        row_offset = row - self.prev_row
+        if row_offset == 0:
+            return
+
+        newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
+        line = self.prev_line.rstrip('\\\r\n')
+        ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
+        self.tokens.append(ws + f"\\{newline}" * row_offset)
+        self.prev_col = 0
+
     def escape_brackets(self, token):
         characters = []
         consume_until_next_bracket = False
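The whitespace handling is the subtle part of the new method: `ws` recovers the run of whitespace that preceded the stripped backslash on the previous physical line, so the regenerated continuation keeps its original spacing. A standalone sketch with an assumed example line (plain `itertools` stands in for the module's `_itertools` alias):

    import itertools

    prev_line = "x = 1 +   \\\n"             # assumed previous physical line
    newline = '\r\n' if prev_line.endswith('\r\n') else '\n'
    line = prev_line.rstrip('\\\r\n')         # "x = 1 +   "
    ws = ''.join(itertools.takewhile(str.isspace, reversed(line)))
    print(repr(ws + f"\\{newline}"))          # '   \\\n' -- spacing before the backslash is preserved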
@@ -243,8 +258,6 @@ class Untokenizer:
                     end_line, end_col = end
                     extra_chars = last_line.count("{{") + last_line.count("}}")
                     end = (end_line, end_col + extra_chars)
-            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
-                self.tokens.append(" ")

             self.add_whitespace(start)
             self.tokens.append(token)
@@ -253,6 +266,7 @@ class Untokenizer:
                 self.prev_row += 1
                 self.prev_col = 0
             self.prev_type = tok_type
+            self.prev_line = line
         return "".join(self.tokens)

     def compat(self, token, iterable):
Misc/NEWS.d (new NEWS entry)

@@ -0,0 +1,2 @@
+Fix round-trip invariance for backslash continuations in
+:func:`tokenize.untokenize`.
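A minimal usage sketch of the invariance described here (assuming an interpreter with this fix applied; the snippet and its source string are illustrative):

    import io
    import tokenize

    source = "total = a + \\\n        b\n"
    tokens = tokenize.generate_tokens(io.StringIO(source).readline)
    assert tokenize.untokenize(tokens) == source  # round trip preserves the continuation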