Mirror of https://github.com/python/cpython.git, synced 2025-07-07 19:35:27 +00:00
gh-125553: Fix backslash continuation in untokenize (#126010)

parent a4760ef8e5
commit 7ad793e5db

3 changed files with 49 additions and 6 deletions
Lib/test/test_tokenize.py

@@ -1,4 +1,5 @@
 import os
+import re
 import token
 import tokenize
 import unittest
@@ -1819,6 +1820,22 @@ class UntokenizeTest(TestCase):
         self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')
 
 
+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+        \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\r?\n')
+    return pattern.search(source) is not None
+
+
 class TestRoundtrip(TestCase):
 
     def check_roundtrip(self, f):
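Aside (not part of this commit): a minimal sketch of the ambiguity the helper detects. The continuation line below holds only whitespace and a backslash, so the tokenizer emits no token for it and untokenize cannot recover its indent:

import io
import re
import tokenize

source = b"a = (1\n    \\\n)\n"  # backslash alone on its own line

# The same pattern the new helper compiles flags this source as ambiguous.
assert re.compile(br'\n\s*\\\r?\n').search(source) is not None

# No token covers the backslash-only line, so the round trip cannot
# reproduce its four-space indent exactly.
result = tokenize.untokenize(tokenize.tokenize(io.BytesIO(source).readline))
print(result == source)  # expected: False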
@@ -1829,6 +1846,9 @@ class TestRoundtrip(TestCase):
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.
 
+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
+
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
         following spaces. A proper test should test this.
@@ -1852,6 +1872,13 @@ class TestRoundtrip(TestCase):
         tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)
 
+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
+            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
+
     def check_line_extraction(self, f):
         if isinstance(f, str):
             code = f.encode('utf-8')
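Aside (not part of this commit): the property the new assertion enforces, rehearsed on an unambiguous source. Here the continuation line carries a real token ("2") whose column fixes the indent, so after this fix the round trip should be byte-exact:

import tokenize

source = b"x = 1 + \\\n    2\n"

readline = iter(source.splitlines(keepends=True)).__next__
roundtrip = tokenize.untokenize(tokenize.tokenize(readline))
print(roundtrip == source)  # expected: True with this fix applied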
Lib/tokenize.py

@@ -169,6 +169,7 @@ class Untokenizer:
         self.prev_row = 1
         self.prev_col = 0
         self.prev_type = None
+        self.prev_line = ""
         self.encoding = None
 
     def add_whitespace(self, start):
@@ -176,14 +177,28 @@ class Untokenizer:
         if row < self.prev_row or row == self.prev_row and col < self.prev_col:
             raise ValueError("start ({},{}) precedes previous end ({},{})"
                              .format(row, col, self.prev_row, self.prev_col))
-        row_offset = row - self.prev_row
-        if row_offset:
-            self.tokens.append("\\\n" * row_offset)
-            self.prev_col = 0
+        self.add_backslash_continuation(start)
         col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)
 
+    def add_backslash_continuation(self, start):
+        """Add backslash continuation characters if the row has increased
+        without encountering a newline token.
+
+        This also inserts the correct amount of whitespace before the backslash.
+        """
+        row = start[0]
+        row_offset = row - self.prev_row
+        if row_offset == 0:
+            return
+
+        newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
+        line = self.prev_line.rstrip('\\\r\n')
+        ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
+        self.tokens.append(ws + f"\\{newline}" * row_offset)
+        self.prev_col = 0
+
     def escape_brackets(self, token):
         characters = []
         consume_until_next_bracket = False
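Aside (not part of this commit): a standalone rehearsal of the whitespace recovery in add_backslash_continuation. In tokenize.py, _itertools is itertools imported under a private alias; the prev_line value below is a hypothetical previous physical line ending in spaces, a backslash, and CRLF:

import itertools

prev_line = "x = 1 +  \\\r\n"

newline = '\r\n' if prev_line.endswith('\r\n') else '\n'  # preserve CRLF endings
line = prev_line.rstrip('\\\r\n')                         # -> "x = 1 +  "
ws = ''.join(itertools.takewhile(str.isspace, reversed(line)))
print(repr(ws + f"\\{newline}"))                          # -> '  \\\r\n'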
@@ -243,8 +258,6 @@ class Untokenizer:
                     end_line, end_col = end
                     extra_chars = last_line.count("{{") + last_line.count("}}")
                     end = (end_line, end_col + extra_chars)
-            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
-                self.tokens.append(" ")
 
             self.add_whitespace(start)
             self.tokens.append(token)
@@ -253,6 +266,7 @@ class Untokenizer:
                 self.prev_row += 1
                 self.prev_col = 0
             self.prev_type = tok_type
+            self.prev_line = line
         return "".join(self.tokens)
 
     def compat(self, token, iterable):
Misc/NEWS.d news entry:

@@ -0,0 +1,2 @@
+Fix round-trip invariance for backslash continuations in
+:func:`tokenize.untokenize`.