[3.13] gh-125553: Fix backslash continuation in untokenize (GH-126010) (#129153)

gh-125553: Fix backslash continuation in `untokenize` (GH-126010)
(cherry picked from commit 7ad793e5db)

Co-authored-by: Tomas R <tomas.roun8@gmail.com>
Miss Islington (bot), 2025-01-21 22:04:55 +01:00, committed by GitHub
parent 0c7045378f
commit 3048dcd15a
3 changed files with 49 additions and 6 deletions

Lib/test/test_tokenize.py

@@ -1,4 +1,5 @@
 import os
+import re
 import token
 import tokenize
 import unittest
@@ -1819,6 +1820,22 @@ class UntokenizeTest(TestCase):
         self.assertEqual(tokenize.untokenize(iter(tokens)), b'Hello ')


+def contains_ambiguous_backslash(source):
+    """Return `True` if the source contains a backslash on a
+    line by itself. For example:
+
+    a = (1
+        \\
+    )
+
+    Code like this cannot be untokenized exactly. This is because
+    the tokenizer does not produce any tokens for the line containing
+    the backslash and so there is no way to know its indent.
+    """
+    pattern = re.compile(br'\n\s*\\\r?\n')
+    return pattern.search(source) is not None


 class TestRoundtrip(TestCase):

     def check_roundtrip(self, f):
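For illustration, here is a minimal standalone sketch (not part of the commit) of the ambiguity the helper detects; the sample source is made up. The tokenizer emits no token for the backslash-only line, so its indentation never reaches untokenize:

import io
import tokenize

source = b"a = (1\n    \\\n)\n"   # backslash alone on its own line
tokens = list(tokenize.tokenize(io.BytesIO(source).readline))
result = tokenize.untokenize(tokens)
# The four spaces before the backslash are lost; no token recorded them.
print(result)            # typically b'a = (1\n\\\n)\n'
print(result == source)  # False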
@@ -1829,6 +1846,9 @@ class TestRoundtrip(TestCase):
         tokenize.untokenize(), and the latter tokenized again to 2-tuples.
         The test fails if the 3 pair tokenizations do not match.

+        If the source code can be untokenized unambiguously, the
+        untokenized code must match the original code exactly.
+
         When untokenize bugs are fixed, untokenize with 5-tuples should
         reproduce code that does not contain a backslash continuation
         following spaces. A proper test should test this.
@@ -1852,6 +1872,13 @@ class TestRoundtrip(TestCase):
         tokens2_from5 = [tok[:2] for tok in tokenize.tokenize(readline5)]
         self.assertEqual(tokens2_from5, tokens2)

+        if not contains_ambiguous_backslash(code):
+            # The BOM does not produce a token so there is no way to preserve it.
+            code_without_bom = code.removeprefix(b'\xef\xbb\xbf')
+            readline = iter(code_without_bom.splitlines(keepends=True)).__next__
+            untokenized_code = tokenize.untokenize(tokenize.tokenize(readline))
+            self.assertEqual(code_without_bom, untokenized_code)
+
     def check_line_extraction(self, f):
         if isinstance(f, str):
             code = f.encode('utf-8')
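A sketch of the invariant this new block enforces, assuming an interpreter that includes the fix; the sample source is made up. A continuation backslash that follows non-whitespace text is unambiguous, so the round trip must reproduce the input byte for byte:

import io
import tokenize

code = b"x = 1 \\\n    + 2\n"   # continuation after a space, not ambiguous
tokens = tokenize.tokenize(io.BytesIO(code).readline)
assert tokenize.untokenize(tokens) == code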

Lib/tokenize.py

@@ -169,6 +169,7 @@ class Untokenizer:
         self.prev_row = 1
         self.prev_col = 0
         self.prev_type = None
+        self.prev_line = ""
         self.encoding = None

     def add_whitespace(self, start):
@@ -176,14 +177,28 @@ class Untokenizer:
         if row < self.prev_row or row == self.prev_row and col < self.prev_col:
             raise ValueError("start ({},{}) precedes previous end ({},{})"
                              .format(row, col, self.prev_row, self.prev_col))
-        row_offset = row - self.prev_row
-        if row_offset:
-            self.tokens.append("\\\n" * row_offset)
-            self.prev_col = 0
+        self.add_backslash_continuation(start)
         col_offset = col - self.prev_col
         if col_offset:
             self.tokens.append(" " * col_offset)

+    def add_backslash_continuation(self, start):
+        """Add backslash continuation characters if the row has increased
+        without encountering a newline token.
+
+        This also inserts the correct amount of whitespace before the backslash.
+        """
+        row = start[0]
+        row_offset = row - self.prev_row
+        if row_offset == 0:
+            return
+
+        newline = '\r\n' if self.prev_line.endswith('\r\n') else '\n'
+        line = self.prev_line.rstrip('\\\r\n')
+        ws = ''.join(_itertools.takewhile(str.isspace, reversed(line)))
+        self.tokens.append(ws + f"\\{newline}" * row_offset)
+        self.prev_col = 0
+
     def escape_brackets(self, token):
         characters = []
         consume_until_next_bracket = False
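The whitespace recovery in add_backslash_continuation can be restated as a standalone function; the helper name below is invented for illustration. It strips the backslash/newline suffix from the previous physical line, then collects the run of whitespace immediately before it, which is the indent to place before the re-emitted backslash:

import itertools

def trailing_ws_before_backslash(prev_line: str) -> str:
    # Drop the trailing backslash and newline characters, then walk the
    # remainder right-to-left, keeping whitespace until text is reached.
    stripped = prev_line.rstrip('\\\r\n')
    return ''.join(itertools.takewhile(str.isspace, reversed(stripped)))

assert trailing_ws_before_backslash("x = 1  \\\n") == "  "
assert trailing_ws_before_backslash("a = (1\n") == ""

As in the method itself, mixed tabs and spaces come back in reversed order; for homogeneous indentation the two are identical.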
@@ -243,8 +258,6 @@ class Untokenizer:
                     end_line, end_col = end
                     extra_chars = last_line.count("{{") + last_line.count("}}")
                     end = (end_line, end_col + extra_chars)
-            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
-                self.tokens.append(" ")

             self.add_whitespace(start)
             self.tokens.append(token)
@@ -253,6 +266,7 @@ class Untokenizer:
                 self.prev_row += 1
                 self.prev_col = 0
             self.prev_type = tok_type
+            self.prev_line = line
         return "".join(self.tokens)

     def compat(self, token, iterable):

Misc NEWS entry (new file)

@@ -0,0 +1,2 @@
+Fix round-trip invariance for backslash continuations in
+:func:`tokenize.untokenize`.
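Because add_backslash_continuation emits '\r\n' when the previous line ends with it, CRLF sources should also keep their line endings across the round trip. A quick check, with a made-up sample and assuming a build that includes this fix:

import io
import tokenize

code = b"x = 1 \\\r\n    + 2\r\n"   # Windows-style line endings
tokens = tokenize.tokenize(io.BytesIO(code).readline)
assert tokenize.untokenize(tokens) == code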