gh-115154: Fix untokenize handling of unicode named literals (#115171)

Commit by Pablo Galindo Salgado, 2024-02-19 14:54:10 +00:00 (committed via GitHub)
parent d504968983
commit ecf16ee50e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 85 additions and 10 deletions

View file

@ -168,6 +168,7 @@ class Untokenizer:
self.tokens = []
self.prev_row = 1
self.prev_col = 0
self.prev_type = None
self.encoding = None
def add_whitespace(self, start):
@ -183,6 +184,29 @@ class Untokenizer:
if col_offset:
self.tokens.append(" " * col_offset)
def escape_brackets(self, token):
    """Double every literal ``{`` and ``}`` in *token* for f-string re-emission.

    Braces belonging to a backslash-escaped unicode named literal
    (``\\N{...}`` with an odd number of backslashes before the ``N``)
    are left single, so untokenize reproduces the escape sequence
    instead of mangling it into ``\\N{{...}}``.
    """
    out = []
    # True while we are inside a \N{...} sequence: its closing "}" must
    # stay single instead of being doubled.
    skip_closing = False
    for ch in token:
        if ch == "}":
            if skip_closing:
                skip_closing = False
            else:
                out.append(ch)  # first copy; second appended below
        if ch == "{":
            # Count backslashes preceding the character just before this
            # "{" (i.e. before the "N" of a potential \N{...} escape).
            backslashes = 0
            for prev in out[-2::-1]:
                if prev != "\\":
                    break
                backslashes += 1
            if backslashes % 2 == 0:
                out.append(ch)  # plain brace: emit the doubling copy
            else:
                skip_closing = True  # \N{...}: keep both braces single
        out.append(ch)
    return "".join(out)
def untokenize(self, iterable):
it = iter(iterable)
indents = []
@ -214,11 +238,13 @@ class Untokenizer:
startline = False
elif tok_type == FSTRING_MIDDLE:
if '{' in token or '}' in token:
token = self.escape_brackets(token)
last_line = token.splitlines()[-1]
end_line, end_col = end
end = (end_line, end_col + token.count('{') + token.count('}'))
token = re.sub('{', '{{', token)
token = re.sub('}', '}}', token)
extra_chars = last_line.count("{{") + last_line.count("}}")
end = (end_line, end_col + extra_chars)
elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
self.tokens.append(" ")
self.add_whitespace(start)
self.tokens.append(token)
@ -226,6 +252,7 @@ class Untokenizer:
if tok_type in (NEWLINE, NL):
self.prev_row += 1
self.prev_col = 0
self.prev_type = tok_type
return "".join(self.tokens)
def compat(self, token, iterable):
@ -233,6 +260,7 @@ class Untokenizer:
toks_append = self.tokens.append
startline = token[0] in (NEWLINE, NL)
prevstring = False
in_fstring = 0
for tok in _itertools.chain([token], iterable):
toknum, tokval = tok[:2]
@ -251,6 +279,10 @@ class Untokenizer:
else:
prevstring = False
if toknum == FSTRING_START:
in_fstring += 1
elif toknum == FSTRING_END:
in_fstring -= 1
if toknum == INDENT:
indents.append(tokval)
continue
@ -263,11 +295,18 @@ class Untokenizer:
toks_append(indents[-1])
startline = False
elif toknum == FSTRING_MIDDLE:
if '{' in tokval or '}' in tokval:
tokval = re.sub('{', '{{', tokval)
tokval = re.sub('}', '}}', tokval)
tokval = self.escape_brackets(tokval)
# Insert a space between two consecutive brackets if we are in an f-string
if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring:
tokval = ' ' + tokval
# Insert a space between two consecutive f-strings
if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
self.tokens.append(" ")
toks_append(tokval)
self.prev_type = toknum
def untokenize(iterable):