gh-102856: Python tokenizer implementation for PEP 701 (#104323)

This commit replaces the Python implementation of the tokenize module with an implementation that reuses the real C tokenizer via a private extension module. The tokenize module now implements a compatibility layer that transforms tokens from the C tokenizer into Python tokenize tokens for backward compatibility. As the C tokenizer does not emit some tokens that the Python tokenizer provides (such as comments and non-semantic newlines), a new special mode has been added to the C tokenizer mode that currently is only used via the extension module that exposes it to the Python layer. This new mode forces the C tokenizer to emit these new extra tokens and add the appropriate metadata that is needed to match the old Python implementation. Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
2025-11-24 20:30:18 +00:00 · 2023-05-21 02:03:02 +02:00 · 2023-05-21 02:03:02 +02:00 · 6715f91edc
commit 6715f91edc
parent 3ed57e4995
22 changed files with 424 additions and 374 deletions
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@ -56,112 +56,11 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
        else:
            return self.type

-def group(*choices): return '(' + '|'.join(choices) + ')'
-def any(*choices): return group(*choices) + '*'
-def maybe(*choices): return group(*choices) + '?'
-
-# Note: we use unicode matching for names ("\w") but ascii matching for
-# number literals.
-Whitespace = r'[ \f\t]*'
-Comment = r'#[^\r\n]*'
-Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
-Name = r'\w+'
-
-Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
-Binnumber = r'0[bB](?:_?[01])+'
-Octnumber = r'0[oO](?:_?[0-7])+'
-Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
-Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
-Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
-                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
-Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
-Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
-Number = group(Imagnumber, Floatnumber, Intnumber)
-
-# Return the empty string, plus all of the valid string prefixes.
-def _all_string_prefixes():
-    # The valid string prefixes. Only contain the lower case versions,
-    #  and don't contain any permutations (include 'fr', but not
-    #  'rf'). The various permutations will be generated.
-    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
-    # if we add binary f-strings, add: ['fb', 'fbr']
-    result = {''}
-    for prefix in _valid_string_prefixes:
-        for t in _itertools.permutations(prefix):
-            # create a list with upper and lower versions of each
-            #  character
-            for u in _itertools.product(*[(c, c.upper()) for c in t]):
-                result.add(''.join(u))
-    return result
-
-@functools.lru_cache
-def _compile(expr):
-    return re.compile(expr, re.UNICODE)
-
-# Note that since _all_string_prefixes includes the empty string,
-#  StringPrefix can be the empty string (making it optional).
-StringPrefix = group(*_all_string_prefixes())
-
-# Tail end of ' string.
-Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
-# Tail end of " string.
-Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
-# Tail end of ''' string.
-Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
-# Tail end of """ string.
-Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group(StringPrefix + "'''", StringPrefix + '"""')
-# Single-line ' or " string.
-String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
-               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
-
-# Sorting in reverse order puts the long operators before their prefixes.
-# Otherwise if = came before ==, == would get recognized as two instances
-# of =.
-Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
-Funny = group(r'\r?\n', Special)
-
-PlainToken = group(Number, Funny, String, Name)
-Token = Ignore + PlainToken
-
-# First (or only) line of ' or " string.
-ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
-                group("'", r'\\\r?\n'),
-                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
-                group('"', r'\\\r?\n'))
-PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
-PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-
-# For a given string prefix plus quotes, endpats maps it to a regex
-#  to match the remainder of that string. _prefix can be empty, for
-#  a normal single or triple quoted string (with no prefix).
-endpats = {}
-for _prefix in _all_string_prefixes():
-    endpats[_prefix + "'"] = Single
-    endpats[_prefix + '"'] = Double
-    endpats[_prefix + "'''"] = Single3
-    endpats[_prefix + '"""'] = Double3
-del _prefix
-
-# A set of all of the single and triple quoted string prefixes,
-#  including the opening quotes.
-single_quoted = set()
-triple_quoted = set()
-for t in _all_string_prefixes():
-    for u in (t + '"', t + "'"):
-        single_quoted.add(u)
-    for u in (t + '"""', t + "'''"):
-        triple_quoted.add(u)
-del t, u
-
-tabsize = 8

 class TokenError(Exception): pass

-class StopTokenizing(Exception): pass

+class StopTokenizing(Exception): pass

 class Untokenizer:

@ -213,6 +112,14 @@ class Untokenizer:
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
+            elif tok_type == FSTRING_MIDDLE:
+                if '{' in token or '}' in token:
+                    end_line, end_col = end
+                    end = (end_line, end_col + token.count('{') + token.count('}'))
+                    token = re.sub('{', '{{', token)
+                    token = re.sub('}', '}}', token)
+
+
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
@ -255,6 +162,11 @@ class Untokenizer:
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
+            elif toknum == FSTRING_MIDDLE:
+                if '{' in tokval or '}' in tokval:
+                    tokval = re.sub('{', '{{', tokval)
+                    tokval = re.sub('}', '}}', tokval)
+
            toks_append(tokval)


@ -404,7 +316,6 @@ def open(filename):
        buffer.close()
        raise

-
 def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
@ -425,192 +336,32 @@ def tokenize(readline):
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
-    empty = _itertools.repeat(b"")
-    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
-    return _tokenize(rl_gen.__next__, encoding)
-
-
-def _tokenize(readline, encoding):
-    lnum = parenlev = continued = 0
-    numchars = '0123456789'
-    contstr, needcont = '', 0
-    contline = None
-    indents = [0]
-
+    rl_gen = _itertools.chain(consumed, iter(readline, b""))
    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
-    last_line = b''
-    line = b''
-    while True:                                # loop over lines in stream
-        try:
-            # We capture the value of the line variable here because
-            # readline uses the empty string '' to signal end of input,
-            # hence `line` itself will always be overwritten at the end
-            # of this loop.
-            last_line = line
-            line = readline()
-        except StopIteration:
-            line = b''
+    yield from _tokenize(rl_gen, encoding)

-        if encoding is not None:
-            line = line.decode(encoding)
-        lnum += 1
-        pos, max = 0, len(line)
+def _tokenize(rl_gen, encoding):
+    source = b"".join(rl_gen).decode(encoding)
+    token = None
+    for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
+        # TODO: Marta -> limpiar esto
+        if 6 < token.type <= 54:
+            token = token._replace(type=OP)
+        if token.type in {ASYNC, AWAIT}:
+            token = token._replace(type=NAME)
+        if token.type == NEWLINE:
+            l_start, c_start = token.start
+            l_end, c_end = token.end
+            token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))

-        if contstr:                            # continued string
-            if not line:
-                raise TokenError("EOF in multi-line string", strstart)
-            endmatch = endprog.match(line)
-            if endmatch:
-                pos = end = endmatch.end(0)
-                yield TokenInfo(STRING, contstr + line[:end],
-                       strstart, (lnum, end), contline + line)
-                contstr, needcont = '', 0
-                contline = None
-            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield TokenInfo(ERRORTOKEN, contstr + line,
-                           strstart, (lnum, len(line)), contline)
-                contstr = ''
-                contline = None
-                continue
-            else:
-                contstr = contstr + line
-                contline = contline + line
-                continue
-
-        elif parenlev == 0 and not continued:  # new statement
-            if not line: break
-            column = 0
-            while pos < max:                   # measure leading whitespace
-                if line[pos] == ' ':
-                    column += 1
-                elif line[pos] == '\t':
-                    column = (column//tabsize + 1)*tabsize
-                elif line[pos] == '\f':
-                    column = 0
-                else:
-                    break
-                pos += 1
-            if pos == max:
-                break
-
-            if line[pos] in '#\r\n':           # skip comments or blank lines
-                if line[pos] == '#':
-                    comment_token = line[pos:].rstrip('\r\n')
-                    yield TokenInfo(COMMENT, comment_token,
-                           (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    pos += len(comment_token)
-
-                yield TokenInfo(NL, line[pos:],
-                           (lnum, pos), (lnum, len(line)), line)
-                continue
-
-            if column > indents[-1]:           # count indents or dedents
-                indents.append(column)
-                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
-            while column < indents[-1]:
-                if column not in indents:
-                    raise IndentationError(
-                        "unindent does not match any outer indentation level",
-                        ("<tokenize>", lnum, pos, line))
-                indents = indents[:-1]
-
-                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
-
-        else:                                  # continued statement
-            if not line:
-                raise TokenError("EOF in multi-line statement", (lnum, 0))
-            continued = 0
-
-        while pos < max:
-            pseudomatch = _compile(PseudoToken).match(line, pos)
-            if pseudomatch:                                # scan for tokens
-                start, end = pseudomatch.span(1)
-                spos, epos, pos = (lnum, start), (lnum, end), end
-                if start == end:
-                    continue
-                token, initial = line[start:end], line[start]
-
-                if (initial in numchars or                 # ordinary number
-                    (initial == '.' and token != '.' and token != '...')):
-                    yield TokenInfo(NUMBER, token, spos, epos, line)
-                elif initial in '\r\n':
-                    if parenlev > 0:
-                        yield TokenInfo(NL, token, spos, epos, line)
-                    else:
-                        yield TokenInfo(NEWLINE, token, spos, epos, line)
-
-                elif initial == '#':
-                    assert not token.endswith("\n")
-                    yield TokenInfo(COMMENT, token, spos, epos, line)
-
-                elif token in triple_quoted:
-                    endprog = _compile(endpats[token])
-                    endmatch = endprog.match(line, pos)
-                    if endmatch:                           # all on one line
-                        pos = endmatch.end(0)
-                        token = line[start:pos]
-                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
-                    else:
-                        strstart = (lnum, start)           # multiple lines
-                        contstr = line[start:]
-                        contline = line
-                        break
-
-                # Check up to the first 3 chars of the token to see if
-                #  they're in the single_quoted set. If so, they start
-                #  a string.
-                # We're using the first 3, because we're looking for
-                #  "rb'" (for example) at the start of the token. If
-                #  we switch to longer prefixes, this needs to be
-                #  adjusted.
-                # Note that initial == token[:1].
-                # Also note that single quote checking must come after
-                #  triple quote checking (above).
-                elif (initial in single_quoted or
-                      token[:2] in single_quoted or
-                      token[:3] in single_quoted):
-                    if token[-1] == '\n':                  # continued string
-                        strstart = (lnum, start)
-                        # Again, using the first 3 chars of the
-                        #  token. This is looking for the matching end
-                        #  regex for the correct type of quote
-                        #  character. So it's really looking for
-                        #  endpats["'"] or endpats['"'], by trying to
-                        #  skip string prefix characters, if any.
-                        endprog = _compile(endpats.get(initial) or
-                                           endpats.get(token[1]) or
-                                           endpats.get(token[2]))
-                        contstr, needcont = line[start:], 1
-                        contline = line
-                        break
-                    else:                                  # ordinary string
-                        yield TokenInfo(STRING, token, spos, epos, line)
-
-                elif initial.isidentifier():               # ordinary name
-                    yield TokenInfo(NAME, token, spos, epos, line)
-                elif initial == '\\':                      # continued stmt
-                    continued = 1
-                else:
-                    if initial in '([{':
-                        parenlev += 1
-                    elif initial in ')]}':
-                        parenlev -= 1
-                    yield TokenInfo(OP, token, spos, epos, line)
-            else:
-                yield TokenInfo(ERRORTOKEN, line[pos],
-                           (lnum, pos), (lnum, pos+1), line)
-                pos += 1
-
-    # Add an implicit NEWLINE if the input doesn't end in one
-    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
-        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
-    for indent in indents[1:]:                 # pop remaining indent levels
-        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+        yield token
+    if token is not None:
+        last_line, _ = token.start
+        yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')


 def generate_tokens(readline):
@ -619,7 +370,16 @@ def generate_tokens(readline):
    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
-    return _tokenize(readline, None)
+    def _gen():
+        while True:
+            try:
+                line = readline()
+            except StopIteration:
+                return
+            if not line:
+                return
+            yield line.encode()
+    return _tokenize(_gen(), 'utf-8')

 def main():
    import argparse
@ -656,7 +416,10 @@ def main():
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
-            tokens = _tokenize(sys.stdin.readline, None)
+            tokens = _tokenize(
+                (x.encode('utf-8') for x in iter(sys.stdin.readline, "")
+            ), "utf-8")
+

        # Output the tokenization
        for token in tokens:
@ -682,10 +445,10 @@ def main():
        perror("unexpected error: %s" % err)
        raise

-def _generate_tokens_from_c_tokenizer(source):
+def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
    import _tokenize as c_tokenizer
-    for info in c_tokenizer.TokenizerIter(source):
+    for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
        tok, type, lineno, end_lineno, col_off, end_col_off, line = info
        yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)