mirror of
https://github.com/python/cpython.git
synced 2025-07-07 19:35:27 +00:00
gh-102856: Python tokenizer implementation for PEP 701 (#104323)
This commit replaces the Python implementation of the tokenize module with an implementation that reuses the real C tokenizer via a private extension module. The tokenize module now implements a compatibility layer that transforms tokens from the C tokenizer into Python tokenize tokens for backward compatibility. As the C tokenizer does not emit some tokens that the Python tokenizer provides (such as comments and non-semantic newlines), a new special mode has been added to the C tokenizer mode that currently is only used via the extension module that exposes it to the Python layer. This new mode forces the C tokenizer to emit these new extra tokens and add the appropriate metadata that is needed to match the old Python implementation. Co-authored-by: Pablo Galindo <pablogsal@gmail.com>
This commit is contained in:
parent
3ed57e4995
commit
6715f91edc
22 changed files with 424 additions and 374 deletions
335
Lib/tokenize.py
335
Lib/tokenize.py
|
@ -56,112 +56,11 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
|
|||
else:
|
||||
return self.type
|
||||
|
||||
def group(*choices): return '(' + '|'.join(choices) + ')'
|
||||
def any(*choices): return group(*choices) + '*'
|
||||
def maybe(*choices): return group(*choices) + '?'
|
||||
|
||||
# Note: we use unicode matching for names ("\w") but ascii matching for
|
||||
# number literals.
|
||||
Whitespace = r'[ \f\t]*'
|
||||
Comment = r'#[^\r\n]*'
|
||||
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
|
||||
Name = r'\w+'
|
||||
|
||||
Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
|
||||
Binnumber = r'0[bB](?:_?[01])+'
|
||||
Octnumber = r'0[oO](?:_?[0-7])+'
|
||||
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
|
||||
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
|
||||
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
|
||||
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
|
||||
r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
|
||||
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
|
||||
Floatnumber = group(Pointfloat, Expfloat)
|
||||
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
|
||||
Number = group(Imagnumber, Floatnumber, Intnumber)
|
||||
|
||||
# Return the empty string, plus all of the valid string prefixes.
|
||||
def _all_string_prefixes():
|
||||
# The valid string prefixes. Only contain the lower case versions,
|
||||
# and don't contain any permutations (include 'fr', but not
|
||||
# 'rf'). The various permutations will be generated.
|
||||
_valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
|
||||
# if we add binary f-strings, add: ['fb', 'fbr']
|
||||
result = {''}
|
||||
for prefix in _valid_string_prefixes:
|
||||
for t in _itertools.permutations(prefix):
|
||||
# create a list with upper and lower versions of each
|
||||
# character
|
||||
for u in _itertools.product(*[(c, c.upper()) for c in t]):
|
||||
result.add(''.join(u))
|
||||
return result
|
||||
|
||||
@functools.lru_cache
|
||||
def _compile(expr):
|
||||
return re.compile(expr, re.UNICODE)
|
||||
|
||||
# Note that since _all_string_prefixes includes the empty string,
|
||||
# StringPrefix can be the empty string (making it optional).
|
||||
StringPrefix = group(*_all_string_prefixes())
|
||||
|
||||
# Tail end of ' string.
|
||||
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
|
||||
# Tail end of " string.
|
||||
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
|
||||
# Tail end of ''' string.
|
||||
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
|
||||
# Tail end of """ string.
|
||||
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
|
||||
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
|
||||
# Single-line ' or " string.
|
||||
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
|
||||
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
|
||||
|
||||
# Sorting in reverse order puts the long operators before their prefixes.
|
||||
# Otherwise if = came before ==, == would get recognized as two instances
|
||||
# of =.
|
||||
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
|
||||
Funny = group(r'\r?\n', Special)
|
||||
|
||||
PlainToken = group(Number, Funny, String, Name)
|
||||
Token = Ignore + PlainToken
|
||||
|
||||
# First (or only) line of ' or " string.
|
||||
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
|
||||
group("'", r'\\\r?\n'),
|
||||
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
|
||||
group('"', r'\\\r?\n'))
|
||||
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
|
||||
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
|
||||
|
||||
# For a given string prefix plus quotes, endpats maps it to a regex
|
||||
# to match the remainder of that string. _prefix can be empty, for
|
||||
# a normal single or triple quoted string (with no prefix).
|
||||
endpats = {}
|
||||
for _prefix in _all_string_prefixes():
|
||||
endpats[_prefix + "'"] = Single
|
||||
endpats[_prefix + '"'] = Double
|
||||
endpats[_prefix + "'''"] = Single3
|
||||
endpats[_prefix + '"""'] = Double3
|
||||
del _prefix
|
||||
|
||||
# A set of all of the single and triple quoted string prefixes,
|
||||
# including the opening quotes.
|
||||
single_quoted = set()
|
||||
triple_quoted = set()
|
||||
for t in _all_string_prefixes():
|
||||
for u in (t + '"', t + "'"):
|
||||
single_quoted.add(u)
|
||||
for u in (t + '"""', t + "'''"):
|
||||
triple_quoted.add(u)
|
||||
del t, u
|
||||
|
||||
tabsize = 8
|
||||
|
||||
class TokenError(Exception): pass
|
||||
|
||||
class StopTokenizing(Exception): pass
|
||||
|
||||
class StopTokenizing(Exception): pass
|
||||
|
||||
class Untokenizer:
|
||||
|
||||
|
@ -213,6 +112,14 @@ class Untokenizer:
|
|||
self.tokens.append(indent)
|
||||
self.prev_col = len(indent)
|
||||
startline = False
|
||||
elif tok_type == FSTRING_MIDDLE:
|
||||
if '{' in token or '}' in token:
|
||||
end_line, end_col = end
|
||||
end = (end_line, end_col + token.count('{') + token.count('}'))
|
||||
token = re.sub('{', '{{', token)
|
||||
token = re.sub('}', '}}', token)
|
||||
|
||||
|
||||
self.add_whitespace(start)
|
||||
self.tokens.append(token)
|
||||
self.prev_row, self.prev_col = end
|
||||
|
@ -255,6 +162,11 @@ class Untokenizer:
|
|||
elif startline and indents:
|
||||
toks_append(indents[-1])
|
||||
startline = False
|
||||
elif toknum == FSTRING_MIDDLE:
|
||||
if '{' in tokval or '}' in tokval:
|
||||
tokval = re.sub('{', '{{', tokval)
|
||||
tokval = re.sub('}', '}}', tokval)
|
||||
|
||||
toks_append(tokval)
|
||||
|
||||
|
||||
|
@ -404,7 +316,6 @@ def open(filename):
|
|||
buffer.close()
|
||||
raise
|
||||
|
||||
|
||||
def tokenize(readline):
|
||||
"""
|
||||
The tokenize() generator requires one argument, readline, which
|
||||
|
@ -425,192 +336,32 @@ def tokenize(readline):
|
|||
which tells you which encoding was used to decode the bytes stream.
|
||||
"""
|
||||
encoding, consumed = detect_encoding(readline)
|
||||
empty = _itertools.repeat(b"")
|
||||
rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
|
||||
return _tokenize(rl_gen.__next__, encoding)
|
||||
|
||||
|
||||
def _tokenize(readline, encoding):
|
||||
lnum = parenlev = continued = 0
|
||||
numchars = '0123456789'
|
||||
contstr, needcont = '', 0
|
||||
contline = None
|
||||
indents = [0]
|
||||
|
||||
rl_gen = _itertools.chain(consumed, iter(readline, b""))
|
||||
if encoding is not None:
|
||||
if encoding == "utf-8-sig":
|
||||
# BOM will already have been stripped.
|
||||
encoding = "utf-8"
|
||||
yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
|
||||
last_line = b''
|
||||
line = b''
|
||||
while True: # loop over lines in stream
|
||||
try:
|
||||
# We capture the value of the line variable here because
|
||||
# readline uses the empty string '' to signal end of input,
|
||||
# hence `line` itself will always be overwritten at the end
|
||||
# of this loop.
|
||||
last_line = line
|
||||
line = readline()
|
||||
except StopIteration:
|
||||
line = b''
|
||||
yield from _tokenize(rl_gen, encoding)
|
||||
|
||||
if encoding is not None:
|
||||
line = line.decode(encoding)
|
||||
lnum += 1
|
||||
pos, max = 0, len(line)
|
||||
def _tokenize(rl_gen, encoding):
|
||||
source = b"".join(rl_gen).decode(encoding)
|
||||
token = None
|
||||
for token in _generate_tokens_from_c_tokenizer(source, extra_tokens=True):
|
||||
# TODO: Marta -> limpiar esto
|
||||
if 6 < token.type <= 54:
|
||||
token = token._replace(type=OP)
|
||||
if token.type in {ASYNC, AWAIT}:
|
||||
token = token._replace(type=NAME)
|
||||
if token.type == NEWLINE:
|
||||
l_start, c_start = token.start
|
||||
l_end, c_end = token.end
|
||||
token = token._replace(string='\n', start=(l_start, c_start), end=(l_end, c_end+1))
|
||||
|
||||
if contstr: # continued string
|
||||
if not line:
|
||||
raise TokenError("EOF in multi-line string", strstart)
|
||||
endmatch = endprog.match(line)
|
||||
if endmatch:
|
||||
pos = end = endmatch.end(0)
|
||||
yield TokenInfo(STRING, contstr + line[:end],
|
||||
strstart, (lnum, end), contline + line)
|
||||
contstr, needcont = '', 0
|
||||
contline = None
|
||||
elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
|
||||
yield TokenInfo(ERRORTOKEN, contstr + line,
|
||||
strstart, (lnum, len(line)), contline)
|
||||
contstr = ''
|
||||
contline = None
|
||||
continue
|
||||
else:
|
||||
contstr = contstr + line
|
||||
contline = contline + line
|
||||
continue
|
||||
|
||||
elif parenlev == 0 and not continued: # new statement
|
||||
if not line: break
|
||||
column = 0
|
||||
while pos < max: # measure leading whitespace
|
||||
if line[pos] == ' ':
|
||||
column += 1
|
||||
elif line[pos] == '\t':
|
||||
column = (column//tabsize + 1)*tabsize
|
||||
elif line[pos] == '\f':
|
||||
column = 0
|
||||
else:
|
||||
break
|
||||
pos += 1
|
||||
if pos == max:
|
||||
break
|
||||
|
||||
if line[pos] in '#\r\n': # skip comments or blank lines
|
||||
if line[pos] == '#':
|
||||
comment_token = line[pos:].rstrip('\r\n')
|
||||
yield TokenInfo(COMMENT, comment_token,
|
||||
(lnum, pos), (lnum, pos + len(comment_token)), line)
|
||||
pos += len(comment_token)
|
||||
|
||||
yield TokenInfo(NL, line[pos:],
|
||||
(lnum, pos), (lnum, len(line)), line)
|
||||
continue
|
||||
|
||||
if column > indents[-1]: # count indents or dedents
|
||||
indents.append(column)
|
||||
yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
|
||||
while column < indents[-1]:
|
||||
if column not in indents:
|
||||
raise IndentationError(
|
||||
"unindent does not match any outer indentation level",
|
||||
("<tokenize>", lnum, pos, line))
|
||||
indents = indents[:-1]
|
||||
|
||||
yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
|
||||
|
||||
else: # continued statement
|
||||
if not line:
|
||||
raise TokenError("EOF in multi-line statement", (lnum, 0))
|
||||
continued = 0
|
||||
|
||||
while pos < max:
|
||||
pseudomatch = _compile(PseudoToken).match(line, pos)
|
||||
if pseudomatch: # scan for tokens
|
||||
start, end = pseudomatch.span(1)
|
||||
spos, epos, pos = (lnum, start), (lnum, end), end
|
||||
if start == end:
|
||||
continue
|
||||
token, initial = line[start:end], line[start]
|
||||
|
||||
if (initial in numchars or # ordinary number
|
||||
(initial == '.' and token != '.' and token != '...')):
|
||||
yield TokenInfo(NUMBER, token, spos, epos, line)
|
||||
elif initial in '\r\n':
|
||||
if parenlev > 0:
|
||||
yield TokenInfo(NL, token, spos, epos, line)
|
||||
else:
|
||||
yield TokenInfo(NEWLINE, token, spos, epos, line)
|
||||
|
||||
elif initial == '#':
|
||||
assert not token.endswith("\n")
|
||||
yield TokenInfo(COMMENT, token, spos, epos, line)
|
||||
|
||||
elif token in triple_quoted:
|
||||
endprog = _compile(endpats[token])
|
||||
endmatch = endprog.match(line, pos)
|
||||
if endmatch: # all on one line
|
||||
pos = endmatch.end(0)
|
||||
token = line[start:pos]
|
||||
yield TokenInfo(STRING, token, spos, (lnum, pos), line)
|
||||
else:
|
||||
strstart = (lnum, start) # multiple lines
|
||||
contstr = line[start:]
|
||||
contline = line
|
||||
break
|
||||
|
||||
# Check up to the first 3 chars of the token to see if
|
||||
# they're in the single_quoted set. If so, they start
|
||||
# a string.
|
||||
# We're using the first 3, because we're looking for
|
||||
# "rb'" (for example) at the start of the token. If
|
||||
# we switch to longer prefixes, this needs to be
|
||||
# adjusted.
|
||||
# Note that initial == token[:1].
|
||||
# Also note that single quote checking must come after
|
||||
# triple quote checking (above).
|
||||
elif (initial in single_quoted or
|
||||
token[:2] in single_quoted or
|
||||
token[:3] in single_quoted):
|
||||
if token[-1] == '\n': # continued string
|
||||
strstart = (lnum, start)
|
||||
# Again, using the first 3 chars of the
|
||||
# token. This is looking for the matching end
|
||||
# regex for the correct type of quote
|
||||
# character. So it's really looking for
|
||||
# endpats["'"] or endpats['"'], by trying to
|
||||
# skip string prefix characters, if any.
|
||||
endprog = _compile(endpats.get(initial) or
|
||||
endpats.get(token[1]) or
|
||||
endpats.get(token[2]))
|
||||
contstr, needcont = line[start:], 1
|
||||
contline = line
|
||||
break
|
||||
else: # ordinary string
|
||||
yield TokenInfo(STRING, token, spos, epos, line)
|
||||
|
||||
elif initial.isidentifier(): # ordinary name
|
||||
yield TokenInfo(NAME, token, spos, epos, line)
|
||||
elif initial == '\\': # continued stmt
|
||||
continued = 1
|
||||
else:
|
||||
if initial in '([{':
|
||||
parenlev += 1
|
||||
elif initial in ')]}':
|
||||
parenlev -= 1
|
||||
yield TokenInfo(OP, token, spos, epos, line)
|
||||
else:
|
||||
yield TokenInfo(ERRORTOKEN, line[pos],
|
||||
(lnum, pos), (lnum, pos+1), line)
|
||||
pos += 1
|
||||
|
||||
# Add an implicit NEWLINE if the input doesn't end in one
|
||||
if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
|
||||
yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
|
||||
for indent in indents[1:]: # pop remaining indent levels
|
||||
yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
|
||||
yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
|
||||
yield token
|
||||
if token is not None:
|
||||
last_line, _ = token.start
|
||||
yield TokenInfo(ENDMARKER, '', (last_line + 1, 0), (last_line + 1, 0), '')
|
||||
|
||||
|
||||
def generate_tokens(readline):
|
||||
|
@ -619,7 +370,16 @@ def generate_tokens(readline):
|
|||
This has the same API as tokenize(), except that it expects the *readline*
|
||||
callable to return str objects instead of bytes.
|
||||
"""
|
||||
return _tokenize(readline, None)
|
||||
def _gen():
|
||||
while True:
|
||||
try:
|
||||
line = readline()
|
||||
except StopIteration:
|
||||
return
|
||||
if not line:
|
||||
return
|
||||
yield line.encode()
|
||||
return _tokenize(_gen(), 'utf-8')
|
||||
|
||||
def main():
|
||||
import argparse
|
||||
|
@ -656,7 +416,10 @@ def main():
|
|||
tokens = list(tokenize(f.readline))
|
||||
else:
|
||||
filename = "<stdin>"
|
||||
tokens = _tokenize(sys.stdin.readline, None)
|
||||
tokens = _tokenize(
|
||||
(x.encode('utf-8') for x in iter(sys.stdin.readline, "")
|
||||
), "utf-8")
|
||||
|
||||
|
||||
# Output the tokenization
|
||||
for token in tokens:
|
||||
|
@ -682,10 +445,10 @@ def main():
|
|||
perror("unexpected error: %s" % err)
|
||||
raise
|
||||
|
||||
def _generate_tokens_from_c_tokenizer(source):
|
||||
def _generate_tokens_from_c_tokenizer(source, extra_tokens=False):
|
||||
"""Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
|
||||
import _tokenize as c_tokenizer
|
||||
for info in c_tokenizer.TokenizerIter(source):
|
||||
for info in c_tokenizer.TokenizerIter(source, extra_tokens=extra_tokens):
|
||||
tok, type, lineno, end_lineno, col_off, end_col_off, line = info
|
||||
yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue