bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)

"Include/token.h", "Lib/token.py" (containing now some data moved from
"Lib/tokenize.py") and new files "Parser/token.c" (containing the code
moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included
in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by
"Tools/scripts/generate_token.py". The script overwrites files only if
needed and can be used on a read-only source tree.

"Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py"
instead of being executable itself.

Added new make targets "regen-token" and "regen-symbol" which are now
dependencies of "regen-all".

The documentation now contains strings for operators and punctuation tokens.
This commit is contained in:
Serhiy Storchaka 2018-12-22 11:18:40 +02:00 committed by GitHub
parent c1b4b0f616
commit 8ac658114d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 940 additions and 462 deletions

View file

@ -32,6 +32,7 @@ import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES
# Matches a comment line carrying an encoding declaration ("coding:" or
# "coding=") and captures the encoding name in group 1.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Bytes pattern: matches a line that, after optional leading whitespace
# (space, tab, form feed), starts a comment, reaches a line-ending
# character, or is empty.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
@ -41,55 +42,6 @@ __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
"untokenize", "TokenInfo"]
del token
# Maps the exact source text of each operator / punctuation token to the
# corresponding token-type constant (names such as LPAR come from
# `from token import *`).
# NOTE(review): the same name is also imported from `token` earlier in this
# listing; this literal looks like the copy that the commit moved into the
# generated `token` module -- confirm which definition is live.
EXACT_TOKEN_TYPES = {
'(': LPAR,
')': RPAR,
'[': LSQB,
']': RSQB,
':': COLON,
',': COMMA,
';': SEMI,
'+': PLUS,
'-': MINUS,
'*': STAR,
'/': SLASH,
'|': VBAR,
'&': AMPER,
'<': LESS,
'>': GREATER,
'=': EQUAL,
'.': DOT,
'%': PERCENT,
'{': LBRACE,
'}': RBRACE,
'==': EQEQUAL,
'!=': NOTEQUAL,
'<=': LESSEQUAL,
'>=': GREATEREQUAL,
'~': TILDE,
'^': CIRCUMFLEX,
'<<': LEFTSHIFT,
'>>': RIGHTSHIFT,
'**': DOUBLESTAR,
'+=': PLUSEQUAL,
'-=': MINEQUAL,
'*=': STAREQUAL,
'/=': SLASHEQUAL,
'%=': PERCENTEQUAL,
'&=': AMPEREQUAL,
'|=': VBAREQUAL,
'^=': CIRCUMFLEXEQUAL,
'<<=': LEFTSHIFTEQUAL,
'>>=': RIGHTSHIFTEQUAL,
'**=': DOUBLESTAREQUAL,
'//': DOUBLESLASH,
'//=': DOUBLESLASHEQUAL,
'...': ELLIPSIS,
'->': RARROW,
'@': AT,
'@=': ATEQUAL,
}
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
@ -163,17 +115,11 @@ Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line string literals: a string prefix followed by a single-quoted
# or double-quoted body.  The body may contain backslash escapes (a
# backslash plus any one character) but no unescaped closing quote and no
# raw newline.  `group` and `StringPrefix` are defined outside this view.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
# Hand-written alternatives: the multi-character operators (**, >>, <<, !=,
# //, ->) are listed ahead of the single-character class, and `=?` lets
# each form optionally absorb a trailing "=" for the augmented variants.
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
r"//=?", r"->",
r"[+\-*/%&@|^=<>]=?",
r"~")
# Character class matching exactly one bracket character: ] [ ( ) { }
Bracket = '[][(){}]'
# Line endings (optional \r before \n), the three-dot ellipsis, and the
# single punctuation characters : ; . , @
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
# Any operator, bracket, or special token.
Funny = group(Operator, Bracket, Special)
# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
# The alternatives are the keys of EXACT_TOKEN_TYPES, each passed through
# re.escape so that regex metacharacters in operators match literally.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
# A line ending (optional \r before \n) or any exact operator/punctuation
# token.
Funny = group(r'\r?\n', Special)
# One token of any kind: a number, an operator/punctuation/newline token,
# a single-line string, or a name.  `Number` and `Name` are defined outside
# this view.
PlainToken = group(Number, Funny, String, Name)
# A token preceded by whatever `Ignore` matches (`Ignore` is defined outside
# this view; the two patterns are concatenated as strings).
Token = Ignore + PlainToken