bpo-30455: Generate all token-related code and docs from Grammar/Tokens. (GH-10370)

"Include/token.h", "Lib/token.py" (containing now some data moved from
"Lib/tokenize.py") and new files "Parser/token.c" (containing the code
moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included
in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by
"Tools/scripts/generate_token.py". The script overwrites files only if
needed and can be used on the read-only sources tree.

"Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py"
instead of been executable itself.

Added new make targets "regen-token" and "regen-symbol", which are now
dependencies of "regen-all".

The documentation now contains the strings of the operator and punctuation tokens.
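
For reference, each line in "Grammar/Tokens" gives a token name, optionally
followed by the token's literal string (for example, LBRACE is paired with
'{'). Below is a minimal sketch of parsing that format; it is illustrative
only, and the parse_tokens helper is an assumption, not the actual code in
"Tools/scripts/generate_token.py":

import re

def parse_tokens(path):
    """Yield (name, string) pairs from a Grammar/Tokens-style file."""
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            if not line or line.startswith('#'):
                continue  # skip blank lines and comments
            match = re.match(r"(\w+)(?:\s+'([^']+)')?$", line)
            if match:
                # string is None for tokens without a fixed spelling
                # (NAME, NUMBER, ...)
                yield match.group(1), match.group(2)

For example, list(parse_tokens("Grammar/Tokens")) would start with entries
like ('ENDMARKER', None) and later contain ('LBRACE', '{'). Regeneration of
the actual outputs is driven by the new make targets, e.g. "make regen-token".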
Serhiy Storchaka, 2018-12-22 11:18:40 +02:00 (committed by GitHub)
commit 8ac658114d, parent c1b4b0f616
18 changed files with 940 additions and 462 deletions

Lib/token.py (generated): 134 lines changed

@@ -1,15 +1,8 @@
-"""Token constants (from "token.h")."""
+"""Token constants."""
+# Auto-generated by Tools/scripts/generate_token.py
 
 __all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
 
-#  This file is automatically generated; please don't muck it up!
-#
-#  To update the symbols in this file, 'cd' to the top directory of
-#  the python source tree after building the interpreter and run:
-#
-#    ./python Lib/token.py
-#--start constants--
 ENDMARKER = 0
 NAME = 1
 NUMBER = 2
@@ -63,23 +56,70 @@ AT = 49
 ATEQUAL = 50
 RARROW = 51
 ELLIPSIS = 52
-# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
 OP = 53
+ERRORTOKEN = 54
 # These aren't used by the C tokenizer but are needed for tokenize.py
-ERRORTOKEN = 54
 COMMENT = 55
 NL = 56
 ENCODING = 57
 N_TOKENS = 58
 # Special definitions for cooperation with parser
 NT_OFFSET = 256
-#--end constants--
 
 tok_name = {value: name
             for name, value in globals().items()
             if isinstance(value, int) and not name.startswith('_')}
 __all__.extend(tok_name.values())
 
+EXACT_TOKEN_TYPES = {
+    '!=': NOTEQUAL,
+    '%': PERCENT,
+    '%=': PERCENTEQUAL,
+    '&': AMPER,
+    '&=': AMPEREQUAL,
+    '(': LPAR,
+    ')': RPAR,
+    '*': STAR,
+    '**': DOUBLESTAR,
+    '**=': DOUBLESTAREQUAL,
+    '*=': STAREQUAL,
+    '+': PLUS,
+    '+=': PLUSEQUAL,
+    ',': COMMA,
+    '-': MINUS,
+    '-=': MINEQUAL,
+    '->': RARROW,
+    '.': DOT,
+    '...': ELLIPSIS,
+    '/': SLASH,
+    '//': DOUBLESLASH,
+    '//=': DOUBLESLASHEQUAL,
+    '/=': SLASHEQUAL,
+    ':': COLON,
+    ';': SEMI,
+    '<': LESS,
+    '<<': LEFTSHIFT,
+    '<<=': LEFTSHIFTEQUAL,
+    '<=': LESSEQUAL,
+    '=': EQUAL,
+    '==': EQEQUAL,
+    '>': GREATER,
+    '>=': GREATEREQUAL,
+    '>>': RIGHTSHIFT,
+    '>>=': RIGHTSHIFTEQUAL,
+    '@': AT,
+    '@=': ATEQUAL,
+    '[': LSQB,
+    ']': RSQB,
+    '^': CIRCUMFLEX,
+    '^=': CIRCUMFLEXEQUAL,
+    '{': LBRACE,
+    '|': VBAR,
+    '|=': VBAREQUAL,
+    '}': RBRACE,
+    '~': TILDE,
+}
+
 
 def ISTERMINAL(x):
     return x < NT_OFFSET
@@ -88,73 +128,3 @@ def ISNONTERMINAL(x):
 
 def ISEOF(x):
     return x == ENDMARKER
-
-
-def _main():
-    import re
-    import sys
-    args = sys.argv[1:]
-    inFileName = args and args[0] or "Include/token.h"
-    outFileName = "Lib/token.py"
-    if len(args) > 1:
-        outFileName = args[1]
-    try:
-        fp = open(inFileName)
-    except OSError as err:
-        sys.stdout.write("I/O error: %s\n" % str(err))
-        sys.exit(1)
-    with fp:
-        lines = fp.read().split("\n")
-    prog = re.compile(
-        r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
-        re.IGNORECASE)
-    comment_regex = re.compile(
-        r"^\s*/\*\s*(.+?)\s*\*/\s*$",
-        re.IGNORECASE)
-
-    tokens = {}
-    prev_val = None
-    for line in lines:
-        match = prog.match(line)
-        if match:
-            name, val = match.group(1, 2)
-            val = int(val)
-            tokens[val] = {'token': name}  # reverse so we can sort them...
-            prev_val = val
-        else:
-            comment_match = comment_regex.match(line)
-            if comment_match and prev_val is not None:
-                comment = comment_match.group(1)
-                tokens[prev_val]['comment'] = comment
-    keys = sorted(tokens.keys())
-    # load the output skeleton from the target:
-    try:
-        fp = open(outFileName)
-    except OSError as err:
-        sys.stderr.write("I/O error: %s\n" % str(err))
-        sys.exit(2)
-    with fp:
-        format = fp.read().split("\n")
-    try:
-        start = format.index("#--start constants--") + 1
-        end = format.index("#--end constants--")
-    except ValueError:
-        sys.stderr.write("target does not contain format markers")
-        sys.exit(3)
-    lines = []
-    for key in keys:
-        lines.append("%s = %d" % (tokens[key]["token"], key))
-        if "comment" in tokens[key]:
-            lines.append("# %s" % tokens[key]["comment"])
-    format[start:end] = lines
-    try:
-        fp = open(outFileName, 'w')
-    except OSError as err:
-        sys.stderr.write("I/O error: %s\n" % str(err))
-        sys.exit(4)
-    with fp:
-        fp.write("\n".join(format))
-
-
-if __name__ == "__main__":
-    _main()
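
After this change the token data is simply importable rather than produced by
running Lib/token.py on Include/token.h. A quick usage sketch against the
generated module (outputs assume CPython 3.8, where this commit landed):

import token

print(token.tok_name[token.RARROW])                   # 'RARROW'
print(token.EXACT_TOKEN_TYPES['->'] == token.RARROW)  # True; exact type for an OP string
print(token.ISTERMINAL(token.NAME))                   # True; terminals are < NT_OFFSET
print(token.ISNONTERMINAL(token.NT_OFFSET))           # True; grammar symbols are >= NT_OFFSET
print(token.ISEOF(token.ENDMARKER))                   # True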