bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)
"Include/token.h", "Lib/token.py" (containing now some data moved from "Lib/tokenize.py") and new files "Parser/token.c" (containing the code moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by "Tools/scripts/generate_token.py". The script overwrites files only if needed and can be used on the read-only sources tree. "Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py" instead of been executable itself. Added new make targets "regen-token" and "regen-symbol" which are now dependencies of "regen-all". The documentation contains now strings for operators and punctuation tokens.
parent c1b4b0f616
commit 8ac658114d
18 changed files with 940 additions and 462 deletions
Tools/scripts/generate_token.py (new normal file, 268 lines)

@@ -0,0 +1,268 @@
#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
#    Doc/library/token-list.inc
#    Include/token.h
#    Parser/token.c
#    Lib/token.py


NT_OFFSET = 256

def load_tokens(path):
    tok_names = []
    string_to_tok = {}
    ERRORTOKEN = None
    with open(path) as fp:
        for line in fp:
            line = line.strip()
            # strip comments
            i = line.find('#')
            if i >= 0:
                line = line[:i].strip()
            if not line:
                continue
            fields = line.split()
            name = fields[0]
            value = len(tok_names)
            if name == 'ERRORTOKEN':
                ERRORTOKEN = value
            string = fields[1] if len(fields) > 1 else None
            if string:
                string = eval(string)
                string_to_tok[string] = value
            tok_names.append(name)
    return tok_names, ERRORTOKEN, string_to_tok


def update_file(file, content):
    try:
        with open(file, 'r') as fobj:
            if fobj.read() == content:
                return False
    except (OSError, ValueError):
        pass
    with open(file, 'w') as fobj:
        fobj.write(content)
    return True


token_h_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */

/* Token types */
#ifndef Py_LIMITED_API
#ifndef Py_TOKEN_H
#define Py_TOKEN_H
#ifdef __cplusplus
extern "C" {
#endif

#undef TILDE   /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */

%s\
#define N_TOKENS        %d
#define NT_OFFSET       %d

/* Special definitions for cooperation with parser */

#define ISTERMINAL(x)           ((x) < NT_OFFSET)
#define ISNONTERMINAL(x)        ((x) >= NT_OFFSET)
#define ISEOF(x)                ((x) == ENDMARKER)


PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);

#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKEN_H */
#endif /* Py_LIMITED_API */
"""

def make_h(infile, outfile='Include/token.h'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)

    defines = []
    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
        defines.append("#define %-15s %d\n" % (name, value))

    if update_file(outfile, token_h_template % (
            ''.join(defines),
            len(tok_names),
            NT_OFFSET
        )):
        print("%s regenerated from %s" % (outfile, infile))


token_c_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */

#include "Python.h"
#include "token.h"

/* Token names */

const char * const _PyParser_TokenNames[] = {
%s\
};

/* Return the token corresponding to a single character */

int
PyToken_OneChar(int c1)
{
%s\
    return OP;
}

int
PyToken_TwoChars(int c1, int c2)
{
%s\
    return OP;
}

int
PyToken_ThreeChars(int c1, int c2, int c3)
{
%s\
    return OP;
}
"""

def generate_chars_to_token(mapping, n=1):
    result = []
    write = result.append
    indent = '    ' * n
    write(indent)
    write('switch (c%d) {\n' % (n,))
    for c in sorted(mapping):
        write(indent)
        value = mapping[c]
        if isinstance(value, dict):
            write("case '%s':\n" % (c,))
            write(generate_chars_to_token(value, n + 1))
            write(indent)
            write('    break;\n')
        else:
            write("case '%s': return %s;\n" % (c, value))
    write(indent)
    write('}\n')
    return ''.join(result)

def make_c(infile, outfile='Parser/token.c'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    string_to_tok['<>'] = string_to_tok['!=']
    chars_to_token = {}
    for string, value in string_to_tok.items():
        assert 1 <= len(string) <= 3
        name = tok_names[value]
        m = chars_to_token.setdefault(len(string), {})
        for c in string[:-1]:
            m = m.setdefault(c, {})
        m[string[-1]] = name

    names = []
    for value, name in enumerate(tok_names):
        if value >= ERRORTOKEN:
            name = '<%s>' % name
        names.append('    "%s",\n' % name)
    names.append('    "<N_TOKENS>",\n')

    if update_file(outfile, token_c_template % (
            ''.join(names),
            generate_chars_to_token(chars_to_token[1]),
            generate_chars_to_token(chars_to_token[2]),
            generate_chars_to_token(chars_to_token[3])
        )):
        print("%s regenerated from %s" % (outfile, infile))


token_inc_template = """\
.. Auto-generated by Tools/scripts/generate_token.py
%s
.. data:: N_TOKENS

.. data:: NT_OFFSET
"""

def make_rst(infile, outfile='Doc/library/token-list.inc'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
    tok_to_string = {value: s for s, value in string_to_tok.items()}

    names = []
    for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
        names.append('.. data:: %s' % (name,))
        if value in tok_to_string:
            names.append('')
            names.append('   Token value for ``"%s"``.' % tok_to_string[value])
        names.append('')

    if update_file(outfile, token_inc_template % '\n'.join(names)):
        print("%s regenerated from %s" % (outfile, infile))


token_py_template = '''\
"""Token constants."""
# Auto-generated by Tools/scripts/generate_token.py

__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']

%s
N_TOKENS = %d
# Special definitions for cooperation with parser
NT_OFFSET = %d

tok_name = {value: name
            for name, value in globals().items()
            if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())

EXACT_TOKEN_TYPES = {
%s
}

def ISTERMINAL(x):
    return x < NT_OFFSET

def ISNONTERMINAL(x):
    return x >= NT_OFFSET

def ISEOF(x):
    return x == ENDMARKER
'''

def make_py(infile, outfile='Lib/token.py'):
    tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)

    constants = []
    for value, name in enumerate(tok_names):
        constants.append('%s = %d' % (name, value))
    constants.insert(ERRORTOKEN,
        "# These aren't used by the C tokenizer but are needed for tokenize.py")

    token_types = []
    for s, value in sorted(string_to_tok.items()):
        token_types.append('    %r: %s,' % (s, tok_names[value]))

    if update_file(outfile, token_py_template % (
            '\n'.join(constants),
            len(tok_names),
            NT_OFFSET,
            '\n'.join(token_types),
        )):
        print("%s regenerated from %s" % (outfile, infile))


def main(op, infile='Grammar/Tokens', *args):
    make = globals()['make_' + op]
    make(infile, *args)


if __name__ == '__main__':
    import sys
    main(*sys.argv[1:])
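A rough sketch (not part of the commit) of what the regenerated "Lib/token.py" exposes, based on token_py_template above; the exact numeric values depend on "Grammar/Tokens":

    # Sketch: consuming the generated token module.
    import token

    print(token.tok_name[token.NAME])       # maps the numeric constant back to 'NAME'
    print(token.ISTERMINAL(token.NAME))     # True: terminal tokens are below NT_OFFSET
    print(token.ISEOF(token.ENDMARKER))     # True

    # EXACT_TOKEN_TYPES holds the operator/punctuation strings (the data moved
    # here from Lib/tokenize.py), mapping each string to its token constant.
    print(token.EXACT_TOKEN_TYPES['!='] == token.NOTEQUAL)   # True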