# cpython/Tools/build/generate_global_objects.py

import contextlib
import io
import os.path
import re

# Repo-relative path of this script; it is embedded in the START marker that
# opens every generated section.
SCRIPT_NAME = 'Tools/build/generate_global_objects.py'
# Make the module path absolute so the dirname() calls below work on a full
# path.
__file__ = os.path.abspath(__file__)
# Three directory levels above this file (per SCRIPT_NAME the script lives in
# Tools/build/), i.e. the top of the source tree.
ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
# Directory containing the headers that this script reads and rewrites.
INTERNAL = os.path.join(ROOT, 'Include', 'internal')


# Names that may appear as the argument of _Py_ID(...) in the scanned sources
# but must not be recorded as identifiers; get_identifiers_and_strings()
# skips them.
# NOTE(review): presumably these are macro parameter names in the files noted
# on each line -- confirm against those sources.
IGNORED = {
    'ACTION',  # Python/_warnings.c
    'ATTR',  # Python/_warnings.c and Objects/funcobject.c
    'DUNDER',  # Objects/typeobject.c
    'RDUNDER',  # Objects/typeobject.c
    'SPECIAL',  # Objects/weakrefobject.c
    'NAME',  # Objects/typeobject.c
}
# Identifiers that are always generated, regardless of what the source scan
# finds: get_identifiers_and_strings() seeds its result set with this list.
# Each group is labelled with the macro and file it was collected from.
IDENTIFIERS = [
    # from ADD() Python/_warnings.c
    'default',
    'ignore',

    # from GET_WARNINGS_ATTR() in Python/_warnings.c
    'WarningMessage',
    '_showwarnmsg',
    '_warn_unawaited_coroutine',
    'defaultaction',
    'filters',
    'onceregistry',

    # from WRAP_METHOD() in Objects/weakrefobject.c
    '__bytes__',
    '__reversed__',

    # from COPY_ATTR() in Objects/funcobject.c
    '__module__',
    '__name__',
    '__qualname__',
    '__doc__',
    '__annotations__',

    # from SLOT* in Objects/typeobject.c
    '__abs__',
    '__add__',
    '__aiter__',
    '__and__',
    '__anext__',
    '__await__',
    '__bool__',
    '__call__',
    '__contains__',
    '__del__',
    '__delattr__',
    '__delete__',
    '__delitem__',
    '__eq__',
    '__float__',
    '__floordiv__',
    '__ge__',
    '__get__',
    '__getattr__',
    '__getattribute__',
    '__getitem__',
    '__gt__',
    '__hash__',
    '__iadd__',
    '__iand__',
    '__ifloordiv__',
    '__ilshift__',
    '__imatmul__',
    '__imod__',
    '__imul__',
    '__index__',
    '__init__',
    '__int__',
    '__invert__',
    '__ior__',
    '__ipow__',
    '__irshift__',
    '__isub__',
    '__iter__',
    '__itruediv__',
    '__ixor__',
    '__le__',
    '__len__',
    '__lshift__',
    '__lt__',
    '__matmul__',
    '__mod__',
    '__mul__',
    '__ne__',
    '__neg__',
    '__new__',
    '__next__',
    '__or__',
    '__pos__',
    '__pow__',
    '__radd__',
    '__rand__',
    '__repr__',
    '__rfloordiv__',
    '__rlshift__',
    '__rmatmul__',
    '__rmod__',
    '__rmul__',
    '__ror__',
    '__rpow__',
    '__rrshift__',
    '__rshift__',
    '__rsub__',
    '__rtruediv__',
    '__rxor__',
    '__set__',
    '__setattr__',
    '__setitem__',
    '__str__',
    '__sub__',
    '__truediv__',
    '__xor__',
    '__divmod__',
    '__rdivmod__',
    '__buffer__',
    '__release_buffer__',
]

# C expressions referring to objects that are not produced by
# generate_runtime_init().  generate_global_object_finalizers() emits one
# _PyStaticObject_CheckRefcnt() call for each entry, after the generated ones.
NON_GENERATED_IMMORTAL_OBJECTS = [
    # The generated ones come from generate_runtime_init().
    '(PyObject *)&_Py_SINGLETON(bytes_empty)',
    '(PyObject *)&_Py_SINGLETON(tuple_empty)',
    '(PyObject *)&_Py_SINGLETON(hamt_bitmap_node_empty)',
    '(PyObject *)&_Py_INTERP_SINGLETON(interp, hamt_empty)',
    '(PyObject *)&_Py_SINGLETON(context_token_missing)',
]


#######################################
# helpers

def iter_files():
    """Yield the path of every .c and .h file below the scanned directories.

    The directories walked (relative to ROOT) are Modules, Objects, Parser,
    PC, Programs and Python.
    """
    scanned_dirs = ('Modules', 'Objects', 'Parser', 'PC', 'Programs', 'Python')
    for subdir in scanned_dirs:
        for dirpath, _subdirs, filenames in os.walk(os.path.join(ROOT, subdir)):
            yield from (
                os.path.join(dirpath, filename)
                for filename in filenames
                if filename.endswith(('.c', '.h'))
            )


def iter_global_strings():
    """Scan the source files for _Py_ID() and _Py_DECLARE_STR() uses.

    Yields (name, string, filename, lno, line) tuples.  For a _Py_ID() match
    `string` is None; for a _Py_DECLARE_STR() match it is the text between
    the double quotes exactly as written in the source line.
    """
    id_pattern = re.compile(r'\b_Py_ID\((\w+)\)')
    str_pattern = re.compile(r'\b_Py_DECLARE_STR\((\w+), "(.*?)"\)')
    for path in iter_files():
        try:
            source = open(path, encoding='utf-8')
        except FileNotFoundError:
            # The file must have been a temporary file.
            continue
        with source:
            for lno, text in enumerate(source, 1):
                # All identifier matches on a line come before its string
                # matches.
                for match in id_pattern.finditer(text):
                    yield match.group(1), None, path, lno, text
                for match in str_pattern.finditer(text):
                    varname, literal = match.group(1, 2)
                    yield varname, literal, path, lno, text


def iter_to_marker(lines, marker):
    """Yield items of `lines` until one equals `marker` ignoring trailing space.

    The marker line itself is consumed from `lines` but not yielded, so when
    `lines` is an iterator it is left positioned just after the marker.
    """
    for text in lines:
        if text.rstrip() != marker:
            yield text
            continue
        return


class Printer:
    """Write indented lines of generated C code to a file-like object."""

    def __init__(self, file):
        self.file = file
        # Current indentation depth, in units of four spaces.
        self.level = 0
        # Stack of flags, one per open block(); when the top one is true,
        # write() ends each line with a backslash line-continuation.
        self.continuation = [False]

    @contextlib.contextmanager
    def indent(self):
        """Indent one extra level inside the `with` body, then restore."""
        previous = self.level
        self.level = previous + 1
        try:
            yield
        finally:
            self.level = previous

    def write(self, arg):
        """Write `arg` as one line at the current indentation level."""
        if not self.continuation[-1]:
            terminator = '\n'
        elif arg:
            # Separate the text from the continuation backslash.
            terminator = ' \\\n'
        else:
            terminator = '\\\n'
        self.file.writelines(["    " * self.level, arg, terminator])

    @contextlib.contextmanager
    def block(self, prefix, suffix="", *, continuation=None):
        """Write `prefix {`, run the indented body, then write `}suffix`.

        `continuation` selects backslash line-continuations for the opening
        line and the body; None inherits the enclosing block's setting.  The
        closing line uses the enclosing block's setting.
        """
        inherited = self.continuation[-1]
        self.continuation.append(
            inherited if continuation is None else continuation)

        self.write(prefix + " {")
        with self.indent():
            yield
        # Pop first so the closing brace follows the outer setting.
        self.continuation.pop()
        self.write("}" + suffix)


@contextlib.contextmanager
def open_for_changes(filename, orig):
    """Like open() but only write to the file if it changed.

    Yields an in-memory text buffer.  When the `with` body finishes, the
    buffered text is written to `filename` (as UTF-8) only if it differs
    from `orig`; otherwise a "not changed" note is printed instead.
    """
    buffer = io.StringIO()
    yield buffer
    generated = buffer.getvalue()
    if generated == orig:
        print(f'# not changed: {filename}')
        return
    with open(filename, 'w', encoding='utf-8') as target:
        target.write(generated)


#######################################
# the global objects

# Marker lines that delimit the generated section of each target header.  The
# generate_*() functions keep the text before START and after END and
# regenerate everything in between.
START = f'/* The following is auto-generated by {SCRIPT_NAME}. */'
END = '/* End auto-generated code */'


def generate_global_strings(identifiers, strings):
    """Regenerate the struct declarations in pycore_global_strings.h.

    identifiers -- iterable of identifier names; each one becomes a
        STRUCT_FOR_ID() entry, emitted in sorted order.
    strings -- dict mapping each string literal (as written in the C source)
        to its variable name; each one becomes a STRUCT_FOR_STR() entry,
        ordered by variable name.

    Text before the START marker and after the END marker is preserved, and
    the file is only rewritten if its content actually changed.
    """
    filename = os.path.join(INTERNAL, 'pycore_global_strings.h')

    # Read the non-generated part of the file.  Decode explicitly as UTF-8:
    # the result is compared with, and written back as, UTF-8 text by
    # open_for_changes(), so it must not depend on the locale's encoding.
    with open(filename, encoding='utf-8') as infile:
        orig = infile.read()
    lines = iter(orig.rstrip().splitlines())
    before = '\n'.join(iter_to_marker(lines, START))
    # Skip the previously generated section.
    for _ in iter_to_marker(lines, END):
        pass
    after = '\n'.join(lines)

    # Generate the file.
    with open_for_changes(filename, orig) as outfile:
        printer = Printer(outfile)
        printer.write(before)
        printer.write(START)
        with printer.block('struct _Py_global_strings', ';'):
            with printer.block('struct', ' literals;'):
                for literal, name in sorted(strings.items(), key=lambda x: x[1]):
                    printer.write(f'STRUCT_FOR_STR({name}, "{literal}")')
            # Written directly rather than through the printer so that the
            # empty separator line carries no indentation.
            outfile.write('\n')
            with printer.block('struct', ' identifiers;'):
                for name in sorted(identifiers):
                    assert name.isidentifier(), name
                    printer.write(f'STRUCT_FOR_ID({name})')
            with printer.block('struct', ' ascii[128];'):
                printer.write("PyASCIIObject _ascii;")
                printer.write("uint8_t _data[2];")
            with printer.block('struct', ' latin1[128];'):
                printer.write("PyCompactUnicodeObject _latin1;")
                printer.write("uint8_t _data[2];")
        printer.write(END)
        printer.write(after)


def generate_runtime_init(identifiers, strings):
    """Regenerate the initializer macros in pycore_runtime_init_generated.h.

    identifiers -- iterable of identifier names; each one becomes an
        INIT_ID() entry, emitted in sorted order.
    strings -- dict mapping each string literal (as written in the C source)
        to its variable name; each one becomes an INIT_STR() entry, ordered
        by variable name.

    Returns a list of C expressions, one per generated object, in the order
    the objects were emitted.

    Raises NotImplementedError if pycore_global_objects.h has no
    `#define _PY_NSMALLNEGINTS` line.
    """
    # First get some info from the declarations.
    nsmallposints = None
    nsmallnegints = None
    # Read the headers explicitly as UTF-8 so decoding does not depend on the
    # locale's default encoding.
    declarations = os.path.join(INTERNAL, 'pycore_global_objects.h')
    with open(declarations, encoding='utf-8') as infile:
        for line in infile:
            if line.startswith('#define _PY_NSMALLPOSINTS'):
                nsmallposints = int(line.split()[-1])
            elif line.startswith('#define _PY_NSMALLNEGINTS'):
                nsmallnegints = int(line.split()[-1])
                break
        else:
            # The loop ended without the break above, i.e. the
            # _PY_NSMALLNEGINTS definition was never found.
            raise NotImplementedError
    assert nsmallposints and nsmallnegints

    # Then target the runtime initializer.
    filename = os.path.join(INTERNAL, 'pycore_runtime_init_generated.h')

    # Read the non-generated part of the file.  The text is compared with,
    # and written back as, UTF-8 by open_for_changes().
    with open(filename, encoding='utf-8') as infile:
        orig = infile.read()
    lines = iter(orig.rstrip().splitlines())
    before = '\n'.join(iter_to_marker(lines, START))
    # Skip the previously generated section.
    for _ in iter_to_marker(lines, END):
        pass
    after = '\n'.join(lines)

    # Generate the file.
    with open_for_changes(filename, orig) as outfile:
        immortal_objects = []
        printer = Printer(outfile)
        printer.write(before)
        printer.write(START)
        with printer.block('#define _Py_small_ints_INIT', continuation=True):
            for i in range(-nsmallnegints, nsmallposints):
                printer.write(f'_PyLong_DIGIT_INIT({i}),')
                immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(small_ints)[_PY_NSMALLNEGINTS + {i}]')
        printer.write('')
        with printer.block('#define _Py_bytes_characters_INIT', continuation=True):
            for i in range(256):
                printer.write(f'_PyBytes_CHAR_INIT({i}),')
                immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(bytes_characters)[{i}]')
        printer.write('')
        with printer.block('#define _Py_str_literals_INIT', continuation=True):
            for literal, name in sorted(strings.items(), key=lambda x: x[1]):
                printer.write(f'INIT_STR({name}, "{literal}"),')
                immortal_objects.append(f'(PyObject *)&_Py_STR({name})')
        printer.write('')
        with printer.block('#define _Py_str_identifiers_INIT', continuation=True):
            for name in sorted(identifiers):
                assert name.isidentifier(), name
                printer.write(f'INIT_ID({name}),')
                immortal_objects.append(f'(PyObject *)&_Py_ID({name})')
        printer.write('')
        with printer.block('#define _Py_str_ascii_INIT', continuation=True):
            for i in range(128):
                printer.write(f'_PyASCIIObject_INIT("\\x{i:02x}"),')
                immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).ascii[{i}]')
        printer.write('')
        with printer.block('#define _Py_str_latin1_INIT', continuation=True):
            for i in range(128, 256):
                # Spell out the UTF-8 encoding of the character as a C string
                # literal made of \x escapes.
                utf8 = ['"']
                for c in chr(i).encode('utf-8'):
                    utf8.append(f"\\x{c:02x}")
                utf8.append('"')
                printer.write(f'_PyUnicode_LATIN1_INIT("\\x{i:02x}", {"".join(utf8)}),')
                immortal_objects.append(f'(PyObject *)&_Py_SINGLETON(strings).latin1[{i} - 128]')
        printer.write(END)
        printer.write(after)
        # Returning from inside the `with` still runs open_for_changes()'s
        # exit step, so the file is written before the caller gets the list.
        return immortal_objects


def generate_static_strings_initializer(identifiers, strings):
    """Regenerate _PyUnicode_InitStaticStrings() in its generated header.

    identifiers -- iterable of identifier names, handled in sorted order.
    strings -- dict mapping each string literal (as written in the C source)
        to its variable name, handled in sorted item order.

    For every identifier and string the generated C function interns the
    static string and asserts that it is consistent and not of length 1.
    Text outside the START/END markers is preserved, and the file is only
    rewritten if its content actually changed.
    """
    # Target the static strings initializer.
    filename = os.path.join(INTERNAL, 'pycore_unicodeobject_generated.h')

    # Read the non-generated part of the file.  Decode explicitly as UTF-8:
    # the result is compared with, and written back as, UTF-8 text by
    # open_for_changes(), so it must not depend on the locale's encoding.
    with open(filename, encoding='utf-8') as infile:
        orig = infile.read()
    lines = iter(orig.rstrip().splitlines())
    before = '\n'.join(iter_to_marker(lines, START))
    # Skip the previously generated section.
    for _ in iter_to_marker(lines, END):
        pass
    after = '\n'.join(lines)

    # Generate the file.
    with open_for_changes(filename, orig) as outfile:
        printer = Printer(outfile)
        printer.write(before)
        printer.write(START)
        printer.write("static inline void")
        with printer.block("_PyUnicode_InitStaticStrings(PyInterpreterState *interp)"):
            printer.write('PyObject *string;')
            for i in sorted(identifiers):
                # This use of _Py_ID() is not picked up by
                # iter_global_strings(): iter_files() does not walk the
                # Include directory, which is where this header lives.
                printer.write(f'string = &_Py_ID({i});')
                printer.write('_PyUnicode_InternStatic(interp, &string);')
                printer.write('assert(_PyUnicode_CheckConsistency(string, 1));')
                printer.write('assert(PyUnicode_GET_LENGTH(string) != 1);')
            for _literal, name in sorted(strings.items()):
                printer.write(f'string = &_Py_STR({name});')
                printer.write('_PyUnicode_InternStatic(interp, &string);')
                printer.write('assert(_PyUnicode_CheckConsistency(string, 1));')
                printer.write('assert(PyUnicode_GET_LENGTH(string) != 1);')
        printer.write(END)
        printer.write(after)


def generate_global_object_finalizers(generated_immortal_objects):
    """Regenerate _PyStaticObjects_CheckRefcnt() in its generated header.

    generated_immortal_objects -- iterable of C expressions (as returned by
        generate_runtime_init()); one _PyStaticObject_CheckRefcnt() call is
        emitted for each, followed by one for each entry of
        NON_GENERATED_IMMORTAL_OBJECTS.

    The generated function is wrapped in `#ifdef Py_DEBUG`.  Text outside the
    START/END markers is preserved, and the file is only rewritten if its
    content actually changed.
    """
    # Target the global objects finalization header.
    filename = os.path.join(INTERNAL, 'pycore_global_objects_fini_generated.h')

    # Read the non-generated part of the file.  Decode explicitly as UTF-8:
    # the result is compared with, and written back as, UTF-8 text by
    # open_for_changes(), so it must not depend on the locale's encoding.
    with open(filename, encoding='utf-8') as infile:
        orig = infile.read()
    lines = iter(orig.rstrip().splitlines())
    before = '\n'.join(iter_to_marker(lines, START))
    # Skip the previously generated section.
    for _ in iter_to_marker(lines, END):
        pass
    after = '\n'.join(lines)

    # Generate the file.
    with open_for_changes(filename, orig) as outfile:
        printer = Printer(outfile)
        printer.write(before)
        printer.write(START)
        printer.write('#ifdef Py_DEBUG')
        printer.write("static inline void")
        with printer.block(
                "_PyStaticObjects_CheckRefcnt(PyInterpreterState *interp)"):
            printer.write('/* generated runtime-global */')
            printer.write('// (see pycore_runtime_init_generated.h)')
            for ref in generated_immortal_objects:
                printer.write(f'_PyStaticObject_CheckRefcnt({ref});')
            printer.write('/* non-generated */')
            for ref in NON_GENERATED_IMMORTAL_OBJECTS:
                printer.write(f'_PyStaticObject_CheckRefcnt({ref});')
        printer.write('#endif  // Py_DEBUG')
        printer.write(END)
        printer.write(after)


def get_identifiers_and_strings() -> 'tuple[set[str], dict[str, str]]':
    """Collect every global identifier and declared string literal.

    Returns a pair (identifiers, strings):
    - identifiers is the set of names from IDENTIFIERS plus every name found
      in a _Py_ID(...) use, except those listed in IGNORED;
    - strings maps each _Py_DECLARE_STR() literal (as written in the C
      source) to the variable name it was declared with.

    Raises ValueError if a one-character latin-1 literal is declared, if the
    same literal is declared under two different names, or if a literal is
    also used as an identifier.
    """
    identifiers = set(IDENTIFIERS)
    strings = {}
    # Note that we store strings as they appear in C source, so the checks here
    # can be defeated, e.g.:
    # - "a" and "\x61" won't be reported as duplicate.
    # - "\n" appears as 2 characters.
    # Probably not worth adding a C string parser.
    for name, string, *_ in iter_global_strings():
        if string is None:
            if name not in IGNORED:
                identifiers.add(name)
            continue
        if len(string) == 1 and ord(string) < 256:
            # Give a nice message for common mistakes.
            # To cover tricky cases (like "\n") we also generate C asserts.
            raise ValueError(
                'do not use &_Py_ID or &_Py_STR for one-character latin-1 '
                + f'strings, use _Py_LATIN1_CHR instead: {string!r}')
        # setdefault() records the first name seen for this literal and
        # returns the recorded one on later sightings.
        known_name = strings.setdefault(string, name)
        if name != known_name:
            # `strings` is keyed by the literal, so the conflicting entry has
            # to be looked up by `string`, not by `name`.
            raise ValueError(
                f'name mismatch for string {string!r} '
                f'({name!r} != {known_name!r})')
    overlap = identifiers.intersection(strings)
    if overlap:
        raise ValueError(
            'do not use both _Py_ID and _Py_DECLARE_STR for the same string: '
            + repr(overlap))
    return identifiers, strings


#######################################
# the script

def main() -> None:
    """Collect the global identifiers and strings, then regenerate headers."""
    ids, strs = get_identifiers_and_strings()

    generate_global_strings(ids, strs)
    # The finalizer header needs the object list produced while generating
    # the runtime initializer.
    immortal = generate_runtime_init(ids, strs)
    generate_static_strings_initializer(ids, strs)
    generate_global_object_finalizers(immortal)


# Regenerate the headers only when executed as a script, not on import.
if __name__ == '__main__':
    main()