cpython/Tools/scripts/generate_global_objects.py
Eric Snow 12360aa159
bpo-46541: Discover the global strings. (gh-31346)
Instead of manually enumerating the global strings in generate_global_objects.py, we extrapolate the list from usage of _Py_ID() and _Py_STR() in the source files.

This is partly inspired by gh-31261.

https://bugs.python.org/issue46541
2022-02-14 17:36:51 -07:00

420 lines
12 KiB
Python

import contextlib
import glob
import os.path
import re
import sys
# Make the script's own path absolute before deriving directories from it.
__file__ = os.path.abspath(__file__)
# Three dirname() steps up from this file; for Tools/scripts/<this file> that
# is the top of the source tree.
ROOT = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
# Directory holding the headers this script reads and regenerates
# (pycore_global_strings.h, pycore_runtime_init.h, pycore_global_objects.h).
INTERNAL = os.path.join(ROOT, 'Include', 'internal')
# String literals that are always generated, mapping the C field name to the
# literal's text.  main() copies this dict and adds every
# _Py_DECLARE_STR(name, "...") it finds in the scanned sources.
STRING_LITERALS = {
'empty': '',
'dot': '.',
}
# Names that appear inside _Py_ID(...) in the scanned sources but that main()
# must not add to the generated identifiers.
# NOTE(review): presumably these are macro parameter names in the files noted
# on each entry rather than real identifiers -- confirm against those files.
IGNORED = {
'ACTION', # Python/_warnings.c
'ATTR', # Python/_warnings.c and Objects/funcobject.c
'DUNDER', # Objects/typeobject.c
'RDUNDER', # Objects/typeobject.c
'SPECIAL', # Objects/weakrefobject.c
}
# Identifiers that are always generated, in addition to the ones main()
# discovers by scanning the sources for _Py_ID(name).  The section comments
# below name the macro and file each group comes from.
# NOTE(review): presumably these are listed by hand because they are only
# reached through those macros, so a literal _Py_ID(<name>) never appears in
# the source text -- confirm against the files named below.
IDENTIFIERS = [
# from ADD() Python/_warnings.c
'default',
'ignore',
# from GET_WARNINGS_ATTR() in Python/_warnings.c
'WarningMessage',
'_showwarnmsg',
'_warn_unawaited_coroutine',
'defaultaction',
'filters',
'onceregistry',
# from WRAP_METHOD() in Objects/weakrefobject.c
'__bytes__',
'__reversed__',
# from COPY_ATTR() in Objects/funcobject.c
'__module__',
'__name__',
'__qualname__',
'__doc__',
'__annotations__',
# from SLOT* in Objects/typeobject.c
'__abs__',
'__add__',
'__and__',
'__divmod__',
'__float__',
'__floordiv__',
'__getitem__',
'__iadd__',
'__iand__',
'__ifloordiv__',
'__ilshift__',
'__imatmul__',
'__imod__',
'__imul__',
'__int__',
'__invert__',
'__ior__',
'__irshift__',
'__isub__',
'__itruediv__',
'__ixor__',
'__lshift__',
'__matmul__',
'__mod__',
'__mul__',
'__neg__',
'__or__',
'__pos__',
'__pow__',
'__radd__',
'__rand__',
'__rdivmod__',
'__rfloordiv__',
'__rlshift__',
'__rmatmul__',
'__rmod__',
'__rmul__',
'__ror__',
'__rpow__',
'__rrshift__',
'__rshift__',
'__rsub__',
'__rtruediv__',
'__rxor__',
'__str__',
'__sub__',
'__truediv__',
'__xor__',
]
#######################################
# helpers
def iter_global_strings():
    """Scan the C sources under ROOT for global string usage.

    Every ``.c``/``.h`` file below ROOT is read line by line, except for
    directories whose path relative to ROOT starts with ``Include``.

    Yields 5-tuples ``(name, string, filename, lno, line)``:

    * for each ``_Py_ID(name)`` occurrence, ``string`` is None;
    * for each ``_Py_DECLARE_STR(name, "text")`` occurrence, ``string`` is
      the text between the double quotes.

    ``lno`` is the 1-based line number and ``line`` the full source line.
    """
    id_pattern = re.compile(r'\b_Py_ID\((\w+)\)')
    str_pattern = re.compile(r'\b_Py_DECLARE_STR\((\w+), "(.*?)"\)')
    for dirpath, _subdirs, basenames in os.walk(ROOT):
        if os.path.relpath(dirpath, ROOT).startswith('Include'):
            continue
        c_sources = (n for n in basenames if n.endswith(('.c', '.h')))
        for basename in c_sources:
            path = os.path.join(dirpath, basename)
            with open(path, encoding='utf-8') as infile:
                for lno, line in enumerate(infile, 1):
                    # All _Py_ID() hits on the line come first, then all
                    # _Py_DECLARE_STR() hits.
                    for found in id_pattern.finditer(line):
                        yield found.group(1), None, path, lno, line
                    for found in str_pattern.finditer(line):
                        varname, string = found.groups()
                        yield varname, string, path, lno, line
def iter_to_marker(lines, marker):
    """Yield items from *lines* until one equals *marker*.

    Each item is compared after stripping trailing whitespace.  The matching
    item is consumed from *lines* but not yielded, so a shared iterator
    resumes right after the marker.  If no item matches, everything is
    yielded.
    """
    for text in lines:
        if text.rstrip() != marker:
            yield text
            continue
        # The marker itself has been consumed; stop here.
        return
class Printer:
    """Write indented lines, optionally as backslash-continued macro lines.

    Attributes:
        level: current nesting depth; controls the indentation of each line.
        file: the writable text file that receives the output.
        continuation: stack of flags; while the top flag is true every line
            written ends with a backslash before the newline.
    """

    def __init__(self, file):
        self.level = 0
        self.file = file
        self.continuation = [False]

    @contextlib.contextmanager
    def indent(self):
        """Context manager that indents one level deeper inside its body."""
        save_level = self.level
        try:
            self.level += 1
            yield
        finally:
            # Restore rather than decrement so the level is right even if
            # the body changed it.
            self.level = save_level

    def write(self, arg):
        """Write *arg* as one line at the current indentation.

        In continuation mode the line ends with a backslash, separated from
        the text by a space unless *arg* is empty.
        """
        eol = '\n'
        if self.continuation[-1]:
            eol = f' \\{eol}' if arg else f'\\{eol}'
        # NOTE(review): the indent unit is a single space per level as
        # written; confirm this matches the checked-in generated headers.
        self.file.writelines((" "*self.level, arg, eol))

    @contextlib.contextmanager
    def block(self, prefix, suffix="", *, continuation=None):
        """Context manager that wraps its body in ``prefix {`` ... ``}suffix``.

        The body is indented one level.  *continuation* sets the
        line-continuation mode for the opening line and the body; when None
        the enclosing mode is inherited.  The closing line is written in the
        enclosing mode.

        If the body raises, the closing line is not written, but the
        continuation stack is still unwound so the printer stays usable.
        """
        if continuation is None:
            continuation = self.continuation[-1]
        self.continuation.append(continuation)
        try:
            self.write(prefix + " {")
            with self.indent():
                yield
        finally:
            # Always undo the push above, mirroring how indent() restores
            # the level in its own finally clause.
            self.continuation.pop()
        self.write("}" + suffix)
#######################################
# the global objects
# Marker lines that delimit the generated region inside each target header.
# The generators keep the text before START and after END and rewrite
# everything in between (re-emitting both markers).
START = '/* The following is auto-generated by Tools/scripts/generate_global_objects.py. */'
END = '/* End auto-generated code */'
def generate_global_strings(identifiers, strings):
    """Regenerate the marked section of pycore_global_strings.h.

    The text before the START marker and after the END marker is preserved;
    everything in between is replaced by a ``struct _Py_global_strings``
    declaration with one ``STRUCT_FOR_STR`` entry per item of *strings* and
    one ``STRUCT_FOR_ID`` entry per item of *identifiers*, both sorted.

    Args:
        identifiers: iterable of identifier names.
        strings: mapping of C field name to the literal's text.
    """
    filename = os.path.join(INTERNAL, 'pycore_global_strings.h')

    # Read the non-generated part of the file.  Use the same encoding as
    # the write below so the preserved text round-trips regardless of the
    # locale.
    with open(filename, encoding='utf-8') as infile:
        # [:-1] drops the final newline; printer.write() adds it back.
        before = ''.join(iter_to_marker(infile, START))[:-1]
        # Skip the previously generated section.
        for _ in iter_to_marker(infile, END):
            pass
        after = infile.read()[:-1]

    # Generate the file.
    with open(filename, 'w', encoding='utf-8') as outfile:
        printer = Printer(outfile)
        printer.write(before)
        printer.write(START)
        with printer.block('struct _Py_global_strings', ';'):
            with printer.block('struct', ' literals;'):
                for name, literal in sorted(strings.items()):
                    printer.write(f'STRUCT_FOR_STR({name}, "{literal}")')
            # Bare blank line (no indentation) between the two structs.
            outfile.write('\n')
            with printer.block('struct', ' identifiers;'):
                for name in sorted(identifiers):
                    assert name.isidentifier(), name
                    printer.write(f'STRUCT_FOR_ID({name})')
        printer.write(END)
        printer.write(after)
def generate_runtime_init(identifiers, strings):
    """Regenerate the marked section of pycore_runtime_init.h.

    The small-int range is read from the ``_PY_NSMALLPOSINTS`` and
    ``_PY_NSMALLNEGINTS`` defines in pycore_global_objects.h.  The text
    before the START marker and after the END marker of the target header
    is preserved; the region in between is replaced by a
    ``_Py_global_objects_INIT`` macro that initializes the small ints, the
    bytes singletons and the global strings.

    Args:
        identifiers: iterable of identifier names.
        strings: mapping of C field name to the literal's text.

    Raises:
        NotImplementedError: if no ``_PY_NSMALLNEGINTS`` define is found.
        AssertionError: if either count is missing or zero.
    """
    # First get some info from the declarations.
    nsmallposints = None
    nsmallnegints = None
    declarations = os.path.join(INTERNAL, 'pycore_global_objects.h')
    with open(declarations, encoding='utf-8') as infile:
        for line in infile:
            if line.startswith('#define _PY_NSMALLPOSINTS'):
                nsmallposints = int(line.split()[-1])
            elif line.startswith('#define _PY_NSMALLNEGINTS'):
                nsmallnegints = int(line.split()[-1])
                # Scanning stops here, so the POSINTS define must appear
                # before this one.
                break
        else:
            raise NotImplementedError
    assert nsmallposints and nsmallnegints

    # Then target the runtime initializer.
    filename = os.path.join(INTERNAL, 'pycore_runtime_init.h')

    # Read the non-generated part of the file.  Use the same encoding as
    # the write below so the preserved text round-trips regardless of the
    # locale.
    with open(filename, encoding='utf-8') as infile:
        # [:-1] drops the final newline; printer.write() adds it back.
        before = ''.join(iter_to_marker(infile, START))[:-1]
        # Skip the previously generated section.
        for _ in iter_to_marker(infile, END):
            pass
        after = infile.read()[:-1]

    # Generate the file.
    with open(filename, 'w', encoding='utf-8') as outfile:
        printer = Printer(outfile)
        printer.write(before)
        printer.write(START)
        # The whole initializer is a single macro, so every line inside
        # needs a trailing backslash.
        with printer.block('#define _Py_global_objects_INIT', continuation=True):
            with printer.block('.singletons =', ','):
                # Global int objects.
                with printer.block('.small_ints =', ','):
                    for i in range(-nsmallnegints, nsmallposints):
                        printer.write(f'_PyLong_DIGIT_INIT({i}),')
                printer.write('')
                # Global bytes objects.
                printer.write('.bytes_empty = _PyBytes_SIMPLE_INIT(0, 0),')
                with printer.block('.bytes_characters =', ','):
                    for i in range(256):
                        printer.write(f'_PyBytes_CHAR_INIT({i}),')
                printer.write('')
                # Global strings.
                with printer.block('.strings =', ','):
                    with printer.block('.literals =', ','):
                        for name, literal in sorted(strings.items()):
                            printer.write(f'INIT_STR({name}, "{literal}"),')
                    with printer.block('.identifiers =', ','):
                        for name in sorted(identifiers):
                            assert name.isidentifier(), name
                            printer.write(f'INIT_ID({name}),')
        printer.write(END)
        printer.write(after)
#######################################
# checks
def err(msg):
    """Write *msg* followed by a newline to standard error."""
    sys.stderr.write(f'{msg!s}\n')
# Matches the first _Py_ID(name) or _Py_STR(name) on a line (lazy prefix,
# anchored at the start).  Group 1 is the identifier name and group 2 the
# string-literal name; exactly one of them is set on a match.
GETTER_RE = re.compile(r'''
^
.*?
(?:
(?:
_Py_ID
[(]
( \w+ ) # <identifier>
[)]
)
|
(?:
_Py_STR
[(]
( \w+ ) # <literal>
[)]
)
)
''', re.VERBOSE)
# Matches one SLOT* macro invocation on a line; applied only to lines of
# typeobject.c.  The seven groups are, in order: slot0, slot1, slot1bin,
# reverse, slot1binfull, fullreverse, wrapped.  The last alternative
# ("wrapped") matches a line containing SLOT<digit> that does not end in ")";
# check_orphan_strings() joins such a line with the following one and
# matches again.
TYPESLOTS_RE = re.compile(r'''
^
.*?
(?:
(?:
SLOT0 [(] .*?, \s*
( \w+ ) # <slot0>
[)]
)
|
(?:
SLOT1 [(] .*?, \s*
( \w+ ) # <slot1>
, .* [)]
)
|
(?:
SLOT1BIN [(] .*?, .*?, \s*
( \w+ ) # <slot1bin>
, \s*
( \w+ ) # <reverse>
[)]
)
|
(?:
SLOT1BINFULL [(] .*?, .*?, .*?, \s*
( \w+ ) # <slot1binfull>
, \s*
( \w+ ) # <fullreverse>
[)]
)
|
( SLOT \d .* [^)] $ ) # <wrapped>
)
''', re.VERBOSE)
def check_orphan_strings(identifiers):
    """Report global strings and identifiers that no C source appears to use.

    Every ``*.c``/``*.h`` file under ROOT is scanned line by line.  A name
    counts as used when it shows up in ``_Py_ID()``/``_Py_STR()``, in a
    ``GET_WARNINGS_ATTR()`` assignment in ``_warnings.c``, or in a ``SLOT*``
    macro in ``typeobject.c``.  Scanning stops as soon as every name has
    been seen.

    Progress dots go to stdout.  If anything is left over, the unused names
    are listed on stderr; nothing is raised or returned.

    Args:
        identifiers: iterable of identifier names to look for.
    """
    # Only literals with non-empty text are checked.
    literals = {name for name, string in STRING_LITERALS.items() if string}
    identifiers = set(identifiers)
    files = glob.iglob(os.path.join(ROOT, '**', '*.[ch]'), recursive=True)
    i = 0  # Keeps "i" defined below when the glob matches no files.
    for i, filename in enumerate(files, start=1):
        # Progress output: a dot per file, grouped in 5s, 20 per row, with
        # an extra blank row every 100 files.
        print('.', end='')
        if i % 5 == 0:
            print(' ', end='')
        if i % 20 == 0:
            print()
        if i % 100 == 0:
            print()
        basename = os.path.basename(filename)
        # Read as UTF-8, like iter_global_strings(), instead of depending
        # on the locale's default encoding.
        with open(filename, encoding='utf-8') as infile:
            wrapped = None
            for line in infile:
                identifier = literal = reverse = None

                line = line.splitlines()[0]
                if wrapped:
                    # Re-join a SLOT* invocation that was split over lines.
                    line = f'{wrapped.rstrip()} {line}'
                    wrapped = None

                if basename == '_warnings.c':
                    m = re.match(r'^.* = GET_WARNINGS_ATTR[(][^,]*, (\w+),', line)
                    if m:
                        identifier, = m.groups()
                elif basename == 'typeobject.c':
                    m = TYPESLOTS_RE.match(line)
                    if m:
                        (slot0,
                         slot1,
                         slot1bin, reverse,
                         slot1binfull, fullreverse,
                         wrapped,
                         ) = m.groups()
                        identifier = slot0 or slot1 or slot1bin or slot1binfull
                        reverse = reverse or fullreverse

                # Fall back to the generic _Py_ID()/_Py_STR() lookup.
                if not identifier and not literal:
                    m = GETTER_RE.match(line)
                    if not m:
                        continue
                    identifier, literal = m.groups()

                if literal:
                    literals.discard(literal)
                if identifier:
                    identifiers.discard(identifier)
                if reverse:
                    identifiers.discard(reverse)
                if not literals and not identifiers:
                    break
            else:
                # File exhausted with names still outstanding: next file.
                continue
            # Everything was found; stop scanning files as well.
            break
    if i % 20:
        # Terminate the last, partial row of progress dots.
        print()
    if not literals and not identifiers:
        return
    err('ERROR:')
    if literals:
        err(' unused global string literals:')
        for name in sorted(literals):
            err(f' {name}')
    if identifiers:
        if literals:
            # Separator line; sent to stderr like the rest of the report.
            err('')
        err(' unused global identifiers:')
        for name in sorted(identifiers):
            err(f' {name}')
#######################################
# the script
def main(*, check=False) -> None:
    """Collect the global strings and regenerate both headers.

    Starts from IDENTIFIERS and STRING_LITERALS, adds every name reported
    by iter_global_strings() (skipping identifiers listed in IGNORED), then
    rewrites pycore_global_strings.h and pycore_runtime_init.h.

    Args:
        check: when true, also run check_orphan_strings() afterwards.

    Raises:
        ValueError: if the same string name is declared with two different
            values.
    """
    identifiers = set(IDENTIFIERS)
    strings = dict(STRING_LITERALS)
    for name, string, filename, lno, _ in iter_global_strings():
        if string is None:
            # A _Py_ID() usage.
            if name not in IGNORED:
                identifiers.add(name)
        else:
            # A _Py_DECLARE_STR() declaration.
            if name not in strings:
                strings[name] = string
            elif string != strings[name]:
                raise ValueError(
                    f'string mismatch for {name!r} '
                    f'({string!r} != {strings[name]!r}) '
                    f'at {filename}:{lno}')

    generate_global_strings(identifiers, strings)
    generate_runtime_init(identifiers, strings)

    if check:
        check_orphan_strings(identifiers)
if __name__ == '__main__':
    # Command-line entry point: the only option is --check, which is
    # forwarded to main() as its keyword argument.
    import argparse

    cli = argparse.ArgumentParser()
    cli.add_argument('--check', action='store_true')
    options = cli.parse_args()
    main(check=options.check)