mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
closes bpo-31650: PEP 552 (Deterministic pycs) implementation (#4575)
Python now supports checking bytecode cache up-to-dateness with a hash of the source contents rather than volatile source metadata. See the PEP for details.

While a fairly straightforward idea, quite a lot of code had to be modified due to the pervasiveness of pyc implementation details in the codebase. Changes in this commit include:

- The core changes to importlib to understand how to read, validate, and regenerate hash-based pycs.
- Support for generating hash-based pycs in py_compile and compileall.
- Modifications to our siphash implementation to support passing a custom key. We then expose it to importlib through _imp.
- Updates to all places in the interpreter, standard library, and tests that manually generate or parse pyc files to grok the new format.
- Support in the interpreter command line code for long options like --check-hash-based-pycs.
- Tests and documentation for all of the above.
This commit is contained in:
parent
28d8d14013
commit
42aa93b8ff
33 changed files with 3364 additions and 2505 deletions
|
@ -242,6 +242,7 @@ _code_type = type(_write_atomic.__code__)
|
|||
# Python 3.6rc1 3379 (more thorough __class__ validation #23722)
|
||||
# Python 3.7a0 3390 (add LOAD_METHOD and CALL_METHOD opcodes)
|
||||
# Python 3.7a0 3391 (update GET_AITER #31709)
|
||||
# Python 3.7a0 3392 (PEP 552: Deterministic pycs)
|
||||
#
|
||||
# MAGIC must change whenever the bytecode emitted by the compiler may no
|
||||
# longer be understood by older implementations of the eval loop (usually
|
||||
|
@ -250,7 +251,7 @@ _code_type = type(_write_atomic.__code__)
|
|||
# Whenever MAGIC_NUMBER is changed, the ranges in the magic_values array
|
||||
# in PC/launcher.c must also be updated.
|
||||
|
||||
MAGIC_NUMBER = (3391).to_bytes(2, 'little') + b'\r\n'
|
||||
MAGIC_NUMBER = (3392).to_bytes(2, 'little') + b'\r\n'
|
||||
_RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little') # For import.c
|
||||
|
||||
_PYCACHE = '__pycache__'
|
||||
|
@ -429,63 +430,93 @@ def _find_module_shim(self, fullname):
|
|||
return loader
|
||||
|
||||
|
||||
def _validate_bytecode_header(data, source_stats=None, name=None, path=None):
|
||||
"""Validate the header of the passed-in bytecode against source_stats (if
|
||||
given) and return the bytecode that can be compiled by compile().
|
||||
def _classify_pyc(data, name, exc_details):
|
||||
"""Perform basic validity checking of a pyc header and return the flags field,
|
||||
which determines how the pyc should be further validated against the source.
|
||||
|
||||
All other arguments are used to enhance error reporting.
|
||||
*data* is the contents of the pyc file. (Only the first 16 bytes are
|
||||
required, though.)
|
||||
|
||||
ImportError is raised when the magic number is incorrect or the bytecode is
|
||||
found to be stale. EOFError is raised when the data is found to be
|
||||
truncated.
|
||||
*name* is the name of the module being imported. It is used for logging.
|
||||
|
||||
*exc_details* is a dictionary passed to ImportError if it is raised, for
|
||||
improved debugging.
|
||||
|
||||
ImportError is raised when the magic number is incorrect or when the flags
|
||||
field is invalid. EOFError is raised when the data is found to be truncated.
|
||||
|
||||
"""
|
||||
exc_details = {}
|
||||
if name is not None:
|
||||
exc_details['name'] = name
|
||||
else:
|
||||
# To prevent having to make all messages have a conditional name.
|
||||
name = '<bytecode>'
|
||||
if path is not None:
|
||||
exc_details['path'] = path
|
||||
magic = data[:4]
|
||||
raw_timestamp = data[4:8]
|
||||
raw_size = data[8:12]
|
||||
if magic != MAGIC_NUMBER:
|
||||
message = 'bad magic number in {!r}: {!r}'.format(name, magic)
|
||||
message = f'bad magic number in {name!r}: {magic!r}'
|
||||
_bootstrap._verbose_message('{}', message)
|
||||
raise ImportError(message, **exc_details)
|
||||
elif len(raw_timestamp) != 4:
|
||||
message = 'reached EOF while reading timestamp in {!r}'.format(name)
|
||||
if len(data) < 16:
|
||||
message = f'reached EOF while reading pyc header of {name!r}'
|
||||
_bootstrap._verbose_message('{}', message)
|
||||
raise EOFError(message)
|
||||
elif len(raw_size) != 4:
|
||||
message = 'reached EOF while reading size of source in {!r}'.format(name)
|
||||
flags = _r_long(data[4:8])
|
||||
# Only the first two flags are defined.
|
||||
if flags & ~0b11:
|
||||
message = f'invalid flags {flags!r} in {name!r}'
|
||||
raise ImportError(message, **exc_details)
|
||||
return flags
|
||||
|
||||
|
||||
def _validate_timestamp_pyc(data, source_mtime, source_size, name,
|
||||
exc_details):
|
||||
"""Validate a pyc against the source last-modified time.
|
||||
|
||||
*data* is the contents of the pyc file. (Only the first 16 bytes are
|
||||
required.)
|
||||
|
||||
*source_mtime* is the last modified timestamp of the source file.
|
||||
|
||||
*source_size* is None or the size of the source file in bytes.
|
||||
|
||||
*name* is the name of the module being imported. It is used for logging.
|
||||
|
||||
*exc_details* is a dictionary passed to ImportError if it is raised, for
|
||||
improved debugging.
|
||||
|
||||
An ImportError is raised if the bytecode is stale.
|
||||
|
||||
"""
|
||||
if _r_long(data[8:12]) != (source_mtime & 0xFFFFFFFF):
|
||||
message = f'bytecode is stale for {name!r}'
|
||||
_bootstrap._verbose_message('{}', message)
|
||||
raise EOFError(message)
|
||||
if source_stats is not None:
|
||||
try:
|
||||
source_mtime = int(source_stats['mtime'])
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
if _r_long(raw_timestamp) != source_mtime:
|
||||
message = 'bytecode is stale for {!r}'.format(name)
|
||||
_bootstrap._verbose_message('{}', message)
|
||||
raise ImportError(message, **exc_details)
|
||||
try:
|
||||
source_size = source_stats['size'] & 0xFFFFFFFF
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
if _r_long(raw_size) != source_size:
|
||||
raise ImportError('bytecode is stale for {!r}'.format(name),
|
||||
**exc_details)
|
||||
return data[12:]
|
||||
raise ImportError(message, **exc_details)
|
||||
if (source_size is not None and
|
||||
_r_long(data[12:16]) != (source_size & 0xFFFFFFFF)):
|
||||
raise ImportError(f'bytecode is stale for {name!r}', **exc_details)
|
||||
|
||||
|
||||
def _validate_hash_pyc(data, source_hash, name, exc_details):
|
||||
"""Validate a hash-based pyc by checking the real source hash against the one in
|
||||
the pyc header.
|
||||
|
||||
*data* is the contents of the pyc file. (Only the first 16 bytes are
|
||||
required.)
|
||||
|
||||
*source_hash* is the importlib.util.source_hash() of the source file.
|
||||
|
||||
*name* is the name of the module being imported. It is used for logging.
|
||||
|
||||
*exc_details* is a dictionary passed to ImportError if it is raised, for
|
||||
improved debugging.
|
||||
|
||||
An ImportError is raised if the bytecode is stale.
|
||||
|
||||
"""
|
||||
if data[8:16] != source_hash:
|
||||
raise ImportError(
|
||||
f'hash in bytecode doesn\'t match hash of source {name!r}',
|
||||
**exc_details,
|
||||
)
|
||||
|
||||
|
||||
def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
|
||||
"""Compile bytecode as returned by _validate_bytecode_header()."""
|
||||
"""Compile bytecode as found in a pyc."""
|
||||
code = marshal.loads(data)
|
||||
if isinstance(code, _code_type):
|
||||
_bootstrap._verbose_message('code object from {!r}', bytecode_path)
|
||||
|
@ -496,16 +527,28 @@ def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
|
|||
raise ImportError('Non-code object in {!r}'.format(bytecode_path),
|
||||
name=name, path=bytecode_path)
|
||||
|
||||
def _code_to_bytecode(code, mtime=0, source_size=0):
|
||||
"""Compile a code object into bytecode for writing out to a byte-compiled
|
||||
file."""
|
||||
|
||||
def _code_to_timestamp_pyc(code, mtime=0, source_size=0):
|
||||
"Produce the data for a timestamp-based pyc."
|
||||
data = bytearray(MAGIC_NUMBER)
|
||||
data.extend(_w_long(0))
|
||||
data.extend(_w_long(mtime))
|
||||
data.extend(_w_long(source_size))
|
||||
data.extend(marshal.dumps(code))
|
||||
return data
|
||||
|
||||
|
||||
def _code_to_hash_pyc(code, source_hash, checked=True):
|
||||
"Produce the data for a hash-based pyc."
|
||||
data = bytearray(MAGIC_NUMBER)
|
||||
flags = 0b1 | checked << 1
|
||||
data.extend(_w_long(flags))
|
||||
assert len(source_hash) == 8
|
||||
data.extend(source_hash)
|
||||
data.extend(marshal.dumps(code))
|
||||
return data
|
||||
|
||||
|
||||
def decode_source(source_bytes):
|
||||
"""Decode bytes representing source code and return the string.
|
||||
|
||||
|
@ -751,6 +794,10 @@ class SourceLoader(_LoaderBasics):
|
|||
"""
|
||||
source_path = self.get_filename(fullname)
|
||||
source_mtime = None
|
||||
source_bytes = None
|
||||
source_hash = None
|
||||
hash_based = False
|
||||
check_source = True
|
||||
try:
|
||||
bytecode_path = cache_from_source(source_path)
|
||||
except NotImplementedError:
|
||||
|
@ -767,10 +814,34 @@ class SourceLoader(_LoaderBasics):
|
|||
except OSError:
|
||||
pass
|
||||
else:
|
||||
exc_details = {
|
||||
'name': fullname,
|
||||
'path': bytecode_path,
|
||||
}
|
||||
try:
|
||||
bytes_data = _validate_bytecode_header(data,
|
||||
source_stats=st, name=fullname,
|
||||
path=bytecode_path)
|
||||
flags = _classify_pyc(data, fullname, exc_details)
|
||||
bytes_data = memoryview(data)[16:]
|
||||
hash_based = flags & 0b1 != 0
|
||||
if hash_based:
|
||||
check_source = flags & 0b10 != 0
|
||||
if (_imp.check_hash_based_pycs != 'never' and
|
||||
(check_source or
|
||||
_imp.check_hash_based_pycs == 'always')):
|
||||
source_bytes = self.get_data(source_path)
|
||||
source_hash = _imp.source_hash(
|
||||
_RAW_MAGIC_NUMBER,
|
||||
source_bytes,
|
||||
)
|
||||
_validate_hash_pyc(data, source_hash, fullname,
|
||||
exc_details)
|
||||
else:
|
||||
_validate_timestamp_pyc(
|
||||
data,
|
||||
source_mtime,
|
||||
st['size'],
|
||||
fullname,
|
||||
exc_details,
|
||||
)
|
||||
except (ImportError, EOFError):
|
||||
pass
|
||||
else:
|
||||
|
@ -779,13 +850,19 @@ class SourceLoader(_LoaderBasics):
|
|||
return _compile_bytecode(bytes_data, name=fullname,
|
||||
bytecode_path=bytecode_path,
|
||||
source_path=source_path)
|
||||
source_bytes = self.get_data(source_path)
|
||||
if source_bytes is None:
|
||||
source_bytes = self.get_data(source_path)
|
||||
code_object = self.source_to_code(source_bytes, source_path)
|
||||
_bootstrap._verbose_message('code object from {}', source_path)
|
||||
if (not sys.dont_write_bytecode and bytecode_path is not None and
|
||||
source_mtime is not None):
|
||||
data = _code_to_bytecode(code_object, source_mtime,
|
||||
len(source_bytes))
|
||||
if hash_based:
|
||||
if source_hash is None:
|
||||
source_hash = _imp.source_hash(source_bytes)
|
||||
data = _code_to_hash_pyc(code_object, source_hash, check_source)
|
||||
else:
|
||||
data = _code_to_timestamp_pyc(code_object, source_mtime,
|
||||
len(source_bytes))
|
||||
try:
|
||||
self._cache_bytecode(source_path, bytecode_path, data)
|
||||
_bootstrap._verbose_message('wrote {!r}', bytecode_path)
|
||||
|
@ -887,8 +964,18 @@ class SourcelessFileLoader(FileLoader, _LoaderBasics):
|
|||
def get_code(self, fullname):
|
||||
path = self.get_filename(fullname)
|
||||
data = self.get_data(path)
|
||||
bytes_data = _validate_bytecode_header(data, name=fullname, path=path)
|
||||
return _compile_bytecode(bytes_data, name=fullname, bytecode_path=path)
|
||||
# Call _classify_pyc to do basic validation of the pyc but ignore the
|
||||
# result. There's no source to check against.
|
||||
exc_details = {
|
||||
'name': fullname,
|
||||
'path': path,
|
||||
}
|
||||
_classify_pyc(data, fullname, exc_details)
|
||||
return _compile_bytecode(
|
||||
memoryview(data)[16:],
|
||||
name=fullname,
|
||||
bytecode_path=path,
|
||||
)
|
||||
|
||||
def get_source(self, fullname):
|
||||
"""Return None as there is no source code."""
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue