closes bpo-31650: PEP 552 (Deterministic pycs) implementation (#4575)

Python now supports checking bytecode cache up-to-dateness with a hash of the
source contents rather than volatile source metadata. See the PEP for details.

While a fairly straightforward idea, quite a lot of code had to be modified due
to the pervasiveness of pyc implementation details in the codebase. Changes in
this commit include:

- The core changes to importlib to understand how to read, validate, and
  regenerate hash-based pycs.

- Support for generating hash-based pycs in py_compile and compileall.

- Modifications to our siphash implementation to support passing a custom
  key. We then expose it to importlib through _imp.

- Updates to all places in the interpreter, standard library, and tests that
  manually generate or parse pyc files to grok the new format.

- Support in the interpreter command line code for long options like
  --check-hash-based-pycs.

- Tests and documentation for all of the above.
This commit is contained in:
Benjamin Peterson 2017-12-09 10:26:52 -08:00 committed by GitHub
parent 28d8d14013
commit 42aa93b8ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 3364 additions and 2505 deletions

View file

@ -242,6 +242,7 @@ _code_type = type(_write_atomic.__code__)
# Python 3.6rc1 3379 (more thorough __class__ validation #23722)
# Python 3.7a0 3390 (add LOAD_METHOD and CALL_METHOD opcodes)
# Python 3.7a0 3391 (update GET_AITER #31709)
# Python 3.7a0 3392 (PEP 552: Deterministic pycs)
#
# MAGIC must change whenever the bytecode emitted by the compiler may no
# longer be understood by older implementations of the eval loop (usually
@ -250,7 +251,7 @@ _code_type = type(_write_atomic.__code__)
# Whenever MAGIC_NUMBER is changed, the ranges in the magic_values array
# in PC/launcher.c must also be updated.
MAGIC_NUMBER = (3391).to_bytes(2, 'little') + b'\r\n'
MAGIC_NUMBER = (3392).to_bytes(2, 'little') + b'\r\n'
_RAW_MAGIC_NUMBER = int.from_bytes(MAGIC_NUMBER, 'little') # For import.c
_PYCACHE = '__pycache__'
@ -429,63 +430,93 @@ def _find_module_shim(self, fullname):
return loader
def _validate_bytecode_header(data, source_stats=None, name=None, path=None):
"""Validate the header of the passed-in bytecode against source_stats (if
given) and returning the bytecode that can be compiled by compile().
def _classify_pyc(data, name, exc_details):
"""Perform basic validity checking of a pyc header and return the flags field,
which determines how the pyc should be further validated against the source.
All other arguments are used to enhance error reporting.
*data* is the contents of the pyc file. (Only the first 16 bytes are
required, though.)
ImportError is raised when the magic number is incorrect or the bytecode is
found to be stale. EOFError is raised when the data is found to be
truncated.
*name* is the name of the module being imported. It is used for logging.
*exc_details* is a dictionary passed to ImportError if it raised for
improved debugging.
ImportError is raised when the magic number is incorrect or when the flags
field is invalid. EOFError is raised when the data is found to be truncated.
"""
exc_details = {}
if name is not None:
exc_details['name'] = name
else:
# To prevent having to make all messages have a conditional name.
name = '<bytecode>'
if path is not None:
exc_details['path'] = path
magic = data[:4]
raw_timestamp = data[4:8]
raw_size = data[8:12]
if magic != MAGIC_NUMBER:
message = 'bad magic number in {!r}: {!r}'.format(name, magic)
message = f'bad magic number in {name!r}: {magic!r}'
_bootstrap._verbose_message('{}', message)
raise ImportError(message, **exc_details)
elif len(raw_timestamp) != 4:
message = 'reached EOF while reading timestamp in {!r}'.format(name)
if len(data) < 16:
message = f'reached EOF while reading pyc header of {name!r}'
_bootstrap._verbose_message('{}', message)
raise EOFError(message)
elif len(raw_size) != 4:
message = 'reached EOF while reading size of source in {!r}'.format(name)
flags = _r_long(data[4:8])
# Only the first two flags are defined.
if flags & ~0b11:
message = f'invalid flags {flags!r} in {name!r}'
raise ImportError(message, **exc_details)
return flags
def _validate_timestamp_pyc(data, source_mtime, source_size, name,
exc_details):
"""Validate a pyc against the source last-modified time.
*data* is the contents of the pyc file. (Only the first 16 bytes are
required.)
*source_mtime* is the last modified timestamp of the source file.
*source_size* is None or the size of the source file in bytes.
*name* is the name of the module being imported. It is used for logging.
*exc_details* is a dictionary passed to ImportError if it raised for
improved debugging.
An ImportError is raised if the bytecode is stale.
"""
if _r_long(data[8:12]) != (source_mtime & 0xFFFFFFFF):
message = f'bytecode is stale for {name!r}'
_bootstrap._verbose_message('{}', message)
raise EOFError(message)
if source_stats is not None:
try:
source_mtime = int(source_stats['mtime'])
except KeyError:
pass
else:
if _r_long(raw_timestamp) != source_mtime:
message = 'bytecode is stale for {!r}'.format(name)
_bootstrap._verbose_message('{}', message)
raise ImportError(message, **exc_details)
try:
source_size = source_stats['size'] & 0xFFFFFFFF
except KeyError:
pass
else:
if _r_long(raw_size) != source_size:
raise ImportError('bytecode is stale for {!r}'.format(name),
**exc_details)
return data[12:]
raise ImportError(message, **exc_details)
if (source_size is not None and
_r_long(data[12:16]) != (source_size & 0xFFFFFFFF)):
raise ImportError(f'bytecode is stale for {name!r}', **exc_details)
def _validate_hash_pyc(data, source_hash, name, exc_details):
"""Validate a hash-based pyc by checking the real source hash against the one in
the pyc header.
*data* is the contents of the pyc file. (Only the first 16 bytes are
required.)
*source_hash* is the importlib.util.source_hash() of the source file.
*name* is the name of the module being imported. It is used for logging.
*exc_details* is a dictionary passed to ImportError if it raised for
improved debugging.
An ImportError is raised if the bytecode is stale.
"""
if data[8:16] != source_hash:
raise ImportError(
f'hash in bytecode doesn\'t match hash of source {name!r}',
**exc_details,
)
def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
"""Compile bytecode as returned by _validate_bytecode_header()."""
"""Compile bytecode as found in a pyc."""
code = marshal.loads(data)
if isinstance(code, _code_type):
_bootstrap._verbose_message('code object from {!r}', bytecode_path)
@ -496,16 +527,28 @@ def _compile_bytecode(data, name=None, bytecode_path=None, source_path=None):
raise ImportError('Non-code object in {!r}'.format(bytecode_path),
name=name, path=bytecode_path)
def _code_to_bytecode(code, mtime=0, source_size=0):
"""Compile a code object into bytecode for writing out to a byte-compiled
file."""
def _code_to_timestamp_pyc(code, mtime=0, source_size=0):
"Produce the data for a timestamp-based pyc."
data = bytearray(MAGIC_NUMBER)
data.extend(_w_long(0))
data.extend(_w_long(mtime))
data.extend(_w_long(source_size))
data.extend(marshal.dumps(code))
return data
def _code_to_hash_pyc(code, source_hash, checked=True):
"Produce the data for a hash-based pyc."
data = bytearray(MAGIC_NUMBER)
flags = 0b1 | checked << 1
data.extend(_w_long(flags))
assert len(source_hash) == 8
data.extend(source_hash)
data.extend(marshal.dumps(code))
return data
def decode_source(source_bytes):
"""Decode bytes representing source code and return the string.
@ -751,6 +794,10 @@ class SourceLoader(_LoaderBasics):
"""
source_path = self.get_filename(fullname)
source_mtime = None
source_bytes = None
source_hash = None
hash_based = False
check_source = True
try:
bytecode_path = cache_from_source(source_path)
except NotImplementedError:
@ -767,10 +814,34 @@ class SourceLoader(_LoaderBasics):
except OSError:
pass
else:
exc_details = {
'name': fullname,
'path': bytecode_path,
}
try:
bytes_data = _validate_bytecode_header(data,
source_stats=st, name=fullname,
path=bytecode_path)
flags = _classify_pyc(data, fullname, exc_details)
bytes_data = memoryview(data)[16:]
hash_based = flags & 0b1 != 0
if hash_based:
check_source = flags & 0b10 != 0
if (_imp.check_hash_based_pycs != 'never' and
(check_source or
_imp.check_hash_based_pycs == 'always')):
source_bytes = self.get_data(source_path)
source_hash = _imp.source_hash(
_RAW_MAGIC_NUMBER,
source_bytes,
)
_validate_hash_pyc(data, source_hash, fullname,
exc_details)
else:
_validate_timestamp_pyc(
data,
source_mtime,
st['size'],
fullname,
exc_details,
)
except (ImportError, EOFError):
pass
else:
@ -779,13 +850,19 @@ class SourceLoader(_LoaderBasics):
return _compile_bytecode(bytes_data, name=fullname,
bytecode_path=bytecode_path,
source_path=source_path)
source_bytes = self.get_data(source_path)
if source_bytes is None:
source_bytes = self.get_data(source_path)
code_object = self.source_to_code(source_bytes, source_path)
_bootstrap._verbose_message('code object from {}', source_path)
if (not sys.dont_write_bytecode and bytecode_path is not None and
source_mtime is not None):
data = _code_to_bytecode(code_object, source_mtime,
len(source_bytes))
if hash_based:
if source_hash is None:
source_hash = _imp.source_hash(source_bytes)
data = _code_to_hash_pyc(code_object, source_hash, check_source)
else:
data = _code_to_timestamp_pyc(code_object, source_mtime,
len(source_bytes))
try:
self._cache_bytecode(source_path, bytecode_path, data)
_bootstrap._verbose_message('wrote {!r}', bytecode_path)
@ -887,8 +964,18 @@ class SourcelessFileLoader(FileLoader, _LoaderBasics):
def get_code(self, fullname):
path = self.get_filename(fullname)
data = self.get_data(path)
bytes_data = _validate_bytecode_header(data, name=fullname, path=path)
return _compile_bytecode(bytes_data, name=fullname, bytecode_path=path)
# Call _classify_pyc to do basic validation of the pyc but ignore the
# result. There's no source to check against.
exc_details = {
'name': fullname,
'path': path,
}
_classify_pyc(data, fullname, exc_details)
return _compile_bytecode(
memoryview(data)[16:],
name=fullname,
bytecode_path=path,
)
def get_source(self, fullname):
"""Return None as there is no source code."""