closes bpo-31650: PEP 552 (Deterministic pycs) implementation (#4575)

Python now supports checking bytecode cache up-to-dateness with a hash of the
source contents rather than volatile source metadata. See the PEP for details.

While a fairly straightforward idea, quite a lot of code had to be modified due
to the pervasiveness of pyc implementation details in the codebase. Changes in
this commit include:

- The core changes to importlib to understand how to read, validate, and
  regenerate hash-based pycs.

- Support for generating hash-based pycs in py_compile and compileall.

- Modifications to our siphash implementation to support passing a custom
  key. We then expose it to importlib through _imp.

- Updates to all places in the interpreter, standard library, and tests that
  manually generate or parse pyc files to grok the new format.

- Support in the interpreter command line code for long options like
  --check-hash-based-pycs.

- Tests and documentation for all of the above.
This commit is contained in:
Benjamin Peterson 2017-12-09 10:26:52 -08:00 committed by GitHub
parent 28d8d14013
commit 42aa93b8ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 3364 additions and 2505 deletions

View file

@ -1,4 +1,5 @@
#include "Python.h"
#include "internal/import.h"
#include "internal/pystate.h"
#include "structmember.h"
#include "osdefs.h"
@ -1305,7 +1306,7 @@ unmarshal_code(PyObject *pathname, PyObject *data, time_t mtime)
unsigned char *buf = (unsigned char *)PyBytes_AsString(data);
Py_ssize_t size = PyBytes_Size(data);
if (size < 12) {
if (size < 16) {
PyErr_SetString(ZipImportError,
"bad pyc data");
return NULL;
@ -1319,7 +1320,16 @@ unmarshal_code(PyObject *pathname, PyObject *data, time_t mtime)
Py_RETURN_NONE; /* signal caller to try alternative */
}
if (mtime != 0 && !eq_mtime(get_uint32(buf + 4), mtime)) {
uint32_t flags = get_uint32(buf + 4);
if (flags != 0) {
// Hash-based pyc. We currently refuse to handle checked hash-based
// pycs. We could validate hash-based pycs against the source, but it
// seems likely that most people putting hash-based pycs in a zipfile
// will use unchecked ones.
if (strcmp(_Py_CheckHashBasedPycsMode, "never") &&
(flags != 0x1 || !strcmp(_Py_CheckHashBasedPycsMode, "always")))
Py_RETURN_NONE;
} else if ((mtime != 0 && !eq_mtime(get_uint32(buf + 8), mtime))) {
if (Py_VerboseFlag) {
PySys_FormatStderr("# %R has bad mtime\n",
pathname);
@ -1329,7 +1339,7 @@ unmarshal_code(PyObject *pathname, PyObject *data, time_t mtime)
/* XXX the pyc's size field is ignored; timestamp collisions are probably
unimportant with zip files. */
code = PyMarshal_ReadObjectFromString((char *)buf + 12, size - 12);
code = PyMarshal_ReadObjectFromString((char *)buf + 16, size - 16);
if (code == NULL) {
return NULL;
}