closes bpo-31650: PEP 552 (Deterministic pycs) implementation (#4575)

Python now supports checking bytecode cache up-to-dateness with a hash of the
source contents rather than volatile source metadata. See the PEP for details.

While a fairly straightforward idea, quite a lot of code had to be modified due
to the pervasiveness of pyc implementation details in the codebase. Changes in
this commit include:

- The core changes to importlib to understand how to read, validate, and
  regenerate hash-based pycs.

- Support for generating hash-based pycs in py_compile and compileall.

- Modifications to our siphash implementation to support passing a custom
  key. We then expose it to importlib through _imp.

- Updates to all places in the interpreter, standard library, and tests that
  manually generate or parse pyc files to grok the new format.

- Support in the interpreter command line code for long options like
  --check-hash-based-pycs.

- Tests and documentation for all of the above.
This commit is contained in:
Benjamin Peterson 2017-12-09 10:26:52 -08:00 committed by GitHub
parent 28d8d14013
commit 42aa93b8ff
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 3364 additions and 2505 deletions

View file

@ -354,6 +354,41 @@ exit:
return return_value;
}
PyDoc_STRVAR(_imp_source_hash__doc__,
"source_hash($module, /, key, source)\n"
"--\n"
"\n");
#define _IMP_SOURCE_HASH_METHODDEF \
{"source_hash", (PyCFunction)_imp_source_hash, METH_FASTCALL|METH_KEYWORDS, _imp_source_hash__doc__},
static PyObject *
_imp_source_hash_impl(PyObject *module, long key, Py_buffer *source);
static PyObject *
_imp_source_hash(PyObject *module, PyObject **args, Py_ssize_t nargs, PyObject *kwnames)
{
PyObject *return_value = NULL;
static const char * const _keywords[] = {"key", "source", NULL};
static _PyArg_Parser _parser = {"ly*:source_hash", _keywords, 0};
long key;
Py_buffer source = {NULL, NULL};
if (!_PyArg_ParseStackAndKeywords(args, nargs, kwnames, &_parser,
&key, &source)) {
goto exit;
}
return_value = _imp_source_hash_impl(module, key, &source);
exit:
/* Cleanup for source */
if (source.obj) {
PyBuffer_Release(&source);
}
return return_value;
}
#ifndef _IMP_CREATE_DYNAMIC_METHODDEF
#define _IMP_CREATE_DYNAMIC_METHODDEF
#endif /* !defined(_IMP_CREATE_DYNAMIC_METHODDEF) */
@ -361,4 +396,4 @@ exit:
#ifndef _IMP_EXEC_DYNAMIC_METHODDEF
#define _IMP_EXEC_DYNAMIC_METHODDEF
#endif /* !defined(_IMP_EXEC_DYNAMIC_METHODDEF) */
/*[clinic end generated code: output=d068dd493e513604 input=a9049054013a1b77]*/
/*[clinic end generated code: output=e8b2c0b0d0a75da8 input=a9049054013a1b77]*/

View file

@ -51,7 +51,8 @@ void _PyOS_ResetGetOpt(void)
opt_ptr = L"";
}
int _PyOS_GetOpt(int argc, wchar_t **argv, wchar_t *optstring)
int _PyOS_GetOpt(int argc, wchar_t **argv, wchar_t *optstring,
const _PyOS_LongOption *longopts, int *longindex)
{
wchar_t *ptr;
wchar_t option;
@ -86,13 +87,41 @@ int _PyOS_GetOpt(int argc, wchar_t **argv, wchar_t *optstring)
return 'V';
}
opt_ptr = &argv[_PyOS_optind++][1];
}
if ((option = *opt_ptr++) == L'\0')
return -1;
if (option == L'-') {
// Parse long option.
if (*opt_ptr == L'\0') {
fprintf(stderr, "expected long option\n");
return -1;
}
*longindex = 0;
const _PyOS_LongOption *opt;
for (opt = &longopts[*longindex]; opt->name; opt = &longopts[++(*longindex)]) {
if (!wcscmp(opt->name, opt_ptr))
break;
}
if (!opt->name) {
fprintf(stderr, "unknown option %ls\n", argv[_PyOS_optind - 1]);
return '_';
}
opt_ptr = L"";
if (!opt->has_arg) {
return opt->val;
}
if (_PyOS_optind >= argc) {
fprintf(stderr, "Argument expected for the %ls options\n",
argv[_PyOS_optind - 1]);
return '_';
}
_PyOS_optarg = argv[_PyOS_optind++];
return opt->val;
}
if (option == 'J') {
if (_PyOS_opterr)
fprintf(stderr, "-J is reserved for Jython\n");

View file

@ -5,6 +5,8 @@
#include "Python-ast.h"
#undef Yield /* undefine macro conflicting with winbase.h */
#include "internal/hash.h"
#include "internal/import.h"
#include "internal/pystate.h"
#include "errcode.h"
#include "marshal.h"
@ -2184,6 +2186,34 @@ _imp_exec_builtin_impl(PyObject *module, PyObject *mod)
return exec_builtin_or_dynamic(mod);
}
/*[clinic input]
_imp.source_hash
key: long
source: Py_buffer
[clinic start generated code]*/
static PyObject *
_imp_source_hash_impl(PyObject *module, long key, Py_buffer *source)
/*[clinic end generated code: output=edb292448cf399ea input=9aaad1e590089789]*/
{
uint64_t hash = _Py_KeyedHash((uint64_t)key, source->buf, source->len);
#if !PY_LITTLE_ENDIAN
// Force to little-endian. There really ought to be a succinct standard way
// to do this.
union {
uint64_t x;
unsigned char data[sizeof(uint64_t)];
} pun;
pun.x = hash;
for (size_t i = 0; i < sizeof(pun.data); i++) {
pun.data[sizeof(pun.data) - i - 1] = pun.data[i];
}
hash = pun.x;
#endif
return PyBytes_FromStringAndSize((const char *)&hash, sizeof(hash));
}
PyDoc_STRVAR(doc_imp,
"(Extremely) low-level import machinery bits as used by importlib and imp.");
@ -2203,6 +2233,7 @@ static PyMethodDef imp_methods[] = {
_IMP_EXEC_DYNAMIC_METHODDEF
_IMP_EXEC_BUILTIN_METHODDEF
_IMP__FIX_CO_FILENAME_METHODDEF
_IMP_SOURCE_HASH_METHODDEF
{NULL, NULL} /* sentinel */
};
@ -2219,6 +2250,8 @@ static struct PyModuleDef impmodule = {
NULL
};
const char *_Py_CheckHashBasedPycsMode = "default";
PyMODINIT_FUNC
PyInit_imp(void)
{
@ -2230,6 +2263,15 @@ PyInit_imp(void)
d = PyModule_GetDict(m);
if (d == NULL)
goto failure;
PyObject *pyc_mode = PyUnicode_FromString(_Py_CheckHashBasedPycsMode);
if (pyc_mode == NULL) {
goto failure;
}
if (PyDict_SetItemString(d, "check_hash_based_pycs", pyc_mode) < 0) {
Py_DECREF(pyc_mode);
goto failure;
}
Py_DECREF(pyc_mode);
return m;
failure:

File diff suppressed because it is too large Load diff

View file

@ -284,7 +284,6 @@ static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
/* **************************************************************************
<MIT License>
Copyright (c) 2013 Marek Majkowski <marek@popcount.org>
@ -364,10 +363,10 @@ static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
HALF_ROUND(v2,v1,v0,v3,17,21);
static Py_hash_t
siphash24(const void *src, Py_ssize_t src_sz) {
uint64_t k0 = _le64toh(_Py_HashSecret.siphash.k0);
uint64_t k1 = _le64toh(_Py_HashSecret.siphash.k1);
static uint64_t
siphash24(uint64_t key0, uint64_t key1, const void *src, Py_ssize_t src_sz) {
uint64_t k0 = _le64toh(key0);
uint64_t k1 = _le64toh(key1);
uint64_t b = (uint64_t)src_sz << 56;
const uint64_t *in = (uint64_t*)src;
@ -412,12 +411,26 @@ siphash24(const void *src, Py_ssize_t src_sz) {
/* modified */
t = (v0 ^ v1) ^ (v2 ^ v3);
return (Py_hash_t)t;
return t;
}
static PyHash_FuncDef PyHash_Func = {siphash24, "siphash24", 64, 128};
static Py_hash_t
pysiphash(const void *src, Py_ssize_t src_sz) {
return (Py_hash_t)siphash24(
_Py_HashSecret.siphash.k0, _Py_HashSecret.siphash.k1,
src, src_sz);
}
#endif /* Py_HASH_ALGORITHM == Py_HASH_SIPHASH24 */
uint64_t
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
{
return siphash24(key, 0, src, src_sz);
}
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
#endif
#ifdef __cplusplus
}

View file

@ -1053,7 +1053,8 @@ run_pyc_file(FILE *fp, const char *filename, PyObject *globals,
"Bad magic number in .pyc file");
return NULL;
}
/* Skip mtime and size */
/* Skip the rest of the header. */
(void) PyMarshal_ReadLongFromFile(fp);
(void) PyMarshal_ReadLongFromFile(fp);
(void) PyMarshal_ReadLongFromFile(fp);
if (PyErr_Occurred())