mirror of
https://github.com/python/cpython.git
synced 2025-08-03 00:23:06 +00:00
ssue #19183: Implement PEP 456 'secure and interchangeable hash algorithm'.
Python now uses SipHash24 on all major platforms.
This commit is contained in:
parent
fe32aec25a
commit
985ecdcfc2
27 changed files with 1029 additions and 242 deletions
|
@ -897,7 +897,7 @@ bytes_hash(PyBytesObject *a)
|
|||
{
|
||||
if (a->ob_shash == -1) {
|
||||
/* Can't fail */
|
||||
a->ob_shash = _Py_HashBytes((unsigned char *) a->ob_sval, Py_SIZE(a));
|
||||
a->ob_shash = _Py_HashBytes(a->ob_sval, Py_SIZE(a));
|
||||
}
|
||||
return a->ob_shash;
|
||||
}
|
||||
|
|
|
@ -2742,7 +2742,7 @@ memory_hash(PyMemoryViewObject *self)
|
|||
}
|
||||
|
||||
/* Can't fail */
|
||||
self->hash = _Py_HashBytes((unsigned char *)mem, view->len);
|
||||
self->hash = _Py_HashBytes(mem, view->len);
|
||||
|
||||
if (mem != view->buf)
|
||||
PyMem_Free(mem);
|
||||
|
|
146
Objects/object.c
146
Objects/object.c
|
@ -731,150 +731,6 @@ PyObject_RichCompareBool(PyObject *v, PyObject *w, int op)
|
|||
return ok;
|
||||
}
|
||||
|
||||
/* Set of hash utility functions to help maintaining the invariant that
|
||||
if a==b then hash(a)==hash(b)
|
||||
|
||||
All the utility functions (_Py_Hash*()) return "-1" to signify an error.
|
||||
*/
|
||||
|
||||
/* For numeric types, the hash of a number x is based on the reduction
|
||||
of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
|
||||
hash(x) == hash(y) whenever x and y are numerically equal, even if
|
||||
x and y have different types.
|
||||
|
||||
A quick summary of the hashing strategy:
|
||||
|
||||
(1) First define the 'reduction of x modulo P' for any rational
|
||||
number x; this is a standard extension of the usual notion of
|
||||
reduction modulo P for integers. If x == p/q (written in lowest
|
||||
terms), the reduction is interpreted as the reduction of p times
|
||||
the inverse of the reduction of q, all modulo P; if q is exactly
|
||||
divisible by P then define the reduction to be infinity. So we've
|
||||
got a well-defined map
|
||||
|
||||
reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
|
||||
|
||||
(2) Now for a rational number x, define hash(x) by:
|
||||
|
||||
reduce(x) if x >= 0
|
||||
-reduce(-x) if x < 0
|
||||
|
||||
If the result of the reduction is infinity (this is impossible for
|
||||
integers, floats and Decimals) then use the predefined hash value
|
||||
_PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
|
||||
_PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
|
||||
hashes of float and Decimal infinities and nans.
|
||||
|
||||
A selling point for the above strategy is that it makes it possible
|
||||
to compute hashes of decimal and binary floating-point numbers
|
||||
efficiently, even if the exponent of the binary or decimal number
|
||||
is large. The key point is that
|
||||
|
||||
reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
|
||||
|
||||
provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
|
||||
binary or decimal float is never infinity, since the denominator is a power
|
||||
of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
|
||||
for nonnegative x,
|
||||
|
||||
reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
|
||||
|
||||
reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
|
||||
|
||||
and reduce(10**e) can be computed efficiently by the usual modular
|
||||
exponentiation algorithm. For reduce(2**e) it's even better: since
|
||||
P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
|
||||
by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
|
||||
|
||||
*/
|
||||
|
||||
Py_hash_t
|
||||
_Py_HashDouble(double v)
|
||||
{
|
||||
int e, sign;
|
||||
double m;
|
||||
Py_uhash_t x, y;
|
||||
|
||||
if (!Py_IS_FINITE(v)) {
|
||||
if (Py_IS_INFINITY(v))
|
||||
return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
|
||||
else
|
||||
return _PyHASH_NAN;
|
||||
}
|
||||
|
||||
m = frexp(v, &e);
|
||||
|
||||
sign = 1;
|
||||
if (m < 0) {
|
||||
sign = -1;
|
||||
m = -m;
|
||||
}
|
||||
|
||||
/* process 28 bits at a time; this should work well both for binary
|
||||
and hexadecimal floating point. */
|
||||
x = 0;
|
||||
while (m) {
|
||||
x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
|
||||
m *= 268435456.0; /* 2**28 */
|
||||
e -= 28;
|
||||
y = (Py_uhash_t)m; /* pull out integer part */
|
||||
m -= y;
|
||||
x += y;
|
||||
if (x >= _PyHASH_MODULUS)
|
||||
x -= _PyHASH_MODULUS;
|
||||
}
|
||||
|
||||
/* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
|
||||
e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
|
||||
x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
|
||||
|
||||
x = x * sign;
|
||||
if (x == (Py_uhash_t)-1)
|
||||
x = (Py_uhash_t)-2;
|
||||
return (Py_hash_t)x;
|
||||
}
|
||||
|
||||
Py_hash_t
|
||||
_Py_HashPointer(void *p)
|
||||
{
|
||||
Py_hash_t x;
|
||||
size_t y = (size_t)p;
|
||||
/* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
|
||||
excessive hash collisions for dicts and sets */
|
||||
y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
|
||||
x = (Py_hash_t)y;
|
||||
if (x == -1)
|
||||
x = -2;
|
||||
return x;
|
||||
}
|
||||
|
||||
Py_hash_t
|
||||
_Py_HashBytes(unsigned char *p, Py_ssize_t len)
|
||||
{
|
||||
Py_uhash_t x;
|
||||
Py_ssize_t i;
|
||||
|
||||
/*
|
||||
We make the hash of the empty string be 0, rather than using
|
||||
(prefix ^ suffix), since this slightly obfuscates the hash secret
|
||||
*/
|
||||
#ifdef Py_DEBUG
|
||||
assert(_Py_HashSecret_Initialized);
|
||||
#endif
|
||||
if (len == 0) {
|
||||
return 0;
|
||||
}
|
||||
x = (Py_uhash_t) _Py_HashSecret.prefix;
|
||||
x ^= (Py_uhash_t) *p << 7;
|
||||
for (i = 0; i < len; i++)
|
||||
x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
|
||||
x ^= (Py_uhash_t) len;
|
||||
x ^= (Py_uhash_t) _Py_HashSecret.suffix;
|
||||
if (x == -1)
|
||||
x = -2;
|
||||
return x;
|
||||
}
|
||||
|
||||
Py_hash_t
|
||||
PyObject_HashNotImplemented(PyObject *v)
|
||||
{
|
||||
|
@ -883,8 +739,6 @@ PyObject_HashNotImplemented(PyObject *v)
|
|||
return -1;
|
||||
}
|
||||
|
||||
_Py_HashSecret_t _Py_HashSecret;
|
||||
|
||||
Py_hash_t
|
||||
PyObject_Hash(PyObject *v)
|
||||
{
|
||||
|
|
|
@ -11386,39 +11386,8 @@ unicode_hash(PyObject *self)
|
|||
_PyUnicode_HASH(self) = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* The hash function as a macro, gets expanded three times below. */
|
||||
#define HASH(P) \
|
||||
x ^= (Py_uhash_t) *P << 7; \
|
||||
while (--len >= 0) \
|
||||
x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++; \
|
||||
|
||||
x = (Py_uhash_t) _Py_HashSecret.prefix;
|
||||
switch (PyUnicode_KIND(self)) {
|
||||
case PyUnicode_1BYTE_KIND: {
|
||||
const unsigned char *c = PyUnicode_1BYTE_DATA(self);
|
||||
HASH(c);
|
||||
break;
|
||||
}
|
||||
case PyUnicode_2BYTE_KIND: {
|
||||
const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
|
||||
HASH(s);
|
||||
break;
|
||||
}
|
||||
default: {
|
||||
Py_UCS4 *l;
|
||||
assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
|
||||
"Impossible switch case in unicode_hash");
|
||||
l = PyUnicode_4BYTE_DATA(self);
|
||||
HASH(l);
|
||||
break;
|
||||
}
|
||||
}
|
||||
x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
|
||||
x ^= (Py_uhash_t) _Py_HashSecret.suffix;
|
||||
|
||||
if (x == -1)
|
||||
x = -2;
|
||||
x = _Py_HashBytes(PyUnicode_DATA(self),
|
||||
PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
|
||||
_PyUnicode_HASH(self) = x;
|
||||
return x;
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue