Issue #19183: Implement PEP 456 'secure and interchangeable hash algorithm'.

Python now uses SipHash24 on all major platforms.
Christian Heimes 2013-11-20 11:46:18 +01:00
parent fe32aec25a
commit 985ecdcfc2
27 changed files with 1029 additions and 242 deletions
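
PEP 456 makes the string/bytes hash algorithm a pluggable, build-time choice rather than a hard-coded one. As a rough sketch of the API shape it introduces (paraphrased from the PEP; the authoritative definition lives in Include/pyhash.h added by this commit, and field names there may differ slightly):

#include <Python.h>

/* Sketch only: descriptor of one hash algorithm implementation. */
typedef struct {
    /* hash function operating on a raw byte buffer */
    Py_hash_t (*const hash)(const void *, Py_ssize_t);
    const char *name;      /* e.g. "siphash24" or "fnv" */
    const int hash_bits;   /* width of the hash value in bits */
    const int seed_bits;   /* amount of key material taken from _Py_HashSecret */
} PyHash_FuncDef;

/* Returns the descriptor of the algorithm the interpreter was built with. */
PyAPI_FUNC(PyHash_FuncDef *) PyHash_GetFuncDef(void);

At run time the selected algorithm is also reported through sys.hash_info.algorithm.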

@@ -897,7 +897,7 @@ bytes_hash(PyBytesObject *a)
{
    if (a->ob_shash == -1) {
        /* Can't fail */
        a->ob_shash = _Py_HashBytes((unsigned char *) a->ob_sval, Py_SIZE(a));
        a->ob_shash = _Py_HashBytes(a->ob_sval, Py_SIZE(a));
    }
    return a->ob_shash;
}
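
The unsigned char * casts in this hunk and in the memoryview hunk below are dropped because _Py_HashBytes now accepts an untyped buffer. Its new prototype is approximately the following (inferred from the callers shown here, not quoted from the commit):

/* Approximate new prototype: any byte buffer plus its length. */
PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void *src, Py_ssize_t src_sz);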

@@ -2742,7 +2742,7 @@ memory_hash(PyMemoryViewObject *self)
        }
        /* Can't fail */
        self->hash = _Py_HashBytes((unsigned char *)mem, view->len);
        self->hash = _Py_HashBytes(mem, view->len);
        if (mem != view->buf)
            PyMem_Free(mem);

@@ -731,150 +731,6 @@ PyObject_RichCompareBool(PyObject *v, PyObject *w, int op)
    return ok;
}

/* Set of hash utility functions to help maintaining the invariant that
   if a==b then hash(a)==hash(b)

   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
*/
/* For numeric types, the hash of a number x is based on the reduction
   of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
   hash(x) == hash(y) whenever x and y are numerically equal, even if
   x and y have different types.

   A quick summary of the hashing strategy:

   (1) First define the 'reduction of x modulo P' for any rational
   number x; this is a standard extension of the usual notion of
   reduction modulo P for integers.  If x == p/q (written in lowest
   terms), the reduction is interpreted as the reduction of p times
   the inverse of the reduction of q, all modulo P; if q is exactly
   divisible by P then define the reduction to be infinity.  So we've
   got a well-defined map

      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.

   (2) Now for a rational number x, define hash(x) by:

      reduce(x)    if x >= 0
      -reduce(-x)  if x < 0

   If the result of the reduction is infinity (this is impossible for
   integers, floats and Decimals) then use the predefined hash value
   _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
   _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
   hashes of float and Decimal infinities and nans.

   A selling point for the above strategy is that it makes it possible
   to compute hashes of decimal and binary floating-point numbers
   efficiently, even if the exponent of the binary or decimal number
   is large.  The key point is that

      reduce(x * y) == reduce(x) * reduce(y)  (modulo _PyHASH_MODULUS)

   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
   binary or decimal float is never infinity, since the denominator is a power
   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
   for nonnegative x,

      reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
      reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS

   and reduce(10**e) can be computed efficiently by the usual modular
   exponentiation algorithm.  For reduce(2**e) it's even better: since
   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
*/
Py_hash_t
_Py_HashDouble(double v)
{
    int e, sign;
    double m;
    Py_uhash_t x, y;

    if (!Py_IS_FINITE(v)) {
        if (Py_IS_INFINITY(v))
            return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
        else
            return _PyHASH_NAN;
    }

    m = frexp(v, &e);

    sign = 1;
    if (m < 0) {
        sign = -1;
        m = -m;
    }

    /* process 28 bits at a time; this should work well both for binary
       and hexadecimal floating point. */
    x = 0;
    while (m) {
        x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
        m *= 268435456.0;  /* 2**28 */
        e -= 28;
        y = (Py_uhash_t)m;  /* pull out integer part */
        m -= y;
        x += y;
        if (x >= _PyHASH_MODULUS)
            x -= _PyHASH_MODULUS;
    }

    /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
    e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
    x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);

    x = x * sign;
    if (x == (Py_uhash_t)-1)
        x = (Py_uhash_t)-2;
    return (Py_hash_t)x;
}
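
The exponent adjustment at the end relies on the identity mentioned in the comment: with P = 2**n - 1, multiplying by 2**(e mod n) modulo P is just an n-bit rotation. A small standalone demonstration of that identity (helper names are hypothetical; assumes the 64-bit case where _PyHASH_BITS is 61):

#include <stdint.h>
#include <stdio.h>

#define BITS 61
#define P (((uint64_t)1 << BITS) - 1)   /* 2**61 - 1 */

/* reduce(x * 2**e) computed as a rotation within the low 61 bits */
static uint64_t rotate_reduce(uint64_t x, unsigned e)
{
    e %= BITS;
    return ((x << e) & P) | (x >> (BITS - e));
}

/* reference computation: e modular doublings */
static uint64_t slow_reduce(uint64_t x, unsigned e)
{
    uint64_t r = x % P;
    while (e--)
        r = (2 * r) % P;
    return r;
}

int main(void)
{
    uint64_t x = 123456789012345ULL % P;
    unsigned e = 200;   /* arbitrary large exponent */
    /* both lines print the same value */
    printf("%llu\n", (unsigned long long)rotate_reduce(x, e));
    printf("%llu\n", (unsigned long long)slow_reduce(x, e));
    return 0;
}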
Py_hash_t
_Py_HashPointer(void *p)
{
    Py_hash_t x;
    size_t y = (size_t)p;
    /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
       excessive hash collisions for dicts and sets */
    y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
    x = (Py_hash_t)y;
    if (x == -1)
        x = -2;
    return x;
}
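
The rotation matters because heap addresses are usually 8- or 16-byte aligned: without it the low bits of every hash are zero and small dict/set tables, which derive the first probe slot from hash & (size-1), pile everything into one bucket. A tiny illustration of the effect (hypothetical addresses, 64-bit pointers, 16-byte alignment, an 8-slot table):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t addrs[8];
    int i;
    /* eight fake 16-byte-aligned addresses, as an allocator might hand out */
    for (i = 0; i < 8; i++)
        addrs[i] = 0x10000000ULL + 16ULL * (uint64_t)i;

    for (i = 0; i < 8; i++) {
        uint64_t plain   = addrs[i];
        uint64_t rotated = (addrs[i] >> 4) | (addrs[i] << 60);
        /* slot index in an 8-entry table: all zeros vs. nicely spread */
        printf("plain slot %llu   rotated slot %llu\n",
               (unsigned long long)(plain & 7),
               (unsigned long long)(rotated & 7));
    }
    return 0;
}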
Py_hash_t
_Py_HashBytes(unsigned char *p, Py_ssize_t len)
{
    Py_uhash_t x;
    Py_ssize_t i;

    /*
      We make the hash of the empty string be 0, rather than using
      (prefix ^ suffix), since this slightly obfuscates the hash secret
    */
#ifdef Py_DEBUG
    assert(_Py_HashSecret_Initialized);
#endif
    if (len == 0) {
        return 0;
    }
    x = (Py_uhash_t) _Py_HashSecret.prefix;
    x ^= (Py_uhash_t) *p << 7;
    for (i = 0; i < len; i++)
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
    x ^= (Py_uhash_t) len;
    x ^= (Py_uhash_t) _Py_HashSecret.suffix;
    if (x == -1)
        x = -2;
    return x;
}
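
This modified-FNV loop is what PEP 456 demotes to an optional fallback; by default bytes-like data is now hashed with SipHash-2-4, keyed by the 128-bit _Py_HashSecret. For orientation, a minimal standalone sketch of SipHash-2-4 written from the published algorithm (this is not the code the commit adds in Python/pyhash.c; byte-wise loads keep it endian- and alignment-agnostic):

#include <stdint.h>
#include <stddef.h>

#define ROTL64(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

#define SIPROUND do {                                                   \
        v0 += v1; v1 = ROTL64(v1, 13); v1 ^= v0; v0 = ROTL64(v0, 32);   \
        v2 += v3; v3 = ROTL64(v3, 16); v3 ^= v2;                        \
        v0 += v3; v3 = ROTL64(v3, 21); v3 ^= v0;                        \
        v2 += v1; v1 = ROTL64(v1, 17); v1 ^= v2; v2 = ROTL64(v2, 32);   \
    } while (0)

/* k0/k1: the 128-bit secret key (CPython draws it from _Py_HashSecret). */
static uint64_t
siphash24(uint64_t k0, uint64_t k1, const void *src, size_t len)
{
    uint64_t v0 = 0x736f6d6570736575ULL ^ k0;
    uint64_t v1 = 0x646f72616e646f6dULL ^ k1;
    uint64_t v2 = 0x6c7967656e657261ULL ^ k0;
    uint64_t v3 = 0x7465646279746573ULL ^ k1;
    const uint8_t *p = (const uint8_t *)src;
    uint64_t b = (uint64_t)len << 56;   /* length byte goes into the last block */
    uint64_t m;
    size_t i;

    /* compression: two SipRounds per little-endian 64-bit block */
    for (; len >= 8; len -= 8, p += 8) {
        m = 0;
        for (i = 0; i < 8; i++)
            m |= (uint64_t)p[i] << (8 * i);
        v3 ^= m; SIPROUND; SIPROUND; v0 ^= m;
    }
    /* last block: the 0-7 leftover bytes plus the length byte */
    for (i = 0; i < len; i++)
        b |= (uint64_t)p[i] << (8 * i);
    v3 ^= b; SIPROUND; SIPROUND; v0 ^= b;

    /* finalization: four more SipRounds */
    v2 ^= 0xff;
    SIPROUND; SIPROUND; SIPROUND; SIPROUND;
    return v0 ^ v1 ^ v2 ^ v3;
}

The 64-bit result is then folded into a Py_hash_t, and -1 is still remapped to -2, as in the removed code above, because -1 is reserved for error returns.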
Py_hash_t
PyObject_HashNotImplemented(PyObject *v)
{

@@ -883,8 +739,6 @@ PyObject_HashNotImplemented(PyObject *v)
    return -1;
}

_Py_HashSecret_t _Py_HashSecret;

Py_hash_t
PyObject_Hash(PyObject *v)
{

@@ -11386,39 +11386,8 @@ unicode_hash(PyObject *self)
        _PyUnicode_HASH(self) = 0;
        return 0;
    }

    /* The hash function as a macro, gets expanded three times below. */
#define HASH(P)                                               \
    x ^= (Py_uhash_t) *P << 7;                                \
    while (--len >= 0)                                        \
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;     \

    x = (Py_uhash_t) _Py_HashSecret.prefix;
    switch (PyUnicode_KIND(self)) {
    case PyUnicode_1BYTE_KIND: {
        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
        HASH(c);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
        HASH(s);
        break;
    }
    default: {
        Py_UCS4 *l;
        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
               "Impossible switch case in unicode_hash");
        l = PyUnicode_4BYTE_DATA(self);
        HASH(l);
        break;
    }
    }

    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
    x ^= (Py_uhash_t) _Py_HashSecret.suffix;

    if (x == -1)
        x = -2;

    x = _Py_HashBytes(PyUnicode_DATA(self),
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
    _PyUnicode_HASH(self) = x;
    return x;
}
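
The per-kind switch is no longer needed: unicode_hash now feeds the canonical fixed-width buffer straight into _Py_HashBytes. Equal strings still hash equally because a ready unicode object always uses the narrowest kind that can represent its contents, so equal strings share both the kind and the raw bytes. A hypothetical helper (not part of the commit) spelling out the buffer size involved:

#include <Python.h>

/* Bytes fed to _Py_HashBytes for a ready (canonical) unicode object:
 * length in code points times 1, 2 or 4 bytes per code point. */
static Py_ssize_t
unicode_hash_input_size(PyObject *u)
{
    return PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u);
}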