bpo-34751: improved hash function for tuples (GH-9471)

This commit is contained in:
jdemeyer 2018-10-28 02:06:38 +02:00 committed by Raymond Hettinger
parent 53125a53f4
commit aeb1be5868
3 changed files with 142 additions and 42 deletions

View file

@ -333,39 +333,60 @@ error:
return NULL;
}
/* The addend 82520, was selected from the range(0, 1000000) for
generating the greatest number of prime multipliers for tuples
up to length eight:
1082527, 1165049, 1082531, 1165057, 1247581, 1330103, 1082533,
1330111, 1412633, 1165069, 1247599, 1495177, 1577699
/* Hash for tuples. This is a slightly simplified version of the xxHash
non-cryptographic hash:
- we do not use any parallellism, there is only 1 accumulator.
- we drop the final mixing since this is just a permutation of the
output space: it does not help against collisions.
- at the end, we mangle the length with a single constant.
For the xxHash specification, see
https://github.com/Cyan4973/xxHash/blob/master/doc/xxhash_spec.md
Tests have shown that it's not worth to cache the hash value, see
issue #9685.
Below are the official constants from the xxHash specification. Optimizing
compilers should emit a single "rotate" instruction for the
_PyHASH_XXROTATE() expansion. If that doesn't happen for some important
platform, the macro could be changed to expand to a platform-specific rotate
spelling instead.
*/
#if SIZEOF_PY_UHASH_T > 4
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL)
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL)
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL)
#define _PyHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */
#else
#define _PyHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL)
#define _PyHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL)
#define _PyHASH_XXPRIME_5 ((Py_uhash_t)374761393UL)
#define _PyHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */
#endif
/* Tests have shown that it's not worth to cache the hash value, see
https://bugs.python.org/issue9685 */
static Py_hash_t
tuplehash(PyTupleObject *v)
{
Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Py_hash_t y;
Py_ssize_t len = Py_SIZE(v);
PyObject **p;
Py_uhash_t mult = _PyHASH_MULTIPLIER;
x = 0x345678UL;
p = v->ob_item;
while (--len >= 0) {
y = PyObject_Hash(*p++);
if (y == -1)
Py_ssize_t i, len = Py_SIZE(v);
PyObject **item = v->ob_item;
Py_uhash_t acc = _PyHASH_XXPRIME_5;
for (i = 0; i < len; i++) {
Py_uhash_t lane = PyObject_Hash(item[i]);
if (lane == (Py_uhash_t)-1) {
return -1;
x = (x ^ y) * mult;
/* the cast might truncate len; that doesn't change hash stability */
mult += (Py_hash_t)(82520UL + len + len);
}
acc += lane * _PyHASH_XXPRIME_2;
acc = _PyHASH_XXROTATE(acc);
acc *= _PyHASH_XXPRIME_1;
}
x += 97531UL;
if (x == (Py_uhash_t)-1)
x = -2;
return x;
/* Add input length, mangled to keep the historical value of hash(()). */
acc += len ^ (_PyHASH_XXPRIME_5 ^ 3527539UL);
if (acc == (Py_uhash_t)-1) {
return 1546275796;
}
return acc;
}
static Py_ssize_t