Issue #19183: Implement PEP 456 'secure and interchangeable hash algorithm'.

Python now uses SipHash24 on all major platforms.
Christian Heimes 2013-11-20 11:46:18 +01:00
parent fe32aec25a
commit 985ecdcfc2
27 changed files with 1029 additions and 242 deletions
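
PEP 456 makes the string/bytes hash algorithm a pluggable, build-time choice rather than a hard-coded one. As a rough sketch of the API shape it introduces (paraphrased from the PEP; the authoritative definition lives in Include/pyhash.h added by this commit, and field names there may differ slightly):

#include <Python.h>

/* Sketch only: descriptor of one hash algorithm implementation. */
typedef struct {
    /* hash function operating on a raw byte buffer */
    Py_hash_t (*const hash)(const void *, Py_ssize_t);
    const char *name;      /* e.g. "siphash24" or "fnv" */
    const int hash_bits;   /* width of the hash value in bits */
    const int seed_bits;   /* amount of key material taken from _Py_HashSecret */
} PyHash_FuncDef;

/* Returns the descriptor of the algorithm the interpreter was built with. */
PyAPI_FUNC(PyHash_FuncDef *) PyHash_GetFuncDef(void);

At run time the selected algorithm is also reported through sys.hash_info.algorithm.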

@@ -897,7 +897,7 @@ bytes_hash(PyBytesObject *a)
{
    if (a->ob_shash == -1) {
        /* Can't fail */
        a->ob_shash = _Py_HashBytes((unsigned char *) a->ob_sval, Py_SIZE(a));
        a->ob_shash = _Py_HashBytes(a->ob_sval, Py_SIZE(a));
    }
    return a->ob_shash;
}
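
The unsigned char * casts in this hunk and in the memoryview hunk below are dropped because _Py_HashBytes now accepts an untyped buffer. Its new prototype is approximately the following (inferred from the callers shown here, not quoted from the commit):

/* Approximate new prototype: any byte buffer plus its length. */
PyAPI_FUNC(Py_hash_t) _Py_HashBytes(const void *src, Py_ssize_t src_sz);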

@@ -2742,7 +2742,7 @@ memory_hash(PyMemoryViewObject *self)
        }
        /* Can't fail */
        self->hash = _Py_HashBytes((unsigned char *)mem, view->len);
        self->hash = _Py_HashBytes(mem, view->len);
        if (mem != view->buf)
            PyMem_Free(mem);

@@ -731,150 +731,6 @@ PyObject_RichCompareBool(PyObject *v, PyObject *w, int op)
    return ok;
}

/* Set of hash utility functions to help maintaining the invariant that
   if a==b then hash(a)==hash(b)

   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
*/
/* For numeric types, the hash of a number x is based on the reduction
   of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
   hash(x) == hash(y) whenever x and y are numerically equal, even if
   x and y have different types.

   A quick summary of the hashing strategy:

   (1) First define the 'reduction of x modulo P' for any rational
   number x; this is a standard extension of the usual notion of
   reduction modulo P for integers.  If x == p/q (written in lowest
   terms), the reduction is interpreted as the reduction of p times
   the inverse of the reduction of q, all modulo P; if q is exactly
   divisible by P then define the reduction to be infinity.  So we've
   got a well-defined map

      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.

   (2) Now for a rational number x, define hash(x) by:

      reduce(x)    if x >= 0
      -reduce(-x)  if x < 0

   If the result of the reduction is infinity (this is impossible for
   integers, floats and Decimals) then use the predefined hash value
   _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
   _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
   hashes of float and Decimal infinities and nans.

   A selling point for the above strategy is that it makes it possible
   to compute hashes of decimal and binary floating-point numbers
   efficiently, even if the exponent of the binary or decimal number
   is large.  The key point is that

      reduce(x * y) == reduce(x) * reduce(y)  (modulo _PyHASH_MODULUS)

   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
   binary or decimal float is never infinity, since the denominator is a power
   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
   for nonnegative x,

      reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
      reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS

   and reduce(10**e) can be computed efficiently by the usual modular
   exponentiation algorithm.  For reduce(2**e) it's even better: since
   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
*/
Py_hash_t
_Py_HashDouble(double v)
{
    int e, sign;
    double m;
    Py_uhash_t x, y;

    if (!Py_IS_FINITE(v)) {
        if (Py_IS_INFINITY(v))
            return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
        else
            return _PyHASH_NAN;
    }

    m = frexp(v, &e);

    sign = 1;
    if (m < 0) {
        sign = -1;
        m = -m;
    }

    /* process 28 bits at a time; this should work well both for binary
       and hexadecimal floating point. */
    x = 0;
    while (m) {
        x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
        m *= 268435456.0;  /* 2**28 */
        e -= 28;
        y = (Py_uhash_t)m;  /* pull out integer part */
        m -= y;
        x += y;
        if (x >= _PyHASH_MODULUS)
            x -= _PyHASH_MODULUS;
    }

    /* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
    e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
    x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);

    x = x * sign;
    if (x == (Py_uhash_t)-1)
        x = (Py_uhash_t)-2;
    return (Py_hash_t)x;
}
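
The exponent adjustment at the end relies on the identity mentioned in the comment: with P = 2**n - 1, multiplying by 2**(e mod n) modulo P is just an n-bit rotation. A small standalone demonstration of that identity (helper names are hypothetical; assumes the 64-bit case where _PyHASH_BITS is 61):

#include <stdint.h>
#include <stdio.h>

#define BITS 61
#define P (((uint64_t)1 << BITS) - 1)   /* 2**61 - 1 */

/* reduce(x * 2**e) computed as a rotation within the low 61 bits */
static uint64_t rotate_reduce(uint64_t x, unsigned e)
{
    e %= BITS;
    return ((x << e) & P) | (x >> (BITS - e));
}

/* reference computation: e modular doublings */
static uint64_t slow_reduce(uint64_t x, unsigned e)
{
    uint64_t r = x % P;
    while (e--)
        r = (2 * r) % P;
    return r;
}

int main(void)
{
    uint64_t x = 123456789012345ULL % P;
    unsigned e = 200;   /* arbitrary large exponent */
    /* both lines print the same value */
    printf("%llu\n", (unsigned long long)rotate_reduce(x, e));
    printf("%llu\n", (unsigned long long)slow_reduce(x, e));
    return 0;
}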
Py_hash_t
_Py_HashPointer(void *p)
{
    Py_hash_t x;
    size_t y = (size_t)p;
    /* bottom 3 or 4 bits are likely to be 0; rotate y by 4 to avoid
       excessive hash collisions for dicts and sets */
    y = (y >> 4) | (y << (8 * SIZEOF_VOID_P - 4));
    x = (Py_hash_t)y;
    if (x == -1)
        x = -2;
    return x;
}
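
The rotation matters because heap addresses are usually 8- or 16-byte aligned: without it the low bits of every hash are zero and small dict/set tables, which derive the first probe slot from hash & (size-1), pile everything into one bucket. A tiny illustration of the effect (hypothetical addresses, 64-bit pointers, 16-byte alignment, an 8-slot table):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t addrs[8];
    int i;
    /* eight fake 16-byte-aligned addresses, as an allocator might hand out */
    for (i = 0; i < 8; i++)
        addrs[i] = 0x10000000ULL + 16ULL * (uint64_t)i;

    for (i = 0; i < 8; i++) {
        uint64_t plain   = addrs[i];
        uint64_t rotated = (addrs[i] >> 4) | (addrs[i] << 60);
        /* slot index in an 8-entry table: all zeros vs. nicely spread */
        printf("plain slot %llu   rotated slot %llu\n",
               (unsigned long long)(plain & 7),
               (unsigned long long)(rotated & 7));
    }
    return 0;
}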
Py_hash_t
_Py_HashBytes(unsigned char *p, Py_ssize_t len)
{
    Py_uhash_t x;
    Py_ssize_t i;

    /*
      We make the hash of the empty string be 0, rather than using
      (prefix ^ suffix), since this slightly obfuscates the hash secret
    */
#ifdef Py_DEBUG
    assert(_Py_HashSecret_Initialized);
#endif
    if (len == 0) {
        return 0;
    }
    x = (Py_uhash_t) _Py_HashSecret.prefix;
    x ^= (Py_uhash_t) *p << 7;
    for (i = 0; i < len; i++)
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
    x ^= (Py_uhash_t) len;
    x ^= (Py_uhash_t) _Py_HashSecret.suffix;
    if (x == -1)
        x = -2;
    return x;
}
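
This modified-FNV loop is what PEP 456 demotes to an optional fallback; by default bytes-like data is now hashed with SipHash-2-4, keyed by the 128-bit _Py_HashSecret. For orientation, a minimal standalone sketch of SipHash-2-4 written from the published algorithm (this is not the code the commit adds in Python/pyhash.c; byte-wise loads keep it endian- and alignment-agnostic):

#include <stdint.h>
#include <stddef.h>

#define ROTL64(x, b) (uint64_t)(((x) << (b)) | ((x) >> (64 - (b))))

#define SIPROUND do {                                                   \
        v0 += v1; v1 = ROTL64(v1, 13); v1 ^= v0; v0 = ROTL64(v0, 32);   \
        v2 += v3; v3 = ROTL64(v3, 16); v3 ^= v2;                        \
        v0 += v3; v3 = ROTL64(v3, 21); v3 ^= v0;                        \
        v2 += v1; v1 = ROTL64(v1, 17); v1 ^= v2; v2 = ROTL64(v2, 32);   \
    } while (0)

/* k0/k1: the 128-bit secret key (CPython draws it from _Py_HashSecret). */
static uint64_t
siphash24(uint64_t k0, uint64_t k1, const void *src, size_t len)
{
    uint64_t v0 = 0x736f6d6570736575ULL ^ k0;
    uint64_t v1 = 0x646f72616e646f6dULL ^ k1;
    uint64_t v2 = 0x6c7967656e657261ULL ^ k0;
    uint64_t v3 = 0x7465646279746573ULL ^ k1;
    const uint8_t *p = (const uint8_t *)src;
    uint64_t b = (uint64_t)len << 56;   /* length byte goes into the last block */
    uint64_t m;
    size_t i;

    /* compression: two SipRounds per little-endian 64-bit block */
    for (; len >= 8; len -= 8, p += 8) {
        m = 0;
        for (i = 0; i < 8; i++)
            m |= (uint64_t)p[i] << (8 * i);
        v3 ^= m; SIPROUND; SIPROUND; v0 ^= m;
    }
    /* last block: the 0-7 leftover bytes plus the length byte */
    for (i = 0; i < len; i++)
        b |= (uint64_t)p[i] << (8 * i);
    v3 ^= b; SIPROUND; SIPROUND; v0 ^= b;

    /* finalization: four more SipRounds */
    v2 ^= 0xff;
    SIPROUND; SIPROUND; SIPROUND; SIPROUND;
    return v0 ^ v1 ^ v2 ^ v3;
}

The 64-bit result is then folded into a Py_hash_t, and -1 is still remapped to -2, as in the removed code above, because -1 is reserved for error returns.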
Py_hash_t
PyObject_HashNotImplemented(PyObject *v)
{

@@ -883,8 +739,6 @@ PyObject_HashNotImplemented(PyObject *v)
    return -1;
}

_Py_HashSecret_t _Py_HashSecret;

Py_hash_t
PyObject_Hash(PyObject *v)
{

@@ -11386,39 +11386,8 @@ unicode_hash(PyObject *self)
        _PyUnicode_HASH(self) = 0;
        return 0;
    }

    /* The hash function as a macro, gets expanded three times below. */
#define HASH(P)                                               \
    x ^= (Py_uhash_t) *P << 7;                                \
    while (--len >= 0)                                        \
        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;     \

    x = (Py_uhash_t) _Py_HashSecret.prefix;
    switch (PyUnicode_KIND(self)) {
    case PyUnicode_1BYTE_KIND: {
        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
        HASH(c);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
        HASH(s);
        break;
    }
    default: {
        Py_UCS4 *l;
        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
               "Impossible switch case in unicode_hash");
        l = PyUnicode_4BYTE_DATA(self);
        HASH(l);
        break;
    }
    }

    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
    x ^= (Py_uhash_t) _Py_HashSecret.suffix;

    if (x == -1)
        x = -2;

    x = _Py_HashBytes(PyUnicode_DATA(self),
                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
    _PyUnicode_HASH(self) = x;
    return x;
}
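
The per-kind switch is no longer needed: unicode_hash now feeds the canonical fixed-width buffer straight into _Py_HashBytes. Equal strings still hash equally because a ready unicode object always uses the narrowest kind that can represent its contents, so equal strings share both the kind and the raw bytes. A hypothetical helper (not part of the commit) spelling out the buffer size involved:

#include <Python.h>

/* Bytes fed to _Py_HashBytes for a ready (canonical) unicode object:
 * length in code points times 1, 2 or 4 bytes per code point. */
static Py_ssize_t
unicode_hash_input_size(PyObject *u)
{
    return PyUnicode_GET_LENGTH(u) * PyUnicode_KIND(u);
}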