Issue #8188: Introduce a new scheme for computing hashes of numbers

(instances of int, float, complex, decimal.Decimal and fractions.Fraction) that makes it easy to maintain the invariant that hash(x) == hash(y) whenever x and y have equal value.
2025-11-17 17:46:45 +00:00 · 2010-05-23 13:33:13 +00:00 · 2010-05-23 13:33:13 +00:00 · dc787d2055
commit dc787d2055
parent 03721133a6
14 changed files with 566 additions and 137 deletions
--- a/Objects/complexobject.c
+++ b/Objects/complexobject.c
@ -403,12 +403,12 @@ complex_str(PyComplexObject *v)
 static long
 complex_hash(PyComplexObject *v)
 {
-    long hashreal, hashimag, combined;
-    hashreal = _Py_HashDouble(v->cval.real);
-    if (hashreal == -1)
+    unsigned long hashreal, hashimag, combined;
+    hashreal = (unsigned long)_Py_HashDouble(v->cval.real);
+    if (hashreal == (unsigned long)-1)
        return -1;
-    hashimag = _Py_HashDouble(v->cval.imag);
-    if (hashimag == -1)
+    hashimag = (unsigned long)_Py_HashDouble(v->cval.imag);
+    if (hashimag == (unsigned long)-1)
        return -1;
    /* Note:  if the imaginary part is 0, hashimag is 0 now,
     * so the following returns hashreal unchanged.  This is
@ -416,10 +416,10 @@ complex_hash(PyComplexObject *v)
     * compare equal must have the same hash value, so that
     * hash(x + 0*j) must equal hash(x).
     */
-    combined = hashreal + 1000003 * hashimag;
-    if (combined == -1)
-        combined = -2;
-    return combined;
+    combined = hashreal + _PyHASH_IMAG * hashimag;
+    if (combined == (unsigned long)-1)
+        combined = (unsigned long)-2;
+    return (long)combined;
 }

 /* This macro may return! */
--- a/Objects/longobject.c
+++ b/Objects/longobject.c
@ -2571,18 +2571,37 @@ long_hash(PyLongObject *v)
        sign = -1;
        i = -(i);
    }
-    /* The following loop produces a C unsigned long x such that x is
-       congruent to the absolute value of v modulo ULONG_MAX.  The
-       resulting x is nonzero if and only if v is. */
    while (--i >= 0) {
-        /* Force a native long #-bits (32 or 64) circular shift */
-        x = (x >> (8*SIZEOF_LONG-PyLong_SHIFT)) | (x << PyLong_SHIFT);
+        /* Here x is a quantity in the range [0, _PyHASH_MODULUS); we
+           want to compute x * 2**PyLong_SHIFT + v->ob_digit[i] modulo
+           _PyHASH_MODULUS.
+
+           The computation of x * 2**PyLong_SHIFT % _PyHASH_MODULUS
+           amounts to a rotation of the bits of x.  To see this, write
+
+             x * 2**PyLong_SHIFT = y * 2**_PyHASH_BITS + z
+
+           where y = x >> (_PyHASH_BITS - PyLong_SHIFT) gives the top
+           PyLong_SHIFT bits of x (those that are shifted out of the
+           original _PyHASH_BITS bits, and z = (x << PyLong_SHIFT) &
+           _PyHASH_MODULUS gives the bottom _PyHASH_BITS - PyLong_SHIFT
+           bits of x, shifted up.  Then since 2**_PyHASH_BITS is
+           congruent to 1 modulo _PyHASH_MODULUS, y*2**_PyHASH_BITS is
+           congruent to y modulo _PyHASH_MODULUS.  So
+
+             x * 2**PyLong_SHIFT = y + z (mod _PyHASH_MODULUS).
+
+           The right-hand side is just the result of rotating the
+           _PyHASH_BITS bits of x left by PyLong_SHIFT places; since
+           not all _PyHASH_BITS bits of x are 1s, the same is true
+           after rotation, so 0 <= y+z < _PyHASH_MODULUS and y + z is
+           the reduction of x*2**PyLong_SHIFT modulo
+           _PyHASH_MODULUS. */
+        x = ((x << PyLong_SHIFT) & _PyHASH_MODULUS) |
+            (x >> (_PyHASH_BITS - PyLong_SHIFT));
        x += v->ob_digit[i];
-        /* If the addition above overflowed we compensate by
-           incrementing.  This preserves the value modulo
-           ULONG_MAX. */
-        if (x < v->ob_digit[i])
-            x++;
+        if (x >= _PyHASH_MODULUS)
+            x -= _PyHASH_MODULUS;
    }
    x = x * sign;
    if (x == (unsigned long)-1)
--- a/Objects/object.c
+++ b/Objects/object.c
@ -647,63 +647,101 @@ PyObject_RichCompareBool(PyObject *v, PyObject *w, int op)
   All the utility functions (_Py_Hash*()) return "-1" to signify an error.
 */

+/* For numeric types, the hash of a number x is based on the reduction
+   of x modulo the prime P = 2**_PyHASH_BITS - 1.  It's designed so that
+   hash(x) == hash(y) whenever x and y are numerically equal, even if
+   x and y have different types.
+
+   A quick summary of the hashing strategy:
+
+   (1) First define the 'reduction of x modulo P' for any rational
+   number x; this is a standard extension of the usual notion of
+   reduction modulo P for integers.  If x == p/q (written in lowest
+   terms), the reduction is interpreted as the reduction of p times
+   the inverse of the reduction of q, all modulo P; if q is exactly
+   divisible by P then define the reduction to be infinity.  So we've
+   got a well-defined map
+
+      reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
+
+   (2) Now for a rational number x, define hash(x) by:
+
+      reduce(x)   if x >= 0
+      -reduce(-x) if x < 0
+
+   If the result of the reduction is infinity (this is impossible for
+   integers, floats and Decimals) then use the predefined hash value
+   _PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
+   _PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
+   hashes of float and Decimal infinities and nans.
+
+   A selling point for the above strategy is that it makes it possible
+   to compute hashes of decimal and binary floating-point numbers
+   efficiently, even if the exponent of the binary or decimal number
+   is large.  The key point is that
+
+      reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
+
+   provided that {reduce(x), reduce(y)} != {0, infinity}.  The reduction of a
+   binary or decimal float is never infinity, since the denominator is a power
+   of 2 (for binary) or a divisor of a power of 10 (for decimal).  So we have,
+   for nonnegative x,
+
+      reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
+
+      reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
+
+   and reduce(10**e) can be computed efficiently by the usual modular
+   exponentiation algorithm.  For reduce(2**e) it's even better: since
+   P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
+   by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
+
+   */
+
 long
 _Py_HashDouble(double v)
 {
-    double intpart, fractpart;
-    int expo;
-    long hipart;
-    long x;             /* the final hash value */
-    /* This is designed so that Python numbers of different types
-     * that compare equal hash to the same value; otherwise comparisons
-     * of mapping keys will turn out weird.
-     */
+    int e, sign;
+    double m;
+    unsigned long x, y;

    if (!Py_IS_FINITE(v)) {
        if (Py_IS_INFINITY(v))
-            return v < 0 ? -271828 : 314159;
+            return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
        else
-            return 0;
+            return _PyHASH_NAN;
    }
-    fractpart = modf(v, &intpart);
-    if (fractpart == 0.0) {
-        /* This must return the same hash as an equal int or long. */
-        if (intpart > LONG_MAX/2 || -intpart > LONG_MAX/2) {
-            /* Convert to long and use its hash. */
-            PyObject *plong;                    /* converted to Python long */
-            plong = PyLong_FromDouble(v);
-            if (plong == NULL)
-                return -1;
-            x = PyObject_Hash(plong);
-            Py_DECREF(plong);
-            return x;
-        }
-        /* Fits in a C long == a Python int, so is its own hash. */
-        x = (long)intpart;
-        if (x == -1)
-            x = -2;
-        return x;
+
+    m = frexp(v, &e);
+
+    sign = 1;
+    if (m < 0) {
+        sign = -1;
+        m = -m;
    }
-    /* The fractional part is non-zero, so we don't have to worry about
-     * making this match the hash of some other type.
-     * Use frexp to get at the bits in the double.
-     * Since the VAX D double format has 56 mantissa bits, which is the
-     * most of any double format in use, each of these parts may have as
-     * many as (but no more than) 56 significant bits.
-     * So, assuming sizeof(long) >= 4, each part can be broken into two
-     * longs; frexp and multiplication are used to do that.
-     * Also, since the Cray double format has 15 exponent bits, which is
-     * the most of any double format in use, shifting the exponent field
-     * left by 15 won't overflow a long (again assuming sizeof(long) >= 4).
-     */
-    v = frexp(v, &expo);
-    v *= 2147483648.0;          /* 2**31 */
-    hipart = (long)v;           /* take the top 32 bits */
-    v = (v - (double)hipart) * 2147483648.0; /* get the next 32 bits */
-    x = hipart + (long)v + (expo << 15);
-    if (x == -1)
-        x = -2;
-    return x;
+
+    /* process 28 bits at a time;  this should work well both for binary
+       and hexadecimal floating point. */
+    x = 0;
+    while (m) {
+        x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
+        m *= 268435456.0;  /* 2**28 */
+        e -= 28;
+        y = (unsigned long)m;  /* pull out integer part */
+        m -= y;
+        x += y;
+        if (x >= _PyHASH_MODULUS)
+            x -= _PyHASH_MODULUS;
+    }
+
+    /* adjust for the exponent;  first reduce it modulo _PyHASH_BITS */
+    e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
+    x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
+
+    x = x * sign;
+    if (x == (unsigned long)-1)
+        x = (unsigned long)-2;
+    return (long)x;
 }

 long
--- a/Objects/typeobject.c
+++ b/Objects/typeobject.c
@ -4921,6 +4921,7 @@ slot_tp_hash(PyObject *self)
    PyObject *func, *res;
    static PyObject *hash_str;
    long h;
+    int overflow;

    func = lookup_method(self, "__hash__", &hash_str);

@ -4937,14 +4938,27 @@ slot_tp_hash(PyObject *self)
    Py_DECREF(func);
    if (res == NULL)
        return -1;
-    if (PyLong_Check(res))
+
+    if (!PyLong_Check(res)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "__hash__ method should return an integer");
+        return -1;
+    }
+    /* Transform the PyLong `res` to a C long `h`.  For an existing
+       hashable Python object x, hash(x) will always lie within the range
+       of a C long.  Therefore our transformation must preserve values
+       that already lie within this range, to ensure that if x.__hash__()
+       returns hash(y) then hash(x) == hash(y). */
+    h = PyLong_AsLongAndOverflow(res, &overflow);
+    if (overflow)
+        /* res was not within the range of a C long, so we're free to
+           use any sufficiently bit-mixing transformation;
+           long.__hash__ will do nicely. */
        h = PyLong_Type.tp_hash(res);
-    else
-        h = PyLong_AsLong(res);
    Py_DECREF(res);
-           if (h == -1 && !PyErr_Occurred())
-           h = -2;
-           return h;
+    if (h == -1 && !PyErr_Occurred())
+        h = -2;
+    return h;
 }

 static PyObject *