mirror of
https://github.com/python/cpython.git
synced 2025-09-01 22:47:59 +00:00
gh-93065: Fix HAMT to iterate correctly over 7-level deep trees (GH-93066) (GH-93146)
Also while there, clarify a few things about why we reduce the hash to 32 bits.
Co-authored-by: Eli Libman <eli@hyro.ai>
Co-authored-by: Yury Selivanov <yury@edgedb.com>
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
(cherry picked from commit c1f5c903a7)
This commit is contained in:
parent
c1b12495f6
commit
a4bea26ee4
5 changed files with 65 additions and 4 deletions
|
@ -5,7 +5,19 @@
|
||||||
# error "this header requires Py_BUILD_CORE define"
|
# error "this header requires Py_BUILD_CORE define"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define _Py_HAMT_MAX_TREE_DEPTH 7
|
|
||||||
|
/*
|
||||||
|
HAMT tree is shaped by hashes of keys. Every group of 5 bits of a hash denotes
|
||||||
|
the exact position of the key in one level of the tree. Since we're using
|
||||||
|
32 bit hashes, we can have at most 7 such levels. Although if there are
|
||||||
|
two distinct keys with equal hashes, they will have to occupy the same
|
||||||
|
cell in the 7th level of the tree -- so we'd put them in a "collision" node.
|
||||||
|
Which brings the total possible tree depth to 8. Read more about the actual
|
||||||
|
layout of the HAMT tree in `hamt.c`.
|
||||||
|
|
||||||
|
This constant is used to define a data structure for storing iteration state.
|
||||||
|
*/
|
||||||
|
#define _Py_HAMT_MAX_TREE_DEPTH 8
|
||||||
|
|
||||||
|
|
||||||
#define PyHamt_Check(o) Py_IS_TYPE(o, &_PyHamt_Type)
|
#define PyHamt_Check(o) Py_IS_TYPE(o, &_PyHamt_Type)
|
||||||
|
|
|
@ -533,6 +533,41 @@ class HamtTest(unittest.TestCase):
|
||||||
self.assertEqual(len(h4), 2)
|
self.assertEqual(len(h4), 2)
|
||||||
self.assertEqual(len(h5), 3)
|
self.assertEqual(len(h5), 3)
|
||||||
|
|
||||||
|
def test_hamt_collision_3(self):
|
||||||
|
# Test that iteration works with the deepest tree possible.
|
||||||
|
# https://github.com/python/cpython/issues/93065
|
||||||
|
|
||||||
|
C = HashKey(0b10000000_00000000_00000000_00000000, 'C')
|
||||||
|
D = HashKey(0b10000000_00000000_00000000_00000000, 'D')
|
||||||
|
|
||||||
|
E = HashKey(0b00000000_00000000_00000000_00000000, 'E')
|
||||||
|
|
||||||
|
h = hamt()
|
||||||
|
h = h.set(C, 'C')
|
||||||
|
h = h.set(D, 'D')
|
||||||
|
h = h.set(E, 'E')
|
||||||
|
|
||||||
|
# BitmapNode(size=2 count=1 bitmap=0b1):
|
||||||
|
# NULL:
|
||||||
|
# BitmapNode(size=2 count=1 bitmap=0b1):
|
||||||
|
# NULL:
|
||||||
|
# BitmapNode(size=2 count=1 bitmap=0b1):
|
||||||
|
# NULL:
|
||||||
|
# BitmapNode(size=2 count=1 bitmap=0b1):
|
||||||
|
# NULL:
|
||||||
|
# BitmapNode(size=2 count=1 bitmap=0b1):
|
||||||
|
# NULL:
|
||||||
|
# BitmapNode(size=2 count=1 bitmap=0b1):
|
||||||
|
# NULL:
|
||||||
|
# BitmapNode(size=4 count=2 bitmap=0b101):
|
||||||
|
# <Key name:E hash:0>: 'E'
|
||||||
|
# NULL:
|
||||||
|
# CollisionNode(size=4 id=0x107a24520):
|
||||||
|
# <Key name:C hash:2147483648>: 'C'
|
||||||
|
# <Key name:D hash:2147483648>: 'D'
|
||||||
|
|
||||||
|
self.assertEqual({k.name for k in h.keys()}, {'C', 'D', 'E'})
|
||||||
|
|
||||||
def test_hamt_stress(self):
|
def test_hamt_stress(self):
|
||||||
COLLECTION_SIZE = 7000
|
COLLECTION_SIZE = 7000
|
||||||
TEST_ITERS_EVERY = 647
|
TEST_ITERS_EVERY = 647
|
||||||
|
|
|
@ -1054,6 +1054,7 @@ Robert Li
|
||||||
Xuanji Li
|
Xuanji Li
|
||||||
Zekun Li
|
Zekun Li
|
||||||
Zheao Li
|
Zheao Li
|
||||||
|
Eli Libman
|
||||||
Dan Lidral-Porter
|
Dan Lidral-Porter
|
||||||
Robert van Liere
|
Robert van Liere
|
||||||
Ross Light
|
Ross Light
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
Fix contextvars HAMT implementation to handle iteration over deep trees.
|
||||||
|
|
||||||
|
The bug was discovered and fixed by Eli Libman. See
|
||||||
|
`MagicStack/immutables#84 <https://github.com/MagicStack/immutables/issues/84>`_
|
||||||
|
for more details.
|
|
@ -408,14 +408,22 @@ hamt_hash(PyObject *o)
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* While it's suboptimal to reduce Python's 64 bit hash to
|
/* While it's somewhat suboptimal to reduce Python's 64 bit hash to
|
||||||
32 bits via XOR, it seems that the resulting hash function
|
32 bits via XOR, it seems that the resulting hash function
|
||||||
is good enough (this is also how Long type is hashed in Java.)
|
is good enough (this is also how Long type is hashed in Java.)
|
||||||
Storing 10, 100, 1000 Python strings results in a relatively
|
Storing 10, 100, 1000 Python strings results in a relatively
|
||||||
shallow and uniform tree structure.
|
shallow and uniform tree structure.
|
||||||
|
|
||||||
Please don't change this hashing algorithm, as there are many
|
Also it's worth noting that it would be possible to adapt the tree
|
||||||
tests that test some exact tree shape to cover all code paths.
|
structure to 64 bit hashes, but that would increase memory pressure
|
||||||
|
and provide little to no performance benefits for collections with
|
||||||
|
fewer than billions of key/value pairs.
|
||||||
|
|
||||||
|
Important: do not change this hash reducing function. There are many
|
||||||
|
tests that need an exact tree shape to cover all code paths and
|
||||||
|
we do that by specifying concrete values for test data's `__hash__`.
|
||||||
|
If this function is changed most of the regression tests would
|
||||||
|
become useless.
|
||||||
*/
|
*/
|
||||||
int32_t xored = (int32_t)(hash & 0xffffffffl) ^ (int32_t)(hash >> 32);
|
int32_t xored = (int32_t)(hash & 0xffffffffl) ^ (int32_t)(hash >> 32);
|
||||||
return xored == -1 ? -2 : xored;
|
return xored == -1 ? -2 : xored;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue