144 lines
5.3 KiB
Diff
144 lines
5.3 KiB
Diff
From 95c9c2b9cb2d3c1d29c8ce77f154de8bd5313dae Mon Sep 17 00:00:00 2001
|
|
From: "Miss Islington (bot)"
|
|
<31488909+miss-islington@users.noreply.github.com>
|
|
Date: Tue, 24 May 2022 01:52:49 -0700
|
|
Subject: [PATCH] gh-93065: Fix HAMT to iterate correctly over 7-level
|
|
deep
|
|
trees (GH-93066) (#93147)
|
|
MIME-Version: 1.0
|
|
Content-Type: text/plain; charset=UTF-8
|
|
Content-Transfer-Encoding: 8bit
|
|
|
|
Also while there, clarify a few things about why we reduce the hash to
|
|
32 bits.
|
|
|
|
Co-authored-by: Eli Libman <eli@hyro.ai>
|
|
Co-authored-by: Yury Selivanov <yury@edgedb.com>
|
|
Co-authored-by: Łukasz Langa <lukasz@langa.pl>
|
|
|
|
(cherry picked from commit c1f5c903a7e4ed27190488f4e33b00d3c3d952e5)
|
|
|
|
---
|
|
Include/internal/pycore_hamt.h | 14 +++++++++++++-
|
|
Lib/test/test_context.py | 35 ++++++++++++++++++++++++++++++++++
|
|
Misc/ACKS | 1 +
|
|
Python/hamt.c | 14 +++++++++++---
|
|
4 files changed, 60 insertions(+), 4 deletions(-)
|
|
|
|
diff --git a/Include/internal/pycore_hamt.h b/Include/internal/pycore_hamt.h
|
|
index aaf6559..357d966 100644
|
|
--- a/Include/internal/pycore_hamt.h
|
|
+++ b/Include/internal/pycore_hamt.h
|
|
@@ -5,7 +5,19 @@
|
|
# error "this header requires Py_BUILD_CORE define"
|
|
#endif
|
|
|
|
-#define _Py_HAMT_MAX_TREE_DEPTH 7
|
|
+
|
|
+/*
|
|
+HAMT tree is shaped by hashes of keys. Every group of 5 bits of a hash denotes
|
|
+the exact position of the key in one level of the tree. Since we're using
|
|
+32 bit hashes, we can have at most 7 such levels. Although if there are
|
|
+two distinct keys with equal hashes, they will have to occupy the same
|
|
+cell in the 7th level of the tree -- so we'd put them in a "collision" node.
|
|
+Which brings the total possible tree depth to 8. Read more about the actual
|
|
+layout of the HAMT tree in `hamt.c`.
|
|
+
|
|
+This constant is used to define a datastucture for storing iteration state.
|
|
+*/
|
|
+#define _Py_HAMT_MAX_TREE_DEPTH 8
|
|
|
|
|
|
#define PyHamt_Check(o) Py_IS_TYPE(o, &_PyHamt_Type)
|
|
diff --git a/Lib/test/test_context.py b/Lib/test/test_context.py
|
|
index 2d8b63a..689e3d4 100644
|
|
--- a/Lib/test/test_context.py
|
|
+++ b/Lib/test/test_context.py
|
|
@@ -533,6 +533,41 @@ class HamtTest(unittest.TestCase):
|
|
self.assertEqual(len(h4), 2)
|
|
self.assertEqual(len(h5), 3)
|
|
|
|
+ def test_hamt_collision_3(self):
|
|
+ # Test that iteration works with the deepest tree possible.
|
|
+ # https://github.com/python/cpython/issues/93065
|
|
+
|
|
+ C = HashKey(0b10000000_00000000_00000000_00000000, 'C')
|
|
+ D = HashKey(0b10000000_00000000_00000000_00000000, 'D')
|
|
+
|
|
+ E = HashKey(0b00000000_00000000_00000000_00000000, 'E')
|
|
+
|
|
+ h = hamt()
|
|
+ h = h.set(C, 'C')
|
|
+ h = h.set(D, 'D')
|
|
+ h = h.set(E, 'E')
|
|
+
|
|
+ # BitmapNode(size=2 count=1 bitmap=0b1):
|
|
+ # NULL:
|
|
+ # BitmapNode(size=2 count=1 bitmap=0b1):
|
|
+ # NULL:
|
|
+ # BitmapNode(size=2 count=1 bitmap=0b1):
|
|
+ # NULL:
|
|
+ # BitmapNode(size=2 count=1 bitmap=0b1):
|
|
+ # NULL:
|
|
+ # BitmapNode(size=2 count=1 bitmap=0b1):
|
|
+ # NULL:
|
|
+ # BitmapNode(size=2 count=1 bitmap=0b1):
|
|
+ # NULL:
|
|
+ # BitmapNode(size=4 count=2 bitmap=0b101):
|
|
+ # <Key name:E hash:0>: 'E'
|
|
+ # NULL:
|
|
+ # CollisionNode(size=4 id=0x107a24520):
|
|
+ # <Key name:C hash:2147483648>: 'C'
|
|
+ # <Key name:D hash:2147483648>: 'D'
|
|
+
|
|
+ self.assertEqual({k.name for k in h.keys()}, {'C', 'D', 'E'})
|
|
+
|
|
def test_hamt_stress(self):
|
|
COLLECTION_SIZE = 7000
|
|
TEST_ITERS_EVERY = 647
|
|
diff --git a/Misc/ACKS b/Misc/ACKS
|
|
index ac893ac..8699b98 100644
|
|
--- a/Misc/ACKS
|
|
+++ b/Misc/ACKS
|
|
@@ -1031,6 +1031,7 @@ Robert Li
|
|
Xuanji Li
|
|
Zekun Li
|
|
Zheao Li
|
|
+Eli Libman
|
|
Dan Lidral-Porter
|
|
Robert van Liere
|
|
Ross Light
|
|
diff --git a/Python/hamt.c b/Python/hamt.c
|
|
index 8801c5e..3296109 100644
|
|
--- a/Python/hamt.c
|
|
+++ b/Python/hamt.c
|
|
@@ -407,14 +407,22 @@ hamt_hash(PyObject *o)
|
|
return -1;
|
|
}
|
|
|
|
- /* While it's suboptimal to reduce Python's 64 bit hash to
|
|
+ /* While it's somewhat suboptimal to reduce Python's 64 bit hash to
|
|
32 bits via XOR, it seems that the resulting hash function
|
|
is good enough (this is also how Long type is hashed in Java.)
|
|
Storing 10, 100, 1000 Python strings results in a relatively
|
|
shallow and uniform tree structure.
|
|
|
|
- Please don't change this hashing algorithm, as there are many
|
|
- tests that test some exact tree shape to cover all code paths.
|
|
+ Also it's worth noting that it would be possible to adapt the tree
|
|
+ structure to 64 bit hashes, but that would increase memory pressure
|
|
+ and provide little to no performance benefits for collections with
|
|
+ fewer than billions of key/value pairs.
|
|
+
|
|
+ Important: do not change this hash reducing function. There are many
|
|
+ tests that need an exact tree shape to cover all code paths and
|
|
+ we do that by specifying concrete values for test data's `__hash__`.
|
|
+ If this function is changed most of the regression tests would
|
|
+ become useless.
|
|
*/
|
|
int32_t xored = (int32_t)(hash & 0xffffffffl) ^ (int32_t)(hash >> 32);
|
|
return xored == -1 ? -2 : xored;
|
|
--
|
|
2.33.0
|
|
|