Bring comments up to date (e.g., they still said the table had to be

a prime size, which is in fact never true anymore ...).
2025-09-02 15:07:53 +00:00 · 2000-12-13 01:02:46 +00:00 · 2000-12-13 01:02:46 +00:00 · ea8f2bf9ca
commit ea8f2bf9ca
parent 8152d32375
1 changed files with 40 additions and 23 deletions
--- a/Objects/dictobject.c
+++ b/Objects/dictobject.c
@ -15,7 +15,7 @@
 /*
 Table of irreducible polynomials to efficiently cycle through
-GF(2^n)-{0}, 2<=n<=30.
+GF(2^n)-{0}, 2<=n<=30.  A table size is always a power of 2.
 */
 static long polys[] = {
 	4 + 3,
@ -54,13 +54,26 @@ static long polys[] = {
 static PyObject *dummy; /* Initialized by first call to newdictobject() */
 /*
-Invariant for entries: when in use, me_value is not NULL and me_key is
+There are three kinds of slots in the table:
-not NULL and not dummy; when not in use, me_value is NULL and me_key
+
-is either NULL or dummy.  A dummy key value cannot be replaced by
+1. Unused.  me_key == me_value == NULL
-NULL, since otherwise other keys may be lost.
+   Does not hold an active (key, value) pair now and never did.  Unused can
   transition to Active upon key insertion.  This is the only case in which
   me_key is NULL, and is each slot's initial state.
 2. Active.  me_key != NULL and me_key != dummy and me_value != NULL
   Holds an active (key, value) pair.  Active can transition to Dummy upon
   key deletion.  This is the only case in which me_value != NULL.
 3. Dummy.  me_key == dummy && me_value == NULL
   Previously held an active (key, value) pair, but that was deleted and an
   active pair has not yet overwritten the slot.  Dummy can transition to
   Active upon key insertion.  Dummy slots cannot be made Unused again
   (cannot have me_key set to NULL), else the probe sequence in case of
   collision would have no way to know they were once active.
 */
 typedef struct {
-	long me_hash;
+	long me_hash;      /* cached hash code of me_key */
 	PyObject *me_key;
 	PyObject *me_value;
 #ifdef USE_CACHE_ALIGNED
@ -69,20 +82,21 @@ typedef struct {
 } dictentry;
 /*
-To ensure the lookup algorithm terminates, the table size must be a
+To ensure the lookup algorithm terminates, there must be at least one Unsused
-prime number and there must be at least one NULL key in the table.
+slot (NULL key) in the table.
-The value ma_fill is the number of non-NULL keys; ma_used is the number
+The value ma_fill is the number of non-NULL keys (sum of Active and Dummy);
-of non-NULL, non-dummy keys.
+ma_used is the number of non-NULL, non-dummy keys (== the number of non-NULL
-To avoid slowing down lookups on a near-full table, we resize the table
+values == the number of Active items).
-when it is more than half filled.
+To avoid slowing down lookups on a near-full table, we resize the table when
 it is more than half filled.
 */
 typedef struct dictobject dictobject;
 struct dictobject {
 	PyObject_HEAD
-	int ma_fill;
+	int ma_fill;  /* # Active + # Dummy */
-	int ma_used;
+	int ma_used;  /* # Active */
-	int ma_size;
+	int ma_size;  /* total # slots in ma_table */
-	int ma_poly;
+	int ma_poly;  /* appopriate entry from polys vector */
 	dictentry *ma_table;
 	dictentry *(*ma_lookup)(dictobject *mp, PyObject *key, long hash);
 };
@ -138,12 +152,12 @@ This is based on Algorithm D from Knuth Vol. 3, Sec. 6.4.
 Open addressing is preferred over chaining since the link overhead for
 chaining would be substantial (100% with typical malloc overhead).
 However, instead of going through the table at constant steps, we cycle
-through the values of GF(2^n)-{0}. This avoids modulo computations, being
+through the values of GF(2^n).  This avoids modulo computations, being
 much cheaper on RISC machines, without leading to clustering.
 The initial probe index is computed as hash mod the table size.
-Subsequent probe indices use the values of x^i in GF(2^n) as an offset,
+Subsequent probe indices use the values of x^i in GF(2^n)-{0} as an offset,
-where x is a root. The initial value is derived from hash, too.
+where x is a root. The initial offset is derived from hash, too.
 All arithmetic on hash should ignore overflow.
@ -168,11 +182,14 @@ lookdict(dictobject *mp, PyObject *key, register long hash)
 	register int cmp;
 	PyObject *err_type, *err_value, *err_tb;
 	/* We must come up with (i, incr) such that 0 <= i < ma_size
-	   and 0 < incr < ma_size and both are a function of hash */
+	   and 0 < incr < ma_size and both are a function of hash.
 	   i is the initial table index and incr the initial probe offset. */
 	i = (~hash) & mask;
 	/* We use ~hash instead of hash, as degenerate hash functions, such
 	   as for ints <sigh>, can have lots of leading zeros. It's not
-	   really a performance risk, but better safe than sorry. */
+	   really a performance risk, but better safe than sorry.
 	   12-Dec-00 tim:  so ~hash produces lots of leading ones instead --
 	   what's the gain? */
 	ep = &ep0[i];
 	if (ep->me_key == NULL || ep->me_key == key)
 		return ep;