bpo-34561: Switch to Munro & Wild "powersort" merge strategy. (#28108)

For list.sort(), replace our ad hoc merge ordering strategy with the principled, elegant,
and provably near-optimal one from Munro and Wild's "powersort".
This commit is contained in:
Tim Peters 2021-09-06 12:54:41 -05:00 committed by GitHub
parent 19871fce3b
commit 5cb4c672d8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 163 additions and 77 deletions

View file

@ -0,0 +1 @@
List sorting now uses the merge-ordering strategy from Munro and Wild's ``powersort()``. Unlike the former strategy, this is provably near-optimal in the entropy of the distribution of run lengths. Most uses of ``list.sort()`` probably won't see a significant time difference, but may see significant improvements in cases where the former strategy was exceptionally poor. However, as these are all fast linear-time approximations to a problem that's inherently at best quadratic-time to solve truly optimally, it's also possible to contrive cases where the former strategy did better.

View file

@ -1139,12 +1139,11 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
if (k) if (k)
/* The maximum number of entries in a MergeState's pending-runs stack. /* The maximum number of entries in a MergeState's pending-runs stack.
* This is enough to sort arrays of size up to about * For a list with n elements, this needs at most floor(log2(n)) + 1 entries
* 32 * phi ** MAX_MERGE_PENDING * even if we didn't force runs to a minimal length. So the number of bits
* where phi ~= 1.618. 85 is ridiculously large enough, good for an array * in a Py_ssize_t is plenty large enough for all cases.
* with 2**64 elements.
*/ */
#define MAX_MERGE_PENDING 85 #define MAX_MERGE_PENDING (SIZEOF_SIZE_T * 8)
/* When we get into galloping mode, we stay there until both runs win less /* When we get into galloping mode, we stay there until both runs win less
* often than MIN_GALLOP consecutive times. See listsort.txt for more info. * often than MIN_GALLOP consecutive times. See listsort.txt for more info.
@ -1159,7 +1158,8 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
*/ */
struct s_slice { struct s_slice {
sortslice base; sortslice base;
Py_ssize_t len; Py_ssize_t len; /* length of run */
int power; /* node "level" for powersort merge strategy */
}; };
typedef struct s_MergeState MergeState; typedef struct s_MergeState MergeState;
@ -1170,6 +1170,9 @@ struct s_MergeState {
*/ */
Py_ssize_t min_gallop; Py_ssize_t min_gallop;
Py_ssize_t listlen; /* len(input_list) - read only */
PyObject **basekeys; /* base address of keys array - read only */
/* 'a' is temp storage to help with merges. It contains room for /* 'a' is temp storage to help with merges. It contains room for
* alloced entries. * alloced entries.
*/ */
@ -1513,7 +1516,8 @@ fail:
/* Conceptually a MergeState's constructor. */ /* Conceptually a MergeState's constructor. */
static void static void
merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc) merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc,
sortslice *lo)
{ {
assert(ms != NULL); assert(ms != NULL);
if (has_keyfunc) { if (has_keyfunc) {
@ -1538,6 +1542,8 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc)
ms->a.keys = ms->temparray; ms->a.keys = ms->temparray;
ms->n = 0; ms->n = 0;
ms->min_gallop = MIN_GALLOP; ms->min_gallop = MIN_GALLOP;
ms->listlen = list_size;
ms->basekeys = lo->keys;
} }
/* Free all the temp memory owned by the MergeState. This must be called /* Free all the temp memory owned by the MergeState. This must be called
@ -1920,37 +1926,74 @@ merge_at(MergeState *ms, Py_ssize_t i)
return merge_hi(ms, ssa, na, ssb, nb); return merge_hi(ms, ssa, na, ssb, nb);
} }
/* Examine the stack of runs waiting to be merged, merging adjacent runs /* Two adjacent runs begin at index s1. The first run has length n1, and
* until the stack invariants are re-established: * the second run (starting at index s1+n1) has length n2. The list has total
* length n.
* Compute the "power" of the first run. See listsort.txt for details.
*/
static int
powerloop(Py_ssize_t s1, Py_ssize_t n1, Py_ssize_t n2, Py_ssize_t n)
{
    assert(s1 >= 0);
    assert(n1 > 0 && n2 > 0);
    assert(s1 + n1 + n2 <= n);
    /* The two run midpoints are
     *     s1 + n1/2        and        s1 + n1 + n2/2
     * which need not be integers because of the "/2".  Work with their
     * doubled values instead, which are necessarily integral:
     *     left  = 2*s1 + n1
     *     right = left + n1 + n2
     * Doubling merely shifts every bit of the binary expansions of
     * left/n and right/n one position, so the first place the two
     * expansions differ — which is all we care about — is unchanged.
     */
    Py_ssize_t left = 2 * s1 + n1;      /* 2 * (first midpoint) */
    Py_ssize_t right = left + n1 + n2;  /* 2 * (second midpoint) */
    /* Generate quotient bits of left/n and right/n, most significant
     * first, until the two streams disagree.  The depth at which they
     * disagree is the run's "power" (its level in the conceptual merge
     * tree); see listsort.txt.
     */
    int depth = 0;
    for (;;) {
        ++depth;
        if (left >= n) {            /* both quotient bits are 1 */
            assert(right >= left);
            left -= n;
            right -= n;
        }
        else if (right >= n) {      /* left bit 0, right bit 1: differ */
            return depth;
        }
        /* else both quotient bits are 0 */
        assert(left < right && right < n);
        left <<= 1;
        right <<= 1;
    }
}
/* The next run has been identified, of length n2.
* If there's already a run on the stack, apply the "powersort" merge strategy:
* compute the topmost run's "power" (depth in a conceptual binary merge tree)
* and merge adjacent runs on the stack with greater power. See listsort.txt
* for more info.
* *
* 1. len[-3] > len[-2] + len[-1] * It's the caller's responsibility to push the new run on the stack when this
* 2. len[-2] > len[-1] * returns.
*
* See listsort.txt for more info.
* *
* Returns 0 on success, -1 on error. * Returns 0 on success, -1 on error.
*/ */
static int static int
merge_collapse(MergeState *ms) found_new_run(MergeState *ms, Py_ssize_t n2)
{ {
struct s_slice *p = ms->pending;
assert(ms); assert(ms);
while (ms->n > 1) { if (ms->n) {
Py_ssize_t n = ms->n - 2; assert(ms->n > 0);
if ((n > 0 && p[n-1].len <= p[n].len + p[n+1].len) || struct s_slice *p = ms->pending;
(n > 1 && p[n-2].len <= p[n-1].len + p[n].len)) { Py_ssize_t s1 = p[ms->n - 1].base.keys - ms->basekeys; /* start index */
if (p[n-1].len < p[n+1].len) Py_ssize_t n1 = p[ms->n - 1].len;
--n; int power = powerloop(s1, n1, n2, ms->listlen);
if (merge_at(ms, n) < 0) while (ms->n > 1 && p[ms->n - 2].power > power) {
if (merge_at(ms, ms->n - 2) < 0)
return -1; return -1;
} }
else if (p[n].len <= p[n+1].len) { assert(ms->n < 2 || p[ms->n - 2].power < power);
if (merge_at(ms, n) < 0) p[ms->n - 1].power = power;
return -1;
}
else
break;
} }
return 0; return 0;
} }
@ -2357,7 +2400,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
} }
/* End of pre-sort check: ms is now set properly! */ /* End of pre-sort check: ms is now set properly! */
merge_init(&ms, saved_ob_size, keys != NULL); merge_init(&ms, saved_ob_size, keys != NULL, &lo);
nremaining = saved_ob_size; nremaining = saved_ob_size;
if (nremaining < 2) if (nremaining < 2)
@ -2393,13 +2436,16 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
goto fail; goto fail;
n = force; n = force;
} }
/* Push run onto pending-runs stack, and maybe merge. */ /* Maybe merge pending runs. */
assert(ms.n == 0 || ms.pending[ms.n -1].base.keys +
ms.pending[ms.n-1].len == lo.keys);
if (found_new_run(&ms, n) < 0)
goto fail;
/* Push new run on stack. */
assert(ms.n < MAX_MERGE_PENDING); assert(ms.n < MAX_MERGE_PENDING);
ms.pending[ms.n].base = lo; ms.pending[ms.n].base = lo;
ms.pending[ms.n].len = n; ms.pending[ms.n].len = n;
++ms.n; ++ms.n;
if (merge_collapse(&ms) < 0)
goto fail;
/* Advance to find next run. */ /* Advance to find next run. */
sortslice_advance(&lo, n); sortslice_advance(&lo, n);
nremaining -= n; nremaining -= n;

View file

@ -318,65 +318,104 @@ merging must be done as (A+B)+C or A+(B+C) instead.
So merging is always done on two consecutive runs at a time, and in-place, So merging is always done on two consecutive runs at a time, and in-place,
although this may require some temp memory (more on that later). although this may require some temp memory (more on that later).
When a run is identified, its base address and length are pushed on a stack When a run is identified, its length is passed to found_new_run() to
in the MergeState struct. merge_collapse() is then called to potentially potentially merge runs on a stack of pending runs. We would like to delay
merge runs on that stack. We would like to delay merging as long as possible merging as long as possible in order to exploit patterns that may come up
in order to exploit patterns that may come up later, but we like even more to later, but we like even more to do merging as soon as possible to exploit
do merging as soon as possible to exploit that the run just found is still that the run just found is still high in the memory hierarchy. We also can't
high in the memory hierarchy. We also can't delay merging "too long" because delay merging "too long" because it consumes memory to remember the runs that
it consumes memory to remember the runs that are still unmerged, and the are still unmerged, and the stack has a fixed size.
stack has a fixed size.
What turned out to be a good compromise maintains two invariants on the The original version of this code used the first thing I made up that didn't
stack entries, where A, B and C are the lengths of the three rightmost not-yet obviously suck ;-) It was loosely based on invariants involving the Fibonacci
merged slices: sequence.
1. A > B+C It worked OK, but it was hard to reason about, and was subtle enough that the
2. B > C intended invariants weren't actually preserved. Researchers discovered that
when trying to complete a computer-generated correctness proof. That was
easily-enough repaired, but the discovery spurred quite a bit of academic
interest in truly good ways to manage incremental merging on the fly.
Note that, by induction, #2 implies the lengths of pending runs form a At least a dozen different approaches were developed, some provably having
decreasing sequence. #1 implies that, reading the lengths right to left, near-optimal worst case behavior with respect to the entropy of the
the pending-run lengths grow at least as fast as the Fibonacci numbers. distribution of run lengths. Some details can be found in bpo-34561.
Therefore the stack can never grow larger than about log_base_phi(N) entries,
where phi = (1+sqrt(5))/2 ~= 1.618. Thus a small # of stack slots suffice
for very large arrays.
If A <= B+C, the smaller of A and C is merged with B (ties favor C, for the The code now uses the "powersort" merge strategy from:
freshness-in-cache reason), and the new run replaces the A,B or B,C entries;
e.g., if the last 3 entries are
A:30 B:20 C:10 "Nearly-Optimal Mergesorts: Fast, Practical Sorting Methods
That Optimally Adapt to Existing Runs"
J. Ian Munro and Sebastian Wild
then B is merged with C, leaving The code is pretty simple, but the justification is quite involved, as it's
based on fast approximations to optimal binary search trees, which are
substantial topics on their own.
A:30 BC:30 Here we'll just cover some pragmatic details:
on the stack. Or if they were The `powerloop()` function computes a run's "power". Say two adjacent runs
begin at index s1. The first run has length n1, and the second run (starting
at index s1+n1, called "s2" below) has length n2. The list has total length n.
The "power" of the first run is a small integer, the depth of the node
connecting the two runs in an ideal binary merge tree, where power 1 is the
root node, and the power increases by 1 for each level deeper in the tree.
A:500 B:400: C:1000 The power is the least integer L such that the "midpoint interval" contains
a rational number of the form J/2**L. The midpoint interval is the semi-
closed interval:
then A is merged with B, leaving ((s1 + n1/2)/n, (s2 + n2/2)/n]
AB:900 C:1000 Yes, that's brain-busting at first ;-) Concretely, if (s1 + n1/2)/n and
(s2 + n2/2)/n are computed to infinite precision in binary, the power L is
the first position at which the 2**-L bit differs between the expansions.
Since the left end of the interval is less than the right end, the first
differing bit must be a 0 bit in the left quotient and a 1 bit in the right
quotient.
on the stack. `powerloop()` emulates these divisions, 1 bit at a time, using comparisons,
subtractions, and shifts in a loop.
In both examples, the stack configuration after the merge still violates You'll notice the paper uses an O(1) method instead, but that relies on two
invariant #2, and merge_collapse() goes on to continue merging runs until things we don't have:
both invariants are satisfied. As an extreme case, suppose we didn't do the
minrun gimmick, and natural runs were of lengths 128, 64, 32, 16, 8, 4, 2,
and 2. Nothing would get merged until the final 2 was seen, and that would
trigger 7 perfectly balanced merges.
The thrust of these rules when they trigger merging is to balance the run - An O(1) "count leading zeroes" primitive. We can find such a thing as a C
lengths as closely as possible, while keeping a low bound on the number of extension on most platforms, but not all, and there's no uniform spelling
runs we have to remember. This is maximally effective for random data, on the platforms that support it.
where all runs are likely to be of (artificially forced) length minrun, and
then we get a sequence of perfectly balanced merges (with, perhaps, some
oddballs at the end).
OTOH, one reason this sort is so good for partly ordered data has to do - Integer division on an integer type twice as wide as needed to hold the
with wildly unbalanced run lengths. list length. But the latter is Py_ssize_t for us, and is typically the
widest native signed integer type the platform supports.
But since runs in our algorithm are almost never very short, the once-per-run
overhead of `powerloop()` seems lost in the noise.
Detail: why is Py_ssize_t "wide enough" in `powerloop()`? We do, after all,
shift integers of that width left by 1. How do we know that won't spill into
the sign bit? The trick is that we have some slop. `n` (the total list
length) is the number of list elements, which is at most 4 times (on a 32-box,
with 4-byte pointers) smaller than the largest size_t. So at least the
leading two bits of the integers we're using are clear.
Since we can't compute a run's power before seeing the run that follows it,
the most-recently identified run is never merged by `found_new_run()`.
Instead a new run is only used to compute the 2nd-most-recent run's power.
Then adjacent runs are merged so long as their saved power (tree depth) is
greater than that newly computed power. When found_new_run() returns, only
then is a new run pushed on to the stack of pending runs.
A key invariant is that powers on the run stack are strictly decreasing
(starting from the run at the top of the stack).
Note that even powersort's strategy isn't always truly optimal. It can't be.
Computing an optimal merge sequence can be done in time quadratic in the
number of runs, which is very much slower, and also requires finding &
remembering _all_ the runs' lengths (of which there may be billions) in
advance. It's remarkable, though, how close to optimal this strategy gets.
Curious factoid: of all the alternatives I've seen in the literature,
powersort's is the only one that's always truly optimal for a collection of 3
run lengths (for three lengths A B C, it's always optimal to first merge the
shorter of A and C with B).
Merge Memory Merge Memory