mirror of https://github.com/python/cpython.git
synced 2025-07-08 03:45:36 +00:00
gh-135551: Change how sorting picks minimum run length (#135553)

New scheme from Stefan Pochmann for picking minimum run lengths. By allowing them to change a little from one run to the next, it's possible to arrange that all merges, at all levels, strongly tend to be as evenly balanced as possible for randomly ordered data: the number of initial runs is a power of 2, and all merges involve runs whose lengths differ by no more than 1.
parent b38810bab7
commit 2fc68e180f

4 changed files with 184 additions and 41 deletions
@@ -1481,6 +1481,7 @@ Jean-François Piéronne
 Oleg Plakhotnyuk
 Anatoliy Platonov
 Marcel Plch
+Stefan Pochmann
 Kirill Podoprigora
 Remi Pointel
 Jon Poler
@@ -0,0 +1 @@
+Sorting randomly ordered lists will often run a bit faster, thanks to a new scheme for picking minimum run lengths from Stefan Pochmann, which arranges for the merge tree to be as evenly balanced as is possible.
@@ -1685,10 +1685,7 @@ sortslice_advance(sortslice *slice, Py_ssize_t n)
 /* Avoid malloc for small temp arrays. */
 #define MERGESTATE_TEMP_SIZE 256
 
-/* The largest value of minrun. This must be a power of 2, and >= 1, so that
- * the compute_minrun() algorithm guarantees to return a result no larger than
- * this,
- */
+/* The largest value of minrun. This must be a power of 2, and >= 1 */
 #define MAX_MINRUN 64
 #if ((MAX_MINRUN) < 1) || ((MAX_MINRUN) & ((MAX_MINRUN) - 1))
 #error "MAX_MINRUN must be a power of 2, and >= 1"
@@ -1749,6 +1746,11 @@ struct s_MergeState {
      * of tuples. It may be set to safe_object_compare, but the idea is that hopefully
      * we can assume more, and use one of the special-case compares. */
     int (*tuple_elem_compare)(PyObject *, PyObject *, MergeState *);
+
+    /* Variables used for minrun computation. The "ideal" minrun length is
+     * the infinite precision listlen / 2**e. See listsort.txt.
+     */
+    Py_ssize_t mr_current, mr_e, mr_mask;
 };
 
 /* binarysort is the best method for sorting small arrays: it does few
@@ -2210,6 +2212,14 @@ merge_init(MergeState *ms, Py_ssize_t list_size, int has_keyfunc,
     ms->min_gallop = MIN_GALLOP;
     ms->listlen = list_size;
     ms->basekeys = lo->keys;
+
+    /* State for generating minrun values. See listsort.txt. */
+    ms->mr_e = 0;
+    while (list_size >> ms->mr_e >= MAX_MINRUN) {
+        ++ms->mr_e;
+    }
+    ms->mr_mask = (1 << ms->mr_e) - 1;
+    ms->mr_current = 0;
 }
 
 /* Free all the temp memory owned by the MergeState. This must be called
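
In Python terms, the initialization above amounts to the following (an
illustrative sketch, not part of the patch; it assumes MAX_MINRUN = 64 as
defined earlier). mr_e becomes the smallest e with list_size >> e < MAX_MINRUN,
and mr_mask keeps the low e bits:

    # Model of the merge_init() lines above; MAX_MINRUN = 64 assumed.
    MAX_MINRUN = 64

    def init_minrun_state(list_size):
        e = 0
        while (list_size >> e) >= MAX_MINRUN:
            e += 1
        return e, (1 << e) - 1

    # For a 315-element list: e = 3, mask = 7, so the "ideal" run length
    # is the infinite-precision 315 / 2**3 = 39.375.
    print(init_minrun_state(315))   # -> (3, 7)
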
@@ -2687,27 +2697,15 @@ merge_force_collapse(MergeState *ms)
     return 0;
 }
 
-/* Compute a good value for the minimum run length; natural runs shorter
- * than this are boosted artificially via binary insertion.
- *
- * If n < MAX_MINRUN return n (it's too small to bother with fancy stuff).
- * Else if n is an exact power of 2, return MAX_MINRUN / 2.
- * Else return an int k, MAX_MINRUN / 2 <= k <= MAX_MINRUN, such that n/k is
- * close to, but strictly less than, an exact power of 2.
- *
- * See listsort.txt for more info.
- */
-static Py_ssize_t
-merge_compute_minrun(Py_ssize_t n)
+/* Return the next minrun value to use. See listsort.txt. */
+Py_LOCAL_INLINE(Py_ssize_t)
+minrun_next(MergeState *ms)
 {
-    Py_ssize_t r = 0;           /* becomes 1 if any 1 bits are shifted off */
-
-    assert(n >= 0);
-    while (n >= MAX_MINRUN) {
-        r |= n & 1;
-        n >>= 1;
-    }
-    return n + r;
+    ms->mr_current += ms->listlen;
+    assert(ms->mr_current >= 0); /* no overflow */
+    Py_ssize_t result = ms->mr_current >> ms->mr_e;
+    ms->mr_current &= ms->mr_mask;
+    return result;
 }
 
 /* Here we define custom comparison functions to optimize for the cases one commonly
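
The accumulator above is easy to model outside C. Here is a minimal Python
sketch of the same arithmetic (illustrative only; the dict stands in for the
ms->mr_* fields, with values taken from the 315-element example in
listsort.txt):

    # Python model of minrun_next(), using the state merge_init() set up.
    def minrun_next(state):
        state["current"] += state["listlen"]
        result = state["current"] >> state["e"]
        state["current"] &= state["mask"]   # keep only the leftover fraction
        return result

    # For a 315-element list, merge_init() gives e = 3 and mask = 7.
    state = {"listlen": 315, "e": 3, "mask": 7, "current": 0}
    print([minrun_next(state) for _ in range(8)])
    # [39, 39, 40, 39, 39, 40, 39, 40] -- sums to 315
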
@@ -3075,7 +3073,6 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
     /* March over the array once, left to right, finding natural runs,
      * and extending short natural runs to minrun elements.
      */
-    minrun = merge_compute_minrun(nremaining);
     do {
         Py_ssize_t n;
 
@@ -3084,6 +3081,7 @@ list_sort_impl(PyListObject *self, PyObject *keyfunc, int reverse)
         if (n < 0)
             goto fail;
         /* If short, extend to min(minrun, nremaining). */
+        minrun = minrun_next(&ms);
         if (n < minrun) {
             const Py_ssize_t force = nremaining <= minrun ?
                                       nremaining : minrun;
@@ -288,7 +288,6 @@ that 32 isn't a good choice for the general case! Consider N=2112:
 
 >>> divmod(2112, 32)
 (66, 0)
->>>
 
 If the data is randomly ordered, we're very likely to end up with 66 runs
 each of length 32. The first 64 of these trigger a sequence of perfectly
@@ -301,22 +300,94 @@ to get 64 elements into place).
 If we take minrun=33 in this case, then we're very likely to end up with 64
 runs each of length 33, and then all merges are perfectly balanced. Better!
 
-What we want to avoid is picking minrun such that in
-
-    q, r = divmod(N, minrun)
-
-q is a power of 2 and r>0 (then the last merge only gets r elements into
-place, and r < minrun is small compared to N), or q a little larger than a
-power of 2 regardless of r (then we've got a case similar to "2112", again
-leaving too little work for the last merge to do).
-
-Instead we pick a minrun in range(MAX_MINRUN / 2, MAX_MINRUN + 1) such that
-N/minrun is exactly a power of 2, or if that isn't possible, is close to, but
-strictly less than, a power of 2. This is easier to do than it may sound:
-take the first log2(MAX_MINRUN) bits of N, and add 1 if any of the remaining
-bits are set. In fact, that rule covers every case in this section, including
-small N and exact powers of 2; merge_compute_minrun() is a deceptively simple
-function.
+The original code used a cheap heuristic to pick a minrun that avoided the
+very worst cases of imbalance for the final merge, but "pretty bad" cases
+still existed.
+
+In 2025, Stefan Pochmann found a much better approach, based on letting minrun
+vary a bit from one run to the next. Under his scheme, at _all_ levels of the
+merge tree:
+
+- The number of runs is a power of 2.
+- At most two different run lengths appear.
+- When two do appear, the smaller is one less than the larger.
+- The lengths of run pairs merged never differ by more than one.
+
+So, in all respects, as perfectly balanced as possible.
+
+For the 2112 case, that also keeps minrun at 33, but we were lucky there
+that 2112 is 33 times a power of 2. The new approach doesn't rely on luck.
+
+For example, with 315 random elements, the old scheme uses fixed minrun=40 and
+produces runs of length 40, except for the last. The new scheme produces a
+mix of lengths 39 and 40:
+
+    old: 40 40 40 40 40 40 40 35
+    new: 39 39 40 39 39 40 39 40
+
+Both schemes produce eight runs, a power of 2. That's good for a balanced
+merge tree. But the new scheme allows merges where left and right length
+never differ by more than 1:
+
+    39 39 40 39 39 40 39 40
+      78    79    79    79
+         157        158
+              315
+
+(This shows merges downward, e.g., two runs of length 39 are merged and
+become a run of length 78.)
+
+With larger lists, the old scheme can get even more unbalanced. For example,
+with 32769 elements (that's 2**15 + 1), it uses minrun=33 and produces 993
+runs (of length 33). That's not even a power of 2. The new scheme instead
+produces 1024 runs, all with length 32 except for the last one with length 33.
+
+How does it work? Ideally, all runs would be exactly equally long. For the
+above example, each run would have 315/8 = 39.375 elements. Which of course
+doesn't work. But we can get close:
+
+For the first run, we'd like 39.375 elements. Since that's impossible, we
+instead use 39 (the floor) and remember the current leftover fraction 0.375.
+For the second run, we add 0.375 + 39.375 = 39.75. Again impossible, so we
+instead use 39 and remember 0.75. For the third run, we add 0.75 + 39.375 =
+40.125. This time we get 40 and remember 0.125. And so on. Here's a Python
+generator doing that:
+
+def gen_minruns_with_floats(n):
+    mr = n
+    while mr >= MAX_MINRUN:
+        mr /= 2
+
+    mr_current = 0
+    while True:
+        mr_current += mr
+        yield int(mr_current)
+        mr_current %= 1
+
+But while all arithmetic here can be done exactly using binary floating point,
+floats have less precision than a Py_ssize_t, and mixing floats with ints is
+needlessly expensive anyway.
+
+So here's an integer version, where the internal numbers are scaled up by
+2**e, or rather not divided by 2**e. Instead, only each yielded minrun gets
+divided (by right-shifting). For example, instead of adding 39.375 and
+reducing modulo 1, it just adds 315 and reduces modulo 8. And always divides
+by 8 to get each actual minrun value:
+
+def gen_minruns_simpler(n):
+    e = 0
+    while (n >> e) >= MAX_MINRUN:
+        e += 1
+    mask = (1 << e) - 1
+
+    mr_current = 0
+    while True:
+        mr_current += n
+        yield mr_current >> e
+        mr_current &= mask
+
+See note MINRUN CODE for a full implementation and a driver that exhaustively
+verifies the claims above for all list lengths through 2 million.
 
 
 The Merge Pattern
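
To reproduce the 315-element comparison above, the deleted fixed-minrun code
can be set next to the new generator (an illustrative check, not part of the
patch; merge_compute_minrun is transcribed from the C code removed in this
commit, and MAX_MINRUN = 64 is assumed):

    MAX_MINRUN = 64

    def merge_compute_minrun(n):        # the old, fixed scheme
        r = 0
        while n >= MAX_MINRUN:
            r |= n & 1
            n >>= 1
        return n + r

    def gen_minruns_simpler(n):         # the new, varying scheme (from above)
        e = 0
        while (n >> e) >= MAX_MINRUN:
            e += 1
        mask = (1 << e) - 1
        mr_current = 0
        while True:
            mr_current += n
            yield mr_current >> e
            mr_current &= mask

    n = 315
    mr = merge_compute_minrun(n)        # 40
    old = [mr] * (n // mr) + ([n % mr] if n % mr else [])
    print(old)                          # [40, 40, 40, 40, 40, 40, 40, 35]

    new, g = [], gen_minruns_simpler(n)
    while sum(new) < n:
        new.append(next(g))
    print(new)                          # [39, 39, 40, 39, 39, 40, 39, 40]
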
@@ -820,3 +891,75 @@ partially mitigated by pre-scanning the data to determine whether the data is
 homogeneous with respect to type. If so, it is sometimes possible to
 substitute faster type-specific comparisons for the slower, generic
 PyObject_RichCompareBool.
+
+MINRUN CODE
+from itertools import accumulate
+try:
+    from itertools import batched
+except ImportError:
+    from itertools import islice
+    def batched(xs, k):
+        it = iter(xs)
+        while chunk := tuple(islice(it, k)):
+            yield chunk
+
+MAX_MINRUN = 64
+
+def gen_minruns(n):
+    # In listobject.c, initialization is done in merge_init(), and
+    # the body of the loop in minrun_next().
+    mr_e = 0
+    while (n >> mr_e) >= MAX_MINRUN:
+        mr_e += 1
+    mr_mask = (1 << mr_e) - 1
+
+    mr_current = 0
+    while True:
+        mr_current += n
+        yield mr_current >> mr_e
+        mr_current &= mr_mask
+
+def chew(n, show=False):
+    if n < 1:
+        return
+
+    sizes = []
+    tot = 0
+    for size in gen_minruns(n):
+        sizes.append(size)
+        tot += size
+        if tot >= n:
+            break
+    assert tot == n
+    if show:
+        print(n, len(sizes))
+
+    small, large = MAX_MINRUN // 2, MAX_MINRUN
+    while len(sizes) > 1:
+        assert not len(sizes) & 1
+        assert len(sizes).bit_count() == 1      # i.e., a power of 2
+        assert sum(sizes) == n
+        assert min(sizes) >= min(n, small)
+        assert max(sizes) <= large
+
+        d = set(sizes)
+        assert len(d) <= 2
+        if len(d) == 2:
+            lo, hi = sorted(d)
+            assert lo + 1 == hi
+
+        mr = n / len(sizes)
+        for i, s in enumerate(accumulate(sizes, initial=0)):
+            assert int(mr * i) == s
+
+        newsizes = []
+        for a, b in batched(sizes, 2):
+            assert abs(a - b) <= 1
+            newsizes.append(a + b)
+        sizes = newsizes
+
+        small = large
+        large *= 2
+
+    assert sizes[0] == n
+
+for n in range(2_000_001):
+    chew(n)
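
For a quicker spot-check than the full two-million-length sweep, the driver
can be pointed at the lengths discussed in the text (illustrative usage,
assuming the MINRUN CODE above has been pasted into a session; the expected
run counts follow from the examples given earlier):

    for n in (315, 2112, 32769):
        chew(n, show=True)
    # Expected output:
    # 315 8
    # 2112 64
    # 32769 1024
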