const std = @import("std");
const testing = std.testing;

const utils = @import("utils.zig");
const roc_panic = @import("panic.zig").panic_help;

const Ordering = utils.Ordering;
const GT = Ordering.GT;
const LT = Ordering.LT;
const EQ = Ordering.EQ;
const Opaque = ?[*]u8;
const CompareFn = *const fn (Opaque, Opaque, Opaque) callconv(.C) u8;
const CopyFn = *const fn (Opaque, Opaque) callconv(.C) void;
const IncN = *const fn (?[*]u8, usize) callconv(.C) void;

/// Any size larger than the max element buffer will be sorted indirectly via pointers.
/// TODO: tune this. I think due to llvm inlining the compare, the value likely should be lower.
/// I did some basic testing on my M1 and x86 machines with the c version of fluxsort.
/// The best tradeoff point is not the clearest and heavily depends on machine specifics.
/// Generally speaking, the faster memcpy is and the larger the cache line, the larger this should be.
/// Also, to my surprise, sorting by pointer is more performant on short arrays than long arrays (probably reduces time of final gather to order main array).
/// Anyway, there seems to be a hard cutoff where the direct sort cost suddenly gets way larger.
/// In my testing for long arrays, the cutoff seems to be around 96-128 bytes.
/// For short arrays, the cutoff seems to be around 64-96 bytes.
const MAX_ELEMENT_BUFFER_SIZE: usize = 96;
const BufferType = [MAX_ELEMENT_BUFFER_SIZE]u8;
const BufferAlign = @alignOf(u128);
comptime {
    std.debug.assert(MAX_ELEMENT_BUFFER_SIZE % BufferAlign == 0);
}

// ================ Fluxsort ==================================================
// The high level fluxsort functions.

pub fn fluxsort(
    array: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    data_is_owned_runtime: bool,
    inc_n_data: IncN,
    element_width: usize,
    alignment: u32,
    copy: CopyFn,
) void {
    // Note, knowing constant versions of element_width and copy could have huge perf gains.
    // Hopefully llvm will essentially always do it via constant argument propagation and inlining.
    // If not, we may want to generate `n` different versions of this function with comptime.
    // Then have our builtin dispatch to the correct version.
    // llvm garbage collection would remove all other variants.
    // Also, for numeric types, inlining the compare function can be a 2x perf gain.
    if (len < 132) {
        // Just quadsort it.
        quadsort(array, len, cmp, cmp_data, data_is_owned_runtime, inc_n_data, element_width, alignment, copy);
    } else if (element_width <= MAX_ELEMENT_BUFFER_SIZE) {
        if (data_is_owned_runtime) {
            fluxsort_direct(array, len, cmp, cmp_data, element_width, alignment, copy, true, inc_n_data, false);
        } else {
            fluxsort_direct(array, len, cmp, cmp_data, element_width, alignment, copy, false, inc_n_data, false);
        }
    } else {
        if (utils.alloc(len * @sizeOf(usize), @alignOf(usize))) |alloc_ptr| {
            // Build list of pointers to sort.
            var arr_ptr = @as([*]Opaque, @ptrCast(@alignCast(alloc_ptr)));
            defer utils.dealloc(alloc_ptr, @alignOf(usize));
            for (0..len) |i| {
                arr_ptr[i] = array + i * element_width;
            }

            // Sort.
            if (data_is_owned_runtime) {
                fluxsort_direct(@ptrCast(arr_ptr), len, cmp, cmp_data, @sizeOf(usize), @alignOf(usize), &pointer_copy, true, inc_n_data, true);
            } else {
                fluxsort_direct(@ptrCast(arr_ptr), len, cmp, cmp_data, @sizeOf(usize), @alignOf(usize), &pointer_copy, false, inc_n_data, true);
            }

            if (utils.alloc(len * element_width, alignment)) |collect_ptr| {
                // Collect sorted pointers into correct order.
                defer utils.dealloc(collect_ptr, alignment);
                for (0..len) |i| {
                    copy(collect_ptr + i * element_width, arr_ptr[i]);
                }

                // Copy to original array as sorted.
                @memcpy(array[0..(len * element_width)], collect_ptr[0..(len * element_width)]);
            } else {
                roc_panic("Out of memory while trying to allocate for sorting", 0);
            }
        } else {
            roc_panic("Out of memory while trying to allocate for sorting", 0);
        }
    }
}
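
// A minimal usage sketch (an assumption, not part of the upstream file): i64 is
// 8 bytes, well under MAX_ELEMENT_BUFFER_SIZE, so this takes the direct path; an
// element wider than 96 bytes would instead be sorted indirectly through a list
// of pointers. The length is kept under 32 so the sort stays on the stack-buffer
// path and never touches the roc allocator. Assumes the count-balancing test
// helpers (`test_i64_compare_refcounted`, `test_i64_copy`, `test_inc_n_data`)
// defined alongside the other tests in this file.
test "fluxsort - direct path sketch" {
    var test_count: i64 = 0;

    var arr = [8]i64{ 5, 3, 7, 1, 8, 2, 6, 4 };
    const arr_ptr = @as([*]u8, @ptrCast(&arr[0]));

    fluxsort(arr_ptr, 8, &test_i64_compare_refcounted, @ptrCast(&test_count), true, &test_inc_n_data, @sizeOf(i64), @alignOf(i64), &test_i64_copy);

    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
}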

fn fluxsort_direct(
    array: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    alignment: u32,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    if (utils.alloc(len * element_width, alignment)) |swap| {
        flux_analyze(array, len, swap, len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

        utils.dealloc(swap, alignment);
    } else {
        // Fallback to quadsort. It has ways to use less memory.
        quadsort_direct(array, len, cmp, cmp_data, element_width, alignment, copy, data_is_owned, inc_n_data, indirect);
    }
}

/// This value is used to help stay within l3 cache when sorting.
/// It technically should be tuned based on l3 cache size.
/// This is important for large arrays with pointers to other data.
/// 262144 is tuned for a 6MB L3 cache.
/// For primitives and other small inline values, making this essentially infinite is better.
const QUAD_CACHE = 262144;

// When to stop using flux partition and switch to quadsort.
const FLUX_OUT = 96;

/// Determine whether to use mergesort or quicksort.
fn flux_analyze(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    swap_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    const half1 = len / 2;
    const quad1 = half1 / 2;
    const quad2 = half1 - quad1;
    const half2 = len - half1;
    const quad3 = half2 / 2;
    const quad4 = half2 - quad3;

    var ptr_a = array;
    var ptr_b = array + quad1 * element_width;
    var ptr_c = array + half1 * element_width;
    var ptr_d = array + (half1 + quad3) * element_width;

    var streaks_a: u32 = 0;
    var streaks_b: u32 = 0;
    var streaks_c: u32 = 0;
    var streaks_d: u32 = 0;

    var balance_a: usize = 0;
    var balance_b: usize = 0;
    var balance_c: usize = 0;
    var balance_d: usize = 0;

    if (quad1 < quad2) {
        // Must inc here, due to being in a branch.
        const gt = compare_inc(cmp, cmp_data, ptr_b, ptr_b + element_width, data_is_owned, inc_n_data, indirect) == GT;
        balance_b += @intFromBool(gt);
        ptr_b += element_width;
    }
    if (quad1 < quad3) {
        // Must inc here, due to being in a branch.
        const gt = compare_inc(cmp, cmp_data, ptr_c, ptr_c + element_width, data_is_owned, inc_n_data, indirect) == GT;
        balance_c += @intFromBool(gt);
        ptr_c += element_width;
    }
    if (quad1 < quad4) {
        // Must inc here, due to being in a branch.
        balance_d += @intFromBool(compare_inc(cmp, cmp_data, ptr_d, ptr_d + element_width, data_is_owned, inc_n_data, indirect) == GT);
        ptr_d += element_width;
    }

    var count = len;
    while (count > 132) : (count -= 128) {
        // 32*4 guaranteed compares.
        if (data_is_owned) {
            inc_n_data(cmp_data, 32 * 4);
        }
        var sum_a: u8 = 0;
        var sum_b: u8 = 0;
        var sum_c: u8 = 0;
        var sum_d: u8 = 0;
        for (0..32) |_| {
            sum_a += @intFromBool(compare(cmp, cmp_data, ptr_a, ptr_a + element_width, indirect) == GT);
            ptr_a += element_width;
            sum_b += @intFromBool(compare(cmp, cmp_data, ptr_b, ptr_b + element_width, indirect) == GT);
            ptr_b += element_width;
            sum_c += @intFromBool(compare(cmp, cmp_data, ptr_c, ptr_c + element_width, indirect) == GT);
            ptr_c += element_width;
            sum_d += @intFromBool(compare(cmp, cmp_data, ptr_d, ptr_d + element_width, indirect) == GT);
            ptr_d += element_width;
        }
        balance_a += sum_a;
        sum_a = @intFromBool((sum_a == 0) or (sum_a == 32));
        streaks_a += sum_a;
        balance_b += sum_b;
        sum_b = @intFromBool((sum_b == 0) or (sum_b == 32));
        streaks_b += sum_b;
        balance_c += sum_c;
        sum_c = @intFromBool((sum_c == 0) or (sum_c == 32));
        streaks_c += sum_c;
        balance_d += sum_d;
        sum_d = @intFromBool((sum_d == 0) or (sum_d == 32));
        streaks_d += sum_d;

        if (count > 516 and sum_a + sum_b + sum_c + sum_d == 0) {
            balance_a += 48;
            ptr_a += 96 * element_width;
            balance_b += 48;
            ptr_b += 96 * element_width;
            balance_c += 48;
            ptr_c += 96 * element_width;
            balance_d += 48;
            ptr_d += 96 * element_width;
            count -= 384;
        }
    }

    if (data_is_owned) {
        if (count > 7) {
            // 4*divCeil(count-7, 4) guaranteed compares.
            const n: usize = std.math.divCeil(usize, count - 7, 4) catch unreachable;
            inc_n_data(cmp_data, 4 * n);
        }
    }
    while (count > 7) : (count -= 4) {
        balance_a += @intFromBool(compare(cmp, cmp_data, ptr_a, ptr_a + element_width, indirect) == GT);
        ptr_a += element_width;
        balance_b += @intFromBool(compare(cmp, cmp_data, ptr_b, ptr_b + element_width, indirect) == GT);
        ptr_b += element_width;
        balance_c += @intFromBool(compare(cmp, cmp_data, ptr_c, ptr_c + element_width, indirect) == GT);
        ptr_c += element_width;
        balance_d += @intFromBool(compare(cmp, cmp_data, ptr_d, ptr_d + element_width, indirect) == GT);
        ptr_d += element_width;
    }

    count = balance_a + balance_b + balance_c + balance_d;

    if (count == 0) {
        // The whole list may be ordered. Cool!
        if (compare_inc(cmp, cmp_data, ptr_a, ptr_a + element_width, data_is_owned, inc_n_data, indirect) != GT and
            compare_inc(cmp, cmp_data, ptr_b, ptr_b + element_width, data_is_owned, inc_n_data, indirect) != GT and
            compare_inc(cmp, cmp_data, ptr_c, ptr_c + element_width, data_is_owned, inc_n_data, indirect) != GT)
            return;
    }

    // Not fully sorted, too bad.
    const reversed_a = quad1 - balance_a == 1;
    const reversed_b = quad2 - balance_b == 1;
    const reversed_c = quad3 - balance_c == 1;
    const reversed_d = quad4 - balance_d == 1;

    const reversed_any = reversed_a or reversed_b or reversed_c or reversed_d;
    if (reversed_any) {
        // 3 compares guaranteed.
        if (data_is_owned) {
            inc_n_data(cmp_data, 3);
        }
        const span1: u3 = @intFromBool(reversed_a and reversed_b) * @intFromBool(compare(cmp, cmp_data, ptr_a, ptr_a + element_width, indirect) == GT);
        const span2: u3 = @intFromBool(reversed_b and reversed_c) * @intFromBool(compare(cmp, cmp_data, ptr_b, ptr_b + element_width, indirect) == GT);
        const span3: u3 = @intFromBool(reversed_c and reversed_d) * @intFromBool(compare(cmp, cmp_data, ptr_c, ptr_c + element_width, indirect) == GT);

        switch (span1 | (span2 << 1) | (span3 << 2)) {
            0 => {},
            1 => {
                quad_reversal(array, ptr_b, element_width, copy);
                balance_a = 0;
                balance_b = 0;
            },
            2 => {
                quad_reversal(ptr_a + element_width, ptr_c, element_width, copy);
                balance_b = 0;
                balance_c = 0;
            },
            3 => {
                quad_reversal(array, ptr_c, element_width, copy);
                balance_a = 0;
                balance_b = 0;
                balance_c = 0;
            },
            4 => {
                quad_reversal(ptr_b + element_width, ptr_d, element_width, copy);
                balance_c = 0;
                balance_d = 0;
            },
            5 => {
                quad_reversal(array, ptr_b, element_width, copy);
                balance_a = 0;
                balance_b = 0;
                quad_reversal(ptr_b + element_width, ptr_d, element_width, copy);
                balance_c = 0;
                balance_d = 0;
            },
            6 => {
                quad_reversal(ptr_a + element_width, ptr_d, element_width, copy);
                balance_b = 0;
                balance_c = 0;
                balance_d = 0;
            },
            7 => {
                quad_reversal(array, ptr_d, element_width, copy);
                return;
            },
        }
        // Individual chunks that are reversed.
        if (reversed_a and balance_a != 0) {
            quad_reversal(array, ptr_a, element_width, copy);
            balance_a = 0;
        }
        if (reversed_b and balance_b != 0) {
            quad_reversal(ptr_a + element_width, ptr_b, element_width, copy);
            balance_b = 0;
        }
        if (reversed_c and balance_c != 0) {
            quad_reversal(ptr_b + element_width, ptr_c, element_width, copy);
            balance_c = 0;
        }
        if (reversed_d and balance_d != 0) {
            quad_reversal(ptr_c + element_width, ptr_d, element_width, copy);
            balance_d = 0;
        }
    }

    // Switch to quadsort if at least 25% ordered.
    count = len / 512;

    var ordered_a: u4 = @intFromBool(streaks_a > count);
    var ordered_b: u4 = @intFromBool(streaks_b > count);
    var ordered_c: u4 = @intFromBool(streaks_c > count);
    var ordered_d: u4 = @intFromBool(streaks_d > count);

    // Always use quadsort if memory pressure is bad.
    if (quad1 > QUAD_CACHE) {
        ordered_a = 1;
        ordered_b = 1;
        ordered_c = 1;
        ordered_d = 1;
    }
    switch (ordered_a | (ordered_b << 1) | (ordered_c << 2) | (ordered_d << 3)) {
        0 => {
            flux_partition(array, swap, array, swap + len * element_width, len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            return;
        },
        1 => {
            if (balance_a != 0)
                quadsort_swap(array, quad1, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            flux_partition(ptr_a + element_width, swap, ptr_a + element_width, swap + (quad2 + half2) * element_width, quad2 + half2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        2 => {
            flux_partition(array, swap, array, swap + quad1 * element_width, quad1, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            if (balance_b != 0)
                quadsort_swap(ptr_a + element_width, quad2, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            flux_partition(ptr_b + element_width, swap, ptr_b + element_width, swap + half2 * element_width, half2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        3 => {
            if (balance_a != 0)
                quadsort_swap(array, quad1, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            if (balance_b != 0)
                quadsort_swap(ptr_a + element_width, quad2, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            flux_partition(ptr_b + element_width, swap, ptr_b + element_width, swap + half2 * element_width, half2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        4 => {
            flux_partition(array, swap, array, swap + half1 * element_width, half1, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            if (balance_c != 0)
                quadsort_swap(ptr_b + element_width, quad3, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            flux_partition(ptr_c + element_width, swap, ptr_c + element_width, swap + quad4 * element_width, quad4, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        8 => {
            flux_partition(array, swap, array, swap + (half1 + quad3) * element_width, (half1 + quad3), cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            if (balance_d != 0)
                quadsort_swap(ptr_c + element_width, quad4, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        9 => {
            if (balance_a != 0)
                quadsort_swap(array, quad1, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            flux_partition(ptr_a + element_width, swap, ptr_a + element_width, swap + (quad2 + quad3) * element_width, quad2 + quad3, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            if (balance_d != 0)
                quadsort_swap(ptr_c + element_width, quad4, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        12 => {
            flux_partition(array, swap, array, swap + half1 * element_width, half1, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            if (balance_c != 0)
                quadsort_swap(ptr_b + element_width, quad3, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            if (balance_d != 0)
                quadsort_swap(ptr_c + element_width, quad4, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        5, 6, 7, 10, 11, 13, 14, 15 => {
            if (ordered_a != 0) {
                if (balance_a != 0)
                    quadsort_swap(array, quad1, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            } else {
                flux_partition(array, swap, array, swap + quad1 * element_width, quad1, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            }
            if (ordered_b != 0) {
                if (balance_b != 0)
                    quadsort_swap(ptr_a + element_width, quad2, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            } else {
                flux_partition(ptr_a + element_width, swap, ptr_a + element_width, swap + quad2 * element_width, quad2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            }
            if (ordered_c != 0) {
                if (balance_c != 0)
                    quadsort_swap(ptr_b + element_width, quad3, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            } else {
                flux_partition(ptr_b + element_width, swap, ptr_b + element_width, swap + quad3 * element_width, quad3, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            }
            if (ordered_d != 0) {
                if (balance_d != 0)
                    quadsort_swap(ptr_c + element_width, quad4, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            } else {
                flux_partition(ptr_c + element_width, swap, ptr_c + element_width, swap + quad4 * element_width, quad4, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            }
        },
    }
    // Final merging of sorted partitions.
    if (compare_inc(cmp, cmp_data, ptr_a, ptr_a + element_width, data_is_owned, inc_n_data, indirect) != GT) {
        if (compare_inc(cmp, cmp_data, ptr_c, ptr_c + element_width, data_is_owned, inc_n_data, indirect) != GT) {
            if (compare_inc(cmp, cmp_data, ptr_b, ptr_b + element_width, data_is_owned, inc_n_data, indirect) != GT) {
                // Lucky us, everything sorted.
                return;
            }
            @memcpy(swap[0..(len * element_width)], array[0..(len * element_width)]);
        } else {
            // First half sorted, second half needs merge.
            cross_merge(swap + half1 * element_width, array + half1 * element_width, quad3, quad4, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            @memcpy(swap[0..(half1 * element_width)], array[0..(half1 * element_width)]);
        }
    } else {
        if (compare_inc(cmp, cmp_data, ptr_c, ptr_c + element_width, data_is_owned, inc_n_data, indirect) != GT) {
            // First half needs merge, second half sorted.
            @memcpy((swap + half1 * element_width)[0..(half2 * element_width)], (array + half1 * element_width)[0..(half2 * element_width)]);
            cross_merge(swap, array, quad1, quad2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        } else {
            // Both halves need merge.
            cross_merge(swap + half1 * element_width, ptr_b + element_width, quad3, quad4, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            cross_merge(swap, array, quad1, quad2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
    }
    // Merge back to the original list.
    cross_merge(array, swap, half1, half2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
}
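
// A hedged sketch (not from the upstream file): on a strictly descending input,
// the run analysis should detect that all four quadrants are reversed, repair
// the whole array with a single `quad_reversal`, and balance every compare
// against `inc_n_data`. Assumes the count-balancing test helpers defined with
// the other tests in this file. The caller-provided swap must hold `len`
// elements.
test "flux_analyze - fully reversed input (sketch)" {
    var test_count: i64 = 0;

    var arr: [200]i64 = undefined;
    for (0..200) |i| {
        arr[i] = 200 - @as(i64, @intCast(i));
    }
    var swap: [200]i64 = undefined;

    flux_analyze(@as([*]u8, @ptrCast(&arr[0])), 200, @as([*]u8, @ptrCast(&swap[0])), 200, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);

    try testing.expectEqual(test_count, 0);
    for (0..200) |i| {
        try testing.expectEqual(arr[i], @as(i64, @intCast(i + 1)));
    }
}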

fn flux_partition(
    array: [*]u8,
    swap: [*]u8,
    x: [*]u8,
    pivot: [*]u8,
    initial_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    var generic = false;

    var pivot_ptr = pivot;
    var x_ptr = x;

    var len = initial_len;
    var arr_len: usize = 0;
    var swap_len: usize = 0;

    while (true) {
        pivot_ptr -= element_width;

        if (len <= 2048) {
            median_of_nine(x_ptr, len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, pivot_ptr, indirect);
        } else {
            median_of_cube_root(array, swap, x_ptr, len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, &generic, pivot_ptr, indirect);

            if (generic) {
                // Tons of identical elements, quadsort.
                if (x_ptr == swap) {
                    @memcpy(array[0..(len * element_width)], swap[0..(len * element_width)]);
                }
                quadsort_swap(array, len, swap, len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
                return;
            }
        }

        if (arr_len != 0 and compare_inc(cmp, cmp_data, pivot_ptr + element_width, pivot_ptr, data_is_owned, inc_n_data, indirect) != GT) {
            // pivot equals the last pivot, reverse partition and everything is done.
            flux_reverse_partition(array, swap, array, pivot_ptr, len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            return;
        }
        // arr_len is elements <= pivot.
        // swap_len is elements > pivot.
        arr_len = flux_default_partition(array, swap, x_ptr, pivot_ptr, len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        swap_len = len - arr_len;

        // If highly imbalanced try a different strategy.
        if (arr_len <= swap_len / 32 or swap_len <= FLUX_OUT) {
            if (arr_len == 0)
                return;
            if (swap_len == 0) {
                flux_reverse_partition(array, swap, array, pivot_ptr, arr_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
                return;
            }
            @memcpy((array + arr_len * element_width)[0..(swap_len * element_width)], swap[0..(swap_len * element_width)]);
            quadsort_swap(array + arr_len * element_width, swap_len, swap, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        } else {
            flux_partition(array + arr_len * element_width, swap, swap, pivot_ptr, swap_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }

        // If highly imbalanced try a different strategy.
        if (swap_len <= arr_len / 32 or arr_len <= FLUX_OUT) {
            if (arr_len <= FLUX_OUT) {
                quadsort_swap(array, arr_len, swap, arr_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            } else {
                flux_reverse_partition(array, swap, array, pivot_ptr, arr_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            }
            return;
        }
        len = arr_len;
        x_ptr = array;
    }
}

// Improve generic data handling by mimicking dual pivot quicksort.

/// Partition x into array and swap (less than or equal to pivot).
/// Finally, copy from swap into array and potentially finish sorting.
/// Will return early if the array is highly unordered (allows for more quicksort).
/// Will return early if all elements went before the pivot (maybe all elements are the same and can reverse partition next?).
/// Otherwise, will complete the sort with quadsort.
///
/// Warning, on early return, the partitions of the array will be split over array and swap.
/// The returned size is the number of elements in the array.
fn flux_default_partition(
    array: [*]u8,
    swap: [*]u8,
    x: [*]u8,
    pivot: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) usize {
    var arr_ptr = array;
    var swap_ptr = swap;
    var pivot_ptr = pivot;
    var x_ptr = x;

    // len guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, len);
    }
    var run: usize = 0;
    var a: usize = 8;
    while (a <= len) : (a += 8) {
        inline for (0..8) |_| {
            const from = if (compare(cmp, cmp_data, x_ptr, pivot_ptr, indirect) != GT) &arr_ptr else &swap_ptr;
            copy(from.*, x_ptr);
            from.* += element_width;
            x_ptr += element_width;
        }

        if (arr_ptr == array or swap_ptr == swap)
            run = a;
    }
    for (0..(len % 8)) |_| {
        const from = if (compare(cmp, cmp_data, x_ptr, pivot_ptr, indirect) != GT) &arr_ptr else &swap_ptr;
        copy(from.*, x_ptr);
        from.* += element_width;
        x_ptr += element_width;
    }

    const m = (@intFromPtr(arr_ptr) - @intFromPtr(array)) / element_width;

    // Not very sorted, early return and allow for more quicksort.
    if (run <= len / 4) {
        return m;
    }

    // Bad pivot? All elements went before it.
    if (m == len) {
        return m;
    }

    // Significantly sorted, finish with quadsort.
    a = len - m;
    @memcpy((array + m * element_width)[0..(a * element_width)], swap[0..(a * element_width)]);

    quadsort_swap(array + m * element_width, a, swap, a, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    quadsort_swap(array, m, swap, m, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

    return 0;
}

/// Partition x into array and swap.
/// Finally, copy from swap into array and finish sorting the part of the array less than pivot.
/// This is reversed because it copies elements greater than the pivot into the array.
/// Elements are expected to be at most equal to the pivot.
fn flux_reverse_partition(
    array: [*]u8,
    swap: [*]u8,
    x: [*]u8,
    pivot: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    var arr_ptr = array;
    var swap_ptr = swap;
    var pivot_ptr = pivot;
    var x_ptr = x;

    // len guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, len);
    }
    for (0..(len / 8)) |_| {
        inline for (0..8) |_| {
            const from = if (compare(cmp, cmp_data, pivot_ptr, x_ptr, indirect) == GT) &arr_ptr else &swap_ptr;
            copy(from.*, x_ptr);
            from.* += element_width;
            x_ptr += element_width;
        }
    }
    for (0..(len % 8)) |_| {
        const from = if (compare(cmp, cmp_data, pivot_ptr, x_ptr, indirect) == GT) &arr_ptr else &swap_ptr;
        copy(from.*, x_ptr);
        from.* += element_width;
        x_ptr += element_width;
    }

    const arr_len = (@intFromPtr(arr_ptr) - @intFromPtr(array)) / element_width;
    const swap_len = (@intFromPtr(swap_ptr) - @intFromPtr(swap)) / element_width;

    @memcpy((array + arr_len * element_width)[0..(swap_len * element_width)], swap[0..(swap_len * element_width)]);

    if (swap_len <= arr_len / 16 or arr_len <= FLUX_OUT) {
        quadsort_swap(array, arr_len, swap, arr_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        return;
    }
    flux_partition(array, swap, array, pivot, arr_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
}

test "flux_default_partition" {
    var expected: [32]i64 = undefined;
    var test_count: i64 = 0;
    var pivot: i64 = 0;

    var arr: [32]i64 = undefined;
    var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
    var swap: [32]i64 = undefined;
    var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));

    arr = [32]i64{
        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
        2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
    };
    expected = [32]i64{
        // <= pivot first half
        1, 3, 5, 7, 9, 11, 13, 15,
        // <= pivot second half
        2, 4, 6, 8, 10, 12, 14, 16,
        // > pivot first half
        17, 19, 21, 23, 25, 27, 29, 31,
        // > pivot second half
        18, 20, 22, 24, 26, 28, 30, 32,
    };
    pivot = 16;
    var arr_len = flux_default_partition(arr_ptr, swap_ptr, arr_ptr, @ptrCast(&pivot), 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr_len, 16);
    try testing.expectEqualSlices(i64, arr[0..16], expected[0..16]);
    try testing.expectEqualSlices(i64, swap[0..16], expected[16..32]);

    arr = [32]i64{
        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
        2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
    };
    expected = [32]i64{
        // <= pivot first half
        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23,
        // <= pivot second half
        2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24,
        // > pivot first half
        25, 27, 29, 31,
        // > pivot second half
        26, 28, 30, 32,
    };
    pivot = 24;
    arr_len = flux_default_partition(arr_ptr, swap_ptr, arr_ptr, @ptrCast(&pivot), 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr_len, 24);
    try testing.expectEqualSlices(i64, arr[0..24], expected[0..24]);
    try testing.expectEqualSlices(i64, swap[0..8], expected[24..32]);

    arr = [32]i64{
        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
        2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
    };
    expected = [32]i64{
        // <= pivot first half
        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
        // <= pivot second half
        2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
    };
    pivot = 32;
    arr_len = flux_default_partition(arr_ptr, swap_ptr, arr_ptr, @ptrCast(&pivot), 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr_len, 32);
    try testing.expectEqualSlices(i64, arr[0..32], expected[0..32]);

    arr = [32]i64{
        1, 3, 5, 7, 9, 11, 13, 15,
        2, 4, 6, 8, 10, 12, 14, 16,
        18, 20, 22, 24, 26, 28, 30, 32,
        17, 19, 21, 23, 25, 27, 29, 31,
    };
    for (0..31) |i| {
        expected[i] = @intCast(i + 1);
    }
    pivot = 16;
    arr_len = flux_default_partition(arr_ptr, swap_ptr, arr_ptr, @ptrCast(&pivot), 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr_len, 0);
    try testing.expectEqualSlices(i64, arr[0..32], expected[0..32]);
}

test "flux_reverse_partition" {
    const expected = [32]i64{
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17,
    };
    var test_count: i64 = 0;
    var pivot: i64 = 0;

    var arr: [32]i64 = undefined;
    var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
    var swap: [32]i64 = undefined;
    var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));

    arr = [32]i64{
        1, 3, 5, 7, 9, 11, 13, 15, 17, 17, 17, 17, 17, 17, 17, 17,
        2, 4, 6, 8, 10, 12, 14, 16, 17, 17, 17, 17, 17, 17, 17, 17,
    };
    pivot = 17;
    flux_reverse_partition(arr_ptr, swap_ptr, arr_ptr, @ptrCast(&pivot), 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr, expected);

    arr = [32]i64{
        1, 17, 3, 17, 5, 17, 7, 17, 9, 17, 11, 17, 13, 17, 15, 17,
        17, 2, 17, 4, 17, 6, 17, 8, 17, 10, 17, 12, 17, 14, 17, 16,
    };
    pivot = 17;
    flux_reverse_partition(arr_ptr, swap_ptr, arr_ptr, @ptrCast(&pivot), 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr, expected);

    arr = [32]i64{
        15, 17, 13, 17, 11, 17, 9, 17, 7, 17, 5, 17, 3, 17, 1, 17,
        17, 16, 17, 14, 17, 12, 17, 10, 17, 8, 17, 6, 17, 4, 17, 2,
    };
    pivot = 17;
    flux_reverse_partition(arr_ptr, swap_ptr, arr_ptr, @ptrCast(&pivot), 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr, expected);
}

// ================ Pivot Selection ===========================================
// Used for selecting the quicksort pivot for various sized arrays.

/// Returns the median of an array taking roughly cube root samples.
/// Only used for super large arrays, assumes the minimum cube root is 32.
/// Out is set to the median.
/// Generic is set to true if all elements selected for the median are the same.
fn median_of_cube_root(
    array: [*]u8,
    swap: [*]u8,
    x_ptr: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    generic: *bool,
    out: [*]u8,
    comptime indirect: bool,
) void {
    var cbrt: usize = 32;
    while (len > cbrt * cbrt * cbrt) : (cbrt *= 2) {}

    const div = len / cbrt;

    // Using the address of div as an integer to get a pseudorandom offset from 0 to div.
    var arr_ptr = x_ptr + (@intFromPtr(&div) / 16 % div) * element_width;
    var swap_ptr = if (x_ptr == array) swap else array;

    for (0..cbrt) |cnt| {
        copy(swap_ptr + cnt * element_width, arr_ptr);
        arr_ptr += div * element_width;
    }
    cbrt /= 2;

    quadsort_swap(swap_ptr, cbrt, swap_ptr + cbrt * 2 * element_width, cbrt, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    quadsort_swap(swap_ptr + cbrt * element_width, cbrt, swap_ptr + cbrt * 2 * element_width, cbrt, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

    generic.* = compare_inc(cmp, cmp_data, swap_ptr + (cbrt * 2 - 1) * element_width, swap_ptr, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, swap_ptr + (cbrt - 1) * element_width, swap_ptr, data_is_owned, inc_n_data, indirect) != GT;

    binary_median(swap_ptr, swap_ptr + cbrt * element_width, cbrt, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, out, indirect);
}

/// Returns the median of 9 evenly distributed elements from a list.
fn median_of_nine(
    array: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    out: [*]u8,
    comptime indirect: bool,
) void {
    var buffer: [9 * MAX_ELEMENT_BUFFER_SIZE]u8 align(BufferAlign) = undefined;
    const swap_ptr = @as([*]u8, @ptrCast(&buffer[0]));

    var arr_ptr = array;

    const offset = (len / 9) * element_width;
    for (0..9) |x| {
        copy(swap_ptr + x * element_width, arr_ptr);
        arr_ptr += offset;
    }

    trim_four(swap_ptr, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    trim_four(swap_ptr + 4 * element_width, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

    copy(swap_ptr, swap_ptr + 5 * element_width);
    copy(swap_ptr + 3 * element_width, swap_ptr + 8 * element_width);

    trim_four(swap_ptr, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

    copy(swap_ptr, swap_ptr + 6 * element_width);

    // 3 guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 3);
    }
    const x: usize = @intFromBool(compare(cmp, cmp_data, swap_ptr + 0 * element_width, swap_ptr + 1 * element_width, indirect) == GT);
    const y: usize = @intFromBool(compare(cmp, cmp_data, swap_ptr + 0 * element_width, swap_ptr + 2 * element_width, indirect) == GT);
    const z: usize = @intFromBool(compare(cmp, cmp_data, swap_ptr + 1 * element_width, swap_ptr + 2 * element_width, indirect) == GT);

    const index = @intFromBool(x == y) + (x ^ z);
    copy(out, swap_ptr + index * element_width);
}

/// Sorts four elements just enough that the middle two positions hold the middle two values.
/// Does not care about the rest of the elements and can overwrite them.
fn trim_four(
    initial_ptr_a: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    var buffer: BufferType align(BufferAlign) = undefined;
    const tmp_ptr = @as([*]u8, @ptrCast(&buffer[0]));

    // 4 guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 4);
    }
    var ptr_a = initial_ptr_a;
    {
        const gt = compare(cmp, cmp_data, ptr_a, ptr_a + element_width, indirect) == GT;
        const x = if (gt) element_width else 0;
        const not_x = if (!gt) element_width else 0;
        copy(tmp_ptr, ptr_a + not_x);
        copy(ptr_a, ptr_a + x);
        copy(ptr_a + element_width, tmp_ptr);
        ptr_a += 2 * element_width;
    }
    {
        const gt = compare(cmp, cmp_data, ptr_a, ptr_a + element_width, indirect) == GT;
        const x = if (gt) element_width else 0;
        const not_x = if (!gt) element_width else 0;
        copy(tmp_ptr, ptr_a + not_x);
        copy(ptr_a, ptr_a + x);
        copy(ptr_a + element_width, tmp_ptr);
        ptr_a -= 2 * element_width;
    }
    {
        const lte = compare(cmp, cmp_data, ptr_a, ptr_a + 2 * element_width, indirect) != GT;
        const x = if (lte) 2 * element_width else 0;
        copy(ptr_a + 2 * element_width, ptr_a + x);
        ptr_a += element_width;
    }
    {
        const gt = compare(cmp, cmp_data, ptr_a, ptr_a + 2 * element_width, indirect) == GT;
        const x = if (gt) 2 * element_width else 0;
        copy(ptr_a, ptr_a + x);
    }
}

/// Attempts to find the median of 2 sorted arrays of length len.
/// Sets out to the larger median from the two lists.
fn binary_median(
    initial_ptr_a: [*]u8,
    initial_ptr_b: [*]u8,
    initial_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    out: [*]u8,
    comptime indirect: bool,
) void {
    var len = initial_len;
    if (data_is_owned) {
        // We need to increment log2 of len times.
        const log2 = @bitSizeOf(usize) - @clz(len);
        inc_n_data(cmp_data, log2);
    }
    var ptr_a = initial_ptr_a;
    var ptr_b = initial_ptr_b;
    len /= 2;
    while (len != 0) : (len /= 2) {
        if (compare(cmp, cmp_data, ptr_a, ptr_b, indirect) != GT) {
            ptr_a += len * element_width;
        } else {
            ptr_b += len * element_width;
        }
    }
    var from = if (compare(cmp, cmp_data, ptr_a, ptr_b, indirect) == GT) ptr_a else ptr_b;
    copy(out, from);
}

test "median_of_cube_root" {
    var test_count: i64 = 0;
    var out: i64 = 0;
    var generic = false;

    var swap: [32]i64 = undefined;
    var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
    {
        var arr: [32]i64 = undefined;
        var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));

        arr = [32]i64{
            1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
            2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
        };
        median_of_cube_root(arr_ptr, swap_ptr, arr_ptr, 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&generic), @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        try testing.expectEqual(out, 17);
        try testing.expectEqual(generic, false);

        for (0..32) |i| {
            arr[i] = 7;
        }
        median_of_cube_root(arr_ptr, swap_ptr, arr_ptr, 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&generic), @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        try testing.expectEqual(out, 7);
        try testing.expectEqual(generic, true);

        for (0..32) |i| {
            arr[i] = 7 + @as(i64, @intCast(i % 2));
        }
        median_of_cube_root(arr_ptr, swap_ptr, arr_ptr, 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&generic), @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        try testing.expectEqual(out, 8);
        try testing.expectEqual(generic, false);
    }
}

test "median_of_nine" {
    var test_count: i64 = 0;
    var out: i64 = 0;

    {
        var arr: [9]i64 = undefined;
        var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));

        arr = [9]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9 };
        median_of_nine(arr_ptr, 10, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        // Note: median is not guaranteed to be exact. In this case:
        // [2, 3], [6, 7] -> [3, 6] -> [3, 6, 9] -> 6
        try testing.expectEqual(out, 6);

        arr = [9]i64{ 1, 3, 5, 7, 9, 2, 4, 6, 8 };
        median_of_nine(arr_ptr, 10, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        // Note: median is not guaranteed to be exact. In this case:
        // [3, 5], [4, 6] -> [4, 5] -> [4, 5, 8] -> 5
        try testing.expectEqual(out, 5);

        arr = [9]i64{ 2, 3, 9, 4, 5, 7, 8, 6, 1 };
        median_of_nine(arr_ptr, 10, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        // Note: median is not guaranteed to be exact. In this case:
        // [3, 4], [5, 6] -> [4, 5] -> [1, 4, 5] -> 4
        try testing.expectEqual(out, 4);
    }
}

test "trim_four" {
    var test_count: i64 = 0;

    var arr: [4]i64 = undefined;
    var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));

    arr = [4]i64{ 1, 2, 3, 4 };
    trim_four(arr_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr, [4]i64{ 1, 2, 3, 4 });

    arr = [4]i64{ 2, 3, 1, 4 };
    trim_four(arr_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr, [4]i64{ 2, 3, 2, 4 });

    arr = [4]i64{ 4, 3, 2, 1 };
    trim_four(arr_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(arr, [4]i64{ 3, 2, 3, 2 });
}

test "binary_median" {
    var test_count: i64 = 0;
    var out: i64 = 0;

    {
        var arr: [10]i64 = undefined;
        var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));

        arr = [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
        binary_median(arr_ptr, arr_ptr + 5 * @sizeOf(i64), 5, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        try testing.expectEqual(out, 6);

        arr = [10]i64{ 1, 3, 5, 7, 9, 2, 4, 6, 8, 10 };
        binary_median(arr_ptr, arr_ptr + 5 * @sizeOf(i64), 5, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        try testing.expectEqual(out, 5);
    }
    {
        var arr: [16]i64 = undefined;
        var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));

        arr = [16]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
        binary_median(arr_ptr, arr_ptr + 8 * @sizeOf(i64), 8, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        try testing.expectEqual(out, 9);

        arr = [16]i64{ 1, 3, 5, 7, 9, 11, 13, 15, 2, 4, 6, 8, 10, 12, 14, 16 };
        binary_median(arr_ptr, arr_ptr + 8 * @sizeOf(i64), 8, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        try testing.expectEqual(out, 9);

        arr = [16]i64{ 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8 };
        binary_median(arr_ptr, arr_ptr + 8 * @sizeOf(i64), 8, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, @ptrCast(&out), false);
        try testing.expectEqual(test_count, 0);
        try testing.expectEqual(out, 9);
    }
}

// ================ Quadsort ==================================================
// The high level quadsort functions.

/// A version of quadsort given pre-allocated swap memory.
/// This is a primitive needed for fluxsort.
/// Will not allocate.
pub fn quadsort_swap(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    swap_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    if (len < 96) {
        tail_swap(array, len, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    } else if (quad_swap(array, len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect) != .sorted) {
        const block_len = quad_merge(array, len, swap, swap_len, 32, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

        rotate_merge(array, len, swap, swap_len, block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    }
}
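
// A hedged usage sketch (not from the upstream file): the caller owns the swap
// buffer, so no allocation happens here. With fewer than 96 elements this lands
// on the tail_swap path. Assumes the count-balancing test helpers defined with
// the other tests in this file.
test "quadsort_swap - caller-provided swap (sketch)" {
    var test_count: i64 = 0;

    var arr = [16]i64{ 16, 1, 14, 3, 12, 5, 10, 7, 8, 9, 6, 11, 4, 13, 2, 15 };
    var swap: [16]i64 = undefined;

    quadsort_swap(@as([*]u8, @ptrCast(&arr[0])), 16, @as([*]u8, @ptrCast(&swap[0])), 16, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);

    try testing.expectEqual(test_count, 0);
    for (0..16) |i| {
        try testing.expectEqual(arr[i], @as(i64, @intCast(i + 1)));
    }
}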

pub fn quadsort(
    array: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    data_is_owned_runtime: bool,
    inc_n_data: IncN,
    element_width: usize,
    alignment: u32,
    copy: CopyFn,
) void {
    // Note, knowing constant versions of element_width and copy could have huge perf gains.
    // Hopefully llvm will essentially always do it via constant argument propagation and inlining.
    // If not, we may want to generate `n` different versions of this function with comptime.
    // Then have our builtin dispatch to the correct version.
    // llvm garbage collection would remove all other variants.
    // Also, for numeric types, inlining the compare function can be a 2x perf gain.
    if (element_width <= MAX_ELEMENT_BUFFER_SIZE) {
        if (data_is_owned_runtime) {
            quadsort_direct(array, len, cmp, cmp_data, element_width, alignment, copy, true, inc_n_data, false);
        } else {
            quadsort_direct(array, len, cmp, cmp_data, element_width, alignment, copy, false, inc_n_data, false);
        }
    } else {
        if (utils.alloc(len * @sizeOf(usize), @alignOf(usize))) |alloc_ptr| {
            // Build list of pointers to sort.
            var arr_ptr = @as([*]Opaque, @ptrCast(@alignCast(alloc_ptr)));
            defer utils.dealloc(alloc_ptr, @alignOf(usize));
            for (0..len) |i| {
                arr_ptr[i] = array + i * element_width;
            }

            // Sort.
            if (data_is_owned_runtime) {
                quadsort_direct(@ptrCast(arr_ptr), len, cmp, cmp_data, @sizeOf(usize), @alignOf(usize), &pointer_copy, true, inc_n_data, true);
            } else {
                quadsort_direct(@ptrCast(arr_ptr), len, cmp, cmp_data, @sizeOf(usize), @alignOf(usize), &pointer_copy, false, inc_n_data, true);
            }

            if (utils.alloc(len * element_width, alignment)) |collect_ptr| {
                // Collect sorted pointers into correct order.
                defer utils.dealloc(collect_ptr, alignment);
                for (0..len) |i| {
                    copy(collect_ptr + i * element_width, arr_ptr[i]);
                }

                // Copy to original array as sorted.
                @memcpy(array[0..(len * element_width)], collect_ptr[0..(len * element_width)]);
            } else {
                roc_panic("Out of memory while trying to allocate for sorting", 0);
            }
        } else {
            roc_panic("Out of memory while trying to allocate for sorting", 0);
        }
    }
}

fn quadsort_direct(
    array: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    alignment: u32,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    var arr_ptr = array;
    if (len < 32) {
        // TODO: This is a solid amount of stack space. Is that ok?
        // That said, it only ever allocates once (not recursive).
        // Aside from embedded, it is probably ok. Just 3 KB with a 96 byte MAX_ELEMENT_BUFFER_SIZE.
        // Also, zig doesn't have alloca, so we always do max size here.
        var swap_buffer: [MAX_ELEMENT_BUFFER_SIZE * 32]u8 align(BufferAlign) = undefined;
        const swap = @as([*]u8, @ptrCast(&swap_buffer[0]));
        tail_swap(arr_ptr, len, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    } else if (quad_swap(arr_ptr, len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect) != .sorted) {
        var swap_len = len;

        // This is optional, for about a 5% perf hit, lower memory usage on large arrays.
        // if (len > 4194304) {
        //     swap_len = 4194304;
        //     while (swap_len * 8 <= len) : (swap_len *= 4) {}
        // }

        if (utils.alloc(swap_len * element_width, alignment)) |swap| {
            const block_len = quad_merge(arr_ptr, len, swap, swap_len, 32, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

            rotate_merge(arr_ptr, len, swap, swap_len, block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

            utils.dealloc(swap, alignment);
        } else {
            // Fallback to still sort even when out of memory.
            @call(.never_inline, quadsort_stack_swap, .{ arr_ptr, len, cmp, cmp_data, data_is_owned, inc_n_data, element_width, copy, indirect });
        }
    }
}

fn quadsort_stack_swap(
    array: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    element_width: usize,
    copy: CopyFn,
    comptime indirect: bool,
) void {
    // Use a 512 element on-stack swap buffer.
    var swap_buffer: [MAX_ELEMENT_BUFFER_SIZE * 512]u8 align(BufferAlign) = undefined;
    const swap = @as([*]u8, @ptrCast(&swap_buffer[0]));

    const block_len = quad_merge(array, len, swap, 512, 32, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

    rotate_merge(array, len, swap, 512, block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
}

// ================ Inplace Rotate Merge ======================================
// These are used as backup if the swap size is not large enough.
// Also can be used for the final merge to reduce memory footprint.

fn rotate_merge(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    swap_len: usize,
    block_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    var end_ptr = array + len * element_width;

    if (len <= block_len * 2 and len -% block_len <= swap_len) {
        partial_backwards_merge(array, len, swap, swap_len, block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        return;
    }

    var current_block_len = block_len;
    while (current_block_len < len) : (current_block_len *= 2) {
        var arr_ptr = array;
        while (@intFromPtr(arr_ptr) + current_block_len * element_width < @intFromPtr(end_ptr)) : (arr_ptr += current_block_len * 2 * element_width) {
            if (@intFromPtr(arr_ptr) + current_block_len * 2 * element_width < @intFromPtr(end_ptr)) {
                rotate_merge_block(arr_ptr, swap, swap_len, current_block_len, current_block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
                continue;
            }
            const right_len = (@intFromPtr(end_ptr) - @intFromPtr(arr_ptr)) / element_width - current_block_len;
            rotate_merge_block(arr_ptr, swap, swap_len, current_block_len, right_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            break;
        }
    }
}
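
// A hedged sketch (not from the upstream file): two pre-sorted halves of 16, a
// swap buffer big enough for the merge, and block_len = 16. With len <= 2 *
// block_len and the right half fitting in swap, this reduces to a single
// partial_backwards_merge. Assumes the count-balancing test helpers defined
// with the other tests in this file.
test "rotate_merge - two sorted halves (sketch)" {
    var test_count: i64 = 0;

    var arr = [32]i64{
        1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31,
        2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32,
    };
    var swap: [32]i64 = undefined;

    rotate_merge(@as([*]u8, @ptrCast(&arr[0])), 32, @as([*]u8, @ptrCast(&swap[0])), 32, 16, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);

    try testing.expectEqual(test_count, 0);
    for (0..32) |i| {
        try testing.expectEqual(arr[i], @as(i64, @intCast(i + 1)));
    }
}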

/// Merges two blocks together while only using limited memory.
fn rotate_merge_block(
    array: [*]u8,
    swap: [*]u8,
    swap_len: usize,
    initial_left_block: usize,
    initial_right: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    var left_block = initial_left_block;
    var right = initial_right;
    // 1 guaranteed compare.
    if (data_is_owned) {
        inc_n_data(cmp_data, 1);
    }
    if (compare(cmp, cmp_data, array + (left_block - 1) * element_width, array + left_block * element_width, indirect) != GT) {
        // Lucky us, already sorted.
        return;
    }

    var right_block = left_block / 2;
    left_block -= right_block;

    var left = monobound_binary_first(array + (left_block + right_block) * element_width, right, array + left_block * element_width, cmp, cmp_data, element_width, data_is_owned, inc_n_data, indirect);
    right -= left;

    if (left != 0) {
        if (left_block + left <= swap_len) {
            @memcpy(swap[0..(left_block * element_width)], array[0..(left_block * element_width)]);
            @memcpy((swap + left_block * element_width)[0..(left * element_width)], (array + (left_block + right_block) * element_width)[0..(left * element_width)]);
            std.mem.copyBackwards(u8, (array + (left + left_block) * element_width)[0..(right_block * element_width)], (array + left_block * element_width)[0..(right_block * element_width)]);

            cross_merge(array, swap, left_block, left, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        } else {
            trinity_rotation(array + left_block * element_width, right_block + left, swap, swap_len, right_block, element_width, copy);

            const unbalanced = (left * 2 < left_block) or (left_block * 2 < left);
            if (unbalanced and left <= swap_len) {
                partial_backwards_merge(array, left_block + left, swap, swap_len, left_block, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            } else if (unbalanced and left_block <= swap_len) {
                partial_forward_merge(array, left_block + left, swap, swap_len, left_block, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            } else {
                rotate_merge_block(array, swap, swap_len, left_block, left, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            }
        }
    }

    if (right != 0) {
        const unbalanced = (right * 2 < right_block) or (right_block * 2 < right);
        if ((unbalanced and right <= swap_len) or right + right_block <= swap_len) {
            partial_backwards_merge(array + (left_block + left) * element_width, right_block + right, swap, swap_len, right_block, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        } else if (unbalanced and left_block <= swap_len) {
            partial_forward_merge(array + (left_block + left) * element_width, right_block + right, swap, swap_len, right_block, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        } else {
            rotate_merge_block(array + (left_block + left) * element_width, swap, swap_len, right_block, right, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
    }
}

/// Binary search, but more cache friendly!
fn monobound_binary_first(
    array: [*]u8,
    initial_top: usize,
    value_ptr: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) usize {
    var top = initial_top;
    var end_ptr = array + top * element_width;

    if (data_is_owned) {
        // We need to increment log2 of n times.
        // We can get that by counting the leading zeros of (top - 1).
        // Needs to be `-1` so values that are powers of 2 don't round up an extra bin.
        // Then just add 1 back to the final result.
        const log2 = @bitSizeOf(usize) - @clz(top - 1) + 1;
        inc_n_data(cmp_data, log2);
    }
    while (top > 1) {
        const mid = top / 2;

        if (compare(cmp, cmp_data, value_ptr, end_ptr - mid * element_width, indirect) != GT) {
            end_ptr -= mid * element_width;
        }
        top -= mid;
    }

    if (compare(cmp, cmp_data, value_ptr, end_ptr - element_width, indirect) != GT) {
        end_ptr -= element_width;
    }
    return (@intFromPtr(end_ptr) - @intFromPtr(array)) / element_width;
}
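
// A hedged sketch (not from the upstream file): on a sorted array, the result
// is the index of the first element not less than the searched value. Assumes
// the count-balancing test helpers defined with the other tests in this file.
test "monobound_binary_first - first index >= value (sketch)" {
    var test_count: i64 = 0;

    var arr = [8]i64{ 1, 2, 4, 4, 4, 8, 9, 10 };
    var value: i64 = 4;

    const index = monobound_binary_first(@as([*]u8, @ptrCast(&arr[0])), 8, @ptrCast(&value), &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), true, &test_inc_n_data, false);

    try testing.expectEqual(test_count, 0);
    try testing.expectEqual(index, 2);
}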
|
|
|
|
/// Swap two neighboring chunks of an array quickly with limited memory.
fn trinity_rotation(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    full_swap_len: usize,
    left_len: usize,
    element_width: usize,
    copy: CopyFn,
) void {
    var buffer: BufferType align(BufferAlign) = undefined;
    const tmp_ptr = @as([*]u8, @ptrCast(&buffer[0]));

    const right_len = len - left_len;

    var swap_len = full_swap_len;
    if (full_swap_len > 65536) {
        swap_len = 65536;
    }

    if (left_len < right_len) {
        if (left_len <= swap_len) {
            @memcpy(swap[0..(element_width * left_len)], array[0..(element_width * left_len)]);
            std.mem.copyForwards(u8, array[0..(element_width * right_len)], (array + left_len * element_width)[0..(element_width * right_len)]);
            @memcpy((array + right_len * element_width)[0..(element_width * left_len)], swap[0..(element_width * left_len)]);
        } else {
            var a_ptr = array;
            var b_ptr = a_ptr + left_len * element_width;

            var bridge = right_len - left_len;
            if (bridge <= swap_len and bridge > 3) {
                var c_ptr = a_ptr + right_len * element_width;
                var d_ptr = c_ptr + left_len * element_width;

                @memcpy(swap[0..(bridge * element_width)], b_ptr[0..(bridge * element_width)]);

                for (0..left_len) |_| {
                    c_ptr -= element_width;
                    d_ptr -= element_width;
                    copy(c_ptr, d_ptr);
                    b_ptr -= element_width;
                    copy(d_ptr, b_ptr);
                }
                @memcpy(a_ptr[0..(bridge * element_width)], swap[0..(bridge * element_width)]);
            } else {
                var c_ptr = b_ptr;
                var d_ptr = c_ptr + right_len * element_width;

                bridge = left_len / 2;

                for (0..bridge) |_| {
                    b_ptr -= element_width;
                    copy(tmp_ptr, b_ptr);
                    copy(b_ptr, a_ptr);
                    copy(a_ptr, c_ptr);
                    a_ptr += element_width;
                    d_ptr -= element_width;
                    copy(c_ptr, d_ptr);
                    c_ptr += element_width;
                    copy(d_ptr, tmp_ptr);
                }

                bridge = (@intFromPtr(d_ptr) - @intFromPtr(c_ptr)) / (element_width * 2);
                for (0..bridge) |_| {
                    copy(tmp_ptr, c_ptr);
                    d_ptr -= element_width;
                    copy(c_ptr, d_ptr);
                    c_ptr += element_width;
                    copy(d_ptr, a_ptr);
                    copy(a_ptr, tmp_ptr);
                    a_ptr += element_width;
                }

                bridge = (@intFromPtr(d_ptr) - @intFromPtr(a_ptr)) / (element_width * 2);
                for (0..bridge) |_| {
                    copy(tmp_ptr, a_ptr);
                    d_ptr -= element_width;
                    copy(a_ptr, d_ptr);
                    a_ptr += element_width;
                    copy(d_ptr, tmp_ptr);
                }
            }
        }
    } else if (right_len < left_len) {
        if (right_len <= swap_len) {
            @memcpy(swap[0..(element_width * right_len)], (array + left_len * element_width)[0..(element_width * right_len)]);
            std.mem.copyBackwards(u8, (array + right_len * element_width)[0..(element_width * left_len)], array[0..(element_width * left_len)]);
            @memcpy(array[0..(element_width * right_len)], swap[0..(element_width * right_len)]);
        } else {
            var a_ptr = array;
            var b_ptr = a_ptr + left_len * element_width;

            var bridge = left_len - right_len;
            if (bridge <= swap_len and bridge > 3) {
                var c_ptr = a_ptr + right_len * element_width;
                var d_ptr = c_ptr + left_len * element_width;

                @memcpy(swap[0..(bridge * element_width)], c_ptr[0..(bridge * element_width)]);

                for (0..right_len) |_| {
                    copy(c_ptr, a_ptr);
                    c_ptr += element_width;
                    copy(a_ptr, b_ptr);
                    a_ptr += element_width;
                    b_ptr += element_width;
                }
                @memcpy((d_ptr - bridge * element_width)[0..(bridge * element_width)], swap[0..(bridge * element_width)]);
            } else {
                var c_ptr = b_ptr;
                var d_ptr = c_ptr + right_len * element_width;

                bridge = right_len / 2;

                for (0..bridge) |_| {
                    b_ptr -= element_width;
                    copy(tmp_ptr, b_ptr);
                    copy(b_ptr, a_ptr);
                    copy(a_ptr, c_ptr);
                    a_ptr += element_width;
                    d_ptr -= element_width;
                    copy(c_ptr, d_ptr);
                    c_ptr += element_width;
                    copy(d_ptr, tmp_ptr);
                }

                bridge = (@intFromPtr(b_ptr) - @intFromPtr(a_ptr)) / (element_width * 2);
                for (0..bridge) |_| {
                    b_ptr -= element_width;
                    copy(tmp_ptr, b_ptr);
                    copy(b_ptr, a_ptr);
                    d_ptr -= element_width;
                    copy(a_ptr, d_ptr);
                    a_ptr += element_width;
                    copy(d_ptr, tmp_ptr);
                }

                bridge = (@intFromPtr(d_ptr) - @intFromPtr(a_ptr)) / (element_width * 2);
                for (0..bridge) |_| {
                    copy(tmp_ptr, a_ptr);
                    d_ptr -= element_width;
                    copy(a_ptr, d_ptr);
                    a_ptr += element_width;
                    copy(d_ptr, tmp_ptr);
                }
            }
        }
    } else {
        var left_ptr = array;
        var right_ptr = left_ptr + left_len * element_width;

        for (0..left_len) |_| {
            copy(tmp_ptr, left_ptr);
            copy(left_ptr, right_ptr);
            left_ptr += element_width;
            copy(right_ptr, tmp_ptr);
            right_ptr += element_width;
        }
    }
}

test "rotate_merge" {
|
|
const expected = [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
|
|
var test_count: i64 = 0;
|
|
|
|
var arr: [10]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap: [10]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
arr = [10]i64{ 7, 8, 5, 6, 3, 4, 1, 2, 9, 10 };
|
|
rotate_merge(arr_ptr, 10, swap_ptr, 10, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 7, 8, 5, 6, 3, 4, 1, 9, 2, 10 };
|
|
rotate_merge(arr_ptr, 9, swap_ptr, 9, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 3, 4, 6, 9, 1, 2, 5, 10, 7, 8 };
|
|
rotate_merge(arr_ptr, 10, swap_ptr, 10, 4, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
// Limited swap, can't finish merge
|
|
arr = [10]i64{ 7, 8, 5, 6, 3, 4, 1, 9, 2, 10 };
|
|
rotate_merge(arr_ptr, 10, swap_ptr, 4, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
}
|
|
|
|
test "monobound_binary_first" {
|
|
var test_count: i64 = 0;
|
|
|
|
var arr = [25]i64{ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49 };
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var value: i64 = undefined;
|
|
var value_ptr = @as([*]u8, @ptrCast(&value));
|
|
|
|
value = 7;
|
|
var res = monobound_binary_first(arr_ptr, 25, value_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(res, 3);
|
|
|
|
value = 39;
|
|
res = monobound_binary_first(arr_ptr, 25, value_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(res, 19);
|
|
|
|
value = 40;
|
|
res = monobound_binary_first(arr_ptr, 25, value_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(res, 20);
|
|
|
|
value = -10;
|
|
res = monobound_binary_first(arr_ptr, 25, value_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(res, 0);
|
|
|
|
value = 10000;
|
|
res = monobound_binary_first(arr_ptr, 25, value_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(res, 25);
|
|
}
|
|
|
|
test "trinity_rotation" {
|
|
{
|
|
var arr: [10]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap: [10]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
// Even.
|
|
arr = [10]i64{ 6, 7, 8, 9, 10, 1, 2, 3, 4, 5 };
|
|
trinity_rotation(arr_ptr, 10, swap_ptr, 10, 5, @sizeOf(i64), &test_i64_copy);
|
|
try testing.expectEqual(arr, [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 });
|
|
|
|
// left large, right fits in swap.
|
|
arr = [10]i64{ 3, 4, 5, 6, 7, 8, 9, 10, 1, 2 };
|
|
trinity_rotation(arr_ptr, 10, swap_ptr, 10, 8, @sizeOf(i64), &test_i64_copy);
|
|
try testing.expectEqual(arr, [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 });
|
|
|
|
// right large, left fits in swap.
|
|
arr = [10]i64{ 9, 10, 1, 2, 3, 4, 5, 6, 7, 8 };
|
|
trinity_rotation(arr_ptr, 10, swap_ptr, 10, 2, @sizeOf(i64), &test_i64_copy);
|
|
try testing.expectEqual(arr, [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 });
|
|
|
|
// left large, no swap.
|
|
arr = [10]i64{ 3, 4, 5, 6, 7, 8, 9, 10, 1, 2 };
|
|
trinity_rotation(arr_ptr, 10, swap_ptr, 0, 8, @sizeOf(i64), &test_i64_copy);
|
|
try testing.expectEqual(arr, [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 });
|
|
|
|
// right large, no swap.
|
|
arr = [10]i64{ 9, 10, 1, 2, 3, 4, 5, 6, 7, 8 };
|
|
trinity_rotation(arr_ptr, 10, swap_ptr, 0, 2, @sizeOf(i64), &test_i64_copy);
|
|
try testing.expectEqual(arr, [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 });
|
|
}
|
|
{
|
|
var arr: [16]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap: [5]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
// left larger, bridge in swap.
|
|
arr = [16]i64{ 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6 };
|
|
trinity_rotation(arr_ptr, 16, swap_ptr, 5, 10, @sizeOf(i64), &test_i64_copy);
|
|
try testing.expectEqual(arr, [16]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 });
|
|
|
|
// // right large, bridge in swap.
|
|
arr = [16]i64{ 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
|
|
trinity_rotation(arr_ptr, 16, swap_ptr, 5, 6, @sizeOf(i64), &test_i64_copy);
|
|
try testing.expectEqual(arr, [16]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 });
|
|
}
|
|
}
|
|
|
|
// ================ Unbalanced Merges =========================================
/// Merges the remaining blocks at the tail of the array.
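/// Doubles the sorted block size each pass until it reaches `len` or no
/// longer fits in the swap buffer.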
fn tail_merge(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    swap_len: usize,
    block_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    const end_ptr = array + len * element_width;
    var current_block_len = block_len;
    while (current_block_len < len and current_block_len <= swap_len) : (current_block_len *= 2) {
        var arr_ptr = array;
        while (@intFromPtr(arr_ptr) + current_block_len * element_width < @intFromPtr(end_ptr)) : (arr_ptr += 2 * current_block_len * element_width) {
            if (@intFromPtr(arr_ptr) + 2 * current_block_len * element_width < @intFromPtr(end_ptr)) {
                partial_backwards_merge(arr_ptr, 2 * current_block_len, swap, swap_len, current_block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
                continue;
            }
            const rem_len = (@intFromPtr(end_ptr) - @intFromPtr(arr_ptr)) / element_width;
            partial_backwards_merge(arr_ptr, rem_len, swap, swap_len, current_block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            break;
        }
    }
}

/// Merges a full left block with a smaller than block size right chunk.
/// The merge goes from tail to head.
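/// Asserts that the swap buffer can hold at least `block_len` elements.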
fn partial_backwards_merge(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    swap_len: usize,
    block_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    std.debug.assert(swap_len >= block_len);

    if (len == block_len) {
        // Just a single block, already done.
        return;
    }

    var left_tail = array + (block_len - 1) * element_width;
    var dest_tail = array + (len - 1) * element_width;

    // 1 guaranteed compare.
    if (data_is_owned) {
        inc_n_data(cmp_data, 1);
    }
    if (compare(cmp, cmp_data, left_tail, left_tail + element_width, indirect) != GT) {
        // Lucky case, blocks happen to be sorted.
        return;
    }

    const right_len = len - block_len;
    if (len <= swap_len and right_len >= 64) {
        // Large remaining merge and we have enough space to just do it in swap.

        cross_merge(swap, array, block_len, right_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

        @memcpy(array[0..(element_width * len)], swap[0..(element_width * len)]);

        return;
    }

    @memcpy(swap[0..(element_width * right_len)], (array + block_len * element_width)[0..(element_width * right_len)]);

    var right_tail = swap + (right_len - 1) * element_width;

    // For the backwards merge, we first try to move really large chunks of 16 elements.
    outer: while (@intFromPtr(left_tail) > @intFromPtr(array + 16 * element_width) and @intFromPtr(right_tail) > @intFromPtr(swap + 16 * element_width)) {
        // Due to looping, these must use `compare_inc`.
        while (compare_inc(cmp, cmp_data, left_tail, right_tail - 15 * element_width, data_is_owned, inc_n_data, indirect) != GT) {
            inline for (0..16) |_| {
                copy(dest_tail, right_tail);
                dest_tail -= element_width;
                right_tail -= element_width;
            }
            if (@intFromPtr(right_tail) <= @intFromPtr(swap + 16 * element_width))
                break :outer;
        }
        // Due to looping, these must use `compare_inc`.
        while (compare_inc(cmp, cmp_data, left_tail - 15 * element_width, right_tail, data_is_owned, inc_n_data, indirect) == GT) {
            inline for (0..16) |_| {
                copy(dest_tail, left_tail);
                dest_tail -= element_width;
                left_tail -= element_width;
            }
            if (@intFromPtr(left_tail) <= @intFromPtr(array + 16 * element_width))
                break :outer;
        }
        // Attempt to deal with the rest of the chunk in groups of 2.
        var loops: usize = 8;
        while (true) {
            // Due to the if-else chain and uncertain call order, these must use `compare_inc`.
            if (compare_inc(cmp, cmp_data, left_tail, right_tail - element_width, data_is_owned, inc_n_data, indirect) != GT) {
                inline for (0..2) |_| {
                    copy(dest_tail, right_tail);
                    dest_tail -= element_width;
                    right_tail -= element_width;
                }
            } else if (compare_inc(cmp, cmp_data, left_tail - element_width, right_tail, data_is_owned, inc_n_data, indirect) == GT) {
                inline for (0..2) |_| {
                    copy(dest_tail, left_tail);
                    dest_tail -= element_width;
                    left_tail -= element_width;
                }
            } else {
                // Couldn't move two elements, do a cross swap and continue.
                // 2 guaranteed compares.
                if (data_is_owned) {
                    inc_n_data(cmp_data, 2);
                }
                const lte = compare(cmp, cmp_data, left_tail, right_tail, indirect) != GT;
                var x = if (lte) element_width else 0;
                var not_x = if (!lte) element_width else 0;
                dest_tail -= element_width;
                copy(dest_tail + x, right_tail);
                right_tail -= element_width;
                copy(dest_tail + not_x, left_tail);
                left_tail -= element_width;
                dest_tail -= element_width;

                tail_branchless_merge(&dest_tail, &left_tail, &right_tail, cmp, cmp_data, element_width, copy, indirect);
            }

            loops -= 1;
            if (loops == 0)
                break;
        }
    }

    // For the rest of the tail, attempt to merge 2 elements at a time from tail to head.
    while (@intFromPtr(right_tail) > @intFromPtr(swap) + element_width and @intFromPtr(left_tail) > @intFromPtr(array) + element_width) {
        // Note: I am not sure how to get the same generation as the original C.
        // This implementation has an extra function call here.
        // The C uses `goto` to implement the two tail recursive functions below inline.
        // I think the closest equivalent in zig would be to use an enum and a switch.
        // That would potentially optimize to computed gotos.
        const break_loop = partial_forward_merge_right_tail_2(&dest_tail, &array, &left_tail, &swap, &right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        if (break_loop)
            break;

        // 2 guaranteed compares.
        if (data_is_owned) {
            inc_n_data(cmp_data, 2);
        }
        // Couldn't move two elements, do a cross swap and continue.
        const lte = compare(cmp, cmp_data, left_tail, right_tail, indirect) != GT;
        var x = if (lte) element_width else 0;
        var not_x = if (!lte) element_width else 0;
        dest_tail -= element_width;
        copy(dest_tail + x, right_tail);
        right_tail -= element_width;
        copy(dest_tail + not_x, left_tail);
        left_tail -= element_width;
        dest_tail -= element_width;

        tail_branchless_merge(&dest_tail, &left_tail, &right_tail, cmp, cmp_data, element_width, copy, indirect);
    }

    // Deal with the tail.
    while (@intFromPtr(right_tail) >= @intFromPtr(swap) and @intFromPtr(left_tail) >= @intFromPtr(array)) {
        // This feels like a place where we may be able to reduce inc_n_data calls.
        // 1 guaranteed compare.
        if (data_is_owned) {
            inc_n_data(cmp_data, 1);
        }
        tail_branchless_merge(&dest_tail, &left_tail, &right_tail, cmp, cmp_data, element_width, copy, indirect);
    }
    while (@intFromPtr(right_tail) >= @intFromPtr(swap)) {
        copy(dest_tail, right_tail);
        dest_tail -= element_width;
        right_tail -= element_width;
    }
}

// The following two functions are exactly the same but with the if blocks swapped.
// They hot loop on one side until it fails, then switch to the other list.

fn partial_forward_merge_right_tail_2(
    dest: *[*]u8,
    left_head: *const [*]u8,
    left_tail: *[*]u8,
    right_head: *const [*]u8,
    right_tail: *[*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) bool {
    if (compare_inc(cmp, cmp_data, left_tail.*, right_tail.* - element_width, data_is_owned, inc_n_data, indirect) != GT) {
        inline for (0..2) |_| {
            copy(dest.*, right_tail.*);
            dest.* -= element_width;
            right_tail.* -= element_width;
        }
        if (@intFromPtr(right_tail.*) > @intFromPtr(right_head.*) + element_width) {
            return partial_forward_merge_right_tail_2(dest, left_head, left_tail, right_head, right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
        return true;
    }
    if (compare_inc(cmp, cmp_data, left_tail.* - element_width, right_tail.*, data_is_owned, inc_n_data, indirect) == GT) {
        inline for (0..2) |_| {
            copy(dest.*, left_tail.*);
            dest.* -= element_width;
            left_tail.* -= element_width;
        }
        if (@intFromPtr(left_tail.*) > @intFromPtr(left_head.*) + element_width) {
            return partial_forward_merge_left_tail_2(dest, left_head, left_tail, right_head, right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
        return true;
    }
    return false;
}

fn partial_forward_merge_left_tail_2(
    dest: *[*]u8,
    left_head: *const [*]u8,
    left_tail: *[*]u8,
    right_head: *const [*]u8,
    right_tail: *[*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) bool {
    if (compare_inc(cmp, cmp_data, left_tail.* - element_width, right_tail.*, data_is_owned, inc_n_data, indirect) == GT) {
        inline for (0..2) |_| {
            copy(dest.*, left_tail.*);
            dest.* -= element_width;
            left_tail.* -= element_width;
        }
        if (@intFromPtr(left_tail.*) > @intFromPtr(left_head.*) + element_width) {
            return partial_forward_merge_left_tail_2(dest, left_head, left_tail, right_head, right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
        return true;
    }
    if (compare_inc(cmp, cmp_data, left_tail.*, right_tail.* - element_width, data_is_owned, inc_n_data, indirect) != GT) {
        inline for (0..2) |_| {
            copy(dest.*, right_tail.*);
            dest.* -= element_width;
            right_tail.* -= element_width;
        }
        if (@intFromPtr(right_tail.*) > @intFromPtr(right_head.*) + element_width) {
            return partial_forward_merge_right_tail_2(dest, left_head, left_tail, right_head, right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
        return true;
    }
    return false;
}

/// Merges a full left block with a smaller than block size right chunk.
/// The merge goes from head to tail.
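/// Asserts that the swap buffer can hold at least `block_len` elements.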
fn partial_forward_merge(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    swap_len: usize,
    block_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    std.debug.assert(swap_len >= block_len);

    if (len == block_len) {
        // Just a single block, already done.
        return;
    }

    var right_head = array + block_len * element_width;
    var right_tail = array + (len - 1) * element_width;

    // 1 guaranteed compare.
    if (data_is_owned) {
        inc_n_data(cmp_data, 1);
    }
    if (compare(cmp, cmp_data, right_head - element_width, right_head, indirect) != GT) {
        // Lucky case, blocks happen to be sorted.
        return;
    }

    @memcpy(swap[0..(element_width * block_len)], array[0..(element_width * block_len)]);

    var left_head = swap;
    var left_tail = swap + (block_len - 1) * element_width;

    var dest_head = array;
    // Attempt to merge 2 elements at a time from head then tail.
    while (@intFromPtr(left_head) < @intFromPtr(left_tail) - element_width and @intFromPtr(right_head) < @intFromPtr(right_tail) - element_width) {
        // Note: I am not sure how to get the same generation as the original C.
        // This implementation has an extra function call here.
        // The C uses `goto` to implement the two tail recursive functions below inline.
        // I think the closest equivalent in zig would be to use an enum and a switch.
        // That would potentially optimize to computed gotos.
        const break_loop = partial_forward_merge_right_head_2(&dest_head, &left_head, &left_tail, &right_head, &right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        if (break_loop)
            break;

        // 2 guaranteed compares.
        if (data_is_owned) {
            inc_n_data(cmp_data, 2);
        }
        // Couldn't move two elements, do a cross swap and continue.
        const lte = compare(cmp, cmp_data, left_head, right_head, indirect) != GT;
        var x = if (lte) element_width else 0;
        var not_x = if (!lte) element_width else 0;
        copy(dest_head + x, right_head);
        right_head += element_width;
        copy(dest_head + not_x, left_head);
        left_head += element_width;
        dest_head += 2 * element_width;

        head_branchless_merge(&dest_head, &left_head, &right_head, cmp, cmp_data, element_width, copy, indirect);
    }

    // Deal with the tail.
    while (@intFromPtr(left_head) <= @intFromPtr(left_tail) and @intFromPtr(right_head) <= @intFromPtr(right_tail)) {
        // This feels like a place where we may be able to reduce inc_n_data calls.
        // 1 guaranteed compare.
        if (data_is_owned) {
            inc_n_data(cmp_data, 1);
        }
        head_branchless_merge(&dest_head, &left_head, &right_head, cmp, cmp_data, element_width, copy, indirect);
    }
    while (@intFromPtr(left_head) <= @intFromPtr(left_tail)) {
        copy(dest_head, left_head);
        dest_head += element_width;
        left_head += element_width;
    }
}

// The following two functions are exactly the same but with the if blocks swapped.
// They hot loop on one side until it fails, then switch to the other list.

fn partial_forward_merge_right_head_2(
    dest: *[*]u8,
    left_head: *[*]u8,
    left_tail: *const [*]u8,
    right_head: *[*]u8,
    right_tail: *const [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) bool {
    if (compare_inc(cmp, cmp_data, left_head.*, right_head.* + element_width, data_is_owned, inc_n_data, indirect) == GT) {
        inline for (0..2) |_| {
            copy(dest.*, right_head.*);
            dest.* += element_width;
            right_head.* += element_width;
        }
        if (@intFromPtr(right_head.*) < @intFromPtr(right_tail.*) - element_width) {
            return partial_forward_merge_right_head_2(dest, left_head, left_tail, right_head, right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
        return true;
    }
    if (compare_inc(cmp, cmp_data, left_head.* + element_width, right_head.*, data_is_owned, inc_n_data, indirect) != GT) {
        inline for (0..2) |_| {
            copy(dest.*, left_head.*);
            dest.* += element_width;
            left_head.* += element_width;
        }
        if (@intFromPtr(left_head.*) < @intFromPtr(left_tail.*) - element_width) {
            return partial_forward_merge_left_head_2(dest, left_head, left_tail, right_head, right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
        return true;
    }
    return false;
}

fn partial_forward_merge_left_head_2(
    dest: *[*]u8,
    left_head: *[*]u8,
    left_tail: *const [*]u8,
    right_head: *[*]u8,
    right_tail: *const [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) bool {
    if (compare_inc(cmp, cmp_data, left_head.* + element_width, right_head.*, data_is_owned, inc_n_data, indirect) != GT) {
        inline for (0..2) |_| {
            copy(dest.*, left_head.*);
            dest.* += element_width;
            left_head.* += element_width;
        }
        if (@intFromPtr(left_head.*) < @intFromPtr(left_tail.*) - element_width) {
            return partial_forward_merge_left_head_2(dest, left_head, left_tail, right_head, right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
        return true;
    }
    if (compare_inc(cmp, cmp_data, left_head.*, right_head.* + element_width, data_is_owned, inc_n_data, indirect) == GT) {
        inline for (0..2) |_| {
            copy(dest.*, right_head.*);
            dest.* += element_width;
            right_head.* += element_width;
        }
        if (@intFromPtr(right_head.*) < @intFromPtr(right_tail.*) - element_width) {
            return partial_forward_merge_right_head_2(dest, left_head, left_tail, right_head, right_tail, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        }
        return true;
    }
    return false;
}

test "tail_merge" {
|
|
var test_count: i64 = 0;
|
|
const expected = [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
|
|
|
|
var arr: [10]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap: [10]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
arr = [10]i64{ 7, 8, 5, 6, 3, 4, 1, 2, 9, 10 };
|
|
tail_merge(arr_ptr, 10, swap_ptr, 10, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 7, 8, 5, 6, 3, 4, 1, 2, 9, 10 };
|
|
tail_merge(arr_ptr, 9, swap_ptr, 9, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 3, 4, 6, 9, 1, 2, 5, 10, 7, 8 };
|
|
tail_merge(arr_ptr, 10, swap_ptr, 10, 4, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
}
|
|
|
|
test "partial_backwards_merge" {
|
|
var test_count: i64 = 0;
|
|
{
|
|
const expected = [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
|
|
|
|
var arr: [10]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap: [10]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
arr = [10]i64{ 3, 4, 5, 6, 7, 8, 1, 2, 9, 10 };
|
|
partial_backwards_merge(arr_ptr, 10, swap_ptr, 10, 6, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 2, 4, 6, 8, 9, 10, 1, 3, 5, 7 };
|
|
partial_backwards_merge(arr_ptr, 10, swap_ptr, 10, 6, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 1, 2, 3, 4, 5, 6, 8, 9, 10, 7 };
|
|
partial_backwards_merge(arr_ptr, 10, swap_ptr, 10, 9, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 1, 2, 4, 5, 6, 8, 9, 3, 7, 10 };
|
|
partial_backwards_merge(arr_ptr, 10, swap_ptr, 9, 7, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
}
|
|
|
|
{
|
|
var expected: [64]i64 = undefined;
|
|
for (0..64) |i| {
|
|
expected[i] = @intCast(i + 1);
|
|
}
|
|
|
|
var arr: [64]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap: [64]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
// chunks
|
|
for (0..16) |i| {
|
|
arr[i] = @intCast(i + 17);
|
|
}
|
|
for (0..16) |i| {
|
|
arr[i + 16] = @intCast(i + 49);
|
|
}
|
|
for (0..16) |i| {
|
|
arr[i + 32] = @intCast(i + 1);
|
|
}
|
|
for (0..16) |i| {
|
|
arr[i + 48] = @intCast(i + 33);
|
|
}
|
|
partial_backwards_merge(arr_ptr, 64, swap_ptr, 64, 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
// chunks with break
|
|
for (0..16) |i| {
|
|
arr[i] = @intCast(i + 17);
|
|
}
|
|
for (0..16) |i| {
|
|
arr[i + 32] = @intCast(i + 1);
|
|
}
|
|
for (0..16) |i| {
|
|
arr[i + 16] = @intCast(i + 49);
|
|
}
|
|
for (0..16) |i| {
|
|
arr[i + 48] = @intCast(i + 34);
|
|
}
|
|
arr[16] = 33;
|
|
arr[63] = 49;
|
|
|
|
partial_backwards_merge(arr_ptr, 64, swap_ptr, 64, 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
}
|
|
}
|
|
|
|
test "partial_forward_merge" {
|
|
var test_count: i64 = 0;
|
|
const expected = [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
|
|
|
|
var arr: [10]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap: [10]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
arr = [10]i64{ 3, 4, 5, 6, 7, 8, 1, 2, 9, 10 };
|
|
partial_forward_merge(arr_ptr, 10, swap_ptr, 10, 6, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 2, 4, 6, 8, 9, 10, 1, 3, 5, 7 };
|
|
partial_forward_merge(arr_ptr, 10, swap_ptr, 10, 6, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 1, 2, 3, 4, 5, 6, 8, 9, 10, 7 };
|
|
partial_forward_merge(arr_ptr, 10, swap_ptr, 10, 9, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
arr = [10]i64{ 1, 2, 4, 5, 6, 8, 9, 3, 7, 10 };
|
|
partial_forward_merge(arr_ptr, 10, swap_ptr, 9, 7, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
}
|
|
|
|
// ================ Quad Merge Support ========================================
/// Merges an array of evenly sized blocks of sorted elements with a tail.
/// Returns the block length of sorted runs after the call.
/// This is needed if the merge ran out of swap space.
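/// Note: the returned block length can be larger than `len` once the final
/// merge has completed (see the `quad_merge` test below).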
fn quad_merge(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    swap_len: usize,
    block_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) usize {
    const end_ptr = array + len * element_width;
    var current_block_len = block_len * 4;

    while (current_block_len <= len and current_block_len <= swap_len) : (current_block_len *= 4) {
        var arr_ptr = array;
        while (true) {
            quad_merge_block(arr_ptr, swap, current_block_len / 4, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

            arr_ptr += current_block_len * element_width;
            if (@intFromPtr(arr_ptr) + current_block_len * element_width > @intFromPtr(end_ptr))
                break;
        }

        const rem_len = (@intFromPtr(end_ptr) - @intFromPtr(arr_ptr)) / element_width;
        tail_merge(arr_ptr, rem_len, swap, swap_len, current_block_len / 4, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    }

    tail_merge(array, len, swap, swap_len, current_block_len / 4, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

    return current_block_len / 2;
}

/// Merges 4 even sized blocks of sorted elements.
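/// Two compares classify which halves are already in order, so only the
/// merges that are actually needed get run before the final combining merge.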
fn quad_merge_block(
    array: [*]u8,
    swap: [*]u8,
    block_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    const block_x_2 = 2 * block_len;

    const block1 = array;
    const block2 = block1 + block_len * element_width;
    const block3 = block2 + block_len * element_width;
    const block4 = block3 + block_len * element_width;

    // 2 guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 2);
    }
    const in_order_1_2: u2 = @intFromBool(compare(cmp, cmp_data, block2 - element_width, block2, indirect) != GT);
    const in_order_3_4: u2 = @intFromBool(compare(cmp, cmp_data, block4 - element_width, block4, indirect) != GT);

    switch (in_order_1_2 | (in_order_3_4 << 1)) {
        0 => {
            // Nothing sorted. Just run merges on both.
            cross_merge(swap, array, block_len, block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            cross_merge(swap + block_x_2 * element_width, block3, block_len, block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        1 => {
            // First half sorted already.
            @memcpy(swap[0..(element_width * block_x_2)], array[0..(element_width * block_x_2)]);
            cross_merge(swap + block_x_2 * element_width, block3, block_len, block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        2 => {
            // Second half sorted already.
            cross_merge(swap, array, block_len, block_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            @memcpy((swap + element_width * block_x_2)[0..(element_width * block_x_2)], block3[0..(element_width * block_x_2)]);
        },
        3 => {
            // 1 guaranteed compare.
            if (data_is_owned) {
                inc_n_data(cmp_data, 1);
            }
            const in_order_2_3 = compare(cmp, cmp_data, block3 - element_width, block3, indirect) != GT;
            if (in_order_2_3)
                // Lucky, all sorted.
                return;

            // Copy everything into swap to merge back into this array.
            @memcpy(swap[0..(element_width * block_x_2 * 2)], array[0..(element_width * block_x_2 * 2)]);
        },
    }

    // Merge 2 larger blocks.
    cross_merge(array, swap, block_x_2, block_x_2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
}

/// Cross merge attempts to merge two arrays in chunks of multiple elements.
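/// If both runs look random and are of similar length, it falls back to the
/// branchless parity merge. Otherwise it copies chunks of 8 elements at a
/// time from whichever side allows it, working from both the head and tail.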
fn cross_merge(
    dest: [*]u8,
    src: [*]u8,
    left_len: usize,
    right_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    var left_head = src;
    var right_head = src + left_len * element_width;
    var left_tail = right_head - element_width;
    var right_tail = left_tail + right_len * element_width;

    // If the data looks too random and the sizes are similar,
    // fallback to the branchless parity merge.
    if (left_len + 1 >= right_len and right_len + 1 >= left_len and left_len >= 32) {
        const offset = 15 * element_width;
        // Due to short circuit logic, these must use `compare_inc`.
        if (compare_inc(cmp, cmp_data, left_head + offset, right_head, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, left_head, right_head + offset, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, left_tail, right_tail - offset, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, left_tail - offset, right_tail, data_is_owned, inc_n_data, indirect) != GT) {
            parity_merge(dest, src, left_len, right_len, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
            return;
        }
    }

    var dest_head = dest;
    var dest_tail = dest + (left_len + right_len - 1) * element_width;

    outer: while (true) {
        // This has to be allowed to go negative to be correct. Thus, isize.
        if (@as(isize, @intCast(@intFromPtr(left_tail))) - @as(isize, @intCast(@intFromPtr(left_head))) > @as(isize, @intCast(8 * element_width))) {
            // 8 elements are all less than or equal and can be moved together.
            // Due to looping, these must use `compare_inc`.
            while (compare_inc(cmp, cmp_data, left_head + 7 * element_width, right_head, data_is_owned, inc_n_data, indirect) != GT) {
                inline for (0..8) |_| {
                    copy(dest_head, left_head);
                    dest_head += element_width;
                    left_head += element_width;
                }
                if (@intFromPtr(left_tail) - @intFromPtr(left_head) <= 8 * element_width)
                    continue :outer;
            }

            // Attempt to do the same from the tail.
            // 8 elements are all greater than and can be moved together.
            // Due to looping, these must use `compare_inc`.
            while (compare_inc(cmp, cmp_data, left_tail - 7 * element_width, right_tail, data_is_owned, inc_n_data, indirect) == GT) {
                inline for (0..8) |_| {
                    copy(dest_tail, left_tail);
                    dest_tail -= element_width;
                    left_tail -= element_width;
                }
                if (@intFromPtr(left_tail) - @intFromPtr(left_head) <= 8 * element_width)
                    continue :outer;
            }
        }

        // Attempt to do the same for the right list.
        // This has to be allowed to go negative to be correct. Thus, isize.
        if (@as(isize, @intCast(@intFromPtr(right_tail))) - @as(isize, @intCast(@intFromPtr(right_head))) > @as(isize, @intCast(8 * element_width))) {
            // The left head is greater than the next 8 right elements, so they can be moved together.
            // Due to looping, these must use `compare_inc`.
            while (compare_inc(cmp, cmp_data, left_head, right_head + 7 * element_width, data_is_owned, inc_n_data, indirect) == GT) {
                inline for (0..8) |_| {
                    copy(dest_head, right_head);
                    dest_head += element_width;
                    right_head += element_width;
                }
                if (@intFromPtr(right_tail) - @intFromPtr(right_head) <= 8 * element_width)
                    continue :outer;
            }

            // Attempt to do the same from the tail.
            // The left tail is less than or equal to the last 8 right elements, so they can be moved together.
            // Due to looping, these must use `compare_inc`.
            while (compare_inc(cmp, cmp_data, left_tail, right_tail - 7 * element_width, data_is_owned, inc_n_data, indirect) != GT) {
                inline for (0..8) |_| {
                    copy(dest_tail, right_tail);
                    dest_tail -= element_width;
                    right_tail -= element_width;
                }
                if (@intFromPtr(right_tail) - @intFromPtr(right_head) <= 8 * element_width)
                    continue :outer;
            }
        }

        if (@intFromPtr(dest_tail) - @intFromPtr(dest_head) < 16 * element_width)
            break;

        // Large enough to warrant a two way merge.
        // 16 guaranteed compares.
        if (data_is_owned) {
            inc_n_data(cmp_data, 16);
        }
        for (0..8) |_| {
            head_branchless_merge(&dest_head, &left_head, &right_head, cmp, cmp_data, element_width, copy, indirect);
            tail_branchless_merge(&dest_tail, &left_tail, &right_tail, cmp, cmp_data, element_width, copy, indirect);
        }
    }

    // Clean up the tail.
    while (@intFromPtr(left_head) <= @intFromPtr(left_tail) and @intFromPtr(right_head) <= @intFromPtr(right_tail)) {
        // This feels like a place where we may be able to reduce inc_n_data calls.
        // 1 guaranteed compare.
        if (data_is_owned) {
            inc_n_data(cmp_data, 1);
        }
        head_branchless_merge(&dest_head, &left_head, &right_head, cmp, cmp_data, element_width, copy, indirect);
    }
    while (@intFromPtr(left_head) <= @intFromPtr(left_tail)) {
        copy(dest_head, left_head);
        dest_head += element_width;
        left_head += element_width;
    }
    while (@intFromPtr(right_head) <= @intFromPtr(right_tail)) {
        copy(dest_head, right_head);
        dest_head += element_width;
        right_head += element_width;
    }
}

test "quad_merge" {
|
|
var test_count: i64 = 0;
|
|
const expected = [10]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 };
|
|
|
|
var arr: [10]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap: [10]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
var size: usize = undefined;
|
|
|
|
arr = [10]i64{ 7, 8, 5, 6, 3, 4, 1, 2, 9, 10 };
|
|
size = quad_merge(arr_ptr, 10, swap_ptr, 10, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
try testing.expectEqual(size, 16);
|
|
|
|
arr = [10]i64{ 7, 8, 5, 6, 3, 4, 1, 9, 2, 10 };
|
|
size = quad_merge(arr_ptr, 9, swap_ptr, 9, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
try testing.expectEqual(size, 16);
|
|
|
|
arr = [10]i64{ 3, 4, 6, 9, 1, 2, 5, 10, 7, 8 };
|
|
size = quad_merge(arr_ptr, 10, swap_ptr, 10, 4, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
try testing.expectEqual(size, 8);
|
|
|
|
// Limited swap, can't finish merge
|
|
arr = [10]i64{ 7, 8, 5, 6, 3, 4, 1, 9, 2, 10 };
|
|
size = quad_merge(arr_ptr, 10, swap_ptr, 4, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [10]i64{ 1, 3, 4, 5, 6, 7, 8, 9, 2, 10 });
|
|
try testing.expectEqual(size, 4);
|
|
|
|
arr = [10]i64{ 7, 8, 5, 6, 3, 4, 1, 9, 2, 10 };
|
|
size = quad_merge(arr_ptr, 10, swap_ptr, 3, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [10]i64{ 5, 6, 7, 8, 1, 3, 4, 9, 2, 10 });
|
|
try testing.expectEqual(size, 4);
|
|
}
|
|
|
|
test "quad_merge_block" {
|
|
var test_count: i64 = 0;
|
|
const expected = [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 };
|
|
|
|
var arr: [8]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap: [8]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
// case 0 - totally unsorted
|
|
arr = [8]i64{ 7, 8, 5, 6, 3, 4, 1, 2 };
|
|
quad_merge_block(arr_ptr, swap_ptr, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
// case 1 - first half sorted
|
|
arr = [8]i64{ 5, 6, 7, 8, 3, 4, 1, 2 };
|
|
quad_merge_block(arr_ptr, swap_ptr, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
// case 2 - second half sorted
|
|
arr = [8]i64{ 7, 8, 5, 6, 1, 2, 3, 4 };
|
|
quad_merge_block(arr_ptr, swap_ptr, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
// case 3 both haves sorted
|
|
arr = [8]i64{ 1, 3, 5, 7, 2, 4, 6, 8 };
|
|
quad_merge_block(arr_ptr, swap_ptr, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
|
|
// case 3 - lucky, sorted
|
|
arr = [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 };
|
|
quad_merge_block(arr_ptr, swap_ptr, 2, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
// try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
}
|
|
|
|
test "cross_merge" {
|
|
var test_count: i64 = 0;
|
|
var expected: [64]i64 = undefined;
|
|
for (0..64) |i| {
|
|
expected[i] = @intCast(i + 1);
|
|
}
|
|
|
|
var src: [64]i64 = undefined;
|
|
var dest: [64]i64 = undefined;
|
|
var src_ptr = @as([*]u8, @ptrCast(&src[0]));
|
|
var dest_ptr = @as([*]u8, @ptrCast(&dest[0]));
|
|
|
|
// Opitimal case, ordered but swapped
|
|
for (0..32) |i| {
|
|
src[i] = @intCast(i + 33);
|
|
}
|
|
for (0..32) |i| {
|
|
src[i + 32] = @intCast(i + 1);
|
|
}
|
|
cross_merge(dest_ptr, src_ptr, 32, 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, expected);
|
|
|
|
// will fallback, every other
|
|
for (0..32) |i| {
|
|
src[i * 2] = @intCast(i * 2 + 1);
|
|
src[i * 2 + 1] = @intCast(i * 2 + 2);
|
|
}
|
|
cross_merge(dest_ptr, src_ptr, 32, 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, expected);
|
|
|
|
// super uneven
|
|
for (0..20) |i| {
|
|
src[i] = @intCast(i + 45);
|
|
}
|
|
for (0..44) |i| {
|
|
src[i + 20] = @intCast(i + 1);
|
|
}
|
|
cross_merge(dest_ptr, src_ptr, 20, 44, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, expected);
|
|
|
|
// chunks
|
|
for (0..16) |i| {
|
|
src[i] = @intCast(i + 17);
|
|
}
|
|
for (0..16) |i| {
|
|
src[i + 16] = @intCast(i + 49);
|
|
}
|
|
for (0..16) |i| {
|
|
src[i + 32] = @intCast(i + 1);
|
|
}
|
|
for (0..16) |i| {
|
|
src[i + 48] = @intCast(i + 33);
|
|
}
|
|
cross_merge(dest_ptr, src_ptr, 32, 32, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, expected);
|
|
}
|
|
|
|
// ================ 32 Element Blocks =========================================
const QuadSwapResult = enum {
    sorted,
    unfinished,
};
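
// `sorted` means this pass already left the entire array sorted (for example,
// a single fully reversed run that got flipped in place). `unfinished` means
// sorted blocks remain and the caller still needs to merge them.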
/// Starts with an unsorted array and turns it into sorted blocks of length 32.
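/// Scans groups of 8 elements, detecting runs that are already ordered or
/// strictly reversed so they can be kept or flipped wholesale instead of
/// being fully sorted.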
fn quad_swap(
    array: [*]u8,
    len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) QuadSwapResult {
    // TODO: This is a solid amount of stack space. Is that ok?
    // That said, it only ever allocates once (not recursive).
    // Aside from embedded, this is probably ok. It's just 3 KB with a 96 byte MAX_ELEMENT_BUFFER_SIZE.
    var swap_buffer: [MAX_ELEMENT_BUFFER_SIZE * 32]u8 align(BufferAlign) = undefined;
    const swap = @as([*]u8, @ptrCast(&swap_buffer[0]));
    var tmp_buffer: BufferType align(BufferAlign) = undefined;
    const tmp_ptr = @as([*]u8, @ptrCast(&tmp_buffer[0]));

    var arr_ptr = array;
    var reverse_head = arr_ptr;

    // First sort groups of 8 elements.
    var count = len / 8;
    var skip_tail_swap = false;
    outer: while (count != 0) {
        count -= 1;

        // 4 guaranteed compares.
        if (data_is_owned) {
            inc_n_data(cmp_data, 4);
        }
        var v1: u4 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 0 * element_width, arr_ptr + 1 * element_width, indirect) == GT);
        var v2: u4 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 2 * element_width, arr_ptr + 3 * element_width, indirect) == GT);
        var v3: u4 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 4 * element_width, arr_ptr + 5 * element_width, indirect) == GT);
        var v4: u4 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 6 * element_width, arr_ptr + 7 * element_width, indirect) == GT);

        // This is an attempt at computed gotos in zig.
        // Not yet sure if it will optimize as well as the raw gotos in C.
        const Cases = enum { ordered, reversed, not_ordered };
        var state: Cases = switch_state: {
            switch (v1 | (v2 << 1) | (v3 << 2) | (v4 << 3)) {
                0 => {
                    // potentially already ordered, check the rest!
                    // Due to short circuit logic, these must use `compare_inc`.
                    if (compare_inc(cmp, cmp_data, arr_ptr + 1 * element_width, arr_ptr + 2 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, arr_ptr + 3 * element_width, arr_ptr + 4 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, arr_ptr + 5 * element_width, arr_ptr + 6 * element_width, data_is_owned, inc_n_data, indirect) != GT) {
                        break :switch_state .ordered;
                    }
                    // 16 guaranteed compares.
                    if (data_is_owned) {
                        inc_n_data(cmp_data, 16);
                    }
                    quad_swap_merge(arr_ptr, swap, cmp, cmp_data, element_width, copy, indirect);

                    arr_ptr += 8 * element_width;
                    continue :outer;
                },
                15 => {
                    // potentially already reverse ordered, check the rest!
                    // Due to short circuit logic, these must use `compare_inc`.
                    if (compare_inc(cmp, cmp_data, arr_ptr + 1 * element_width, arr_ptr + 2 * element_width, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, arr_ptr + 3 * element_width, arr_ptr + 4 * element_width, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, arr_ptr + 5 * element_width, arr_ptr + 6 * element_width, data_is_owned, inc_n_data, indirect) == GT) {
                        reverse_head = arr_ptr;
                        break :switch_state .reversed;
                    }
                    break :switch_state .not_ordered;
                },
                else => {
                    break :switch_state .not_ordered;
                },
            }
        };
        while (true) {
            switch (state) {
                .not_ordered => {
                    inline for ([4]u4{ v1, v2, v3, v4 }) |v| {
                        const x = if (v == 0) element_width else 0;
                        const not_x = if (v != 0) element_width else 0;
                        copy(tmp_ptr, arr_ptr + x);
                        copy(arr_ptr, arr_ptr + not_x);
                        copy(arr_ptr + element_width, tmp_ptr);
                        arr_ptr += 2 * element_width;
                    }
                    arr_ptr -= 8 * element_width;

                    // 16 guaranteed compares.
                    if (data_is_owned) {
                        inc_n_data(cmp_data, 16);
                    }
                    quad_swap_merge(arr_ptr, swap, cmp, cmp_data, element_width, copy, indirect);

                    arr_ptr += 8 * element_width;
                    continue :outer;
                },
                .ordered => {
                    arr_ptr += 8 * element_width;

                    // 1 group was ordered, let's see if that continues!
                    if (count != 0) {
                        count -= 1;
                        // 4 guaranteed compares.
                        if (data_is_owned) {
                            inc_n_data(cmp_data, 4);
                        }
                        v1 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 0 * element_width, arr_ptr + 1 * element_width, indirect) == GT);
                        v2 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 2 * element_width, arr_ptr + 3 * element_width, indirect) == GT);
                        v3 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 4 * element_width, arr_ptr + 5 * element_width, indirect) == GT);
                        v4 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 6 * element_width, arr_ptr + 7 * element_width, indirect) == GT);
                        if (v1 | v2 | v3 | v4 != 0) {
                            // Sadly still not ordered, maybe reversed though?
                            // Due to short circuit logic, these must use `compare_inc`.
                            if (v1 + v2 + v3 + v4 == 4 and compare_inc(cmp, cmp_data, arr_ptr + 1 * element_width, arr_ptr + 2 * element_width, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, arr_ptr + 3 * element_width, arr_ptr + 4 * element_width, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, arr_ptr + 5 * element_width, arr_ptr + 6 * element_width, data_is_owned, inc_n_data, indirect) == GT) {
                                reverse_head = arr_ptr;
                                state = .reversed;
                                continue;
                            }
                            state = .not_ordered;
                            continue;
                        }
                        // Due to short circuit logic, these must use `compare_inc`.
                        if (compare_inc(cmp, cmp_data, arr_ptr + 1 * element_width, arr_ptr + 2 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, arr_ptr + 3 * element_width, arr_ptr + 4 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, arr_ptr + 5 * element_width, arr_ptr + 6 * element_width, data_is_owned, inc_n_data, indirect) != GT) {
                            state = .ordered;
                            continue;
                        }

                        // 16 guaranteed compares.
                        if (data_is_owned) {
                            inc_n_data(cmp_data, 16);
                        }
                        quad_swap_merge(arr_ptr, swap, cmp, cmp_data, element_width, copy, indirect);
                        arr_ptr += 8 * element_width;
                        continue :outer;
                    }
                    break :outer;
                },
                .reversed => {
                    arr_ptr += 8 * element_width;

                    // 1 group was reversed, let's see if that continues!
                    if (count != 0) {
                        count -= 1;
                        // 4 guaranteed compares.
                        if (data_is_owned) {
                            inc_n_data(cmp_data, 4);
                        }
                        v1 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 0 * element_width, arr_ptr + 1 * element_width, indirect) != GT);
                        v2 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 2 * element_width, arr_ptr + 3 * element_width, indirect) != GT);
                        v3 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 4 * element_width, arr_ptr + 5 * element_width, indirect) != GT);
                        v4 = @intFromBool(compare(cmp, cmp_data, arr_ptr + 6 * element_width, arr_ptr + 7 * element_width, indirect) != GT);
                        if (v1 | v2 | v3 | v4 != 0) {
                            // Sadly, not still reversed.
                            // So we just need to reverse up to this point, but not the current 8 element block.
                        } else {
                            // This also checks the boundary between this and the last block.
                            // Due to short circuit logic, these must use `compare_inc`.
                            if (compare_inc(cmp, cmp_data, arr_ptr - 1 * element_width, arr_ptr + 0 * element_width, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, arr_ptr + 1 * element_width, arr_ptr + 2 * element_width, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, arr_ptr + 3 * element_width, arr_ptr + 4 * element_width, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, arr_ptr + 5 * element_width, arr_ptr + 6 * element_width, data_is_owned, inc_n_data, indirect) == GT) {
                                // Multiple reversed blocks in a row!
                                state = .reversed;
                                continue;
                            }
                        }
                        // Actually fix up the reversed blocks.
                        quad_reversal(reverse_head, arr_ptr - element_width, element_width, copy);

                        // Since we already have v1 to v4, check the next block state.
                        // Due to short circuit logic, these must use `compare_inc`.
                        if (v1 + v2 + v3 + v4 == 4 and compare_inc(cmp, cmp_data, arr_ptr + 1 * element_width, arr_ptr + 2 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, arr_ptr + 3 * element_width, arr_ptr + 4 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, arr_ptr + 5 * element_width, arr_ptr + 6 * element_width, data_is_owned, inc_n_data, indirect) != GT) {
                            state = .ordered;
                            continue;
                        }
                        // Due to short circuit logic, these must use `compare_inc`.
                        if (v1 + v2 + v3 + v4 == 0 and compare_inc(cmp, cmp_data, arr_ptr + 1 * element_width, arr_ptr + 2 * element_width, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, arr_ptr + 3 * element_width, arr_ptr + 4 * element_width, data_is_owned, inc_n_data, indirect) == GT and compare_inc(cmp, cmp_data, arr_ptr + 5 * element_width, arr_ptr + 6 * element_width, data_is_owned, inc_n_data, indirect) == GT) {
                            reverse_head = arr_ptr;
                            state = .reversed;
                            continue;
                        }

                        // Just an unordered block, sort it in place.
                        inline for ([4]u4{ v1, v2, v3, v4 }) |v| {
                            const x = if (v == 0) element_width else 0;
                            const not_x = if (v != 0) element_width else 0;
                            copy(tmp_ptr, arr_ptr + not_x);
                            copy(arr_ptr, arr_ptr + x);
                            copy(arr_ptr + element_width, tmp_ptr);
                            arr_ptr += 2 * element_width;
                        }
                        arr_ptr -= 8 * element_width;

                        // Due to short circuit logic, these must use `compare_inc`.
                        if (compare_inc(cmp, cmp_data, arr_ptr + 1 * element_width, arr_ptr + 2 * element_width, data_is_owned, inc_n_data, indirect) == GT or compare_inc(cmp, cmp_data, arr_ptr + 3 * element_width, arr_ptr + 4 * element_width, data_is_owned, inc_n_data, indirect) == GT or compare_inc(cmp, cmp_data, arr_ptr + 5 * element_width, arr_ptr + 6 * element_width, data_is_owned, inc_n_data, indirect) == GT) {
                            // 16 guaranteed compares.
                            if (data_is_owned) {
                                inc_n_data(cmp_data, 16);
                            }
                            quad_swap_merge(arr_ptr, swap, cmp, cmp_data, element_width, copy, indirect);
                        }
                        arr_ptr += 8 * element_width;
                        continue :outer;
                    }

                    // Handle the tail block when reversing.
                    const rem = len % 8;
                    reverse_block: {
                        // Due to the chance of breaking and not running, these must use `compare_inc`.
                        if (rem == 7 and compare_inc(cmp, cmp_data, arr_ptr + 5 * element_width, arr_ptr + 6 * element_width, data_is_owned, inc_n_data, indirect) != GT)
                            break :reverse_block;
                        if (rem >= 6 and compare_inc(cmp, cmp_data, arr_ptr + 4 * element_width, arr_ptr + 5 * element_width, data_is_owned, inc_n_data, indirect) != GT)
                            break :reverse_block;
                        if (rem >= 5 and compare_inc(cmp, cmp_data, arr_ptr + 3 * element_width, arr_ptr + 4 * element_width, data_is_owned, inc_n_data, indirect) != GT)
                            break :reverse_block;
                        if (rem >= 4 and compare_inc(cmp, cmp_data, arr_ptr + 2 * element_width, arr_ptr + 3 * element_width, data_is_owned, inc_n_data, indirect) != GT)
                            break :reverse_block;
                        if (rem >= 3 and compare_inc(cmp, cmp_data, arr_ptr + 1 * element_width, arr_ptr + 2 * element_width, data_is_owned, inc_n_data, indirect) != GT)
                            break :reverse_block;
                        if (rem >= 2 and compare_inc(cmp, cmp_data, arr_ptr + 0 * element_width, arr_ptr + 1 * element_width, data_is_owned, inc_n_data, indirect) != GT)
                            break :reverse_block;
                        if (rem >= 1 and compare_inc(cmp, cmp_data, arr_ptr - 1 * element_width, arr_ptr + 0 * element_width, data_is_owned, inc_n_data, indirect) != GT)
                            break :reverse_block;
                        quad_reversal(reverse_head, arr_ptr + rem * element_width - element_width, element_width, copy);

                        // If we just reversed the entire array, it is sorted.
                        if (reverse_head == array)
                            return .sorted;

                        skip_tail_swap = true;
                        break :outer;
                    }
                    quad_reversal(reverse_head, arr_ptr - element_width, element_width, copy);

                    break :outer;
                },
            }
        }
    }
    if (!skip_tail_swap) {
        tail_swap(arr_ptr, len % 8, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    }

    // Group into 32 element blocks.
|
|
arr_ptr = array;
|
|
|
|
count = len / 32;
|
|
while (count != 0) : ({
|
|
count -= 1;
|
|
arr_ptr += 32 * element_width;
|
|
}) {
|
|
// Due to short circuit logic, these must use `compare_inc`
|
|
if (compare_inc(cmp, cmp_data, arr_ptr + 7 * element_width, arr_ptr + 8 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, arr_ptr + 15 * element_width, arr_ptr + 16 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, arr_ptr + 23 * element_width, arr_ptr + 24 * element_width, data_is_owned, inc_n_data, indirect) != GT) {
|
|
// Already in order.
|
|
continue;
|
|
}
|
|
parity_merge(swap, arr_ptr, 8, 8, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
|
|
parity_merge(swap + 16 * element_width, arr_ptr + 16 * element_width, 8, 8, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
|
|
parity_merge(arr_ptr, swap, 16, 16, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
|
|
}
|
|
|
|
// Deal with final tail for 32 element blocks.
|
|
// Anything over 8 elements is multiple blocks worth merging together.
|
|
if (len % 32 > 8) {
|
|
tail_merge(arr_ptr, len % 32, swap, 32, 8, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
|
|
}
|
|
|
|
return .unfinished;
|
|
}
|
|
|
|
/// Merge 4 sorted arrays of length 2 into a sorted array of length 8 using swap space.
/// Requires that the refcount of cmp_data be incremented 16 times.
fn quad_swap_merge(
    array: [*]u8,
    swap: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime indirect: bool,
) void {
    parity_merge_two(swap, array, cmp, cmp_data, element_width, copy, indirect);
    parity_merge_two(swap + 4 * element_width, array + 4 * element_width, cmp, cmp_data, element_width, copy, indirect);

    parity_merge_four(array, swap, cmp, cmp_data, element_width, copy, indirect);
}

/// Reverse values from start to end.
fn quad_reversal(
    start: [*]u8,
    end: [*]u8,
    element_width: usize,
    copy: CopyFn,
) void {
    var buffer1: BufferType align(BufferAlign) = undefined;
    var buffer2: BufferType align(BufferAlign) = undefined;

    const tmp1_ptr = @as([*]u8, @ptrCast(&buffer1[0]));
    const tmp2_ptr = @as([*]u8, @ptrCast(&buffer2[0]));

    var loops = (@intFromPtr(end) - @intFromPtr(start)) / (element_width * 2);

    var h1_start = start;
    var h1_end = start + loops * element_width;
    var h2_start = end - loops * element_width;
    var h2_end = end;

    // The main loop below swaps two pairs per iteration.
    // When the count is even, peel off the innermost swap first so the loop lines up.
    // For ranges with an odd element count this may "swap" the middle element with itself.
    if (loops % 2 == 0) {
        copy(tmp2_ptr, h1_end);
        copy(h1_end, h2_start);
        h1_end -= element_width;
        copy(h2_start, tmp2_ptr);
        h2_start += element_width;
        loops -= 1;
    }

    loops /= 2;

    // Swap an outer pair and an inner pair each iteration, walking inward.
    while (true) {
        copy(tmp1_ptr, h1_start);
        copy(h1_start, h2_end);
        h1_start += element_width;
        copy(h2_end, tmp1_ptr);
        h2_end -= element_width;

        copy(tmp2_ptr, h1_end);
        copy(h1_end, h2_start);
        h1_end -= element_width;
        copy(h2_start, tmp2_ptr);
        h2_start += element_width;

        if (loops == 0)
            break;
        loops -= 1;
    }
}

test "quad_swap" {
|
|
var test_count: i64 = 0;
|
|
var arr: [75]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
|
|
arr = [75]i64{
|
|
// multiple ordered chunks
|
|
1, 3, 5, 7, 9, 11, 13, 15,
|
|
33, 34, 35, 36, 37, 38, 39, 40,
|
|
// partially ordered
|
|
41, 42, 45, 46, 43, 44, 47, 48,
|
|
// multiple reverse chunks
|
|
70, 69, 68, 67, 66, 65, 64, 63,
|
|
16, 14, 12, 10, 8, 6, 4, 2,
|
|
// another ordered
|
|
49, 50, 51, 52, 53, 54, 55, 56,
|
|
// unordered
|
|
23, 21, 19, 20, 24, 22, 18, 17,
|
|
// partially reversed
|
|
32, 31, 28, 27, 30, 29, 26, 25,
|
|
// awkward tail
|
|
62, 59, 61, 60, 71, 73, 75, 74,
|
|
72, 58, 57,
|
|
};
|
|
|
|
var result = quad_swap(arr_ptr, 75, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(result, .unfinished);
|
|
try testing.expectEqual(arr, [75]i64{
|
|
// first 32 elements sorted (with 8 reversed that get flipped here)
|
|
1, 2, 3, 4, 5, 6, 7, 8,
|
|
9, 10, 11, 12, 13, 14, 15, 16,
|
|
33, 34, 35, 36, 37, 38, 39, 40,
|
|
41, 42, 43, 44, 45, 46, 47, 48,
|
|
// second 32 elements sorted (with 8 reversed that get flipped here)
|
|
17, 18, 19, 20, 21, 22, 23, 24,
|
|
25, 26, 27, 28, 29, 30, 31, 32,
|
|
49, 50, 51, 52, 53, 54, 55, 56,
|
|
63, 64, 65, 66, 67, 68, 69, 70,
|
|
// awkward tail
|
|
57, 58, 59, 60, 61, 62, 71, 72,
|
|
73, 74, 75,
|
|
});
|
|
|
|
// Just reversed.
|
|
var expected: [75]i64 = undefined;
|
|
for (0..75) |i| {
|
|
expected[i] = @intCast(i + 1);
|
|
arr[i] = @intCast(75 - i);
|
|
}
|
|
result = quad_swap(arr_ptr, 75, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(result, .sorted);
|
|
try testing.expectEqual(arr, expected);
|
|
}
|
|
|
|
test "quad_swap_merge" {
|
|
var arr: [8]i64 = undefined;
|
|
var swap: [8]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
arr = [8]i64{ 5, 6, 7, 8, 1, 2, 3, 4 };
|
|
swap = [8]i64{ 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
quad_swap_merge(arr_ptr, swap_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(arr, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
|
|
|
|
arr = [8]i64{ 5, 7, 1, 3, 6, 8, 2, 4 };
|
|
swap = [8]i64{ 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
quad_swap_merge(arr_ptr, swap_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(arr, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
|
|
|
|
arr = [8]i64{ 1, 8, 3, 4, 5, 6, 2, 7 };
|
|
swap = [8]i64{ 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
quad_swap_merge(arr_ptr, swap_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(arr, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
|
|
}
|
|
|
|
test "quad_reversal" {
|
|
{
|
|
var arr = [8]i64{ 8, 7, 6, 5, 4, 3, 2, 1 };
|
|
var start_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var end_ptr = @as([*]u8, @ptrCast(&arr[7]));
|
|
quad_reversal(start_ptr, end_ptr, @sizeOf(i64), &test_i64_copy);
|
|
try testing.expectEqual(arr, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
|
|
}
|
|
{
|
|
var arr = [9]i64{ 9, 8, 7, 6, 5, 4, 3, 2, 1 };
|
|
var start_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var end_ptr = @as([*]u8, @ptrCast(&arr[8]));
|
|
quad_reversal(start_ptr, end_ptr, @sizeOf(i64), &test_i64_copy);
|
|
try testing.expectEqual(arr, [9]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9 });
|
|
}
|
|
}
|
|
|
|
// ================ Small Arrays ==============================================
// Below are functions for sorting arrays of under 32 elements.

/// Uses swap space to sort the tail of an array.
/// The array should generally be under 32 elements in length.
fn tail_swap(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    if (len < 8) {
        tiny_sort(array, len, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        return;
    }

    const half1 = len / 2;
    const quad1 = half1 / 2;
    const quad2 = half1 - quad1;
    const half2 = len - half1;
    const quad3 = half2 / 2;
    const quad4 = half2 - quad3;

    var arr_ptr = array;
    tail_swap(arr_ptr, quad1, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    arr_ptr += quad1 * element_width;
    tail_swap(arr_ptr, quad2, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    arr_ptr += quad2 * element_width;
    tail_swap(arr_ptr, quad3, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    arr_ptr += quad3 * element_width;
    tail_swap(arr_ptr, quad4, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);

    // Due to short circuit logic, these must use `compare_inc`
    if (compare_inc(cmp, cmp_data, array + (quad1 - 1) * element_width, array + quad1 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, array + (half1 - 1) * element_width, array + half1 * element_width, data_is_owned, inc_n_data, indirect) != GT and compare_inc(cmp, cmp_data, arr_ptr - 1 * element_width, arr_ptr, data_is_owned, inc_n_data, indirect) != GT) {
        return;
    }

    parity_merge(swap, array, quad1, quad2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    parity_merge(swap + half1 * element_width, array + half1 * element_width, quad3, quad4, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
    parity_merge(array, swap, half1, half2, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
}

/// Merges two neighboring sorted arrays into dest.
/// Left and right lengths must be equal or differ by at most 1 element.
fn parity_merge(
    dest: [*]u8,
    src: [*]u8,
    left_len: usize,
    right_len: usize,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    std.debug.assert(left_len == right_len or left_len == right_len - 1 or left_len - 1 == right_len);

    var left_head = src;
    var right_head = src + left_len * element_width;
    var dest_head = dest;

    var left_tail = right_head - element_width;
    var right_tail = left_tail + right_len * element_width;
    var dest_tail = dest + (left_len + right_len - 1) * element_width;

    if (left_len < right_len) {
        // 1 guaranteed compare.
        if (data_is_owned) {
            inc_n_data(cmp_data, 1);
        }
        head_branchless_merge(&dest_head, &left_head, &right_head, cmp, cmp_data, element_width, copy, indirect);
    }

    // 2 + 2 * (left_len - 1) = 2 * left_len guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 2 * left_len);
    }
    head_branchless_merge(&dest_head, &left_head, &right_head, cmp, cmp_data, element_width, copy, indirect);

    for (0..(left_len - 1)) |_| {
        head_branchless_merge(&dest_head, &left_head, &right_head, cmp, cmp_data, element_width, copy, indirect);
        tail_branchless_merge(&dest_tail, &left_tail, &right_tail, cmp, cmp_data, element_width, copy, indirect);
    }
    tail_branchless_merge(&dest_tail, &left_tail, &right_tail, cmp, cmp_data, element_width, copy, indirect);
}

test "tail_swap" {
|
|
var test_count: i64 = 0;
|
|
var swap: [31]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
var arr: [31]i64 = undefined;
|
|
var expected: [31]i64 = undefined;
|
|
for (0..31) |i| {
|
|
arr[i] = @intCast(i + 1);
|
|
expected[i] = @intCast(i + 1);
|
|
}
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
|
|
for (0..10) |seed| {
|
|
var rng = std.rand.DefaultPrng.init(seed);
|
|
rng.random().shuffle(i64, arr[0..]);
|
|
|
|
tail_swap(arr_ptr, 31, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, expected);
|
|
}
|
|
}
|
|
|
|
test "parity_merge" {
|
|
var test_count: i64 = 0;
|
|
{
|
|
var dest: [8]i64 = undefined;
|
|
var dest_ptr = @as([*]u8, @ptrCast(&dest[0]));
|
|
|
|
var arr: [8]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
|
|
arr = [8]i64{ 1, 3, 5, 7, 2, 4, 6, 8 };
|
|
dest = [8]i64{ 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
parity_merge(dest_ptr, arr_ptr, 4, 4, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
|
|
|
|
arr = [8]i64{ 5, 6, 7, 8, 1, 2, 3, 4 };
|
|
dest = [8]i64{ 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
parity_merge(dest_ptr, arr_ptr, 4, 4, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
|
|
}
|
|
{
|
|
var dest: [9]i64 = undefined;
|
|
var dest_ptr = @as([*]u8, @ptrCast(&dest[0]));
|
|
|
|
var arr: [9]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
|
|
arr = [9]i64{ 1, 3, 5, 8, 2, 4, 6, 7, 9 };
|
|
dest = [9]i64{ 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
parity_merge(dest_ptr, arr_ptr, 4, 5, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, [9]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9 });
|
|
|
|
arr = [9]i64{ 6, 7, 8, 9, 1, 2, 3, 4, 5 };
|
|
dest = [9]i64{ 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
parity_merge(dest_ptr, arr_ptr, 4, 5, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, [9]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9 });
|
|
|
|
arr = [9]i64{ 1, 3, 5, 7, 8, 2, 4, 6, 9 };
|
|
dest = [9]i64{ 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
parity_merge(dest_ptr, arr_ptr, 5, 4, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, [9]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9 });
|
|
|
|
arr = [9]i64{ 5, 6, 7, 8, 9, 1, 2, 3, 4 };
|
|
dest = [9]i64{ 0, 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
parity_merge(dest_ptr, arr_ptr, 5, 4, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(dest, [9]i64{ 1, 2, 3, 4, 5, 6, 7, 8, 9 });
|
|
}
|
|
}
|
|
|
|
// ================ Tiny Arrays ===============================================
// Below are functions for sorting 0 to 7 element arrays.

/// Sort arrays of 0 to 7 elements.
fn tiny_sort(
    array: [*]u8,
    len: usize,
    swap: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    std.debug.assert(len < 8);

    var buffer: BufferType align(BufferAlign) = undefined;
    const tmp_ptr = @as([*]u8, @ptrCast(&buffer[0]));

    switch (len) {
        1, 0 => {
            return;
        },
        2 => {
            // 1 guaranteed compare.
            if (data_is_owned) {
                inc_n_data(cmp_data, 1);
            }
            swap_branchless(array, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
        },
        3 => {
            // 3 guaranteed compares.
            if (data_is_owned) {
                inc_n_data(cmp_data, 3);
            }
            var arr_ptr = array;
            swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
            arr_ptr += element_width;
            swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
            arr_ptr -= element_width;
            swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
        },
        4 => {
            parity_swap_four(array, tmp_ptr, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        5 => {
            parity_swap_five(array, tmp_ptr, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        6 => {
            parity_swap_six(array, tmp_ptr, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        7 => {
            parity_swap_seven(array, tmp_ptr, swap, cmp, cmp_data, element_width, copy, data_is_owned, inc_n_data, indirect);
        },
        else => {
            unreachable;
        },
    }
}

fn parity_swap_four(
    array: [*]u8,
    tmp_ptr: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    // 3 guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 3);
    }
    var arr_ptr = array;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr += 2 * element_width;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr -= element_width;

    const gt = compare(cmp, cmp_data, arr_ptr, arr_ptr + element_width, indirect) == GT;
    if (gt) {
        // 3 guaranteed compares.
        if (data_is_owned) {
            inc_n_data(cmp_data, 3);
        }
        copy(tmp_ptr, arr_ptr);
        copy(arr_ptr, arr_ptr + element_width);
        copy(arr_ptr + element_width, tmp_ptr);
        arr_ptr -= element_width;
        swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
        arr_ptr += 2 * element_width;
        swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
        arr_ptr -= element_width;
        swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    }
}

fn parity_swap_five(
    array: [*]u8,
    tmp_ptr: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    // 4 guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 4);
    }
    var arr_ptr = array;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr += 2 * element_width;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr -= element_width;
    var more_work = swap_branchless_return_gt(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr += 2 * element_width;
    more_work += swap_branchless_return_gt(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr = array;

    if (more_work != 0) {
        // 6 guaranteed compares.
        if (data_is_owned) {
            inc_n_data(cmp_data, 6);
        }
        swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
        arr_ptr += 2 * element_width;
        swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
        arr_ptr -= element_width;
        swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
        arr_ptr += 2 * element_width;
        swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
        arr_ptr = array;
        swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
        arr_ptr += 2 * element_width;
        swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    }
}

fn parity_swap_six(
    array: [*]u8,
    tmp_ptr: [*]u8,
    swap: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    // 5 guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 5);
    }
    var arr_ptr = array;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr += element_width;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr += 3 * element_width;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr -= element_width;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr = array;

    {
        const lte = compare(cmp, cmp_data, arr_ptr + 2 * element_width, arr_ptr + 3 * element_width, indirect) != GT;
        if (lte) {
            // 2 guaranteed compares.
            if (data_is_owned) {
                inc_n_data(cmp_data, 2);
            }
            swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
            arr_ptr += 4 * element_width;
            swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
            return;
        }
    }

    // 8 guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 8);
    }
    {
        const gt = compare(cmp, cmp_data, arr_ptr, arr_ptr + element_width, indirect) == GT;
        var x = if (gt) element_width else 0;
        var not_x = if (!gt) element_width else 0;
        copy(swap, arr_ptr + x);
        copy(swap + element_width, arr_ptr + not_x);
        copy(swap + 2 * element_width, arr_ptr + 2 * element_width);
        arr_ptr += 4 * element_width;
    }
    {
        const gt = compare(cmp, cmp_data, arr_ptr, arr_ptr + element_width, indirect) == GT;
        var x = if (gt) element_width else 0;
        var not_x = if (!gt) element_width else 0;
        copy(swap + 4 * element_width, arr_ptr + x);
        copy(swap + 5 * element_width, arr_ptr + not_x);
        copy(swap + 3 * element_width, arr_ptr - element_width);
    }

    arr_ptr = array;
    var left = swap;
    var right = swap + 3 * element_width;

    head_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    head_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    head_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);

    arr_ptr = array + 5 * element_width;
    left = swap + 2 * element_width;
    right = swap + 5 * element_width;

    tail_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    tail_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    const gt = compare(cmp, cmp_data, left, right, indirect) == GT;
    const from = if (gt) left else right;
    copy(arr_ptr, from);
}

fn parity_swap_seven(
    array: [*]u8,
    tmp_ptr: [*]u8,
    swap: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) void {
    // 6 guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 6);
    }
    var arr_ptr = array;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr += 2 * element_width;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr += 2 * element_width;
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr -= 3 * element_width;
    var more_work = swap_branchless_return_gt(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr += 2 * element_width;
    more_work += swap_branchless_return_gt(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr += 2 * element_width;
    more_work += swap_branchless_return_gt(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr -= element_width;

    if (more_work == 0)
        return;

    // 11 guaranteed compares.
    if (data_is_owned) {
        inc_n_data(cmp_data, 11);
    }
    swap_branchless(arr_ptr, tmp_ptr, cmp, cmp_data, element_width, copy, indirect);
    arr_ptr = array;

    {
        const gt = compare(cmp, cmp_data, arr_ptr, arr_ptr + element_width, indirect) == GT;
        var x = if (gt) element_width else 0;
        var not_x = if (!gt) element_width else 0;
        copy(swap, arr_ptr + x);
        copy(swap + element_width, arr_ptr + not_x);
        copy(swap + 2 * element_width, arr_ptr + 2 * element_width);
        arr_ptr += 3 * element_width;
    }
    {
        const gt = compare(cmp, cmp_data, arr_ptr, arr_ptr + element_width, indirect) == GT;
        var x = if (gt) element_width else 0;
        var not_x = if (!gt) element_width else 0;
        copy(swap + 3 * element_width, arr_ptr + x);
        copy(swap + 4 * element_width, arr_ptr + not_x);
        arr_ptr += 2 * element_width;
    }
    {
        const gt = compare(cmp, cmp_data, arr_ptr, arr_ptr + element_width, indirect) == GT;
        var x = if (gt) element_width else 0;
        var not_x = if (!gt) element_width else 0;
        copy(swap + 5 * element_width, arr_ptr + x);
        copy(swap + 6 * element_width, arr_ptr + not_x);
    }

    arr_ptr = array;
    var left = swap;
    var right = swap + 3 * element_width;

    head_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    head_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    head_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);

    arr_ptr = array + 6 * element_width;
    left = swap + 2 * element_width;
    right = swap + 6 * element_width;

    tail_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    tail_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    tail_branchless_merge(&arr_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    const gt = compare(cmp, cmp_data, left, right, indirect) == GT;
    const from = if (gt) left else right;
    copy(arr_ptr, from);
}

test "tiny_sort" {
|
|
var test_count: i64 = 0;
|
|
var swap: [7]i64 = undefined;
|
|
var swap_ptr = @as([*]u8, @ptrCast(&swap[0]));
|
|
|
|
{
|
|
var arr: [7]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
|
|
arr = [7]i64{ 3, 1, 2, 5, 4, 7, 6 };
|
|
tiny_sort(arr_ptr, 7, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [7]i64{ 1, 2, 3, 4, 5, 6, 7 });
|
|
|
|
arr = [7]i64{ 7, 6, 5, 4, 3, 2, 1 };
|
|
tiny_sort(arr_ptr, 7, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [7]i64{ 1, 2, 3, 4, 5, 6, 7 });
|
|
}
|
|
{
|
|
var arr: [6]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
|
|
arr = [6]i64{ 3, 1, 2, 6, 4, 5 };
|
|
tiny_sort(arr_ptr, 6, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [6]i64{ 1, 2, 3, 4, 5, 6 });
|
|
|
|
arr = [6]i64{ 6, 5, 4, 3, 2, 1 };
|
|
tiny_sort(arr_ptr, 6, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [6]i64{ 1, 2, 3, 4, 5, 6 });
|
|
}
|
|
{
|
|
var arr: [5]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
|
|
arr = [5]i64{ 2, 1, 4, 3, 5 };
|
|
tiny_sort(arr_ptr, 5, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [5]i64{ 1, 2, 3, 4, 5 });
|
|
|
|
arr = [5]i64{ 5, 4, 3, 2, 1 };
|
|
tiny_sort(arr_ptr, 5, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [5]i64{ 1, 2, 3, 4, 5 });
|
|
}
|
|
{
|
|
var arr: [4]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
|
|
arr = [4]i64{ 4, 2, 1, 3 };
|
|
tiny_sort(arr_ptr, 4, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [4]i64{ 1, 2, 3, 4 });
|
|
|
|
arr = [4]i64{ 2, 1, 4, 3 };
|
|
tiny_sort(arr_ptr, 4, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [4]i64{ 1, 2, 3, 4 });
|
|
}
|
|
{
|
|
var arr = [3]i64{ 2, 3, 1 };
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
tiny_sort(arr_ptr, 3, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [3]i64{ 1, 2, 3 });
|
|
}
|
|
{
|
|
var arr = [2]i64{ 2, 1 };
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
tiny_sort(arr_ptr, 2, swap_ptr, &test_i64_compare_refcounted, @ptrCast(&test_count), @sizeOf(i64), &test_i64_copy, true, &test_inc_n_data, false);
|
|
try testing.expectEqual(test_count, 0);
|
|
try testing.expectEqual(arr, [2]i64{ 1, 2 });
|
|
}
|
|
}
|
|
|
|
// ================ Primitives ================================================
// Below are sorting primitives that attempt to be branchless.
// They are also all marked inline for performance.
// They are the smallest fundamental units.

/// Merge two neighboring sorted 4 element arrays into dest.
/// Requires that the refcount of cmp_data be incremented 8 times.
inline fn parity_merge_four(
    dest: [*]u8,
    array: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime indirect: bool,
) void {
    var left = array;
    var right = array + (4 * element_width);
    var dest_ptr = dest;
    head_branchless_merge(&dest_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    head_branchless_merge(&dest_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    head_branchless_merge(&dest_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    const lte = compare(cmp, cmp_data, left, right, indirect) != GT;
    var to_copy = if (lte) left else right;
    copy(dest_ptr, to_copy);

    left = array + (3 * element_width);
    right = array + (7 * element_width);
    dest_ptr = dest + (7 * element_width);
    tail_branchless_merge(&dest_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    tail_branchless_merge(&dest_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    tail_branchless_merge(&dest_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    const gt = compare(cmp, cmp_data, left, right, indirect) == GT;
    to_copy = if (gt) left else right;
    copy(dest_ptr, to_copy);
}

/// Merge two neighboring sorted 2 element arrays into dest.
/// Requires that the refcount of cmp_data be incremented 4 times.
inline fn parity_merge_two(
    dest: [*]u8,
    array: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime indirect: bool,
) void {
    var left = array;
    var right = array + (2 * element_width);
    var dest_ptr = dest;
    head_branchless_merge(&dest_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    const lte = compare(cmp, cmp_data, left, right, indirect) != GT;
    var to_copy = if (lte) left else right;
    copy(dest_ptr, to_copy);

    left = array + element_width;
    right = array + (3 * element_width);
    dest_ptr = dest + (3 * element_width);
    tail_branchless_merge(&dest_ptr, &left, &right, cmp, cmp_data, element_width, copy, indirect);
    const gt = compare(cmp, cmp_data, left, right, indirect) == GT;
    to_copy = if (gt) left else right;
    copy(dest_ptr, to_copy);
}

/// Moves the smaller element from left and right to dest.
/// Will increment both dest and the smaller element ptr to their next index.
/// Inlining will remove the extra level of pointer indirection here.
/// It is just used to allow mutating the input pointers.
/// Requires that the refcount of cmp_data be incremented 1 time.
inline fn head_branchless_merge(
    dest: *[*]u8,
    left: *[*]u8,
    right: *[*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime indirect: bool,
) void {
    // Note equivalent c code:
    //    *ptd++ = cmp(ptl, ptr) <= 0 ? *ptl++ : *ptr++;
    // While not guaranteed branchless, tested in godbolt for x86_64, aarch32, aarch64, riscv64, and wasm32.
    const lte = compare(cmp, cmp_data, left.*, right.*, indirect) != GT;
    const from = if (lte) left else right;
    copy(dest.*, from.*);
    from.* += element_width;
    dest.* += element_width;
}

/// Moves the larger element from left and right to dest.
/// Will decrement both dest and the larger element ptr to their previous index.
/// Inlining will remove the extra level of pointer indirection here.
/// It is just used to allow mutating the input pointers.
/// Requires that the refcount of cmp_data be incremented 1 time.
inline fn tail_branchless_merge(
    dest: *[*]u8,
    left: *[*]u8,
    right: *[*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime indirect: bool,
) void {
    // Note equivalent c code:
    //    *tpd-- = cmp(tpl, tpr) > 0 ? *tpl-- : *tpr--;
    // While not guaranteed branchless, tested in godbolt for x86_64, aarch32, aarch64, riscv64, and wasm32.
    const gt = compare(cmp, cmp_data, left.*, right.*, indirect) == GT;
    const from = if (gt) left else right;
    copy(dest.*, from.*);
    from.* -= element_width;
    dest.* -= element_width;
}

/// Swaps the element at ptr with the element after it if the element is greater than the next.
/// Requires that the refcount of cmp_data be incremented 1 time.
inline fn swap_branchless(
    ptr: [*]u8,
    tmp: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime indirect: bool,
) void {
    // While not guaranteed branchless, tested in godbolt for x86_64, aarch32, aarch64, riscv64, and wasm32.
    _ = swap_branchless_return_gt(ptr, tmp, cmp, cmp_data, element_width, copy, indirect);
}

/// Requires that the refcount of cmp_data be incremented 1 time.
inline fn swap_branchless_return_gt(
    ptr: [*]u8,
    tmp: [*]u8,
    cmp: CompareFn,
    cmp_data: Opaque,
    element_width: usize,
    copy: CopyFn,
    comptime indirect: bool,
) u8 {
    // While not guaranteed branchless, tested in godbolt for x86_64, aarch32, aarch64, riscv64, and wasm32.
    const gt = compare(cmp, cmp_data, ptr, ptr + element_width, indirect) == GT;
    var x = if (gt) element_width else 0;
    const from = if (gt) ptr else ptr + element_width;
    copy(tmp, from);
    copy(ptr, ptr + x);
    copy(ptr + element_width, tmp);
    return @intFromBool(gt);
}

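// A small added test sketch (not from the original suite): exercises
// swap_branchless_return_gt directly, using the i64 test helpers defined at
// the bottom of this file.
test "swap_branchless_return_gt" {
    var arr: [2]i64 = undefined;
    var tmp: i64 = undefined;
    var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
    var tmp_ptr = @as([*]u8, @ptrCast(&tmp));

    // Out of order: the pair is swapped and 1 is returned.
    arr = [2]i64{ 20, 10 };
    const swapped = swap_branchless_return_gt(arr_ptr, tmp_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
    try testing.expectEqual(swapped, 1);
    try testing.expectEqual(arr, [2]i64{ 10, 20 });

    // Already in order: the pair is left alone and 0 is returned.
    const kept = swap_branchless_return_gt(arr_ptr, tmp_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
    try testing.expectEqual(kept, 0);
    try testing.expectEqual(arr, [2]i64{ 10, 20 });
}
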
/// Requires that the refcount of cmp_data be incremented 1 time.
inline fn compare(
    cmp: CompareFn,
    cmp_data: Opaque,
    lhs_opaque: *anyopaque,
    rhs_opaque: *anyopaque,
    comptime indirect: bool,
) Ordering {
    if (indirect) {
        const lhs = @as(*[*]u8, @ptrCast(@alignCast(lhs_opaque))).*;
        const rhs = @as(*[*]u8, @ptrCast(@alignCast(rhs_opaque))).*;
        return @as(Ordering, @enumFromInt(cmp(cmp_data, lhs, rhs)));
    } else {
        const lhs = @as([*]u8, @ptrCast(@alignCast(lhs_opaque)));
        const rhs = @as([*]u8, @ptrCast(@alignCast(rhs_opaque)));
        return @as(Ordering, @enumFromInt(cmp(cmp_data, lhs, rhs)));
    }
}

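// A small added test sketch for `compare` (not from the original suite): with
// `indirect` set, each argument is a pointer to the element pointer (the mode
// used when sorting wide elements by pointer), so both modes should agree.
test "compare" {
    var a: i64 = 1;
    var b: i64 = 2;
    var a_ptr = @as([*]u8, @ptrCast(&a));
    var b_ptr = @as([*]u8, @ptrCast(&b));

    try testing.expectEqual(compare(&test_i64_compare, null, a_ptr, b_ptr, false), LT);
    try testing.expectEqual(compare(&test_i64_compare, null, &a_ptr, &b_ptr, true), LT);
}
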
/// Only use this as a last resort.
/// It will increment the refcount before comparing.
/// Incrementing for each individual compare is slow.
/// Prefer to increment in batches where possible.
inline fn compare_inc(
    cmp: CompareFn,
    cmp_data: Opaque,
    lhs: [*]u8,
    rhs: [*]u8,
    comptime data_is_owned: bool,
    inc_n_data: IncN,
    comptime indirect: bool,
) Ordering {
    if (data_is_owned) {
        inc_n_data(cmp_data, 1);
    }
    return compare(cmp, cmp_data, lhs, rhs, indirect);
}

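// A small added test sketch for `compare_inc` (not from the original suite):
// each call pairs exactly one refcount increment with one compare, so with the
// counting helpers below the running count nets back to zero.
test "compare_inc" {
    var test_count: i64 = 0;
    var a: i64 = 7;
    var b: i64 = 7;
    var a_ptr = @as([*]u8, @ptrCast(&a));
    var b_ptr = @as([*]u8, @ptrCast(&b));

    const ord = compare_inc(&test_i64_compare_refcounted, @ptrCast(&test_count), a_ptr, b_ptr, true, &test_inc_n_data, false);
    try testing.expectEqual(ord, EQ);
    try testing.expectEqual(test_count, 0);
}
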
test "parity_merge_four" {
|
|
var arr: [8]i64 = undefined;
|
|
var dest: [8]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var dest_ptr = @as([*]u8, @ptrCast(&dest[0]));
|
|
|
|
arr = [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 };
|
|
dest = [8]i64{ 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
parity_merge_four(dest_ptr, arr_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(dest, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
|
|
|
|
arr = [8]i64{ 5, 6, 7, 8, 1, 2, 3, 4 };
|
|
dest = [8]i64{ 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
parity_merge_four(dest_ptr, arr_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(dest, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
|
|
|
|
arr = [8]i64{ 1, 3, 5, 7, 2, 4, 6, 8 };
|
|
dest = [8]i64{ 0, 0, 0, 0, 0, 0, 0, 0 };
|
|
parity_merge_four(dest_ptr, arr_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(dest, [8]i64{ 1, 2, 3, 4, 5, 6, 7, 8 });
|
|
}
|
|
|
|
test "parity_merge_two" {
|
|
var arr: [4]i64 = undefined;
|
|
var dest: [4]i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var dest_ptr = @as([*]u8, @ptrCast(&dest[0]));
|
|
|
|
arr = [4]i64{ 1, 2, 3, 4 };
|
|
dest = [4]i64{ 0, 0, 0, 0 };
|
|
parity_merge_two(dest_ptr, arr_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(dest, [4]i64{ 1, 2, 3, 4 });
|
|
|
|
arr = [4]i64{ 1, 3, 2, 4 };
|
|
dest = [4]i64{ 0, 0, 0, 0 };
|
|
parity_merge_two(dest_ptr, arr_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(dest, [4]i64{ 1, 2, 3, 4 });
|
|
|
|
arr = [4]i64{ 3, 4, 1, 2 };
|
|
dest = [4]i64{ 0, 0, 0, 0 };
|
|
parity_merge_two(dest_ptr, arr_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(dest, [4]i64{ 1, 2, 3, 4 });
|
|
|
|
arr = [4]i64{ 2, 4, 1, 3 };
|
|
dest = [4]i64{ 0, 0, 0, 0 };
|
|
parity_merge_two(dest_ptr, arr_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(dest, [4]i64{ 1, 2, 3, 4 });
|
|
|
|
arr = [4]i64{ 1, 4, 2, 3 };
|
|
dest = [4]i64{ 0, 0, 0, 0 };
|
|
parity_merge_two(dest_ptr, arr_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(dest, [4]i64{ 1, 2, 3, 4 });
|
|
}
|
|
|
|
test "head_branchless_merge" {
|
|
var dest = [6]i64{ 0, 0, 0, 0, 0, 0 };
|
|
var left = [4]i64{ 1, 7, 10, 22 };
|
|
var right = [4]i64{ 2, 2, 8, 22 };
|
|
var dest_ptr = @as([*]u8, @ptrCast(&dest[0]));
|
|
var left_ptr = @as([*]u8, @ptrCast(&left[0]));
|
|
var right_ptr = @as([*]u8, @ptrCast(&right[0]));
|
|
|
|
head_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
head_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
head_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
head_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
head_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
head_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
|
|
try testing.expectEqual(dest, [6]i64{ 1, 2, 2, 7, 8, 10 });
|
|
}
|
|
|
|
test "tail_branchless_merge" {
|
|
var dest = [6]i64{ 0, 0, 0, 0, 0, 0 };
|
|
var left = [4]i64{ -22, 1, 7, 10 };
|
|
var right = [4]i64{ -22, 2, 2, 8 };
|
|
var dest_ptr = @as([*]u8, @ptrCast(&dest[dest.len - 1]));
|
|
var left_ptr = @as([*]u8, @ptrCast(&left[left.len - 1]));
|
|
var right_ptr = @as([*]u8, @ptrCast(&right[right.len - 1]));
|
|
|
|
tail_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
tail_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
tail_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
tail_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
tail_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
tail_branchless_merge(&dest_ptr, &left_ptr, &right_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
|
|
try testing.expectEqual(dest, [6]i64{ 1, 2, 2, 7, 8, 10 });
|
|
}
|
|
|
|
test "swap" {
|
|
var arr: [2]i64 = undefined;
|
|
var tmp: i64 = undefined;
|
|
var arr_ptr = @as([*]u8, @ptrCast(&arr[0]));
|
|
var tmp_ptr = @as([*]u8, @ptrCast(&tmp));
|
|
|
|
arr = [2]i64{ 10, 20 };
|
|
swap_branchless(arr_ptr, tmp_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(arr, [2]i64{ 10, 20 });
|
|
|
|
arr = [2]i64{ 77, -12 };
|
|
swap_branchless(arr_ptr, tmp_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(arr, [2]i64{ -12, 77 });
|
|
|
|
arr = [2]i64{ -22, -22 };
|
|
swap_branchless(arr_ptr, tmp_ptr, &test_i64_compare, null, @sizeOf(i64), &test_i64_copy, false);
|
|
try testing.expectEqual(arr, [2]i64{ -22, -22 });
|
|
}
|
|
|
|
/// Copies a single pointer-sized value from src to dst.
/// This is the copy function used when sorting large elements indirectly via pointers.
pub fn pointer_copy(dst_ptr: Opaque, src_ptr: Opaque) callconv(.C) void {
    @as(*usize, @alignCast(@ptrCast(dst_ptr))).* = @as(*usize, @alignCast(@ptrCast(src_ptr))).*;
}

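// A small added test sketch (not from the original suite): pointer_copy moves
// exactly one pointer-sized value, which is all the indirect sort needs to
// shuffle element pointers around.
test "pointer_copy" {
    var src: usize = 0x12345678;
    var dst: usize = 0;
    pointer_copy(@as([*]u8, @ptrCast(&dst)), @as([*]u8, @ptrCast(&src)));
    try testing.expectEqual(dst, src);
}
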
fn test_i64_compare(_: Opaque, a_ptr: Opaque, b_ptr: Opaque) callconv(.C) u8 {
    const a = @as(*i64, @alignCast(@ptrCast(a_ptr))).*;
    const b = @as(*i64, @alignCast(@ptrCast(b_ptr))).*;

    const gt = @as(u8, @intFromBool(a > b));
    const lt = @as(u8, @intFromBool(a < b));

    // Eq = 0
    // GT = 1
    // LT = 2
    return lt + lt + gt;
}

fn test_i64_compare_refcounted(count_ptr: Opaque, a_ptr: Opaque, b_ptr: Opaque) callconv(.C) u8 {
    const a = @as(*i64, @alignCast(@ptrCast(a_ptr))).*;
    const b = @as(*i64, @alignCast(@ptrCast(b_ptr))).*;

    const gt = @as(u8, @intFromBool(a > b));
    const lt = @as(u8, @intFromBool(a < b));

    @as(*isize, @ptrCast(@alignCast(count_ptr))).* -= 1;
    // Eq = 0
    // GT = 1
    // LT = 2
    return lt + lt + gt;
}

fn test_i64_copy(dst_ptr: Opaque, src_ptr: Opaque) callconv(.C) void {
    @as(*i64, @alignCast(@ptrCast(dst_ptr))).* = @as(*i64, @alignCast(@ptrCast(src_ptr))).*;
}

fn test_inc_n_data(count_ptr: Opaque, n: usize) callconv(.C) void {
    @as(*isize, @ptrCast(@alignCast(count_ptr))).* += @intCast(n);
}