Merge pull request #8735 from sylvestre/sort-perf-2
Some checks are pending
CICD / Style/cargo-deny (push) Waiting to run
CICD / Build (push) Blocked by required conditions
CICD / Style/deps (push) Waiting to run
CICD / Documentation/warnings (push) Waiting to run
CICD / MinRustV (push) Waiting to run
CICD / Test all features separately (push) Blocked by required conditions
CICD / Dependencies (push) Waiting to run
CICD / Build/Makefile (push) Blocked by required conditions
CICD / Build/stable (push) Blocked by required conditions
CICD / Build/nightly (push) Blocked by required conditions
CICD / Binary sizes (push) Blocked by required conditions
CICD / Tests/BusyBox test suite (push) Blocked by required conditions
CICD / Tests/Toybox test suite (push) Blocked by required conditions
CICD / Code Coverage (push) Waiting to run
CICD / Separate Builds (push) Waiting to run
CICD / Build/SELinux (push) Blocked by required conditions
CICD / Run benchmarks (CodSpeed) (push) Blocked by required conditions
GnuTests / Aggregate GNU test results (push) Blocked by required conditions
GnuTests / Run GNU tests (native) (push) Waiting to run
GnuTests / Run GNU tests (SELinux) (push) Waiting to run
Android / Test builds (push) Waiting to run
Code Quality / Style/format (push) Waiting to run
Code Quality / Style/lint (push) Waiting to run
Code Quality / Style/spelling (push) Waiting to run
Code Quality / Style/toml (push) Waiting to run
Code Quality / Style/Python (push) Waiting to run
Code Quality / Pre-commit hooks (push) Waiting to run
Devcontainer / Verify devcontainer (push) Waiting to run
FreeBSD / Style and Lint (push) Waiting to run
FreeBSD / Tests (push) Waiting to run
WSL2 / Test (push) Waiting to run

sort: add benchmark
This commit is contained in:
Sylvestre Ledru 2025-09-25 21:45:25 +02:00 committed by GitHub
commit ea5c8158d4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 650 additions and 95 deletions

2
.vscode/cSpell.json vendored
View file

@ -32,6 +32,8 @@
".devcontainer/**",
"util/gnu-patches/**",
"docs/src/release-notes/**",
"src/uu/*/benches/*.rs",
"src/uucore/src/lib/features/benchmark.rs",
],
"enableGlobDot": true,

1
Cargo.lock generated
View file

@ -3930,6 +3930,7 @@ dependencies = [
"bigdecimal",
"binary-heap-plus",
"clap",
"codspeed-divan-compat",
"compare",
"ctrlc",
"fluent",

View file

@ -4,45 +4,32 @@
// file that was distributed with this source code.
use divan::{Bencher, black_box};
use tempfile::TempDir;
use uu_numfmt::uumain;
use uucore::benchmark::{create_test_file, run_util_function};
/// Generate numeric data for benchmarking
fn generate_numbers(count: usize) -> String {
(1..=count)
.map(|n| n.to_string())
.collect::<Vec<_>>()
.join("\n")
}
/// Setup benchmark environment with test data
fn setup_benchmark(data: String) -> (TempDir, String) {
let temp_dir = tempfile::tempdir().unwrap();
let file_path = create_test_file(data.as_bytes(), temp_dir.path());
let file_path_str = file_path.to_str().unwrap().to_string();
(temp_dir, file_path_str)
}
use uucore::benchmark::{run_util_function, setup_test_file, text_data};
/// Benchmark SI formatting with different number counts
#[divan::bench(args = [1_000_000])]
fn numfmt_to_si(bencher: Bencher, count: usize) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &["--to=si", &file_path_str]));
black_box(run_util_function(uumain, &["--to=si", file_path_str]));
});
}
/// Benchmark SI formatting with precision format
#[divan::bench(args = [1_000_000])]
fn numfmt_to_si_precision(bencher: Bencher, count: usize) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(
uumain,
&["--to=si", "--format=%.6f", &file_path_str],
&["--to=si", "--format=%.6f", file_path_str],
));
});
}
@ -50,10 +37,12 @@ fn numfmt_to_si_precision(bencher: Bencher, count: usize) {
/// Benchmark IEC (binary) formatting
#[divan::bench(args = [1_000_000])]
fn numfmt_to_iec(bencher: Bencher, count: usize) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &["--to=iec", &file_path_str]));
black_box(run_util_function(uumain, &["--to=iec", file_path_str]));
});
}
@ -65,10 +54,11 @@ fn numfmt_from_si(bencher: Bencher, count: usize) {
.map(|n| format!("{:.1}K", n as f64 / 1000.0))
.collect::<Vec<_>>()
.join("\n");
let (_temp_dir, file_path_str) = setup_benchmark(data);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &["--from=si", &file_path_str]));
black_box(run_util_function(uumain, &["--from=si", file_path_str]));
});
}
@ -80,23 +70,26 @@ fn numfmt_large_numbers_si(bencher: Bencher, count: usize) {
.map(|n| (n * 1_000_000).to_string())
.collect::<Vec<_>>()
.join("\n");
let (_temp_dir, file_path_str) = setup_benchmark(data);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &["--to=si", &file_path_str]));
black_box(run_util_function(uumain, &["--to=si", file_path_str]));
});
}
/// Benchmark different padding widths
#[divan::bench(args = [(1_000_000, 5), (1_000_000, 50)])]
fn numfmt_padding(bencher: Bencher, (count, padding): (usize, usize)) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
let padding_arg = format!("--padding={padding}");
bencher.bench(|| {
black_box(run_util_function(
uumain,
&["--to=si", &padding_arg, &file_path_str],
&["--to=si", &padding_arg, file_path_str],
));
});
}
@ -104,13 +97,15 @@ fn numfmt_padding(bencher: Bencher, (count, padding): (usize, usize)) {
/// Benchmark round modes with SI formatting
#[divan::bench(args = [("up", 100_000), ("down", 1_000_000), ("towards-zero", 1_000_000)])]
fn numfmt_round_modes(bencher: Bencher, (round_mode, count): (&str, usize)) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
let round_arg = format!("--round={round_mode}");
bencher.bench(|| {
black_box(run_util_function(
uumain,
&["--to=si", &round_arg, &file_path_str],
&["--to=si", &round_arg, file_path_str],
));
});
}

View file

@ -40,6 +40,21 @@ fluent = { workspace = true }
[target.'cfg(target_os = "linux")'.dependencies]
nix = { workspace = true }
[dev-dependencies]
divan = { workspace = true }
tempfile = { workspace = true }
uucore = { workspace = true, features = [
"benchmark",
"fs",
"parser",
"version-cmp",
"i18n-collator",
] }
[[bin]]
name = "sort"
path = "src/main.rs"
[[bench]]
name = "sort_bench"
harness = false

View file

@ -0,0 +1,158 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use divan::{Bencher, black_box};
use uu_sort::uumain;
use uucore::benchmark::{run_util_function, setup_test_file, text_data};
/// Benchmark sorting ASCII-only data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_ascii_only(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_ascii_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| black_box(run_util_function(uumain, &[path.to_str().unwrap()])));
}
/// Benchmark sorting accented/non-ASCII data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_accented_data(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_accented_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| black_box(run_util_function(uumain, &[path.to_str().unwrap()])));
}
/// Benchmark sorting mixed ASCII/non-ASCII data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_mixed_data(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_mixed_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| black_box(run_util_function(uumain, &[path.to_str().unwrap()])));
}
/// Benchmark case-sensitive sorting with mixed-case data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_case_sensitive(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_case_sensitive_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| black_box(run_util_function(uumain, &[path.to_str().unwrap()])));
}
/// Benchmark case-insensitive sorting (`-f`, fold case) over mixed-case data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_case_insensitive(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_case_sensitive_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-f", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark dictionary-order sorting (`-d`: only blanks and alphanumerics compared).
#[divan::bench(args = [100_000, 500_000])]
fn sort_dictionary_order(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_mixed_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-d", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark numeric sorting (`-n`) with mixed data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_numeric(bencher: Bencher, num_lines: usize) {
    // Pseudo-random numeric values embedded in text-prefixed lines.
    let mut input = Vec::new();
    for i in 0..num_lines {
        let line = format!("value_{}\n", (i * 13) % 10000);
        input.extend_from_slice(line.as_bytes());
    }
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-n", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark reverse sorting (`-r`) with locale-sensitive (accented) data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_reverse_locale(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_accented_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-r", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark sorting by a specific key field (`-k 2`).
#[divan::bench(args = [100_000, 500_000])]
fn sort_key_field(bencher: Bencher, num_lines: usize) {
    const WORDS: [&str; 5] = ["café", "naïve", "apple", "über", "banana"];
    // Three tab-separated fields per line: number, word, number.
    let mut input = Vec::new();
    for i in 0..num_lines {
        let line = format!("{}\t{}\t{}\n", i % 100, WORDS[i % WORDS.len()], (i * 7) % 100);
        input.extend_from_slice(line.as_bytes());
    }
    let path = setup_test_file(&input);
    bencher.bench(|| {
        // Sort by the second field.
        let args = ["-k", "2", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark unique sorting (`-u`) with locale-sensitive (accented) data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_unique_locale(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_accented_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-u", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
// Divan entry point: discovers and runs every #[divan::bench] function above.
fn main() {
divan::main();
}

View file

@ -0,0 +1,166 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use divan::{Bencher, black_box};
use std::env;
use uu_sort::uumain;
use uucore::benchmark::{run_util_function, setup_test_file, text_data};
/// Benchmark ASCII-only data sorting with C locale (byte comparison).
#[divan::bench]
fn sort_ascii_c_locale(bencher: Bencher) {
    let data = text_data::generate_ascii_data_simple(100_000);
    let file_path = setup_test_file(&data);
    // Set the locale once during setup rather than inside the timed closure:
    // per-iteration env mutation pollutes the measurement, and `set_var`
    // must not race with concurrent environment reads.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "C");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark ASCII-only data sorting with UTF-8 locale.
#[divan::bench]
fn sort_ascii_utf8_locale(bencher: Bencher) {
    let data = text_data::generate_ascii_data_simple(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark mixed ASCII/Unicode data with C locale.
#[divan::bench]
fn sort_mixed_c_locale(bencher: Bencher) {
    let data = text_data::generate_mixed_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "C");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark mixed ASCII/Unicode data with UTF-8 locale.
#[divan::bench]
fn sort_mixed_utf8_locale(bencher: Bencher) {
    let data = text_data::generate_mixed_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark German locale-specific data with C locale.
#[divan::bench]
fn sort_german_c_locale(bencher: Bencher) {
    let data = text_data::generate_german_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "C");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark German locale-specific data with a German locale.
#[divan::bench]
fn sort_german_locale(bencher: Bencher) {
    let data = text_data::generate_german_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "de_DE.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark random strings of a fixed length under a UTF-8 locale.
#[divan::bench]
fn sort_random_strings(bencher: Bencher) {
    let data = text_data::generate_random_strings(10_000, 50);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark numeric sorting performance (`-n`) on descending integers.
#[divan::bench]
fn sort_numeric(bencher: Bencher) {
    // 10_000 integers in strictly descending order.
    let mut data = Vec::new();
    for i in 0..10_000 {
        let line = format!("{}\n", 10_000 - i);
        data.extend_from_slice(line.as_bytes());
    }
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(
            uumain,
            &["-n", file_path.to_str().unwrap()],
        ));
    });
}
/// Benchmark reverse sorting (`-r`) on mixed-locale data.
#[divan::bench]
fn sort_reverse_mixed(bencher: Bencher) {
    let data = text_data::generate_mixed_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(
            uumain,
            &["-r", file_path.to_str().unwrap()],
        ));
    });
}
/// Benchmark unique sorting (`-u`) on mixed-locale data.
#[divan::bench]
fn sort_unique_mixed(bencher: Bencher) {
    let data = text_data::generate_mixed_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(
            uumain,
            &["-u", file_path.to_str().unwrap()],
        ));
    });
}
// Divan entry point: discovers and runs every #[divan::bench] function above.
fn main() {
divan::main();
}

View file

@ -5,9 +5,9 @@
use divan::{Bencher, black_box};
use uu_tsort::uumain;
use uucore::benchmark::{create_test_file, run_util_function};
use uucore::benchmark::{run_util_function, setup_test_file};
/// Generate topological sort test data with different characteristics
/// Generate topological sort test data - linear chain
fn generate_linear_chain(num_nodes: usize) -> Vec<u8> {
let mut data = Vec::new();
@ -18,7 +18,7 @@ fn generate_linear_chain(num_nodes: usize) -> Vec<u8> {
data
}
/// Generate a DAG with more complex dependencies
/// Generate a DAG with tree-like structure
fn generate_tree_dag(depth: usize, branching_factor: usize) -> Vec<u8> {
let mut data = Vec::new();
let mut node_id = 0;
@ -116,64 +116,8 @@ fn generate_wide_dag(num_nodes: usize) -> Vec<u8> {
data
}
/// Benchmark linear chain graphs of different sizes
/// This tests the performance improvements mentioned in PR #8694
#[divan::bench(args = [1_000, 10_000, 100_000, 1_000_000])]
fn tsort_linear_chain(bencher: Bencher, num_nodes: usize) {
let temp_dir = tempfile::tempdir().unwrap();
let data = generate_linear_chain(num_nodes);
let file_path = create_test_file(&data, temp_dir.path());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &[file_path_str]));
});
}
/// Benchmark tree-like DAG structures
#[divan::bench(args = [(4, 3), (5, 3), (6, 2), (7, 2)])]
fn tsort_tree_dag(bencher: Bencher, (depth, branching): (usize, usize)) {
let temp_dir = tempfile::tempdir().unwrap();
let data = generate_tree_dag(depth, branching);
let file_path = create_test_file(&data, temp_dir.path());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &[file_path_str]));
});
}
/// Benchmark complex DAG with cross-dependencies
#[divan::bench(args = [1_000, 5_000, 10_000, 50_000])]
fn tsort_complex_dag(bencher: Bencher, num_nodes: usize) {
let temp_dir = tempfile::tempdir().unwrap();
let data = generate_complex_dag(num_nodes);
let file_path = create_test_file(&data, temp_dir.path());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &[file_path_str]));
});
}
/// Benchmark wide DAG with many parallel chains
/// This should stress the hashmap optimizations from PR #8694
#[divan::bench(args = [10_000, 50_000, 100_000])]
fn tsort_wide_dag(bencher: Bencher, num_nodes: usize) {
let temp_dir = tempfile::tempdir().unwrap();
let data = generate_wide_dag(num_nodes);
let file_path = create_test_file(&data, temp_dir.path());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &[file_path_str]));
});
}
/// Benchmark input parsing vs computation by using files with different edge densities
#[divan::bench(args = [10_000, 50_000])]
fn tsort_input_parsing_heavy(bencher: Bencher, num_edges: usize) {
let temp_dir = tempfile::tempdir().unwrap();
/// Generate DAG data for input parsing stress tests
fn generate_input_parsing_heavy(num_edges: usize) -> Vec<u8> {
// Create a scenario with many edges but relatively few unique nodes
// This stresses the input parsing and graph construction optimizations
let num_unique_nodes = (num_edges as f64).sqrt() as usize;
@ -187,7 +131,64 @@ fn tsort_input_parsing_heavy(bencher: Bencher, num_edges: usize) {
}
}
let file_path = create_test_file(&data, temp_dir.path());
data
}
/// Benchmark linear chain graphs of different sizes.
/// This tests the performance improvements mentioned in PR #8694.
#[divan::bench(args = [1_000, 10_000, 100_000, 1_000_000])]
fn tsort_linear_chain(bencher: Bencher, num_nodes: usize) {
    let input = generate_linear_chain(num_nodes);
    let path = setup_test_file(&input);
    let path_str = path.to_str().unwrap();
    bencher.bench(|| black_box(run_util_function(uumain, &[path_str])));
}
/// Benchmark tree-like DAG structures.
#[divan::bench(args = [(4, 3), (5, 3), (6, 2), (7, 2)])]
fn tsort_tree_dag(bencher: Bencher, (depth, branching): (usize, usize)) {
    let input = generate_tree_dag(depth, branching);
    let path = setup_test_file(&input);
    let path_str = path.to_str().unwrap();
    bencher.bench(|| black_box(run_util_function(uumain, &[path_str])));
}
/// Benchmark complex DAG with cross-dependencies.
#[divan::bench(args = [1_000, 5_000, 10_000, 50_000])]
fn tsort_complex_dag(bencher: Bencher, num_nodes: usize) {
    let input = generate_complex_dag(num_nodes);
    let path = setup_test_file(&input);
    let path_str = path.to_str().unwrap();
    bencher.bench(|| black_box(run_util_function(uumain, &[path_str])));
}
/// Benchmark wide DAG with many parallel chains.
/// This should stress the hashmap optimizations from PR #8694.
#[divan::bench(args = [10_000, 50_000, 100_000])]
fn tsort_wide_dag(bencher: Bencher, num_nodes: usize) {
    let input = generate_wide_dag(num_nodes);
    let path = setup_test_file(&input);
    let path_str = path.to_str().unwrap();
    bencher.bench(|| black_box(run_util_function(uumain, &[path_str])));
}
/// Benchmark input parsing vs computation by using files with different edge densities
#[divan::bench(args = [10_000, 50_000])]
fn tsort_input_parsing_heavy(bencher: Bencher, num_edges: usize) {
let data = generate_input_parsing_heavy(num_edges);
let file_path = setup_test_file(&data);
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {

View file

@ -49,6 +49,7 @@ os_display = "0.1.3"
# Benchmark dependencies (optional)
divan = { workspace = true, optional = true }
tempfile = { workspace = true, optional = true }
digest = { workspace = true, optional = true }
hex = { workspace = true, optional = true }
@ -173,4 +174,4 @@ wide = []
tty = []
time = ["jiff"]
uptime = ["chrono", "libc", "windows-sys", "utmpx", "utmp-classic"]
benchmark = ["divan"]
benchmark = ["divan", "tempfile"]

View file

@ -32,6 +32,16 @@ where
util_func(os_args.into_iter())
}
/// Helper function to set up a temporary test file and leak the temporary
/// directory so it persists for the duration of the benchmark
///
/// Returns the path of a file containing `data`, created inside a fresh
/// temporary directory that is intentionally never dropped — dropping the
/// `TempDir` guard would delete the directory while benchmark iterations
/// may still be reading the file.
pub fn setup_test_file(data: &[u8]) -> PathBuf {
let temp_dir = tempfile::tempdir().unwrap();
let file_path = create_test_file(data, temp_dir.path());
// Keep temp_dir alive by leaking it - the OS will clean it up
std::mem::forget(temp_dir);
file_path
}
/// Generate test data with different characteristics for text processing utilities
pub mod text_data {
/// Generate test data with a specific size in MB and average line length
@ -85,4 +95,210 @@ pub mod text_data {
data
}
/// Helper function to generate test data from a list of words.
///
/// Produces `num_lines` lines of the form `{word}_{NNN}\n`, cycling through
/// `words` and suffixing each line with the line index modulo 1000,
/// zero-padded to three digits.
pub fn generate_data_from_words(words: &[&str], num_lines: usize) -> Vec<u8> {
    (0..num_lines)
        .flat_map(|i| {
            let word = words[i % words.len()];
            format!("{word}_{:03}\n", i % 1000).into_bytes()
        })
        .collect()
}
/// Helper function to generate test data from a list of words without a
/// number suffix: `num_lines` lines, each a word from `words` (cycled).
pub fn generate_data_from_words_simple(words: &[&str], num_lines: usize) -> Vec<u8> {
    let mut out = Vec::new();
    for word in (0..num_lines).map(|i| words[i % words.len()]) {
        out.extend_from_slice(word.as_bytes());
        out.push(b'\n');
    }
    out
}
/// Helper function to generate test data from a list of words with a
/// running counter: `num_lines` lines of the form `{word}{IIII}\n`, where
/// the counter is the line index zero-padded to four digits.
pub fn generate_data_from_words_with_counter(words: &[&str], num_lines: usize) -> Vec<u8> {
    (0..num_lines)
        .flat_map(|i| format!("{}{i:04}\n", words[i % words.len()]).into_bytes())
        .collect()
}
/// Generate test data with ASCII-only text: 25 fruit/food words cycled with
/// a numeric suffix per line (see [`generate_data_from_words`]).
pub fn generate_ascii_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 25] = [
        "apple", "banana", "cherry", "date", "elderberry", "fig", "grape",
        "honeydew", "kiwi", "lemon", "mango", "nectarine", "orange", "papaya",
        "quince", "raspberry", "strawberry", "tangerine", "ugli", "vanilla",
        "watermelon", "xigua", "yellow", "zucchini", "avocado",
    ];
    generate_data_from_words(&WORDS, num_lines)
}
/// Generate simple ASCII data with line numbers in descending order:
/// `line_NNNNNN\n` from `num_lines - 1` down to `0` (six-digit zero-padded).
pub fn generate_ascii_data_simple(num_lines: usize) -> Vec<u8> {
    (0..num_lines)
        .rev()
        .flat_map(|n| format!("line_{n:06}\n").into_bytes())
        .collect()
}
/// Generate test data with accented characters that require locale-aware
/// sorting: 30 French/German/Spanish words cycled with a numeric suffix.
pub fn generate_accented_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 30] = [
        // French words with accents
        "café", "naïve", "résumé", "fiancé", "crème", "déjà", "façade",
        "château", "élève", "côte",
        // German words with umlauts
        "über", "Müller", "schön", "Köln", "Düsseldorf", "Österreich",
        "Zürich", "Mädchen", "Bär", "größer",
        // Spanish words with tildes and accents
        "niño", "señor", "año", "mañana", "español", "corazón", "María",
        "José", "más", "también",
    ];
    generate_data_from_words(&WORDS, num_lines)
}
/// Generate test data with mixed ASCII and non-ASCII characters: 25
/// interleaved plain/accented words cycled with a numeric suffix.
pub fn generate_mixed_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 25] = [
        // Mix of ASCII and accented words
        "apple", "café", "banana", "naïve", "cherry", "résumé", "date",
        "fiancé", "elderberry", "crème", "über", "grape", "Müller",
        "honeydew", "schön", "niño", "kiwi", "señor", "lemon", "año",
        "mango", "María", "orange", "José", "papaya",
    ];
    generate_data_from_words(&WORDS, num_lines)
}
/// Generate mixed-locale data with a per-line counter suffix
/// (see [`generate_data_from_words_with_counter`]).
pub fn generate_mixed_locale_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 16] = [
        "zebra", "äpfel", "banana", "öl", "cat", "über", "dog", "zürich",
        "elephant", "café", "fish", "naïve", "grape", "résumé", "house",
        "piñata",
    ];
    generate_data_from_words_with_counter(&WORDS, num_lines)
}
/// Generate German locale-specific data (umlauts, ß, and their plain
/// counterparts) with a per-line counter suffix.
pub fn generate_german_locale_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 24] = [
        "Ärger", "Öffnung", "Über", "Zucker", "Bär", "Föhn", "Größe", "Höhe",
        "Käse", "Löwe", "Mädchen", "Nüsse", "Röntgen", "Schäfer", "Tür",
        "Würfel", "ä", "ö", "ü", "ß", "a", "o", "u", "s",
    ];
    generate_data_from_words_with_counter(&WORDS, num_lines)
}
/// Generate test data with uppercase/lowercase variations of the same
/// words (no suffix), for case-sensitivity benchmarks.
pub fn generate_case_sensitive_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 12] = [
        "apple", "Apple", "APPLE", "banana", "Banana", "BANANA", "café",
        "Café", "CAFÉ", "über", "Über", "ÜBER",
    ];
    generate_data_from_words_simple(&WORDS, num_lines)
}
/// Generate deterministic pseudo-random strings with a mixed charset that
/// includes accented characters.
///
/// NOTE(review): the charset is indexed *byte-wise*, so the multi-byte UTF-8
/// characters contribute their individual bytes, each re-encoded via
/// `as char` (Latin-1 interpretation). Output is deterministic; presumably
/// this byte-level mixing is intentional for benchmark variety — confirm.
pub fn generate_random_strings(num_lines: usize, length: usize) -> Vec<u8> {
    const CHARSET: &str =
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789äöüÄÖÜßéèêëàâîïôûç";
    let pool = CHARSET.as_bytes();
    let mut out = Vec::new();
    for i in 0..num_lines {
        let mut line = String::new();
        for j in 0..length {
            // Simple linear-congruential-style index, fully deterministic.
            let idx = ((i * length + j) * 17 + 42) % pool.len();
            line.push(pool[idx] as char);
        }
        line.push('\n');
        out.extend_from_slice(line.as_bytes());
    }
    out
}
/// Generate numeric data for benchmarking: the integers `1..=count` as
/// decimal strings joined by newlines (no trailing newline).
pub fn generate_numbers(count: usize) -> String {
    let mut out = String::new();
    for n in 1..=count {
        if n > 1 {
            out.push('\n');
        }
        out.push_str(&n.to_string());
    }
    out
}
}