diff --git a/.vscode/cSpell.json b/.vscode/cSpell.json index 70f821786..5d3e3524b 100644 --- a/.vscode/cSpell.json +++ b/.vscode/cSpell.json @@ -32,6 +32,8 @@ ".devcontainer/**", "util/gnu-patches/**", "docs/src/release-notes/**", + "src/uu/*/benches/*.rs", + "src/uucore/src/lib/features/benchmark.rs", ], "enableGlobDot": true, diff --git a/Cargo.lock b/Cargo.lock index d2cf4d7af..4f1786416 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3930,6 +3930,7 @@ dependencies = [ "bigdecimal", "binary-heap-plus", "clap", + "codspeed-divan-compat", "compare", "ctrlc", "fluent", diff --git a/src/uu/numfmt/benches/numfmt_bench.rs b/src/uu/numfmt/benches/numfmt_bench.rs index ee09b7d0f..d72bfa3f7 100644 --- a/src/uu/numfmt/benches/numfmt_bench.rs +++ b/src/uu/numfmt/benches/numfmt_bench.rs @@ -4,45 +4,32 @@ // file that was distributed with this source code. use divan::{Bencher, black_box}; -use tempfile::TempDir; use uu_numfmt::uumain; -use uucore::benchmark::{create_test_file, run_util_function}; - -/// Generate numeric data for benchmarking -fn generate_numbers(count: usize) -> String { - (1..=count) - .map(|n| n.to_string()) - .collect::>() - .join("\n") -} - -/// Setup benchmark environment with test data -fn setup_benchmark(data: String) -> (TempDir, String) { - let temp_dir = tempfile::tempdir().unwrap(); - let file_path = create_test_file(data.as_bytes(), temp_dir.path()); - let file_path_str = file_path.to_str().unwrap().to_string(); - (temp_dir, file_path_str) -} +use uucore::benchmark::{run_util_function, setup_test_file, text_data}; /// Benchmark SI formatting with different number counts #[divan::bench(args = [1_000_000])] fn numfmt_to_si(bencher: Bencher, count: usize) { - let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count)); + let data = text_data::generate_numbers(count); + let file_path = setup_test_file(data.as_bytes()); + let file_path_str = file_path.to_str().unwrap(); bencher.bench(|| { - black_box(run_util_function(uumain, &["--to=si", &file_path_str])); + black_box(run_util_function(uumain, &["--to=si", file_path_str])); }); } /// Benchmark SI formatting with precision format #[divan::bench(args = [1_000_000])] fn numfmt_to_si_precision(bencher: Bencher, count: usize) { - let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count)); + let data = text_data::generate_numbers(count); + let file_path = setup_test_file(data.as_bytes()); + let file_path_str = file_path.to_str().unwrap(); bencher.bench(|| { black_box(run_util_function( uumain, - &["--to=si", "--format=%.6f", &file_path_str], + &["--to=si", "--format=%.6f", file_path_str], )); }); } @@ -50,10 +37,12 @@ fn numfmt_to_si_precision(bencher: Bencher, count: usize) { /// Benchmark IEC (binary) formatting #[divan::bench(args = [1_000_000])] fn numfmt_to_iec(bencher: Bencher, count: usize) { - let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count)); + let data = text_data::generate_numbers(count); + let file_path = setup_test_file(data.as_bytes()); + let file_path_str = file_path.to_str().unwrap(); bencher.bench(|| { - black_box(run_util_function(uumain, &["--to=iec", &file_path_str])); + black_box(run_util_function(uumain, &["--to=iec", file_path_str])); }); } @@ -65,10 +54,11 @@ fn numfmt_from_si(bencher: Bencher, count: usize) { .map(|n| format!("{:.1}K", n as f64 / 1000.0)) .collect::>() .join("\n"); - let (_temp_dir, file_path_str) = setup_benchmark(data); + let file_path = setup_test_file(data.as_bytes()); + let file_path_str = file_path.to_str().unwrap(); bencher.bench(|| { - black_box(run_util_function(uumain, &["--from=si", &file_path_str])); + black_box(run_util_function(uumain, &["--from=si", file_path_str])); }); } @@ -80,23 +70,26 @@ fn numfmt_large_numbers_si(bencher: Bencher, count: usize) { .map(|n| (n * 1_000_000).to_string()) .collect::>() .join("\n"); - let (_temp_dir, file_path_str) = setup_benchmark(data); + let file_path = setup_test_file(data.as_bytes()); + let file_path_str = file_path.to_str().unwrap(); bencher.bench(|| { - black_box(run_util_function(uumain, &["--to=si", &file_path_str])); + black_box(run_util_function(uumain, &["--to=si", file_path_str])); }); } /// Benchmark different padding widths #[divan::bench(args = [(1_000_000, 5), (1_000_000, 50)])] fn numfmt_padding(bencher: Bencher, (count, padding): (usize, usize)) { - let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count)); + let data = text_data::generate_numbers(count); + let file_path = setup_test_file(data.as_bytes()); + let file_path_str = file_path.to_str().unwrap(); let padding_arg = format!("--padding={padding}"); bencher.bench(|| { black_box(run_util_function( uumain, - &["--to=si", &padding_arg, &file_path_str], + &["--to=si", &padding_arg, file_path_str], )); }); } @@ -104,13 +97,15 @@ fn numfmt_padding(bencher: Bencher, (count, padding): (usize, usize)) { /// Benchmark round modes with SI formatting #[divan::bench(args = [("up", 100_000), ("down", 1_000_000), ("towards-zero", 1_000_000)])] fn numfmt_round_modes(bencher: Bencher, (round_mode, count): (&str, usize)) { - let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count)); + let data = text_data::generate_numbers(count); + let file_path = setup_test_file(data.as_bytes()); + let file_path_str = file_path.to_str().unwrap(); let round_arg = format!("--round={round_mode}"); bencher.bench(|| { black_box(run_util_function( uumain, - &["--to=si", &round_arg, &file_path_str], + &["--to=si", &round_arg, file_path_str], )); }); } diff --git a/src/uu/sort/Cargo.toml b/src/uu/sort/Cargo.toml index 789f6fec5..816e42d20 100644 --- a/src/uu/sort/Cargo.toml +++ b/src/uu/sort/Cargo.toml @@ -40,6 +40,21 @@ fluent = { workspace = true } [target.'cfg(target_os = "linux")'.dependencies] nix = { workspace = true } +[dev-dependencies] +divan = { workspace = true } +tempfile = { workspace = true } +uucore = { workspace = true, features = [ + "benchmark", + "fs", + "parser", + "version-cmp", + "i18n-collator", +] } + [[bin]] name = "sort" path = "src/main.rs" + +[[bench]] +name = "sort_bench" +harness = false diff --git a/src/uu/sort/benches/sort_bench.rs b/src/uu/sort/benches/sort_bench.rs new file mode 100644 index 000000000..ce1eb839f --- /dev/null +++ b/src/uu/sort/benches/sort_bench.rs @@ -0,0 +1,158 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use divan::{Bencher, black_box}; +use uu_sort::uumain; +use uucore::benchmark::{run_util_function, setup_test_file, text_data}; + +/// Benchmark sorting ASCII-only data +#[divan::bench(args = [100_000, 500_000])] +fn sort_ascii_only(bencher: Bencher, num_lines: usize) { + let data = text_data::generate_ascii_data(num_lines); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark sorting accented/non-ASCII data +#[divan::bench(args = [100_000, 500_000])] +fn sort_accented_data(bencher: Bencher, num_lines: usize) { + let data = text_data::generate_accented_data(num_lines); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark sorting mixed ASCII/non-ASCII data +#[divan::bench(args = [100_000, 500_000])] +fn sort_mixed_data(bencher: Bencher, num_lines: usize) { + let data = text_data::generate_mixed_data(num_lines); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark case-sensitive sorting with mixed case data +#[divan::bench(args = [100_000, 500_000])] +fn sort_case_sensitive(bencher: Bencher, num_lines: usize) { + let data = text_data::generate_case_sensitive_data(num_lines); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark case-insensitive sorting (fold case) +#[divan::bench(args = [100_000, 500_000])] +fn sort_case_insensitive(bencher: Bencher, num_lines: usize) { + let data = text_data::generate_case_sensitive_data(num_lines); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + black_box(run_util_function( + uumain, + &["-f", file_path.to_str().unwrap()], + )); + }); +} + +/// Benchmark dictionary order sorting (only blanks and alphanumeric) +#[divan::bench(args = [100_000, 500_000])] +fn sort_dictionary_order(bencher: Bencher, num_lines: usize) { + let data = text_data::generate_mixed_data(num_lines); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + black_box(run_util_function( + uumain, + &["-d", file_path.to_str().unwrap()], + )); + }); +} + +/// Benchmark numeric sorting with mixed data +#[divan::bench(args = [100_000, 500_000])] +fn sort_numeric(bencher: Bencher, num_lines: usize) { + let mut data = Vec::new(); + + // Generate numeric data with some text prefixes + for i in 0..num_lines { + let value = (i * 13) % 10000; // Pseudo-random numeric values + data.extend_from_slice(format!("value_{value}\n").as_bytes()); + } + + let file_path = setup_test_file(&data); + + bencher.bench(|| { + black_box(run_util_function( + uumain, + &["-n", file_path.to_str().unwrap()], + )); + }); +} + +/// Benchmark reverse sorting with locale-aware data +#[divan::bench(args = [100_000, 500_000])] +fn sort_reverse_locale(bencher: Bencher, num_lines: usize) { + let data = text_data::generate_accented_data(num_lines); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + black_box(run_util_function( + uumain, + &["-r", file_path.to_str().unwrap()], + )); + }); +} + +/// Benchmark sorting with specific key field +#[divan::bench(args = [100_000, 500_000])] +fn sort_key_field(bencher: Bencher, num_lines: usize) { + let mut data = Vec::new(); + + // Generate data with multiple fields + let words = ["café", "naïve", "apple", "über", "banana"]; + for i in 0..num_lines { + let word = words[i % words.len()]; + let num1 = i % 100; + let num2 = (i * 7) % 100; + data.extend_from_slice(format!("{num1}\t{word}\t{num2}\n").as_bytes()); + } + + let file_path = setup_test_file(&data); + + bencher.bench(|| { + // Sort by second field + black_box(run_util_function( + uumain, + &["-k", "2", file_path.to_str().unwrap()], + )); + }); +} + +/// Benchmark unique sorting with locale-aware data +#[divan::bench(args = [100_000, 500_000])] +fn sort_unique_locale(bencher: Bencher, num_lines: usize) { + let data = text_data::generate_accented_data(num_lines); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + black_box(run_util_function( + uumain, + &["-u", file_path.to_str().unwrap()], + )); + }); +} + +fn main() { + divan::main(); +} diff --git a/src/uu/sort/benches/sort_locale_bench.rs b/src/uu/sort/benches/sort_locale_bench.rs new file mode 100644 index 000000000..7747b17f4 --- /dev/null +++ b/src/uu/sort/benches/sort_locale_bench.rs @@ -0,0 +1,166 @@ +// This file is part of the uutils coreutils package. +// +// For the full copyright and license information, please view the LICENSE +// file that was distributed with this source code. + +use divan::{Bencher, black_box}; +use std::env; +use uu_sort::uumain; +use uucore::benchmark::{run_util_function, setup_test_file, text_data}; + +/// Benchmark ASCII-only data sorting with C locale (byte comparison) +#[divan::bench] +fn sort_ascii_c_locale(bencher: Bencher) { + let data = text_data::generate_ascii_data_simple(100_000); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "C"); + } + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark ASCII-only data sorting with UTF-8 locale +#[divan::bench] +fn sort_ascii_utf8_locale(bencher: Bencher) { + let data = text_data::generate_ascii_data_simple(10_000); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "en_US.UTF-8"); + } + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark mixed ASCII/Unicode data with C locale +#[divan::bench] +fn sort_mixed_c_locale(bencher: Bencher) { + let data = text_data::generate_mixed_locale_data(10_000); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "C"); + } + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark mixed ASCII/Unicode data with UTF-8 locale +#[divan::bench] +fn sort_mixed_utf8_locale(bencher: Bencher) { + let data = text_data::generate_mixed_locale_data(10_000); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "en_US.UTF-8"); + } + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark German locale-specific data with C locale +#[divan::bench] +fn sort_german_c_locale(bencher: Bencher) { + let data = text_data::generate_german_locale_data(10_000); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "C"); + } + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark German locale-specific data with German locale +#[divan::bench] +fn sort_german_locale(bencher: Bencher) { + let data = text_data::generate_german_locale_data(10_000); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "de_DE.UTF-8"); + } + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark random strings of different lengths +#[divan::bench] +fn sort_random_strings(bencher: Bencher) { + let data = text_data::generate_random_strings(10_000, 50); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "en_US.UTF-8"); + } + black_box(run_util_function(uumain, &[file_path.to_str().unwrap()])); + }); +} + +/// Benchmark numeric sorting performance +#[divan::bench] +fn sort_numeric(bencher: Bencher) { + let mut data = Vec::new(); + for i in 0..10_000 { + let line = format!("{}\n", 10_000 - i); + data.extend_from_slice(line.as_bytes()); + } + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "en_US.UTF-8"); + } + black_box(run_util_function( + uumain, + &["-n", file_path.to_str().unwrap()], + )); + }); +} + +/// Benchmark reverse sorting +#[divan::bench] +fn sort_reverse_mixed(bencher: Bencher) { + let data = text_data::generate_mixed_locale_data(10_000); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "en_US.UTF-8"); + } + black_box(run_util_function( + uumain, + &["-r", file_path.to_str().unwrap()], + )); + }); +} + +/// Benchmark unique sorting +#[divan::bench] +fn sort_unique_mixed(bencher: Bencher) { + let data = text_data::generate_mixed_locale_data(10_000); + let file_path = setup_test_file(&data); + + bencher.bench(|| { + unsafe { + env::set_var("LC_ALL", "en_US.UTF-8"); + } + black_box(run_util_function( + uumain, + &["-u", file_path.to_str().unwrap()], + )); + }); +} + +fn main() { + divan::main(); +} diff --git a/src/uu/tsort/benches/tsort_bench.rs b/src/uu/tsort/benches/tsort_bench.rs index a222b59a2..157f2a0c4 100644 --- a/src/uu/tsort/benches/tsort_bench.rs +++ b/src/uu/tsort/benches/tsort_bench.rs @@ -5,9 +5,9 @@ use divan::{Bencher, black_box}; use uu_tsort::uumain; -use uucore::benchmark::{create_test_file, run_util_function}; +use uucore::benchmark::{run_util_function, setup_test_file}; -/// Generate topological sort test data with different characteristics +/// Generate topological sort test data - linear chain fn generate_linear_chain(num_nodes: usize) -> Vec { let mut data = Vec::new(); @@ -18,7 +18,7 @@ fn generate_linear_chain(num_nodes: usize) -> Vec { data } -/// Generate a DAG with more complex dependencies +/// Generate a DAG with tree-like structure fn generate_tree_dag(depth: usize, branching_factor: usize) -> Vec { let mut data = Vec::new(); let mut node_id = 0; @@ -116,64 +116,8 @@ fn generate_wide_dag(num_nodes: usize) -> Vec { data } -/// Benchmark linear chain graphs of different sizes -/// This tests the performance improvements mentioned in PR #8694 -#[divan::bench(args = [1_000, 10_000, 100_000, 1_000_000])] -fn tsort_linear_chain(bencher: Bencher, num_nodes: usize) { - let temp_dir = tempfile::tempdir().unwrap(); - let data = generate_linear_chain(num_nodes); - let file_path = create_test_file(&data, temp_dir.path()); - let file_path_str = file_path.to_str().unwrap(); - - bencher.bench(|| { - black_box(run_util_function(uumain, &[file_path_str])); - }); -} - -/// Benchmark tree-like DAG structures -#[divan::bench(args = [(4, 3), (5, 3), (6, 2), (7, 2)])] -fn tsort_tree_dag(bencher: Bencher, (depth, branching): (usize, usize)) { - let temp_dir = tempfile::tempdir().unwrap(); - let data = generate_tree_dag(depth, branching); - let file_path = create_test_file(&data, temp_dir.path()); - let file_path_str = file_path.to_str().unwrap(); - - bencher.bench(|| { - black_box(run_util_function(uumain, &[file_path_str])); - }); -} - -/// Benchmark complex DAG with cross-dependencies -#[divan::bench(args = [1_000, 5_000, 10_000, 50_000])] -fn tsort_complex_dag(bencher: Bencher, num_nodes: usize) { - let temp_dir = tempfile::tempdir().unwrap(); - let data = generate_complex_dag(num_nodes); - let file_path = create_test_file(&data, temp_dir.path()); - let file_path_str = file_path.to_str().unwrap(); - - bencher.bench(|| { - black_box(run_util_function(uumain, &[file_path_str])); - }); -} - -/// Benchmark wide DAG with many parallel chains -/// This should stress the hashmap optimizations from PR #8694 -#[divan::bench(args = [10_000, 50_000, 100_000])] -fn tsort_wide_dag(bencher: Bencher, num_nodes: usize) { - let temp_dir = tempfile::tempdir().unwrap(); - let data = generate_wide_dag(num_nodes); - let file_path = create_test_file(&data, temp_dir.path()); - let file_path_str = file_path.to_str().unwrap(); - - bencher.bench(|| { - black_box(run_util_function(uumain, &[file_path_str])); - }); -} - -/// Benchmark input parsing vs computation by using files with different edge densities -#[divan::bench(args = [10_000, 50_000])] -fn tsort_input_parsing_heavy(bencher: Bencher, num_edges: usize) { - let temp_dir = tempfile::tempdir().unwrap(); +/// Generate DAG data for input parsing stress tests +fn generate_input_parsing_heavy(num_edges: usize) -> Vec { // Create a scenario with many edges but relatively few unique nodes // This stresses the input parsing and graph construction optimizations let num_unique_nodes = (num_edges as f64).sqrt() as usize; @@ -187,7 +131,64 @@ fn tsort_input_parsing_heavy(bencher: Bencher, num_edges: usize) { } } - let file_path = create_test_file(&data, temp_dir.path()); + data +} + +/// Benchmark linear chain graphs of different sizes +/// This tests the performance improvements mentioned in PR #8694 +#[divan::bench(args = [1_000, 10_000, 100_000, 1_000_000])] +fn tsort_linear_chain(bencher: Bencher, num_nodes: usize) { + let data = generate_linear_chain(num_nodes); + let file_path = setup_test_file(&data); + let file_path_str = file_path.to_str().unwrap(); + + bencher.bench(|| { + black_box(run_util_function(uumain, &[file_path_str])); + }); +} + +/// Benchmark tree-like DAG structures +#[divan::bench(args = [(4, 3), (5, 3), (6, 2), (7, 2)])] +fn tsort_tree_dag(bencher: Bencher, (depth, branching): (usize, usize)) { + let data = generate_tree_dag(depth, branching); + let file_path = setup_test_file(&data); + let file_path_str = file_path.to_str().unwrap(); + + bencher.bench(|| { + black_box(run_util_function(uumain, &[file_path_str])); + }); +} + +/// Benchmark complex DAG with cross-dependencies +#[divan::bench(args = [1_000, 5_000, 10_000, 50_000])] +fn tsort_complex_dag(bencher: Bencher, num_nodes: usize) { + let data = generate_complex_dag(num_nodes); + let file_path = setup_test_file(&data); + let file_path_str = file_path.to_str().unwrap(); + + bencher.bench(|| { + black_box(run_util_function(uumain, &[file_path_str])); + }); +} + +/// Benchmark wide DAG with many parallel chains +/// This should stress the hashmap optimizations from PR #8694 +#[divan::bench(args = [10_000, 50_000, 100_000])] +fn tsort_wide_dag(bencher: Bencher, num_nodes: usize) { + let data = generate_wide_dag(num_nodes); + let file_path = setup_test_file(&data); + let file_path_str = file_path.to_str().unwrap(); + + bencher.bench(|| { + black_box(run_util_function(uumain, &[file_path_str])); + }); +} + +/// Benchmark input parsing vs computation by using files with different edge densities +#[divan::bench(args = [10_000, 50_000])] +fn tsort_input_parsing_heavy(bencher: Bencher, num_edges: usize) { + let data = generate_input_parsing_heavy(num_edges); + let file_path = setup_test_file(&data); let file_path_str = file_path.to_str().unwrap(); bencher.bench(|| { diff --git a/src/uucore/Cargo.toml b/src/uucore/Cargo.toml index 2060863be..b05bc1564 100644 --- a/src/uucore/Cargo.toml +++ b/src/uucore/Cargo.toml @@ -49,6 +49,7 @@ os_display = "0.1.3" # Benchmark dependencies (optional) divan = { workspace = true, optional = true } +tempfile = { workspace = true, optional = true } digest = { workspace = true, optional = true } hex = { workspace = true, optional = true } @@ -173,4 +174,4 @@ wide = [] tty = [] time = ["jiff"] uptime = ["chrono", "libc", "windows-sys", "utmpx", "utmp-classic"] -benchmark = ["divan"] +benchmark = ["divan", "tempfile"] diff --git a/src/uucore/src/lib/features/benchmark.rs b/src/uucore/src/lib/features/benchmark.rs index 4358fabd6..bc6949384 100644 --- a/src/uucore/src/lib/features/benchmark.rs +++ b/src/uucore/src/lib/features/benchmark.rs @@ -32,6 +32,16 @@ where util_func(os_args.into_iter()) } +/// Helper function to set up a temporary test file and leak the temporary directory +/// so it persists for the duration of the benchmark +pub fn setup_test_file(data: &[u8]) -> PathBuf { + let temp_dir = tempfile::tempdir().unwrap(); + let file_path = create_test_file(data, temp_dir.path()); + // Keep temp_dir alive by leaking it - the OS will clean it up + std::mem::forget(temp_dir); + file_path +} + /// Generate test data with different characteristics for text processing utilities pub mod text_data { /// Generate test data with a specific size in MB and average line length @@ -85,4 +95,210 @@ pub mod text_data { data } + + /// Helper function to generate test data from a list of words + pub fn generate_data_from_words(words: &[&str], num_lines: usize) -> Vec { + let mut data = Vec::new(); + for i in 0..num_lines { + let word = words[i % words.len()]; + let number = i % 1000; + data.extend_from_slice(format!("{word}_{number:03}\n").as_bytes()); + } + data + } + + /// Helper function to generate test data from a list of words without number suffix + pub fn generate_data_from_words_simple(words: &[&str], num_lines: usize) -> Vec { + let mut data = Vec::new(); + for i in 0..num_lines { + let word = words[i % words.len()]; + data.extend_from_slice(format!("{word}\n").as_bytes()); + } + data + } + + /// Helper function to generate test data from a list of words with counter + pub fn generate_data_from_words_with_counter(words: &[&str], num_lines: usize) -> Vec { + let mut data = Vec::new(); + for i in 0..num_lines { + let word = words[i % words.len()]; + let line = format!("{word}{i:04}\n"); + data.extend_from_slice(line.as_bytes()); + } + data + } + + /// Generate test data with ASCII-only text + pub fn generate_ascii_data(num_lines: usize) -> Vec { + let words = [ + "apple", + "banana", + "cherry", + "date", + "elderberry", + "fig", + "grape", + "honeydew", + "kiwi", + "lemon", + "mango", + "nectarine", + "orange", + "papaya", + "quince", + "raspberry", + "strawberry", + "tangerine", + "ugli", + "vanilla", + "watermelon", + "xigua", + "yellow", + "zucchini", + "avocado", + ]; + + generate_data_from_words(&words, num_lines) + } + + /// Generate simple ASCII data with line numbers + pub fn generate_ascii_data_simple(num_lines: usize) -> Vec { + let mut data = Vec::new(); + for i in 0..num_lines { + let line = format!("line_{:06}\n", (num_lines - i - 1)); + data.extend_from_slice(line.as_bytes()); + } + data + } + + /// Generate test data with accented characters that require locale-aware sorting + pub fn generate_accented_data(num_lines: usize) -> Vec { + let words = [ + // French words with accents + "café", + "naïve", + "résumé", + "fiancé", + "crème", + "déjà", + "façade", + "château", + "élève", + "côte", + // German words with umlauts + "über", + "Müller", + "schön", + "Köln", + "Düsseldorf", + "Österreich", + "Zürich", + "Mädchen", + "Bär", + "größer", + // Spanish words with tildes and accents + "niño", + "señor", + "año", + "mañana", + "español", + "corazón", + "María", + "José", + "más", + "también", + ]; + + generate_data_from_words(&words, num_lines) + } + + /// Generate test data with mixed ASCII and non-ASCII characters + pub fn generate_mixed_data(num_lines: usize) -> Vec { + let words = [ + // Mix of ASCII and accented words + "apple", + "café", + "banana", + "naïve", + "cherry", + "résumé", + "date", + "fiancé", + "elderberry", + "crème", + "über", + "grape", + "Müller", + "honeydew", + "schön", + "niño", + "kiwi", + "señor", + "lemon", + "año", + "mango", + "María", + "orange", + "José", + "papaya", + ]; + + generate_data_from_words(&words, num_lines) + } + + /// Generate mixed locale data with counter + pub fn generate_mixed_locale_data(num_lines: usize) -> Vec { + let mixed_strings = [ + "zebra", "äpfel", "banana", "öl", "cat", "über", "dog", "zürich", "elephant", "café", + "fish", "naïve", "grape", "résumé", "house", "piñata", + ]; + generate_data_from_words_with_counter(&mixed_strings, num_lines) + } + + /// Generate German locale-specific data + pub fn generate_german_locale_data(num_lines: usize) -> Vec { + let german_words = [ + "Ärger", "Öffnung", "Über", "Zucker", "Bär", "Föhn", "Größe", "Höhe", "Käse", "Löwe", + "Mädchen", "Nüsse", "Röntgen", "Schäfer", "Tür", "Würfel", "ä", "ö", "ü", "ß", "a", + "o", "u", "s", + ]; + generate_data_from_words_with_counter(&german_words, num_lines) + } + + /// Generate test data with uppercase/lowercase variations + pub fn generate_case_sensitive_data(num_lines: usize) -> Vec { + let base_words = [ + "apple", "Apple", "APPLE", "banana", "Banana", "BANANA", "café", "Café", "CAFÉ", + "über", "Über", "ÜBER", + ]; + + generate_data_from_words_simple(&base_words, num_lines) + } + + /// Generate random strings with mixed charset including accented characters + pub fn generate_random_strings(num_lines: usize, length: usize) -> Vec { + let mut data = Vec::new(); + let charset = + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789äöüÄÖÜßéèêëàâîïôûç"; + let charset_bytes = charset.as_bytes(); + + for i in 0..num_lines { + let mut line = String::new(); + for j in 0..length { + let idx = ((i * length + j) * 17 + 42) % charset_bytes.len(); + line.push(charset_bytes[idx] as char); + } + line.push('\n'); + data.extend_from_slice(line.as_bytes()); + } + data + } + + /// Generate numeric data for benchmarking (simple sequential numbers) + pub fn generate_numbers(count: usize) -> String { + (1..=count) + .map(|n| n.to_string()) + .collect::>() + .join("\n") + } }