Merge pull request #8735 from sylvestre/sort-perf-2
Some checks are pending
CICD / Style/cargo-deny (push) Waiting to run
CICD / Build (push) Blocked by required conditions
CICD / Style/deps (push) Waiting to run
CICD / Documentation/warnings (push) Waiting to run
CICD / MinRustV (push) Waiting to run
CICD / Test all features separately (push) Blocked by required conditions
CICD / Dependencies (push) Waiting to run
CICD / Build/Makefile (push) Blocked by required conditions
CICD / Build/stable (push) Blocked by required conditions
CICD / Build/nightly (push) Blocked by required conditions
CICD / Binary sizes (push) Blocked by required conditions
CICD / Tests/BusyBox test suite (push) Blocked by required conditions
CICD / Tests/Toybox test suite (push) Blocked by required conditions
CICD / Code Coverage (push) Waiting to run
CICD / Separate Builds (push) Waiting to run
CICD / Build/SELinux (push) Blocked by required conditions
CICD / Run benchmarks (CodSpeed) (push) Blocked by required conditions
GnuTests / Aggregate GNU test results (push) Blocked by required conditions
GnuTests / Run GNU tests (native) (push) Waiting to run
GnuTests / Run GNU tests (SELinux) (push) Waiting to run
Android / Test builds (push) Waiting to run
Code Quality / Style/format (push) Waiting to run
Code Quality / Style/lint (push) Waiting to run
Code Quality / Style/spelling (push) Waiting to run
Code Quality / Style/toml (push) Waiting to run
Code Quality / Style/Python (push) Waiting to run
Code Quality / Pre-commit hooks (push) Waiting to run
Devcontainer / Verify devcontainer (push) Waiting to run
FreeBSD / Style and Lint (push) Waiting to run
FreeBSD / Tests (push) Waiting to run
WSL2 / Test (push) Waiting to run

sort: add benchmark
This commit is contained in:
Sylvestre Ledru 2025-09-25 21:45:25 +02:00 committed by GitHub
commit ea5c8158d4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 650 additions and 95 deletions

2
.vscode/cSpell.json vendored
View file

@ -32,6 +32,8 @@
".devcontainer/**",
"util/gnu-patches/**",
"docs/src/release-notes/**",
"src/uu/*/benches/*.rs",
"src/uucore/src/lib/features/benchmark.rs",
],
"enableGlobDot": true,

1
Cargo.lock generated
View file

@ -3930,6 +3930,7 @@ dependencies = [
"bigdecimal",
"binary-heap-plus",
"clap",
"codspeed-divan-compat",
"compare",
"ctrlc",
"fluent",

View file

@ -4,45 +4,32 @@
// file that was distributed with this source code.
use divan::{Bencher, black_box};
use tempfile::TempDir;
use uu_numfmt::uumain;
use uucore::benchmark::{create_test_file, run_util_function};
/// Generate numeric data for benchmarking
fn generate_numbers(count: usize) -> String {
(1..=count)
.map(|n| n.to_string())
.collect::<Vec<_>>()
.join("\n")
}
/// Setup benchmark environment with test data
fn setup_benchmark(data: String) -> (TempDir, String) {
let temp_dir = tempfile::tempdir().unwrap();
let file_path = create_test_file(data.as_bytes(), temp_dir.path());
let file_path_str = file_path.to_str().unwrap().to_string();
(temp_dir, file_path_str)
}
use uucore::benchmark::{run_util_function, setup_test_file, text_data};
/// Benchmark SI formatting with different number counts
#[divan::bench(args = [1_000_000])]
fn numfmt_to_si(bencher: Bencher, count: usize) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &["--to=si", &file_path_str]));
black_box(run_util_function(uumain, &["--to=si", file_path_str]));
});
}
/// Benchmark SI formatting with precision format
#[divan::bench(args = [1_000_000])]
fn numfmt_to_si_precision(bencher: Bencher, count: usize) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(
uumain,
&["--to=si", "--format=%.6f", &file_path_str],
&["--to=si", "--format=%.6f", file_path_str],
));
});
}
@ -50,10 +37,12 @@ fn numfmt_to_si_precision(bencher: Bencher, count: usize) {
/// Benchmark IEC (binary) formatting
#[divan::bench(args = [1_000_000])]
fn numfmt_to_iec(bencher: Bencher, count: usize) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &["--to=iec", &file_path_str]));
black_box(run_util_function(uumain, &["--to=iec", file_path_str]));
});
}
@ -65,10 +54,11 @@ fn numfmt_from_si(bencher: Bencher, count: usize) {
.map(|n| format!("{:.1}K", n as f64 / 1000.0))
.collect::<Vec<_>>()
.join("\n");
let (_temp_dir, file_path_str) = setup_benchmark(data);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &["--from=si", &file_path_str]));
black_box(run_util_function(uumain, &["--from=si", file_path_str]));
});
}
@ -80,23 +70,26 @@ fn numfmt_large_numbers_si(bencher: Bencher, count: usize) {
.map(|n| (n * 1_000_000).to_string())
.collect::<Vec<_>>()
.join("\n");
let (_temp_dir, file_path_str) = setup_benchmark(data);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &["--to=si", &file_path_str]));
black_box(run_util_function(uumain, &["--to=si", file_path_str]));
});
}
/// Benchmark different padding widths
#[divan::bench(args = [(1_000_000, 5), (1_000_000, 50)])]
fn numfmt_padding(bencher: Bencher, (count, padding): (usize, usize)) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
let padding_arg = format!("--padding={padding}");
bencher.bench(|| {
black_box(run_util_function(
uumain,
&["--to=si", &padding_arg, &file_path_str],
&["--to=si", &padding_arg, file_path_str],
));
});
}
@ -104,13 +97,15 @@ fn numfmt_padding(bencher: Bencher, (count, padding): (usize, usize)) {
/// Benchmark round modes with SI formatting
#[divan::bench(args = [("up", 100_000), ("down", 1_000_000), ("towards-zero", 1_000_000)])]
fn numfmt_round_modes(bencher: Bencher, (round_mode, count): (&str, usize)) {
let (_temp_dir, file_path_str) = setup_benchmark(generate_numbers(count));
let data = text_data::generate_numbers(count);
let file_path = setup_test_file(data.as_bytes());
let file_path_str = file_path.to_str().unwrap();
let round_arg = format!("--round={round_mode}");
bencher.bench(|| {
black_box(run_util_function(
uumain,
&["--to=si", &round_arg, &file_path_str],
&["--to=si", &round_arg, file_path_str],
));
});
}

View file

@ -40,6 +40,21 @@ fluent = { workspace = true }
[target.'cfg(target_os = "linux")'.dependencies]
nix = { workspace = true }
[dev-dependencies]
divan = { workspace = true }
tempfile = { workspace = true }
uucore = { workspace = true, features = [
"benchmark",
"fs",
"parser",
"version-cmp",
"i18n-collator",
] }
[[bin]]
name = "sort"
path = "src/main.rs"
[[bench]]
name = "sort_bench"
harness = false

View file

@ -0,0 +1,158 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use divan::{Bencher, black_box};
use uu_sort::uumain;
use uucore::benchmark::{run_util_function, setup_test_file, text_data};
/// Benchmark sorting ASCII-only data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_ascii_only(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_ascii_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| black_box(run_util_function(uumain, &[path.to_str().unwrap()])));
}
/// Benchmark sorting accented/non-ASCII data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_accented_data(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_accented_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| black_box(run_util_function(uumain, &[path.to_str().unwrap()])));
}
/// Benchmark sorting mixed ASCII/non-ASCII data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_mixed_data(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_mixed_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| black_box(run_util_function(uumain, &[path.to_str().unwrap()])));
}
/// Benchmark case-sensitive sorting with mixed-case data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_case_sensitive(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_case_sensitive_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| black_box(run_util_function(uumain, &[path.to_str().unwrap()])));
}
/// Benchmark case-insensitive sorting (`-f`, fold case) over mixed-case data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_case_insensitive(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_case_sensitive_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-f", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark dictionary-order sorting (`-d`: only blanks and alphanumerics compared).
#[divan::bench(args = [100_000, 500_000])]
fn sort_dictionary_order(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_mixed_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-d", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark numeric sorting (`-n`) with mixed data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_numeric(bencher: Bencher, num_lines: usize) {
    // Pseudo-random numeric values embedded in text-prefixed lines.
    let mut input = Vec::new();
    for i in 0..num_lines {
        let line = format!("value_{}\n", (i * 13) % 10000);
        input.extend_from_slice(line.as_bytes());
    }
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-n", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark reverse sorting (`-r`) with locale-sensitive (accented) data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_reverse_locale(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_accented_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-r", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark sorting by a specific key field (`-k 2`).
#[divan::bench(args = [100_000, 500_000])]
fn sort_key_field(bencher: Bencher, num_lines: usize) {
    const WORDS: [&str; 5] = ["café", "naïve", "apple", "über", "banana"];
    // Three tab-separated fields per line: number, word, number.
    let mut input = Vec::new();
    for i in 0..num_lines {
        let line = format!("{}\t{}\t{}\n", i % 100, WORDS[i % WORDS.len()], (i * 7) % 100);
        input.extend_from_slice(line.as_bytes());
    }
    let path = setup_test_file(&input);
    bencher.bench(|| {
        // Sort by the second field.
        let args = ["-k", "2", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
/// Benchmark unique sorting (`-u`) with locale-sensitive (accented) data.
#[divan::bench(args = [100_000, 500_000])]
fn sort_unique_locale(bencher: Bencher, num_lines: usize) {
    let input = text_data::generate_accented_data(num_lines);
    let path = setup_test_file(&input);
    bencher.bench(|| {
        let args = ["-u", path.to_str().unwrap()];
        black_box(run_util_function(uumain, &args));
    });
}
// Divan entry point: discovers and runs every #[divan::bench] function above.
fn main() {
divan::main();
}

View file

@ -0,0 +1,166 @@
// This file is part of the uutils coreutils package.
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use divan::{Bencher, black_box};
use std::env;
use uu_sort::uumain;
use uucore::benchmark::{run_util_function, setup_test_file, text_data};
/// Benchmark ASCII-only data sorting with C locale (byte comparison).
#[divan::bench]
fn sort_ascii_c_locale(bencher: Bencher) {
    let data = text_data::generate_ascii_data_simple(100_000);
    let file_path = setup_test_file(&data);
    // Set the locale once during setup rather than inside the timed closure:
    // per-iteration env mutation pollutes the measurement, and `set_var`
    // must not race with concurrent environment reads.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "C");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark ASCII-only data sorting with UTF-8 locale.
#[divan::bench]
fn sort_ascii_utf8_locale(bencher: Bencher) {
    let data = text_data::generate_ascii_data_simple(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark mixed ASCII/Unicode data with C locale.
#[divan::bench]
fn sort_mixed_c_locale(bencher: Bencher) {
    let data = text_data::generate_mixed_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "C");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark mixed ASCII/Unicode data with UTF-8 locale.
#[divan::bench]
fn sort_mixed_utf8_locale(bencher: Bencher) {
    let data = text_data::generate_mixed_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark German locale-specific data with C locale.
#[divan::bench]
fn sort_german_c_locale(bencher: Bencher) {
    let data = text_data::generate_german_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "C");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark German locale-specific data with a German locale.
#[divan::bench]
fn sort_german_locale(bencher: Bencher) {
    let data = text_data::generate_german_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "de_DE.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark random strings of a fixed length under a UTF-8 locale.
#[divan::bench]
fn sort_random_strings(bencher: Bencher) {
    let data = text_data::generate_random_strings(10_000, 50);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(uumain, &[file_path.to_str().unwrap()]));
    });
}
/// Benchmark numeric sorting performance (`-n`) on descending integers.
#[divan::bench]
fn sort_numeric(bencher: Bencher) {
    // 10_000 integers in strictly descending order.
    let mut data = Vec::new();
    for i in 0..10_000 {
        let line = format!("{}\n", 10_000 - i);
        data.extend_from_slice(line.as_bytes());
    }
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(
            uumain,
            &["-n", file_path.to_str().unwrap()],
        ));
    });
}
/// Benchmark reverse sorting (`-r`) on mixed-locale data.
#[divan::bench]
fn sort_reverse_mixed(bencher: Bencher) {
    let data = text_data::generate_mixed_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(
            uumain,
            &["-r", file_path.to_str().unwrap()],
        ));
    });
}
/// Benchmark unique sorting (`-u`) on mixed-locale data.
#[divan::bench]
fn sort_unique_mixed(bencher: Bencher) {
    let data = text_data::generate_mixed_locale_data(10_000);
    let file_path = setup_test_file(&data);
    // Locale is configured once in setup, not per timed iteration.
    // SAFETY: executed during single-threaded benchmark setup.
    unsafe {
        env::set_var("LC_ALL", "en_US.UTF-8");
    }
    bencher.bench(|| {
        black_box(run_util_function(
            uumain,
            &["-u", file_path.to_str().unwrap()],
        ));
    });
}
// Divan entry point: discovers and runs every #[divan::bench] function above.
fn main() {
divan::main();
}

View file

@ -5,9 +5,9 @@
use divan::{Bencher, black_box};
use uu_tsort::uumain;
use uucore::benchmark::{create_test_file, run_util_function};
use uucore::benchmark::{run_util_function, setup_test_file};
/// Generate topological sort test data with different characteristics
/// Generate topological sort test data - linear chain
fn generate_linear_chain(num_nodes: usize) -> Vec<u8> {
let mut data = Vec::new();
@ -18,7 +18,7 @@ fn generate_linear_chain(num_nodes: usize) -> Vec<u8> {
data
}
/// Generate a DAG with more complex dependencies
/// Generate a DAG with tree-like structure
fn generate_tree_dag(depth: usize, branching_factor: usize) -> Vec<u8> {
let mut data = Vec::new();
let mut node_id = 0;
@ -116,64 +116,8 @@ fn generate_wide_dag(num_nodes: usize) -> Vec<u8> {
data
}
/// Benchmark linear chain graphs of different sizes
/// This tests the performance improvements mentioned in PR #8694
#[divan::bench(args = [1_000, 10_000, 100_000, 1_000_000])]
fn tsort_linear_chain(bencher: Bencher, num_nodes: usize) {
let temp_dir = tempfile::tempdir().unwrap();
let data = generate_linear_chain(num_nodes);
let file_path = create_test_file(&data, temp_dir.path());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &[file_path_str]));
});
}
/// Benchmark tree-like DAG structures
#[divan::bench(args = [(4, 3), (5, 3), (6, 2), (7, 2)])]
fn tsort_tree_dag(bencher: Bencher, (depth, branching): (usize, usize)) {
let temp_dir = tempfile::tempdir().unwrap();
let data = generate_tree_dag(depth, branching);
let file_path = create_test_file(&data, temp_dir.path());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &[file_path_str]));
});
}
/// Benchmark complex DAG with cross-dependencies
#[divan::bench(args = [1_000, 5_000, 10_000, 50_000])]
fn tsort_complex_dag(bencher: Bencher, num_nodes: usize) {
let temp_dir = tempfile::tempdir().unwrap();
let data = generate_complex_dag(num_nodes);
let file_path = create_test_file(&data, temp_dir.path());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &[file_path_str]));
});
}
/// Benchmark wide DAG with many parallel chains
/// This should stress the hashmap optimizations from PR #8694
#[divan::bench(args = [10_000, 50_000, 100_000])]
fn tsort_wide_dag(bencher: Bencher, num_nodes: usize) {
let temp_dir = tempfile::tempdir().unwrap();
let data = generate_wide_dag(num_nodes);
let file_path = create_test_file(&data, temp_dir.path());
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {
black_box(run_util_function(uumain, &[file_path_str]));
});
}
/// Benchmark input parsing vs computation by using files with different edge densities
#[divan::bench(args = [10_000, 50_000])]
fn tsort_input_parsing_heavy(bencher: Bencher, num_edges: usize) {
let temp_dir = tempfile::tempdir().unwrap();
/// Generate DAG data for input parsing stress tests
fn generate_input_parsing_heavy(num_edges: usize) -> Vec<u8> {
// Create a scenario with many edges but relatively few unique nodes
// This stresses the input parsing and graph construction optimizations
let num_unique_nodes = (num_edges as f64).sqrt() as usize;
@ -187,7 +131,64 @@ fn tsort_input_parsing_heavy(bencher: Bencher, num_edges: usize) {
}
}
let file_path = create_test_file(&data, temp_dir.path());
data
}
/// Benchmark linear chain graphs of different sizes.
/// This tests the performance improvements mentioned in PR #8694.
#[divan::bench(args = [1_000, 10_000, 100_000, 1_000_000])]
fn tsort_linear_chain(bencher: Bencher, num_nodes: usize) {
    let input = generate_linear_chain(num_nodes);
    let path = setup_test_file(&input);
    let path_str = path.to_str().unwrap();
    bencher.bench(|| black_box(run_util_function(uumain, &[path_str])));
}
/// Benchmark tree-like DAG structures.
#[divan::bench(args = [(4, 3), (5, 3), (6, 2), (7, 2)])]
fn tsort_tree_dag(bencher: Bencher, (depth, branching): (usize, usize)) {
    let input = generate_tree_dag(depth, branching);
    let path = setup_test_file(&input);
    let path_str = path.to_str().unwrap();
    bencher.bench(|| black_box(run_util_function(uumain, &[path_str])));
}
/// Benchmark complex DAG with cross-dependencies.
#[divan::bench(args = [1_000, 5_000, 10_000, 50_000])]
fn tsort_complex_dag(bencher: Bencher, num_nodes: usize) {
    let input = generate_complex_dag(num_nodes);
    let path = setup_test_file(&input);
    let path_str = path.to_str().unwrap();
    bencher.bench(|| black_box(run_util_function(uumain, &[path_str])));
}
/// Benchmark wide DAG with many parallel chains.
/// This should stress the hashmap optimizations from PR #8694.
#[divan::bench(args = [10_000, 50_000, 100_000])]
fn tsort_wide_dag(bencher: Bencher, num_nodes: usize) {
    let input = generate_wide_dag(num_nodes);
    let path = setup_test_file(&input);
    let path_str = path.to_str().unwrap();
    bencher.bench(|| black_box(run_util_function(uumain, &[path_str])));
}
/// Benchmark input parsing vs computation by using files with different edge densities
#[divan::bench(args = [10_000, 50_000])]
fn tsort_input_parsing_heavy(bencher: Bencher, num_edges: usize) {
let data = generate_input_parsing_heavy(num_edges);
let file_path = setup_test_file(&data);
let file_path_str = file_path.to_str().unwrap();
bencher.bench(|| {

View file

@ -49,6 +49,7 @@ os_display = "0.1.3"
# Benchmark dependencies (optional)
divan = { workspace = true, optional = true }
tempfile = { workspace = true, optional = true }
digest = { workspace = true, optional = true }
hex = { workspace = true, optional = true }
@ -173,4 +174,4 @@ wide = []
tty = []
time = ["jiff"]
uptime = ["chrono", "libc", "windows-sys", "utmpx", "utmp-classic"]
benchmark = ["divan"]
benchmark = ["divan", "tempfile"]

View file

@ -32,6 +32,16 @@ where
util_func(os_args.into_iter())
}
/// Helper function to set up a temporary test file and leak the temporary
/// directory so it persists for the duration of the benchmark
///
/// Returns the path of a file containing `data`, created inside a fresh
/// temporary directory that is intentionally never dropped — dropping the
/// `TempDir` guard would delete the directory while benchmark iterations
/// may still be reading the file.
pub fn setup_test_file(data: &[u8]) -> PathBuf {
let temp_dir = tempfile::tempdir().unwrap();
let file_path = create_test_file(data, temp_dir.path());
// Keep temp_dir alive by leaking it - the OS will clean it up
std::mem::forget(temp_dir);
file_path
}
/// Generate test data with different characteristics for text processing utilities
pub mod text_data {
/// Generate test data with a specific size in MB and average line length
@ -85,4 +95,210 @@ pub mod text_data {
data
}
/// Helper function to generate test data from a list of words.
///
/// Produces `num_lines` lines of the form `{word}_{NNN}\n`, cycling through
/// `words` and suffixing each line with the line index modulo 1000,
/// zero-padded to three digits.
pub fn generate_data_from_words(words: &[&str], num_lines: usize) -> Vec<u8> {
    (0..num_lines)
        .flat_map(|i| {
            let word = words[i % words.len()];
            format!("{word}_{:03}\n", i % 1000).into_bytes()
        })
        .collect()
}
/// Helper function to generate test data from a list of words without a
/// number suffix: `num_lines` lines, each a word from `words` (cycled).
pub fn generate_data_from_words_simple(words: &[&str], num_lines: usize) -> Vec<u8> {
    let mut out = Vec::new();
    for word in (0..num_lines).map(|i| words[i % words.len()]) {
        out.extend_from_slice(word.as_bytes());
        out.push(b'\n');
    }
    out
}
/// Helper function to generate test data from a list of words with a
/// running counter: `num_lines` lines of the form `{word}{IIII}\n`, where
/// the counter is the line index zero-padded to four digits.
pub fn generate_data_from_words_with_counter(words: &[&str], num_lines: usize) -> Vec<u8> {
    (0..num_lines)
        .flat_map(|i| format!("{}{i:04}\n", words[i % words.len()]).into_bytes())
        .collect()
}
/// Generate test data with ASCII-only text: 25 fruit/food words cycled with
/// a numeric suffix per line (see [`generate_data_from_words`]).
pub fn generate_ascii_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 25] = [
        "apple", "banana", "cherry", "date", "elderberry", "fig", "grape",
        "honeydew", "kiwi", "lemon", "mango", "nectarine", "orange", "papaya",
        "quince", "raspberry", "strawberry", "tangerine", "ugli", "vanilla",
        "watermelon", "xigua", "yellow", "zucchini", "avocado",
    ];
    generate_data_from_words(&WORDS, num_lines)
}
/// Generate simple ASCII data with line numbers in descending order:
/// `line_NNNNNN\n` from `num_lines - 1` down to `0` (six-digit zero-padded).
pub fn generate_ascii_data_simple(num_lines: usize) -> Vec<u8> {
    (0..num_lines)
        .rev()
        .flat_map(|n| format!("line_{n:06}\n").into_bytes())
        .collect()
}
/// Generate test data with accented characters that require locale-aware
/// sorting: 30 French/German/Spanish words cycled with a numeric suffix.
pub fn generate_accented_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 30] = [
        // French words with accents
        "café", "naïve", "résumé", "fiancé", "crème", "déjà", "façade",
        "château", "élève", "côte",
        // German words with umlauts
        "über", "Müller", "schön", "Köln", "Düsseldorf", "Österreich",
        "Zürich", "Mädchen", "Bär", "größer",
        // Spanish words with tildes and accents
        "niño", "señor", "año", "mañana", "español", "corazón", "María",
        "José", "más", "también",
    ];
    generate_data_from_words(&WORDS, num_lines)
}
/// Generate test data with mixed ASCII and non-ASCII characters: 25
/// interleaved plain/accented words cycled with a numeric suffix.
pub fn generate_mixed_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 25] = [
        // Mix of ASCII and accented words
        "apple", "café", "banana", "naïve", "cherry", "résumé", "date",
        "fiancé", "elderberry", "crème", "über", "grape", "Müller",
        "honeydew", "schön", "niño", "kiwi", "señor", "lemon", "año",
        "mango", "María", "orange", "José", "papaya",
    ];
    generate_data_from_words(&WORDS, num_lines)
}
/// Generate mixed-locale data with a per-line counter suffix
/// (see [`generate_data_from_words_with_counter`]).
pub fn generate_mixed_locale_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 16] = [
        "zebra", "äpfel", "banana", "öl", "cat", "über", "dog", "zürich",
        "elephant", "café", "fish", "naïve", "grape", "résumé", "house",
        "piñata",
    ];
    generate_data_from_words_with_counter(&WORDS, num_lines)
}
/// Generate German locale-specific data (umlauts, ß, and their plain
/// counterparts) with a per-line counter suffix.
pub fn generate_german_locale_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 24] = [
        "Ärger", "Öffnung", "Über", "Zucker", "Bär", "Föhn", "Größe", "Höhe",
        "Käse", "Löwe", "Mädchen", "Nüsse", "Röntgen", "Schäfer", "Tür",
        "Würfel", "ä", "ö", "ü", "ß", "a", "o", "u", "s",
    ];
    generate_data_from_words_with_counter(&WORDS, num_lines)
}
/// Generate test data with uppercase/lowercase variations of the same
/// words (no suffix), for case-sensitivity benchmarks.
pub fn generate_case_sensitive_data(num_lines: usize) -> Vec<u8> {
    const WORDS: [&str; 12] = [
        "apple", "Apple", "APPLE", "banana", "Banana", "BANANA", "café",
        "Café", "CAFÉ", "über", "Über", "ÜBER",
    ];
    generate_data_from_words_simple(&WORDS, num_lines)
}
/// Generate deterministic pseudo-random strings with a mixed charset that
/// includes accented characters.
///
/// NOTE(review): the charset is indexed *byte-wise*, so the multi-byte UTF-8
/// characters contribute their individual bytes, each re-encoded via
/// `as char` (Latin-1 interpretation). Output is deterministic; presumably
/// this byte-level mixing is intentional for benchmark variety — confirm.
pub fn generate_random_strings(num_lines: usize, length: usize) -> Vec<u8> {
    const CHARSET: &str =
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789äöüÄÖÜßéèêëàâîïôûç";
    let pool = CHARSET.as_bytes();
    let mut out = Vec::new();
    for i in 0..num_lines {
        let mut line = String::new();
        for j in 0..length {
            // Simple linear-congruential-style index, fully deterministic.
            let idx = ((i * length + j) * 17 + 42) % pool.len();
            line.push(pool[idx] as char);
        }
        line.push('\n');
        out.extend_from_slice(line.as_bytes());
    }
    out
}
/// Generate numeric data for benchmarking: the integers `1..=count` as
/// decimal strings joined by newlines (no trailing newline).
pub fn generate_numbers(count: usize) -> String {
    let mut out = String::new();
    for n in 1..=count {
        if n > 1 {
            out.push('\n');
        }
        out.push_str(&n.to_string());
    }
    out
}
}