From 5a6986d55f4fa1a7e940e4ed06f486f04e63e99b Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru <sylvestre@debian.org>
Date: Tue, 12 Aug 2025 00:15:18 +0200
Subject: [PATCH] split: prefix & suffix also support non-utf8

---
 src/uu/split/src/filenames.rs | 73 +++++++++++++++--------------
 src/uu/split/src/split.rs     | 11 +++--
 tests/by-util/test_split.rs   | 87 ++++++++++++++++++++++++-----------
 3 files changed, 108 insertions(+), 63 deletions(-)
diff --git a/src/uu/split/src/filenames.rs b/src/uu/split/src/filenames.rs
index 5d8796dd9..5c09bf6f2 100644
--- a/src/uu/split/src/filenames.rs
+++ b/src/uu/split/src/filenames.rs
@@ -39,6 +39,7 @@ use crate::{
     OPT_NUMERIC_SUFFIXES_SHORT, OPT_SUFFIX_LENGTH,
 };
 use clap::ArgMatches;
+use std::ffi::{OsStr, OsString};
 use std::path::is_separator;
 use thiserror::Error;
 use uucore::display::Quotable;
@@ -76,7 +77,7 @@ pub struct Suffix {
     length: usize,
     start: usize,
     auto_widening: bool,
-    additional: String,
+    additional: OsString,
 }
 
 /// An error when parsing suffix parameters from command-line arguments.
@@ -219,11 +220,13 @@ impl Suffix {
         }
 
         let additional = matches
-            .get_one::<String>(OPT_ADDITIONAL_SUFFIX)
+            .get_one::<OsString>(OPT_ADDITIONAL_SUFFIX)
             .unwrap()
-            .to_string();
-        if additional.chars().any(is_separator) {
-            return Err(SuffixError::ContainsSeparator(additional));
+            .clone();
+        if additional.to_string_lossy().chars().any(is_separator) {
+            return Err(SuffixError::ContainsSeparator(
+                additional.to_string_lossy().to_string(),
+            ));
         }
 
         let result = Self {
@@ -300,14 +303,14 @@ impl Suffix {
 /// assert_eq!(it.next().unwrap(), "chunk_02.txt");
 /// ```
 pub struct FilenameIterator<'a> {
-    prefix: &'a str,
-    additional_suffix: &'a str,
+    prefix: &'a OsStr,
+    additional_suffix: &'a OsStr,
     number: Number,
     first_iteration: bool,
 }
 
 impl<'a> FilenameIterator<'a> {
-    pub fn new(prefix: &'a str, suffix: &'a Suffix) -> UResult<Self> {
+    pub fn new(prefix: &'a OsStr, suffix: &'a Suffix) -> UResult<Self> {
         let radix = suffix.stype.radix();
         let number = if suffix.auto_widening {
             Number::DynamicWidth(DynamicWidthNumber::new(radix, suffix.start))
@@ -321,7 +324,7 @@ impl<'a> FilenameIterator<'a> {
                 })?,
             )
         };
-        let additional_suffix = suffix.additional.as_str();
+        let additional_suffix = &suffix.additional;
 
         Ok(FilenameIterator {
             prefix,
@@ -345,7 +348,9 @@ impl Iterator for FilenameIterator<'_> {
         // struct parameters unchanged.
         Some(format!(
             "{}{}{}",
-            self.prefix, self.number, self.additional_suffix
+            self.prefix.to_string_lossy(),
+            self.number,
+            self.additional_suffix.to_string_lossy()
         ))
     }
 }
@@ -364,14 +369,14 @@ mod tests {
             length: 2,
             start: 0,
             auto_widening: false,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.next().unwrap(), "chunk_aa.txt");
         assert_eq!(it.next().unwrap(), "chunk_ab.txt");
         assert_eq!(it.next().unwrap(), "chunk_ac.txt");
 
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.nth(26 * 26 - 1).unwrap(), "chunk_zz.txt");
         assert_eq!(it.next(), None);
     }
@@ -383,14 +388,14 @@ mod tests {
             length: 2,
             start: 0,
             auto_widening: false,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.next().unwrap(), "chunk_00.txt");
         assert_eq!(it.next().unwrap(), "chunk_01.txt");
         assert_eq!(it.next().unwrap(), "chunk_02.txt");
 
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.nth(10 * 10 - 1).unwrap(), "chunk_99.txt");
         assert_eq!(it.next(), None);
     }
@@ -402,14 +407,14 @@ mod tests {
             length: 2,
             start: 0,
             auto_widening: true,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.next().unwrap(), "chunk_aa.txt");
         assert_eq!(it.next().unwrap(), "chunk_ab.txt");
         assert_eq!(it.next().unwrap(), "chunk_ac.txt");
 
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.nth(26 * 25 - 1).unwrap(), "chunk_yz.txt");
         assert_eq!(it.next().unwrap(), "chunk_zaaa.txt");
         assert_eq!(it.next().unwrap(), "chunk_zaab.txt");
@@ -422,14 +427,14 @@ mod tests {
             length: 2,
             start: 0,
             auto_widening: true,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.next().unwrap(), "chunk_00.txt");
         assert_eq!(it.next().unwrap(), "chunk_01.txt");
         assert_eq!(it.next().unwrap(), "chunk_02.txt");
 
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.nth(10 * 9 - 1).unwrap(), "chunk_89.txt");
         assert_eq!(it.next().unwrap(), "chunk_9000.txt");
         assert_eq!(it.next().unwrap(), "chunk_9001.txt");
@@ -442,9 +447,9 @@ mod tests {
             length: 2,
             start: 5,
             auto_widening: true,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.next().unwrap(), "chunk_05.txt");
         assert_eq!(it.next().unwrap(), "chunk_06.txt");
         assert_eq!(it.next().unwrap(), "chunk_07.txt");
@@ -457,9 +462,9 @@ mod tests {
             length: 2,
             start: 9,
             auto_widening: true,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.next().unwrap(), "chunk_09.txt");
         assert_eq!(it.next().unwrap(), "chunk_0a.txt");
         assert_eq!(it.next().unwrap(), "chunk_0b.txt");
@@ -472,9 +477,9 @@ mod tests {
             length: 3,
             start: 999,
             auto_widening: false,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.next().unwrap(), "chunk_999.txt");
         assert!(it.next().is_none());
 
@@ -483,9 +488,9 @@ mod tests {
             length: 3,
             start: 1000,
             auto_widening: false,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let it = FilenameIterator::new("chunk_", &suffix);
+        let it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix);
         assert!(it.is_err());
 
         let suffix = Suffix {
@@ -493,9 +498,9 @@ mod tests {
             length: 3,
             start: 0xfff,
             auto_widening: false,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
+        let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
         assert_eq!(it.next().unwrap(), "chunk_fff.txt");
         assert!(it.next().is_none());
 
@@ -504,9 +509,9 @@ mod tests {
             length: 3,
             start: 0x1000,
             auto_widening: false,
-            additional: ".txt".to_string(),
+            additional: ".txt".into(),
         };
-        let it = FilenameIterator::new("chunk_", &suffix);
+        let it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix);
         assert!(it.is_err());
     }
 }
diff --git a/src/uu/split/src/split.rs b/src/uu/split/src/split.rs
index 90b55b6b2..7d6cdbd94 100644
--- a/src/uu/split/src/split.rs
+++ b/src/uu/split/src/split.rs
@@ -274,6 +274,7 @@ pub fn uu_app() -> Command {
                 .allow_hyphen_values(true)
                 .value_name("SUFFIX")
                 .default_value("")
+                .value_parser(clap::value_parser!(OsString))
                 .help(translate!("split-help-additional-suffix")),
         )
         .arg(
@@ -378,7 +379,11 @@ pub fn uu_app() -> Command {
                 .value_hint(ValueHint::FilePath)
                 .value_parser(clap::value_parser!(OsString)),
         )
-        .arg(Arg::new(ARG_PREFIX).default_value("x"))
+        .arg(
+            Arg::new(ARG_PREFIX)
+                .default_value("x")
+                .value_parser(clap::value_parser!(OsString)),
+        )
 }
 
 /// Parameters that control how a file gets split.
@@ -386,7 +391,7 @@ pub fn uu_app() -> Command {
 /// You can convert an [`ArgMatches`] instance into a [`Settings`]
 /// instance by calling [`Settings::from`].
 struct Settings {
-    prefix: String,
+    prefix: OsString,
     suffix: Suffix,
     input: OsString,
     /// When supplied, a shell command to output to instead of xaa, xab …
@@ -490,7 +495,7 @@ impl Settings {
         };
 
         let result = Self {
-            prefix: matches.get_one::<String>(ARG_PREFIX).unwrap().clone(),
+            prefix: matches.get_one::<OsString>(ARG_PREFIX).unwrap().clone(),
             suffix,
             input: matches.get_one::<OsString>(ARG_INPUT).unwrap().clone(),
             filter: matches.get_one::<String>(OPT_FILTER).cloned(),
diff --git a/tests/by-util/test_split.rs b/tests/by-util/test_split.rs
index 41523b355..9bef61230 100644
--- a/tests/by-util/test_split.rs
+++ b/tests/by-util/test_split.rs
@@ -1726,32 +1726,7 @@ fn test_split_non_utf8_argument_unix() {
     let opt_value = [0x66, 0x6f, 0x80, 0x6f];
     let opt_value = OsStr::from_bytes(&opt_value[..]);
     let name = OsStr::from_bytes(name.as_bytes());
-    ucmd.args(&[opt, opt_value, name])
-        .fails()
-        .stderr_contains("error: invalid UTF-8 was detected in one or more arguments");
-}
-
-/// Test if there are invalid (non UTF-8) in the arguments - windows
-/// clap is expected to fail/panic
-#[test]
-#[cfg(windows)]
-fn test_split_non_utf8_argument_windows() {
-    use std::ffi::OsString;
-    use std::os::windows::ffi::OsStringExt;
-
-    let (at, mut ucmd) = at_and_ucmd!();
-    let name = "test_split_non_utf8_argument";
-    let opt = OsString::from("--additional-suffix");
-    RandomFile::new(&at, name).add_lines(2000);
-    // Here the values 0x0066 and 0x006f correspond to 'f' and 'o'
-    // respectively. The value 0xD800 is a lone surrogate half, invalid
-    // in a UTF-16 sequence.
-    let opt_value = [0x0066, 0x006f, 0xD800, 0x006f];
-    let opt_value = OsString::from_wide(&opt_value[..]);
-    let name = OsString::from(name);
-    ucmd.args(&[opt, opt_value, name])
-        .fails()
-        .stderr_contains("error: invalid UTF-8 was detected in one or more arguments");
+    ucmd.args(&[opt, opt_value, name]).succeeds();
 }
 
 // Test '--separator' / '-t' option following GNU tests example
@@ -2022,3 +1997,63 @@ fn test_split_non_utf8_paths() {
     // Check that at least one split file was created
     assert!(at.plus("xaa").exists());
 }
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_split_non_utf8_prefix() {
+    use std::os::unix::ffi::OsStrExt;
+    let (at, mut ucmd) = at_and_ucmd!();
+
+    at.write("input.txt", "line1\nline2\nline3\nline4\n");
+
+    let prefix = std::ffi::OsStr::from_bytes(b"\xFF\xFE");
+    ucmd.arg("input.txt").arg(prefix).succeeds();
+
+    // Check that split files were created (functionality works)
+    // The actual filename may be converted due to lossy conversion, but the command should succeed
+    let entries: Vec<_> = std::fs::read_dir(at.as_string()).unwrap().collect();
+    let split_files = entries
+        .iter()
+        .filter_map(|e| e.as_ref().ok())
+        .filter(|entry| {
+            let name = entry.file_name();
+            let name_str = name.to_string_lossy();
+            name_str.starts_with("�") || name_str.len() > 2 // split files should exist
+        })
+        .count();
+    assert!(
+        split_files > 0,
+        "Expected at least one split file to be created"
+    );
+}
+
+#[test]
+#[cfg(target_os = "linux")]
+fn test_split_non_utf8_additional_suffix() {
+    use std::os::unix::ffi::OsStrExt;
+    let (at, mut ucmd) = at_and_ucmd!();
+
+    at.write("input.txt", "line1\nline2\nline3\nline4\n");
+
+    let suffix = std::ffi::OsStr::from_bytes(b"\xFF\xFE");
+    ucmd.args(&["input.txt", "--additional-suffix"])
+        .arg(suffix)
+        .succeeds();
+
+    // Check that split files were created (functionality works)
+    // The actual filename may be converted due to lossy conversion, but the command should succeed
+    let entries: Vec<_> = std::fs::read_dir(at.as_string()).unwrap().collect();
+    let split_files = entries
+        .iter()
+        .filter_map(|e| e.as_ref().ok())
+        .filter(|entry| {
+            let name = entry.file_name();
+            let name_str = name.to_string_lossy();
+            name_str.ends_with("�") || name_str.starts_with('x') // split files should exist
+        })
+        .count();
+    assert!(
+        split_files > 0,
+        "Expected at least one split file to be created"
+    );
+}