split: prefix & suffix also support non-utf8

This commit is contained in:
Sylvestre Ledru 2025-08-12 00:15:18 +02:00
parent 39fbdeecb2
commit 5a6986d55f
3 changed files with 108 additions and 63 deletions

View file

@ -39,6 +39,7 @@ use crate::{
OPT_NUMERIC_SUFFIXES_SHORT, OPT_SUFFIX_LENGTH,
};
use clap::ArgMatches;
use std::ffi::{OsStr, OsString};
use std::path::is_separator;
use thiserror::Error;
use uucore::display::Quotable;
@ -76,7 +77,7 @@ pub struct Suffix {
length: usize,
start: usize,
auto_widening: bool,
additional: String,
additional: OsString,
}
/// An error when parsing suffix parameters from command-line arguments.
@ -219,11 +220,13 @@ impl Suffix {
}
let additional = matches
.get_one::<String>(OPT_ADDITIONAL_SUFFIX)
.get_one::<OsString>(OPT_ADDITIONAL_SUFFIX)
.unwrap()
.to_string();
if additional.chars().any(is_separator) {
return Err(SuffixError::ContainsSeparator(additional));
.clone();
if additional.to_string_lossy().chars().any(is_separator) {
return Err(SuffixError::ContainsSeparator(
additional.to_string_lossy().to_string(),
));
}
let result = Self {
@ -300,14 +303,14 @@ impl Suffix {
/// assert_eq!(it.next().unwrap(), "chunk_02.txt");
/// ```
pub struct FilenameIterator<'a> {
prefix: &'a str,
additional_suffix: &'a str,
prefix: &'a OsStr,
additional_suffix: &'a OsStr,
number: Number,
first_iteration: bool,
}
impl<'a> FilenameIterator<'a> {
pub fn new(prefix: &'a str, suffix: &'a Suffix) -> UResult<Self> {
pub fn new(prefix: &'a OsStr, suffix: &'a Suffix) -> UResult<Self> {
let radix = suffix.stype.radix();
let number = if suffix.auto_widening {
Number::DynamicWidth(DynamicWidthNumber::new(radix, suffix.start))
@ -321,7 +324,7 @@ impl<'a> FilenameIterator<'a> {
})?,
)
};
let additional_suffix = suffix.additional.as_str();
let additional_suffix = &suffix.additional;
Ok(FilenameIterator {
prefix,
@ -345,7 +348,9 @@ impl Iterator for FilenameIterator<'_> {
// struct parameters unchanged.
Some(format!(
"{}{}{}",
self.prefix, self.number, self.additional_suffix
self.prefix.to_string_lossy(),
self.number,
self.additional_suffix.to_string_lossy()
))
}
}
@ -364,14 +369,14 @@ mod tests {
length: 2,
start: 0,
auto_widening: false,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.next().unwrap(), "chunk_aa.txt");
assert_eq!(it.next().unwrap(), "chunk_ab.txt");
assert_eq!(it.next().unwrap(), "chunk_ac.txt");
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.nth(26 * 26 - 1).unwrap(), "chunk_zz.txt");
assert_eq!(it.next(), None);
}
@ -383,14 +388,14 @@ mod tests {
length: 2,
start: 0,
auto_widening: false,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.next().unwrap(), "chunk_00.txt");
assert_eq!(it.next().unwrap(), "chunk_01.txt");
assert_eq!(it.next().unwrap(), "chunk_02.txt");
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.nth(10 * 10 - 1).unwrap(), "chunk_99.txt");
assert_eq!(it.next(), None);
}
@ -402,14 +407,14 @@ mod tests {
length: 2,
start: 0,
auto_widening: true,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.next().unwrap(), "chunk_aa.txt");
assert_eq!(it.next().unwrap(), "chunk_ab.txt");
assert_eq!(it.next().unwrap(), "chunk_ac.txt");
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.nth(26 * 25 - 1).unwrap(), "chunk_yz.txt");
assert_eq!(it.next().unwrap(), "chunk_zaaa.txt");
assert_eq!(it.next().unwrap(), "chunk_zaab.txt");
@ -422,14 +427,14 @@ mod tests {
length: 2,
start: 0,
auto_widening: true,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.next().unwrap(), "chunk_00.txt");
assert_eq!(it.next().unwrap(), "chunk_01.txt");
assert_eq!(it.next().unwrap(), "chunk_02.txt");
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.nth(10 * 9 - 1).unwrap(), "chunk_89.txt");
assert_eq!(it.next().unwrap(), "chunk_9000.txt");
assert_eq!(it.next().unwrap(), "chunk_9001.txt");
@ -442,9 +447,9 @@ mod tests {
length: 2,
start: 5,
auto_widening: true,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.next().unwrap(), "chunk_05.txt");
assert_eq!(it.next().unwrap(), "chunk_06.txt");
assert_eq!(it.next().unwrap(), "chunk_07.txt");
@ -457,9 +462,9 @@ mod tests {
length: 2,
start: 9,
auto_widening: true,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.next().unwrap(), "chunk_09.txt");
assert_eq!(it.next().unwrap(), "chunk_0a.txt");
assert_eq!(it.next().unwrap(), "chunk_0b.txt");
@ -472,9 +477,9 @@ mod tests {
length: 3,
start: 999,
auto_widening: false,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.next().unwrap(), "chunk_999.txt");
assert!(it.next().is_none());
@ -483,9 +488,9 @@ mod tests {
length: 3,
start: 1000,
auto_widening: false,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let it = FilenameIterator::new("chunk_", &suffix);
let it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix);
assert!(it.is_err());
let suffix = Suffix {
@ -493,9 +498,9 @@ mod tests {
length: 3,
start: 0xfff,
auto_widening: false,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let mut it = FilenameIterator::new("chunk_", &suffix).unwrap();
let mut it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix).unwrap();
assert_eq!(it.next().unwrap(), "chunk_fff.txt");
assert!(it.next().is_none());
@ -504,9 +509,9 @@ mod tests {
length: 3,
start: 0x1000,
auto_widening: false,
additional: ".txt".to_string(),
additional: ".txt".into(),
};
let it = FilenameIterator::new("chunk_", &suffix);
let it = FilenameIterator::new(std::ffi::OsStr::new("chunk_"), &suffix);
assert!(it.is_err());
}
}

View file

@ -274,6 +274,7 @@ pub fn uu_app() -> Command {
.allow_hyphen_values(true)
.value_name("SUFFIX")
.default_value("")
.value_parser(clap::value_parser!(OsString))
.help(translate!("split-help-additional-suffix")),
)
.arg(
@ -378,7 +379,11 @@ pub fn uu_app() -> Command {
.value_hint(ValueHint::FilePath)
.value_parser(clap::value_parser!(OsString)),
)
.arg(Arg::new(ARG_PREFIX).default_value("x"))
.arg(
Arg::new(ARG_PREFIX)
.default_value("x")
.value_parser(clap::value_parser!(OsString)),
)
}
/// Parameters that control how a file gets split.
@ -386,7 +391,7 @@ pub fn uu_app() -> Command {
/// You can convert an [`ArgMatches`] instance into a [`Settings`]
/// instance by calling [`Settings::from`].
struct Settings {
prefix: String,
prefix: OsString,
suffix: Suffix,
input: OsString,
/// When supplied, a shell command to output to instead of xaa, xab …
@ -490,7 +495,7 @@ impl Settings {
};
let result = Self {
prefix: matches.get_one::<String>(ARG_PREFIX).unwrap().clone(),
prefix: matches.get_one::<OsString>(ARG_PREFIX).unwrap().clone(),
suffix,
input: matches.get_one::<OsString>(ARG_INPUT).unwrap().clone(),
filter: matches.get_one::<String>(OPT_FILTER).cloned(),

View file

@ -1726,32 +1726,7 @@ fn test_split_non_utf8_argument_unix() {
let opt_value = [0x66, 0x6f, 0x80, 0x6f];
let opt_value = OsStr::from_bytes(&opt_value[..]);
let name = OsStr::from_bytes(name.as_bytes());
ucmd.args(&[opt, opt_value, name])
.fails()
.stderr_contains("error: invalid UTF-8 was detected in one or more arguments");
}
/// Test if there are invalid (non UTF-8) in the arguments - windows
/// clap is expected to fail/panic
#[test]
#[cfg(windows)]
fn test_split_non_utf8_argument_windows() {
use std::ffi::OsString;
use std::os::windows::ffi::OsStringExt;
let (at, mut ucmd) = at_and_ucmd!();
let name = "test_split_non_utf8_argument";
let opt = OsString::from("--additional-suffix");
RandomFile::new(&at, name).add_lines(2000);
// Here the values 0x0066 and 0x006f correspond to 'f' and 'o'
// respectively. The value 0xD800 is a lone surrogate half, invalid
// in a UTF-16 sequence.
let opt_value = [0x0066, 0x006f, 0xD800, 0x006f];
let opt_value = OsString::from_wide(&opt_value[..]);
let name = OsString::from(name);
ucmd.args(&[opt, opt_value, name])
.fails()
.stderr_contains("error: invalid UTF-8 was detected in one or more arguments");
ucmd.args(&[opt, opt_value, name]).succeeds();
}
// Test '--separator' / '-t' option following GNU tests example
@ -2022,3 +1997,63 @@ fn test_split_non_utf8_paths() {
// Check that at least one split file was created
assert!(at.plus("xaa").exists());
}
#[test]
#[cfg(target_os = "linux")]
fn test_split_non_utf8_prefix() {
use std::os::unix::ffi::OsStrExt;
let (at, mut ucmd) = at_and_ucmd!();
at.write("input.txt", "line1\nline2\nline3\nline4\n");
let prefix = std::ffi::OsStr::from_bytes(b"\xFF\xFE");
ucmd.arg("input.txt").arg(prefix).succeeds();
// Check that split files were created (functionality works)
// The actual filename may be converted due to lossy conversion, but the command should succeed
let entries: Vec<_> = std::fs::read_dir(at.as_string()).unwrap().collect();
let split_files = entries
.iter()
.filter_map(|e| e.as_ref().ok())
.filter(|entry| {
let name = entry.file_name();
let name_str = name.to_string_lossy();
name_str.starts_with("<EFBFBD>") || name_str.len() > 2 // split files should exist
})
.count();
assert!(
split_files > 0,
"Expected at least one split file to be created"
);
}
#[test]
#[cfg(target_os = "linux")]
fn test_split_non_utf8_additional_suffix() {
use std::os::unix::ffi::OsStrExt;
let (at, mut ucmd) = at_and_ucmd!();
at.write("input.txt", "line1\nline2\nline3\nline4\n");
let suffix = std::ffi::OsStr::from_bytes(b"\xFF\xFE");
ucmd.args(&["input.txt", "--additional-suffix"])
.arg(suffix)
.succeeds();
// Check that split files were created (functionality works)
// The actual filename may be converted due to lossy conversion, but the command should succeed
let entries: Vec<_> = std::fs::read_dir(at.as_string()).unwrap().collect();
let split_files = entries
.iter()
.filter_map(|e| e.as_ref().ok())
.filter(|entry| {
let name = entry.file_name();
let name_str = name.to_string_lossy();
name_str.ends_with("<EFBFBD>") || name_str.starts_with('x') // split files should exist
})
.count();
assert!(
split_files > 0,
"Expected at least one split file to be created"
);
}