From 961a27acffdb3165af64efa3c68d7b7829021be1 Mon Sep 17 00:00:00 2001 From: Andrew Liebenow Date: Tue, 22 Oct 2024 13:38:14 -0500 Subject: [PATCH 1/7] printf: accept non-UTF-8 input in FORMAT and ARGUMENT arguments Other implementations of `printf` permit arbitrary data to be passed to `printf`. The only restriction is that a null byte terminates FORMAT and ARGUMENT argument strings (since they are C strings). The current implementation only accepts FORMAT and ARGUMENT arguments that are valid UTF-8 (this is being enforced by clap). This commit removes the UTF-8 validation by switching to OsStr and OsString. This allows users to use `printf` to transmit or reformat null-safe but not UTF-8-safe data, such as text encoded in an 8-bit text encoding. See the `non_utf_8_input` test for an example (ISO-8859-1 text). [drinkcat: also squashed in this commit to ease rebase] Author: Justin Tracey uucore, printf: improve non-UTF-8 format arguments This fixes handling of format arguments, in part by eliminating duplicate implementations. Utilities with format arguments other than printf will no longer accept things like "'a" as numbers, etc. Co-authored-by: Justin Tracey --- src/uu/echo/src/echo.rs | 9 +- src/uu/printf/src/printf.rs | 17 +- src/uucore/src/lib/features/checksum.rs | 2 +- .../src/lib/features/extendedbigdecimal.rs | 12 + .../src/lib/features/format/argument.rs | 279 ++++++++++++------ src/uucore/src/lib/features/format/mod.rs | 21 +- src/uucore/src/lib/features/format/spec.rs | 73 +++-- .../src/lib/features/parser/num_parser.rs | 15 - src/uucore/src/lib/lib.rs | 64 ++-- tests/by-util/test_printf.rs | 111 ++++++- 10 files changed, 392 insertions(+), 211 deletions(-) diff --git a/src/uu/echo/src/echo.rs b/src/uu/echo/src/echo.rs index 2386e87a0..ae0849b2a 100644 --- a/src/uu/echo/src/echo.rs +++ b/src/uu/echo/src/echo.rs @@ -8,10 +8,9 @@ use clap::{Arg, ArgAction, Command}; use std::env; use std::ffi::{OsStr, OsString}; use std::io::{self, StdoutLock, Write}; -use uucore::error::{UResult, USimpleError}; +use uucore::error::UResult; use uucore::format::{FormatChar, OctalParsing, parse_escape_only}; -use uucore::format_usage; -use uucore::os_str_as_bytes; +use uucore::{format_usage, os_str_as_bytes}; use uucore::locale::get_message; @@ -223,9 +222,9 @@ pub fn uu_app() -> Command { fn execute(stdout: &mut StdoutLock, args: Vec, options: Options) -> UResult<()> { for (i, arg) in args.into_iter().enumerate() { - let bytes = os_str_as_bytes(arg.as_os_str()) - .map_err(|_| USimpleError::new(1, get_message("echo-error-non-utf8")))?; + let bytes = os_str_as_bytes(&arg)?; + // Don't print a space before the first argument if i > 0 { stdout.write_all(b" ")?; } diff --git a/src/uu/printf/src/printf.rs b/src/uu/printf/src/printf.rs index d537350f1..b9f464f19 100644 --- a/src/uu/printf/src/printf.rs +++ b/src/uu/printf/src/printf.rs @@ -4,6 +4,7 @@ // file that was distributed with this source code. use clap::{Arg, ArgAction, Command}; use std::collections::HashMap; +use std::ffi::OsString; use std::io::stdout; use std::ops::ControlFlow; use uucore::error::{UResult, UUsageError}; @@ -18,21 +19,19 @@ mod options { pub const FORMAT: &str = "FORMAT"; pub const ARGUMENT: &str = "ARGUMENT"; } + #[uucore::main] pub fn uumain(args: impl uucore::Args) -> UResult<()> { let matches = uu_app().get_matches_from(args); let format = matches - .get_one::(options::FORMAT) + .get_one::(options::FORMAT) .ok_or_else(|| UUsageError::new(1, get_message("printf-error-missing-operand")))?; let format = os_str_as_bytes(format)?; - let values: Vec<_> = match matches.get_many::(options::ARGUMENT) { - // FIXME: use os_str_as_bytes once FormatArgument supports Vec + let values: Vec<_> = match matches.get_many::(options::ARGUMENT) { Some(s) => s - .map(|os_string| { - FormatArgument::Unparsed(std::ffi::OsStr::to_string_lossy(os_string).to_string()) - }) + .map(|os_string| FormatArgument::Unparsed(os_string.to_owned())) .collect(), None => vec![], }; @@ -62,7 +61,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> { "{}", get_message_with_args( "printf-warning-ignoring-excess-arguments", - HashMap::from([("arg".to_string(), arg_str.to_string())]) + HashMap::from([("arg".to_string(), arg_str.to_string_lossy().to_string())]) ) ); } @@ -103,10 +102,10 @@ pub fn uu_app() -> Command { .help(get_message("printf-help-version")) .action(ArgAction::Version), ) - .arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(std::ffi::OsString))) + .arg(Arg::new(options::FORMAT).value_parser(clap::value_parser!(OsString))) .arg( Arg::new(options::ARGUMENT) .action(ArgAction::Append) - .value_parser(clap::value_parser!(std::ffi::OsString)), + .value_parser(clap::value_parser!(OsString)), ) } diff --git a/src/uucore/src/lib/features/checksum.rs b/src/uucore/src/lib/features/checksum.rs index 214edc16c..6f134ab4e 100644 --- a/src/uucore/src/lib/features/checksum.rs +++ b/src/uucore/src/lib/features/checksum.rs @@ -968,7 +968,7 @@ fn process_checksum_line( cached_line_format: &mut Option, last_algo: &mut Option, ) -> Result<(), LineCheckError> { - let line_bytes = os_str_as_bytes(line)?; + let line_bytes = os_str_as_bytes(line).map_err(|e| LineCheckError::UError(Box::new(e)))?; // Early return on empty or commented lines. if line.is_empty() || line_bytes.starts_with(b"#") { diff --git a/src/uucore/src/lib/features/extendedbigdecimal.rs b/src/uucore/src/lib/features/extendedbigdecimal.rs index 396b6f359..5748b6f1a 100644 --- a/src/uucore/src/lib/features/extendedbigdecimal.rs +++ b/src/uucore/src/lib/features/extendedbigdecimal.rs @@ -101,6 +101,18 @@ impl From for ExtendedBigDecimal { } } +impl From for ExtendedBigDecimal { + fn from(val: u8) -> Self { + Self::BigDecimal(val.into()) + } +} + +impl From for ExtendedBigDecimal { + fn from(val: u32) -> Self { + Self::BigDecimal(val.into()) + } +} + impl ExtendedBigDecimal { pub fn zero() -> Self { Self::BigDecimal(0.into()) diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index 7baf9a9a6..b3d002dd7 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -3,16 +3,20 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use super::ExtendedBigDecimal; +use super::{ExtendedBigDecimal, FormatError}; use crate::format::spec::ArgumentLocation; use crate::{ error::set_exit_code, + os_str_as_bytes, parser::num_parser::{ExtendedParser, ExtendedParserError}, quoting_style::{QuotingStyle, locale_aware_escape_name}, show_error, show_warning, }; use os_display::Quotable; -use std::{ffi::OsStr, num::NonZero}; +use std::{ + ffi::{OsStr, OsString}, + num::NonZero, +}; /// An argument for formatting /// @@ -24,12 +28,12 @@ use std::{ffi::OsStr, num::NonZero}; #[derive(Clone, Debug, PartialEq)] pub enum FormatArgument { Char(char), - String(String), + String(OsString), UnsignedInt(u64), SignedInt(i64), Float(ExtendedBigDecimal), /// Special argument that gets coerced into the other variants - Unparsed(String), + Unparsed(OsString), } /// A struct that holds a slice of format arguments and provides methods to access them @@ -69,63 +73,86 @@ impl<'a> FormatArguments<'a> { self.next_arg_position = self.current_offset; } - pub fn next_char(&mut self, position: &ArgumentLocation) -> u8 { + pub fn next_char(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { - Some(FormatArgument::Char(c)) => *c as u8, - Some(FormatArgument::Unparsed(s)) => s.bytes().next().unwrap_or(b'\0'), - _ => b'\0', + Some(FormatArgument::Char(c)) => Ok(*c as u8), + Some(FormatArgument::Unparsed(os)) => match os_str_as_bytes(os)?.first() { + Some(&byte) => Ok(byte), + None => Ok(b'\0'), + }, + _ => Ok(b'\0'), } } - pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a str { + pub fn next_string(&mut self, position: &ArgumentLocation) -> &'a OsStr { match self.next_arg(position) { - Some(FormatArgument::Unparsed(s) | FormatArgument::String(s)) => s, - _ => "", + Some(FormatArgument::Unparsed(os) | FormatArgument::String(os)) => os, + _ => "".as_ref(), } } - pub fn next_i64(&mut self, position: &ArgumentLocation) -> i64 { + pub fn next_i64(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { - Some(FormatArgument::SignedInt(n)) => *n, - Some(FormatArgument::Unparsed(s)) => extract_value(i64::extended_parse(s), s), - _ => 0, + Some(FormatArgument::SignedInt(n)) => Ok(*n), + Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), + _ => Ok(0), } } - pub fn next_u64(&mut self, position: &ArgumentLocation) -> u64 { + pub fn next_u64(&mut self, position: &ArgumentLocation) -> Result { match self.next_arg(position) { - Some(FormatArgument::UnsignedInt(n)) => *n, - Some(FormatArgument::Unparsed(s)) => { - // Check if the string is a character literal enclosed in quotes - if s.starts_with(['"', '\'']) { - // Extract the content between the quotes safely using chars - let mut chars = s.trim_matches(|c| c == '"' || c == '\'').chars(); - if let Some(first_char) = chars.next() { - if chars.clone().count() > 0 { - // Emit a warning if there are additional characters - let remaining: String = chars.collect(); - show_warning!( - "{remaining}: character(s) following character constant have been ignored" - ); - } - return first_char as u64; // Use only the first character - } - return 0; // Empty quotes + Some(FormatArgument::UnsignedInt(n)) => Ok(*n), + Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), + _ => Ok(0), + } + } + + pub fn next_extended_big_decimal( + &mut self, + position: &ArgumentLocation, + ) -> Result { + match self.next_arg(position) { + Some(FormatArgument::Float(n)) => Ok(n.clone()), + Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), + _ => Ok(ExtendedBigDecimal::zero()), + } + } + + fn get_num(os: &OsStr) -> Result + where + T: ExtendedParser + From + From + Default, + { + let s = os_str_as_bytes(os)?; + // Check if the string begins with a quote, and is therefore a literal + if let Some((&first, bytes)) = s.split_first() { + if (first == b'"' || first == b'\'') && !bytes.is_empty() { + let (val, len) = if let Some(c) = bytes + .utf8_chunks() + .next() + .expect("bytes should not be empty") + .valid() + .chars() + .next() + { + // Valid UTF-8 character, cast the codepoint to u32 then T + // (largest unicode codepoint is only 3 bytes, so this is safe) + ((c as u32).into(), c.len_utf8()) + } else { + // Not a valid UTF-8 character, use the first byte + (bytes[0].into(), 1) + }; + // Emit a warning if there are additional characters + if bytes.len() > len { + show_warning!( + "{}: character(s) following character constant have been ignored", + String::from_utf8_lossy(&bytes[len..]) + ); } - extract_value(u64::extended_parse(s), s) + return Ok(val); } - _ => 0, - } - } - - pub fn next_extended_big_decimal(&mut self, position: &ArgumentLocation) -> ExtendedBigDecimal { - match self.next_arg(position) { - Some(FormatArgument::Float(n)) => n.clone(), - Some(FormatArgument::Unparsed(s)) => { - extract_value(ExtendedBigDecimal::extended_parse(s), s) - } - _ => ExtendedBigDecimal::zero(), } + let s = os.to_string_lossy(); + Ok(extract_value(T::extended_parse(&s), &s)) } fn get_at_relative_position(&mut self, pos: NonZero) -> Option<&'a FormatArgument> { @@ -175,6 +202,7 @@ fn extract_value(p: Result>, input: &s } else { show_error!("{}: value not completely converted", input.quote()); } + v } } @@ -199,7 +227,10 @@ mod tests { assert!(!args.is_exhausted()); assert_eq!(Some(&FormatArgument::Char('a')), args.peek_arg()); assert!(!args.is_exhausted()); // Peek shouldn't consume - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(args.is_exhausted()); // After batch, exhausted with a single arg assert_eq!(None, args.peek_arg()); @@ -220,26 +251,50 @@ mod tests { ]); // First batch - two sequential calls - assert_eq!(b'z', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'y', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'z', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b'y', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'x', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'w', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'x', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b'w', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same pattern - assert_eq!(b'v', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'u', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'v', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b'u', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(!args.is_exhausted()); // Fourth batch - same pattern (last batch) - assert_eq!(b't', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b's', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b't', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!( + b's', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -249,28 +304,37 @@ mod tests { // Test with different method types in sequence let args = [ FormatArgument::Char('a'), - FormatArgument::String("hello".to_string()), - FormatArgument::Unparsed("123".to_string()), - FormatArgument::String("world".to_string()), + FormatArgument::String("hello".into()), + FormatArgument::Unparsed("123".into()), + FormatArgument::String("world".into()), FormatArgument::Char('z'), - FormatArgument::String("test".to_string()), + FormatArgument::String("test".into()), ]; let mut args = FormatArguments::new(&args); // First batch - next_char followed by next_string - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'1', args.next_char(&ArgumentLocation::NextArgument)); // First byte of 123 + assert_eq!( + b'1', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // First byte of 123 assert_eq!("world", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same pattern (last batch) - assert_eq!(b'z', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!( + b'z', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); assert_eq!("test", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(args.is_exhausted()); @@ -296,23 +360,23 @@ mod tests { ]); // First batch - positional access - assert_eq!(b'b', args.next_char(&non_zero_pos(2))); // Position 2 - assert_eq!(b'a', args.next_char(&non_zero_pos(1))); // Position 1 - assert_eq!(b'c', args.next_char(&non_zero_pos(3))); // Position 3 + assert_eq!(b'b', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 + assert_eq!(b'a', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 + assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same positional pattern - assert_eq!(b'e', args.next_char(&non_zero_pos(2))); // Position 2 - assert_eq!(b'd', args.next_char(&non_zero_pos(1))); // Position 1 - assert_eq!(b'f', args.next_char(&non_zero_pos(3))); // Position 3 + assert_eq!(b'e', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 + assert_eq!(b'd', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 + assert_eq!(b'f', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same positional pattern (last batch) - assert_eq!(b'h', args.next_char(&non_zero_pos(2))); // Position 2 - assert_eq!(b'g', args.next_char(&non_zero_pos(1))); // Position 1 - assert_eq!(b'i', args.next_char(&non_zero_pos(3))); // Position 3 + assert_eq!(b'h', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 + assert_eq!(b'g', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 + assert_eq!(b'i', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 args.start_next_batch(); assert!(args.is_exhausted()); } @@ -332,20 +396,29 @@ mod tests { ]); // First batch - mix of sequential and positional - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); // Sequential - assert_eq!(b'c', args.next_char(&non_zero_pos(3))); // Positional + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // Sequential + assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); // Positional args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same mixed pattern - assert_eq!(b'd', args.next_char(&ArgumentLocation::NextArgument)); // Sequential - assert_eq!(b'f', args.next_char(&non_zero_pos(3))); // Positional + assert_eq!( + b'd', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // Sequential + assert_eq!(b'f', args.next_char(&non_zero_pos(3)).unwrap()); // Positional args.start_next_batch(); assert!(!args.is_exhausted()); // Last batch - same mixed pattern - assert_eq!(b'g', args.next_char(&ArgumentLocation::NextArgument)); // Sequential - assert_eq!(b'\0', args.next_char(&non_zero_pos(3))); // Out of bounds + assert_eq!( + b'g', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); // Sequential + assert_eq!(b'\0', args.next_char(&non_zero_pos(3)).unwrap()); // Out of bounds args.start_next_batch(); assert!(args.is_exhausted()); } @@ -364,17 +437,21 @@ mod tests { let mut args = FormatArguments::new(&args); // First batch - i64, u64, decimal - assert_eq!(10, args.next_i64(&ArgumentLocation::NextArgument)); - assert_eq!(20, args.next_u64(&ArgumentLocation::NextArgument)); - let result = args.next_extended_big_decimal(&ArgumentLocation::NextArgument); + assert_eq!(10, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); + assert_eq!(20, args.next_u64(&ArgumentLocation::NextArgument).unwrap()); + let result = args + .next_extended_big_decimal(&ArgumentLocation::NextArgument) + .unwrap(); assert_eq!(ExtendedBigDecimal::zero(), result); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(30, args.next_i64(&ArgumentLocation::NextArgument)); - assert_eq!(40, args.next_u64(&ArgumentLocation::NextArgument)); - let result = args.next_extended_big_decimal(&ArgumentLocation::NextArgument); + assert_eq!(30, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); + assert_eq!(40, args.next_u64(&ArgumentLocation::NextArgument).unwrap()); + let result = args + .next_extended_big_decimal(&ArgumentLocation::NextArgument) + .unwrap(); assert_eq!(ExtendedBigDecimal::zero(), result); args.start_next_batch(); assert!(args.is_exhausted()); @@ -384,22 +461,22 @@ mod tests { fn test_unparsed_arguments() { // Test with unparsed arguments that get coerced let args = [ - FormatArgument::Unparsed("hello".to_string()), - FormatArgument::Unparsed("123".to_string()), - FormatArgument::Unparsed("hello".to_string()), - FormatArgument::Unparsed("456".to_string()), + FormatArgument::Unparsed("hello".into()), + FormatArgument::Unparsed("123".into()), + FormatArgument::Unparsed("hello".into()), + FormatArgument::Unparsed("456".into()), ]; let mut args = FormatArguments::new(&args); // First batch - string, number assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); - assert_eq!(123, args.next_i64(&ArgumentLocation::NextArgument)); + assert_eq!(123, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); - assert_eq!(456, args.next_i64(&ArgumentLocation::NextArgument)); + assert_eq!(456, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -409,25 +486,25 @@ mod tests { // Test with mixed types and positional access let args = [ FormatArgument::Char('a'), - FormatArgument::String("test".to_string()), + FormatArgument::String("test".into()), FormatArgument::UnsignedInt(42), FormatArgument::Char('b'), - FormatArgument::String("more".to_string()), + FormatArgument::String("more".into()), FormatArgument::UnsignedInt(99), ]; let mut args = FormatArguments::new(&args); // First batch - positional access of different types - assert_eq!(b'a', args.next_char(&non_zero_pos(1))); + assert_eq!(b'a', args.next_char(&non_zero_pos(1)).unwrap()); assert_eq!("test", args.next_string(&non_zero_pos(2))); - assert_eq!(42, args.next_u64(&non_zero_pos(3))); + assert_eq!(42, args.next_u64(&non_zero_pos(3)).unwrap()); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'b', args.next_char(&non_zero_pos(1))); + assert_eq!(b'b', args.next_char(&non_zero_pos(1)).unwrap()); assert_eq!("more", args.next_string(&non_zero_pos(2))); - assert_eq!(99, args.next_u64(&non_zero_pos(3))); + assert_eq!(99, args.next_u64(&non_zero_pos(3)).unwrap()); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -444,14 +521,20 @@ mod tests { ]); // First batch - assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'c', args.next_char(&non_zero_pos(3))); + assert_eq!( + b'a', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch (partial) - assert_eq!(b'd', args.next_char(&ArgumentLocation::NextArgument)); - assert_eq!(b'\0', args.next_char(&non_zero_pos(3))); // Out of bounds + assert_eq!( + b'd', + args.next_char(&ArgumentLocation::NextArgument).unwrap() + ); + assert_eq!(b'\0', args.next_char(&non_zero_pos(3)).unwrap()); // Out of bounds args.start_next_batch(); assert!(args.is_exhausted()); } diff --git a/src/uucore/src/lib/features/format/mod.rs b/src/uucore/src/lib/features/format/mod.rs index 7abd91475..532af34ef 100644 --- a/src/uucore/src/lib/features/format/mod.rs +++ b/src/uucore/src/lib/features/format/mod.rs @@ -37,8 +37,12 @@ pub mod human; pub mod num_format; mod spec; +pub use self::escape::{EscapedChar, OctalParsing}; use crate::extendedbigdecimal::ExtendedBigDecimal; -pub use argument::*; +pub use argument::{FormatArgument, FormatArguments}; + +use self::{escape::parse_escape_code, num_format::Formatter}; +use crate::{NonUtf8OsStrError, error::UError}; pub use spec::Spec; use std::{ error::Error, @@ -50,13 +54,6 @@ use std::{ use os_display::Quotable; -use crate::error::UError; - -pub use self::{ - escape::{EscapedChar, OctalParsing, parse_escape_code}, - num_format::Formatter, -}; - #[derive(Debug)] pub enum FormatError { SpecError(Vec), @@ -74,6 +71,7 @@ pub enum FormatError { /// The hexadecimal characters represent a code point that cannot represent a /// Unicode character (e.g., a surrogate code point) InvalidCharacter(char, Vec), + InvalidEncoding(NonUtf8OsStrError), } impl Error for FormatError {} @@ -85,6 +83,12 @@ impl From for FormatError { } } +impl From for FormatError { + fn from(value: NonUtf8OsStrError) -> FormatError { + FormatError::InvalidEncoding(value) + } +} + impl Display for FormatError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -118,6 +122,7 @@ impl Display for FormatError { "invalid universal character name \\{escape_char}{}", String::from_utf8_lossy(digits) ), + Self::InvalidEncoding(no) => no.fmt(f), } } } diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index 8322d26d7..d91180860 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -5,8 +5,6 @@ // spell-checker:ignore (vars) intmax ptrdiff padlen -use crate::quoting_style::{QuotingStyle, locale_aware_escape_name}; - use super::{ ExtendedBigDecimal, FormatChar, FormatError, OctalParsing, num_format::{ @@ -15,7 +13,11 @@ use super::{ }, parse_escape_only, }; -use crate::format::FormatArguments; +use crate::{ + format::FormatArguments, + os_str_as_bytes, + quoting_style::{QuotingStyle, locale_aware_escape_name}, +}; use std::{io::Write, num::NonZero, ops::ControlFlow}; /// A parsed specification for formatting a value @@ -355,7 +357,7 @@ impl Spec { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or_default(); write_padded( writer, - &[args.next_char(position)], + &[args.next_char(position)?], width, *align_left || neg_width, ) @@ -375,22 +377,21 @@ impl Spec { // TODO: We need to not use Rust's formatting for aligning the output, // so that we can just write bytes to stdout without panicking. let precision = resolve_asterisk_precision(*precision, args); - let s = args.next_string(position); + let os_str = args.next_string(position); + let bytes = os_str_as_bytes(os_str)?; + let truncated = match precision { - Some(p) if p < s.len() => &s[..p], - _ => s, + Some(p) if p < os_str.len() => &bytes[..p], + _ => bytes, }; - write_padded( - writer, - truncated.as_bytes(), - width, - *align_left || neg_width, - ) + write_padded(writer, truncated, width, *align_left || neg_width) } Self::EscapedString { position } => { - let s = args.next_string(position); - let mut parsed = Vec::new(); - for c in parse_escape_only(s.as_bytes(), OctalParsing::ThreeDigits) { + let os_str = args.next_string(position); + let bytes = os_str_as_bytes(os_str)?; + let mut parsed = Vec::::new(); + + for c in parse_escape_only(bytes, OctalParsing::ThreeDigits) { match c.write(&mut parsed)? { ControlFlow::Continue(()) => {} ControlFlow::Break(()) => { @@ -403,15 +404,11 @@ impl Spec { } Self::QuotedString { position } => { let s = locale_aware_escape_name( - args.next_string(position).as_ref(), + args.next_string(position), QuotingStyle::SHELL_ESCAPE, ); - #[cfg(unix)] - let bytes = std::os::unix::ffi::OsStringExt::into_vec(s); - #[cfg(not(unix))] - let bytes = s.to_string_lossy().as_bytes().to_owned(); - - writer.write_all(&bytes).map_err(FormatError::IoError) + let bytes = os_str_as_bytes(&s)?; + writer.write_all(bytes).map_err(FormatError::IoError) } Self::SignedInt { width, @@ -422,7 +419,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args).unwrap_or_default(); - let i = args.next_i64(position); + let i = args.next_i64(position)?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -450,7 +447,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args).unwrap_or_default(); - let i = args.next_u64(position); + let i = args.next_u64(position)?; if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -481,7 +478,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args); - let f: ExtendedBigDecimal = args.next_extended_big_decimal(position); + let f: ExtendedBigDecimal = args.next_extended_big_decimal(position)?; if precision.is_some_and(|p| p as u64 > i32::MAX as u64) { return Err(FormatError::InvalidPrecision( @@ -518,7 +515,7 @@ fn resolve_asterisk_width( match option { None => None, Some(CanAsterisk::Asterisk(loc)) => { - let nb = args.next_i64(&loc); + let nb = args.next_i64(&loc).unwrap_or(0); if nb < 0 { Some((usize::try_from(-(nb as isize)).ok().unwrap_or(0), true)) } else { @@ -537,7 +534,7 @@ fn resolve_asterisk_precision( ) -> Option { match option { None => None, - Some(CanAsterisk::Asterisk(loc)) => match args.next_i64(&loc) { + Some(CanAsterisk::Asterisk(loc)) => match args.next_i64(&loc).unwrap_or(0) { v if v >= 0 => usize::try_from(v).ok(), v if v < 0 => Some(0usize), _ => None, @@ -646,7 +643,7 @@ mod tests { Some((42, false)), resolve_asterisk_width( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("42".into())]), ) ); @@ -661,7 +658,7 @@ mod tests { Some((42, true)), resolve_asterisk_width( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".into())]), ) ); @@ -672,9 +669,9 @@ mod tests { NonZero::new(2).unwrap() ))), &mut FormatArguments::new(&[ - FormatArgument::Unparsed("1".to_string()), - FormatArgument::Unparsed("2".to_string()), - FormatArgument::Unparsed("3".to_string()) + FormatArgument::Unparsed("1".into()), + FormatArgument::Unparsed("2".into()), + FormatArgument::Unparsed("3".into()) ]), ) ); @@ -717,7 +714,7 @@ mod tests { Some(42), resolve_asterisk_precision( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("42".into())]), ) ); @@ -732,7 +729,7 @@ mod tests { Some(0), resolve_asterisk_precision( Some(CanAsterisk::Asterisk(ArgumentLocation::NextArgument)), - &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".to_string())]), + &mut FormatArguments::new(&[FormatArgument::Unparsed("-42".into())]), ) ); assert_eq!( @@ -742,9 +739,9 @@ mod tests { NonZero::new(2).unwrap() ))), &mut FormatArguments::new(&[ - FormatArgument::Unparsed("1".to_string()), - FormatArgument::Unparsed("2".to_string()), - FormatArgument::Unparsed("3".to_string()) + FormatArgument::Unparsed("1".into()), + FormatArgument::Unparsed("2".into()), + FormatArgument::Unparsed("3".into()) ]), ) ); diff --git a/src/uucore/src/lib/features/parser/num_parser.rs b/src/uucore/src/lib/features/parser/num_parser.rs index 597f4b245..3fc681645 100644 --- a/src/uucore/src/lib/features/parser/num_parser.rs +++ b/src/uucore/src/lib/features/parser/num_parser.rs @@ -546,21 +546,6 @@ pub(crate) fn parse<'a>( target: ParseTarget, allowed_suffixes: &[(char, u32)], ) -> Result> { - // Parse the " and ' prefixes separately - if target != ParseTarget::Duration { - if let Some(rest) = input.strip_prefix(['\'', '"']) { - let mut chars = rest.char_indices().fuse(); - let v = chars - .next() - .map(|(_, c)| ExtendedBigDecimal::BigDecimal(u32::from(c).into())); - return match (v, chars.next()) { - (Some(v), None) => Ok(v), - (Some(v), Some((i, _))) => Err(ExtendedParserError::PartialMatch(v, &rest[i..])), - (None, _) => Err(ExtendedParserError::NotNumeric), - }; - } - } - let trimmed_input = input.trim_ascii_start(); // Initial minus/plus sign diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index 8059dac93..6cebe5e3c 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -311,23 +311,39 @@ pub fn read_yes() -> bool { } } +#[derive(Debug)] +pub struct NonUtf8OsStrError { + input_lossy_string: String, +} + +impl std::fmt::Display for NonUtf8OsStrError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + use os_display::Quotable; + let quoted = self.input_lossy_string.quote(); + f.write_fmt(format_args!( + "invalid UTF-8 input {quoted} encountered when converting to bytes on a platform that doesn't expose byte arguments", + )) + } +} + +impl std::error::Error for NonUtf8OsStrError {} +impl error::UError for NonUtf8OsStrError {} + /// Converts an `OsStr` to a UTF-8 `&[u8]`. /// /// This always succeeds on unix platforms, /// and fails on other platforms if the string can't be coerced to UTF-8. -pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { +pub fn os_str_as_bytes(os_string: &OsStr) -> Result<&[u8], NonUtf8OsStrError> { #[cfg(unix)] - let bytes = os_string.as_bytes(); + return Ok(os_string.as_bytes()); #[cfg(not(unix))] - let bytes = os_string + os_string .to_str() - .ok_or_else(|| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })? - .as_bytes(); - - Ok(bytes) + .ok_or_else(|| NonUtf8OsStrError { + input_lossy_string: os_string.to_string_lossy().into_owned(), + }) + .map(|s| s.as_bytes()) } /// Performs a potentially lossy conversion from `OsStr` to UTF-8 bytes. @@ -336,15 +352,13 @@ pub fn os_str_as_bytes(os_string: &OsStr) -> mods::error::UResult<&[u8]> { /// and wraps [`OsStr::to_string_lossy`] on non-unix platforms. pub fn os_str_as_bytes_lossy(os_string: &OsStr) -> Cow<[u8]> { #[cfg(unix)] - let bytes = Cow::from(os_string.as_bytes()); + return Cow::from(os_string.as_bytes()); #[cfg(not(unix))] - let bytes = match os_string.to_string_lossy() { + match os_string.to_string_lossy() { Cow::Borrowed(slice) => Cow::from(slice.as_bytes()), Cow::Owned(owned) => Cow::from(owned.into_bytes()), - }; - - bytes + } } /// Converts a `&[u8]` to an `&OsStr`, @@ -354,13 +368,12 @@ pub fn os_str_as_bytes_lossy(os_string: &OsStr) -> Cow<[u8]> { /// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { #[cfg(unix)] - let os_str = Cow::Borrowed(OsStr::from_bytes(bytes)); - #[cfg(not(unix))] - let os_str = Cow::Owned(OsString::from(str::from_utf8(bytes).map_err(|_| { - mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr") - })?)); + return Ok(Cow::Borrowed(OsStr::from_bytes(bytes))); - Ok(os_str) + #[cfg(not(unix))] + Ok(Cow::Owned(OsString::from(str::from_utf8(bytes).map_err( + |_| mods::error::UUsageError::new(1, "Unable to transform bytes into OsStr"), + )?))) } /// Converts a `Vec` into an `OsString`, parsing as UTF-8 on non-unix platforms. @@ -369,13 +382,12 @@ pub fn os_str_from_bytes(bytes: &[u8]) -> mods::error::UResult> { /// and fails on other platforms if the bytes can't be parsed as UTF-8. pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { #[cfg(unix)] - let s = OsString::from_vec(vec); - #[cfg(not(unix))] - let s = OsString::from(String::from_utf8(vec).map_err(|_| { - mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") - })?); + return Ok(OsString::from_vec(vec)); - Ok(s) + #[cfg(not(unix))] + Ok(OsString::from(String::from_utf8(vec).map_err(|_| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })?)) } /// Converts an `OsString` into a `Vec`, parsing as UTF-8 on non-unix platforms. diff --git a/tests/by-util/test_printf.rs b/tests/by-util/test_printf.rs index 559a803bf..5e2c0c7a0 100644 --- a/tests/by-util/test_printf.rs +++ b/tests/by-util/test_printf.rs @@ -805,7 +805,7 @@ fn test_overflow() { fn partial_char() { new_ucmd!() .args(&["%d", "'abc"]) - .fails_with_code(1) + .succeeds() .stdout_is("97") .stderr_is( "printf: warning: bc: character(s) following character constant have been ignored\n", @@ -1293,23 +1293,80 @@ fn float_arg_with_whitespace() { #[test] fn mb_input() { - for format in ["\"á", "\'á", "'\u{e1}"] { + let cases = vec![ + ("%04x\n", "\"á", "00e1\n"), + ("%04x\n", "'á", "00e1\n"), + ("%04x\n", "'\u{e1}", "00e1\n"), + ("%i\n", "\"á", "225\n"), + ("%i\n", "'á", "225\n"), + ("%i\n", "'\u{e1}", "225\n"), + ("%f\n", "'á", "225.000000\n"), + ]; + for (format, arg, stdout) in cases { new_ucmd!() - .args(&["%04x\n", format]) + .args(&[format, arg]) .succeeds() - .stdout_only("00e1\n"); + .stdout_only(stdout); } let cases = vec![ - ("\"á=", "="), - ("\'á-", "-"), - ("\'á=-==", "=-=="), - ("'\u{e1}++", "++"), + ("%04x\n", "\"á=", "00e1\n", "="), + ("%04x\n", "'á-", "00e1\n", "-"), + ("%04x\n", "'á=-==", "00e1\n", "=-=="), + ("%04x\n", "'á'", "00e1\n", "'"), + ("%04x\n", "'\u{e1}++", "00e1\n", "++"), + ("%04x\n", "''á'", "0027\n", "á'"), + ("%i\n", "\"á=", "225\n", "="), ]; - - for (format, expected) in cases { + for (format, arg, stdout, stderr) in cases { new_ucmd!() - .args(&["%04x\n", format]) + .args(&[format, arg]) + .succeeds() + .stdout_is(stdout) + .stderr_is(format!("printf: warning: {stderr}: character(s) following character constant have been ignored\n")); + } + + for arg in ["\"", "'"] { + new_ucmd!() + .args(&["%04x\n", arg]) + .fails() + .stderr_contains("expected a numeric value"); + } +} + +#[test] +#[cfg(target_family = "unix")] +fn mb_invalid_unicode() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + let cases = vec![ + ("%04x\n", b"\"\xe1", "00e1\n"), + ("%04x\n", b"'\xe1", "00e1\n"), + ("%i\n", b"\"\xe1", "225\n"), + ("%i\n", b"'\xe1", "225\n"), + ("%f\n", b"'\xe1", "225.000000\n"), + ]; + for (format, arg, stdout) in cases { + new_ucmd!() + .arg(format) + .arg(OsStr::from_bytes(arg)) + .succeeds() + .stdout_only(stdout); + } + + let cases = vec![ + (b"\"\xe1=".as_slice(), "="), + (b"'\xe1-".as_slice(), "-"), + (b"'\xe1=-==".as_slice(), "=-=="), + (b"'\xe1'".as_slice(), "'"), + // unclear if original or replacement character is better in stderr + //(b"''\xe1'".as_slice(), "'�'"), + ]; + for (arg, expected) in cases { + new_ucmd!() + .arg("%04x\n") + .arg(OsStr::from_bytes(arg)) .succeeds() .stdout_is("00e1\n") .stderr_is(format!("printf: warning: {expected}: character(s) following character constant have been ignored\n")); @@ -1364,3 +1421,35 @@ fn positional_format_specifiers() { .succeeds() .stdout_only("Octal: 115, Int: 42, Float: 3.141590, String: hello, Hex: ff, Scientific: 1.000000e-05, Char: A, Unsigned: 100, Integer: 123"); } + +#[test] +#[cfg(target_family = "unix")] +fn non_utf_8_input() { + use std::ffi::OsStr; + use std::os::unix::ffi::OsStrExt; + + // ISO-8859-1 encoded text + // spell-checker:disable + const INPUT_AND_OUTPUT: &[u8] = + b"Swer an rehte g\xFCete wendet s\xEEn gem\xFCete, dem volget s\xE6lde und \xEAre."; + // spell-checker:enable + + let os_str = OsStr::from_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%s") + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg(os_str) + .succeeds() + .stdout_only_bytes(INPUT_AND_OUTPUT); + + new_ucmd!() + .arg("%d") + .arg(os_str) + .fails() + .stderr_contains("expected a numeric value"); +} From d9be3315c1cf8f0e069a368ca5fe83a86b47bd72 Mon Sep 17 00:00:00 2001 From: Justin Tracey Date: Wed, 28 May 2025 19:39:19 -0400 Subject: [PATCH 2/7] printf: remove passing tests from why-error.md --- util/why-error.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/util/why-error.md b/util/why-error.md index c4c371d07..0fdff867a 100644 --- a/util/why-error.md +++ b/util/why-error.md @@ -38,11 +38,7 @@ This file documents why some tests are failing: * gnu/tests/mv/part-hardlink.sh * gnu/tests/od/od-N.sh * gnu/tests/od/od-float.sh -* gnu/tests/printf/printf-cov.pl -* gnu/tests/printf/printf-indexed.sh -* gnu/tests/printf/printf-mb.sh * gnu/tests/printf/printf-quote.sh -* gnu/tests/printf/printf.sh * gnu/tests/ptx/ptx-overrun.sh * gnu/tests/ptx/ptx.pl * gnu/tests/rm/empty-inacc.sh - https://github.com/uutils/coreutils/issues/7033 From 51453f833e4bf33795597b6cc0513361a05eee29 Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Fri, 11 Jul 2025 11:55:08 +0800 Subject: [PATCH 3/7] uucore: num_parser: Add note about where literals are parsed --- src/uucore/src/lib/features/parser/num_parser.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/uucore/src/lib/features/parser/num_parser.rs b/src/uucore/src/lib/features/parser/num_parser.rs index 3fc681645..751373654 100644 --- a/src/uucore/src/lib/features/parser/num_parser.rs +++ b/src/uucore/src/lib/features/parser/num_parser.rs @@ -546,6 +546,9 @@ pub(crate) fn parse<'a>( target: ParseTarget, allowed_suffixes: &[(char, u32)], ) -> Result> { + // Note: literals with ' and " prefixes are parsed earlier on in argument parsing, + // before UTF-8 conversion. + let trimmed_input = input.trim_ascii_start(); // Initial minus/plus sign From 5b6a617024517dd9e61fd22daa2eefe314fa08ac Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Fri, 11 Jul 2025 11:54:20 +0800 Subject: [PATCH 4/7] uucore: format: Have get_num return T directly (not result) And fix all the callers, e.g. all the next_ functions. --- .../src/lib/features/format/argument.rs | 185 ++++++------------ src/uucore/src/lib/features/format/spec.rs | 12 +- 2 files changed, 70 insertions(+), 127 deletions(-) diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index b3d002dd7..d02ffde04 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -3,7 +3,7 @@ // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. -use super::{ExtendedBigDecimal, FormatError}; +use super::ExtendedBigDecimal; use crate::format::spec::ArgumentLocation; use crate::{ error::set_exit_code, @@ -73,14 +73,14 @@ impl<'a> FormatArguments<'a> { self.next_arg_position = self.current_offset; } - pub fn next_char(&mut self, position: &ArgumentLocation) -> Result { + pub fn next_char(&mut self, position: &ArgumentLocation) -> u8 { match self.next_arg(position) { - Some(FormatArgument::Char(c)) => Ok(*c as u8), - Some(FormatArgument::Unparsed(os)) => match os_str_as_bytes(os)?.first() { - Some(&byte) => Ok(byte), - None => Ok(b'\0'), + Some(FormatArgument::Char(c)) => *c as u8, + Some(FormatArgument::Unparsed(os)) => match os_str_as_bytes(os) { + Ok(bytes) => bytes.first().copied().unwrap_or(b'\0'), + Err(_) => b'\0', }, - _ => Ok(b'\0'), + _ => b'\0', } } @@ -91,38 +91,36 @@ impl<'a> FormatArguments<'a> { } } - pub fn next_i64(&mut self, position: &ArgumentLocation) -> Result { + pub fn next_i64(&mut self, position: &ArgumentLocation) -> i64 { match self.next_arg(position) { - Some(FormatArgument::SignedInt(n)) => Ok(*n), + Some(FormatArgument::SignedInt(n)) => *n, Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), - _ => Ok(0), + _ => 0, } } - pub fn next_u64(&mut self, position: &ArgumentLocation) -> Result { + pub fn next_u64(&mut self, position: &ArgumentLocation) -> u64 { match self.next_arg(position) { - Some(FormatArgument::UnsignedInt(n)) => Ok(*n), + Some(FormatArgument::UnsignedInt(n)) => *n, Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), - _ => Ok(0), + _ => 0, } } - pub fn next_extended_big_decimal( - &mut self, - position: &ArgumentLocation, - ) -> Result { + pub fn next_extended_big_decimal(&mut self, position: &ArgumentLocation) -> ExtendedBigDecimal { match self.next_arg(position) { - Some(FormatArgument::Float(n)) => Ok(n.clone()), + Some(FormatArgument::Float(n)) => n.clone(), Some(FormatArgument::Unparsed(os)) => Self::get_num::(os), - _ => Ok(ExtendedBigDecimal::zero()), + _ => ExtendedBigDecimal::zero(), } } - fn get_num(os: &OsStr) -> Result + fn get_num(os: &OsStr) -> T where T: ExtendedParser + From + From + Default, { - let s = os_str_as_bytes(os)?; + // FIXME: Remove unwrap + let s = os_str_as_bytes(os).unwrap(); // Check if the string begins with a quote, and is therefore a literal if let Some((&first, bytes)) = s.split_first() { if (first == b'"' || first == b'\'') && !bytes.is_empty() { @@ -148,11 +146,11 @@ impl<'a> FormatArguments<'a> { String::from_utf8_lossy(&bytes[len..]) ); } - return Ok(val); + return val; } } let s = os.to_string_lossy(); - Ok(extract_value(T::extended_parse(&s), &s)) + extract_value(T::extended_parse(&s), &s) } fn get_at_relative_position(&mut self, pos: NonZero) -> Option<&'a FormatArgument> { @@ -227,10 +225,7 @@ mod tests { assert!(!args.is_exhausted()); assert_eq!(Some(&FormatArgument::Char('a')), args.peek_arg()); assert!(!args.is_exhausted()); // Peek shouldn't consume - assert_eq!( - b'a', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); + assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(args.is_exhausted()); // After batch, exhausted with a single arg assert_eq!(None, args.peek_arg()); @@ -251,50 +246,26 @@ mod tests { ]); // First batch - two sequential calls - assert_eq!( - b'z', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); - assert_eq!( - b'y', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); + assert_eq!(b'z', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!(b'y', args.next_char(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!( - b'x', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); - assert_eq!( - b'w', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); + assert_eq!(b'x', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!(b'w', args.next_char(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same pattern - assert_eq!( - b'v', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); - assert_eq!( - b'u', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); + assert_eq!(b'v', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!(b'u', args.next_char(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Fourth batch - same pattern (last batch) - assert_eq!( - b't', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); - assert_eq!( - b's', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); + assert_eq!(b't', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!(b's', args.next_char(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -313,28 +284,19 @@ mod tests { let mut args = FormatArguments::new(&args); // First batch - next_char followed by next_string - assert_eq!( - b'a', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); + assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!( - b'1', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); // First byte of 123 + assert_eq!(b'1', args.next_char(&ArgumentLocation::NextArgument)); // First byte of 123 assert_eq!("world", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same pattern (last batch) - assert_eq!( - b'z', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); + assert_eq!(b'z', args.next_char(&ArgumentLocation::NextArgument)); assert_eq!("test", args.next_string(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(args.is_exhausted()); @@ -360,23 +322,23 @@ mod tests { ]); // First batch - positional access - assert_eq!(b'b', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 - assert_eq!(b'a', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 - assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 + assert_eq!(b'b', args.next_char(&non_zero_pos(2))); // Position 2 + assert_eq!(b'a', args.next_char(&non_zero_pos(1))); // Position 1 + assert_eq!(b'c', args.next_char(&non_zero_pos(3))); // Position 3 args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same positional pattern - assert_eq!(b'e', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 - assert_eq!(b'd', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 - assert_eq!(b'f', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 + assert_eq!(b'e', args.next_char(&non_zero_pos(2))); // Position 2 + assert_eq!(b'd', args.next_char(&non_zero_pos(1))); // Position 1 + assert_eq!(b'f', args.next_char(&non_zero_pos(3))); // Position 3 args.start_next_batch(); assert!(!args.is_exhausted()); // Third batch - same positional pattern (last batch) - assert_eq!(b'h', args.next_char(&non_zero_pos(2)).unwrap()); // Position 2 - assert_eq!(b'g', args.next_char(&non_zero_pos(1)).unwrap()); // Position 1 - assert_eq!(b'i', args.next_char(&non_zero_pos(3)).unwrap()); // Position 3 + assert_eq!(b'h', args.next_char(&non_zero_pos(2))); // Position 2 + assert_eq!(b'g', args.next_char(&non_zero_pos(1))); // Position 1 + assert_eq!(b'i', args.next_char(&non_zero_pos(3))); // Position 3 args.start_next_batch(); assert!(args.is_exhausted()); } @@ -396,29 +358,20 @@ mod tests { ]); // First batch - mix of sequential and positional - assert_eq!( - b'a', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); // Sequential - assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); // Positional + assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); // Sequential + assert_eq!(b'c', args.next_char(&non_zero_pos(3))); // Positional args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same mixed pattern - assert_eq!( - b'd', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); // Sequential - assert_eq!(b'f', args.next_char(&non_zero_pos(3)).unwrap()); // Positional + assert_eq!(b'd', args.next_char(&ArgumentLocation::NextArgument)); // Sequential + assert_eq!(b'f', args.next_char(&non_zero_pos(3))); // Positional args.start_next_batch(); assert!(!args.is_exhausted()); // Last batch - same mixed pattern - assert_eq!( - b'g', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); // Sequential - assert_eq!(b'\0', args.next_char(&non_zero_pos(3)).unwrap()); // Out of bounds + assert_eq!(b'g', args.next_char(&ArgumentLocation::NextArgument)); // Sequential + assert_eq!(b'\0', args.next_char(&non_zero_pos(3))); // Out of bounds args.start_next_batch(); assert!(args.is_exhausted()); } @@ -437,21 +390,17 @@ mod tests { let mut args = FormatArguments::new(&args); // First batch - i64, u64, decimal - assert_eq!(10, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); - assert_eq!(20, args.next_u64(&ArgumentLocation::NextArgument).unwrap()); - let result = args - .next_extended_big_decimal(&ArgumentLocation::NextArgument) - .unwrap(); + assert_eq!(10, args.next_i64(&ArgumentLocation::NextArgument)); + assert_eq!(20, args.next_u64(&ArgumentLocation::NextArgument)); + let result = args.next_extended_big_decimal(&ArgumentLocation::NextArgument); assert_eq!(ExtendedBigDecimal::zero(), result); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(30, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); - assert_eq!(40, args.next_u64(&ArgumentLocation::NextArgument).unwrap()); - let result = args - .next_extended_big_decimal(&ArgumentLocation::NextArgument) - .unwrap(); + assert_eq!(30, args.next_i64(&ArgumentLocation::NextArgument)); + assert_eq!(40, args.next_u64(&ArgumentLocation::NextArgument)); + let result = args.next_extended_big_decimal(&ArgumentLocation::NextArgument); assert_eq!(ExtendedBigDecimal::zero(), result); args.start_next_batch(); assert!(args.is_exhausted()); @@ -470,13 +419,13 @@ mod tests { // First batch - string, number assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); - assert_eq!(123, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); + assert_eq!(123, args.next_i64(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern assert_eq!("hello", args.next_string(&ArgumentLocation::NextArgument)); - assert_eq!(456, args.next_i64(&ArgumentLocation::NextArgument).unwrap()); + assert_eq!(456, args.next_i64(&ArgumentLocation::NextArgument)); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -495,16 +444,16 @@ mod tests { let mut args = FormatArguments::new(&args); // First batch - positional access of different types - assert_eq!(b'a', args.next_char(&non_zero_pos(1)).unwrap()); + assert_eq!(b'a', args.next_char(&non_zero_pos(1))); assert_eq!("test", args.next_string(&non_zero_pos(2))); - assert_eq!(42, args.next_u64(&non_zero_pos(3)).unwrap()); + assert_eq!(42, args.next_u64(&non_zero_pos(3))); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch - same pattern - assert_eq!(b'b', args.next_char(&non_zero_pos(1)).unwrap()); + assert_eq!(b'b', args.next_char(&non_zero_pos(1))); assert_eq!("more", args.next_string(&non_zero_pos(2))); - assert_eq!(99, args.next_u64(&non_zero_pos(3)).unwrap()); + assert_eq!(99, args.next_u64(&non_zero_pos(3))); args.start_next_batch(); assert!(args.is_exhausted()); } @@ -521,20 +470,14 @@ mod tests { ]); // First batch - assert_eq!( - b'a', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); - assert_eq!(b'c', args.next_char(&non_zero_pos(3)).unwrap()); + assert_eq!(b'a', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!(b'c', args.next_char(&non_zero_pos(3))); args.start_next_batch(); assert!(!args.is_exhausted()); // Second batch (partial) - assert_eq!( - b'd', - args.next_char(&ArgumentLocation::NextArgument).unwrap() - ); - assert_eq!(b'\0', args.next_char(&non_zero_pos(3)).unwrap()); // Out of bounds + assert_eq!(b'd', args.next_char(&ArgumentLocation::NextArgument)); + assert_eq!(b'\0', args.next_char(&non_zero_pos(3))); // Out of bounds args.start_next_batch(); assert!(args.is_exhausted()); } diff --git a/src/uucore/src/lib/features/format/spec.rs b/src/uucore/src/lib/features/format/spec.rs index d91180860..98d455994 100644 --- a/src/uucore/src/lib/features/format/spec.rs +++ b/src/uucore/src/lib/features/format/spec.rs @@ -357,7 +357,7 @@ impl Spec { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or_default(); write_padded( writer, - &[args.next_char(position)?], + &[args.next_char(position)], width, *align_left || neg_width, ) @@ -419,7 +419,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args).unwrap_or_default(); - let i = args.next_i64(position)?; + let i = args.next_i64(position); if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -447,7 +447,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args).unwrap_or_default(); - let i = args.next_u64(position)?; + let i = args.next_u64(position); if precision as u64 > i32::MAX as u64 { return Err(FormatError::InvalidPrecision(precision.to_string())); @@ -478,7 +478,7 @@ impl Spec { } => { let (width, neg_width) = resolve_asterisk_width(*width, args).unwrap_or((0, false)); let precision = resolve_asterisk_precision(*precision, args); - let f: ExtendedBigDecimal = args.next_extended_big_decimal(position)?; + let f: ExtendedBigDecimal = args.next_extended_big_decimal(position); if precision.is_some_and(|p| p as u64 > i32::MAX as u64) { return Err(FormatError::InvalidPrecision( @@ -515,7 +515,7 @@ fn resolve_asterisk_width( match option { None => None, Some(CanAsterisk::Asterisk(loc)) => { - let nb = args.next_i64(&loc).unwrap_or(0); + let nb = args.next_i64(&loc); if nb < 0 { Some((usize::try_from(-(nb as isize)).ok().unwrap_or(0), true)) } else { @@ -534,7 +534,7 @@ fn resolve_asterisk_precision( ) -> Option { match option { None => None, - Some(CanAsterisk::Asterisk(loc)) => match args.next_i64(&loc).unwrap_or(0) { + Some(CanAsterisk::Asterisk(loc)) => match args.next_i64(&loc) { v if v >= 0 => usize::try_from(v).ok(), v if v < 0 => Some(0usize), _ => None, From 9ddb8092e748ee479cd7df98049fc7768893a0e9 Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Fri, 11 Jul 2025 14:16:18 +0800 Subject: [PATCH 5/7] uucore: num_parser: Copy PartialMatch String in ExtendedParserError In one case, we'll need an actual owned String in PartialMatch, so it's easier to just use that. Removes a bunch of lifetime things in the code too. --- .../src/lib/features/format/argument.rs | 2 +- .../src/lib/features/parser/num_parser.rs | 339 +++++++++++------- 2 files changed, 201 insertions(+), 140 deletions(-) diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index d02ffde04..105092ebf 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -172,7 +172,7 @@ impl<'a> FormatArguments<'a> { } } -fn extract_value(p: Result>, input: &str) -> T { +fn extract_value(p: Result>, input: &str) -> T { match p { Ok(v) => v, Err(e) => { diff --git a/src/uucore/src/lib/features/parser/num_parser.rs b/src/uucore/src/lib/features/parser/num_parser.rs index 751373654..5f7d89538 100644 --- a/src/uucore/src/lib/features/parser/num_parser.rs +++ b/src/uucore/src/lib/features/parser/num_parser.rs @@ -109,12 +109,12 @@ impl Base { /// Type returned if a number could not be parsed in its entirety #[derive(Debug, PartialEq)] -pub enum ExtendedParserError<'a, T> { +pub enum ExtendedParserError { /// The input as a whole makes no sense NotNumeric, /// The beginning of the input made sense and has been parsed, /// while the remaining doesn't. - PartialMatch(T, &'a str), + PartialMatch(T, String), /// The value has overflowed the type storage. The returned value /// is saturated (e.g. positive or negative infinity, or min/max /// value for the integer type). @@ -124,7 +124,7 @@ pub enum ExtendedParserError<'a, T> { Underflow(T), } -impl<'a, T> ExtendedParserError<'a, T> +impl ExtendedParserError where T: Zero, { @@ -143,12 +143,12 @@ where /// conversion. fn map( self, - f: impl FnOnce(T) -> Result>, - ) -> ExtendedParserError<'a, U> + f: impl FnOnce(T) -> Result>, + ) -> ExtendedParserError where U: Zero, { - fn extract(v: Result>) -> U + fn extract(v: Result>) -> U where U: Zero, { @@ -172,15 +172,15 @@ where /// and `f64` float, where octal and binary formats are not allowed. pub trait ExtendedParser { // We pick a hopefully different name for our parser, to avoid clash with standard traits. - fn extended_parse(input: &str) -> Result> + fn extended_parse(input: &str) -> Result> where Self: Sized; } impl ExtendedParser for i64 { /// Parse a number as i64. No fractional part is allowed. - fn extended_parse(input: &str) -> Result> { - fn into_i64<'a>(ebd: ExtendedBigDecimal) -> Result> { + fn extended_parse(input: &str) -> Result> { + fn into_i64(ebd: ExtendedBigDecimal) -> Result> { match ebd { ExtendedBigDecimal::BigDecimal(bd) => { let (digits, scale) = bd.into_bigint_and_scale(); @@ -214,8 +214,8 @@ impl ExtendedParser for i64 { impl ExtendedParser for u64 { /// Parse a number as u64. No fractional part is allowed. - fn extended_parse(input: &str) -> Result> { - fn into_u64<'a>(ebd: ExtendedBigDecimal) -> Result> { + fn extended_parse(input: &str) -> Result> { + fn into_u64(ebd: ExtendedBigDecimal) -> Result> { match ebd { ExtendedBigDecimal::BigDecimal(bd) => { let (digits, scale) = bd.into_bigint_and_scale(); @@ -251,8 +251,8 @@ impl ExtendedParser for u64 { impl ExtendedParser for f64 { /// Parse a number as f64 - fn extended_parse(input: &str) -> Result> { - fn into_f64<'a>(ebd: ExtendedBigDecimal) -> Result> { + fn extended_parse(input: &str) -> Result> { + fn into_f64(ebd: ExtendedBigDecimal) -> Result> { // TODO: _Some_ of this is generic, so this should probably be implemented as an ExtendedBigDecimal trait (ToPrimitive). let v = match ebd { ExtendedBigDecimal::BigDecimal(bd) => { @@ -285,7 +285,7 @@ impl ExtendedParser for ExtendedBigDecimal { /// Parse a number as an ExtendedBigDecimal fn extended_parse( input: &str, - ) -> Result> { + ) -> Result> { parse(input, ParseTarget::Decimal, &[]) } } @@ -349,11 +349,11 @@ fn parse_suffix_multiplier<'a>(str: &'a str, allowed_suffixes: &[(char, u32)]) - (1, str) } -fn parse_special_value<'a>( - input: &'a str, +fn parse_special_value( + input: &str, negative: bool, allowed_suffixes: &[(char, u32)], -) -> Result> { +) -> Result> { let input_lc = input.to_ascii_lowercase(); // Array of ("String to match", return value when sign positive, when sign negative) @@ -376,7 +376,7 @@ fn parse_special_value<'a>( return if rest.is_empty() { Ok(special) } else { - Err(ExtendedParserError::PartialMatch(special, rest)) + Err(ExtendedParserError::PartialMatch(special, rest.to_string())) }; } } @@ -386,7 +386,7 @@ fn parse_special_value<'a>( /// Underflow/Overflow errors always contain 0 or infinity. /// overflow: true for overflow, false for underflow. -fn make_error<'a>(overflow: bool, negative: bool) -> ExtendedParserError<'a, ExtendedBigDecimal> { +fn make_error(overflow: bool, negative: bool) -> ExtendedParserError { let mut v = if overflow { ExtendedBigDecimal::Infinity } else { @@ -468,13 +468,13 @@ fn pow_with_context(bd: &BigDecimal, exp: i64, ctx: &Context) -> BigDecimal { } /// Construct an [`ExtendedBigDecimal`] based on parsed data -fn construct_extended_big_decimal<'a>( +fn construct_extended_big_decimal( digits: BigUint, negative: bool, base: Base, scale: i64, exponent: BigInt, -) -> Result> { +) -> Result> { if digits == BigUint::zero() { // Return return 0 if the digits are zero. In particular, we do not ever // return Overflow/Underflow errors in that case. @@ -541,11 +541,11 @@ pub(crate) enum ParseTarget { Duration, } -pub(crate) fn parse<'a>( - input: &'a str, +pub(crate) fn parse( + input: &str, target: ParseTarget, allowed_suffixes: &[(char, u32)], -) -> Result> { +) -> Result> { // Note: literals with ' and " prefixes are parsed earlier on in argument parsing, // before UTF-8 conversion. @@ -604,7 +604,7 @@ pub(crate) fn parse<'a>( } else { ExtendedBigDecimal::zero() }; - return Err(ExtendedParserError::PartialMatch(ebd, partial)); + return Err(ExtendedParserError::PartialMatch(ebd, partial.to_string())); } return if target == ParseTarget::Integral { @@ -628,7 +628,7 @@ pub(crate) fn parse<'a>( } else { Err(ExtendedParserError::PartialMatch( ebd_result.unwrap_or_else(|e| e.extract()), - rest, + rest.to_string(), )) } } @@ -674,14 +674,14 @@ mod tests { u64::extended_parse(""), Err(ExtendedParserError::NotNumeric) )); - assert!(matches!( + assert_eq!( u64::extended_parse("123.15"), - Err(ExtendedParserError::PartialMatch(123, ".15")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(123, ".15".to_string())) + ); + assert_eq!( u64::extended_parse("123e10"), - Err(ExtendedParserError::PartialMatch(123, "e10")) - )); + Err(ExtendedParserError::PartialMatch(123, "e10".to_string())) + ); } #[test] @@ -695,18 +695,18 @@ mod tests { )); assert_eq!(Ok(i64::MAX), i64::extended_parse(&format!("{}", i64::MAX))); assert_eq!(Ok(i64::MIN), i64::extended_parse(&format!("{}", i64::MIN))); - assert!(matches!( + assert_eq!( i64::extended_parse(&format!("{}", u64::MAX)), Err(ExtendedParserError::Overflow(i64::MAX)) - )); + ); assert!(matches!( i64::extended_parse(&format!("{}", i64::MAX as u64 + 1)), Err(ExtendedParserError::Overflow(i64::MAX)) )); - assert!(matches!( + assert_eq!( i64::extended_parse("-123e10"), - Err(ExtendedParserError::PartialMatch(-123, "e10")) - )); + Err(ExtendedParserError::PartialMatch(-123, "e10".to_string())) + ); assert!(matches!( i64::extended_parse(&format!("{}", -(u64::MAX as i128))), Err(ExtendedParserError::Overflow(i64::MIN)) @@ -758,20 +758,34 @@ mod tests { Ok(0.15), f64::extended_parse(".150000000000000000000000000231313") ); - assert!(matches!(f64::extended_parse("123.15e"), - Err(ExtendedParserError::PartialMatch(f, "e")) if f == 123.15)); - assert!(matches!(f64::extended_parse("123.15E"), - Err(ExtendedParserError::PartialMatch(f, "E")) if f == 123.15)); - assert!(matches!(f64::extended_parse("123.15e-"), - Err(ExtendedParserError::PartialMatch(f, "e-")) if f == 123.15)); - assert!(matches!(f64::extended_parse("123.15e+"), - Err(ExtendedParserError::PartialMatch(f, "e+")) if f == 123.15)); - assert!(matches!(f64::extended_parse("123.15e."), - Err(ExtendedParserError::PartialMatch(f, "e.")) if f == 123.15)); - assert!(matches!(f64::extended_parse("1.2.3"), - Err(ExtendedParserError::PartialMatch(f, ".3")) if f == 1.2)); - assert!(matches!(f64::extended_parse("123.15p5"), - Err(ExtendedParserError::PartialMatch(f, "p5")) if f == 123.15)); + assert_eq!( + f64::extended_parse("123.15e"), + Err(ExtendedParserError::PartialMatch(123.15, "e".to_string())) + ); + assert_eq!( + f64::extended_parse("123.15E"), + Err(ExtendedParserError::PartialMatch(123.15, "E".to_string())) + ); + assert_eq!( + f64::extended_parse("123.15e-"), + Err(ExtendedParserError::PartialMatch(123.15, "e-".to_string())) + ); + assert_eq!( + f64::extended_parse("123.15e+"), + Err(ExtendedParserError::PartialMatch(123.15, "e+".to_string())) + ); + assert_eq!( + f64::extended_parse("123.15e."), + Err(ExtendedParserError::PartialMatch(123.15, "e.".to_string())) + ); + assert_eq!( + f64::extended_parse("1.2.3"), + Err(ExtendedParserError::PartialMatch(1.2, ".3".to_string())) + ); + assert_eq!( + f64::extended_parse("123.15p5"), + Err(ExtendedParserError::PartialMatch(123.15, "p5".to_string())) + ); // Minus zero. 0.0 == -0.0 so we explicitly check the sign. assert_eq!(Ok(0.0), f64::extended_parse("-0.0")); assert!(f64::extended_parse("-0.0").unwrap().is_sign_negative()); @@ -794,10 +808,20 @@ mod tests { assert!(f64::extended_parse("nan").unwrap().is_sign_positive()); assert!(f64::extended_parse("NAN").unwrap().is_nan()); assert!(f64::extended_parse("NAN").unwrap().is_sign_positive()); - assert!(matches!(f64::extended_parse("-infinit"), - Err(ExtendedParserError::PartialMatch(f, "init")) if f == f64::NEG_INFINITY)); - assert!(matches!(f64::extended_parse("-infinity00"), - Err(ExtendedParserError::PartialMatch(f, "00")) if f == f64::NEG_INFINITY)); + assert_eq!( + f64::extended_parse("-infinit"), + Err(ExtendedParserError::PartialMatch( + f64::NEG_INFINITY, + "init".to_string() + )) + ); + assert_eq!( + f64::extended_parse("-infinity00"), + Err(ExtendedParserError::PartialMatch( + f64::NEG_INFINITY, + "00".to_string() + )) + ); assert!(f64::extended_parse(&format!("{}", u64::MAX)).is_ok()); assert!(f64::extended_parse(&format!("{}", i64::MIN)).is_ok()); @@ -982,14 +1006,22 @@ mod tests { // but we can check that the number still gets parsed properly: 0x0.8e5 is 0x8e5 / 16**3 assert_eq!(Ok(0.555908203125), f64::extended_parse("0x0.8e5")); - assert!(matches!(f64::extended_parse("0x0.1p"), - Err(ExtendedParserError::PartialMatch(f, "p")) if f == 0.0625)); - assert!(matches!(f64::extended_parse("0x0.1p-"), - Err(ExtendedParserError::PartialMatch(f, "p-")) if f == 0.0625)); - assert!(matches!(f64::extended_parse("0x.1p+"), - Err(ExtendedParserError::PartialMatch(f, "p+")) if f == 0.0625)); - assert!(matches!(f64::extended_parse("0x.1p."), - Err(ExtendedParserError::PartialMatch(f, "p.")) if f == 0.0625)); + assert_eq!( + f64::extended_parse("0x0.1p"), + Err(ExtendedParserError::PartialMatch(0.0625, "p".to_string())) + ); + assert_eq!( + f64::extended_parse("0x0.1p-"), + Err(ExtendedParserError::PartialMatch(0.0625, "p-".to_string())) + ); + assert_eq!( + f64::extended_parse("0x.1p+"), + Err(ExtendedParserError::PartialMatch(0.0625, "p+".to_string())) + ); + assert_eq!( + f64::extended_parse("0x.1p."), + Err(ExtendedParserError::PartialMatch(0.0625, "p.".to_string())) + ); assert_eq!( Ok(ExtendedBigDecimal::BigDecimal( @@ -1049,40 +1081,58 @@ mod tests { )); // Not actually hex numbers, but the prefixes look like it. - assert!(matches!(f64::extended_parse("0x"), - Err(ExtendedParserError::PartialMatch(f, "x")) if f == 0.0)); - assert!(matches!(f64::extended_parse("0x."), - Err(ExtendedParserError::PartialMatch(f, "x.")) if f == 0.0)); - assert!(matches!(f64::extended_parse("0xp"), - Err(ExtendedParserError::PartialMatch(f, "xp")) if f == 0.0)); - assert!(matches!(f64::extended_parse("0xp-2"), - Err(ExtendedParserError::PartialMatch(f, "xp-2")) if f == 0.0)); - assert!(matches!(f64::extended_parse("0x.p-2"), - Err(ExtendedParserError::PartialMatch(f, "x.p-2")) if f == 0.0)); - assert!(matches!(f64::extended_parse("0X"), - Err(ExtendedParserError::PartialMatch(f, "X")) if f == 0.0)); - assert!(matches!(f64::extended_parse("-0x"), - Err(ExtendedParserError::PartialMatch(f, "x")) if f == -0.0)); - assert!(matches!(f64::extended_parse("+0x"), - Err(ExtendedParserError::PartialMatch(f, "x")) if f == 0.0)); - assert!(matches!(f64::extended_parse("-0x."), - Err(ExtendedParserError::PartialMatch(f, "x.")) if f == -0.0)); - assert!(matches!( + assert_eq!( + f64::extended_parse("0x"), + Err(ExtendedParserError::PartialMatch(0.0, "x".to_string())) + ); + assert_eq!( + f64::extended_parse("0x."), + Err(ExtendedParserError::PartialMatch(0.0, "x.".to_string())) + ); + assert_eq!( + f64::extended_parse("0xp"), + Err(ExtendedParserError::PartialMatch(0.0, "xp".to_string())) + ); + assert_eq!( + f64::extended_parse("0xp-2"), + Err(ExtendedParserError::PartialMatch(0.0, "xp-2".to_string())) + ); + assert_eq!( + f64::extended_parse("0x.p-2"), + Err(ExtendedParserError::PartialMatch(0.0, "x.p-2".to_string())) + ); + assert_eq!( + f64::extended_parse("0X"), + Err(ExtendedParserError::PartialMatch(0.0, "X".to_string())) + ); + assert_eq!( + f64::extended_parse("-0x"), + Err(ExtendedParserError::PartialMatch(0.0, "x".to_string())) + ); + assert_eq!( + f64::extended_parse("+0x"), + Err(ExtendedParserError::PartialMatch(0.0, "x".to_string())) + ); + assert_eq!( + f64::extended_parse("-0x."), + Err(ExtendedParserError::PartialMatch(-0.0, "x.".to_string())) + ); + assert_eq!( u64::extended_parse("0x"), - Err(ExtendedParserError::PartialMatch(0, "x")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0, "x".to_string())) + ); + assert_eq!( u64::extended_parse("-0x"), - Err(ExtendedParserError::PartialMatch(0, "x")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0, "x".to_string())) + ); + assert_eq!( i64::extended_parse("0x"), - Err(ExtendedParserError::PartialMatch(0, "x")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0, "x".to_string())) + ); + assert_eq!( i64::extended_parse("-0x"), - Err(ExtendedParserError::PartialMatch(0, "x")) - )); + Err(ExtendedParserError::PartialMatch(0, "x".to_string())) + ); } #[test] @@ -1093,18 +1143,18 @@ mod tests { assert_eq!(Ok(-0o123), i64::extended_parse("-0123")); assert_eq!(Ok(0o123), u64::extended_parse("00123")); assert_eq!(Ok(0), u64::extended_parse("00")); - assert!(matches!( + assert_eq!( u64::extended_parse("008"), - Err(ExtendedParserError::PartialMatch(0, "8")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0, "8".to_string())) + ); + assert_eq!( u64::extended_parse("08"), - Err(ExtendedParserError::PartialMatch(0, "8")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0, "8".to_string())) + ); + assert_eq!( u64::extended_parse("0."), - Err(ExtendedParserError::PartialMatch(0, ".")) - )); + Err(ExtendedParserError::PartialMatch(0, ".".to_string())) + ); // No float tests, leading zeros get parsed as decimal anyway. } @@ -1116,51 +1166,62 @@ mod tests { assert_eq!(Ok(0b1011), u64::extended_parse("+0b1011")); assert_eq!(Ok(-0b1011), i64::extended_parse("-0b1011")); - assert!(matches!( + assert_eq!( u64::extended_parse("0b"), - Err(ExtendedParserError::PartialMatch(0, "b")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0, "b".to_string())) + ); + assert_eq!( u64::extended_parse("0b."), - Err(ExtendedParserError::PartialMatch(0, "b.")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0, "b.".to_string())) + ); + assert_eq!( u64::extended_parse("-0b"), - Err(ExtendedParserError::PartialMatch(0, "b")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0, "b".to_string())) + ); + assert_eq!( i64::extended_parse("0b"), - Err(ExtendedParserError::PartialMatch(0, "b")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0, "b".to_string())) + ); + assert_eq!( i64::extended_parse("-0b"), - Err(ExtendedParserError::PartialMatch(0, "b")) - )); + Err(ExtendedParserError::PartialMatch(0, "b".to_string())) + ); // Binary not allowed for floats - assert!(matches!( + assert_eq!( f64::extended_parse("0b100"), - Err(ExtendedParserError::PartialMatch(0f64, "b100")) - )); - assert!(matches!( + Err(ExtendedParserError::PartialMatch(0f64, "b100".to_string())) + ); + assert_eq!( f64::extended_parse("0b100.1"), - Err(ExtendedParserError::PartialMatch(0f64, "b100.1")) - )); + Err(ExtendedParserError::PartialMatch( + 0f64, + "b100.1".to_string() + )) + ); - assert!(match ExtendedBigDecimal::extended_parse("0b100.1") { - Err(ExtendedParserError::PartialMatch(ebd, "b100.1")) => - ebd == ExtendedBigDecimal::zero(), - _ => false, - }); + assert_eq!( + ExtendedBigDecimal::extended_parse("0b100.1"), + Err(ExtendedParserError::PartialMatch( + ExtendedBigDecimal::zero(), + "b100.1".to_string() + )) + ); - assert!(match ExtendedBigDecimal::extended_parse("0b") { - Err(ExtendedParserError::PartialMatch(ebd, "b")) => ebd == ExtendedBigDecimal::zero(), - _ => false, - }); - assert!(match ExtendedBigDecimal::extended_parse("0b.") { - Err(ExtendedParserError::PartialMatch(ebd, "b.")) => ebd == ExtendedBigDecimal::zero(), - _ => false, - }); + assert_eq!( + ExtendedBigDecimal::extended_parse("0b"), + Err(ExtendedParserError::PartialMatch( + ExtendedBigDecimal::zero(), + "b".to_string() + )) + ); + assert_eq!( + ExtendedBigDecimal::extended_parse("0b."), + Err(ExtendedParserError::PartialMatch( + ExtendedBigDecimal::zero(), + "b.".to_string() + )) + ); } #[test] @@ -1173,15 +1234,15 @@ mod tests { // Ensure that trailing whitespace is still a partial match assert_eq!( - Err(ExtendedParserError::PartialMatch(6, " ")), + Err(ExtendedParserError::PartialMatch(6, " ".to_string())), u64::extended_parse("0x6 ") ); assert_eq!( - Err(ExtendedParserError::PartialMatch(7, "\t")), + Err(ExtendedParserError::PartialMatch(7, "\t".to_string())), u64::extended_parse("0x7\t") ); assert_eq!( - Err(ExtendedParserError::PartialMatch(8, "\n")), + Err(ExtendedParserError::PartialMatch(8, "\n".to_string())), u64::extended_parse("0x8\n") ); From 668cefb2e469c8a7b237fab8fbe2f297d4f665e8 Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Fri, 11 Jul 2025 13:45:39 +0800 Subject: [PATCH 6/7] uucore: format: Separate function to parse numbers starting with a quote After this, we can use a common extract_value function. --- .../src/lib/features/format/argument.rs | 104 ++++++++++++------ 1 file changed, 70 insertions(+), 34 deletions(-) diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index 105092ebf..e1c466640 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -115,42 +115,74 @@ impl<'a> FormatArguments<'a> { } } + // Parse an OsStr that we know to start with a '/" + fn parse_quote_start(os: &OsStr) -> Result> + where + T: ExtendedParser + From + From + Default, + { + // If this fails (this can only happens on Windows), then just + // return NotNumeric. + let s = match os_str_as_bytes(os) { + Ok(s) => s, + Err(_) => return Err(ExtendedParserError::NotNumeric), + }; + + let bytes = match s.split_first() { + Some((b'"', bytes)) | Some((b'\'', bytes)) => bytes, + _ => { + // This really can't happen, the string we are given must start with '/". + debug_assert!(false); + return Err(ExtendedParserError::NotNumeric); + } + }; + + if bytes.is_empty() { + return Err(ExtendedParserError::NotNumeric); + } + + let (val, len) = if let Some(c) = bytes + .utf8_chunks() + .next() + .expect("bytes should not be empty") + .valid() + .chars() + .next() + { + // Valid UTF-8 character, cast the codepoint to u32 then T + // (largest unicode codepoint is only 3 bytes, so this is safe) + ((c as u32).into(), c.len_utf8()) + } else { + // Not a valid UTF-8 character, use the first byte + (bytes[0].into(), 1) + }; + // Emit a warning if there are additional characters + if bytes.len() > len { + return Err(ExtendedParserError::PartialMatch( + val, + String::from_utf8_lossy(&bytes[len..]).to_string(), + )); + } + + Ok(val) + } + fn get_num(os: &OsStr) -> T where T: ExtendedParser + From + From + Default, { - // FIXME: Remove unwrap - let s = os_str_as_bytes(os).unwrap(); - // Check if the string begins with a quote, and is therefore a literal - if let Some((&first, bytes)) = s.split_first() { - if (first == b'"' || first == b'\'') && !bytes.is_empty() { - let (val, len) = if let Some(c) = bytes - .utf8_chunks() - .next() - .expect("bytes should not be empty") - .valid() - .chars() - .next() - { - // Valid UTF-8 character, cast the codepoint to u32 then T - // (largest unicode codepoint is only 3 bytes, so this is safe) - ((c as u32).into(), c.len_utf8()) - } else { - // Not a valid UTF-8 character, use the first byte - (bytes[0].into(), 1) - }; - // Emit a warning if there are additional characters - if bytes.len() > len { - show_warning!( - "{}: character(s) following character constant have been ignored", - String::from_utf8_lossy(&bytes[len..]) - ); - } - return val; - } - } let s = os.to_string_lossy(); - extract_value(T::extended_parse(&s), &s) + let first = s.as_bytes().first().copied(); + + let quote_start = first == Some(b'"') || first == Some(b'\''); + let parsed = if quote_start { + // The string begins with a quote + Self::parse_quote_start(os) + } else { + T::extended_parse(&s) + }; + + // Get the best possible value, even if parsed was an error. + extract_value(parsed, &s, quote_start) } fn get_at_relative_position(&mut self, pos: NonZero) -> Option<&'a FormatArgument> { @@ -172,7 +204,11 @@ impl<'a> FormatArguments<'a> { } } -fn extract_value(p: Result>, input: &str) -> T { +fn extract_value( + p: Result>, + input: &str, + quote_start: bool, +) -> T { match p { Ok(v) => v, Err(e) => { @@ -192,8 +228,8 @@ fn extract_value(p: Result>, input: &str) Default::default() } ExtendedParserError::PartialMatch(v, rest) => { - let bytes = input.as_encoded_bytes(); - if !bytes.is_empty() && (bytes[0] == b'\'' || bytes[0] == b'"') { + if quote_start { + set_exit_code(0); show_warning!( "{rest}: character(s) following character constant have been ignored" ); From ccad817415f8ecaf63c7e261a75dd5bbe6174bb1 Mon Sep 17 00:00:00 2001 From: Nicolas Boichat Date: Fri, 11 Jul 2025 22:35:56 +0800 Subject: [PATCH 7/7] uucore: format: Use .into_owned() for from_utf8_lossy() Suggested by Gemini and I think that's not a bad idea. --- src/uucore/src/lib/features/format/argument.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/uucore/src/lib/features/format/argument.rs b/src/uucore/src/lib/features/format/argument.rs index e1c466640..935f28826 100644 --- a/src/uucore/src/lib/features/format/argument.rs +++ b/src/uucore/src/lib/features/format/argument.rs @@ -159,7 +159,7 @@ impl<'a> FormatArguments<'a> { if bytes.len() > len { return Err(ExtendedParserError::PartialMatch( val, - String::from_utf8_lossy(&bytes[len..]).to_string(), + String::from_utf8_lossy(&bytes[len..]).into_owned(), )); }