diff --git a/src/uu/expr/locales/en-US.ftl b/src/uu/expr/locales/en-US.ftl index 8d26566e9..2c09eee78 100644 --- a/src/uu/expr/locales/en-US.ftl +++ b/src/uu/expr/locales/en-US.ftl @@ -63,3 +63,4 @@ expr-error-unmatched-opening-brace = Unmatched {"\\{"} expr-error-invalid-bracket-content = Invalid content of {"\\{\\}"} expr-error-trailing-backslash = Trailing backslash expr-error-too-big-range-quantifier-index = Regular expression too big +expr-error-match-utf8 = match does not support invalid UTF-8 encoding in { $arg } diff --git a/src/uu/expr/locales/fr-FR.ftl b/src/uu/expr/locales/fr-FR.ftl index f496b270c..b529db9d5 100644 --- a/src/uu/expr/locales/fr-FR.ftl +++ b/src/uu/expr/locales/fr-FR.ftl @@ -63,3 +63,4 @@ expr-error-unmatched-opening-brace = Accolade ouvrante {"\\{"} non appariée expr-error-invalid-bracket-content = Contenu invalide de {"\\{\\}"} expr-error-trailing-backslash = Barre oblique inverse en fin expr-error-too-big-range-quantifier-index = Expression régulière trop grande +expr-error-match-utf8 = match ne supporte pas l'encodage UTF-8 invalide dans { $arg } diff --git a/src/uu/expr/src/expr.rs b/src/uu/expr/src/expr.rs index 7225f986c..99eb87381 100644 --- a/src/uu/expr/src/expr.rs +++ b/src/uu/expr/src/expr.rs @@ -5,9 +5,11 @@ use clap::{Arg, ArgAction, Command}; use std::collections::HashMap; +use std::io::Write; use syntax_tree::{AstNode, is_truthy}; use thiserror::Error; use uucore::locale::{get_message, get_message_with_args}; +use uucore::os_string_to_vec; use uucore::{ display::Quotable, error::{UError, UResult}, @@ -54,6 +56,8 @@ pub enum ExprError { TrailingBackslash, #[error("{}", get_message("expr-error-too-big-range-quantifier-index"))] TooBigRangeQuantifierIndex, + #[error("{}", get_message_with_args("expr-error-match-utf8", HashMap::from([("arg".to_string(), _0.quote().to_string())])))] + UnsupportedNonUtf8Match(String), } impl UError for ExprError { @@ -98,25 +102,27 @@ pub fn uu_app() -> Command { pub fn uumain(args: impl uucore::Args) -> UResult<()> { // For expr utility we do not want getopts. // The following usage should work without escaping hyphens: `expr -15 = 1 + 2 \* \( 3 - -4 \)` - let args: Vec = args + let args = args .skip(1) // Skip binary name - .map(|a| a.to_string_lossy().to_string()) - .collect(); + .map(os_string_to_vec) + .collect::, _>>()?; - if args.len() == 1 && args[0] == "--help" { + if args.len() == 1 && args[0] == b"--help" { let _ = uu_app().print_help(); - } else if args.len() == 1 && args[0] == "--version" { + } else if args.len() == 1 && args[0] == b"--version" { println!("{} {}", uucore::util_name(), uucore::crate_version!()); } else { // The first argument may be "--" and should be be ignored. - let args = if !args.is_empty() && args[0] == "--" { + let args = if !args.is_empty() && args[0] == b"--" { &args[1..] } else { &args }; - let res: String = AstNode::parse(args)?.eval()?.eval_as_string(); - println!("{res}"); + let res = AstNode::parse(args)?.eval()?.eval_as_string(); + let _ = std::io::stdout().write_all(&res); + let _ = std::io::stdout().write_all(b"\n"); + if !is_truthy(&res.into()) { return Err(1.into()); } diff --git a/src/uu/expr/src/syntax_tree.rs b/src/uu/expr/src/syntax_tree.rs index b0ae0142f..56b6f08d4 100644 --- a/src/uu/expr/src/syntax_tree.rs +++ b/src/uu/expr/src/syntax_tree.rs @@ -7,12 +7,15 @@ use std::{cell::Cell, collections::BTreeMap}; -use num_bigint::{BigInt, ParseBigIntError}; +use num_bigint::BigInt; use num_traits::ToPrimitive; use onig::{Regex, RegexOptions, Syntax}; use crate::{ExprError, ExprResult}; +pub(crate) type MaybeNonUtf8String = Vec; +pub(crate) type MaybeNonUtf8Str = [u8]; + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum BinOp { Relation(RelationOp), @@ -65,7 +68,7 @@ impl RelationOp { fn eval(&self, a: ExprResult, b: ExprResult) -> ExprResult { let a = a?; let b = b?; - let b = if let (Ok(a), Ok(b)) = (&a.to_bigint(), &b.to_bigint()) { + let b = if let (Some(a), Some(b)) = (&a.to_bigint(), &b.to_bigint()) { match self { Self::Lt => a < b, Self::Leq => a <= b, @@ -147,8 +150,17 @@ impl StringOp { Ok(left) } Self::Match => { - let left = left?.eval_as_string(); - let right = right?.eval_as_string(); + let left = String::from_utf8(left?.eval_as_string()).map_err(|u| { + ExprError::UnsupportedNonUtf8Match( + String::from_utf8_lossy(u.as_bytes()).into_owned(), + ) + })?; + let right = String::from_utf8(right?.eval_as_string()).map_err(|u| { + ExprError::UnsupportedNonUtf8Match( + String::from_utf8_lossy(u.as_bytes()).into_owned(), + ) + })?; + check_posix_regex_errors(&right)?; // Transpile the input pattern from BRE syntax to `onig` crate's `Syntax::grep` @@ -237,8 +249,8 @@ impl StringOp { Self::Index => { let left = left?.eval_as_string(); let right = right?.eval_as_string(); - for (current_idx, ch_h) in left.chars().enumerate() { - for ch_n in right.to_string().chars() { + for (current_idx, ch_h) in left.iter().enumerate() { + for ch_n in &right { if ch_n == ch_h { return Ok((current_idx + 1).into()); } @@ -361,33 +373,33 @@ fn check_posix_regex_errors(pattern: &str) -> ExprResult<()> { } /// Precedence for infix binary operators -const PRECEDENCE: &[&[(&str, BinOp)]] = &[ - &[("|", BinOp::String(StringOp::Or))], - &[("&", BinOp::String(StringOp::And))], +const PRECEDENCE: &[&[(&MaybeNonUtf8Str, BinOp)]] = &[ + &[(b"|", BinOp::String(StringOp::Or))], + &[(b"&", BinOp::String(StringOp::And))], &[ - ("<", BinOp::Relation(RelationOp::Lt)), - ("<=", BinOp::Relation(RelationOp::Leq)), - ("=", BinOp::Relation(RelationOp::Eq)), - ("!=", BinOp::Relation(RelationOp::Neq)), - (">=", BinOp::Relation(RelationOp::Geq)), - (">", BinOp::Relation(RelationOp::Gt)), + (b"<", BinOp::Relation(RelationOp::Lt)), + (b"<=", BinOp::Relation(RelationOp::Leq)), + (b"=", BinOp::Relation(RelationOp::Eq)), + (b"!=", BinOp::Relation(RelationOp::Neq)), + (b">=", BinOp::Relation(RelationOp::Geq)), + (b">", BinOp::Relation(RelationOp::Gt)), ], &[ - ("+", BinOp::Numeric(NumericOp::Add)), - ("-", BinOp::Numeric(NumericOp::Sub)), + (b"+", BinOp::Numeric(NumericOp::Add)), + (b"-", BinOp::Numeric(NumericOp::Sub)), ], &[ - ("*", BinOp::Numeric(NumericOp::Mul)), - ("/", BinOp::Numeric(NumericOp::Div)), - ("%", BinOp::Numeric(NumericOp::Mod)), + (b"*", BinOp::Numeric(NumericOp::Mul)), + (b"/", BinOp::Numeric(NumericOp::Div)), + (b"%", BinOp::Numeric(NumericOp::Mod)), ], - &[(":", BinOp::String(StringOp::Match))], + &[(b":", BinOp::String(StringOp::Match))], ]; #[derive(Debug, Clone, PartialEq, Eq)] pub enum NumOrStr { Num(BigInt), - Str(String), + Str(MaybeNonUtf8String), } impl From for NumOrStr { @@ -404,30 +416,37 @@ impl From for NumOrStr { impl From for NumOrStr { fn from(str: String) -> Self { + Self::Str(str.into()) + } +} + +impl From for NumOrStr { + fn from(str: MaybeNonUtf8String) -> Self { Self::Str(str) } } impl NumOrStr { - pub fn to_bigint(&self) -> Result { + pub fn to_bigint(&self) -> Option { match self { - Self::Num(num) => Ok(num.clone()), - Self::Str(str) => str.parse::(), + Self::Num(num) => Some(num.clone()), + Self::Str(str) => std::str::from_utf8(str).ok()?.parse::().ok(), } } pub fn eval_as_bigint(self) -> ExprResult { match self { Self::Num(num) => Ok(num), - Self::Str(str) => str + Self::Str(str) => String::from_utf8(str) + .map_err(|_| ExprError::NonIntegerArgument)? .parse::() .map_err(|_| ExprError::NonIntegerArgument), } } - pub fn eval_as_string(self) -> String { + pub fn eval_as_string(self) -> MaybeNonUtf8String { match self { - Self::Num(num) => num.to_string(), + Self::Num(num) => num.to_string().into(), Self::Str(str) => str, } } @@ -447,7 +466,7 @@ pub enum AstNodeInner { value: NumOrStr, }, Leaf { - value: String, + value: MaybeNonUtf8String, }, BinOp { op_type: BinOp, @@ -465,7 +484,7 @@ pub enum AstNodeInner { } impl AstNode { - pub fn parse(input: &[impl AsRef]) -> ExprResult { + pub fn parse(input: &[impl AsRef]) -> ExprResult { Parser::new(input).parse() } @@ -492,7 +511,7 @@ impl AstNode { result_stack.insert(node.id, Ok(value.clone())); } AstNodeInner::Leaf { value, .. } => { - result_stack.insert(node.id, Ok(value.to_string().into())); + result_stack.insert(node.id, Ok(value.to_owned().into())); } AstNodeInner::BinOp { op_type, @@ -529,7 +548,7 @@ impl AstNode { continue; }; - let string: String = string?.eval_as_string(); + let string: MaybeNonUtf8String = string?.eval_as_string(); // The GNU docs say: // @@ -550,7 +569,11 @@ impl AstNode { .unwrap_or(0); if let (Some(pos), Some(_)) = (pos.checked_sub(1), length.checked_sub(1)) { - let result = string.chars().skip(pos).take(length).collect::(); + let result = string + .into_iter() + .skip(pos) + .take(length) + .collect::(); result_stack.insert(node.id, Ok(result.into())); } else { result_stack.insert(node.id, Ok(String::new().into())); @@ -565,7 +588,7 @@ impl AstNode { continue; }; - let length = string?.eval_as_string().chars().count(); + let length = string?.eval_as_string().iter().count(); result_stack.insert(node.id, Ok(length.into())); } } @@ -591,17 +614,17 @@ fn get_next_id() -> u32 { }) } -struct Parser<'a, S: AsRef> { +struct Parser<'a, S: AsRef> { input: &'a [S], index: usize, } -impl<'a, S: AsRef> Parser<'a, S> { +impl<'a, S: AsRef> Parser<'a, S> { fn new(input: &'a [S]) -> Self { Self { input, index: 0 } } - fn next(&mut self) -> ExprResult<&'a str> { + fn next(&mut self) -> ExprResult<&'a MaybeNonUtf8Str> { let next = self.input.get(self.index); if let Some(next) = next { self.index += 1; @@ -610,12 +633,12 @@ impl<'a, S: AsRef> Parser<'a, S> { // The indexing won't panic, because we know that the input size // is greater than zero. Err(ExprError::MissingArgument( - self.input[self.index - 1].as_ref().into(), + String::from_utf8_lossy(self.input[self.index - 1].as_ref()).into_owned(), )) } } - fn accept(&mut self, f: impl Fn(&str) -> Option) -> Option { + fn accept(&mut self, f: impl Fn(&MaybeNonUtf8Str) -> Option) -> Option { let next = self.input.get(self.index)?; let tok = f(next.as_ref()); if let Some(tok) = tok { @@ -632,7 +655,9 @@ impl<'a, S: AsRef> Parser<'a, S> { } let res = self.parse_expression()?; if let Some(arg) = self.input.get(self.index) { - return Err(ExprError::UnexpectedArgument(arg.as_ref().into())); + return Err(ExprError::UnexpectedArgument( + String::from_utf8_lossy(arg.as_ref()).into_owned(), + )); } Ok(res) } @@ -675,7 +700,7 @@ impl<'a, S: AsRef> Parser<'a, S> { fn parse_simple_expression(&mut self) -> ExprResult { let first = self.next()?; let inner = match first { - "match" => { + b"match" => { let left = self.parse_simple_expression()?; let right = self.parse_simple_expression()?; AstNodeInner::BinOp { @@ -684,7 +709,7 @@ impl<'a, S: AsRef> Parser<'a, S> { right: Box::new(right), } } - "substr" => { + b"substr" => { let string = self.parse_simple_expression()?; let pos = self.parse_simple_expression()?; let length = self.parse_simple_expression()?; @@ -694,7 +719,7 @@ impl<'a, S: AsRef> Parser<'a, S> { length: Box::new(length), } } - "index" => { + b"index" => { let left = self.parse_simple_expression()?; let right = self.parse_simple_expression()?; AstNodeInner::BinOp { @@ -703,32 +728,32 @@ impl<'a, S: AsRef> Parser<'a, S> { right: Box::new(right), } } - "length" => { + b"length" => { let string = self.parse_simple_expression()?; AstNodeInner::Length { string: Box::new(string), } } - "+" => AstNodeInner::Leaf { + b"+" => AstNodeInner::Leaf { value: self.next()?.into(), }, - "(" => { + b"(" => { // Evaluate the node just after parsing to we detect arithmetic // errors before checking for the closing parenthesis. let s = self.parse_expression()?.evaluated()?; match self.next() { - Ok(")") => {} + Ok(b")") => {} // Since we have parsed at least a '(', there will be a token // at `self.index - 1`. So this indexing won't panic. Ok(_) => { return Err(ExprError::ExpectedClosingBraceInsteadOf( - self.input[self.index - 1].as_ref().into(), + String::from_utf8_lossy(self.input[self.index - 1].as_ref()).into(), )); } Err(ExprError::MissingArgument(_)) => { return Err(ExprError::ExpectedClosingBraceAfter( - self.input[self.index - 1].as_ref().into(), + String::from_utf8_lossy(self.input[self.index - 1].as_ref()).into(), )); } Err(e) => return Err(e), @@ -752,11 +777,11 @@ pub fn is_truthy(s: &NumOrStr) -> bool { NumOrStr::Num(num) => num != &BigInt::from(0), NumOrStr::Str(str) => { // Edge case: `-` followed by nothing is truthy - if str == "-" { + if str == b"-" { return true; } - let mut bytes = str.bytes(); + let mut bytes = str.iter().copied(); // Empty string is falsy let Some(first) = bytes.next() else { @@ -922,7 +947,7 @@ mod test { .unwrap() .eval() .unwrap(); - assert_eq!(result.eval_as_string(), ""); + assert_eq!(result.eval_as_string(), b""); } #[test] @@ -931,13 +956,13 @@ mod test { .unwrap() .eval() .unwrap(); - assert_eq!(result.eval_as_string(), "0"); + assert_eq!(result.eval_as_string(), b"0"); let result = AstNode::parse(&["*cats", ":", r"*cats"]) .unwrap() .eval() .unwrap(); - assert_eq!(result.eval_as_string(), "5"); + assert_eq!(result.eval_as_string(), b"5"); } #[test] @@ -946,7 +971,7 @@ mod test { .unwrap() .eval() .unwrap(); - assert_eq!(result.eval_as_string(), "0"); + assert_eq!(result.eval_as_string(), b"0"); } #[test] diff --git a/src/uucore/src/lib/lib.rs b/src/uucore/src/lib/lib.rs index dafcdfca4..d81fd05ed 100644 --- a/src/uucore/src/lib/lib.rs +++ b/src/uucore/src/lib/lib.rs @@ -377,6 +377,24 @@ pub fn os_string_from_vec(vec: Vec) -> mods::error::UResult { Ok(s) } +/// Converts an `OsString` into a `Vec`, parsing as UTF-8 on non-unix platforms. +/// +/// This always succeeds on unix platforms, +/// and fails on other platforms if the bytes can't be parsed as UTF-8. +pub fn os_string_to_vec(s: OsString) -> mods::error::UResult> { + #[cfg(unix)] + let v = s.into_vec(); + #[cfg(not(unix))] + let v = s + .into_string() + .map_err(|_| { + mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments") + })? + .into(); + + Ok(v) +} + /// Equivalent to `std::BufRead::lines` which outputs each line as a `Vec`, /// which avoids panicking on non UTF-8 input. pub fn read_byte_lines(