expr: Handle non UTF8 inputs

This commit is contained in:
Dorian Peron 2025-07-03 22:32:53 +02:00
parent 4b78537432
commit a669a84801
5 changed files with 114 additions and 63 deletions

View file

@ -63,3 +63,4 @@ expr-error-unmatched-opening-brace = Unmatched {"\\{"}
expr-error-invalid-bracket-content = Invalid content of {"\\{\\}"}
expr-error-trailing-backslash = Trailing backslash
expr-error-too-big-range-quantifier-index = Regular expression too big
expr-error-match-utf8 = match does not support invalid UTF-8 encoding in { $arg }

View file

@ -63,3 +63,4 @@ expr-error-unmatched-opening-brace = Accolade ouvrante {"\\{"} non appariée
expr-error-invalid-bracket-content = Contenu invalide de {"\\{\\}"}
expr-error-trailing-backslash = Barre oblique inverse en fin
expr-error-too-big-range-quantifier-index = Expression régulière trop grande
expr-error-match-utf8 = match ne supporte pas l'encodage UTF-8 invalide dans { $arg }

View file

@ -5,9 +5,11 @@
use clap::{Arg, ArgAction, Command};
use std::collections::HashMap;
use std::io::Write;
use syntax_tree::{AstNode, is_truthy};
use thiserror::Error;
use uucore::locale::{get_message, get_message_with_args};
use uucore::os_string_to_vec;
use uucore::{
display::Quotable,
error::{UError, UResult},
@ -54,6 +56,8 @@ pub enum ExprError {
TrailingBackslash,
#[error("{}", get_message("expr-error-too-big-range-quantifier-index"))]
TooBigRangeQuantifierIndex,
#[error("{}", get_message_with_args("expr-error-match-utf8", HashMap::from([("arg".to_string(), _0.quote().to_string())])))]
UnsupportedNonUtf8Match(String),
}
impl UError for ExprError {
@ -98,25 +102,27 @@ pub fn uu_app() -> Command {
pub fn uumain(args: impl uucore::Args) -> UResult<()> {
// For expr utility we do not want getopts.
// The following usage should work without escaping hyphens: `expr -15 = 1 + 2 \* \( 3 - -4 \)`
let args: Vec<String> = args
let args = args
.skip(1) // Skip binary name
.map(|a| a.to_string_lossy().to_string())
.collect();
.map(os_string_to_vec)
.collect::<Result<Vec<_>, _>>()?;
if args.len() == 1 && args[0] == "--help" {
if args.len() == 1 && args[0] == b"--help" {
let _ = uu_app().print_help();
} else if args.len() == 1 && args[0] == "--version" {
} else if args.len() == 1 && args[0] == b"--version" {
println!("{} {}", uucore::util_name(), uucore::crate_version!());
} else {
// The first argument may be "--" and should be ignored.
let args = if !args.is_empty() && args[0] == "--" {
let args = if !args.is_empty() && args[0] == b"--" {
&args[1..]
} else {
&args
};
let res: String = AstNode::parse(args)?.eval()?.eval_as_string();
println!("{res}");
let res = AstNode::parse(args)?.eval()?.eval_as_string();
let _ = std::io::stdout().write_all(&res);
let _ = std::io::stdout().write_all(b"\n");
if !is_truthy(&res.into()) {
return Err(1.into());
}

View file

@ -7,12 +7,15 @@
use std::{cell::Cell, collections::BTreeMap};
use num_bigint::{BigInt, ParseBigIntError};
use num_bigint::BigInt;
use num_traits::ToPrimitive;
use onig::{Regex, RegexOptions, Syntax};
use crate::{ExprError, ExprResult};
/// Owned byte string whose contents are not required to be valid UTF-8.
pub(crate) type MaybeNonUtf8String = Vec<u8>;
/// Borrowed counterpart of [`MaybeNonUtf8String`]: a plain byte slice.
pub(crate) type MaybeNonUtf8Str = [u8];
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BinOp {
Relation(RelationOp),
@ -65,7 +68,7 @@ impl RelationOp {
fn eval(&self, a: ExprResult<NumOrStr>, b: ExprResult<NumOrStr>) -> ExprResult<NumOrStr> {
let a = a?;
let b = b?;
let b = if let (Ok(a), Ok(b)) = (&a.to_bigint(), &b.to_bigint()) {
let b = if let (Some(a), Some(b)) = (&a.to_bigint(), &b.to_bigint()) {
match self {
Self::Lt => a < b,
Self::Leq => a <= b,
@ -147,8 +150,17 @@ impl StringOp {
Ok(left)
}
Self::Match => {
let left = left?.eval_as_string();
let right = right?.eval_as_string();
let left = String::from_utf8(left?.eval_as_string()).map_err(|u| {
ExprError::UnsupportedNonUtf8Match(
String::from_utf8_lossy(u.as_bytes()).into_owned(),
)
})?;
let right = String::from_utf8(right?.eval_as_string()).map_err(|u| {
ExprError::UnsupportedNonUtf8Match(
String::from_utf8_lossy(u.as_bytes()).into_owned(),
)
})?;
check_posix_regex_errors(&right)?;
// Transpile the input pattern from BRE syntax to `onig` crate's `Syntax::grep`
@ -237,8 +249,8 @@ impl StringOp {
Self::Index => {
let left = left?.eval_as_string();
let right = right?.eval_as_string();
for (current_idx, ch_h) in left.chars().enumerate() {
for ch_n in right.to_string().chars() {
for (current_idx, ch_h) in left.iter().enumerate() {
for ch_n in &right {
if ch_n == ch_h {
return Ok((current_idx + 1).into());
}
@ -361,33 +373,33 @@ fn check_posix_regex_errors(pattern: &str) -> ExprResult<()> {
}
/// Precedence for infix binary operators
const PRECEDENCE: &[&[(&str, BinOp)]] = &[
&[("|", BinOp::String(StringOp::Or))],
&[("&", BinOp::String(StringOp::And))],
const PRECEDENCE: &[&[(&MaybeNonUtf8Str, BinOp)]] = &[
&[(b"|", BinOp::String(StringOp::Or))],
&[(b"&", BinOp::String(StringOp::And))],
&[
("<", BinOp::Relation(RelationOp::Lt)),
("<=", BinOp::Relation(RelationOp::Leq)),
("=", BinOp::Relation(RelationOp::Eq)),
("!=", BinOp::Relation(RelationOp::Neq)),
(">=", BinOp::Relation(RelationOp::Geq)),
(">", BinOp::Relation(RelationOp::Gt)),
(b"<", BinOp::Relation(RelationOp::Lt)),
(b"<=", BinOp::Relation(RelationOp::Leq)),
(b"=", BinOp::Relation(RelationOp::Eq)),
(b"!=", BinOp::Relation(RelationOp::Neq)),
(b">=", BinOp::Relation(RelationOp::Geq)),
(b">", BinOp::Relation(RelationOp::Gt)),
],
&[
("+", BinOp::Numeric(NumericOp::Add)),
("-", BinOp::Numeric(NumericOp::Sub)),
(b"+", BinOp::Numeric(NumericOp::Add)),
(b"-", BinOp::Numeric(NumericOp::Sub)),
],
&[
("*", BinOp::Numeric(NumericOp::Mul)),
("/", BinOp::Numeric(NumericOp::Div)),
("%", BinOp::Numeric(NumericOp::Mod)),
(b"*", BinOp::Numeric(NumericOp::Mul)),
(b"/", BinOp::Numeric(NumericOp::Div)),
(b"%", BinOp::Numeric(NumericOp::Mod)),
],
&[(":", BinOp::String(StringOp::Match))],
&[(b":", BinOp::String(StringOp::Match))],
];
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NumOrStr {
Num(BigInt),
Str(String),
Str(MaybeNonUtf8String),
}
impl From<usize> for NumOrStr {
@ -404,30 +416,37 @@ impl From<BigInt> for NumOrStr {
impl From<String> for NumOrStr {
    /// Wraps an owned UTF-8 string as the `Str` variant, keeping its raw bytes.
    fn from(text: String) -> Self {
        Self::Str(text.into_bytes())
    }
}
impl From<MaybeNonUtf8String> for NumOrStr {
fn from(str: MaybeNonUtf8String) -> Self {
Self::Str(str)
}
}
impl NumOrStr {
pub fn to_bigint(&self) -> Result<BigInt, ParseBigIntError> {
pub fn to_bigint(&self) -> Option<BigInt> {
match self {
Self::Num(num) => Ok(num.clone()),
Self::Str(str) => str.parse::<BigInt>(),
Self::Num(num) => Some(num.clone()),
Self::Str(str) => std::str::from_utf8(str).ok()?.parse::<BigInt>().ok(),
}
}
pub fn eval_as_bigint(self) -> ExprResult<BigInt> {
match self {
Self::Num(num) => Ok(num),
Self::Str(str) => str
Self::Str(str) => String::from_utf8(str)
.map_err(|_| ExprError::NonIntegerArgument)?
.parse::<BigInt>()
.map_err(|_| ExprError::NonIntegerArgument),
}
}
pub fn eval_as_string(self) -> String {
pub fn eval_as_string(self) -> MaybeNonUtf8String {
match self {
Self::Num(num) => num.to_string(),
Self::Num(num) => num.to_string().into(),
Self::Str(str) => str,
}
}
@ -447,7 +466,7 @@ pub enum AstNodeInner {
value: NumOrStr,
},
Leaf {
value: String,
value: MaybeNonUtf8String,
},
BinOp {
op_type: BinOp,
@ -465,7 +484,7 @@ pub enum AstNodeInner {
}
impl AstNode {
pub fn parse(input: &[impl AsRef<str>]) -> ExprResult<Self> {
pub fn parse(input: &[impl AsRef<MaybeNonUtf8Str>]) -> ExprResult<Self> {
Parser::new(input).parse()
}
@ -492,7 +511,7 @@ impl AstNode {
result_stack.insert(node.id, Ok(value.clone()));
}
AstNodeInner::Leaf { value, .. } => {
result_stack.insert(node.id, Ok(value.to_string().into()));
result_stack.insert(node.id, Ok(value.to_owned().into()));
}
AstNodeInner::BinOp {
op_type,
@ -529,7 +548,7 @@ impl AstNode {
continue;
};
let string: String = string?.eval_as_string();
let string: MaybeNonUtf8String = string?.eval_as_string();
// The GNU docs say:
//
@ -550,7 +569,11 @@ impl AstNode {
.unwrap_or(0);
if let (Some(pos), Some(_)) = (pos.checked_sub(1), length.checked_sub(1)) {
let result = string.chars().skip(pos).take(length).collect::<String>();
let result = string
.into_iter()
.skip(pos)
.take(length)
.collect::<MaybeNonUtf8String>();
result_stack.insert(node.id, Ok(result.into()));
} else {
result_stack.insert(node.id, Ok(String::new().into()));
@ -565,7 +588,7 @@ impl AstNode {
continue;
};
let length = string?.eval_as_string().chars().count();
let length = string?.eval_as_string().iter().count();
result_stack.insert(node.id, Ok(length.into()));
}
}
@ -591,17 +614,17 @@ fn get_next_id() -> u32 {
})
}
struct Parser<'a, S: AsRef<str>> {
struct Parser<'a, S: AsRef<MaybeNonUtf8Str>> {
input: &'a [S],
index: usize,
}
impl<'a, S: AsRef<str>> Parser<'a, S> {
impl<'a, S: AsRef<MaybeNonUtf8Str>> Parser<'a, S> {
fn new(input: &'a [S]) -> Self {
Self { input, index: 0 }
}
fn next(&mut self) -> ExprResult<&'a str> {
fn next(&mut self) -> ExprResult<&'a MaybeNonUtf8Str> {
let next = self.input.get(self.index);
if let Some(next) = next {
self.index += 1;
@ -610,12 +633,12 @@ impl<'a, S: AsRef<str>> Parser<'a, S> {
// The indexing won't panic, because we know that the input size
// is greater than zero.
Err(ExprError::MissingArgument(
self.input[self.index - 1].as_ref().into(),
String::from_utf8_lossy(self.input[self.index - 1].as_ref()).into_owned(),
))
}
}
fn accept<T>(&mut self, f: impl Fn(&str) -> Option<T>) -> Option<T> {
fn accept<T>(&mut self, f: impl Fn(&MaybeNonUtf8Str) -> Option<T>) -> Option<T> {
let next = self.input.get(self.index)?;
let tok = f(next.as_ref());
if let Some(tok) = tok {
@ -632,7 +655,9 @@ impl<'a, S: AsRef<str>> Parser<'a, S> {
}
let res = self.parse_expression()?;
if let Some(arg) = self.input.get(self.index) {
return Err(ExprError::UnexpectedArgument(arg.as_ref().into()));
return Err(ExprError::UnexpectedArgument(
String::from_utf8_lossy(arg.as_ref()).into_owned(),
));
}
Ok(res)
}
@ -675,7 +700,7 @@ impl<'a, S: AsRef<str>> Parser<'a, S> {
fn parse_simple_expression(&mut self) -> ExprResult<AstNode> {
let first = self.next()?;
let inner = match first {
"match" => {
b"match" => {
let left = self.parse_simple_expression()?;
let right = self.parse_simple_expression()?;
AstNodeInner::BinOp {
@ -684,7 +709,7 @@ impl<'a, S: AsRef<str>> Parser<'a, S> {
right: Box::new(right),
}
}
"substr" => {
b"substr" => {
let string = self.parse_simple_expression()?;
let pos = self.parse_simple_expression()?;
let length = self.parse_simple_expression()?;
@ -694,7 +719,7 @@ impl<'a, S: AsRef<str>> Parser<'a, S> {
length: Box::new(length),
}
}
"index" => {
b"index" => {
let left = self.parse_simple_expression()?;
let right = self.parse_simple_expression()?;
AstNodeInner::BinOp {
@ -703,32 +728,32 @@ impl<'a, S: AsRef<str>> Parser<'a, S> {
right: Box::new(right),
}
}
"length" => {
b"length" => {
let string = self.parse_simple_expression()?;
AstNodeInner::Length {
string: Box::new(string),
}
}
"+" => AstNodeInner::Leaf {
b"+" => AstNodeInner::Leaf {
value: self.next()?.into(),
},
"(" => {
b"(" => {
// Evaluate the node just after parsing so we detect arithmetic
// errors before checking for the closing parenthesis.
let s = self.parse_expression()?.evaluated()?;
match self.next() {
Ok(")") => {}
Ok(b")") => {}
// Since we have parsed at least a '(', there will be a token
// at `self.index - 1`. So this indexing won't panic.
Ok(_) => {
return Err(ExprError::ExpectedClosingBraceInsteadOf(
self.input[self.index - 1].as_ref().into(),
String::from_utf8_lossy(self.input[self.index - 1].as_ref()).into(),
));
}
Err(ExprError::MissingArgument(_)) => {
return Err(ExprError::ExpectedClosingBraceAfter(
self.input[self.index - 1].as_ref().into(),
String::from_utf8_lossy(self.input[self.index - 1].as_ref()).into(),
));
}
Err(e) => return Err(e),
@ -752,11 +777,11 @@ pub fn is_truthy(s: &NumOrStr) -> bool {
NumOrStr::Num(num) => num != &BigInt::from(0),
NumOrStr::Str(str) => {
// Edge case: `-` followed by nothing is truthy
if str == "-" {
if str == b"-" {
return true;
}
let mut bytes = str.bytes();
let mut bytes = str.iter().copied();
// Empty string is falsy
let Some(first) = bytes.next() else {
@ -922,7 +947,7 @@ mod test {
.unwrap()
.eval()
.unwrap();
assert_eq!(result.eval_as_string(), "");
assert_eq!(result.eval_as_string(), b"");
}
#[test]
@ -931,13 +956,13 @@ mod test {
.unwrap()
.eval()
.unwrap();
assert_eq!(result.eval_as_string(), "0");
assert_eq!(result.eval_as_string(), b"0");
let result = AstNode::parse(&["*cats", ":", r"*cats"])
.unwrap()
.eval()
.unwrap();
assert_eq!(result.eval_as_string(), "5");
assert_eq!(result.eval_as_string(), b"5");
}
#[test]
@ -946,7 +971,7 @@ mod test {
.unwrap()
.eval()
.unwrap();
assert_eq!(result.eval_as_string(), "0");
assert_eq!(result.eval_as_string(), b"0");
}
#[test]

View file

@ -377,6 +377,24 @@ pub fn os_string_from_vec(vec: Vec<u8>) -> mods::error::UResult<OsString> {
Ok(s)
}
/// Turns an `OsString` into its underlying bytes.
///
/// On unix platforms the bytes are taken as-is, so the conversion cannot fail.
/// On every other platform the value is first converted to a `String`, which
/// requires it to be valid UTF-8.
///
/// # Errors
///
/// On non-unix platforms, returns a usage error (exit code 1) when the
/// argument cannot be converted to a `String`.
pub fn os_string_to_vec(s: OsString) -> mods::error::UResult<Vec<u8>> {
    #[cfg(unix)]
    let bytes = s.into_vec();

    #[cfg(not(unix))]
    let bytes = {
        let text = s.into_string().map_err(|_| {
            mods::error::UUsageError::new(1, "invalid UTF-8 was detected in one or more arguments")
        })?;
        text.into_bytes()
    };

    Ok(bytes)
}
/// Equivalent to `std::BufRead::lines` which outputs each line as a `Vec<u8>`,
/// which avoids panicking on non UTF-8 input.
pub fn read_byte_lines<R: std::io::Read>(