Merge pull request #8606 from sylvestre/expr-multibyte
Some checks are pending
Android / Test builds (push) Waiting to run
CICD / Style/cargo-deny (push) Waiting to run
CICD / Build (push) Blocked by required conditions
CICD / Style/deps (push) Waiting to run
CICD / Documentation/warnings (push) Waiting to run
CICD / MinRustV (push) Waiting to run
CICD / Test all features separately (push) Blocked by required conditions
CICD / Dependencies (push) Waiting to run
CICD / Build/Makefile (push) Blocked by required conditions
CICD / Build/stable (push) Blocked by required conditions
CICD / Build/nightly (push) Blocked by required conditions
CICD / Binary sizes (push) Blocked by required conditions
CICD / Tests/BusyBox test suite (push) Blocked by required conditions
CICD / Tests/Toybox test suite (push) Blocked by required conditions
CICD / Code Coverage (push) Waiting to run
CICD / Separate Builds (push) Waiting to run
CICD / Build/SELinux (push) Blocked by required conditions
GnuTests / Run GNU tests (native) (push) Waiting to run
GnuTests / Run GNU tests (SELinux) (push) Waiting to run
GnuTests / Aggregate GNU test results (push) Blocked by required conditions
Code Quality / Style/format (push) Waiting to run
Code Quality / Style/lint (push) Waiting to run
Code Quality / Style/spelling (push) Waiting to run
Code Quality / Style/toml (push) Waiting to run
Code Quality / Style/Python (push) Waiting to run
Code Quality / Pre-commit hooks (push) Waiting to run
Devcontainer / Verify devcontainer (push) Waiting to run
FreeBSD / Style and Lint (push) Waiting to run
WSL2 / Test (push) Waiting to run
FreeBSD / Tests (push) Waiting to run

expr: fix some multibyte issues
This commit is contained in:
Daniel Hofstetter 2025-09-14 15:30:12 +02:00 committed by GitHub
commit aa42a02459
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 462 additions and 104 deletions

View file

@ -3,7 +3,7 @@
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
// spell-checker:ignore (ToDO) ints paren prec multibytes
// spell-checker:ignore (ToDO) ints paren prec multibytes aaaabc
use std::{cell::Cell, collections::BTreeMap};
@ -153,101 +153,9 @@ impl StringOp {
Ok(left)
}
Self::Match => {
let left = String::from_utf8(left?.eval_as_string()).map_err(|u| {
ExprError::UnsupportedNonUtf8Match(
String::from_utf8_lossy(u.as_bytes()).into_owned(),
)
})?;
let right = String::from_utf8(right?.eval_as_string()).map_err(|u| {
ExprError::UnsupportedNonUtf8Match(
String::from_utf8_lossy(u.as_bytes()).into_owned(),
)
})?;
check_posix_regex_errors(&right)?;
// Transpile the input pattern from BRE syntax to `onig` crate's `Syntax::grep`
let mut re_string = String::with_capacity(right.len() + 1);
let mut pattern_chars = right.chars().peekable();
let mut prev = '\0';
let mut prev_is_escaped = false;
let mut is_start_of_expression = true;
// All patterns are anchored so they begin with a caret (^)
if pattern_chars.peek() != Some(&'^') {
re_string.push('^');
}
while let Some(curr) = pattern_chars.next() {
let curr_is_escaped = prev == '\\' && !prev_is_escaped;
let is_first_character = prev == '\0';
match curr {
// Character class negation "[^a]"
// Explicitly escaped caret "\^"
'^' if !is_start_of_expression && !matches!(prev, '[' | '\\') => {
re_string.push_str(r"\^");
}
'$' if !curr_is_escaped && !is_end_of_expression(&pattern_chars) => {
re_string.push_str(r"\$");
}
'\\' if !curr_is_escaped && pattern_chars.peek().is_none() => {
return Err(ExprError::TrailingBackslash);
}
'{' if curr_is_escaped => {
// Handle '{' literally at the start of an expression
if is_start_of_expression {
if re_string.ends_with('\\') {
let _ = re_string.pop();
}
re_string.push(curr);
} else {
// Check if the following section is a valid range quantifier
verify_range_quantifier(&pattern_chars)?;
re_string.push(curr);
// Set the lower bound of range quantifier to 0 if it is missing
if pattern_chars.peek() == Some(&',') {
re_string.push('0');
}
}
}
_ => re_string.push(curr),
}
// Capturing group "\(abc\)"
// Alternative pattern "a\|b"
is_start_of_expression = curr == '\\' && is_first_character
|| curr_is_escaped && matches!(curr, '(' | '|')
|| curr == '\\' && prev_is_escaped && matches!(prev, '(' | '|');
prev_is_escaped = curr_is_escaped;
prev = curr;
}
let re = Regex::with_options(
&re_string,
RegexOptions::REGEX_OPTION_SINGLELINE,
Syntax::grep(),
)
.map_err(|error| match error.code() {
// "invalid repeat range {lower,upper}"
-123 => ExprError::InvalidBracketContent,
// "too big number for repeat range"
-201 => ExprError::TooBigRangeQuantifierIndex,
_ => ExprError::InvalidRegexExpression,
})?;
Ok(if re.captures_len() > 0 {
re.captures(&left)
.and_then(|captures| captures.at(1))
.unwrap_or("")
.to_string()
} else {
re.find(&left)
.map_or("0".to_string(), |(start, end)| (end - start).to_string())
}
.into())
let left_bytes = left?.eval_as_string();
let right_bytes = right?.eval_as_string();
evaluate_match_expression(left_bytes, right_bytes)
}
Self::Index => {
let left = left?.eval_as_string();
@ -369,6 +277,278 @@ fn check_posix_regex_errors(pattern: &str) -> ExprResult<()> {
}
}
/// Build a regex from a BRE pattern, compiled with the current locale's encoding.
///
/// The pattern bytes are decoded as UTF-8 (invalid sequences are replaced
/// lossily), checked with `check_posix_regex_errors`, and transpiled from BRE
/// syntax to the dialect accepted by the `onig` crate's `Syntax::grep`. A leading
/// caret is prepended when the pattern does not already start with one.
///
/// Returns the compiled regex together with the transpiled pattern string.
///
/// # Errors
///
/// * any error reported by `check_posix_regex_errors` or `verify_range_quantifier`
/// * `ExprError::TrailingBackslash` for an unescaped backslash at the end of the pattern
/// * `ExprError::InvalidBracketContent`, `ExprError::TooBigRangeQuantifierIndex` or
///   `ExprError::InvalidRegexExpression` when `onig` rejects the transpiled pattern
fn build_regex(pattern_bytes: Vec<u8>) -> ExprResult<(Regex, String)> {
    use onig::EncodedBytes;
    use uucore::i18n::{UEncoding, get_locale_encoding};

    let encoding = get_locale_encoding();

    // Decode the pattern without copying it: on failure the original bytes are
    // recovered from the error and converted lossily.
    // NOTE(review): this lossy conversion also applies in the non-UTF-8 locale,
    // so invalid bytes in the pattern become U+FFFD before compilation — confirm
    // that this is the intended behavior.
    let pattern_str = String::from_utf8(pattern_bytes)
        .unwrap_or_else(|err| String::from_utf8_lossy(err.as_bytes()).into_owned());
    check_posix_regex_errors(&pattern_str)?;

    // Transpile the input pattern from BRE syntax to `onig` crate's `Syntax::grep`
    let mut re_string = String::with_capacity(pattern_str.len() + 1);
    let mut pattern_chars = pattern_str.chars().peekable();
    let mut prev = '\0';
    let mut prev_is_escaped = false;
    let mut is_start_of_expression = true;

    // All patterns are anchored so they begin with a caret (^)
    if pattern_chars.peek() != Some(&'^') {
        re_string.push('^');
    }

    // `while let` rather than `for`: the loop body needs to peek at the iterator.
    while let Some(curr) = pattern_chars.next() {
        let curr_is_escaped = prev == '\\' && !prev_is_escaped;
        let is_first_character = prev == '\0';

        match curr {
            // Character class negation "[^a]"
            // Explicitly escaped caret "\^"
            '^' if !is_start_of_expression && !matches!(prev, '[' | '\\') => {
                re_string.push_str(r"\^");
            }
            // A dollar sign that is not at the end of an expression is literal
            '$' if !curr_is_escaped && !is_end_of_expression(&pattern_chars) => {
                re_string.push_str(r"\$");
            }
            '\\' if !curr_is_escaped && pattern_chars.peek().is_none() => {
                return Err(ExprError::TrailingBackslash);
            }
            '{' if curr_is_escaped => {
                // Handle '{' literally at the start of an expression
                if is_start_of_expression {
                    // Drop the backslash that was already copied to the output
                    if re_string.ends_with('\\') {
                        let _ = re_string.pop();
                    }
                    re_string.push(curr);
                } else {
                    // Check if the following section is a valid range quantifier
                    verify_range_quantifier(&pattern_chars)?;

                    re_string.push(curr);
                    // Set the lower bound of range quantifier to 0 if it is missing
                    if pattern_chars.peek() == Some(&',') {
                        re_string.push('0');
                    }
                }
            }
            _ => re_string.push(curr),
        }

        // Capturing group "\(abc\)"
        // Alternative pattern "a\|b"
        is_start_of_expression = curr == '\\' && is_first_character
            || curr_is_escaped && matches!(curr, '(' | '|')
            || curr == '\\' && prev_is_escaped && matches!(prev, '(' | '|');
        prev_is_escaped = curr_is_escaped;
        prev = curr;
    }

    // Create regex with proper encoding
    let re = match encoding {
        UEncoding::Utf8 => {
            // For UTF-8 locale, use UTF-8 encoding
            Regex::with_options_and_encoding(
                &re_string,
                RegexOptions::REGEX_OPTION_SINGLELINE,
                Syntax::grep(),
            )
        }
        UEncoding::Ascii => {
            // For non-UTF-8 locale, use ASCII encoding
            Regex::with_options_and_encoding(
                EncodedBytes::ascii(re_string.as_bytes()),
                RegexOptions::REGEX_OPTION_SINGLELINE,
                Syntax::grep(),
            )
        }
    }
    .map_err(|error| match error.code() {
        // "invalid repeat range {lower,upper}"
        -123 => ExprError::InvalidBracketContent,
        // "too big number for repeat range"
        -201 => ExprError::TooBigRangeQuantifierIndex,
        _ => ExprError::InvalidRegexExpression,
    })?;

    Ok((re, re_string))
}
/// Search `left_bytes` with the compiled regex and render the `expr` result.
///
/// * Pattern with capture groups: the text of the first group, or an empty
///   string when nothing matched or the group did not take part in the match.
/// * Pattern without groups: the length of the match ("0" when nothing
///   matched) — counted in characters for valid UTF-8 input in a UTF-8 locale
///   and in bytes in the ASCII locale.
///
/// Input that is not valid UTF-8 in a UTF-8 locale is searched bytewise with an
/// ASCII-encoded regex rebuilt from `re_string`; in that case a pattern without
/// groups always yields "0", and a captured group is returned only when its
/// bytes are valid UTF-8.
fn find_match(regex: Regex, re_string: String, left_bytes: Vec<u8>) -> ExprResult<String> {
    use onig::EncodedBytes;
    use uucore::i18n::{UEncoding, get_locale_encoding};

    /// Output used whenever there is nothing to report from a search.
    fn unmatched_output(has_groups: bool) -> String {
        if has_groups {
            String::new()
        } else {
            "0".to_string()
        }
    }

    let locale_encoding = get_locale_encoding();
    let mut region = onig::Region::new();
    let haystack_len = left_bytes.len();

    let output = match locale_encoding {
        UEncoding::Utf8 => match std::str::from_utf8(&left_bytes) {
            // Well-formed input: search it as UTF-8 text.
            Ok(text) => {
                let has_groups = regex.captures_len() > 0;
                let hit = regex.search_with_encoding(
                    text,
                    0,
                    text.len(),
                    onig::SearchOptions::SEARCH_OPTION_NONE,
                    Some(&mut region),
                );
                match (hit, has_groups) {
                    (None, _) => unmatched_output(has_groups),
                    // Text of the first capture group
                    (Some(_), true) => region
                        .pos(1)
                        .map(|(from, to)| text[from..to].to_string())
                        .unwrap_or_default(),
                    // Number of characters covered by the whole match
                    (Some(_), false) => {
                        let (from, to) = region.pos(0).unwrap();
                        text[from..to].chars().count().to_string()
                    }
                }
            }
            // Malformed input in a UTF-8 locale: fall back to a bytewise search
            // with an ASCII-encoded copy of the regex.
            Err(_) => {
                let byte_regex = Regex::with_options_and_encoding(
                    EncodedBytes::ascii(re_string.as_bytes()),
                    RegexOptions::REGEX_OPTION_SINGLELINE,
                    Syntax::grep(),
                );
                match byte_regex {
                    // The ASCII regex could not be built: treat it as "no match"
                    Err(_) => unmatched_output(regex.captures_len() > 0),
                    Ok(byte_regex) => {
                        let has_groups = byte_regex.captures_len() > 0;
                        let hit = byte_regex.search_with_encoding(
                            EncodedBytes::ascii(&left_bytes),
                            0,
                            haystack_len,
                            onig::SearchOptions::SEARCH_OPTION_NONE,
                            Some(&mut region),
                        );
                        if hit.is_some() && has_groups {
                            // A capture is only reported when its bytes are valid UTF-8
                            region
                                .pos(1)
                                .map(|(from, to)| {
                                    match std::str::from_utf8(&left_bytes[from..to]) {
                                        Ok(valid) => valid.to_owned(),
                                        Err(_) => String::new(),
                                    }
                                })
                                .unwrap_or_default()
                        } else {
                            // Without groups the answer is "0" whether or not the
                            // bytewise search found something.
                            unmatched_output(has_groups)
                        }
                    }
                }
            }
        },
        // ASCII/C locale: every byte is one character.
        UEncoding::Ascii => {
            let has_groups = regex.captures_len() > 0;
            let hit = regex.search_with_encoding(
                EncodedBytes::ascii(&left_bytes),
                0,
                haystack_len,
                onig::SearchOptions::SEARCH_OPTION_NONE,
                Some(&mut region),
            );
            match (hit, has_groups) {
                (None, _) => unmatched_output(has_groups),
                // First capture group, converted lossily to fit into a `String`
                (Some(_), true) => region
                    .pos(1)
                    .map(|(from, to)| String::from_utf8_lossy(&left_bytes[from..to]).into_owned())
                    .unwrap_or_default(),
                // Number of bytes covered by the whole match
                (Some(_), false) => {
                    let (from, to) = region.pos(0).unwrap();
                    (to - from).to_string()
                }
            }
        }
    };

    Ok(output)
}
/// Evaluate `left : right` (the `match` operator) with locale-aware regex matching.
///
/// `right_bytes` is compiled with `build_regex` and matched against
/// `left_bytes`. In the ASCII locale a pattern with capture groups is handled
/// here so that the first group is returned as raw bytes; every other case is
/// delegated to `find_match`.
///
/// # Errors
///
/// Propagates the errors of `build_regex` and `find_match`.
fn evaluate_match_expression(left_bytes: Vec<u8>, right_bytes: Vec<u8>) -> ExprResult<NumOrStr> {
    use uucore::i18n::{UEncoding, get_locale_encoding};

    let (regex, re_string) = build_regex(right_bytes)?;

    // Special case for ASCII locale with capture groups that need to return raw bytes
    if matches!(get_locale_encoding(), UEncoding::Ascii) && regex.captures_len() > 0 {
        let mut region = onig::Region::new();
        let matched = regex
            .search_with_encoding(
                onig::EncodedBytes::ascii(&left_bytes),
                0,
                left_bytes.len(),
                onig::SearchOptions::SEARCH_OPTION_NONE,
                Some(&mut region),
            )
            .is_some();

        // Only consult the region after a successful search.
        let first_group = if matched { region.pos(1) } else { None };
        let value: NumOrStr = match first_group {
            Some((start, end)) => MaybeNonUtf8String::from(left_bytes[start..end].to_vec()).into(),
            // No match, or the group did not participate: `find_match` would
            // repeat the same search just to produce an empty string, so
            // return that result directly.
            None => String::new().into(),
        };
        return Ok(value);
    }

    let result = find_match(regex, re_string, left_bytes)?;
    Ok(result.into())
}
/// Precedence for infix binary operators
const PRECEDENCE: &[&[(&MaybeNonUtf8Str, BinOp)]] = &[
&[(b"|", BinOp::String(StringOp::Or))],
@ -1038,4 +1218,187 @@ mod test {
Err(ExprError::TooBigRangeQuantifierIndex)
);
}
#[test]
fn test_evaluate_match_expression_basic() {
    use super::evaluate_match_expression;

    // (input, pattern, expected output)
    let cases: &[(&[u8], &[u8], &[u8])] = &[
        // A literal pattern equal to the input yields the input length
        (b"hello", b"hello", b"5"),
        // A pattern that does not match yields 0
        (b"hello", b"world", b"0"),
        // A match of a prefix only counts the matched part
        (b"hello world", b"hello", b"5"),
    ];

    for &(input, pattern, expected) in cases {
        let outcome = evaluate_match_expression(input.to_vec(), pattern.to_vec()).unwrap();
        assert_eq!(outcome.eval_as_string(), expected);
    }
}
#[test]
fn test_evaluate_match_expression_regex_patterns() {
    use super::evaluate_match_expression;

    // (input, pattern, expected output)
    let cases: &[(&[u8], &[u8], &[u8])] = &[
        // `.` stands for any single character
        (b"abc", b"a.c", b"3"),
        // `*` repeats the preceding item
        (b"aaaabc", b"a*bc", b"6"),
        // `\+` is the BRE spelling of the plus quantifier
        (b"aaaabc", b"a\\+bc", b"6"),
        // `\?` is the BRE spelling of the optional quantifier
        (b"abc", b"ab\\?c", b"3"),
    ];

    for &(input, pattern, expected) in cases {
        let outcome = evaluate_match_expression(input.to_vec(), pattern.to_vec()).unwrap();
        assert_eq!(outcome.eval_as_string(), expected);
    }
}
#[test]
fn test_evaluate_match_expression_capture_groups() {
    use super::evaluate_match_expression;

    // (input, pattern, expected output)
    let cases: &[(&[u8], &[u8], &[u8])] = &[
        // The content of the group is returned
        (b"hello123", b"hello\\([0-9]*\\)", b"123"),
        // A group that matched nothing gives an empty string
        (b"hello", b"hello\\([0-9]*\\)", b""),
        // Without a group the length of the match is returned
        (b"hello123", b"hello[0-9]*", b"8"),
    ];

    for &(input, pattern, expected) in cases {
        let outcome = evaluate_match_expression(input.to_vec(), pattern.to_vec()).unwrap();
        assert_eq!(outcome.eval_as_string(), expected);
    }
}
#[test]
fn test_evaluate_match_expression_character_classes() {
    use super::evaluate_match_expression;

    // (input, pattern, expected output)
    let cases: &[(&[u8], &[u8], &[u8])] = &[
        // Plain bracket expression
        (b"abc123", b"[a-z]*", b"3"),
        // Negated bracket expression
        (b"123abc", b"[^a-z]*", b"3"),
        // Bracket expression covering the digits
        (b"123abc", b"[0-9]*", b"3"),
    ];

    for &(input, pattern, expected) in cases {
        let outcome = evaluate_match_expression(input.to_vec(), pattern.to_vec()).unwrap();
        assert_eq!(outcome.eval_as_string(), expected);
    }
}
#[test]
fn test_evaluate_match_expression_anchoring() {
    use super::evaluate_match_expression;

    // (input, pattern, expected output)
    let cases: &[(&[u8], &[u8], &[u8])] = &[
        // The pattern is implicitly anchored at the start of the input
        (b"world hello", b"hello", b"0"),
        // An explicit leading caret is accepted as well
        (b"hello world", b"^hello", b"5"),
        // `world$` cannot match here because "world" is not at the start
        (b"hello world", b"world$", b"0"),
        // Trailing `$` anchors the match at the end of the input
        (b"world", b"world$", b"5"),
    ];

    for &(input, pattern, expected) in cases {
        let outcome = evaluate_match_expression(input.to_vec(), pattern.to_vec()).unwrap();
        assert_eq!(outcome.eval_as_string(), expected);
    }
}
#[test]
fn test_evaluate_match_expression_special_characters() {
    use super::evaluate_match_expression;

    // (input, pattern, expected output)
    let cases: &[(&[u8], &[u8], &[u8])] = &[
        // An escaped dot matches a literal dot
        (b"a.b", b"a\\.b", b"3"),
        // An escaped asterisk matches a literal asterisk
        (b"a*b", b"a\\*b", b"3"),
        // A caret in the middle of the pattern is taken literally
        (b"a^b", b"a^b", b"3"),
        // A dollar sign in the middle of the pattern is taken literally
        (b"a$b", b"a$b", b"3"),
    ];

    for &(input, pattern, expected) in cases {
        let outcome = evaluate_match_expression(input.to_vec(), pattern.to_vec()).unwrap();
        assert_eq!(outcome.eval_as_string(), expected);
    }
}
#[test]
fn test_evaluate_match_expression_range_quantifiers() {
    use super::evaluate_match_expression;

    // (input, pattern, expected output)
    let cases: &[(&[u8], &[u8], &[u8])] = &[
        // Exact repetition count
        (b"aaa", b"a\\{3\\}", b"3"),
        // Lower and upper bound
        (b"aa", b"a\\{1,3\\}", b"2"),
        // Lower bound only
        (b"aaaa", b"a\\{2,\\}", b"4"),
        // Upper bound only
        (b"aa", b"a\\{,3\\}", b"2"),
    ];

    for &(input, pattern, expected) in cases {
        let outcome = evaluate_match_expression(input.to_vec(), pattern.to_vec()).unwrap();
        assert_eq!(outcome.eval_as_string(), expected);
    }
}
#[test]
fn test_evaluate_match_expression_empty_and_edge_cases() {
    use super::evaluate_match_expression;

    // (input, pattern, expected output)
    let cases: &[(&[u8], &[u8], &[u8])] = &[
        // Empty input: `.*` has nothing to consume, so the length is 0
        (b"", b".*", b"0"),
        // Empty pattern against empty input also reports 0
        (b"", b"", b"0"),
        // `.*` consumes the whole non-empty input
        (b"hello", b".*", b"5"),
    ];

    for &(input, pattern, expected) in cases {
        let outcome = evaluate_match_expression(input.to_vec(), pattern.to_vec()).unwrap();
        assert_eq!(outcome.eval_as_string(), expected);
    }
}
#[test]
fn test_evaluate_match_expression_error_cases() {
    use super::evaluate_match_expression;

    // Every case matches against the same input; only the pattern varies.
    let eval = |pattern: &[u8]| evaluate_match_expression(b"hello".to_vec(), pattern.to_vec());

    // `\(` without a closing `\)`
    assert!(matches!(
        eval(b"\\(hello"),
        Err(ExprError::UnmatchedOpeningParenthesis)
    ));

    // `\)` without an opening `\(`
    assert!(matches!(
        eval(b"hello\\)"),
        Err(ExprError::UnmatchedClosingParenthesis)
    ));

    // Pattern ending in a lone backslash
    assert!(matches!(eval(b"hello\\"), Err(ExprError::TrailingBackslash)));

    // Range quantifier with non-numeric content
    assert!(matches!(
        eval(b"a\\{invalid\\}"),
        Err(ExprError::InvalidBracketContent)
    ));
}
}

View file

@ -1869,11 +1869,10 @@ mod gnu_expr_multibyte {
// The regex engine should match the '.' to the first multibyte character.
#[test]
#[ignore = "not implemented"]
fn test_m3() {
let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b".bc"];
let cases = &[TestCase::FR.out("3"), TestCase::C.code(1)];
let cases = &[TestCase::FR.out("3"), TestCase::C.out("0").code(1)];
for tc in cases {
check_test_case(args, tc);
@ -1883,7 +1882,6 @@ mod gnu_expr_multibyte {
// The opposite of the previous test: two dots should only match the two
// octets in single-byte locale.
#[test]
#[ignore = "not implemented"]
fn test_m4() {
let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b"..bc"];
@ -1896,11 +1894,10 @@ mod gnu_expr_multibyte {
// Match with grouping - a single dot should return the two octets
#[test]
#[ignore = "not implemented"]
fn test_m5() {
let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b"\\(.b\\)c"];
let cases = &[TestCase::FR.out(b"\xCE\xB1b"), TestCase::C.code(1)];
let cases = &[TestCase::FR.out(b"\xCE\xB1b"), TestCase::C.out("").code(1)];
for tc in cases {
check_test_case(args, tc);
@ -1910,11 +1907,10 @@ mod gnu_expr_multibyte {
// Invalid multibyte sequences - regex should not match in multibyte locale
// (POSIX requirement)
#[test]
#[ignore = "not implemented"]
fn test_m6() {
let args: &[&[u8]] = &[b"match", b"\xCEbc\xCE\xB4ef", b"\\(.\\)"];
let cases = &[TestCase::FR.code(1), TestCase::C.out(b"\xCE")];
let cases = &[TestCase::FR.out("").code(1), TestCase::C.out(b"\xCE")];
for tc in cases {
check_test_case(args, tc);
@ -1926,7 +1922,6 @@ mod gnu_expr_multibyte {
// In the single byte case, the regex engine sees two octets in the
// character class ('\xCE' and '\xB1') - and it matches the first one.
#[test]
#[ignore = "not implemented"]
fn test_m7() {
let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b"\\(.\\)"];