mirror of
https://github.com/uutils/coreutils.git
synced 2025-12-23 08:47:37 +00:00
Merge pull request #8606 from sylvestre/expr-multibyte
Some checks are pending
Android / Test builds (push) Waiting to run
CICD / Style/cargo-deny (push) Waiting to run
CICD / Build (push) Blocked by required conditions
CICD / Style/deps (push) Waiting to run
CICD / Documentation/warnings (push) Waiting to run
CICD / MinRustV (push) Waiting to run
CICD / Test all features separately (push) Blocked by required conditions
CICD / Dependencies (push) Waiting to run
CICD / Build/Makefile (push) Blocked by required conditions
CICD / Build/stable (push) Blocked by required conditions
CICD / Build/nightly (push) Blocked by required conditions
CICD / Binary sizes (push) Blocked by required conditions
CICD / Tests/BusyBox test suite (push) Blocked by required conditions
CICD / Tests/Toybox test suite (push) Blocked by required conditions
CICD / Code Coverage (push) Waiting to run
CICD / Separate Builds (push) Waiting to run
CICD / Build/SELinux (push) Blocked by required conditions
GnuTests / Run GNU tests (native) (push) Waiting to run
GnuTests / Run GNU tests (SELinux) (push) Waiting to run
GnuTests / Aggregate GNU test results (push) Blocked by required conditions
Code Quality / Style/format (push) Waiting to run
Code Quality / Style/lint (push) Waiting to run
Code Quality / Style/spelling (push) Waiting to run
Code Quality / Style/toml (push) Waiting to run
Code Quality / Style/Python (push) Waiting to run
Code Quality / Pre-commit hooks (push) Waiting to run
Devcontainer / Verify devcontainer (push) Waiting to run
FreeBSD / Style and Lint (push) Waiting to run
WSL2 / Test (push) Waiting to run
FreeBSD / Tests (push) Waiting to run
Some checks are pending
Android / Test builds (push) Waiting to run
CICD / Style/cargo-deny (push) Waiting to run
CICD / Build (push) Blocked by required conditions
CICD / Style/deps (push) Waiting to run
CICD / Documentation/warnings (push) Waiting to run
CICD / MinRustV (push) Waiting to run
CICD / Test all features separately (push) Blocked by required conditions
CICD / Dependencies (push) Waiting to run
CICD / Build/Makefile (push) Blocked by required conditions
CICD / Build/stable (push) Blocked by required conditions
CICD / Build/nightly (push) Blocked by required conditions
CICD / Binary sizes (push) Blocked by required conditions
CICD / Tests/BusyBox test suite (push) Blocked by required conditions
CICD / Tests/Toybox test suite (push) Blocked by required conditions
CICD / Code Coverage (push) Waiting to run
CICD / Separate Builds (push) Waiting to run
CICD / Build/SELinux (push) Blocked by required conditions
GnuTests / Run GNU tests (native) (push) Waiting to run
GnuTests / Run GNU tests (SELinux) (push) Waiting to run
GnuTests / Aggregate GNU test results (push) Blocked by required conditions
Code Quality / Style/format (push) Waiting to run
Code Quality / Style/lint (push) Waiting to run
Code Quality / Style/spelling (push) Waiting to run
Code Quality / Style/toml (push) Waiting to run
Code Quality / Style/Python (push) Waiting to run
Code Quality / Pre-commit hooks (push) Waiting to run
Devcontainer / Verify devcontainer (push) Waiting to run
FreeBSD / Style and Lint (push) Waiting to run
WSL2 / Test (push) Waiting to run
FreeBSD / Tests (push) Waiting to run
expr: fix some multibyte issues
This commit is contained in:
commit
aa42a02459
2 changed files with 462 additions and 104 deletions
|
|
@ -3,7 +3,7 @@
|
|||
// For the full copyright and license information, please view the LICENSE
|
||||
// file that was distributed with this source code.
|
||||
|
||||
// spell-checker:ignore (ToDO) ints paren prec multibytes
|
||||
// spell-checker:ignore (ToDO) ints paren prec multibytes aaaabc
|
||||
|
||||
use std::{cell::Cell, collections::BTreeMap};
|
||||
|
||||
|
|
@ -153,101 +153,9 @@ impl StringOp {
|
|||
Ok(left)
|
||||
}
|
||||
Self::Match => {
|
||||
let left = String::from_utf8(left?.eval_as_string()).map_err(|u| {
|
||||
ExprError::UnsupportedNonUtf8Match(
|
||||
String::from_utf8_lossy(u.as_bytes()).into_owned(),
|
||||
)
|
||||
})?;
|
||||
let right = String::from_utf8(right?.eval_as_string()).map_err(|u| {
|
||||
ExprError::UnsupportedNonUtf8Match(
|
||||
String::from_utf8_lossy(u.as_bytes()).into_owned(),
|
||||
)
|
||||
})?;
|
||||
|
||||
check_posix_regex_errors(&right)?;
|
||||
|
||||
// Transpile the input pattern from BRE syntax to `onig` crate's `Syntax::grep`
|
||||
let mut re_string = String::with_capacity(right.len() + 1);
|
||||
let mut pattern_chars = right.chars().peekable();
|
||||
let mut prev = '\0';
|
||||
let mut prev_is_escaped = false;
|
||||
let mut is_start_of_expression = true;
|
||||
|
||||
// All patterns are anchored so they begin with a caret (^)
|
||||
if pattern_chars.peek() != Some(&'^') {
|
||||
re_string.push('^');
|
||||
}
|
||||
|
||||
while let Some(curr) = pattern_chars.next() {
|
||||
let curr_is_escaped = prev == '\\' && !prev_is_escaped;
|
||||
let is_first_character = prev == '\0';
|
||||
|
||||
match curr {
|
||||
// Character class negation "[^a]"
|
||||
// Explicitly escaped caret "\^"
|
||||
'^' if !is_start_of_expression && !matches!(prev, '[' | '\\') => {
|
||||
re_string.push_str(r"\^");
|
||||
}
|
||||
'$' if !curr_is_escaped && !is_end_of_expression(&pattern_chars) => {
|
||||
re_string.push_str(r"\$");
|
||||
}
|
||||
'\\' if !curr_is_escaped && pattern_chars.peek().is_none() => {
|
||||
return Err(ExprError::TrailingBackslash);
|
||||
}
|
||||
'{' if curr_is_escaped => {
|
||||
// Handle '{' literally at the start of an expression
|
||||
if is_start_of_expression {
|
||||
if re_string.ends_with('\\') {
|
||||
let _ = re_string.pop();
|
||||
}
|
||||
re_string.push(curr);
|
||||
} else {
|
||||
// Check if the following section is a valid range quantifier
|
||||
verify_range_quantifier(&pattern_chars)?;
|
||||
|
||||
re_string.push(curr);
|
||||
// Set the lower bound of range quantifier to 0 if it is missing
|
||||
if pattern_chars.peek() == Some(&',') {
|
||||
re_string.push('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => re_string.push(curr),
|
||||
}
|
||||
|
||||
// Capturing group "\(abc\)"
|
||||
// Alternative pattern "a\|b"
|
||||
is_start_of_expression = curr == '\\' && is_first_character
|
||||
|| curr_is_escaped && matches!(curr, '(' | '|')
|
||||
|| curr == '\\' && prev_is_escaped && matches!(prev, '(' | '|');
|
||||
|
||||
prev_is_escaped = curr_is_escaped;
|
||||
prev = curr;
|
||||
}
|
||||
|
||||
let re = Regex::with_options(
|
||||
&re_string,
|
||||
RegexOptions::REGEX_OPTION_SINGLELINE,
|
||||
Syntax::grep(),
|
||||
)
|
||||
.map_err(|error| match error.code() {
|
||||
// "invalid repeat range {lower,upper}"
|
||||
-123 => ExprError::InvalidBracketContent,
|
||||
// "too big number for repeat range"
|
||||
-201 => ExprError::TooBigRangeQuantifierIndex,
|
||||
_ => ExprError::InvalidRegexExpression,
|
||||
})?;
|
||||
|
||||
Ok(if re.captures_len() > 0 {
|
||||
re.captures(&left)
|
||||
.and_then(|captures| captures.at(1))
|
||||
.unwrap_or("")
|
||||
.to_string()
|
||||
} else {
|
||||
re.find(&left)
|
||||
.map_or("0".to_string(), |(start, end)| (end - start).to_string())
|
||||
}
|
||||
.into())
|
||||
let left_bytes = left?.eval_as_string();
|
||||
let right_bytes = right?.eval_as_string();
|
||||
evaluate_match_expression(left_bytes, right_bytes)
|
||||
}
|
||||
Self::Index => {
|
||||
let left = left?.eval_as_string();
|
||||
|
|
@ -369,6 +277,278 @@ fn check_posix_regex_errors(pattern: &str) -> ExprResult<()> {
|
|||
}
|
||||
}
|
||||
|
||||
/// Build a regex from a pattern string with locale-aware encoding
|
||||
fn build_regex(pattern_bytes: Vec<u8>) -> ExprResult<(Regex, String)> {
|
||||
use onig::EncodedBytes;
|
||||
use uucore::i18n::{UEncoding, get_locale_encoding};
|
||||
|
||||
let encoding = get_locale_encoding();
|
||||
|
||||
// For pattern processing, we need to handle it based on locale
|
||||
let pattern_str = String::from_utf8(pattern_bytes.clone())
|
||||
.unwrap_or_else(|_| String::from_utf8_lossy(&pattern_bytes).into());
|
||||
check_posix_regex_errors(&pattern_str)?;
|
||||
|
||||
// Transpile the input pattern from BRE syntax to `onig` crate's `Syntax::grep`
|
||||
let mut re_string = String::with_capacity(pattern_str.len() + 1);
|
||||
let mut pattern_chars = pattern_str.chars().peekable();
|
||||
let mut prev = '\0';
|
||||
let mut prev_is_escaped = false;
|
||||
let mut is_start_of_expression = true;
|
||||
|
||||
// All patterns are anchored so they begin with a caret (^)
|
||||
if pattern_chars.peek() != Some(&'^') {
|
||||
re_string.push('^');
|
||||
}
|
||||
|
||||
while let Some(curr) = pattern_chars.next() {
|
||||
let curr_is_escaped = prev == '\\' && !prev_is_escaped;
|
||||
let is_first_character = prev == '\0';
|
||||
|
||||
match curr {
|
||||
// Character class negation "[^a]"
|
||||
// Explicitly escaped caret "\^"
|
||||
'^' if !is_start_of_expression && !matches!(prev, '[' | '\\') => {
|
||||
re_string.push_str(r"\^");
|
||||
}
|
||||
'$' if !curr_is_escaped && !is_end_of_expression(&pattern_chars) => {
|
||||
re_string.push_str(r"\$");
|
||||
}
|
||||
'\\' if !curr_is_escaped && pattern_chars.peek().is_none() => {
|
||||
return Err(ExprError::TrailingBackslash);
|
||||
}
|
||||
'{' if curr_is_escaped => {
|
||||
// Handle '{' literally at the start of an expression
|
||||
if is_start_of_expression {
|
||||
if re_string.ends_with('\\') {
|
||||
let _ = re_string.pop();
|
||||
}
|
||||
re_string.push(curr);
|
||||
} else {
|
||||
// Check if the following section is a valid range quantifier
|
||||
verify_range_quantifier(&pattern_chars)?;
|
||||
|
||||
re_string.push(curr);
|
||||
// Set the lower bound of range quantifier to 0 if it is missing
|
||||
if pattern_chars.peek() == Some(&',') {
|
||||
re_string.push('0');
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => re_string.push(curr),
|
||||
}
|
||||
|
||||
// Capturing group "\(abc\)"
|
||||
// Alternative pattern "a\|b"
|
||||
is_start_of_expression = curr == '\\' && is_first_character
|
||||
|| curr_is_escaped && matches!(curr, '(' | '|')
|
||||
|| curr == '\\' && prev_is_escaped && matches!(prev, '(' | '|');
|
||||
|
||||
prev_is_escaped = curr_is_escaped;
|
||||
prev = curr;
|
||||
}
|
||||
|
||||
// Create regex with proper encoding
|
||||
let re = match encoding {
|
||||
UEncoding::Utf8 => {
|
||||
// For UTF-8 locale, use UTF-8 encoding
|
||||
Regex::with_options_and_encoding(
|
||||
&re_string,
|
||||
RegexOptions::REGEX_OPTION_SINGLELINE,
|
||||
Syntax::grep(),
|
||||
)
|
||||
}
|
||||
UEncoding::Ascii => {
|
||||
// For non-UTF-8 locale, use ASCII encoding
|
||||
Regex::with_options_and_encoding(
|
||||
EncodedBytes::ascii(re_string.as_bytes()),
|
||||
RegexOptions::REGEX_OPTION_SINGLELINE,
|
||||
Syntax::grep(),
|
||||
)
|
||||
}
|
||||
}
|
||||
.map_err(|error| match error.code() {
|
||||
// "invalid repeat range {lower,upper}"
|
||||
-123 => ExprError::InvalidBracketContent,
|
||||
// "too big number for repeat range"
|
||||
-201 => ExprError::TooBigRangeQuantifierIndex,
|
||||
_ => ExprError::InvalidRegexExpression,
|
||||
})?;
|
||||
|
||||
Ok((re, re_string))
|
||||
}
|
||||
|
||||
/// Find matches in the input using the compiled regex
|
||||
fn find_match(regex: Regex, re_string: String, left_bytes: Vec<u8>) -> ExprResult<String> {
|
||||
use onig::EncodedBytes;
|
||||
use uucore::i18n::{UEncoding, get_locale_encoding};
|
||||
|
||||
let encoding = get_locale_encoding();
|
||||
|
||||
// Match against the input using the appropriate encoding
|
||||
let mut region = onig::Region::new();
|
||||
let result = match encoding {
|
||||
UEncoding::Utf8 => {
|
||||
// In UTF-8 locale, check if input is valid UTF-8
|
||||
if let Ok(left_str) = std::str::from_utf8(&left_bytes) {
|
||||
// Valid UTF-8, match as UTF-8
|
||||
let pos = regex.search_with_encoding(
|
||||
left_str,
|
||||
0,
|
||||
left_str.len(),
|
||||
onig::SearchOptions::SEARCH_OPTION_NONE,
|
||||
Some(&mut region),
|
||||
);
|
||||
|
||||
if pos.is_some() {
|
||||
if regex.captures_len() > 0 {
|
||||
// Get first capture group
|
||||
region
|
||||
.pos(1)
|
||||
.map(|(start, end)| left_str[start..end].to_string())
|
||||
.unwrap_or_default()
|
||||
} else {
|
||||
// Count characters in the match
|
||||
let (start, end) = region.pos(0).unwrap();
|
||||
left_str[start..end].chars().count().to_string()
|
||||
}
|
||||
} else {
|
||||
// No match
|
||||
if regex.captures_len() > 0 {
|
||||
String::new()
|
||||
} else {
|
||||
"0".to_string()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Invalid UTF-8 in UTF-8 locale
|
||||
// Try to match as bytes using ASCII encoding
|
||||
let left_encoded = EncodedBytes::ascii(&left_bytes);
|
||||
// Need to create ASCII version of regex too
|
||||
let re_ascii = Regex::with_options_and_encoding(
|
||||
EncodedBytes::ascii(re_string.as_bytes()),
|
||||
RegexOptions::REGEX_OPTION_SINGLELINE,
|
||||
Syntax::grep(),
|
||||
)
|
||||
.ok();
|
||||
|
||||
if let Some(re_ascii) = re_ascii {
|
||||
let pos = re_ascii.search_with_encoding(
|
||||
left_encoded,
|
||||
0,
|
||||
left_bytes.len(),
|
||||
onig::SearchOptions::SEARCH_OPTION_NONE,
|
||||
Some(&mut region),
|
||||
);
|
||||
|
||||
if pos.is_some() {
|
||||
if re_ascii.captures_len() > 0 {
|
||||
// Get first capture group
|
||||
region
|
||||
.pos(1)
|
||||
.map(|(start, end)| {
|
||||
// Return empty string for invalid UTF-8 capture in UTF-8 locale
|
||||
if std::str::from_utf8(&left_bytes[start..end]).is_err() {
|
||||
String::new()
|
||||
} else {
|
||||
String::from_utf8_lossy(&left_bytes[start..end])
|
||||
.into_owned()
|
||||
}
|
||||
})
|
||||
.unwrap_or_default()
|
||||
} else {
|
||||
// No capture groups - return 0 for invalid UTF-8 in UTF-8 locale
|
||||
"0".to_string()
|
||||
}
|
||||
} else {
|
||||
// No match
|
||||
if re_ascii.captures_len() > 0 {
|
||||
String::new()
|
||||
} else {
|
||||
"0".to_string()
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Couldn't create ASCII regex - no match
|
||||
if regex.captures_len() > 0 {
|
||||
String::new()
|
||||
} else {
|
||||
"0".to_string()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
UEncoding::Ascii => {
|
||||
// In ASCII/C locale, work with bytes directly
|
||||
let left_encoded = EncodedBytes::ascii(&left_bytes);
|
||||
let pos = regex.search_with_encoding(
|
||||
left_encoded,
|
||||
0,
|
||||
left_bytes.len(),
|
||||
onig::SearchOptions::SEARCH_OPTION_NONE,
|
||||
Some(&mut region),
|
||||
);
|
||||
|
||||
if pos.is_some() {
|
||||
if regex.captures_len() > 0 {
|
||||
// Get first capture group - return raw bytes for C locale
|
||||
if let Some((start, end)) = region.pos(1) {
|
||||
let capture_bytes = &left_bytes[start..end];
|
||||
// Return raw bytes as String for consistency with other cases
|
||||
return Ok(String::from_utf8_lossy(capture_bytes).into_owned());
|
||||
}
|
||||
String::new()
|
||||
} else {
|
||||
// Return byte count of match
|
||||
let (start, end) = region.pos(0).unwrap();
|
||||
(end - start).to_string()
|
||||
}
|
||||
} else {
|
||||
// No match
|
||||
if regex.captures_len() > 0 {
|
||||
String::new()
|
||||
} else {
|
||||
"0".to_string()
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Ok(result)
|
||||
}
|
||||
|
||||
/// Evaluate a match expression with locale-aware regex matching
|
||||
fn evaluate_match_expression(left_bytes: Vec<u8>, right_bytes: Vec<u8>) -> ExprResult<NumOrStr> {
|
||||
let (regex, re_string) = build_regex(right_bytes)?;
|
||||
|
||||
// Special case for ASCII locale with capture groups that need to return raw bytes
|
||||
use uucore::i18n::{UEncoding, get_locale_encoding};
|
||||
let encoding = get_locale_encoding();
|
||||
|
||||
if matches!(encoding, UEncoding::Ascii) && regex.captures_len() > 0 {
|
||||
// Try to find the actual capture bytes for ASCII locale
|
||||
let mut region = onig::Region::new();
|
||||
let left_encoded = onig::EncodedBytes::ascii(&left_bytes);
|
||||
let pos = regex.search_with_encoding(
|
||||
left_encoded,
|
||||
0,
|
||||
left_bytes.len(),
|
||||
onig::SearchOptions::SEARCH_OPTION_NONE,
|
||||
Some(&mut region),
|
||||
);
|
||||
|
||||
if pos.is_some() {
|
||||
if let Some((start, end)) = region.pos(1) {
|
||||
let capture_bytes = &left_bytes[start..end];
|
||||
return Ok(MaybeNonUtf8String::from(capture_bytes.to_vec()).into());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
let result = find_match(regex, re_string, left_bytes)?;
|
||||
Ok(result.into())
|
||||
}
|
||||
|
||||
/// Precedence for infix binary operators
|
||||
const PRECEDENCE: &[&[(&MaybeNonUtf8Str, BinOp)]] = &[
|
||||
&[(b"|", BinOp::String(StringOp::Or))],
|
||||
|
|
@ -1038,4 +1218,187 @@ mod test {
|
|||
Err(ExprError::TooBigRangeQuantifierIndex)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn test_evaluate_match_expression_basic() {
    use super::evaluate_match_expression;

    // Evaluates `input : pattern` and yields the result rendered as bytes.
    let eval = |input: &[u8], pattern: &[u8]| {
        evaluate_match_expression(input.to_vec(), pattern.to_vec())
            .unwrap()
            .eval_as_string()
    };

    // A literal pattern equal to the input matches all of it
    assert_eq!(eval(b"hello", b"hello"), b"5");

    // A different literal does not match
    assert_eq!(eval(b"hello", b"world"), b"0");

    // A pattern matching only a prefix reports the prefix length
    assert_eq!(eval(b"hello world", b"hello"), b"5");
}
|
||||
|
||||
#[test]
fn test_evaluate_match_expression_regex_patterns() {
    use super::evaluate_match_expression;

    // Evaluates `input : pattern` and yields the result rendered as bytes.
    let eval = |input: &[u8], pattern: &[u8]| {
        evaluate_match_expression(input.to_vec(), pattern.to_vec())
            .unwrap()
            .eval_as_string()
    };

    // `.` stands for any single character
    assert_eq!(eval(b"abc", b"a.c"), b"3");

    // `*` quantifier
    assert_eq!(eval(b"aaaabc", b"a*bc"), b"6");

    // `\+` quantifier (escaped in BRE)
    assert_eq!(eval(b"aaaabc", b"a\\+bc"), b"6");

    // `\?` quantifier (escaped in BRE)
    assert_eq!(eval(b"abc", b"ab\\?c"), b"3");
}
|
||||
|
||||
#[test]
fn test_evaluate_match_expression_capture_groups() {
    use super::evaluate_match_expression;

    // Evaluates `input : pattern` and yields the result rendered as bytes.
    let eval = |input: &[u8], pattern: &[u8]| {
        evaluate_match_expression(input.to_vec(), pattern.to_vec())
            .unwrap()
            .eval_as_string()
    };

    // The text captured by the group is returned
    assert_eq!(eval(b"hello123", b"hello\\([0-9]*\\)"), b"123");

    // A group that captures nothing yields an empty string
    assert_eq!(eval(b"hello", b"hello\\([0-9]*\\)"), b"");

    // Without a group, the length of the match is returned
    assert_eq!(eval(b"hello123", b"hello[0-9]*"), b"8");
}
|
||||
|
||||
#[test]
fn test_evaluate_match_expression_character_classes() {
    use super::evaluate_match_expression;

    // Evaluates `input : pattern` and yields the result rendered as bytes.
    let eval = |input: &[u8], pattern: &[u8]| {
        evaluate_match_expression(input.to_vec(), pattern.to_vec())
            .unwrap()
            .eval_as_string()
    };

    // Plain character class
    assert_eq!(eval(b"abc123", b"[a-z]*"), b"3");

    // Negated character class
    assert_eq!(eval(b"123abc", b"[^a-z]*"), b"3");

    // Digit character class
    assert_eq!(eval(b"123abc", b"[0-9]*"), b"3");
}
|
||||
|
||||
#[test]
fn test_evaluate_match_expression_anchoring() {
    use super::evaluate_match_expression;

    // Evaluates `input : pattern` and yields the result rendered as bytes.
    let eval = |input: &[u8], pattern: &[u8]| {
        evaluate_match_expression(input.to_vec(), pattern.to_vec())
            .unwrap()
            .eval_as_string()
    };

    // Patterns are implicitly anchored at the start of the input
    assert_eq!(eval(b"world hello", b"hello"), b"0");

    // An explicit start anchor is redundant but accepted
    assert_eq!(eval(b"hello world", b"^hello"), b"5");

    // End anchor: no match here because the pattern is still anchored at the start
    assert_eq!(eval(b"hello world", b"world$"), b"0");

    // End anchor with the whole input matching
    assert_eq!(eval(b"world", b"world$"), b"5");
}
|
||||
|
||||
#[test]
fn test_evaluate_match_expression_special_characters() {
    use super::evaluate_match_expression;

    // Evaluates `input : pattern` and yields the result rendered as bytes.
    let eval = |input: &[u8], pattern: &[u8]| {
        evaluate_match_expression(input.to_vec(), pattern.to_vec())
            .unwrap()
            .eval_as_string()
    };

    // Escaped dot is a literal dot
    assert_eq!(eval(b"a.b", b"a\\.b"), b"3");

    // Escaped asterisk is a literal asterisk
    assert_eq!(eval(b"a*b", b"a\\*b"), b"3");

    // A caret that is not at the beginning is treated literally
    assert_eq!(eval(b"a^b", b"a^b"), b"3");

    // A dollar that is not at the end is treated literally
    assert_eq!(eval(b"a$b", b"a$b"), b"3");
}
|
||||
|
||||
#[test]
fn test_evaluate_match_expression_range_quantifiers() {
    use super::evaluate_match_expression;

    // Evaluates `input : pattern` and yields the result rendered as bytes.
    let eval = |input: &[u8], pattern: &[u8]| {
        evaluate_match_expression(input.to_vec(), pattern.to_vec())
            .unwrap()
            .eval_as_string()
    };

    // Exact repetition count `\{n\}`
    assert_eq!(eval(b"aaa", b"a\\{3\\}"), b"3");

    // Bounded range `\{m,n\}`
    assert_eq!(eval(b"aa", b"a\\{1,3\\}"), b"2");

    // Lower bound only `\{m,\}`
    assert_eq!(eval(b"aaaa", b"a\\{2,\\}"), b"4");

    // Upper bound only `\{,n\}`
    assert_eq!(eval(b"aa", b"a\\{,3\\}"), b"2");
}
|
||||
|
||||
#[test]
fn test_evaluate_match_expression_empty_and_edge_cases() {
    use super::evaluate_match_expression;

    // Evaluates `input : pattern` and yields the result rendered as bytes.
    let eval = |input: &[u8], pattern: &[u8]| {
        evaluate_match_expression(input.to_vec(), pattern.to_vec())
            .unwrap()
            .eval_as_string()
    };

    // Empty input: `.*` matches zero characters
    assert_eq!(eval(b"", b".*"), b"0");

    // Empty pattern against empty input
    assert_eq!(eval(b"", b""), b"0");

    // `.*` consumes the whole non-empty input
    assert_eq!(eval(b"hello", b".*"), b"5");
}
|
||||
|
||||
#[test]
fn test_evaluate_match_expression_error_cases() {
    use super::evaluate_match_expression;

    // Evaluates `hello : pattern` and hands back the raw result for inspection.
    let outcome =
        |pattern: &[u8]| evaluate_match_expression(b"hello".to_vec(), pattern.to_vec());

    // `\(` without a closing `\)`
    assert!(matches!(
        outcome(b"\\(hello"),
        Err(ExprError::UnmatchedOpeningParenthesis)
    ));

    // `\)` without an opening `\(`
    assert!(matches!(
        outcome(b"hello\\)"),
        Err(ExprError::UnmatchedClosingParenthesis)
    ));

    // Pattern ending in a lone backslash
    assert!(matches!(
        outcome(b"hello\\"),
        Err(ExprError::TrailingBackslash)
    ));

    // Non-numeric content inside `\{...\}`
    assert!(matches!(
        outcome(b"a\\{invalid\\}"),
        Err(ExprError::InvalidBracketContent)
    ));
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1869,11 +1869,10 @@ mod gnu_expr_multibyte {
|
|||
|
||||
// The regex engine should match the '.' to the first multibyte character.
|
||||
#[test]
|
||||
#[ignore = "not implemented"]
|
||||
fn test_m3() {
|
||||
let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b".bc"];
|
||||
|
||||
let cases = &[TestCase::FR.out("3"), TestCase::C.code(1)];
|
||||
let cases = &[TestCase::FR.out("3"), TestCase::C.out("0").code(1)];
|
||||
|
||||
for tc in cases {
|
||||
check_test_case(args, tc);
|
||||
|
|
@ -1883,7 +1882,6 @@ mod gnu_expr_multibyte {
|
|||
// The opposite of the previous test: two dots should only match the two
|
||||
// octets in single-byte locale.
|
||||
#[test]
|
||||
#[ignore = "not implemented"]
|
||||
fn test_m4() {
|
||||
let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b"..bc"];
|
||||
|
||||
|
|
@ -1896,11 +1894,10 @@ mod gnu_expr_multibyte {
|
|||
|
||||
// Match with grouping - a single dot should return the two octets
|
||||
#[test]
|
||||
#[ignore = "not implemented"]
|
||||
fn test_m5() {
|
||||
let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b"\\(.b\\)c"];
|
||||
|
||||
let cases = &[TestCase::FR.out(b"\xCE\xB1b"), TestCase::C.code(1)];
|
||||
let cases = &[TestCase::FR.out(b"\xCE\xB1b"), TestCase::C.out("").code(1)];
|
||||
|
||||
for tc in cases {
|
||||
check_test_case(args, tc);
|
||||
|
|
@ -1910,11 +1907,10 @@ mod gnu_expr_multibyte {
|
|||
// Invalid multibyte sequences - regex should not match in multibyte locale
|
||||
// (POSIX requirement)
|
||||
#[test]
|
||||
#[ignore = "not implemented"]
|
||||
fn test_m6() {
|
||||
let args: &[&[u8]] = &[b"match", b"\xCEbc\xCE\xB4ef", b"\\(.\\)"];
|
||||
|
||||
let cases = &[TestCase::FR.code(1), TestCase::C.out(b"\xCE")];
|
||||
let cases = &[TestCase::FR.out("").code(1), TestCase::C.out(b"\xCE")];
|
||||
|
||||
for tc in cases {
|
||||
check_test_case(args, tc);
|
||||
|
|
@ -1926,7 +1922,6 @@ mod gnu_expr_multibyte {
|
|||
// In the single byte case, the regex engine sees two octets in the
|
||||
// character class ('\xCE' and '\xB1') - and it matches the first one.
|
||||
#[test]
|
||||
#[ignore = "not implemented"]
|
||||
fn test_m7() {
|
||||
let args: &[&[u8]] = &[b"match", b"\xCE\xB1bc\xCE\xB4ef", b"\\(.\\)"];
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue