From 92bb655b0ce6f3a5ebc452c2d51cb12fbbf91b80 Mon Sep 17 00:00:00 2001 From: karanabe <152078880+karanabe@users.noreply.github.com> Date: Sun, 9 Nov 2025 02:41:15 +0900 Subject: [PATCH] fix(base64): allow padded chunks mid-stream --- src/uu/base32/src/base_common.rs | 12 ++-- src/uucore/src/lib/features/encoding.rs | 88 +++++++++++++++++++++---- tests/by-util/test_base64.rs | 30 +++++++++ 3 files changed, 111 insertions(+), 19 deletions(-) diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index fe13e46cc..96d28e189 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -171,18 +171,16 @@ pub fn get_input(config: &Config) -> UResult> { } } -/// Determines if the input buffer ends with padding ('=') after trimming trailing whitespace. +/// Determines if the input buffer contains any padding ('=') ignoring trailing whitespace. fn read_and_has_padding(input: &mut R) -> UResult<(bool, Vec)> { let mut buf = Vec::new(); input .read_to_end(&mut buf) .map_err(|err| USimpleError::new(1, format_read_error(err.kind())))?; - // Reverse iterator and skip trailing whitespace without extra collections - let has_padding = buf - .iter() - .rfind(|&&byte| !byte.is_ascii_whitespace()) - .is_some_and(|&byte| byte == b'='); + // Treat the stream as padded if any '=' exists (GNU coreutils continues decoding + // even when padding bytes are followed by more data). + let has_padding = buf.contains(&b'='); Ok((has_padding, buf)) } @@ -665,6 +663,8 @@ mod tests { ("aGVsbG8sIHdvcmxkIQ== \n", true), ("aGVsbG8sIHdvcmxkIQ=", true), ("aGVsbG8sIHdvcmxkIQ= ", true), + ("MTIzNA==MTIzNA", true), + ("MTIzNA==\nMTIzNA", true), ("aGVsbG8sIHdvcmxkIQ \n", false), ("aGVsbG8sIHdvcmxkIQ", false), ]; diff --git a/src/uucore/src/lib/features/encoding.rs b/src/uucore/src/lib/features/encoding.rs index 6a2dccd4f..6c6261c2c 100644 --- a/src/uucore/src/lib/features/encoding.rs +++ b/src/uucore/src/lib/features/encoding.rs @@ -22,6 +22,26 @@ pub struct Base64SimdWrapper { } impl Base64SimdWrapper { + fn decode_with_standard(input: &[u8], output: &mut Vec) -> Result<(), ()> { + match base64_simd::STANDARD.decode_to_vec(input) { + Ok(decoded_bytes) => { + output.extend_from_slice(&decoded_bytes); + Ok(()) + } + Err(_) => Err(()), + } + } + + fn decode_with_no_pad(input: &[u8], output: &mut Vec) -> Result<(), ()> { + match base64_simd::STANDARD_NO_PAD.decode_to_vec(input) { + Ok(decoded_bytes) => { + output.extend_from_slice(&decoded_bytes); + Ok(()) + } + Err(_) => Err(()), + } + } + pub fn new( use_padding: bool, valid_decoding_multiple: usize, @@ -47,22 +67,64 @@ impl SupportsFastDecodeAndEncode for Base64SimdWrapper { } fn decode_into_vec(&self, input: &[u8], output: &mut Vec) -> UResult<()> { - let decoded = if self.use_padding { - base64_simd::STANDARD.decode_to_vec(input) + let original_len = output.len(); + + let decode_result = if self.use_padding { + // GNU coreutils keeps decoding even when '=' appears before the true end + // of the stream (e.g. concatenated padded chunks). Mirror that logic + // by splitting at each '='-containing quantum, decoding those 4-byte + // groups with the padded variant, then letting the remainder fall back + // to whichever alphabet fits. + let mut start = 0usize; + while start < input.len() { + let remaining = &input[start..]; + + if remaining.is_empty() { + break; + } + + if let Some(eq_rel_idx) = remaining.iter().position(|&b| b == b'=') { + let blocks = (eq_rel_idx / 4) + 1; + let segment_len = blocks * 4; + + if segment_len > remaining.len() { + return Err(USimpleError::new(1, "error: invalid input".to_owned())); + } + + if Self::decode_with_standard(&remaining[..segment_len], output).is_err() { + return Err(USimpleError::new(1, "error: invalid input".to_owned())); + } + + start += segment_len; + } else { + // If there are no more '=' bytes the tail might still be padded + // (len % 4 == 0) or purposely unpadded (GNU --ignore-garbage or + // concatenated streams), so select the matching alphabet. + let decoder = if remaining.len() % 4 == 0 { + Self::decode_with_standard + } else { + Self::decode_with_no_pad + }; + + if decoder(remaining, output).is_err() { + return Err(USimpleError::new(1, "error: invalid input".to_owned())); + } + + break; + } + } + + Ok(()) } else { - base64_simd::STANDARD_NO_PAD.decode_to_vec(input) + Self::decode_with_no_pad(input, output) + .map_err(|_| USimpleError::new(1, "error: invalid input".to_owned())) }; - match decoded { - Ok(decoded_bytes) => { - output.extend_from_slice(&decoded_bytes); - Ok(()) - } - Err(_) => { - // Restore original length on error - output.truncate(output.len()); - Err(USimpleError::new(1, "error: invalid input".to_owned())) - } + if let Err(err) = decode_result { + output.truncate(original_len); + Err(err) + } else { + Ok(()) } } diff --git a/tests/by-util/test_base64.rs b/tests/by-util/test_base64.rs index b46b8e0b1..f3657bb77 100644 --- a/tests/by-util/test_base64.rs +++ b/tests/by-util/test_base64.rs @@ -2,6 +2,9 @@ // // For the full copyright and license information, please view the LICENSE // file that was distributed with this source code. + +// spell-checker:ignore unpadded, QUJD + #[cfg(target_os = "linux")] use uutests::at_and_ucmd; use uutests::new_ucmd; @@ -108,6 +111,33 @@ fn test_decode_repeat_flags() { .stdout_only("hello, world!"); } +#[test] +fn test_decode_padded_block_followed_by_unpadded_tail() { + new_ucmd!() + .arg("--decode") + .pipe_in("MTIzNA==MTIzNA") + .succeeds() + .stdout_only("12341234"); +} + +#[test] +fn test_decode_padded_block_followed_by_aligned_tail() { + new_ucmd!() + .arg("--decode") + .pipe_in("MTIzNA==QUJD") + .succeeds() + .stdout_only("1234ABC"); +} + +#[test] +fn test_decode_unpadded_stream_without_equals() { + new_ucmd!() + .arg("--decode") + .pipe_in("MTIzNA") + .succeeds() + .stdout_only("1234"); +} + #[test] fn test_garbage() { let input = "aGVsbG8sIHdvcmxkIQ==\0"; // spell-checker:disable-line