basenc: Make the GNU-compat basenc.pl tests pass (#9203)

* fix(basenc): align base32 decode with GNU
* Add GNU-style basenc base32 tests
* Expand basenc base32 tests and simplify failures: adds the GNU-style auto-padding/truncated cases to tests/by-util/test_basenc.rs and rewrites the failure assertions to use the chained fails().stdout_*(…).stderr_is(…) style for clarity.
* Restore GNU expectations for b32h_5 and b32h_6: updates util/build-gnu.sh to stop forcing those two basenc tests to expect empty stdout, so the GNU suite again checks for the leaked five bytes before failure.
* Allow base32 decoder to auto-pad truncated blocks: introduces PadResult, trims/pads incomplete base32 chunks, emits decoded prefixes, and still returns "error: invalid input" in line with GNU basenc.
2025-12-23 08:47:37 +00:00 · 2025-11-11 03:13:51 +09:00 · 2025-11-11 03:13:51 +09:00 · 364d9e9dff
commit 364d9e9dff
parent eb223ba8b1
4 changed files with 276 additions and 80 deletions
--- a/src/uu/base32/src/base_common.rs
+++ b/src/uu/base32/src/base_common.rs
@ -8,11 +8,11 @@
 use clap::{Arg, ArgAction, Command};
 use std::ffi::OsString;
 use std::fs::File;
-use std::io::{self, ErrorKind, Read, Seek};
+use std::io::{self, ErrorKind, Read, Seek, Write};
 use std::path::{Path, PathBuf};
 use uucore::display::Quotable;
 use uucore::encoding::{
-    BASE2LSBF, BASE2MSBF, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
+    BASE2LSBF, BASE2MSBF, Base32Wrapper, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
    SupportsFastDecodeAndEncode, Z85Wrapper,
    for_base_common::{BASE32, BASE32HEX, BASE64URL, HEXUPPER_PERMISSIVE},
 };
@ -193,7 +193,7 @@ pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Confi

    let supports_fast_decode_and_encode_ref = supports_fast_decode_and_encode.as_ref();
    let mut stdout_lock = io::stdout().lock();
-    if config.decode {
+    let result = if config.decode {
        fast_decode::fast_decode(
            read,
            &mut stdout_lock,
@ -207,6 +207,14 @@ pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Confi
            supports_fast_decode_and_encode_ref,
            config.wrap_cols,
        )
+    };
+
+    // Ensure any pending stdout buffer is flushed even if decoding failed; GNU basenc
+    // keeps already-decoded bytes visible before reporting the error.
+    match (result, stdout_lock.flush()) {
+        (res, Ok(())) => res,
+        (Ok(_), Err(err)) => Err(err.into()),
+        (Err(original), Err(_)) => Err(original),
    }
 }

@ -247,14 +255,14 @@ pub fn get_supports_fast_decode_and_encode(
            // spell-checker:disable-next-line
            b"01",
        )),
-        Format::Base32 => Box::from(EncodingWrapper::new(
+        Format::Base32 => Box::from(Base32Wrapper::new(
            BASE32,
            BASE32_VALID_DECODING_MULTIPLE,
            BASE32_UNPADDED_MULTIPLE,
            // spell-checker:disable-next-line
            b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=",
        )),
-        Format::Base32Hex => Box::from(EncodingWrapper::new(
+        Format::Base32Hex => Box::from(Base32Wrapper::new(
            BASE32HEX,
            BASE32_VALID_DECODING_MULTIPLE,
            BASE32_UNPADDED_MULTIPLE,
@ -502,43 +510,21 @@ pub mod fast_encode {

 pub mod fast_decode {
    use std::io::{self, Write};
-    use uucore::{encoding::SupportsFastDecodeAndEncode, error::UResult};
+    use uucore::{
+        encoding::SupportsFastDecodeAndEncode,
+        error::{UResult, USimpleError},
+    };

    // Start of helper functions
-    fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256] {
-        // If `ignore_garbage` is enabled, all characters outside the alphabet are ignored
-        // If it is not enabled, only '\n' and '\r' are ignored
-        if ignore_garbage {
-            // Note: "false" here
-            let mut table = [false; 256];
+    fn alphabet_lookup(alphabet: &[u8]) -> [bool; 256] {
+        // Precompute O(1) membership checks so we can validate every byte before decoding.
+        let mut table = [false; 256];

-            // Pass through no characters except those in the alphabet
-            for ue in alphabet {
-                let us = usize::from(*ue);
-
-                // Should not have been set yet
-                assert!(!table[us]);
-
-                table[us] = true;
-            }
-
-            table
-        } else {
-            // Note: "true" here
-            let mut table = [true; 256];
-
-            // Pass through all characters except '\n' and '\r'
-            for ue in [b'\n', b'\r'] {
-                let us = usize::from(ue);
-
-                // Should not have been set yet
-                assert!(table[us]);
-
-                table[us] = false;
-            }
-
-            table
+        for &byte in alphabet {
+            table[usize::from(byte)] = true;
        }
+
+        table
    }

    fn decode_in_chunks_to_buffer(
@ -553,11 +539,44 @@ pub mod fast_decode {
    fn write_to_output(decoded_buffer: &mut Vec<u8>, output: &mut dyn Write) -> io::Result<()> {
        // Write all data in `decoded_buffer` to `output` and flush it straight away, then empty the buffer for reuse
        output.write_all(decoded_buffer.as_slice())?;
+        output.flush()?;

        decoded_buffer.clear();

        Ok(())
    }
+
+    /// Decodes and writes out every complete decode block currently held in `buffer`.
+    ///
+    /// Each pass takes at most `block_limit` bytes, rounded down to a multiple of
+    /// `valid_multiple`, decodes that prefix into `decoded_buffer`, writes it to `output`,
+    /// and then removes it from `buffer`. Fewer than `valid_multiple` leftover bytes stay
+    /// in `buffer` for the caller to deal with.
+    ///
+    /// Returns early with the error if decoding or writing a chunk fails; the failing
+    /// chunk is not drained from `buffer` in that case.
+    ///
+    /// `valid_multiple` must be non-zero because it is used as a modulus.
+    fn flush_ready_chunks(
+        buffer: &mut Vec<u8>,
+        block_limit: usize,
+        valid_multiple: usize,
+        supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
+        decoded_buffer: &mut Vec<u8>,
+        output: &mut dyn Write,
+    ) -> UResult<()> {
+        // While at least one full decode block is buffered, keep draining
+        // it and never yield more than block_limit per chunk.
+        while buffer.len() >= valid_multiple {
+            // Cap the chunk at `block_limit`, then round down to a whole number of blocks.
+            let take = buffer.len().min(block_limit);
+            let aligned_take = take - (take % valid_multiple);
+
+            // Only reachable when `block_limit` is smaller than `valid_multiple`: the aligned
+            // length is then 0, and without this exit the loop would never make progress.
+            if aligned_take < valid_multiple {
+                break;
+            }
+
+            decode_in_chunks_to_buffer(
+                supports_fast_decode_and_encode,
+                &buffer[..aligned_take],
+                decoded_buffer,
+            )?;
+
+            write_to_output(decoded_buffer, output)?;
+
+            // Drop the consumed prefix only after it was decoded and written successfully.
+            buffer.drain(..aligned_take);
+        }
+
+        Ok(())
+    }
    // End of helper functions

    pub fn fast_decode(
@ -569,22 +588,12 @@ pub mod fast_decode {
        const DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024;

        let alphabet = supports_fast_decode_and_encode.alphabet();
-        let decode_in_chunks_of_size = supports_fast_decode_and_encode.valid_decoding_multiple()
-            * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
+        let alphabet_table = alphabet_lookup(alphabet);
+        let valid_multiple = supports_fast_decode_and_encode.valid_decoding_multiple();
+        let decode_in_chunks_of_size = valid_multiple * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;

        assert!(decode_in_chunks_of_size > 0);
-
-        // Note that it's not worth using "data-encoding"'s ignore functionality if `ignore_garbage` is true, because
-        // "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before
-        // passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also
-        // allows execution to stay on the happy path in "data-encoding":
-        // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756
-        // It is also not worth using "data-encoding"'s ignore functionality when `ignore_garbage` is
-        // false.
-        // Note that the alphabet constants above already include the padding characters
-        // TODO
-        // Precompute this
-        let table = alphabet_to_table(alphabet, ignore_garbage);
+        assert!(valid_multiple > 0);

        // Start of buffers

@ -595,35 +604,69 @@ pub mod fast_decode {

        let mut buffer = Vec::with_capacity(decode_in_chunks_of_size);

-        input
-            .iter()
-            .filter(|ch| table[usize::from(**ch)])
-            .for_each(|ch| {
-                buffer.push(*ch);
-                // How many bytes to steal from `read_buffer` to get
-                // `leftover_buffer` to the right size
-                if buffer.len() == decode_in_chunks_of_size {
-                    assert_eq!(decode_in_chunks_of_size, buffer.len());
-                    // Decode data in chunks, then place it in `decoded_buffer`
-                    decode_in_chunks_to_buffer(
-                        supports_fast_decode_and_encode,
-                        &buffer,
-                        &mut decoded_buffer,
-                    )
-                    .unwrap();
-                    // Write all data in `decoded_buffer` to `output`
-                    write_to_output(&mut decoded_buffer, output).unwrap();
-                    buffer.clear();
-                }
-            });
-        // Cleanup
-        // `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed
-        {
-            // Decode all remaining encoded bytes, placing them in `decoded_buffer`
-            supports_fast_decode_and_encode.decode_into_vec(&buffer, &mut decoded_buffer)?;
+        let supports_partial_decode = supports_fast_decode_and_encode.supports_partial_decode();

-            // Write all data in `decoded_buffer` to `output`
+        for &byte in &input {
+            if byte == b'\n' || byte == b'\r' {
+                continue;
+            }
+
+            if alphabet_table[usize::from(byte)] {
+                buffer.push(byte);
+            } else if ignore_garbage {
+                continue;
+            } else {
+                return Err(USimpleError::new(1, "error: invalid input".to_owned()));
+            }
+
+            if supports_partial_decode {
+                flush_ready_chunks(
+                    &mut buffer,
+                    decode_in_chunks_of_size,
+                    valid_multiple,
+                    supports_fast_decode_and_encode,
+                    &mut decoded_buffer,
+                    output,
+                )?;
+            } else if buffer.len() == decode_in_chunks_of_size {
+                decode_in_chunks_to_buffer(
+                    supports_fast_decode_and_encode,
+                    &buffer,
+                    &mut decoded_buffer,
+                )?;
+                write_to_output(&mut decoded_buffer, output)?;
+                buffer.clear();
+            }
+        }
+
+        if supports_partial_decode {
+            flush_ready_chunks(
+                &mut buffer,
+                decode_in_chunks_of_size,
+                valid_multiple,
+                supports_fast_decode_and_encode,
+                &mut decoded_buffer,
+                output,
+            )?;
+        }
+
+        if !buffer.is_empty() {
+            let mut owned_chunk: Option<Vec<u8>> = None;
+            let mut had_invalid_tail = false;
+
+            if let Some(pad_result) = supports_fast_decode_and_encode.pad_remainder(&buffer) {
+                had_invalid_tail = pad_result.had_invalid_tail;
+                owned_chunk = Some(pad_result.chunk);
+            }
+
+            let final_chunk = owned_chunk.as_deref().unwrap_or(&buffer);
+
+            supports_fast_decode_and_encode.decode_into_vec(final_chunk, &mut decoded_buffer)?;
            write_to_output(&mut decoded_buffer, output)?;
+
+            if had_invalid_tail {
+                return Err(USimpleError::new(1, "error: invalid input".to_owned()));
+            }
        }

        Ok(())
--- a/src/uucore/src/lib/features/encoding.rs
+++ b/src/uucore/src/lib/features/encoding.rs
@ -214,6 +214,11 @@ impl EncodingWrapper {
    }
 }

+/// Outcome of padding a trailing partial chunk, as returned by
+/// `SupportsFastDecodeAndEncode::pad_remainder`.
+pub struct PadResult {
+    /// Chunk to decode in place of the raw remainder.
+    pub chunk: Vec<u8>,
+    /// When `true`, `fast_decode` still decodes and writes `chunk` but then fails with
+    /// "error: invalid input". `Base32Wrapper` sets it when it had to drop trailing bytes.
+    pub had_invalid_tail: bool,
+}
+
 pub trait SupportsFastDecodeAndEncode {
    /// Returns the list of characters used by this encoding
    fn alphabet(&self) -> &'static [u8];
@ -245,6 +250,19 @@ pub trait SupportsFastDecodeAndEncode {
    ///
    /// The decoding performed by `fast_decode` depends on this number being correct.
    fn valid_decoding_multiple(&self) -> usize;
+
+    /// Whether the decoder can flush partial chunks (multiples of `valid_decoding_multiple`)
+    /// before seeing the full input. Defaults to `false` for encodings that must consume the
+    /// entire input (e.g. base58).
+    fn supports_partial_decode(&self) -> bool {
+        false
+    }
+
+    /// Gives encoding-specific logic a chance to pad a trailing, non-empty remainder
+    /// before the final decode attempt. The default implementation opts out.
+    fn pad_remainder(&self, _remainder: &[u8]) -> Option<PadResult> {
+        None
+    }
 }

 impl SupportsFastDecodeAndEncode for Base58Wrapper {
@ -504,3 +522,80 @@ impl SupportsFastDecodeAndEncode for EncodingWrapper {
        self.unpadded_multiple
    }
 }
+
+/// Wrapper around `EncodingWrapper` used for the base32 and base32hex formats. It forwards
+/// encoding and decoding to the wrapped value and additionally opts in to partial decoding
+/// and to `=`-padding of a truncated final block.
+pub struct Base32Wrapper {
+    // Wrapped encoder that the delegating trait methods forward to.
+    inner: EncodingWrapper,
+}
+
+impl Base32Wrapper {
+    /// Builds the wrapper by passing all four arguments, unchanged and in the same order,
+    /// to `EncodingWrapper::new`.
+    pub fn new(
+        encoding: Encoding,
+        valid_decoding_multiple: usize,
+        unpadded_multiple: usize,
+        alphabet: &'static [u8],
+    ) -> Self {
+        Self {
+            inner: EncodingWrapper::new(
+                encoding,
+                valid_decoding_multiple,
+                unpadded_multiple,
+                alphabet,
+            ),
+        }
+    }
+}
+
+impl SupportsFastDecodeAndEncode for Base32Wrapper {
+    // The first five methods forward straight to the wrapped `EncodingWrapper`.
+    fn alphabet(&self) -> &'static [u8] {
+        self.inner.alphabet()
+    }
+
+    fn decode_into_vec(&self, input: &[u8], output: &mut Vec<u8>) -> UResult<()> {
+        self.inner.decode_into_vec(input, output)
+    }
+
+    fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque<u8>) -> UResult<()> {
+        self.inner.encode_to_vec_deque(input, output)
+    }
+
+    fn unpadded_multiple(&self) -> usize {
+        self.inner.unpadded_multiple()
+    }
+
+    fn valid_decoding_multiple(&self) -> usize {
+        self.inner.valid_decoding_multiple()
+    }
+
+    /// Pads a truncated final block with `=` so that it can still be decoded.
+    ///
+    /// Returns `None` (the caller decodes the remainder as-is) when the remainder is empty,
+    /// already contains `=`, or is a single byte that cannot be cut down to a usable length.
+    /// Otherwise the remainder is cut back to its longest prefix of length 2, 4, 5 or 7,
+    /// `=` is appended up to `valid_decoding_multiple()`, and `had_invalid_tail` records
+    /// whether any bytes had to be cut off.
+    fn pad_remainder(&self, remainder: &[u8]) -> Option<PadResult> {
+        if remainder.is_empty() || remainder.contains(&b'=') {
+            return None;
+        }
+
+        // Character counts a final base32 block can have once its `=` padding is removed
+        // (encoding 1, 2, 3 or 4 bytes respectively, per RFC 4648).
+        const VALID_REMAINDERS: [usize; 4] = [2, 4, 5, 7];
+
+        let mut len = remainder.len();
+        let mut trimmed = false;
+
+        // Shrink to the nearest usable length, remembering that bytes were dropped.
+        while len > 0 && !VALID_REMAINDERS.contains(&len) {
+            len -= 1;
+            trimmed = true;
+        }
+
+        // Only a one-byte remainder ends up here; nothing decodable is left.
+        if len == 0 {
+            return None;
+        }
+
+        let mut padded = remainder[..len].to_vec();
+        // `padded.len()` is at most 7 here, so the subtraction assumes
+        // `valid_decoding_multiple()` is at least 7 (presumably 8 for base32 — TODO confirm
+        // against BASE32_VALID_DECODING_MULTIPLE).
+        let missing = self.valid_decoding_multiple() - padded.len();
+        padded.extend(std::iter::repeat_n(b'=', missing));
+
+        Some(PadResult {
+            chunk: padded,
+            had_invalid_tail: trimmed,
+        })
+    }
+
+    /// Opts in to the incremental path of `fast_decode`, which decodes and writes complete
+    /// blocks as they accumulate instead of waiting for the whole input.
+    fn supports_partial_decode(&self) -> bool {
+        true
+    }
+}
--- a/tests/by-util/test_basenc.rs
+++ b/tests/by-util/test_basenc.rs
@ -4,6 +4,7 @@
 // file that was distributed with this source code.

 // spell-checker: ignore (encodings) lsbf msbf
+// spell-checker: ignore autopad MFRGG MFRGGZDF abcdeabc baddecode CPNMUO

 use uutests::{at_and_ucmd, new_ucmd};

@ -112,6 +113,63 @@ fn test_base32hex_decode() {
        .stdout_only("nice>base?");
 }

+// An unpadded five-character base32 input ("MFRGG") must decode successfully to "abc".
+#[test]
+fn test_base32_autopad_short_quantum() {
+    new_ucmd!()
+        .args(&["--base32", "--decode"])
+        .pipe_in("MFRGG")
+        .succeeds()
+        .stdout_only("abc");
+}
+
+// A complete eight-character block, a newline, and then an unpadded five-character block
+// must decode successfully to "abcdeabc".
+#[test]
+fn test_base32_autopad_multiline_stream() {
+    new_ucmd!()
+        .args(&["--base32", "--decode"])
+        .pipe_in("MFRGGZDF\nMFRGG")
+        .succeeds()
+        .stdout_only("abcdeabc");
+}
+
+// A complete block followed by a stray "=" must fail with "invalid input" on stderr while
+// the bytes decoded from the complete block ("abcde") still appear on stdout.
+#[test]
+fn test_base32_baddecode_keeps_prefix() {
+    new_ucmd!()
+        .args(&["--base32", "--decode"])
+        .pipe_in("MFRGGZDF=")
+        .fails()
+        .stdout_is("abcde")
+        .stderr_is("basenc: error: invalid input\n");
+}
+
+// Same as the base32 short-quantum case, but for --base32hex: the unpadded input "C5H66"
+// must decode successfully to "abc".
+#[test]
+fn test_base32hex_autopad_short_quantum() {
+    new_ucmd!()
+        .args(&["--base32hex", "--decode"])
+        .pipe_in("C5H66")
+        .succeeds()
+        .stdout_only("abc");
+}
+
+// An eight-character block followed by one extra character must fail with "invalid input"
+// on stderr, but the five bytes decoded from the first block must still reach stdout.
+#[test]
+fn test_base32hex_rejects_trailing_garbage() {
+    new_ucmd!()
+        .args(&["--base32hex", "-d"])
+        .pipe_in("VNC0FKD5W")
+        .fails()
+        .stdout_is_bytes(b"\xFD\xD8\x07\xD1\xA5")
+        .stderr_is("basenc: error: invalid input\n");
+}
+
+// A six-character input is not a usable block length: the command must fail with
+// "invalid input" on stderr, yet still print the decodable prefix ("foo") on stdout.
+#[test]
+fn test_base32hex_truncated_block_keeps_prefix() {
+    new_ucmd!()
+        .args(&["--base32hex", "-d"])
+        .pipe_in("CPNMUO")
+        .fails()
+        .stdout_is_bytes(b"foo")
+        .stderr_is("basenc: error: invalid input\n");
+}
+
 #[test]
 fn test_base16() {
    new_ucmd!()
--- a/util/build-gnu.sh
+++ b/util/build-gnu.sh
@ -268,7 +268,7 @@ sed -i -e "s|invalid suffix in --pages argument|invalid --pages argument|" \
 # When decoding an invalid base32/64 string, gnu writes everything it was able to decode until
 # it hit the decode error, while we don't write anything if the input is invalid.
 sed -i "s/\(baddecode.*OUT=>\"\).*\"/\1\"/g" tests/basenc/base64.pl
-sed -i "s/\(\(b2[ml]_[69]\|b32h_[56]\|z85_8\|z85_35\).*OUT=>\)[^}]*\(.*\)/\1\"\"\3/g" tests/basenc/basenc.pl
+sed -i "s/\(\(b2[ml]_[69]\|z85_8\|z85_35\).*OUT=>\)[^}]*\(.*\)/\1\"\"\3/g" tests/basenc/basenc.pl

 # add "error: " to the expected error message
 sed -i "s/\$prog: invalid input/\$prog: error: invalid input/g" tests/basenc/basenc.pl