mirror of
https://github.com/uutils/coreutils.git
synced 2025-12-23 08:47:37 +00:00
basenc: Fix base32 decoding so the basenc.pl GNU-compat tests pass (#9203)
* fix(basenc): align base32 decode with GNU
* Add GNU-style basenc base32 tests
* Expand basenc base32 tests and simplify failures: adds the GNU-style auto-padding/truncated cases to tests/by-util/test_basenc.rs and rewrites the failure assertions to use the chained `fails().stdout_*(…).stderr_is(…)` style for clarity.
* Restore GNU expectations for b32h_5 and b32h_6: updates util/build-gnu.sh to stop forcing those two basenc tests to expect empty stdout, so the GNU suite again checks for the leaked five bytes before failure.
* Allow base32 decoder to auto-pad truncated blocks: introduces `PadResult`, trims/pads incomplete base32 chunks, emits decoded prefixes, and still returns `error: invalid input`, in line with GNU basenc.
This commit is contained in:
parent
eb223ba8b1
commit
364d9e9dff
4 changed files with 276 additions and 80 deletions
|
|
@ -8,11 +8,11 @@
|
|||
use clap::{Arg, ArgAction, Command};
|
||||
use std::ffi::OsString;
|
||||
use std::fs::File;
|
||||
use std::io::{self, ErrorKind, Read, Seek};
|
||||
use std::io::{self, ErrorKind, Read, Seek, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use uucore::display::Quotable;
|
||||
use uucore::encoding::{
|
||||
BASE2LSBF, BASE2MSBF, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
|
||||
BASE2LSBF, BASE2MSBF, Base32Wrapper, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
|
||||
SupportsFastDecodeAndEncode, Z85Wrapper,
|
||||
for_base_common::{BASE32, BASE32HEX, BASE64URL, HEXUPPER_PERMISSIVE},
|
||||
};
|
||||
|
|
@ -193,7 +193,7 @@ pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Confi
|
|||
|
||||
let supports_fast_decode_and_encode_ref = supports_fast_decode_and_encode.as_ref();
|
||||
let mut stdout_lock = io::stdout().lock();
|
||||
if config.decode {
|
||||
let result = if config.decode {
|
||||
fast_decode::fast_decode(
|
||||
read,
|
||||
&mut stdout_lock,
|
||||
|
|
@ -207,6 +207,14 @@ pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Confi
|
|||
supports_fast_decode_and_encode_ref,
|
||||
config.wrap_cols,
|
||||
)
|
||||
};
|
||||
|
||||
// Ensure any pending stdout buffer is flushed even if decoding failed; GNU basenc
|
||||
// keeps already-decoded bytes visible before reporting the error.
|
||||
match (result, stdout_lock.flush()) {
|
||||
(res, Ok(())) => res,
|
||||
(Ok(_), Err(err)) => Err(err.into()),
|
||||
(Err(original), Err(_)) => Err(original),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -247,14 +255,14 @@ pub fn get_supports_fast_decode_and_encode(
|
|||
// spell-checker:disable-next-line
|
||||
b"01",
|
||||
)),
|
||||
Format::Base32 => Box::from(EncodingWrapper::new(
|
||||
Format::Base32 => Box::from(Base32Wrapper::new(
|
||||
BASE32,
|
||||
BASE32_VALID_DECODING_MULTIPLE,
|
||||
BASE32_UNPADDED_MULTIPLE,
|
||||
// spell-checker:disable-next-line
|
||||
b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=",
|
||||
)),
|
||||
Format::Base32Hex => Box::from(EncodingWrapper::new(
|
||||
Format::Base32Hex => Box::from(Base32Wrapper::new(
|
||||
BASE32HEX,
|
||||
BASE32_VALID_DECODING_MULTIPLE,
|
||||
BASE32_UNPADDED_MULTIPLE,
|
||||
|
|
@ -502,43 +510,21 @@ pub mod fast_encode {
|
|||
|
||||
pub mod fast_decode {
|
||||
use std::io::{self, Write};
|
||||
use uucore::{encoding::SupportsFastDecodeAndEncode, error::UResult};
|
||||
use uucore::{
|
||||
encoding::SupportsFastDecodeAndEncode,
|
||||
error::{UResult, USimpleError},
|
||||
};
|
||||
|
||||
// Start of helper functions
|
||||
fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256] {
|
||||
// If `ignore_garbage` is enabled, all characters outside the alphabet are ignored
|
||||
// If it is not enabled, only '\n' and '\r' are ignored
|
||||
if ignore_garbage {
|
||||
// Note: "false" here
|
||||
let mut table = [false; 256];
|
||||
/// Builds a 256-entry membership table for `alphabet`.
///
/// `table[b]` is `true` exactly when the byte `b` occurs in `alphabet`, so a caller can
/// check whether an input byte belongs to the encoding with a single index operation
/// before handing anything to the decoder.
///
/// A byte that appears more than once in `alphabet` is simply marked once; duplicates are
/// not treated as an error.
fn alphabet_lookup(alphabet: &[u8]) -> [bool; 256] {
    let mut table = [false; 256];

    for &byte in alphabet {
        // `usize::from(u8)` is always below 256, so this index cannot go out of bounds.
        table[usize::from(byte)] = true;
    }

    table
}
|
||||
|
||||
fn decode_in_chunks_to_buffer(
|
||||
|
|
@ -553,11 +539,44 @@ pub mod fast_decode {
|
|||
/// Drains `decoded_buffer` into `output`.
///
/// The whole buffer is written, `output` is flushed so the bytes become visible
/// immediately, and only then is the buffer emptied for reuse. If writing or flushing
/// fails, the error is returned and `decoded_buffer` is left as it was.
fn write_to_output(decoded_buffer: &mut Vec<u8>, output: &mut dyn Write) -> io::Result<()> {
    output
        .write_all(&decoded_buffer[..])
        .and_then(|()| output.flush())?;

    decoded_buffer.clear();

    Ok(())
}
|
||||
|
||||
fn flush_ready_chunks(
|
||||
buffer: &mut Vec<u8>,
|
||||
block_limit: usize,
|
||||
valid_multiple: usize,
|
||||
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
|
||||
decoded_buffer: &mut Vec<u8>,
|
||||
output: &mut dyn Write,
|
||||
) -> UResult<()> {
|
||||
// While at least one full decode block is buffered, keep draining
|
||||
// it and never yield more than block_limit per chunk.
|
||||
while buffer.len() >= valid_multiple {
|
||||
let take = buffer.len().min(block_limit);
|
||||
let aligned_take = take - (take % valid_multiple);
|
||||
|
||||
if aligned_take < valid_multiple {
|
||||
break;
|
||||
}
|
||||
|
||||
decode_in_chunks_to_buffer(
|
||||
supports_fast_decode_and_encode,
|
||||
&buffer[..aligned_take],
|
||||
decoded_buffer,
|
||||
)?;
|
||||
|
||||
write_to_output(decoded_buffer, output)?;
|
||||
|
||||
buffer.drain(..aligned_take);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
// End of helper functions
|
||||
|
||||
pub fn fast_decode(
|
||||
|
|
@ -569,22 +588,12 @@ pub mod fast_decode {
|
|||
const DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024;
|
||||
|
||||
let alphabet = supports_fast_decode_and_encode.alphabet();
|
||||
let decode_in_chunks_of_size = supports_fast_decode_and_encode.valid_decoding_multiple()
|
||||
* DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
|
||||
let alphabet_table = alphabet_lookup(alphabet);
|
||||
let valid_multiple = supports_fast_decode_and_encode.valid_decoding_multiple();
|
||||
let decode_in_chunks_of_size = valid_multiple * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
|
||||
|
||||
assert!(decode_in_chunks_of_size > 0);
|
||||
|
||||
// Note that it's not worth using "data-encoding"'s ignore functionality if `ignore_garbage` is true, because
|
||||
// "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before
|
||||
// passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also
|
||||
// allows execution to stay on the happy path in "data-encoding":
|
||||
// https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756
|
||||
// It is also not worth using "data-encoding"'s ignore functionality when `ignore_garbage` is
|
||||
// false.
|
||||
// Note that the alphabet constants above already include the padding characters
|
||||
// TODO
|
||||
// Precompute this
|
||||
let table = alphabet_to_table(alphabet, ignore_garbage);
|
||||
assert!(valid_multiple > 0);
|
||||
|
||||
// Start of buffers
|
||||
|
||||
|
|
@ -595,35 +604,69 @@ pub mod fast_decode {
|
|||
|
||||
let mut buffer = Vec::with_capacity(decode_in_chunks_of_size);
|
||||
|
||||
input
|
||||
.iter()
|
||||
.filter(|ch| table[usize::from(**ch)])
|
||||
.for_each(|ch| {
|
||||
buffer.push(*ch);
|
||||
// How many bytes to steal from `read_buffer` to get
|
||||
// `leftover_buffer` to the right size
|
||||
if buffer.len() == decode_in_chunks_of_size {
|
||||
assert_eq!(decode_in_chunks_of_size, buffer.len());
|
||||
// Decode data in chunks, then place it in `decoded_buffer`
|
||||
decode_in_chunks_to_buffer(
|
||||
supports_fast_decode_and_encode,
|
||||
&buffer,
|
||||
&mut decoded_buffer,
|
||||
)
|
||||
.unwrap();
|
||||
// Write all data in `decoded_buffer` to `output`
|
||||
write_to_output(&mut decoded_buffer, output).unwrap();
|
||||
buffer.clear();
|
||||
}
|
||||
});
|
||||
// Cleanup
|
||||
// `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed
|
||||
{
|
||||
// Decode all remaining encoded bytes, placing them in `decoded_buffer`
|
||||
supports_fast_decode_and_encode.decode_into_vec(&buffer, &mut decoded_buffer)?;
|
||||
let supports_partial_decode = supports_fast_decode_and_encode.supports_partial_decode();
|
||||
|
||||
// Write all data in `decoded_buffer` to `output`
|
||||
for &byte in &input {
|
||||
if byte == b'\n' || byte == b'\r' {
|
||||
continue;
|
||||
}
|
||||
|
||||
if alphabet_table[usize::from(byte)] {
|
||||
buffer.push(byte);
|
||||
} else if ignore_garbage {
|
||||
continue;
|
||||
} else {
|
||||
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
|
||||
}
|
||||
|
||||
if supports_partial_decode {
|
||||
flush_ready_chunks(
|
||||
&mut buffer,
|
||||
decode_in_chunks_of_size,
|
||||
valid_multiple,
|
||||
supports_fast_decode_and_encode,
|
||||
&mut decoded_buffer,
|
||||
output,
|
||||
)?;
|
||||
} else if buffer.len() == decode_in_chunks_of_size {
|
||||
decode_in_chunks_to_buffer(
|
||||
supports_fast_decode_and_encode,
|
||||
&buffer,
|
||||
&mut decoded_buffer,
|
||||
)?;
|
||||
write_to_output(&mut decoded_buffer, output)?;
|
||||
buffer.clear();
|
||||
}
|
||||
}
|
||||
|
||||
if supports_partial_decode {
|
||||
flush_ready_chunks(
|
||||
&mut buffer,
|
||||
decode_in_chunks_of_size,
|
||||
valid_multiple,
|
||||
supports_fast_decode_and_encode,
|
||||
&mut decoded_buffer,
|
||||
output,
|
||||
)?;
|
||||
}
|
||||
|
||||
if !buffer.is_empty() {
|
||||
let mut owned_chunk: Option<Vec<u8>> = None;
|
||||
let mut had_invalid_tail = false;
|
||||
|
||||
if let Some(pad_result) = supports_fast_decode_and_encode.pad_remainder(&buffer) {
|
||||
had_invalid_tail = pad_result.had_invalid_tail;
|
||||
owned_chunk = Some(pad_result.chunk);
|
||||
}
|
||||
|
||||
let final_chunk = owned_chunk.as_deref().unwrap_or(&buffer);
|
||||
|
||||
supports_fast_decode_and_encode.decode_into_vec(final_chunk, &mut decoded_buffer)?;
|
||||
write_to_output(&mut decoded_buffer, output)?;
|
||||
|
||||
if had_invalid_tail {
|
||||
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
|
|||
|
|
@ -214,6 +214,11 @@ impl EncodingWrapper {
|
|||
}
|
||||
}
|
||||
|
||||
/// Outcome of padding a trailing, incomplete block before the final decode attempt.
///
/// Returned by `SupportsFastDecodeAndEncode::pad_remainder`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PadResult {
    /// The bytes to decode in place of the raw remainder.
    pub chunk: Vec<u8>,
    /// `true` when part of the remainder had to be dropped to build `chunk`. The decoder
    /// still writes whatever `chunk` decodes to, then reports `error: invalid input`.
    pub had_invalid_tail: bool,
}
|
||||
|
||||
pub trait SupportsFastDecodeAndEncode {
|
||||
/// Returns the list of characters used by this encoding
|
||||
fn alphabet(&self) -> &'static [u8];
|
||||
|
|
@ -245,6 +250,19 @@ pub trait SupportsFastDecodeAndEncode {
|
|||
///
|
||||
/// The decoding performed by `fast_decode` depends on this number being correct.
|
||||
fn valid_decoding_multiple(&self) -> usize;
|
||||
|
||||
/// Whether the decoder can flush partial chunks (multiples of `valid_decoding_multiple`)
|
||||
/// before seeing the full input. Defaults to `false` for encodings that must consume the
|
||||
/// entire input (e.g. base58).
///
/// `fast_decode` queries this once per call. When it is `true`, every complete block is
/// decoded and written as soon as it has been buffered, so output produced before a later
/// error stays visible. When it is `false`, buffered input is only decoded once a full
/// internal chunk has accumulated or the input has ended.
|
||||
fn supports_partial_decode(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// Gives encoding-specific logic a chance to pad a trailing, non-empty remainder
|
||||
/// before the final decode attempt. The default implementation opts out.
///
/// Returning `None` makes `fast_decode` decode the remainder unchanged. Returning
/// `Some(result)` makes it decode `result.chunk` instead; if `result.had_invalid_tail`
/// is set, the decoded bytes are still written and `error: invalid input` is reported
/// afterwards.
|
||||
fn pad_remainder(&self, _remainder: &[u8]) -> Option<PadResult> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl SupportsFastDecodeAndEncode for Base58Wrapper {
|
||||
|
|
@ -504,3 +522,80 @@ impl SupportsFastDecodeAndEncode for EncodingWrapper {
|
|||
self.unpadded_multiple
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Base32Wrapper {
|
||||
inner: EncodingWrapper,
|
||||
}
|
||||
|
||||
impl Base32Wrapper {
|
||||
pub fn new(
|
||||
encoding: Encoding,
|
||||
valid_decoding_multiple: usize,
|
||||
unpadded_multiple: usize,
|
||||
alphabet: &'static [u8],
|
||||
) -> Self {
|
||||
Self {
|
||||
inner: EncodingWrapper::new(
|
||||
encoding,
|
||||
valid_decoding_multiple,
|
||||
unpadded_multiple,
|
||||
alphabet,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl SupportsFastDecodeAndEncode for Base32Wrapper {
|
||||
fn alphabet(&self) -> &'static [u8] {
|
||||
self.inner.alphabet()
|
||||
}
|
||||
|
||||
fn decode_into_vec(&self, input: &[u8], output: &mut Vec<u8>) -> UResult<()> {
|
||||
self.inner.decode_into_vec(input, output)
|
||||
}
|
||||
|
||||
fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque<u8>) -> UResult<()> {
|
||||
self.inner.encode_to_vec_deque(input, output)
|
||||
}
|
||||
|
||||
fn unpadded_multiple(&self) -> usize {
|
||||
self.inner.unpadded_multiple()
|
||||
}
|
||||
|
||||
fn valid_decoding_multiple(&self) -> usize {
|
||||
self.inner.valid_decoding_multiple()
|
||||
}
|
||||
|
||||
fn pad_remainder(&self, remainder: &[u8]) -> Option<PadResult> {
|
||||
if remainder.is_empty() || remainder.contains(&b'=') {
|
||||
return None;
|
||||
}
|
||||
|
||||
const VALID_REMAINDERS: [usize; 4] = [2, 4, 5, 7];
|
||||
|
||||
let mut len = remainder.len();
|
||||
let mut trimmed = false;
|
||||
|
||||
while len > 0 && !VALID_REMAINDERS.contains(&len) {
|
||||
len -= 1;
|
||||
trimmed = true;
|
||||
}
|
||||
|
||||
if len == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut padded = remainder[..len].to_vec();
|
||||
let missing = self.valid_decoding_multiple() - padded.len();
|
||||
padded.extend(std::iter::repeat_n(b'=', missing));
|
||||
|
||||
Some(PadResult {
|
||||
chunk: padded,
|
||||
had_invalid_tail: trimmed,
|
||||
})
|
||||
}
|
||||
|
||||
fn supports_partial_decode(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
// file that was distributed with this source code.
|
||||
|
||||
// spell-checker: ignore (encodings) lsbf msbf
|
||||
// spell-checker: ignore autopad MFRGG MFRGGZDF abcdeabc baddecode CPNMUO
|
||||
|
||||
use uutests::{at_and_ucmd, new_ucmd};
|
||||
|
||||
|
|
@ -112,6 +113,63 @@ fn test_base32hex_decode() {
|
|||
.stdout_only("nice>base?");
|
||||
}
|
||||
|
||||
// A final base32 group supplied without `=` padding (`MFRGG`, five characters) must still
// decode: the command is expected to succeed with `abc` on stdout.
#[test]
|
||||
fn test_base32_autopad_short_quantum() {
|
||||
new_ucmd!()
|
||||
.args(&["--base32", "--decode"])
|
||||
.pipe_in("MFRGG")
|
||||
.succeeds()
|
||||
.stdout_only("abc");
|
||||
}
|
||||
|
||||
// A complete eight-character block on one line followed by an unpadded five-character
// group on the next: the newline is skipped and both parts decode, giving `abcdeabc`.
#[test]
|
||||
fn test_base32_autopad_multiline_stream() {
|
||||
new_ucmd!()
|
||||
.args(&["--base32", "--decode"])
|
||||
.pipe_in("MFRGGZDF\nMFRGG")
|
||||
.succeeds()
|
||||
.stdout_only("abcdeabc");
|
||||
}
|
||||
|
||||
// A valid eight-character block followed by a stray `=`: the command must fail with
// `error: invalid input`, but the bytes decoded from the valid block (`abcde`) must
// already have been written to stdout.
#[test]
|
||||
fn test_base32_baddecode_keeps_prefix() {
|
||||
new_ucmd!()
|
||||
.args(&["--base32", "--decode"])
|
||||
.pipe_in("MFRGGZDF=")
|
||||
.fails()
|
||||
.stdout_is("abcde")
|
||||
.stderr_is("basenc: error: invalid input\n");
|
||||
}
|
||||
|
||||
// base32hex counterpart of the unpadded-final-group case: the five-character input
// `C5H66` carries no `=` padding and is expected to decode successfully to `abc`.
#[test]
|
||||
fn test_base32hex_autopad_short_quantum() {
|
||||
new_ucmd!()
|
||||
.args(&["--base32hex", "--decode"])
|
||||
.pipe_in("C5H66")
|
||||
.succeeds()
|
||||
.stdout_only("abc");
|
||||
}
|
||||
|
||||
// Eight valid base32hex characters followed by `W`, which is outside the RFC 4648
// base32hex alphabet (`0-9A-V`): the command must fail with `error: invalid input`, yet
// the five bytes decoded from the leading block must still appear on stdout.
#[test]
|
||||
fn test_base32hex_rejects_trailing_garbage() {
|
||||
new_ucmd!()
|
||||
.args(&["--base32hex", "-d"])
|
||||
.pipe_in("VNC0FKD5W")
|
||||
.fails()
|
||||
.stdout_is_bytes(b"\xFD\xD8\x07\xD1\xA5")
|
||||
.stderr_is("basenc: error: invalid input\n");
|
||||
}
|
||||
|
||||
// Six characters is not a length an unpadded base32 group can have, so the input is a
// truncated block: the decodable five-character prefix must be written (`foo`) and the
// command must then fail with `error: invalid input`.
#[test]
|
||||
fn test_base32hex_truncated_block_keeps_prefix() {
|
||||
new_ucmd!()
|
||||
.args(&["--base32hex", "-d"])
|
||||
.pipe_in("CPNMUO")
|
||||
.fails()
|
||||
.stdout_is_bytes(b"foo")
|
||||
.stderr_is("basenc: error: invalid input\n");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_base16() {
|
||||
new_ucmd!()
|
||||
|
|
|
|||
|
|
@ -268,7 +268,7 @@ sed -i -e "s|invalid suffix in --pages argument|invalid --pages argument|" \
|
|||
# When decoding an invalid base32/64 string, gnu writes everything it was able to decode until
|
||||
# it hit the decode error, while we don't write anything if the input is invalid.
|
||||
sed -i "s/\(baddecode.*OUT=>\"\).*\"/\1\"/g" tests/basenc/base64.pl
|
||||
sed -i "s/\(\(b2[ml]_[69]\|b32h_[56]\|z85_8\|z85_35\).*OUT=>\)[^}]*\(.*\)/\1\"\"\3/g" tests/basenc/basenc.pl
|
||||
sed -i "s/\(\(b2[ml]_[69]\|z85_8\|z85_35\).*OUT=>\)[^}]*\(.*\)/\1\"\"\3/g" tests/basenc/basenc.pl
|
||||
|
||||
# add "error: " to the expected error message
|
||||
sed -i "s/\$prog: invalid input/\$prog: error: invalid input/g" tests/basenc/basenc.pl
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue