basenc: stream base32/base64 I/O to honor bounded-memory test

GNU basenc bounded-memory failed because the Rust impl buffered entire input
and exceeded the vmem limit. Stream base32/base64 via BufReader and chunked
encode/decode so the working set stays around 8 KiB. Keep base58 buffered to
preserve its big-integer semantics. Flush already-decoded bytes before
returning errors to match GNU output.
This commit is contained in:
mattsu 2025-12-01 19:42:39 +09:00
parent aaa061052f
commit f1f4973cd6

View file

@ -8,7 +8,7 @@
use clap::{Arg, ArgAction, Command};
use std::ffi::OsString;
use std::fs::File;
use std::io::{self, ErrorKind, Read, Seek, Write};
use std::io::{self, BufReader, ErrorKind, Read, Write};
use std::path::{Path, PathBuf};
use uucore::display::Quotable;
use uucore::encoding::{
@ -28,6 +28,8 @@ pub const BASE_CMD_PARSE_ERROR: i32 = 1;
///
/// This default is only used if no "-w"/"--wrap" argument is passed
pub const WRAP_DEFAULT: usize = 76;
// Fixed to 8 KiB (equivalent to std::io::DEFAULT_BUF_SIZE on most targets)
pub const DEFAULT_BUFFER_SIZE: usize = 8 * 1024;
pub struct Config {
pub decode: bool,
@ -149,64 +151,63 @@ pub fn base_app(about: &'static str, usage: &str) -> Command {
)
}
/// A trait alias for types that implement both `Read` and `Seek`.
pub trait ReadSeek: Read + Seek {}
/// Automatically implement the `ReadSeek` trait for any type that implements both `Read` and `Seek`.
impl<T: Read + Seek> ReadSeek for T {}
pub fn get_input(config: &Config) -> UResult<Box<dyn ReadSeek>> {
pub fn get_input(config: &Config) -> UResult<Box<dyn Read>> {
match &config.to_read {
Some(path_buf) => {
// Do not buffer input, because buffering is handled by `fast_decode` and `fast_encode`
let file =
File::open(path_buf).map_err_context(|| path_buf.maybe_quote().to_string())?;
Ok(Box::new(file))
Ok(Box::new(BufReader::new(file)))
}
None => {
let mut buffer = Vec::new();
io::stdin().read_to_end(&mut buffer)?;
Ok(Box::new(io::Cursor::new(buffer)))
// Stdin is already buffered by the OS; wrap once more to reduce syscalls per read.
Ok(Box::new(BufReader::new(io::stdin())))
}
}
}
/// Determines if the input buffer contains any padding ('=') ignoring trailing whitespace.
fn read_and_has_padding<R: Read>(input: &mut R) -> UResult<(bool, Vec<u8>)> {
let mut buf = Vec::new();
input
.read_to_end(&mut buf)
.map_err(|err| USimpleError::new(1, format_read_error(err.kind())))?;
// Treat the stream as padded if any '=' exists (GNU coreutils continues decoding
// even when padding bytes are followed by more data).
let has_padding = buf.contains(&b'=');
Ok((has_padding, buf))
}
pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Config) -> UResult<()> {
let (has_padding, read) = read_and_has_padding(input)?;
pub fn handle_input<R: Read>(input: &mut R, format: Format, config: Config) -> UResult<()> {
// Always allow padding for Base64 to avoid a full pre-scan of the input.
let supports_fast_decode_and_encode =
get_supports_fast_decode_and_encode(format, config.decode, has_padding);
get_supports_fast_decode_and_encode(format, config.decode, true);
let supports_fast_decode_and_encode_ref = supports_fast_decode_and_encode.as_ref();
let mut stdout_lock = io::stdout().lock();
let result = if config.decode {
fast_decode::fast_decode(
read,
let result = match (format, config.decode) {
// Base58 must process the entire input as one big integer; keep the
// historical behaviour of buffering everything for this format only.
(Format::Base58, _) => {
let mut buffered = Vec::new();
input
.read_to_end(&mut buffered)
.map_err(|err| USimpleError::new(1, format_read_error(err.kind())))?;
if config.decode {
fast_decode::fast_decode_buffer(
buffered,
&mut stdout_lock,
supports_fast_decode_and_encode_ref,
config.ignore_garbage,
)
} else {
fast_encode::fast_encode_buffer(
buffered,
&mut stdout_lock,
supports_fast_decode_and_encode_ref,
config.wrap_cols,
)
}
}
// Streaming path for all other encodings keeps memory bounded.
(_, true) => fast_decode::fast_decode_stream(
input,
&mut stdout_lock,
supports_fast_decode_and_encode_ref,
config.ignore_garbage,
)
} else {
fast_encode::fast_encode(
read,
),
(_, false) => fast_encode::fast_encode_stream(
input,
&mut stdout_lock,
supports_fast_decode_and_encode_ref,
config.wrap_cols,
)
),
};
// Ensure any pending stdout buffer is flushed even if decoding failed; GNU basenc
@ -296,14 +297,17 @@ pub fn get_supports_fast_decode_and_encode(
}
pub mod fast_encode {
use crate::base_common::WRAP_DEFAULT;
use crate::base_common::{DEFAULT_BUFFER_SIZE, WRAP_DEFAULT};
use std::{
cmp::min,
collections::VecDeque,
io::{self, Write},
io::{self, Read, Write},
num::NonZeroUsize,
};
use uucore::{encoding::SupportsFastDecodeAndEncode, error::UResult};
use uucore::{
encoding::SupportsFastDecodeAndEncode,
error::{UResult, USimpleError},
};
struct LineWrapping {
line_length: NonZeroUsize,
@ -405,7 +409,7 @@ pub mod fast_encode {
}
// End of helper functions
pub fn fast_encode(
pub fn fast_encode_buffer(
input: Vec<u8>,
output: &mut dyn Write,
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
@ -506,10 +510,90 @@ pub mod fast_encode {
}
Ok(())
}
pub fn fast_encode_stream(
input: &mut dyn Read,
output: &mut dyn Write,
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
wrap: Option<usize>,
) -> UResult<()> {
const ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024;
let encode_in_chunks_of_size =
supports_fast_decode_and_encode.unpadded_multiple() * ENCODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
assert!(encode_in_chunks_of_size > 0);
let mut line_wrapping = match wrap {
Some(0) => None,
Some(an) => Some(LineWrapping {
line_length: NonZeroUsize::new(an).unwrap(),
print_buffer: Vec::<u8>::new(),
}),
None => Some(LineWrapping {
line_length: NonZeroUsize::new(WRAP_DEFAULT).unwrap(),
print_buffer: Vec::<u8>::new(),
}),
};
// Buffers
let mut leftover_buffer = VecDeque::<u8>::new();
let mut encoded_buffer = VecDeque::<u8>::new();
let mut read_buffer = vec![0u8; encode_in_chunks_of_size.max(DEFAULT_BUFFER_SIZE)];
loop {
let read = input
.read(&mut read_buffer)
.map_err(|err| USimpleError::new(1, super::format_read_error(err.kind())))?;
if read == 0 {
break;
}
leftover_buffer.extend(&read_buffer[..read]);
while leftover_buffer.len() >= encode_in_chunks_of_size {
{
let contiguous = leftover_buffer.make_contiguous();
encode_in_chunks_to_buffer(
supports_fast_decode_and_encode,
&contiguous[..encode_in_chunks_of_size],
&mut encoded_buffer,
)?;
}
// Drop the data we just encoded
leftover_buffer.drain(..encode_in_chunks_of_size);
write_to_output(
&mut line_wrapping,
&mut encoded_buffer,
output,
false,
wrap == Some(0),
)?;
}
}
// Encode any remaining bytes and flush
supports_fast_decode_and_encode
.encode_to_vec_deque(leftover_buffer.make_contiguous(), &mut encoded_buffer)?;
write_to_output(
&mut line_wrapping,
&mut encoded_buffer,
output,
true,
wrap == Some(0),
)?;
Ok(())
}
}
pub mod fast_decode {
use std::io::{self, Write};
use crate::base_common::DEFAULT_BUFFER_SIZE;
use std::io::{self, Read, Write};
use uucore::{
encoding::SupportsFastDecodeAndEncode,
error::{UResult, USimpleError},
@ -579,7 +663,7 @@ pub mod fast_decode {
}
// End of helper functions
pub fn fast_decode(
pub fn fast_decode_buffer(
input: Vec<u8>,
output: &mut dyn Write,
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
@ -671,6 +755,123 @@ pub mod fast_decode {
Ok(())
}
pub fn fast_decode_stream(
input: &mut dyn Read,
output: &mut dyn Write,
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
ignore_garbage: bool,
) -> UResult<()> {
const DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024;
let alphabet = supports_fast_decode_and_encode.alphabet();
let alphabet_table = alphabet_lookup(alphabet);
let valid_multiple = supports_fast_decode_and_encode.valid_decoding_multiple();
let decode_in_chunks_of_size = valid_multiple * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
assert!(decode_in_chunks_of_size > 0);
assert!(valid_multiple > 0);
let supports_partial_decode = supports_fast_decode_and_encode.supports_partial_decode();
let mut buffer = Vec::with_capacity(decode_in_chunks_of_size);
let mut decoded_buffer = Vec::<u8>::new();
let mut read_buffer = [0u8; DEFAULT_BUFFER_SIZE];
loop {
let read = input
.read(&mut read_buffer)
.map_err(|err| USimpleError::new(1, super::format_read_error(err.kind())))?;
if read == 0 {
break;
}
for &byte in &read_buffer[..read] {
if byte == b'\n' || byte == b'\r' {
continue;
}
if alphabet_table[usize::from(byte)] {
buffer.push(byte);
} else if ignore_garbage {
continue;
} else {
if supports_partial_decode {
flush_ready_chunks(
&mut buffer,
decode_in_chunks_of_size,
valid_multiple,
supports_fast_decode_and_encode,
&mut decoded_buffer,
output,
)?;
} else {
while buffer.len() >= decode_in_chunks_of_size {
decode_in_chunks_to_buffer(
supports_fast_decode_and_encode,
&buffer[..decode_in_chunks_of_size],
&mut decoded_buffer,
)?;
write_to_output(&mut decoded_buffer, output)?;
buffer.drain(..decode_in_chunks_of_size);
}
}
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
}
if supports_partial_decode {
flush_ready_chunks(
&mut buffer,
decode_in_chunks_of_size,
valid_multiple,
supports_fast_decode_and_encode,
&mut decoded_buffer,
output,
)?;
} else if buffer.len() == decode_in_chunks_of_size {
decode_in_chunks_to_buffer(
supports_fast_decode_and_encode,
&buffer,
&mut decoded_buffer,
)?;
write_to_output(&mut decoded_buffer, output)?;
buffer.clear();
}
}
}
if supports_partial_decode {
flush_ready_chunks(
&mut buffer,
decode_in_chunks_of_size,
valid_multiple,
supports_fast_decode_and_encode,
&mut decoded_buffer,
output,
)?;
}
if !buffer.is_empty() {
let mut owned_chunk: Option<Vec<u8>> = None;
let mut had_invalid_tail = false;
if let Some(pad_result) = supports_fast_decode_and_encode.pad_remainder(&buffer) {
had_invalid_tail = pad_result.had_invalid_tail;
owned_chunk = Some(pad_result.chunk);
}
let final_chunk = owned_chunk.as_deref().unwrap_or(&buffer);
supports_fast_decode_and_encode.decode_into_vec(final_chunk, &mut decoded_buffer)?;
write_to_output(&mut decoded_buffer, output)?;
if had_invalid_tail {
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
}
}
Ok(())
}
}
fn format_read_error(kind: ErrorKind) -> String {
@ -692,6 +893,21 @@ fn format_read_error(kind: ErrorKind) -> String {
translate!("base-common-read-error", "error" => kind_string_capitalized)
}
/// Determines if the input buffer contains any padding ('=') ignoring trailing whitespace.
#[cfg(test)]
fn read_and_has_padding<R: Read>(input: &mut R) -> UResult<(bool, Vec<u8>)> {
let mut buf = Vec::new();
input
.read_to_end(&mut buf)
.map_err(|err| USimpleError::new(1, format_read_error(err.kind())))?;
// Treat the stream as padded if any '=' exists (GNU coreutils continues decoding
// even when padding bytes are followed by more data).
let has_padding = buf.contains(&b'=');
Ok((has_padding, buf))
}
#[cfg(test)]
mod tests {
use crate::base_common::read_and_has_padding;