util(cksum): Fix behavior with --text and --untagged, and prepare hashsum/cksum merge refactor

This commit is contained in:
Dorian Peron 2025-10-26 00:50:35 +02:00
parent 485fd8e207
commit a4f4542467
3 changed files with 304 additions and 118 deletions

View file

@ -15,8 +15,8 @@ use std::path::Path;
use uucore::checksum::{
ALGORITHM_OPTIONS_BLAKE2B, ALGORITHM_OPTIONS_BSD, ALGORITHM_OPTIONS_CRC,
ALGORITHM_OPTIONS_CRC32B, ALGORITHM_OPTIONS_SYSV, ChecksumError, ChecksumOptions,
ChecksumVerbose, SUPPORTED_ALGORITHMS, calculate_blake2b_length, detect_algo, digest_reader,
perform_checksum_validation,
ChecksumVerbose, HashAlgorithm, LEGACY_ALGORITHMS, SUPPORTED_ALGORITHMS,
calculate_blake2b_length, detect_algo, digest_reader, perform_checksum_validation,
};
use uucore::translate;
@ -29,63 +29,189 @@ use uucore::{
sum::Digest,
};
#[derive(Debug, PartialEq)]
enum OutputFormat {
Hexadecimal,
Raw,
Base64,
}
struct Options {
algo_name: &'static str,
digest: Box<dyn Digest + 'static>,
output_bits: usize,
tag: bool, // will cover the --untagged option
length: Option<usize>,
output_format: OutputFormat,
asterisk: bool, // if we display an asterisk or not (--binary/--text)
line_ending: LineEnding,
}
/// Reading mode used to compute digest.
///
/// On most linux systems, this is irrelevant, as there is no distinction
/// between text and binary files. Refer to GNU's cksum documentation for more
/// information.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ReadingMode {
Binary,
Text,
}
impl ReadingMode {
#[inline]
fn as_char(&self) -> char {
match self {
Self::Binary => '*',
Self::Text => ' ',
}
}
}
/// Whether to write the digest as hexadecimal or encoded in base64.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DigestFormat {
Hexadecimal,
Base64,
}
impl DigestFormat {
#[inline]
fn is_base64(&self) -> bool {
*self == Self::Base64
}
}
/// Holds the representation that shall be used for printing a checksum line
#[derive(Debug, PartialEq, Eq)]
enum OutputFormat {
/// Raw digest
Raw,
/// Selected for older algorithms which had their custom formatting
///
/// Default for crc, sysv, bsd
Legacy,
/// `$ALGO_NAME ($FILENAME) = $DIGEST`
Tagged(DigestFormat),
/// '$DIGEST $FLAG$FILENAME'
/// where 'flag' depends on the reading mode
///
/// Default for standalone checksum utilities
Untagged(DigestFormat, ReadingMode),
}
impl OutputFormat {
#[inline]
fn is_raw(&self) -> bool {
*self == Self::Raw
}
}
fn print_legacy_checksum(
options: &Options,
filename: &OsStr,
sum: &str,
size: usize,
) -> UResult<()> {
debug_assert!(LEGACY_ALGORITHMS.contains(&options.algo_name));
// Print the sum
match options.algo_name {
ALGORITHM_OPTIONS_SYSV => print!(
"{} {}",
sum.parse::<u16>().unwrap(),
size.div_ceil(options.output_bits),
),
ALGORITHM_OPTIONS_BSD => {
// The BSD checksum output is 5 digit integer
let bsd_width = 5;
print!(
"{:0bsd_width$} {:bsd_width$}",
sum.parse::<u16>().unwrap(),
size.div_ceil(options.output_bits),
);
}
ALGORITHM_OPTIONS_CRC | ALGORITHM_OPTIONS_CRC32B => {
print!("{sum} {size}");
}
_ => unreachable!("Not a legacy algorithm"),
};
// Print the filename after a space if not stdin
if filename != "-" {
print!(" ");
let _dropped_result = stdout().write_all(os_str_as_bytes(filename)?);
}
Ok(())
}
fn print_tagged_checksum(options: &Options, filename: &OsStr, sum: &String) -> UResult<()> {
// Print algo name and opening parenthesis.
print!(
"{} (",
match (options.algo_name, options.length) {
// Multiply the length by 8, as we want to print the length in bits.
(ALGORITHM_OPTIONS_BLAKE2B, Some(l)) => format!("BLAKE2b-{}", l * 8),
(ALGORITHM_OPTIONS_BLAKE2B, None) => "BLAKE2b".into(),
(name, _) => name.to_ascii_uppercase(),
}
);
// Print filename
let _dropped_result = stdout().write_all(os_str_as_bytes(filename)?);
// Print closing parenthesis and sum
print!(") = {sum}");
Ok(())
}
fn print_untagged_checksum(
filename: &OsStr,
sum: &String,
reading_mode: ReadingMode,
) -> UResult<()> {
// Print checksum and reading mode flag
print!("{sum} {}", reading_mode.as_char());
// Print filename
let _dropped_result = stdout().write_all(os_str_as_bytes(filename)?);
Ok(())
}
/// Calculate checksum
///
/// # Arguments
///
/// * `options` - CLI options for the assigning checksum algorithm
/// * `files` - A iterator of [`OsStr`] which is a bunch of files that are using for calculating checksum
#[allow(clippy::cognitive_complexity)]
fn cksum<'a, I>(mut options: Options, files: I) -> UResult<()>
where
I: Iterator<Item = &'a OsStr>,
{
let files: Vec<_> = files.collect();
if options.output_format == OutputFormat::Raw && files.len() > 1 {
if options.output_format.is_raw() && files.len() > 1 {
return Err(Box::new(ChecksumError::RawMultipleFiles));
}
for filename in files {
let filename = Path::new(filename);
let filepath = Path::new(filename);
let stdin_buf;
let file_buf;
let is_stdin = filename == OsStr::new("-");
if filename.is_dir() {
if filepath.is_dir() {
show!(USimpleError::new(
1,
translate!("cksum-error-is-directory", "file" => filename.display())
translate!("cksum-error-is-directory", "file" => filepath.display())
));
continue;
}
// Handle the file input
let mut file = BufReader::new(if is_stdin {
let mut file = BufReader::new(if filename == "-" {
stdin_buf = stdin();
Box::new(stdin_buf) as Box<dyn Read>
} else {
file_buf = match File::open(filename) {
file_buf = match File::open(filepath) {
Ok(file) => file,
Err(err) => {
show!(err.map_err_context(|| filename.to_string_lossy().to_string()));
show!(err.map_err_context(|| filepath.to_string_lossy().to_string()));
continue;
}
};
@ -96,7 +222,16 @@ where
digest_reader(&mut options.digest, &mut file, false, options.output_bits)
.map_err_context(|| translate!("cksum-error-failed-to-read-input"))?;
let sum = match options.output_format {
// Encodes the sum if df is Base64, leaves as-is otherwise.
let encode_sum = |sum: String, df: DigestFormat| {
if df.is_base64() {
encoding::for_cksum::BASE64.encode(&hex::decode(sum).unwrap())
} else {
sum
}
};
match options.output_format {
OutputFormat::Raw => {
let bytes = match options.algo_name {
ALGORITHM_OPTIONS_CRC => sum_hex.parse::<u32>().unwrap().to_be_bytes().to_vec(),
@ -109,77 +244,22 @@ where
stdout().write_all(&bytes)?;
return Ok(());
}
OutputFormat::Hexadecimal => sum_hex,
OutputFormat::Base64 => match options.algo_name {
ALGORITHM_OPTIONS_CRC
| ALGORITHM_OPTIONS_CRC32B
| ALGORITHM_OPTIONS_SYSV
| ALGORITHM_OPTIONS_BSD => sum_hex,
_ => encoding::for_cksum::BASE64.encode(&hex::decode(sum_hex).unwrap()),
},
};
// The BSD checksum output is 5 digit integer
let bsd_width = 5;
let (before_filename, should_print_filename, after_filename) = match options.algo_name {
ALGORITHM_OPTIONS_SYSV => (
format!(
"{} {}{}",
sum.parse::<u16>().unwrap(),
sz.div_ceil(options.output_bits),
if is_stdin { "" } else { " " }
),
!is_stdin,
String::new(),
),
ALGORITHM_OPTIONS_BSD => (
format!(
"{:0bsd_width$} {:bsd_width$}{}",
sum.parse::<u16>().unwrap(),
sz.div_ceil(options.output_bits),
if is_stdin { "" } else { " " }
),
!is_stdin,
String::new(),
),
ALGORITHM_OPTIONS_CRC | ALGORITHM_OPTIONS_CRC32B => (
format!("{sum} {sz}{}", if is_stdin { "" } else { " " }),
!is_stdin,
String::new(),
),
ALGORITHM_OPTIONS_BLAKE2B if options.tag => {
(
if let Some(length) = options.length {
// Multiply by 8 here, as we want to print the length in bits.
format!("BLAKE2b-{} (", length * 8)
} else {
"BLAKE2b (".to_owned()
},
true,
format!(") = {sum}"),
)
OutputFormat::Legacy => {
print_legacy_checksum(&options, filename, &sum_hex, sz)?;
}
_ => {
if options.tag {
(
format!("{} (", options.algo_name.to_ascii_uppercase()),
true,
format!(") = {sum}"),
)
} else {
let prefix = if options.asterisk { "*" } else { " " };
(format!("{sum} {prefix}"), true, String::new())
}
OutputFormat::Tagged(digest_format) => {
print_tagged_checksum(&options, filename, &encode_sum(sum_hex, digest_format))?;
}
OutputFormat::Untagged(digest_format, reading_mode) => {
print_untagged_checksum(
filename,
&encode_sum(sum_hex, digest_format),
reading_mode,
)?;
}
};
print!("{before_filename}");
if should_print_filename {
// The filename might not be valid UTF-8, and filename.display() would mangle the names.
// Therefore, emit the bytes directly to stdout, without any attempt at encoding them.
let _dropped_result = stdout().write_all(os_str_as_bytes(filename.as_os_str())?);
}
print!("{after_filename}{}", options.line_ending);
print!("{}", options.line_ending);
}
Ok(())
}
@ -203,31 +283,83 @@ mod options {
pub const ZERO: &str = "zero";
}
/// cksum has a bunch of legacy behavior.
/// We handle this in this function to make sure they are self contained
/// and "easier" to understand
/// cksum has a bunch of legacy behavior. We handle this in this function to
/// make sure they are self contained and "easier" to understand.
///
/// Returns a pair of boolean. The first one indicates if we should use tagged
/// output format, the second one indicates if we should use the binary flag in
/// the untagged case.
fn handle_tag_text_binary_flags<S: AsRef<OsStr>>(
args: impl Iterator<Item = S>,
) -> UResult<(bool, bool)> {
let mut tag = true;
let mut binary = false;
let mut text = false;
// --binary, --tag and --untagged are tight together: none of them
// conflicts with each other but --tag will reset "binary" and set "tag".
// conflicts with each other but --tag will reset "binary" and "text" and
// set "tag".
for arg in args {
let arg = arg.as_ref();
if arg == "-b" || arg == "--binary" {
text = false;
binary = true;
} else if arg == "--text" {
text = true;
binary = false;
} else if arg == "--tag" {
tag = true;
binary = false;
text = false;
} else if arg == "--untagged" {
tag = false;
}
}
Ok((tag, !tag && binary))
// Specifying --text without ever mentioning --untagged fails.
if text && tag {
return Err(ChecksumError::TextWithoutUntagged.into());
}
Ok((tag, binary))
}
/// Use already-processed arguments to decide the output format.
fn figure_out_output_format(
algo: &HashAlgorithm,
tag: bool,
binary: bool,
raw: bool,
base64: bool,
) -> OutputFormat {
// Raw output format takes precedence over anything else.
if raw {
return OutputFormat::Raw;
}
// Then, if the algo is legacy, takes precedence over the rest
if LEGACY_ALGORITHMS.contains(&algo.name) {
return OutputFormat::Legacy;
}
let digest_format = if base64 {
DigestFormat::Base64
} else {
DigestFormat::Hexadecimal
};
// After that, decide between tagged and untagged output
if tag {
OutputFormat::Tagged(digest_format)
} else {
let reading_mode = if binary {
ReadingMode::Binary
} else {
ReadingMode::Text
};
OutputFormat::Untagged(digest_format, reading_mode)
}
}
#[uucore::main]
@ -261,7 +393,7 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
None => None,
};
if ["bsd", "crc", "sysv", "crc32b"].contains(&algo_name) && check {
if LEGACY_ALGORITHMS.contains(&algo_name) && check {
return Err(ChecksumError::AlgorithmNotSupportedWithCheck.into());
}
@ -306,27 +438,25 @@ pub fn uumain(args: impl uucore::Args) -> UResult<()> {
return perform_checksum_validation(files, algo_option, length, opts);
}
let (tag, asterisk) = handle_tag_text_binary_flags(std::env::args_os())?;
let (tag, binary) = handle_tag_text_binary_flags(std::env::args_os())?;
let algo = detect_algo(algo_name, length)?;
let line_ending = LineEnding::from_zero_flag(matches.get_flag(options::ZERO));
let output_format = if matches.get_flag(options::RAW) {
OutputFormat::Raw
} else if matches.get_flag(options::BASE64) {
OutputFormat::Base64
} else {
OutputFormat::Hexadecimal
};
let output_format = figure_out_output_format(
&algo,
tag,
binary,
matches.get_flag(options::RAW),
matches.get_flag(options::BASE64),
);
let opts = Options {
algo_name: algo.name,
digest: (algo.create_fn)(),
output_bits: algo.bits,
length,
tag,
output_format,
asterisk,
line_ending,
};

View file

@ -66,6 +66,13 @@ pub const SUPPORTED_ALGORITHMS: [&str; 16] = [
ALGORITHM_OPTIONS_SHAKE256,
];
pub const LEGACY_ALGORITHMS: [&str; 4] = [
ALGORITHM_OPTIONS_SYSV,
ALGORITHM_OPTIONS_BSD,
ALGORITHM_OPTIONS_CRC,
ALGORITHM_OPTIONS_CRC32B,
];
pub struct HashAlgorithm {
pub name: &'static str,
pub create_fn: Box<dyn Fn() -> Box<dyn Digest + 'static>>,
@ -224,6 +231,8 @@ pub enum ChecksumError {
LengthOnlyForBlake2b,
#[error("the --binary and --text options are meaningless when verifying checksums")]
BinaryTextConflict,
#[error("--text mode is only supported with --untagged")]
TextWithoutUntagged,
#[error("--check is not supported with --algorithm={{bsd,sysv,crc,crc32b}}")]
AlgorithmNotSupportedWithCheck,
#[error("You cannot combine multiple hash algorithms!")]

View file

@ -615,20 +615,67 @@ fn test_reset_binary_but_set() {
.stdout_contains("d41d8cd98f00b204e9800998ecf8427e *");
}
#[test]
fn test_text_tag() {
let scene = TestScenario::new(util_name!());
let at = &scene.fixtures;
/// Test legacy behaviors with --tag, --untagged, --binary and --text
mod output_format {
use super::*;
at.touch("f");
#[test]
fn test_text_tag() {
let (at, mut ucmd) = at_and_ucmd!();
at.touch("f");
scene
.ucmd()
.arg("--text") // should disappear because of the following option
.arg("--tag")
.arg(at.subdir.join("f"))
.succeeds()
.stdout_contains("4294967295 0 ");
ucmd.arg("--text") // should disappear because of the following option
.arg("--tag")
.args(&["-a", "md5"])
.arg(at.subdir.join("f"))
.succeeds()
// Tagged output is used
.stdout_contains("f) = d41d8cd98f00b204e9800998ecf8427e");
}
#[test]
fn test_text_no_untagged() {
let (at, mut ucmd) = at_and_ucmd!();
at.touch("f");
// --text without --untagged fails
ucmd.arg("--text")
.args(&["-a", "md5"])
.arg(at.subdir.join("f"))
.fails_with_code(1)
.stderr_contains("--text mode is only supported with --untagged");
}
#[test]
fn test_text_binary() {
let (at, mut ucmd) = at_and_ucmd!();
at.touch("f");
// --binary overwrites --text, thus no error is raised
ucmd.arg("--text")
.arg("--binary")
.args(&["-a", "md5"])
.arg(at.subdir.join("f"))
.succeeds()
// No --untagged, tagged output is used
.stdout_contains("f) = d41d8cd98f00b204e9800998ecf8427e");
}
#[test]
fn test_text_binary_untagged() {
let (at, mut ucmd) = at_and_ucmd!();
at.touch("f");
// --binary overwrites --text
ucmd.arg("--text")
.arg("--binary")
.arg("--untagged")
.args(&["-a", "md5"])
.arg(at.subdir.join("f"))
.succeeds()
// Untagged output is used
.stdout_contains("d41d8cd98f00b204e9800998ecf8427e *");
}
}
#[test]