od: fix handling of non-ascii chars

This commit is contained in:
Tomasz Guz 2025-06-25 17:14:56 +02:00
parent fb2399f56b
commit cad92245f7
2 changed files with 20 additions and 45 deletions

View file

@ -2,7 +2,6 @@
//
// For the full copyright and license information, please view the LICENSE
// file that was distributed with this source code.
use std::str::from_utf8;
use crate::formatteriteminfo::{FormatWriter, FormatterItemInfo};
@ -51,33 +50,13 @@ fn format_item_c(bytes: &[u8]) -> String {
let b = bytes[0];
if b & 0x80 == 0x00 {
// ASCII byte (0xxxxxxx)
match C_CHARS.get(b as usize) {
Some(s) => format!("{s:>4}"),
None => format!("{b:>4}"),
}
} else if (b & 0xc0) == 0x80 {
// second or subsequent octet of an utf-8 sequence
String::from(" **")
} else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) {
// start of a 2 octet utf-8 sequence
match from_utf8(&bytes[0..2]) {
Ok(s) => format!("{s:>4}"),
Err(_) => format!(" {b:03o}"),
}
} else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) {
// start of a 3 octet utf-8 sequence
match from_utf8(&bytes[0..3]) {
Ok(s) => format!("{s:>4}"),
Err(_) => format!(" {b:03o}"),
}
} else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) {
// start of a 4 octet utf-8 sequence
match from_utf8(&bytes[0..4]) {
Ok(s) => format!("{s:>4}"),
Err(_) => format!(" {b:03o}"),
}
} else {
// invalid utf-8
// Any non-ASCII byte (0x80..=0xff) — a UTF-8 continuation byte, a lead byte, or a byte that is never valid in UTF-8 — is printed as a raw three-digit octal value
format!(" {b:03o}")
}
}
@ -125,27 +104,22 @@ fn test_format_item_c() {
assert_eq!(" 177", format_item_c(&[0x7f]));
assert_eq!(" A", format_item_c(&[0x41, 0x21]));
assert_eq!(" **", format_item_c(&[0x80]));
assert_eq!(" **", format_item_c(&[0x9f]));
assert_eq!(" 200", format_item_c(&[0x80]));
assert_eq!(" 237", format_item_c(&[0x9f]));
assert_eq!(" ß", format_item_c(&[0xc3, 0x9f]));
assert_eq!(" ß", format_item_c(&[0xc3, 0x9f, 0x21]));
assert_eq!(" 303", format_item_c(&[0xc3, 0x9f]));
assert_eq!(" 303", format_item_c(&[0xc3, 0x9f, 0x21]));
assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80]));
assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21]));
assert_eq!(" 341", format_item_c(&[0xe1, 0x80, 0x80]));
assert_eq!(" \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
assert_eq!(
" \u{1f496}",
format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21])
);
assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (UTF-8 null)
assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8
assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8
assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong)
assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet)
assert_eq!(" \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD
assert_eq!(" 364", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD
assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8
assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8
assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8

View file

@ -279,18 +279,19 @@ fn test_f64() {
#[test]
fn test_multibyte() {
let input = "’‐ˆ‘˜語🙂✅🐶𝛑Universität Tübingen \u{1B000}"; // spell-checker:disable-line
new_ucmd!()
.arg("-c")
.arg("-w12")
.run_piped_stdin("Universität Tübingen \u{1B000}".as_bytes()) // spell-checker:disable-line
.args(&["-t", "c"])
.run_piped_stdin(input.as_bytes())
.success()
.no_stderr()
.stdout_is(unindent(
"
0000000 U n i v e r s i t ä ** t
0000014 T ü ** b i n g e n \u{1B000}
0000030 ** ** **
0000033
r"
0000000 342 200 231 342 200 220 313 206 342 200 230 313 234 350 252 236
0000020 360 237 231 202 342 234 205 360 237 220 266 360 235 233 221   U
0000040   n   i   v   e   r   s   i   t 303 244   t       T 303 274   b
0000060   i   n   g   e   n     360 233 200 200
0000072
",
));
}
@ -714,10 +715,10 @@ fn test_ascii_dump() {
r"
0000000 00 01 0a 0d 10 1f 20 61 62 63 7d 7e 7f 80 90 a0 >...... abc}~....<
nul soh nl cr dle us sp a b c } ~ del nul dle sp
\0 001 \n \r 020 037 a b c } ~ 177 ** ** ** >...... abc}~....<
\0 001 \n \r 020 037 a b c } ~ 177 200 220 240 >...... abc}~....<
0000020 b0 c0 d0 e0 f0 ff >......<
0 @ P ` p del
** 300 320 340 360 377 >......<
260 300 320 340 360 377 >......<
0000026
",
));