mirror of
https://github.com/uutils/coreutils.git
synced 2025-07-07 21:45:01 +00:00
od: fix handling of non-ascii chars
This commit is contained in:
parent
fb2399f56b
commit
cad92245f7
2 changed files with 20 additions and 45 deletions
|
@@ -2,7 +2,6 @@
|
|||
//
|
||||
// For the full copyright and license information, please view the LICENSE
|
||||
// file that was distributed with this source code.
|
||||
use std::str::from_utf8;
|
||||
|
||||
use crate::formatteriteminfo::{FormatWriter, FormatterItemInfo};
|
||||
|
||||
|
@@ -51,33 +50,13 @@ fn format_item_c(bytes: &[u8]) -> String {
|
|||
let b = bytes[0];
|
||||
|
||||
if b & 0x80 == 0x00 {
|
||||
// ASCII byte (0xxxxxxx)
|
||||
match C_CHARS.get(b as usize) {
|
||||
Some(s) => format!("{s:>4}"),
|
||||
None => format!("{b:>4}"),
|
||||
}
|
||||
} else if (b & 0xc0) == 0x80 {
|
||||
// second or subsequent octet of an utf-8 sequence
|
||||
String::from(" **")
|
||||
} else if ((b & 0xe0) == 0xc0) && (bytes.len() >= 2) {
|
||||
// start of a 2 octet utf-8 sequence
|
||||
match from_utf8(&bytes[0..2]) {
|
||||
Ok(s) => format!("{s:>4}"),
|
||||
Err(_) => format!(" {b:03o}"),
|
||||
}
|
||||
} else if ((b & 0xf0) == 0xe0) && (bytes.len() >= 3) {
|
||||
// start of a 3 octet utf-8 sequence
|
||||
match from_utf8(&bytes[0..3]) {
|
||||
Ok(s) => format!("{s:>4}"),
|
||||
Err(_) => format!(" {b:03o}"),
|
||||
}
|
||||
} else if ((b & 0xf8) == 0xf0) && (bytes.len() >= 4) {
|
||||
// start of a 4 octet utf-8 sequence
|
||||
match from_utf8(&bytes[0..4]) {
|
||||
Ok(s) => format!("{s:>4}"),
|
||||
Err(_) => format!(" {b:03o}"),
|
||||
}
|
||||
} else {
|
||||
// invalid utf-8
|
||||
// Continuation or leading byte of a multibyte UTF-8 sequence — treat as raw byte
|
||||
format!(" {b:03o}")
|
||||
}
|
||||
}
|
||||
|
@@ -125,27 +104,22 @@ fn test_format_item_c() {
|
|||
assert_eq!(" 177", format_item_c(&[0x7f]));
|
||||
assert_eq!(" A", format_item_c(&[0x41, 0x21]));
|
||||
|
||||
assert_eq!(" **", format_item_c(&[0x80]));
|
||||
assert_eq!(" **", format_item_c(&[0x9f]));
|
||||
assert_eq!(" 200", format_item_c(&[0x80]));
|
||||
assert_eq!(" 237", format_item_c(&[0x9f]));
|
||||
|
||||
assert_eq!(" ß", format_item_c(&[0xc3, 0x9f]));
|
||||
assert_eq!(" ß", format_item_c(&[0xc3, 0x9f, 0x21]));
|
||||
assert_eq!(" 303", format_item_c(&[0xc3, 0x9f]));
|
||||
assert_eq!(" 303", format_item_c(&[0xc3, 0x9f, 0x21]));
|
||||
|
||||
assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80]));
|
||||
assert_eq!(" \u{1000}", format_item_c(&[0xe1, 0x80, 0x80, 0x21]));
|
||||
assert_eq!(" 341", format_item_c(&[0xe1, 0x80, 0x80]));
|
||||
|
||||
assert_eq!(" \u{1f496}", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
|
||||
assert_eq!(
|
||||
" \u{1f496}",
|
||||
format_item_c(&[0xf0, 0x9f, 0x92, 0x96, 0x21])
|
||||
);
|
||||
assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92, 0x96]));
|
||||
|
||||
assert_eq!(" 300", format_item_c(&[0xc0, 0x80])); // invalid utf-8 (UTF-8 null)
|
||||
assert_eq!(" 301", format_item_c(&[0xc1, 0xa1])); // invalid utf-8
|
||||
assert_eq!(" 303", format_item_c(&[0xc3, 0xc3])); // invalid utf-8
|
||||
assert_eq!(" 360", format_item_c(&[0xf0, 0x82, 0x82, 0xac])); // invalid utf-8 (overlong)
|
||||
assert_eq!(" 360", format_item_c(&[0xf0, 0x9f, 0x92])); // invalid utf-8 (missing octet)
|
||||
assert_eq!(" \u{10FFFD}", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD
|
||||
assert_eq!(" 364", format_item_c(&[0xf4, 0x8f, 0xbf, 0xbd])); // largest valid utf-8 // spell-checker:ignore 10FFFD FFFD
|
||||
assert_eq!(" 364", format_item_c(&[0xf4, 0x90, 0x00, 0x00])); // invalid utf-8
|
||||
assert_eq!(" 365", format_item_c(&[0xf5, 0x80, 0x80, 0x80])); // invalid utf-8
|
||||
assert_eq!(" 377", format_item_c(&[0xff])); // invalid utf-8
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue