From 101cd0dda6e6fe06202aaec580cb60ca878664f9 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Mon, 22 Dec 2025 14:15:40 -0500 Subject: [PATCH] shared: remove escaping and UTF-8 routines from `shared` module With the error refactor, these are no longer used. Namely, while switching to structured errors, I took that opportunity to slim down errors so that we are not repeating parts of the input as often. --- crates/jiff-static/src/shared/util/escape.rs | 1 + crates/jiff-static/src/shared/util/mod.rs | 2 - crates/jiff-static/src/shared/util/utf8.rs | 1 + src/error/fmt/offset.rs | 2 +- src/fmt/strtime/format.rs | 2 +- src/shared/util/escape.rs | 122 ------------------- src/shared/util/mod.rs | 2 - src/shared/util/utf8.rs | 89 -------------- src/util/escape.rs | 121 +++++++++++++++++- src/util/utf8.rs | 90 ++++++++++++++ 10 files changed, 210 insertions(+), 222 deletions(-) delete mode 100644 src/shared/util/escape.rs delete mode 100644 src/shared/util/utf8.rs diff --git a/crates/jiff-static/src/shared/util/escape.rs b/crates/jiff-static/src/shared/util/escape.rs index 5e3b8b3..e5b182b 100644 --- a/crates/jiff-static/src/shared/util/escape.rs +++ b/crates/jiff-static/src/shared/util/escape.rs @@ -122,3 +122,4 @@ impl core::fmt::Debug for RepeatByte { Ok(()) } } + diff --git a/crates/jiff-static/src/shared/util/mod.rs b/crates/jiff-static/src/shared/util/mod.rs index 98ff457..3630e73 100644 --- a/crates/jiff-static/src/shared/util/mod.rs +++ b/crates/jiff-static/src/shared/util/mod.rs @@ -1,6 +1,4 @@ // auto-generated by: jiff-cli generate shared pub(crate) mod array_str; -pub(crate) mod escape; pub(crate) mod itime; -pub(crate) mod utf8; diff --git a/crates/jiff-static/src/shared/util/utf8.rs b/crates/jiff-static/src/shared/util/utf8.rs index 585c0e7..1738a20 100644 --- a/crates/jiff-static/src/shared/util/utf8.rs +++ b/crates/jiff-static/src/shared/util/utf8.rs @@ -89,3 +89,4 @@ pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> { // yield at least one Unicode scalar 
value. Some(Ok(string.chars().next().unwrap())) } + diff --git a/src/error/fmt/offset.rs b/src/error/fmt/offset.rs index 459133f..af81804 100644 --- a/src/error/fmt/offset.rs +++ b/src/error/fmt/offset.rs @@ -1,4 +1,4 @@ -use crate::{error, shared::util::escape}; +use crate::{error, util::escape}; #[derive(Clone, Debug)] pub(crate) enum Error { diff --git a/src/fmt/strtime/format.rs b/src/fmt/strtime/format.rs index 65de289..ab7c981 100644 --- a/src/fmt/strtime/format.rs +++ b/src/fmt/strtime/format.rs @@ -12,8 +12,8 @@ use crate::{ util::{DecimalFormatter, FractionalFormatter}, Write, WriteExt, }, - shared::util::utf8, tz::Offset, + util::utf8, Error, }; diff --git a/src/shared/util/escape.rs b/src/shared/util/escape.rs deleted file mode 100644 index dd0736d..0000000 --- a/src/shared/util/escape.rs +++ /dev/null @@ -1,122 +0,0 @@ -/*! -Provides convenience routines for escaping raw bytes. - -This was copied from `regex-automata` with a few light edits. -*/ - -use super::utf8; - -/// Provides a convenient `Debug` implementation for a `u8`. -/// -/// The `Debug` impl treats the byte as an ASCII, and emits a human -/// readable representation of it. If the byte isn't ASCII, then it's -/// emitted as a hex escape sequence. -#[derive(Clone, Copy)] -pub(crate) struct Byte(pub u8); - -impl core::fmt::Display for Byte { - #[inline(never)] - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - if self.0 == b' ' { - return write!(f, " "); - } - // 10 bytes is enough for any output from ascii::escape_default. 
- let mut bytes = [0u8; 10]; - let mut len = 0; - for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { - // capitalize \xab to \xAB - if i >= 2 && b'a' <= b && b <= b'f' { - b -= 32; - } - bytes[len] = b; - len += 1; - } - write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) - } -} - -impl core::fmt::Debug for Byte { - #[inline(never)] - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - write!(f, "\"")?; - core::fmt::Display::fmt(self, f)?; - write!(f, "\"")?; - Ok(()) - } -} - -/// Provides a convenient `Debug` implementation for `&[u8]`. -/// -/// This generally works best when the bytes are presumed to be mostly -/// UTF-8, but will work for anything. For any bytes that aren't UTF-8, -/// they are emitted as hex escape sequences. -#[derive(Clone, Copy)] -pub(crate) struct Bytes<'a>(pub &'a [u8]); - -impl<'a> core::fmt::Display for Bytes<'a> { - #[inline(never)] - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - // This is a sad re-implementation of a similar impl found in bstr. - let mut bytes = self.0; - while let Some(result) = utf8::decode(bytes) { - let ch = match result { - Ok(ch) => ch, - Err(err) => { - // The decode API guarantees `errant_bytes` is non-empty. - write!(f, r"\x{:02x}", err.as_slice()[0])?; - bytes = &bytes[1..]; - continue; - } - }; - bytes = &bytes[ch.len_utf8()..]; - match ch { - '\0' => write!(f, "\\0")?, - '\x01'..='\x7f' => { - write!(f, "{}", (ch as u8).escape_ascii())?; - } - _ => write!(f, "{}", ch.escape_debug())?, - } - } - Ok(()) - } -} - -impl<'a> core::fmt::Debug for Bytes<'a> { - #[inline(never)] - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - write!(f, "\"")?; - core::fmt::Display::fmt(self, f)?; - write!(f, "\"")?; - Ok(()) - } -} - -/// A helper for repeating a single byte utilizing `Byte`. -/// -/// This is limited to repeating a byte up to `u8::MAX` times in order -/// to reduce its size overhead. 
And in practice, Jiff just doesn't -/// need more than this (at time of writing, 2025-11-29). -pub(crate) struct RepeatByte { - pub(crate) byte: u8, - pub(crate) count: u8, -} - -impl core::fmt::Display for RepeatByte { - #[inline(never)] - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - for _ in 0..self.count { - write!(f, "{}", Byte(self.byte))?; - } - Ok(()) - } -} - -impl core::fmt::Debug for RepeatByte { - #[inline(never)] - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - write!(f, "\"")?; - core::fmt::Display::fmt(self, f)?; - write!(f, "\"")?; - Ok(()) - } -} diff --git a/src/shared/util/mod.rs b/src/shared/util/mod.rs index 812a7f4..971f365 100644 --- a/src/shared/util/mod.rs +++ b/src/shared/util/mod.rs @@ -1,4 +1,2 @@ pub(crate) mod array_str; -pub(crate) mod escape; pub(crate) mod itime; -pub(crate) mod utf8; diff --git a/src/shared/util/utf8.rs b/src/shared/util/utf8.rs deleted file mode 100644 index 33d920c..0000000 --- a/src/shared/util/utf8.rs +++ /dev/null @@ -1,89 +0,0 @@ -/// Represents an invalid UTF-8 sequence. -/// -/// This is an error returned by `decode`. It is guaranteed to -/// contain 1, 2 or 3 bytes. -pub(crate) struct Utf8Error { - bytes: [u8; 3], - len: u8, -} - -impl Utf8Error { - #[cold] - #[inline(never)] - fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error { - let len = err.error_len().unwrap_or_else(|| original_bytes.len()); - // OK because the biggest invalid UTF-8 - // sequence possible is 3. - debug_assert!(1 <= len && len <= 3); - let mut bytes = [0; 3]; - bytes[..len].copy_from_slice(&original_bytes[..len]); - Utf8Error { - bytes, - // OK because the biggest invalid UTF-8 - // sequence possible is 3. - len: u8::try_from(len).unwrap(), - } - } - - /// Returns the slice of invalid UTF-8 bytes. - /// - /// The slice returned is guaranteed to have length equivalent - /// to `Utf8Error::len`. 
- pub(crate) fn as_slice(&self) -> &[u8] { - &self.bytes[..self.len()] - } - - /// Returns the length of the invalid UTF-8 sequence found. - /// - /// This is guaranteed to be 1, 2 or 3. - pub(crate) fn len(&self) -> usize { - usize::from(self.len) - } -} - -impl core::fmt::Display for Utf8Error { - fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { - write!( - f, - "found invalid UTF-8 byte {errant_bytes:?} in format \ - string (format strings must be valid UTF-8)", - errant_bytes = crate::shared::util::escape::Bytes(self.as_slice()), - ) - } -} - -/// Decodes the next UTF-8 encoded codepoint from the given byte slice. -/// -/// If no valid encoding of a codepoint exists at the beginning of the -/// given byte slice, then a 1-3 byte slice is returned (which is guaranteed -/// to be a prefix of `bytes`). That byte slice corresponds either to a single -/// invalid byte, or to a prefix of a valid UTF-8 encoding of a Unicode scalar -/// value (but which ultimately did not lead to a valid encoding). -/// -/// This returns `None` if and only if `bytes` is empty. -/// -/// This never panics. -/// -/// *WARNING*: This is not designed for performance. If you're looking for -/// a fast UTF-8 decoder, this is not it. If you feel like you need one in -/// this crate, then please file an issue and discuss your use case. -pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> { - if bytes.is_empty() { - return None; - } - let string = match core::str::from_utf8(&bytes[..bytes.len().min(4)]) { - Ok(s) => s, - Err(ref err) if err.valid_up_to() > 0 => { - // OK because we just verified we have at least some - // valid UTF-8. - core::str::from_utf8(&bytes[..err.valid_up_to()]).unwrap() - } - // In this case, we want to return 1-3 bytes that make up a prefix of - // a potentially valid codepoint. - Err(err) => return Some(Err(Utf8Error::new(bytes, err))), - }; - // OK because we guaranteed above that `string` - // must be non-empty. 
And thus, `str::chars` must - // yield at least one Unicode scalar value. - Some(Ok(string.chars().next().unwrap())) -} diff --git a/src/util/escape.rs b/src/util/escape.rs index 9deb81a..dd0736d 100644 --- a/src/util/escape.rs +++ b/src/util/escape.rs @@ -4,8 +4,119 @@ Provides convenience routines for escaping raw bytes. This was copied from `regex-automata` with a few light edits. */ -// These were originally defined here, but they got moved to -// shared since they're needed there. We re-export them here -// because this is really where they should live, but they're -// in shared because `jiff-tzdb-static` needs it. -pub(crate) use crate::shared::util::escape::{Byte, Bytes, RepeatByte}; +use super::utf8; + +/// Provides a convenient `Debug` implementation for a `u8`. +/// +/// The `Debug` impl treats the byte as an ASCII, and emits a human +/// readable representation of it. If the byte isn't ASCII, then it's +/// emitted as a hex escape sequence. +#[derive(Clone, Copy)] +pub(crate) struct Byte(pub u8); + +impl core::fmt::Display for Byte { + #[inline(never)] + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + if self.0 == b' ' { + return write!(f, " "); + } + // 10 bytes is enough for any output from ascii::escape_default. + let mut bytes = [0u8; 10]; + let mut len = 0; + for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { + // capitalize \xab to \xAB + if i >= 2 && b'a' <= b && b <= b'f' { + b -= 32; + } + bytes[len] = b; + len += 1; + } + write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) + } +} + +impl core::fmt::Debug for Byte { + #[inline(never)] + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "\"")?; + core::fmt::Display::fmt(self, f)?; + write!(f, "\"")?; + Ok(()) + } +} + +/// Provides a convenient `Debug` implementation for `&[u8]`. +/// +/// This generally works best when the bytes are presumed to be mostly +/// UTF-8, but will work for anything. 
For any bytes that aren't UTF-8, +/// they are emitted as hex escape sequences. +#[derive(Clone, Copy)] +pub(crate) struct Bytes<'a>(pub &'a [u8]); + +impl<'a> core::fmt::Display for Bytes<'a> { + #[inline(never)] + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + // This is a sad re-implementation of a similar impl found in bstr. + let mut bytes = self.0; + while let Some(result) = utf8::decode(bytes) { + let ch = match result { + Ok(ch) => ch, + Err(err) => { + // The decode API guarantees `errant_bytes` is non-empty. + write!(f, r"\x{:02x}", err.as_slice()[0])?; + bytes = &bytes[1..]; + continue; + } + }; + bytes = &bytes[ch.len_utf8()..]; + match ch { + '\0' => write!(f, "\\0")?, + '\x01'..='\x7f' => { + write!(f, "{}", (ch as u8).escape_ascii())?; + } + _ => write!(f, "{}", ch.escape_debug())?, + } + } + Ok(()) + } +} + +impl<'a> core::fmt::Debug for Bytes<'a> { + #[inline(never)] + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "\"")?; + core::fmt::Display::fmt(self, f)?; + write!(f, "\"")?; + Ok(()) + } +} + +/// A helper for repeating a single byte utilizing `Byte`. +/// +/// This is limited to repeating a byte up to `u8::MAX` times in order +/// to reduce its size overhead. And in practice, Jiff just doesn't +/// need more than this (at time of writing, 2025-11-29). 
+pub(crate) struct RepeatByte { + pub(crate) byte: u8, + pub(crate) count: u8, +} + +impl core::fmt::Display for RepeatByte { + #[inline(never)] + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + for _ in 0..self.count { + write!(f, "{}", Byte(self.byte))?; + } + Ok(()) + } +} + +impl core::fmt::Debug for RepeatByte { + #[inline(never)] + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "\"")?; + core::fmt::Display::fmt(self, f)?; + write!(f, "\"")?; + Ok(()) + } +} diff --git a/src/util/utf8.rs b/src/util/utf8.rs index dc62b6b..bd5b462 100644 --- a/src/util/utf8.rs +++ b/src/util/utf8.rs @@ -1,5 +1,95 @@ use core::cmp::Ordering; +/// Represents an invalid UTF-8 sequence. +/// +/// This is an error returned by `decode`. It is guaranteed to +/// contain 1, 2 or 3 bytes. +pub(crate) struct Utf8Error { + bytes: [u8; 3], + len: u8, +} + +impl Utf8Error { + #[cold] + #[inline(never)] + fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error { + let len = err.error_len().unwrap_or_else(|| original_bytes.len()); + // OK because the biggest invalid UTF-8 + // sequence possible is 3. + debug_assert!(1 <= len && len <= 3); + let mut bytes = [0; 3]; + bytes[..len].copy_from_slice(&original_bytes[..len]); + Utf8Error { + bytes, + // OK because the biggest invalid UTF-8 + // sequence possible is 3. + len: u8::try_from(len).unwrap(), + } + } + + /// Returns the slice of invalid UTF-8 bytes. + /// + /// The slice returned is guaranteed to have length equivalent + /// to `Utf8Error::len`. + pub(crate) fn as_slice(&self) -> &[u8] { + &self.bytes[..self.len()] + } + + /// Returns the length of the invalid UTF-8 sequence found. + /// + /// This is guaranteed to be 1, 2 or 3. 
+ pub(crate) fn len(&self) -> usize { + usize::from(self.len) + } +} + +impl core::fmt::Display for Utf8Error { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "found invalid UTF-8 byte {errant_bytes:?} in format \ + string (format strings must be valid UTF-8)", + errant_bytes = crate::util::escape::Bytes(self.as_slice()), + ) + } +} + +/// Decodes the next UTF-8 encoded codepoint from the given byte slice. +/// +/// If no valid encoding of a codepoint exists at the beginning of the +/// given byte slice, then a 1-3 byte slice is returned (which is guaranteed +/// to be a prefix of `bytes`). That byte slice corresponds either to a single +/// invalid byte, or to a prefix of a valid UTF-8 encoding of a Unicode scalar +/// value (but which ultimately did not lead to a valid encoding). +/// +/// This returns `None` if and only if `bytes` is empty. +/// +/// This never panics. +/// +/// *WARNING*: This is not designed for performance. If you're looking for +/// a fast UTF-8 decoder, this is not it. If you feel like you need one in +/// this crate, then please file an issue and discuss your use case. +pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> { + if bytes.is_empty() { + return None; + } + let string = match core::str::from_utf8(&bytes[..bytes.len().min(4)]) { + Ok(s) => s, + Err(ref err) if err.valid_up_to() > 0 => { + // OK because we just verified we have at least some + // valid UTF-8. + core::str::from_utf8(&bytes[..err.valid_up_to()]).unwrap() + } + // In this case, we want to return 1-3 bytes that make up a prefix of + // a potentially valid codepoint. + Err(err) => return Some(Err(Utf8Error::new(bytes, err))), + }; + // OK because we guaranteed above that `string` + // must be non-empty. And thus, `str::chars` must + // yield at least one Unicode scalar value. + Some(Ok(string.chars().next().unwrap())) +} + /// Like std's `eq_ignore_ascii_case`, but returns a full `Ordering`. 
#[inline] pub(crate) fn cmp_ignore_ascii_case(s1: &str, s2: &str) -> Ordering {