shared: remove escaping and UTF-8 routines from shared module

With the error refactor, these routines are no longer used by the shared code, so they move back into the main crate's `util` module. Namely, while switching to structured errors, I took the opportunity to slim down errors so that we are not repeating parts of the input as often.
2025-12-23 08:47:45 +00:00 · 2025-12-22 14:15:40 -05:00 · 2025-12-22 14:15:40 -05:00 · 101cd0dda6
commit 101cd0dda6
parent a50f6797ce
10 changed files with 210 additions and 222 deletions
--- a/crates/jiff-static/src/shared/util/escape.rs
+++ b/crates/jiff-static/src/shared/util/escape.rs
@@ -122,3 +122,4 @@ impl core::fmt::Debug for RepeatByte {
        Ok(())
    }
 }
+
--- a/crates/jiff-static/src/shared/util/mod.rs
+++ b/crates/jiff-static/src/shared/util/mod.rs
@@ -1,6 +1,4 @@
 // auto-generated by: jiff-cli generate shared

 pub(crate) mod array_str;
-pub(crate) mod escape;
 pub(crate) mod itime;
-pub(crate) mod utf8;
--- a/crates/jiff-static/src/shared/util/utf8.rs
+++ b/crates/jiff-static/src/shared/util/utf8.rs
@@ -89,3 +89,4 @@ pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
    // yield at least one Unicode scalar value.
    Some(Ok(string.chars().next().unwrap()))
 }
+
--- a/src/error/fmt/offset.rs
+++ b/src/error/fmt/offset.rs
@@ -1,4 +1,4 @@
-use crate::{error, shared::util::escape};
+use crate::{error, util::escape};

 #[derive(Clone, Debug)]
 pub(crate) enum Error {
--- a/src/fmt/strtime/format.rs
+++ b/src/fmt/strtime/format.rs
@@ -12,8 +12,8 @@ use crate::{
        util::{DecimalFormatter, FractionalFormatter},
        Write, WriteExt,
    },
-    shared::util::utf8,
    tz::Offset,
+    util::utf8,
    Error,
 };

--- a/src/shared/util/escape.rs
+++ b/src/shared/util/escape.rs
@@ -1,122 +0,0 @@
-/*!
-Provides convenience routines for escaping raw bytes.
-
-This was copied from `regex-automata` with a few light edits.
-*/
-
-use super::utf8;
-
-/// Provides a convenient `Debug` implementation for a `u8`.
-///
-/// The `Debug` impl treats the byte as an ASCII, and emits a human
-/// readable representation of it. If the byte isn't ASCII, then it's
-/// emitted as a hex escape sequence.
-#[derive(Clone, Copy)]
-pub(crate) struct Byte(pub u8);
-
-impl core::fmt::Display for Byte {
-    #[inline(never)]
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        if self.0 == b' ' {
-            return write!(f, " ");
-        }
-        // 10 bytes is enough for any output from ascii::escape_default.
-        let mut bytes = [0u8; 10];
-        let mut len = 0;
-        for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
-            // capitalize \xab to \xAB
-            if i >= 2 && b'a' <= b && b <= b'f' {
-                b -= 32;
-            }
-            bytes[len] = b;
-            len += 1;
-        }
-        write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
-    }
-}
-
-impl core::fmt::Debug for Byte {
-    #[inline(never)]
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        write!(f, "\"")?;
-        core::fmt::Display::fmt(self, f)?;
-        write!(f, "\"")?;
-        Ok(())
-    }
-}
-
-/// Provides a convenient `Debug` implementation for `&[u8]`.
-///
-/// This generally works best when the bytes are presumed to be mostly
-/// UTF-8, but will work for anything. For any bytes that aren't UTF-8,
-/// they are emitted as hex escape sequences.
-#[derive(Clone, Copy)]
-pub(crate) struct Bytes<'a>(pub &'a [u8]);
-
-impl<'a> core::fmt::Display for Bytes<'a> {
-    #[inline(never)]
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        // This is a sad re-implementation of a similar impl found in bstr.
-        let mut bytes = self.0;
-        while let Some(result) = utf8::decode(bytes) {
-            let ch = match result {
-                Ok(ch) => ch,
-                Err(err) => {
-                    // The decode API guarantees `errant_bytes` is non-empty.
-                    write!(f, r"\x{:02x}", err.as_slice()[0])?;
-                    bytes = &bytes[1..];
-                    continue;
-                }
-            };
-            bytes = &bytes[ch.len_utf8()..];
-            match ch {
-                '\0' => write!(f, "\\0")?,
-                '\x01'..='\x7f' => {
-                    write!(f, "{}", (ch as u8).escape_ascii())?;
-                }
-                _ => write!(f, "{}", ch.escape_debug())?,
-            }
-        }
-        Ok(())
-    }
-}
-
-impl<'a> core::fmt::Debug for Bytes<'a> {
-    #[inline(never)]
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        write!(f, "\"")?;
-        core::fmt::Display::fmt(self, f)?;
-        write!(f, "\"")?;
-        Ok(())
-    }
-}
-
-/// A helper for repeating a single byte utilizing `Byte`.
-///
-/// This is limited to repeating a byte up to `u8::MAX` times in order
-/// to reduce its size overhead. And in practice, Jiff just doesn't
-/// need more than this (at time of writing, 2025-11-29).
-pub(crate) struct RepeatByte {
-    pub(crate) byte: u8,
-    pub(crate) count: u8,
-}
-
-impl core::fmt::Display for RepeatByte {
-    #[inline(never)]
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        for _ in 0..self.count {
-            write!(f, "{}", Byte(self.byte))?;
-        }
-        Ok(())
-    }
-}
-
-impl core::fmt::Debug for RepeatByte {
-    #[inline(never)]
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        write!(f, "\"")?;
-        core::fmt::Display::fmt(self, f)?;
-        write!(f, "\"")?;
-        Ok(())
-    }
-}
--- a/src/shared/util/mod.rs
+++ b/src/shared/util/mod.rs
@@ -1,4 +1,2 @@
 pub(crate) mod array_str;
-pub(crate) mod escape;
 pub(crate) mod itime;
-pub(crate) mod utf8;
--- a/src/shared/util/utf8.rs
+++ b/src/shared/util/utf8.rs
@@ -1,89 +0,0 @@
-/// Represents an invalid UTF-8 sequence.
-///
-/// This is an error returned by `decode`. It is guaranteed to
-/// contain 1, 2 or 3 bytes.
-pub(crate) struct Utf8Error {
-    bytes: [u8; 3],
-    len: u8,
-}
-
-impl Utf8Error {
-    #[cold]
-    #[inline(never)]
-    fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error {
-        let len = err.error_len().unwrap_or_else(|| original_bytes.len());
-        // OK because the biggest invalid UTF-8
-        // sequence possible is 3.
-        debug_assert!(1 <= len && len <= 3);
-        let mut bytes = [0; 3];
-        bytes[..len].copy_from_slice(&original_bytes[..len]);
-        Utf8Error {
-            bytes,
-            // OK because the biggest invalid UTF-8
-            // sequence possible is 3.
-            len: u8::try_from(len).unwrap(),
-        }
-    }
-
-    /// Returns the slice of invalid UTF-8 bytes.
-    ///
-    /// The slice returned is guaranteed to have length equivalent
-    /// to `Utf8Error::len`.
-    pub(crate) fn as_slice(&self) -> &[u8] {
-        &self.bytes[..self.len()]
-    }
-
-    /// Returns the length of the invalid UTF-8 sequence found.
-    ///
-    /// This is guaranteed to be 1, 2 or 3.
-    pub(crate) fn len(&self) -> usize {
-        usize::from(self.len)
-    }
-}
-
-impl core::fmt::Display for Utf8Error {
-    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
-        write!(
-            f,
-            "found invalid UTF-8 byte {errant_bytes:?} in format \
-             string (format strings must be valid UTF-8)",
-            errant_bytes = crate::shared::util::escape::Bytes(self.as_slice()),
-        )
-    }
-}
-
-/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
-///
-/// If no valid encoding of a codepoint exists at the beginning of the
-/// given byte slice, then a 1-3 byte slice is returned (which is guaranteed
-/// to be a prefix of `bytes`). That byte slice corresponds either to a single
-/// invalid byte, or to a prefix of a valid UTF-8 encoding of a Unicode scalar
-/// value (but which ultimately did not lead to a valid encoding).
-///
-/// This returns `None` if and only if `bytes` is empty.
-///
-/// This never panics.
-///
-/// *WARNING*: This is not designed for performance. If you're looking for
-/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
-/// this crate, then please file an issue and discuss your use case.
-pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
-    if bytes.is_empty() {
-        return None;
-    }
-    let string = match core::str::from_utf8(&bytes[..bytes.len().min(4)]) {
-        Ok(s) => s,
-        Err(ref err) if err.valid_up_to() > 0 => {
-            // OK because we just verified we have at least some
-            // valid UTF-8.
-            core::str::from_utf8(&bytes[..err.valid_up_to()]).unwrap()
-        }
-        // In this case, we want to return 1-3 bytes that make up a prefix of
-        // a potentially valid codepoint.
-        Err(err) => return Some(Err(Utf8Error::new(bytes, err))),
-    };
-    // OK because we guaranteed above that `string`
-    // must be non-empty. And thus, `str::chars` must
-    // yield at least one Unicode scalar value.
-    Some(Ok(string.chars().next().unwrap()))
-}
--- a/src/util/escape.rs
+++ b/src/util/escape.rs
@@ -4,8 +4,119 @@ Provides convenience routines for escaping raw bytes.
 This was copied from `regex-automata` with a few light edits.
 */

-// These were originally defined here, but they got moved to
-// shared since they're needed there. We re-export them here
-// because this is really where they should live, but they're
-// in shared because `jiff-tzdb-static` needs it.
-pub(crate) use crate::shared::util::escape::{Byte, Bytes, RepeatByte};
+use super::utf8;
+
+/// Provides a convenient `Debug` implementation for a `u8`.
+///
+/// The `Debug` impl treats the byte as an ASCII, and emits a human
+/// readable representation of it. If the byte isn't ASCII, then it's
+/// emitted as a hex escape sequence.
+#[derive(Clone, Copy)]
+pub(crate) struct Byte(pub u8);
+
+impl core::fmt::Display for Byte {
+    #[inline(never)]
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        if self.0 == b' ' {
+            return write!(f, " ");
+        }
+        // 10 bytes is enough for any output from ascii::escape_default.
+        let mut bytes = [0u8; 10];
+        let mut len = 0;
+        for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
+            // capitalize \xab to \xAB
+            if i >= 2 && b'a' <= b && b <= b'f' {
+                b -= 32;
+            }
+            bytes[len] = b;
+            len += 1;
+        }
+        write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
+    }
+}
+
+impl core::fmt::Debug for Byte {
+    #[inline(never)]
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "\"")?;
+        core::fmt::Display::fmt(self, f)?;
+        write!(f, "\"")?;
+        Ok(())
+    }
+}
+
+/// Provides a convenient `Debug` implementation for `&[u8]`.
+///
+/// This generally works best when the bytes are presumed to be mostly
+/// UTF-8, but will work for anything. For any bytes that aren't UTF-8,
+/// they are emitted as hex escape sequences.
+#[derive(Clone, Copy)]
+pub(crate) struct Bytes<'a>(pub &'a [u8]);
+
+impl<'a> core::fmt::Display for Bytes<'a> {
+    #[inline(never)]
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        // This is a sad re-implementation of a similar impl found in bstr.
+        let mut bytes = self.0;
+        while let Some(result) = utf8::decode(bytes) {
+            let ch = match result {
+                Ok(ch) => ch,
+                Err(err) => {
+                    // The decode API guarantees `errant_bytes` is non-empty.
+                    write!(f, r"\x{:02x}", err.as_slice()[0])?;
+                    bytes = &bytes[1..];
+                    continue;
+                }
+            };
+            bytes = &bytes[ch.len_utf8()..];
+            match ch {
+                '\0' => write!(f, "\\0")?,
+                '\x01'..='\x7f' => {
+                    write!(f, "{}", (ch as u8).escape_ascii())?;
+                }
+                _ => write!(f, "{}", ch.escape_debug())?,
+            }
+        }
+        Ok(())
+    }
+}
+
+impl<'a> core::fmt::Debug for Bytes<'a> {
+    #[inline(never)]
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "\"")?;
+        core::fmt::Display::fmt(self, f)?;
+        write!(f, "\"")?;
+        Ok(())
+    }
+}
+
+/// A helper for repeating a single byte utilizing `Byte`.
+///
+/// This is limited to repeating a byte up to `u8::MAX` times in order
+/// to reduce its size overhead. And in practice, Jiff just doesn't
+/// need more than this (at time of writing, 2025-11-29).
+pub(crate) struct RepeatByte {
+    pub(crate) byte: u8,
+    pub(crate) count: u8,
+}
+
+impl core::fmt::Display for RepeatByte {
+    #[inline(never)]
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        for _ in 0..self.count {
+            write!(f, "{}", Byte(self.byte))?;
+        }
+        Ok(())
+    }
+}
+
+impl core::fmt::Debug for RepeatByte {
+    #[inline(never)]
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(f, "\"")?;
+        core::fmt::Display::fmt(self, f)?;
+        write!(f, "\"")?;
+        Ok(())
+    }
+}
--- a/src/util/utf8.rs
+++ b/src/util/utf8.rs
@@ -1,5 +1,95 @@
 use core::cmp::Ordering;

+/// Represents an invalid UTF-8 sequence.
+///
+/// This is an error returned by `decode`. It is guaranteed to
+/// contain 1, 2 or 3 bytes.
+pub(crate) struct Utf8Error {
+    bytes: [u8; 3],
+    len: u8,
+}
+
+impl Utf8Error {
+    #[cold]
+    #[inline(never)]
+    fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error {
+        let len = err.error_len().unwrap_or_else(|| original_bytes.len());
+        // OK because the biggest invalid UTF-8
+        // sequence possible is 3.
+        debug_assert!(1 <= len && len <= 3);
+        let mut bytes = [0; 3];
+        bytes[..len].copy_from_slice(&original_bytes[..len]);
+        Utf8Error {
+            bytes,
+            // OK because the biggest invalid UTF-8
+            // sequence possible is 3.
+            len: u8::try_from(len).unwrap(),
+        }
+    }
+
+    /// Returns the slice of invalid UTF-8 bytes.
+    ///
+    /// The slice returned is guaranteed to have length equivalent
+    /// to `Utf8Error::len`.
+    pub(crate) fn as_slice(&self) -> &[u8] {
+        &self.bytes[..self.len()]
+    }
+
+    /// Returns the length of the invalid UTF-8 sequence found.
+    ///
+    /// This is guaranteed to be 1, 2 or 3.
+    pub(crate) fn len(&self) -> usize {
+        usize::from(self.len)
+    }
+}
+
+impl core::fmt::Display for Utf8Error {
+    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+        write!(
+            f,
+            "found invalid UTF-8 byte {errant_bytes:?} in format \
+             string (format strings must be valid UTF-8)",
+            errant_bytes = crate::util::escape::Bytes(self.as_slice()),
+        )
+    }
+}
+
+/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
+///
+/// If no valid encoding of a codepoint exists at the beginning of the
+/// given byte slice, then a 1-3 byte slice is returned (which is guaranteed
+/// to be a prefix of `bytes`). That byte slice corresponds either to a single
+/// invalid byte, or to a prefix of a valid UTF-8 encoding of a Unicode scalar
+/// value (but which ultimately did not lead to a valid encoding).
+///
+/// This returns `None` if and only if `bytes` is empty.
+///
+/// This never panics.
+///
+/// *WARNING*: This is not designed for performance. If you're looking for
+/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
+/// this crate, then please file an issue and discuss your use case.
+pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
+    if bytes.is_empty() {
+        return None;
+    }
+    let string = match core::str::from_utf8(&bytes[..bytes.len().min(4)]) {
+        Ok(s) => s,
+        Err(ref err) if err.valid_up_to() > 0 => {
+            // OK because we just verified we have at least some
+            // valid UTF-8.
+            core::str::from_utf8(&bytes[..err.valid_up_to()]).unwrap()
+        }
+        // In this case, we want to return 1-3 bytes that make up a prefix of
+        // a potentially valid codepoint.
+        Err(err) => return Some(Err(Utf8Error::new(bytes, err))),
+    };
+    // OK because we guaranteed above that `string`
+    // must be non-empty. And thus, `str::chars` must
+    // yield at least one Unicode scalar value.
+    Some(Ok(string.chars().next().unwrap()))
+}
+
 /// Like std's `eq_ignore_ascii_case`, but returns a full `Ordering`.
 #[inline]
 pub(crate) fn cmp_ignore_ascii_case(s1: &str, s2: &str) -> Ordering {