shared: remove escaping and UTF-8 routines from shared module

After the error refactor, these routines are no longer used there. While
switching to structured errors, I took the opportunity to slim down error
messages so that they repeat parts of the input less often.
This commit is contained in:
Andrew Gallant 2025-12-22 14:15:40 -05:00
parent a50f6797ce
commit 101cd0dda6
No known key found for this signature in database
GPG key ID: B2E3A4923F8B0D44
10 changed files with 210 additions and 222 deletions

View file

@ -122,3 +122,4 @@ impl core::fmt::Debug for RepeatByte {
Ok(())
}
}

View file

@ -1,6 +1,4 @@
// auto-generated by: jiff-cli generate shared
pub(crate) mod array_str;
pub(crate) mod escape;
pub(crate) mod itime;
pub(crate) mod utf8;

View file

@ -89,3 +89,4 @@ pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
// yield at least one Unicode scalar value.
Some(Ok(string.chars().next().unwrap()))
}

View file

@ -1,4 +1,4 @@
use crate::{error, shared::util::escape};
use crate::{error, util::escape};
#[derive(Clone, Debug)]
pub(crate) enum Error {

View file

@ -12,8 +12,8 @@ use crate::{
util::{DecimalFormatter, FractionalFormatter},
Write, WriteExt,
},
shared::util::utf8,
tz::Offset,
util::utf8,
Error,
};

View file

@ -1,122 +0,0 @@
/*!
Provides convenience routines for escaping raw bytes.
This was copied from `regex-automata` with a few light edits.
*/
use super::utf8;
/// Wraps a single `u8` so that it can be formatted in a readable way.
///
/// Printable ASCII bytes are written as themselves. Every other byte is
/// written as an escape sequence, and hexadecimal escapes use upper case
/// digits (for example, `\xAB`). The `Debug` impl writes the same text as
/// the `Display` impl, but surrounded by double quotes.
#[derive(Clone, Copy)]
pub(crate) struct Byte(pub u8);

impl core::fmt::Display for Byte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // A space is always written verbatim.
        if self.0 == b' ' {
            return f.write_str(" ");
        }
        // `ascii::escape_default` never yields more than 10 bytes, so
        // this scratch buffer cannot overflow.
        let mut buf = [0u8; 10];
        let mut filled = 0;
        for byte in core::ascii::escape_default(self.0) {
            // Turn \xab into \xAB. Only bytes after the first two can
            // be hex digits; the first two are the `\x` prefix.
            let lower_hex = filled >= 2 && (b'a'..=b'f').contains(&byte);
            buf[filled] =
                if lower_hex { byte.to_ascii_uppercase() } else { byte };
            filled += 1;
        }
        f.write_str(core::str::from_utf8(&buf[..filled]).unwrap())
    }
}

impl core::fmt::Debug for Byte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Same as `Display`, but wrapped in double quotes.
        f.write_str("\"")?;
        core::fmt::Display::fmt(self, f)?;
        f.write_str("\"")
    }
}
/// Wraps a `&[u8]` so that it can be formatted in a readable way.
///
/// The bytes are decoded as UTF-8 one codepoint at a time. Any byte that
/// does not decode is written as a `\xNN` hex escape, so arbitrary input
/// is accepted. The `Debug` impl writes the same text as the `Display`
/// impl, but surrounded by double quotes.
#[derive(Clone, Copy)]
pub(crate) struct Bytes<'a>(pub &'a [u8]);

impl<'a> core::fmt::Display for Bytes<'a> {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // bstr has a nicer version of this; this is a simplified take.
        let mut rest = self.0;
        while let Some(decoded) = utf8::decode(rest) {
            match decoded {
                Err(err) => {
                    // A decode error always carries at least one byte,
                    // so indexing the first one is fine. Only that byte
                    // is consumed before decoding resumes.
                    write!(f, r"\x{:02x}", err.as_slice()[0])?;
                    rest = &rest[1..];
                }
                Ok(ch) => {
                    rest = &rest[ch.len_utf8()..];
                    if ch == '\0' {
                        f.write_str("\\0")?;
                    } else if ch.is_ascii() {
                        write!(f, "{}", (ch as u8).escape_ascii())?;
                    } else {
                        write!(f, "{}", ch.escape_debug())?;
                    }
                }
            }
        }
        Ok(())
    }
}

impl<'a> core::fmt::Debug for Bytes<'a> {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Same as `Display`, but wrapped in double quotes.
        f.write_str("\"")?;
        core::fmt::Display::fmt(self, f)?;
        f.write_str("\"")
    }
}
/// Formats one byte, via `Byte`, a fixed number of times in a row.
///
/// The repetition count is a `u8` to keep this type small. Jiff has not
/// needed more than `u8::MAX` repetitions in practice (at time of
/// writing, 2025-11-29).
pub(crate) struct RepeatByte {
    pub(crate) byte: u8,
    pub(crate) count: u8,
}

impl core::fmt::Display for RepeatByte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let rendered = Byte(self.byte);
        // Stops at the first write error, if any.
        (0..self.count).try_for_each(|_| write!(f, "{}", rendered))
    }
}

impl core::fmt::Debug for RepeatByte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Same as `Display`, but wrapped in double quotes.
        f.write_str("\"")?;
        core::fmt::Display::fmt(self, f)?;
        f.write_str("\"")
    }
}

View file

@ -1,4 +1,2 @@
pub(crate) mod array_str;
pub(crate) mod escape;
pub(crate) mod itime;
pub(crate) mod utf8;

View file

@ -1,89 +0,0 @@
/// An invalid UTF-8 sequence, as reported by `decode`.
///
/// The errant bytes are stored inline. There are always 1, 2 or 3 of
/// them.
pub(crate) struct Utf8Error {
    bytes: [u8; 3],
    len: u8,
}

impl Utf8Error {
    /// Builds an error from the bytes that were decoded and the error
    /// that std reported for them.
    ///
    /// When std does not report an error length, the length of
    /// `original_bytes` is used instead.
    #[cold]
    #[inline(never)]
    fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error {
        let len = match err.error_len() {
            Some(len) => len,
            None => original_bytes.len(),
        };
        // An invalid UTF-8 sequence is never longer than 3 bytes.
        debug_assert!((1..=3).contains(&len));
        let mut bytes = [0; 3];
        bytes[..len].copy_from_slice(&original_bytes[..len]);
        // The conversion is OK for the same reason: `len` is at most 3.
        let len = u8::try_from(len).unwrap();
        Utf8Error { bytes, len }
    }

    /// Returns the invalid UTF-8 bytes as a slice.
    ///
    /// Its length is always equal to `Utf8Error::len`.
    pub(crate) fn as_slice(&self) -> &[u8] {
        let len = self.len();
        &self.bytes[..len]
    }

    /// Returns how many bytes make up the invalid UTF-8 sequence.
    ///
    /// The result is always 1, 2 or 3.
    pub(crate) fn len(&self) -> usize {
        self.len.into()
    }
}

impl core::fmt::Display for Utf8Error {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let errant = crate::shared::util::escape::Bytes(self.as_slice());
        write!(
            f,
            "found invalid UTF-8 byte {errant_bytes:?} in format \
             string (format strings must be valid UTF-8)",
            errant_bytes = errant,
        )
    }
}
/// Decodes one UTF-8 encoded codepoint from the front of `bytes`.
///
/// When `bytes` does not begin with a valid encoding of a codepoint, an
/// error holding a 1-3 byte prefix of `bytes` is returned. That prefix is
/// either a single invalid byte, or the start of an encoding of a Unicode
/// scalar value that did not turn out to be valid.
///
/// `None` is returned if and only if `bytes` is empty.
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for
/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
/// this crate, then please file an issue and discuss your use case.
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
    if bytes.is_empty() {
        return None;
    }
    // Only the first 4 bytes (at most) are examined.
    let window = &bytes[..bytes.len().min(4)];
    let valid = match core::str::from_utf8(window) {
        Ok(valid) => valid,
        Err(err) => {
            let valid_len = err.valid_up_to();
            if valid_len == 0 {
                // Nothing at the front decoded, so report the 1-3 bytes
                // that make up a prefix of a potentially valid codepoint.
                return Some(Err(Utf8Error::new(bytes, err)));
            }
            // OK because std just told us this prefix is valid UTF-8.
            core::str::from_utf8(&bytes[..valid_len]).unwrap()
        }
    };
    // OK because `valid` is non-empty on every path that reaches here,
    // so `str::chars` yields at least one Unicode scalar value.
    Some(Ok(valid.chars().next().unwrap()))
}

View file

@ -4,8 +4,119 @@ Provides convenience routines for escaping raw bytes.
This was copied from `regex-automata` with a few light edits.
*/
// These were originally defined here, but they got moved to
// shared since they're needed there. We re-export them here
// because this is really where they should live, but they're
// in shared because `jiff-tzdb-static` needs it.
pub(crate) use crate::shared::util::escape::{Byte, Bytes, RepeatByte};
use super::utf8;
/// Wraps a single `u8` so that it can be formatted in a readable way.
///
/// Printable ASCII bytes are written as themselves. Every other byte is
/// written as an escape sequence, and hexadecimal escapes use upper case
/// digits (for example, `\xAB`). The `Debug` impl writes the same text as
/// the `Display` impl, but surrounded by double quotes.
#[derive(Clone, Copy)]
pub(crate) struct Byte(pub u8);

impl core::fmt::Display for Byte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // A space is always written verbatim.
        if self.0 == b' ' {
            return f.write_str(" ");
        }
        // `ascii::escape_default` never yields more than 10 bytes, so
        // this scratch buffer cannot overflow.
        let mut buf = [0u8; 10];
        let mut filled = 0;
        for byte in core::ascii::escape_default(self.0) {
            // Turn \xab into \xAB. Only bytes after the first two can
            // be hex digits; the first two are the `\x` prefix.
            let lower_hex = filled >= 2 && (b'a'..=b'f').contains(&byte);
            buf[filled] =
                if lower_hex { byte.to_ascii_uppercase() } else { byte };
            filled += 1;
        }
        f.write_str(core::str::from_utf8(&buf[..filled]).unwrap())
    }
}

impl core::fmt::Debug for Byte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Same as `Display`, but wrapped in double quotes.
        f.write_str("\"")?;
        core::fmt::Display::fmt(self, f)?;
        f.write_str("\"")
    }
}
/// Wraps a `&[u8]` so that it can be formatted in a readable way.
///
/// The bytes are decoded as UTF-8 one codepoint at a time. Any byte that
/// does not decode is written as a `\xNN` hex escape, so arbitrary input
/// is accepted. The `Debug` impl writes the same text as the `Display`
/// impl, but surrounded by double quotes.
#[derive(Clone, Copy)]
pub(crate) struct Bytes<'a>(pub &'a [u8]);

impl<'a> core::fmt::Display for Bytes<'a> {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // bstr has a nicer version of this; this is a simplified take.
        let mut rest = self.0;
        while let Some(decoded) = utf8::decode(rest) {
            match decoded {
                Err(err) => {
                    // A decode error always carries at least one byte,
                    // so indexing the first one is fine. Only that byte
                    // is consumed before decoding resumes.
                    write!(f, r"\x{:02x}", err.as_slice()[0])?;
                    rest = &rest[1..];
                }
                Ok(ch) => {
                    rest = &rest[ch.len_utf8()..];
                    if ch == '\0' {
                        f.write_str("\\0")?;
                    } else if ch.is_ascii() {
                        write!(f, "{}", (ch as u8).escape_ascii())?;
                    } else {
                        write!(f, "{}", ch.escape_debug())?;
                    }
                }
            }
        }
        Ok(())
    }
}

impl<'a> core::fmt::Debug for Bytes<'a> {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Same as `Display`, but wrapped in double quotes.
        f.write_str("\"")?;
        core::fmt::Display::fmt(self, f)?;
        f.write_str("\"")
    }
}
/// Formats one byte, via `Byte`, a fixed number of times in a row.
///
/// The repetition count is a `u8` to keep this type small. Jiff has not
/// needed more than `u8::MAX` repetitions in practice (at time of
/// writing, 2025-11-29).
pub(crate) struct RepeatByte {
    pub(crate) byte: u8,
    pub(crate) count: u8,
}

impl core::fmt::Display for RepeatByte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let rendered = Byte(self.byte);
        // Stops at the first write error, if any.
        (0..self.count).try_for_each(|_| write!(f, "{}", rendered))
    }
}

impl core::fmt::Debug for RepeatByte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Same as `Display`, but wrapped in double quotes.
        f.write_str("\"")?;
        core::fmt::Display::fmt(self, f)?;
        f.write_str("\"")
    }
}

View file

@ -1,5 +1,95 @@
use core::cmp::Ordering;
/// An invalid UTF-8 sequence, as reported by `decode`.
///
/// The errant bytes are stored inline. There are always 1, 2 or 3 of
/// them.
pub(crate) struct Utf8Error {
    bytes: [u8; 3],
    len: u8,
}

impl Utf8Error {
    /// Builds an error from the bytes that were decoded and the error
    /// that std reported for them.
    ///
    /// When std does not report an error length, the length of
    /// `original_bytes` is used instead.
    #[cold]
    #[inline(never)]
    fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error {
        let len = match err.error_len() {
            Some(len) => len,
            None => original_bytes.len(),
        };
        // An invalid UTF-8 sequence is never longer than 3 bytes.
        debug_assert!((1..=3).contains(&len));
        let mut bytes = [0; 3];
        bytes[..len].copy_from_slice(&original_bytes[..len]);
        // The conversion is OK for the same reason: `len` is at most 3.
        let len = u8::try_from(len).unwrap();
        Utf8Error { bytes, len }
    }

    /// Returns the invalid UTF-8 bytes as a slice.
    ///
    /// Its length is always equal to `Utf8Error::len`.
    pub(crate) fn as_slice(&self) -> &[u8] {
        let len = self.len();
        &self.bytes[..len]
    }

    /// Returns how many bytes make up the invalid UTF-8 sequence.
    ///
    /// The result is always 1, 2 or 3.
    pub(crate) fn len(&self) -> usize {
        self.len.into()
    }
}

impl core::fmt::Display for Utf8Error {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let errant = crate::util::escape::Bytes(self.as_slice());
        write!(
            f,
            "found invalid UTF-8 byte {errant_bytes:?} in format \
             string (format strings must be valid UTF-8)",
            errant_bytes = errant,
        )
    }
}
/// Decodes one UTF-8 encoded codepoint from the front of `bytes`.
///
/// When `bytes` does not begin with a valid encoding of a codepoint, an
/// error holding a 1-3 byte prefix of `bytes` is returned. That prefix is
/// either a single invalid byte, or the start of an encoding of a Unicode
/// scalar value that did not turn out to be valid.
///
/// `None` is returned if and only if `bytes` is empty.
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for
/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
/// this crate, then please file an issue and discuss your use case.
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
    if bytes.is_empty() {
        return None;
    }
    // Only the first 4 bytes (at most) are examined.
    let window = &bytes[..bytes.len().min(4)];
    let valid = match core::str::from_utf8(window) {
        Ok(valid) => valid,
        Err(err) => {
            let valid_len = err.valid_up_to();
            if valid_len == 0 {
                // Nothing at the front decoded, so report the 1-3 bytes
                // that make up a prefix of a potentially valid codepoint.
                return Some(Err(Utf8Error::new(bytes, err)));
            }
            // OK because std just told us this prefix is valid UTF-8.
            core::str::from_utf8(&bytes[..valid_len]).unwrap()
        }
    };
    // OK because `valid` is non-empty on every path that reaches here,
    // so `str::chars` yields at least one Unicode scalar value.
    Some(Ok(valid.chars().next().unwrap()))
}
/// Like std's `eq_ignore_ascii_case`, but returns a full `Ordering`.
#[inline]
pub(crate) fn cmp_ignore_ascii_case(s1: &str, s2: &str) -> Ordering {