mirror of
https://github.com/BurntSushi/jiff.git
synced 2025-12-23 08:47:45 +00:00
shared: remove escaping and UTF-8 routines from shared module
With the error refactor, these are no longer used. Namely, while switching to structured errors, I took that opportunity to slim down errors so that we are not repeating parts of the input as often.
This commit is contained in:
parent
a50f6797ce
commit
101cd0dda6
10 changed files with 210 additions and 222 deletions
|
|
@ -122,3 +122,4 @@ impl core::fmt::Debug for RepeatByte {
|
|||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,4 @@
|
|||
// auto-generated by: jiff-cli generate shared
|
||||
|
||||
pub(crate) mod array_str;
|
||||
pub(crate) mod escape;
|
||||
pub(crate) mod itime;
|
||||
pub(crate) mod utf8;
|
||||
|
|
|
|||
|
|
@ -89,3 +89,4 @@ pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
|
|||
// yield at least one Unicode scalar value.
|
||||
Some(Ok(string.chars().next().unwrap()))
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
use crate::{error, shared::util::escape};
|
||||
use crate::{error, util::escape};
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub(crate) enum Error {
|
||||
|
|
|
|||
|
|
@ -12,8 +12,8 @@ use crate::{
|
|||
util::{DecimalFormatter, FractionalFormatter},
|
||||
Write, WriteExt,
|
||||
},
|
||||
shared::util::utf8,
|
||||
tz::Offset,
|
||||
util::utf8,
|
||||
Error,
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@ -1,122 +0,0 @@
|
|||
/*!
|
||||
Provides convenience routines for escaping raw bytes.
|
||||
|
||||
This was copied from `regex-automata` with a few light edits.
|
||||
*/
|
||||
|
||||
use super::utf8;
|
||||
|
||||
/// A single byte whose `Display` and `Debug` output is a printable, escaped
/// representation.
///
/// A space is written as-is. Every other byte is rendered with
/// `core::ascii::escape_default`, except that the hex digits of a `\xNN`
/// escape are upper-cased (for example, `0xAB` is written as `\xAB`). The
/// `Debug` output is the `Display` output wrapped in double quotes.
#[derive(Clone, Copy)]
pub(crate) struct Byte(pub u8);

impl core::fmt::Display for Byte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        if self.0 == b' ' {
            return f.write_str(" ");
        }
        for (i, b) in core::ascii::escape_default(self.0).enumerate() {
            // Only a `\xNN` escape is longer than two bytes, and its bytes
            // at index 2 and 3 are hex digits. Upper-casing from index 2
            // onward therefore turns `\xab` into `\xAB` and nothing else.
            let b = if i >= 2 { b.to_ascii_uppercase() } else { b };
            // `escape_default` only yields ASCII, so each byte maps to
            // exactly one `char` and can be written without a scratch
            // buffer (and without a fallible UTF-8 check).
            core::fmt::Write::write_char(&mut *f, char::from(b))?;
        }
        Ok(())
    }
}

impl core::fmt::Debug for Byte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.write_str("\"")?;
        core::fmt::Display::fmt(self, f)?;
        f.write_str("\"")
    }
}
|
||||
|
||||
/// Provides a convenient `Debug` implementation for `&[u8]`.
|
||||
///
|
||||
/// This generally works best when the bytes are presumed to be mostly
|
||||
/// UTF-8, but will work for anything. For any bytes that aren't UTF-8,
|
||||
/// they are emitted as hex escape sequences.
|
||||
#[derive(Clone, Copy)]
|
||||
pub(crate) struct Bytes<'a>(pub &'a [u8]);
|
||||
|
||||
impl<'a> core::fmt::Display for Bytes<'a> {
|
||||
#[inline(never)]
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
// This is a sad re-implementation of a similar impl found in bstr.
|
||||
let mut bytes = self.0;
|
||||
while let Some(result) = utf8::decode(bytes) {
|
||||
let ch = match result {
|
||||
Ok(ch) => ch,
|
||||
Err(err) => {
|
||||
// The decode API guarantees `errant_bytes` is non-empty.
|
||||
write!(f, r"\x{:02x}", err.as_slice()[0])?;
|
||||
bytes = &bytes[1..];
|
||||
continue;
|
||||
}
|
||||
};
|
||||
bytes = &bytes[ch.len_utf8()..];
|
||||
match ch {
|
||||
'\0' => write!(f, "\\0")?,
|
||||
'\x01'..='\x7f' => {
|
||||
write!(f, "{}", (ch as u8).escape_ascii())?;
|
||||
}
|
||||
_ => write!(f, "{}", ch.escape_debug())?,
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> core::fmt::Debug for Bytes<'a> {
|
||||
#[inline(never)]
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(f, "\"")?;
|
||||
core::fmt::Display::fmt(self, f)?;
|
||||
write!(f, "\"")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A helper for repeating a single byte utilizing `Byte`.
|
||||
///
|
||||
/// This is limited to repeating a byte up to `u8::MAX` times in order
|
||||
/// to reduce its size overhead. And in practice, Jiff just doesn't
|
||||
/// need more than this (at time of writing, 2025-11-29).
|
||||
pub(crate) struct RepeatByte {
|
||||
pub(crate) byte: u8,
|
||||
pub(crate) count: u8,
|
||||
}
|
||||
|
||||
impl core::fmt::Display for RepeatByte {
|
||||
#[inline(never)]
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
for _ in 0..self.count {
|
||||
write!(f, "{}", Byte(self.byte))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for RepeatByte {
|
||||
#[inline(never)]
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(f, "\"")?;
|
||||
core::fmt::Display::fmt(self, f)?;
|
||||
write!(f, "\"")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
@ -1,4 +1,2 @@
|
|||
pub(crate) mod array_str;
|
||||
pub(crate) mod escape;
|
||||
pub(crate) mod itime;
|
||||
pub(crate) mod utf8;
|
||||
|
|
|
|||
|
|
@ -1,89 +0,0 @@
|
|||
/// An invalid UTF-8 sequence, as reported by `decode`.
///
/// It holds a copy of the offending bytes, which is guaranteed to be 1, 2
/// or 3 bytes long.
pub(crate) struct Utf8Error {
    /// Storage for the invalid bytes. Only the first `len` are meaningful.
    bytes: [u8; 3],
    /// The number of meaningful bytes in `bytes`.
    len: u8,
}

impl Utf8Error {
    /// Builds the error from the bytes that failed to decode and the error
    /// that `core::str::from_utf8` reported for them.
    ///
    /// The invalid sequence is taken from the front of `original_bytes`.
    /// When `err` carries no error length, all of `original_bytes` is
    /// treated as the invalid sequence.
    ///
    /// # Panics
    ///
    /// Panics if the resulting length exceeds 3 or exceeds
    /// `original_bytes.len()`. Debug builds additionally assert that the
    /// length is at least 1.
    #[cold]
    #[inline(never)]
    fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error {
        let len = err.error_len().unwrap_or(original_bytes.len());
        // OK because the biggest invalid UTF-8
        // sequence possible is 3.
        debug_assert!((1..=3).contains(&len));
        let mut bytes = [0; 3];
        bytes[..len].copy_from_slice(&original_bytes[..len]);
        Utf8Error {
            bytes,
            // OK because the slicing above already required `len <= 3`.
            len: u8::try_from(len).unwrap(),
        }
    }

    /// Returns the slice of invalid UTF-8 bytes.
    ///
    /// The slice returned is guaranteed to have length equivalent
    /// to `Utf8Error::len`.
    pub(crate) fn as_slice(&self) -> &[u8] {
        &self.bytes[..self.len()]
    }

    /// Returns the length of the invalid UTF-8 sequence found.
    ///
    /// This is guaranteed to be 1, 2 or 3.
    pub(crate) fn len(&self) -> usize {
        usize::from(self.len)
    }
}
|
||||
|
||||
impl core::fmt::Display for Utf8Error {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"found invalid UTF-8 byte {errant_bytes:?} in format \
|
||||
string (format strings must be valid UTF-8)",
|
||||
errant_bytes = crate::shared::util::escape::Bytes(self.as_slice()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
|
||||
///
|
||||
/// If no valid encoding of a codepoint exists at the beginning of the
|
||||
/// given byte slice, then a 1-3 byte slice is returned (which is guaranteed
|
||||
/// to be a prefix of `bytes`). That byte slice corresponds either to a single
|
||||
/// invalid byte, or to a prefix of a valid UTF-8 encoding of a Unicode scalar
|
||||
/// value (but which ultimately did not lead to a valid encoding).
|
||||
///
|
||||
/// This returns `None` if and only if `bytes` is empty.
|
||||
///
|
||||
/// This never panics.
|
||||
///
|
||||
/// *WARNING*: This is not designed for performance. If you're looking for
|
||||
/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
|
||||
/// this crate, then please file an issue and discuss your use case.
|
||||
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
|
||||
if bytes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let string = match core::str::from_utf8(&bytes[..bytes.len().min(4)]) {
|
||||
Ok(s) => s,
|
||||
Err(ref err) if err.valid_up_to() > 0 => {
|
||||
// OK because we just verified we have at least some
|
||||
// valid UTF-8.
|
||||
core::str::from_utf8(&bytes[..err.valid_up_to()]).unwrap()
|
||||
}
|
||||
// In this case, we want to return 1-3 bytes that make up a prefix of
|
||||
// a potentially valid codepoint.
|
||||
Err(err) => return Some(Err(Utf8Error::new(bytes, err))),
|
||||
};
|
||||
// OK because we guaranteed above that `string`
|
||||
// must be non-empty. And thus, `str::chars` must
|
||||
// yield at least one Unicode scalar value.
|
||||
Some(Ok(string.chars().next().unwrap()))
|
||||
}
|
||||
|
|
@ -4,8 +4,119 @@ Provides convenience routines for escaping raw bytes.
|
|||
This was copied from `regex-automata` with a few light edits.
|
||||
*/
|
||||
|
||||
// These were originally defined here, but they got moved to
|
||||
// shared since they're needed there. We re-export them here
|
||||
// because this is really where they should live, but they're
|
||||
// in shared because `jiff-tzdb-static` needs it.
|
||||
pub(crate) use crate::shared::util::escape::{Byte, Bytes, RepeatByte};
|
||||
use super::utf8;
|
||||
|
||||
/// A single byte whose `Display` and `Debug` output is a printable, escaped
/// representation.
///
/// A space is written as-is. Every other byte is rendered with
/// `core::ascii::escape_default`, except that the hex digits of a `\xNN`
/// escape are upper-cased (for example, `0xAB` is written as `\xAB`). The
/// `Debug` output is the `Display` output wrapped in double quotes.
#[derive(Clone, Copy)]
pub(crate) struct Byte(pub u8);

impl core::fmt::Display for Byte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        if self.0 == b' ' {
            return f.write_str(" ");
        }
        for (i, b) in core::ascii::escape_default(self.0).enumerate() {
            // Only a `\xNN` escape is longer than two bytes, and its bytes
            // at index 2 and 3 are hex digits. Upper-casing from index 2
            // onward therefore turns `\xab` into `\xAB` and nothing else.
            let b = if i >= 2 { b.to_ascii_uppercase() } else { b };
            // `escape_default` only yields ASCII, so each byte maps to
            // exactly one `char` and can be written without a scratch
            // buffer (and without a fallible UTF-8 check).
            core::fmt::Write::write_char(&mut *f, char::from(b))?;
        }
        Ok(())
    }
}

impl core::fmt::Debug for Byte {
    #[inline(never)]
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.write_str("\"")?;
        core::fmt::Display::fmt(self, f)?;
        f.write_str("\"")
    }
}
|
||||
|
||||
/// Provides a convenient `Debug` implementation for `&[u8]`.
|
||||
///
|
||||
/// This generally works best when the bytes are presumed to be mostly
|
||||
/// UTF-8, but will work for anything. For any bytes that aren't UTF-8,
|
||||
/// they are emitted as hex escape sequences.
|
||||
#[derive(Clone, Copy)]
|
||||
pub(crate) struct Bytes<'a>(pub &'a [u8]);
|
||||
|
||||
impl<'a> core::fmt::Display for Bytes<'a> {
|
||||
#[inline(never)]
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
// This is a sad re-implementation of a similar impl found in bstr.
|
||||
let mut bytes = self.0;
|
||||
while let Some(result) = utf8::decode(bytes) {
|
||||
let ch = match result {
|
||||
Ok(ch) => ch,
|
||||
Err(err) => {
|
||||
// The decode API guarantees `errant_bytes` is non-empty.
|
||||
write!(f, r"\x{:02x}", err.as_slice()[0])?;
|
||||
bytes = &bytes[1..];
|
||||
continue;
|
||||
}
|
||||
};
|
||||
bytes = &bytes[ch.len_utf8()..];
|
||||
match ch {
|
||||
'\0' => write!(f, "\\0")?,
|
||||
'\x01'..='\x7f' => {
|
||||
write!(f, "{}", (ch as u8).escape_ascii())?;
|
||||
}
|
||||
_ => write!(f, "{}", ch.escape_debug())?,
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> core::fmt::Debug for Bytes<'a> {
|
||||
#[inline(never)]
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(f, "\"")?;
|
||||
core::fmt::Display::fmt(self, f)?;
|
||||
write!(f, "\"")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// A helper for repeating a single byte utilizing `Byte`.
|
||||
///
|
||||
/// This is limited to repeating a byte up to `u8::MAX` times in order
|
||||
/// to reduce its size overhead. And in practice, Jiff just doesn't
|
||||
/// need more than this (at time of writing, 2025-11-29).
|
||||
pub(crate) struct RepeatByte {
|
||||
pub(crate) byte: u8,
|
||||
pub(crate) count: u8,
|
||||
}
|
||||
|
||||
impl core::fmt::Display for RepeatByte {
|
||||
#[inline(never)]
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
for _ in 0..self.count {
|
||||
write!(f, "{}", Byte(self.byte))?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl core::fmt::Debug for RepeatByte {
|
||||
#[inline(never)]
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(f, "\"")?;
|
||||
core::fmt::Display::fmt(self, f)?;
|
||||
write!(f, "\"")?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,95 @@
|
|||
use core::cmp::Ordering;
|
||||
|
||||
/// An invalid UTF-8 sequence, as reported by `decode`.
///
/// It holds a copy of the offending bytes, which is guaranteed to be 1, 2
/// or 3 bytes long.
pub(crate) struct Utf8Error {
    /// Storage for the invalid bytes. Only the first `len` are meaningful.
    bytes: [u8; 3],
    /// The number of meaningful bytes in `bytes`.
    len: u8,
}

impl Utf8Error {
    /// Builds the error from the bytes that failed to decode and the error
    /// that `core::str::from_utf8` reported for them.
    ///
    /// The invalid sequence is taken from the front of `original_bytes`.
    /// When `err` carries no error length, all of `original_bytes` is
    /// treated as the invalid sequence.
    ///
    /// # Panics
    ///
    /// Panics if the resulting length exceeds 3 or exceeds
    /// `original_bytes.len()`. Debug builds additionally assert that the
    /// length is at least 1.
    #[cold]
    #[inline(never)]
    fn new(original_bytes: &[u8], err: core::str::Utf8Error) -> Utf8Error {
        let len = err.error_len().unwrap_or(original_bytes.len());
        // OK because the biggest invalid UTF-8
        // sequence possible is 3.
        debug_assert!((1..=3).contains(&len));
        let mut bytes = [0; 3];
        bytes[..len].copy_from_slice(&original_bytes[..len]);
        Utf8Error {
            bytes,
            // OK because the slicing above already required `len <= 3`.
            len: u8::try_from(len).unwrap(),
        }
    }

    /// Returns the slice of invalid UTF-8 bytes.
    ///
    /// The slice returned is guaranteed to have length equivalent
    /// to `Utf8Error::len`.
    pub(crate) fn as_slice(&self) -> &[u8] {
        &self.bytes[..self.len()]
    }

    /// Returns the length of the invalid UTF-8 sequence found.
    ///
    /// This is guaranteed to be 1, 2 or 3.
    pub(crate) fn len(&self) -> usize {
        usize::from(self.len)
    }
}
|
||||
|
||||
impl core::fmt::Display for Utf8Error {
|
||||
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
|
||||
write!(
|
||||
f,
|
||||
"found invalid UTF-8 byte {errant_bytes:?} in format \
|
||||
string (format strings must be valid UTF-8)",
|
||||
errant_bytes = crate::util::escape::Bytes(self.as_slice()),
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
|
||||
///
|
||||
/// If no valid encoding of a codepoint exists at the beginning of the
|
||||
/// given byte slice, then a 1-3 byte slice is returned (which is guaranteed
|
||||
/// to be a prefix of `bytes`). That byte slice corresponds either to a single
|
||||
/// invalid byte, or to a prefix of a valid UTF-8 encoding of a Unicode scalar
|
||||
/// value (but which ultimately did not lead to a valid encoding).
|
||||
///
|
||||
/// This returns `None` if and only if `bytes` is empty.
|
||||
///
|
||||
/// This never panics.
|
||||
///
|
||||
/// *WARNING*: This is not designed for performance. If you're looking for
|
||||
/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
|
||||
/// this crate, then please file an issue and discuss your use case.
|
||||
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, Utf8Error>> {
|
||||
if bytes.is_empty() {
|
||||
return None;
|
||||
}
|
||||
let string = match core::str::from_utf8(&bytes[..bytes.len().min(4)]) {
|
||||
Ok(s) => s,
|
||||
Err(ref err) if err.valid_up_to() > 0 => {
|
||||
// OK because we just verified we have at least some
|
||||
// valid UTF-8.
|
||||
core::str::from_utf8(&bytes[..err.valid_up_to()]).unwrap()
|
||||
}
|
||||
// In this case, we want to return 1-3 bytes that make up a prefix of
|
||||
// a potentially valid codepoint.
|
||||
Err(err) => return Some(Err(Utf8Error::new(bytes, err))),
|
||||
};
|
||||
// OK because we guaranteed above that `string`
|
||||
// must be non-empty. And thus, `str::chars` must
|
||||
// yield at least one Unicode scalar value.
|
||||
Some(Ok(string.chars().next().unwrap()))
|
||||
}
|
||||
|
||||
/// Like std's `eq_ignore_ascii_case`, but returns a full `Ordering`.
|
||||
#[inline]
|
||||
pub(crate) fn cmp_ignore_ascii_case(s1: &str, s2: &str) -> Ordering {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue