Andrew Gallant 2025-02-18 22:53:27 -05:00
parent 8bea2f5533
commit 007a4bffe9
No known key found for this signature in database
GPG key ID: B2E3A4923F8B0D44
20 changed files with 3560 additions and 2914 deletions

@ -111,7 +111,7 @@ fn add_years_months_days(c: &mut Criterion) {
/// This is useful when you have a known time zone already and want to get
/// a specific instant for many distinct civil datetimes in that time zone.
fn to_timestamp_static(c: &mut Criterion) {
const NAME: &str = "civil_datetime/to_datetime_static";
const NAME: &str = "civil_datetime/to_timestamp_static";
const TZNAME: &str = "America/New_York";
const STAMP: i64 = 1719755160;
const DATETIME: civil::DateTime = civil::date(2024, 6, 30).at(9, 46, 0, 0);

@ -46,7 +46,7 @@ pub fn run(p: &mut Parser) -> anyhow::Result<()> {
args::configure(p, USAGE, &mut [&mut config])?;
let jiff = config.jiff();
let table_path = jiff.join("src/util/crc32/table.rs");
let table_path = jiff.join("src/shared/crc32/table.rs");
write_crc_tables(&table_path).with_context(|| {
format!("failed to write CRC32 data table to {}", table_path.display())
})?;

@ -731,6 +731,8 @@ mod error;
pub mod fmt;
#[cfg(feature = "std")]
mod now;
+#[doc(hidden)]
+pub mod shared;
mod signed_duration;
mod span;
mod timestamp;

@ -1,4 +1,4 @@
-use crate::util::crc32::table::{TABLE, TABLE16};
+use self::table::{TABLE, TABLE16};
mod table;

@ -1,4 +1,6 @@
-pub const TABLE: [u32; 256] = [
+// auto-generated by: jiff-cli generate crc32
+pub(super) const TABLE: [u32; 256] = [
0, 4067132163, 3778769143, 324072436, 3348797215, 904991772, 648144872,
3570033899, 2329499855, 2024987596, 1809983544, 2575936315, 1296289744,
3207089363, 2893594407, 1578318884, 274646895, 3795141740, 4049975192,
@ -44,7 +46,7 @@ pub const TABLE: [u32; 256] = [
1279665062, 1595330642, 2910671697,
];
-pub const TABLE16: [[u32; 256]; 16] = [
+pub(super) const TABLE16: [[u32; 256]; 16] = [
[
0, 4067132163, 3778769143, 324072436, 3348797215, 904991772,
648144872, 3570033899, 2329499855, 2024987596, 1809983544, 2575936315,

src/shared/mod.rs (new file, 166 lines)

@ -0,0 +1,166 @@
/*!
TODO
*/
use core::ops::Range;
pub type TzifStatic = Tzif<
&'static str,
&'static [TzifLocalTimeType],
&'static [TzifTransition],
>;
#[cfg(feature = "alloc")]
pub type TzifOwned = Tzif<
alloc::string::String,
alloc::vec::Vec<TzifLocalTimeType>,
alloc::vec::Vec<TzifTransition>,
>;
#[derive(Debug)]
pub struct Tzif<STRING, TYPES, TRANS> {
pub fixed: TzifFixed<STRING>,
pub types: TYPES,
pub transitions: TRANS,
}
#[derive(Debug)]
pub struct TzifFixed<STRING> {
pub name: Option<STRING>,
pub version: u8,
pub checksum: u32,
pub designations: STRING,
pub posix_tz: Option<PosixTimeZone<STRING>>,
}
// only-jiff-impl-start
impl TzifFixed<&'static str> {
pub const fn to_jiff(
&self,
types: &'static [crate::tz::tzif::LocalTimeType],
trans: &'static [crate::tz::tzif::Transition],
) -> crate::tz::tzif::TzifStatic {
crate::tz::tzif::TzifStatic::from_shared_const(self, types, trans)
}
}
// only-jiff-impl-end
#[derive(Debug)]
pub struct TzifLocalTimeType {
pub offset: i32,
pub is_dst: bool,
pub designation: Range<u8>,
pub indicator: TzifIndicator,
}
// only-jiff-impl-start
impl TzifLocalTimeType {
pub const fn to_jiff(&self) -> crate::tz::tzif::LocalTimeType {
crate::tz::tzif::LocalTimeType::from_shared(self)
}
}
// only-jiff-impl-end
#[derive(Debug)]
pub enum TzifIndicator {
LocalWall,
LocalStandard,
UTStandard,
}
#[derive(Debug)]
pub struct TzifTransition {
pub timestamp: i64,
pub type_index: u8,
}
// only-jiff-impl-start
impl TzifTransition {
pub const fn to_jiff(
&self,
prev_offset: i32,
this_offset: i32,
) -> crate::tz::tzif::Transition {
crate::tz::tzif::Transition::from_shared(
self,
prev_offset,
this_offset,
)
}
}
// only-jiff-impl-end
#[derive(Debug, Eq, PartialEq)]
pub struct PosixTimeZone<ABBREV> {
pub std_abbrev: ABBREV,
pub std_offset: i32,
pub dst: Option<PosixDst<ABBREV>>,
}
#[derive(Debug, Eq, PartialEq)]
pub struct PosixDst<ABBREV> {
pub abbrev: ABBREV,
pub offset: i32,
pub rule: Option<PosixRule>,
}
#[derive(Debug, Eq, PartialEq)]
pub struct PosixRule {
pub start: PosixDayTime,
pub end: PosixDayTime,
}
#[derive(Debug, Eq, PartialEq)]
pub struct PosixDayTime {
pub date: PosixDay,
pub time: i32,
}
#[derive(Debug, Eq, PartialEq)]
pub enum PosixDay {
/// Julian day in a year, not counting leap days.
///
/// Valid range is `1..=365`.
JulianOne(i16),
/// Julian day in a year, counting leap days.
///
/// Valid range is `0..=365`.
JulianZero(i16),
/// The nth weekday of a month.
WeekdayOfMonth {
/// The month.
///
/// Valid range is: `1..=12`.
month: i8,
/// The week.
///
/// Valid range is `1..=5`.
///
/// One interesting thing to note here (or my interpretation anyway) is
/// that a week of `4` means the "4th weekday in a month" whereas a week
/// of `5` means the "last weekday in a month, even if it's the 4th
/// weekday." (An illustrative mapping is sketched just after this enum.)
week: i8,
/// The weekday.
///
/// Valid range is `0..=6`, with `0` corresponding to Sunday.
weekday: i8,
},
}
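// For illustration (derived from the field docs above, not from the parser
// itself): the POSIX rule `M3.5.0`, i.e. "the last Sunday in March", would
// map to
//
//     PosixDay::WeekdayOfMonth { month: 3, week: 5, weekday: 0 }
//
// while `J60` ("March 1, since leap days are not counted") would map to
// `PosixDay::JulianOne(60)`.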
// only-jiff-impl-start
impl PosixTimeZone<&'static str> {
pub const fn to_jiff(&self) -> crate::tz::posix::ReasonablePosixTimeZone {
crate::tz::posix::ReasonablePosixTimeZone::from_shared_const(self)
}
}
// only-jiff-impl-end
// Does not require `alloc`, but is only used when `alloc` is enabled.
#[cfg(feature = "alloc")]
pub(crate) mod crc32;
#[cfg(feature = "alloc")]
pub(crate) mod posix;
#[cfg(feature = "alloc")]
pub(crate) mod tzif;
pub(crate) mod util;

src/shared/posix.rs (new file, 1939 lines)

File diff suppressed because it is too large.

src/shared/tzif.rs (new file, 789 lines)

@ -0,0 +1,789 @@
#![allow(warnings)]
use alloc::{string::String, vec};
use super::{
util::{Byte, Bytes},
PosixTimeZone, TzifFixed, TzifIndicator, TzifLocalTimeType, TzifOwned,
TzifTransition,
};
macro_rules! err {
($($tt:tt)*) => {{
self::Error(alloc::format!($($tt)*))
}}
}
// These are Jiff min and max timestamp (in seconds) values.
//
// The TZif parser will clamp timestamps to this range. It's
// not ideal, but Jiff can't handle values outside of this range
// and completely refusing to use TZif data with pathological
// timestamps in typically irrelevant transitions is bad juju.
//
// Ref: https://github.com/BurntSushi/jiff/issues/163
// Ref: https://github.com/BurntSushi/jiff/pull/164
const TIMESTAMP_MIN: i64 = -377705023201;
const TIMESTAMP_MAX: i64 = 253402207200;
// Similarly for offsets, although in this case, if we find
// an offset outside of this range, we do actually error. This
// is because it could result in truly incorrect datetimes for
// actual transitions.
//
// But our supported offset range is `-25:59:59..=+25:59:59`.
// There's no real time zone with offsets even close to those
// boundaries.
//
// If there is pathological data that we should ignore, then
// we should wait for a real bug report in order to determine
// the right way to ignore/clamp it.
const OFFSET_MIN: i32 = -93599;
const OFFSET_MAX: i32 = 93599;
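// As a quick sanity check of those bounds (hand-worked, for illustration):
// 25 hours, 59 minutes and 59 seconds is 25*3600 + 59*60 + 59 = 93599
// seconds, which is exactly `OFFSET_MAX` (and, negated, `OFFSET_MIN`).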
/// An error that can be returned when parsing.
#[derive(Debug)]
pub struct Error(String);
impl core::fmt::Display for Error {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
core::fmt::Display::fmt(&self.0, f)
}
}
impl TzifOwned {
/// Parses the given data as a TZif formatted file.
///
/// The name given is attached to the `Tzif` value returned, but is
/// otherwise not significant.
///
/// If the given data is not recognized to be valid TZif, then an error is
/// returned.
///
/// In general, callers may assume that it is safe to pass arbitrary or
/// even untrusted data to this function and count on it not panicking
/// or using resources that aren't limited to a small constant factor of
/// the size of the data itself. That is, callers can reliably limit the
/// resources used by limiting the size of the data given to this parse
/// function.
pub(crate) fn parse(
name: Option<String>,
bytes: &[u8],
) -> Result<TzifOwned, Error> {
let original = bytes;
let name = name.into();
let (header32, rest) = Header::parse(4, bytes)
.map_err(|e| err!("failed to parse 32-bit header: {e}"))?;
let (mut tzif, rest) = if header32.version == 0 {
TzifOwned::parse32(name, header32, rest)?
} else {
TzifOwned::parse64(name, header32, rest)?
};
// Compute the checksum using the entire contents of the TZif data.
let tzif_raw_len = (rest.as_ptr() as usize)
.checked_sub(original.as_ptr() as usize)
.unwrap();
let tzif_raw_bytes = &original[..tzif_raw_len];
tzif.fixed.checksum = super::crc32::sum(tzif_raw_bytes);
Ok(tzif)
}
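// A minimal usage sketch (hypothetical caller; `data` is assumed to hold the
// raw bytes of some TZif file, e.g. one read from
// `/usr/share/zoneinfo/America/New_York`):
//
//     let tzif = TzifOwned::parse(
//         Some(alloc::string::String::from("America/New_York")),
//         &data,
//     )?;
//     // `tzif.fixed.posix_tz` now holds the footer's TZ string, if any.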
fn parse32<'b>(
name: Option<String>,
header32: Header,
bytes: &'b [u8],
) -> Result<(TzifOwned, &'b [u8]), Error> {
let mut tzif = TzifOwned {
fixed: TzifFixed {
name,
version: header32.version,
// filled in later
checksum: 0,
designations: String::new(),
posix_tz: None,
},
types: vec![],
transitions: vec![],
};
let rest = tzif.parse_transitions(&header32, bytes)?;
let rest = tzif.parse_transition_types(&header32, rest)?;
let rest = tzif.parse_local_time_types(&header32, rest)?;
let rest = tzif.parse_time_zone_designations(&header32, rest)?;
let rest = tzif.parse_leap_seconds(&header32, rest)?;
let rest = tzif.parse_indicators(&header32, rest)?;
Ok((tzif, rest))
}
fn parse64<'b>(
name: Option<String>,
header32: Header,
bytes: &'b [u8],
) -> Result<(TzifOwned, &'b [u8]), Error> {
let (_, rest) = try_split_at(
"V1 TZif data block",
bytes,
header32.data_block_len()?,
)?;
let (header64, rest) = Header::parse(8, rest)
.map_err(|e| err!("failed to parse 64-bit header: {e}"))?;
let mut tzif = TzifOwned {
fixed: TzifFixed {
name,
version: header64.version,
// filled in later
checksum: 0,
designations: String::new(),
posix_tz: None,
},
types: vec![],
transitions: vec![],
};
let rest = tzif.parse_transitions(&header64, rest)?;
let rest = tzif.parse_transition_types(&header64, rest)?;
let rest = tzif.parse_local_time_types(&header64, rest)?;
let rest = tzif.parse_time_zone_designations(&header64, rest)?;
let rest = tzif.parse_leap_seconds(&header64, rest)?;
let rest = tzif.parse_indicators(&header64, rest)?;
let rest = tzif.parse_footer(&header64, rest)?;
// Note that we specifically and unfortunately do not "validate"
// the POSIX TZ string here. We *should* check that it is
// consistent with the last transition, since RFC 8536 says:
//
// "If the string is nonempty and one or more
// transitions appear in the version 2+ data, the string MUST be
// consistent with the last version 2+ transition."
//
// But in this context, we don't have any of the infrastructure
// to actually do TZ operations on a POSIX time zone. It requires
// civil datetimes and a bunch of other bullshit. This means that
// this verification step doesn't run when using the `jiff-tzdb-static`
// proc macro. However, we do still run it when parsing TZif data
// at runtime.
//
// We otherwise don't check that the TZif data is fully valid. It is
// possible for it to contain superfluous information. For example, a
// non-zero local time type that is never referenced by a transition.
Ok((tzif, rest))
}
fn parse_transitions<'b>(
&mut self,
header: &Header,
bytes: &'b [u8],
) -> Result<&'b [u8], Error> {
let (bytes, rest) = try_split_at(
"transition times data block",
bytes,
header.transition_times_len()?,
)?;
let mut it = bytes.chunks_exact(header.time_size);
// RFC 8536 says: "If there are no transitions, local time for all
// timestamps is specified by the TZ string in the footer if present
// and nonempty; otherwise, it is specified by time type 0."
//
// RFC 8536 also says: "Local time for timestamps before the first
// transition is specified by the first time type (time type
// 0)."
//
// So if there are no transitions, pushing this dummy one will result
// in the desired behavior even when it's the only transition.
// Similarly, since this is the minimum timestamp value, it will
// trigger for any times before the first transition found in the TZif
// data.
self.transitions
.push(TzifTransition { timestamp: TIMESTAMP_MIN, type_index: 0 });
while let Some(chunk) = it.next() {
let mut timestamp = if header.is_32bit() {
i64::from(from_be_bytes_i32(chunk))
} else {
from_be_bytes_i64(chunk)
};
if !(TIMESTAMP_MIN <= timestamp && timestamp <= TIMESTAMP_MAX) {
// We really shouldn't error here just because the Unix
// timestamp is outside what Jiff supports, since what Jiff
// supports is _somewhat_ arbitrary. But Jiff's supported
// range is good enough for all realistic purposes, so we
// just clamp an out-of-range Unix timestamp to the Jiff
// min or max value.
//
// This can't result in the sorting order being wrong, but
// it can result in a transition that is duplicative with
// the dummy transition we inserted above. This should be
// fine.
let clamped = timestamp.clamp(TIMESTAMP_MIN, TIMESTAMP_MAX);
// only-jiff-warn-start
warn!(
"found Unix timestamp {timestamp} that is outside \
Jiff's supported range, clamping to {clamped}",
);
// only-jiff-warn-end
timestamp = clamped;
}
self.transitions.push(TzifTransition {
timestamp,
// We can't fill in the type index yet. We fill this in
// later when we parse the transition types.
type_index: 0,
});
}
assert!(it.remainder().is_empty());
Ok(rest)
}
fn parse_transition_types<'b>(
&mut self,
header: &Header,
bytes: &'b [u8],
) -> Result<&'b [u8], Error> {
let (bytes, rest) = try_split_at(
"transition types data block",
bytes,
header.transition_types_len()?,
)?;
// We start our transition indices at 1 because we always insert a
// dummy first transition corresponding to `Timestamp::MIN`. Its type
// index is always 0, so there's no need to change it here.
for (transition_index, &type_index) in (1..).zip(bytes) {
if usize::from(type_index) >= header.tzh_typecnt {
return Err(err!(
"found transition type index {type_index},
but there are only {} local time types",
header.tzh_typecnt,
));
}
self.transitions[transition_index].type_index = type_index;
}
Ok(rest)
}
fn parse_local_time_types<'b>(
&mut self,
header: &Header,
bytes: &'b [u8],
) -> Result<&'b [u8], Error> {
let (bytes, rest) = try_split_at(
"local time types data block",
bytes,
header.local_time_types_len()?,
)?;
let mut it = bytes.chunks_exact(6);
while let Some(chunk) = it.next() {
let offset = from_be_bytes_i32(&chunk[..4]);
if !(OFFSET_MIN <= offset && offset <= OFFSET_MAX) {
return Err(err!(
"found local time type with out-of-bounds offset: {offset}"
));
}
let is_dst = chunk[4] == 1;
let designation = chunk[5]..chunk[5];
self.types.push(TzifLocalTimeType {
offset,
is_dst,
designation,
indicator: TzifIndicator::LocalWall,
});
}
assert!(it.remainder().is_empty());
Ok(rest)
}
fn parse_time_zone_designations<'b>(
&mut self,
header: &Header,
bytes: &'b [u8],
) -> Result<&'b [u8], Error> {
let (bytes, rest) = try_split_at(
"time zone designations data block",
bytes,
header.time_zone_designations_len()?,
)?;
self.fixed.designations =
String::from_utf8(bytes.to_vec()).map_err(|_| {
err!(
"time zone designations are not valid UTF-8: {:?}",
Bytes(bytes),
)
})?;
// Holy hell, this is brutal. The boundary conditions are crazy.
for (i, typ) in self.types.iter_mut().enumerate() {
let start = usize::from(typ.designation.start);
let Some(suffix) = self.fixed.designations.get(start..) else {
return Err(err!(
"local time type {i} has designation index of {start}, \
but cannot be more than {}",
self.fixed.designations.len(),
));
};
let Some(len) = suffix.find('\x00') else {
return Err(err!(
"local time type {i} has designation index of {start}, \
but could not find NUL terminator after it in \
designations: {:?}",
self.fixed.designations,
));
};
let Some(end) = start.checked_add(len) else {
return Err(err!(
"local time type {i} has designation index of {start}, \
but its length {len} is too big",
));
};
typ.designation.end = u8::try_from(end).map_err(|_| {
err!(
"local time type {i} has designation range of \
{start}..{end}, but end is too big",
)
})?;
}
Ok(rest)
}
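// A worked example of the fix-up loop above (illustrative values, not taken
// from real TZif data): with `designations = "LMT\0EST\0EDT\0"` and a local
// time type whose designation index starts at 4, the NUL is found 3 bytes
// later, so the designation range becomes `4..7`, i.e. "EST".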
/// This parses the leap second corrections in the TZif data.
///
/// Note that we only parse and verify them. We don't actually use them.
/// Jiff effectively ignores leap seconds.
fn parse_leap_seconds<'b>(
&mut self,
header: &Header,
bytes: &'b [u8],
) -> Result<&'b [u8], Error> {
let (bytes, rest) = try_split_at(
"leap seconds data block",
bytes,
header.leap_second_len()?,
)?;
let chunk_len = header
.time_size
.checked_add(4)
.expect("time_size plus 4 fits in usize");
let mut it = bytes.chunks_exact(chunk_len);
while let Some(chunk) = it.next() {
let (occur_bytes, _corr_bytes) = chunk.split_at(header.time_size);
let occur = if header.is_32bit() {
i64::from(from_be_bytes_i32(occur_bytes))
} else {
from_be_bytes_i64(occur_bytes)
};
if !(TIMESTAMP_MIN <= occur && occur <= TIMESTAMP_MAX) {
// only-jiff-warn-start
warn!(
"leap second occurrence {occur} is \
not in Jiff's supported range"
)
// only-jiff-warn-end
}
}
assert!(it.remainder().is_empty());
Ok(rest)
}
fn parse_indicators<'b>(
&mut self,
header: &Header,
bytes: &'b [u8],
) -> Result<&'b [u8], Error> {
let (std_wall_bytes, rest) = try_split_at(
"standard/wall indicators data block",
bytes,
header.standard_wall_len()?,
)?;
let (ut_local_bytes, rest) = try_split_at(
"UT/local indicators data block",
rest,
header.ut_local_len()?,
)?;
if std_wall_bytes.is_empty() && !ut_local_bytes.is_empty() {
// This is a weird case, but technically possible only if all
// UT/local indicators are 0. If any are 1, then it's an error,
// because it would require the corresponding std/wall indicator
// to be 1 too. Which it can't be, because there aren't any. So
// we just check that they're all zeros.
for (i, &byte) in ut_local_bytes.iter().enumerate() {
if byte != 0 {
return Err(err!(
"found UT/local indicator '{byte}' for local time \
type {i}, but it must be 0 since all std/wall \
indicators are 0",
));
}
}
} else if !std_wall_bytes.is_empty() && ut_local_bytes.is_empty() {
for (i, &byte) in std_wall_bytes.iter().enumerate() {
// Indexing is OK because Header guarantees that the number of
// indicators is 0 or equal to the number of types.
self.types[i].indicator = if byte == 0 {
TzifIndicator::LocalWall
} else if byte == 1 {
TzifIndicator::LocalStandard
} else {
return Err(err!(
"found invalid std/wall indicator '{byte}' for \
local time type {i}, it must be 0 or 1",
));
};
}
} else if !std_wall_bytes.is_empty() && !ut_local_bytes.is_empty() {
assert_eq!(std_wall_bytes.len(), ut_local_bytes.len());
let it = std_wall_bytes.iter().zip(ut_local_bytes);
for (i, (&stdwall, &utlocal)) in it.enumerate() {
// Indexing is OK because Header guarantees that the number of
// indicators is 0 or equal to the number of types.
self.types[i].indicator = match (stdwall, utlocal) {
(0, 0) => TzifIndicator::LocalWall,
(1, 0) => TzifIndicator::LocalStandard,
(1, 1) => TzifIndicator::UTStandard,
(0, 1) => {
return Err(err!(
"found illegal ut-wall combination for \
local time type {i}, only local-wall, \
local-standard and ut-standard are allowed",
))
}
_ => {
return Err(err!(
"found illegal std/wall or ut/local value for \
local time type {i}, each must be 0 or 1",
))
}
};
}
} else {
// If they're both empty then we don't need to do anything. Every
// local time type record already has the correct default for this
// case set.
debug_assert!(std_wall_bytes.is_empty());
debug_assert!(ut_local_bytes.is_empty());
}
Ok(rest)
}
fn parse_footer<'b>(
&mut self,
_header: &Header,
bytes: &'b [u8],
) -> Result<&'b [u8], Error> {
if bytes.is_empty() {
return Err(err!(
"invalid V2+ TZif footer, expected \\n, \
but found unexpected end of data",
));
}
if bytes[0] != b'\n' {
return Err(err!(
"invalid V2+ TZif footer, expected {:?}, but found {:?}",
Byte(b'\n'),
Byte(bytes[0]),
));
}
let bytes = &bytes[1..];
// Only scan up to 1KB for the `\n` terminator in case we somehow got
// passed a huge block of bytes.
let toscan = &bytes[..bytes.len().min(1024)];
let Some(nlat) = toscan.iter().position(|&b| b == b'\n') else {
return Err(err!(
"invalid V2 TZif footer, could not find {:?} \
terminator in: {:?}",
Byte(b'\n'),
Bytes(toscan),
));
};
let (bytes, rest) = bytes.split_at(nlat);
if !bytes.is_empty() {
let posix_tz =
PosixTimeZone::parse(bytes).map_err(|e| err!("{e}"))?;
// We could in theory limit TZ strings to their strict POSIX
// definition here for TZif V2, but I don't think there is any
// harm in allowing the extensions in V2 formatted TZif data. Note
// that the GNU tooling allows it via the `TZ` environment variable
// even though POSIX doesn't specify it. This all seems okay to me
// because the V3+ extension is a strict superset of functionality.
if let Some(ref dst) = posix_tz.dst {
if dst.rule.is_none() {
return Err(err!(
"TZ string `{}` in v3+ tzfile has DST \
but no transition rules",
Bytes(bytes),
));
}
}
self.fixed.posix_tz = Some(posix_tz);
}
Ok(&rest[1..])
}
}
/// The header for a TZif formatted file.
///
/// The V2+ TZif format has two headers: one for the V1 data, and a second
/// one following the V1 data block that describes another data block which
/// uses 64-bit timestamps. The two headers have the same format and both
/// use 32-bit big-endian encoded integers.
#[derive(Debug)]
struct Header {
/// The size of the timestamps encoded in the data block.
///
/// This is guaranteed to be either 4 (for V1) or 8 (for the 64-bit header
/// block in V2+).
time_size: usize,
/// The file format version.
///
/// Note that this is either a NUL byte (for version 1), or an ASCII byte
/// corresponding to the version number. That is, `0x32` for `2`, `0x33`
/// for `3` or `0x34` for `4`. Note also that just because zoneinfo might
/// have been recently generated does not mean it uses the latest format
/// version. It seems like newer versions are only compiled by `zic` when
/// they are needed. For example, `America/New_York` on my system (as of
/// `2024-03-25`) has version `0x32`, but `Asia/Jerusalem` has version
/// `0x33`.
version: u8,
/// Number of UT/local indicators stored in the file.
///
/// This is checked to be either equal to `0` or equal to `tzh_typecnt`.
tzh_ttisutcnt: usize,
/// The number of standard/wall indicators stored in the file.
///
/// This is checked to be either equal to `0` or equal to `tzh_typecnt`.
tzh_ttisstdcnt: usize,
/// The number of leap seconds for which data entries are stored in the
/// file.
tzh_leapcnt: usize,
/// The number of transition times for which data entries are stored in
/// the file.
tzh_timecnt: usize,
/// The number of local time types for which data entries are stored in the
/// file.
///
/// This is checked to be at least `1`.
tzh_typecnt: usize,
/// The number of bytes of time zone abbreviation strings stored in the
/// file.
///
/// This is checked to be at least `1`.
tzh_charcnt: usize,
}
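// For reference, a sketch of the fixed 44-byte header layout that
// `Header::parse` below walks through (field widths in bytes):
//
//     "TZif" magic (4) | version (1) | reserved (15)
//     | tzh_ttisutcnt (4) | tzh_ttisstdcnt (4) | tzh_leapcnt (4)
//     | tzh_timecnt (4) | tzh_typecnt (4) | tzh_charcnt (4)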
impl Header {
/// Parse the header record from the given bytes.
///
/// Upon success, return the header and all bytes after the header.
///
/// The given `time_size` must be 4 or 8, corresponding to either the
/// V1 header block or the V2+ header block, respectively.
fn parse(
time_size: usize,
bytes: &[u8],
) -> Result<(Header, &[u8]), Error> {
assert!(time_size == 4 || time_size == 8, "time size must be 4 or 8");
if bytes.len() < 44 {
return Err(err!("invalid header: too short"));
}
let (magic, rest) = bytes.split_at(4);
if magic != b"TZif" {
return Err(err!("invalid header: magic bytes mismatch"));
}
let (version, rest) = rest.split_at(1);
let (_reserved, rest) = rest.split_at(15);
let (tzh_ttisutcnt_bytes, rest) = rest.split_at(4);
let (tzh_ttisstdcnt_bytes, rest) = rest.split_at(4);
let (tzh_leapcnt_bytes, rest) = rest.split_at(4);
let (tzh_timecnt_bytes, rest) = rest.split_at(4);
let (tzh_typecnt_bytes, rest) = rest.split_at(4);
let (tzh_charcnt_bytes, rest) = rest.split_at(4);
let tzh_ttisutcnt = from_be_bytes_u32_to_usize(tzh_ttisutcnt_bytes)
.map_err(|e| err!("failed to parse tzh_ttisutcnt: {e}"))?;
let tzh_ttisstdcnt = from_be_bytes_u32_to_usize(tzh_ttisstdcnt_bytes)
.map_err(|e| err!("failed to parse tzh_ttisstdcnt: {e}"))?;
let tzh_leapcnt = from_be_bytes_u32_to_usize(tzh_leapcnt_bytes)
.map_err(|e| err!("failed to parse tzh_leapcnt: {e}"))?;
let tzh_timecnt = from_be_bytes_u32_to_usize(tzh_timecnt_bytes)
.map_err(|e| err!("failed to parse tzh_timecnt: {e}"))?;
let tzh_typecnt = from_be_bytes_u32_to_usize(tzh_typecnt_bytes)
.map_err(|e| err!("failed to parse tzh_typecnt: {e}"))?;
let tzh_charcnt = from_be_bytes_u32_to_usize(tzh_charcnt_bytes)
.map_err(|e| err!("failed to parse tzh_charcnt: {e}"))?;
if tzh_ttisutcnt != 0 && tzh_ttisutcnt != tzh_typecnt {
return Err(err!(
"expected tzh_ttisutcnt={tzh_ttisutcnt} to be zero \
or equal to tzh_typecnt={tzh_typecnt}",
));
}
if tzh_ttisstdcnt != 0 && tzh_ttisstdcnt != tzh_typecnt {
return Err(err!(
"expected tzh_ttisstdcnt={tzh_ttisstdcnt} to be zero \
or equal to tzh_typecnt={tzh_typecnt}",
));
}
if tzh_typecnt < 1 {
return Err(err!(
"expected tzh_typecnt={tzh_typecnt} to be at least 1",
));
}
if tzh_charcnt < 1 {
return Err(err!(
"expected tzh_charcnt={tzh_charcnt} to be at least 1",
));
}
let header = Header {
time_size,
version: version[0],
tzh_ttisutcnt,
tzh_ttisstdcnt,
tzh_leapcnt,
tzh_timecnt,
tzh_typecnt,
tzh_charcnt,
};
Ok((header, rest))
}
/// Returns true if this header is for a 32-bit data block.
///
/// When false, it is guaranteed that this header is for a 64-bit data
/// block.
fn is_32bit(&self) -> bool {
self.time_size == 4
}
/// Returns the size of the data block, in bytes, for this header.
///
/// This returns an error if the arithmetic required to compute the
/// length would overflow.
///
/// This is useful for, e.g., skipping over the 32-bit V1 data block in
/// V2+ TZif formatted files.
fn data_block_len(&self) -> Result<usize, Error> {
let a = self.transition_times_len()?;
let b = self.transition_types_len()?;
let c = self.local_time_types_len()?;
let d = self.time_zone_designations_len()?;
let e = self.leap_second_len()?;
let f = self.standard_wall_len()?;
let g = self.ut_local_len()?;
a.checked_add(b)
.and_then(|z| z.checked_add(c))
.and_then(|z| z.checked_add(d))
.and_then(|z| z.checked_add(e))
.and_then(|z| z.checked_add(f))
.and_then(|z| z.checked_add(g))
.ok_or_else(|| {
err!(
"length of data block in V{} tzfile is too big",
self.version
)
})
}
fn transition_times_len(&self) -> Result<usize, Error> {
self.tzh_timecnt.checked_mul(self.time_size).ok_or_else(|| {
err!("tzh_timecnt value {} is too big", self.tzh_timecnt)
})
}
fn transition_types_len(&self) -> Result<usize, Error> {
Ok(self.tzh_timecnt)
}
fn local_time_types_len(&self) -> Result<usize, Error> {
self.tzh_typecnt.checked_mul(6).ok_or_else(|| {
err!("tzh_typecnt value {} is too big", self.tzh_typecnt)
})
}
fn time_zone_designations_len(&self) -> Result<usize, Error> {
Ok(self.tzh_charcnt)
}
fn leap_second_len(&self) -> Result<usize, Error> {
let record_len = self
.time_size
.checked_add(4)
.expect("4-or-8 plus 4 always fits in usize");
self.tzh_leapcnt.checked_mul(record_len).ok_or_else(|| {
err!("tzh_leapcnt value {} is too big", self.tzh_leapcnt)
})
}
fn standard_wall_len(&self) -> Result<usize, Error> {
Ok(self.tzh_ttisstdcnt)
}
fn ut_local_len(&self) -> Result<usize, Error> {
Ok(self.tzh_ttisutcnt)
}
}
/// Splits the given slice of bytes at the index given.
///
/// If the index is out of range (greater than `bytes.len()`) then an error is
/// returned. The error message will include the `what` string given, which is
/// meant to describe the thing being split.
fn try_split_at<'b>(
what: &'static str,
bytes: &'b [u8],
at: usize,
) -> Result<(&'b [u8], &'b [u8]), Error> {
if at > bytes.len() {
Err(err!(
"expected at least {at} bytes for {what}, \
but found only {} bytes",
bytes.len(),
))
} else {
Ok(bytes.split_at(at))
}
}
/// Interprets the given slice as an unsigned 32-bit big endian integer,
/// attempts to convert it to a `usize` and returns it.
///
/// # Panics
///
/// When `bytes.len() != 4`.
///
/// # Errors
///
/// This errors if the `u32` parsed from the given bytes cannot fit in a
/// `usize`.
fn from_be_bytes_u32_to_usize(bytes: &[u8]) -> Result<usize, Error> {
let n = from_be_bytes_u32(bytes);
usize::try_from(n).map_err(|_| {
err!(
"failed to parse integer {n} (too big, max allowed is {}",
usize::MAX
)
})
}
/// Interprets the given slice as an unsigned 32-bit big endian integer and
/// returns it.
///
/// # Panics
///
/// When `bytes.len() != 4`.
fn from_be_bytes_u32(bytes: &[u8]) -> u32 {
u32::from_be_bytes(bytes.try_into().unwrap())
}
/// Interprets the given slice as a signed 32-bit big endian integer and
/// returns it.
///
/// # Panics
///
/// When `bytes.len() != 4`.
fn from_be_bytes_i32(bytes: &[u8]) -> i32 {
i32::from_be_bytes(bytes.try_into().unwrap())
}
/// Interprets the given slice as a signed 64-bit big endian integer and
/// returns it.
///
/// # Panics
///
/// When `bytes.len() != 8`.
fn from_be_bytes_i64(bytes: &[u8]) -> i64 {
i64::from_be_bytes(bytes.try_into().unwrap())
}

src/shared/util.rs (new file, 128 lines)

@ -0,0 +1,128 @@
/// Provides a convenient `Debug` implementation for a `u8`.
///
/// The `Debug` impl treats the byte as an ASCII character and emits a human
/// readable representation of it. If the byte isn't ASCII, then it's
/// emitted as a hex escape sequence.
#[derive(Clone, Copy)]
pub(crate) struct Byte(pub u8);
impl core::fmt::Display for Byte {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
if self.0 == b' ' {
return write!(f, " ");
}
// 10 bytes is enough for any output from ascii::escape_default.
let mut bytes = [0u8; 10];
let mut len = 0;
for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
// capitalize \xab to \xAB
if i >= 2 && b'a' <= b && b <= b'f' {
b -= 32;
}
bytes[len] = b;
len += 1;
}
write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
}
}
impl core::fmt::Debug for Byte {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "\"")?;
core::fmt::Display::fmt(self, f)?;
write!(f, "\"")?;
Ok(())
}
}
/// Provides a convenient `Debug` implementation for `&[u8]`.
///
/// This generally works best when the bytes are presumed to be mostly
/// UTF-8, but will work for anything. For any bytes that aren't UTF-8,
/// they are emitted as hex escape sequences.
#[derive(Clone, Copy)]
pub(crate) struct Bytes<'a>(pub &'a [u8]);
impl<'a> core::fmt::Display for Bytes<'a> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
// This is a sad re-implementation of a similar impl found in bstr.
let mut bytes = self.0;
while let Some(result) = utf8_decode(bytes) {
let ch = match result {
Ok(ch) => ch,
Err(byte) => {
write!(f, r"\x{:02x}", byte)?;
bytes = &bytes[1..];
continue;
}
};
bytes = &bytes[ch.len_utf8()..];
match ch {
'\0' => write!(f, "\\0")?,
'\x01'..='\x7f' => {
write!(f, "{}", (ch as u8).escape_ascii())?;
}
_ => write!(f, "{}", ch.escape_debug())?,
}
}
Ok(())
}
}
impl<'a> core::fmt::Debug for Bytes<'a> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "\"")?;
core::fmt::Display::fmt(self, f)?;
write!(f, "\"")?;
Ok(())
}
}
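// A hand-worked illustration of the escaping above (not output captured from
// an actual run): the `Debug` impls wrap the `Display` output in quotes, so
// `format!("{:?}", Byte(b'a'))` yields `"a"`, `format!("{:?}", Byte(0xFF))`
// yields `"\xFF"` (hex escapes are upper-cased), and
// `format!("{:?}", Bytes(b"TZ\xFFif"))` yields `"TZ\xffif"` (the `Bytes` path
// leaves its hex escapes lower-case).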
/// Decodes the next UTF-8 encoded codepoint from the given byte slice.
///
/// If no valid encoding of a codepoint exists at the beginning of the
/// given byte slice, then the first byte is returned instead.
///
/// This returns `None` if and only if `bytes` is empty.
///
/// This never panics.
///
/// *WARNING*: This is not designed for performance. If you're looking for
/// a fast UTF-8 decoder, this is not it. If you feel like you need one in
/// this crate, then please file an issue and discuss your use case.
pub(crate) fn utf8_decode(bytes: &[u8]) -> Option<Result<char, u8>> {
/// Given a UTF-8 leading byte, this returns the total number of code
/// units in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this
/// returns `None`.
fn utf8_len(byte: u8) -> Option<usize> {
if byte <= 0x7F {
return Some(1);
} else if byte & 0b1100_0000 == 0b1000_0000 {
return None;
} else if byte <= 0b1101_1111 {
Some(2)
} else if byte <= 0b1110_1111 {
Some(3)
} else if byte <= 0b1111_0111 {
Some(4)
} else {
None
}
}
if bytes.is_empty() {
return None;
}
let len = match utf8_len(bytes[0]) {
None => return Some(Err(bytes[0])),
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
Some(1) => return Some(Ok(char::from(bytes[0]))),
Some(len) => len,
};
match core::str::from_utf8(&bytes[..len]) {
Ok(s) => Some(Ok(s.chars().next().unwrap())),
Err(_) => Some(Err(bytes[0])),
}
}
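// A few hand-worked cases for the contract documented above (illustrative):
//
//     utf8_decode(b"abc")          // Some(Ok('a'))   -- ASCII fast path
//     utf8_decode("é!".as_bytes()) // Some(Ok('é'))   -- valid two-byte sequence
//     utf8_decode(&[0xFF, b'a'])   // Some(Err(0xFF)) -- invalid leading byte
//     utf8_decode(&[])             // None            -- empty input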

@ -101,15 +101,13 @@ mod ambiguous;
mod concatenated;
mod db;
mod offset;
#[cfg(feature = "alloc")]
pub(crate) mod posix;
#[cfg(feature = "tz-system")]
mod system;
#[cfg(all(test, feature = "alloc"))]
mod testdata;
mod timezone;
#[cfg(feature = "alloc")]
-mod tzif;
+pub(crate) mod tzif;
// See module comment for WIP status. :-(
#[cfg(test)]
mod zic;

@ -232,7 +232,7 @@ impl Offset {
// warrant its existence. And I think I'd rather `Offset::hms` be const and
// exported instead of this monstrosity.
#[inline]
-const fn constant_seconds(seconds: i32) -> Offset {
+pub(crate) const fn constant_seconds(seconds: i32) -> Offset {
if !t::SpanZoneOffset::contains(seconds) {
panic!("invalid time zone offset seconds")
}

File diff suppressed because it is too large.

@ -1,6 +1,6 @@
use alloc::string::ToString;
-use crate::tz::tzif::Tzif;
+use crate::tz::tzif::TzifOwned;
/// A concatenated list of TZif data with a header and an index block.
///
@ -94,20 +94,20 @@ impl TzifTestFile {
/// Parse this test TZif data into a structured representation.
#[cfg(not(miri))]
-pub(crate) fn parse(self) -> Tzif {
+pub(crate) fn parse(self) -> TzifOwned {
let name = Some(self.name.to_string());
-Tzif::parse(name, self.data).unwrap_or_else(|err| {
+TzifOwned::parse(name, self.data).unwrap_or_else(|err| {
panic!("failed to parse TZif test file for {:?}: {err}", self.name)
})
}
/// Parse this test TZif data as if it were V1.
#[cfg(not(miri))]
-pub(crate) fn parse_v1(self) -> Tzif {
+pub(crate) fn parse_v1(self) -> TzifOwned {
let name = Some(self.name.to_string());
let mut data = self.data.to_vec();
data[4] = 0;
-Tzif::parse(name, &data).unwrap_or_else(|err| {
+TzifOwned::parse(name, &data).unwrap_or_else(|err| {
panic!(
"failed to parse V1 TZif test file for {:?}: {err}",
self.name

@ -1534,7 +1534,7 @@ impl core::fmt::Display for TimeZonePosix {
#[cfg(feature = "alloc")]
#[derive(Eq, PartialEq)]
struct TimeZoneTzif {
-tzif: crate::tz::tzif::Tzif,
+tzif: crate::tz::tzif::TzifOwned,
}
#[cfg(feature = "alloc")]

File diff suppressed because it is too large.

@ -46,11 +46,6 @@ impl<const N: usize> ArrayStr<N> {
Some(ArrayStr { bytes, len: len as u8 })
}
-/// Returns the capacity of this fixed string.
-pub(crate) const fn capacity() -> usize {
-N
-}
/// Append the bytes given to the end of this string.
///
/// If the capacity would be exceeded, then this is a no-op and `false`

src/util/constant.rs (new file, 13 lines)

@ -0,0 +1,13 @@
/// Unwrap an `Option<T>` in a `const` context.
///
/// If it fails, panics with the given message.
macro_rules! unwrap {
($val:expr, $msg:expr$(,)?) => {
match $val {
Some(val) => val,
None => panic!($msg),
}
};
}
pub(crate) use unwrap;
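// A minimal usage sketch (hypothetical constant, assuming the macro is in
// scope via `use crate::util::constant::unwrap`):
//
//     const SECONDS_PER_DAY: i64 =
//         unwrap!(i64::checked_mul(24, 60 * 60), "seconds per day overflowed");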

@ -4,91 +4,8 @@ Provides convenience routines for escaping raw bytes.
This was copied from `regex-automata` with a few light edits.
*/
use crate::util::utf8;
/// Provides a convenient `Debug` implementation for a `u8`.
///
/// The `Debug` impl treats the byte as an ASCII, and emits a human readable
/// representation of it. If the byte isn't ASCII, then it's emitted as a hex
/// escape sequence.
#[derive(Clone, Copy)]
pub struct Byte(pub u8);
impl core::fmt::Display for Byte {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
if self.0 == b' ' {
return write!(f, " ");
}
// 10 bytes is enough to cover any output from ascii::escape_default.
let mut bytes = [0u8; 10];
let mut len = 0;
for (i, mut b) in core::ascii::escape_default(self.0).enumerate() {
// capitalize \xab to \xAB
if i >= 2 && b'a' <= b && b <= b'f' {
b -= 32;
}
bytes[len] = b;
len += 1;
}
write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap())
}
}
impl core::fmt::Debug for Byte {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "\"")?;
core::fmt::Display::fmt(self, f)?;
write!(f, "\"")?;
Ok(())
}
}
/// Provides a convenient `Debug` implementation for `&[u8]`.
///
/// This generally works best when the bytes are presumed to be mostly UTF-8,
/// but will work for anything. For any bytes that aren't UTF-8, they are
/// emitted as hex escape sequences.
#[derive(Clone, Copy)]
pub struct Bytes<'a>(pub &'a [u8]);
impl<'a> core::fmt::Display for Bytes<'a> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
// This is a sad re-implementation of a similar impl found in bstr.
let mut bytes = self.0;
while let Some(result) = utf8::decode(bytes) {
let ch = match result {
Ok(ch) => ch,
Err(byte) => {
write!(f, r"\x{:02x}", byte)?;
bytes = &bytes[1..];
continue;
}
};
bytes = &bytes[ch.len_utf8()..];
match ch {
'\0' => write!(f, "\\0")?,
// ASCII control characters except \0, \n, \r, \t
'\x01'..='\x08'
| '\x0b'
| '\x0c'
| '\x0e'..='\x19'
| '\x7f' => {
write!(f, "\\x{:02x}", u32::from(ch))?;
}
'\n' | '\r' | '\t' | _ => {
write!(f, "{}", ch.escape_debug())?;
}
}
}
Ok(())
}
}
impl<'a> core::fmt::Debug for Bytes<'a> {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(f, "\"")?;
core::fmt::Display::fmt(self, f)?;
write!(f, "\"")?;
Ok(())
}
}
// These were originally defined here, but they got moved to
// shared since they're needed there. We re-export them here
// because this is really where they should live, but they're
// in shared because `jiff-tzdb-static` needs them.
pub(crate) use crate::shared::util::{Byte, Bytes};

@ -7,7 +7,7 @@ pub(crate) mod borrow;
))]
pub(crate) mod cache;
pub(crate) mod common;
-pub(crate) mod crc32;
+pub(crate) mod constant;
pub(crate) mod escape;
#[cfg(feature = "std")]
pub(crate) mod fs;

@ -13,19 +13,7 @@ use core::cmp::Ordering;
/// fast UTF-8 decoder, this is not it. If you feel like you need one in this
/// crate, then please file an issue and discuss your use case.
pub(crate) fn decode(bytes: &[u8]) -> Option<Result<char, u8>> {
if bytes.is_empty() {
return None;
}
let len = match utf8_len(bytes[0]) {
None => return Some(Err(bytes[0])),
Some(len) if len > bytes.len() => return Some(Err(bytes[0])),
Some(1) => return Some(Ok(char::from(bytes[0]))),
Some(len) => len,
};
match core::str::from_utf8(&bytes[..len]) {
Ok(s) => Some(Ok(s.chars().next().unwrap())),
Err(_) => Some(Err(bytes[0])),
}
crate::shared::util::utf8_decode(bytes)
}
/// Like std's `eq_ignore_ascii_case`, but returns a full `Ordering`.
@ -58,24 +46,3 @@ pub(crate) fn cmp_ignore_ascii_case_bytes(s1: &[u8], s2: &[u8]) -> Ordering {
}
}
}
/// Given a UTF-8 leading byte, this returns the total number of code units
/// in the following encoded codepoint.
///
/// If the given byte is not a valid UTF-8 leading byte, then this returns
/// `None`.
fn utf8_len(byte: u8) -> Option<usize> {
if byte <= 0x7F {
return Some(1);
} else if byte & 0b1100_0000 == 0b1000_0000 {
return None;
} else if byte <= 0b1101_1111 {
Some(2)
} else if byte <= 0b1110_1111 {
Some(3)
} else if byte <= 0b1111_0111 {
Some(4)
} else {
None
}
}