mirror of
https://github.com/python/cpython.git
synced 2025-08-04 00:48:58 +00:00
gh-80010: Expand fromisoformat to include most of ISO-8601 (#92177)
This expands `fromisoformat` to cover most of the common uses of ISO 8601. We may expand the scope more in the future.
This commit is contained in:
parent
ada8b6d1b1
commit
1303f8c927
6 changed files with 778 additions and 178 deletions
|
@ -395,6 +395,39 @@ iso_week1_monday(int year)
|
|||
return week1_monday;
|
||||
}
|
||||
|
||||
static int
|
||||
iso_to_ymd(const int iso_year, const int iso_week, const int iso_day,
|
||||
int *year, int *month, int *day) {
|
||||
if (iso_week <= 0 || iso_week >= 53) {
|
||||
int out_of_range = 1;
|
||||
if (iso_week == 53) {
|
||||
// ISO years have 53 weeks in it on years starting with a Thursday
|
||||
// and on leap years starting on Wednesday
|
||||
int first_weekday = weekday(iso_year, 1, 1);
|
||||
if (first_weekday == 3 || (first_weekday == 2 && is_leap(iso_year))) {
|
||||
out_of_range = 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (out_of_range) {
|
||||
return -2;
|
||||
}
|
||||
}
|
||||
|
||||
if (iso_day <= 0 || iso_day >= 8) {
|
||||
return -3;
|
||||
}
|
||||
|
||||
// Convert (Y, W, D) to (Y, M, D) in-place
|
||||
int day_1 = iso_week1_monday(iso_year);
|
||||
|
||||
int day_offset = (iso_week - 1)*7 + iso_day - 1;
|
||||
|
||||
ord_to_ymd(day_1 + day_offset, year, month, day);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/* ---------------------------------------------------------------------------
|
||||
* Range checkers.
|
||||
*/
|
||||
|
@ -680,6 +713,11 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d)
|
|||
* String parsing utilities and helper functions
|
||||
*/
|
||||
|
||||
static unsigned char
|
||||
is_digit(const char c) {
|
||||
return ((unsigned int)(c - '0')) < 10;
|
||||
}
|
||||
|
||||
static const char *
|
||||
parse_digits(const char *ptr, int *var, size_t num_digits)
|
||||
{
|
||||
|
@ -696,14 +734,17 @@ parse_digits(const char *ptr, int *var, size_t num_digits)
|
|||
}
|
||||
|
||||
static int
|
||||
parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
|
||||
parse_isoformat_date(const char *dtstr, const size_t len, int *year, int *month, int *day)
|
||||
{
|
||||
/* Parse the date components of the result of date.isoformat()
|
||||
*
|
||||
* Return codes:
|
||||
* 0: Success
|
||||
* -1: Failed to parse date component
|
||||
* -2: Failed to parse dateseparator
|
||||
* -2: Inconsistent date separator usage
|
||||
* -3: Failed to parse ISO week.
|
||||
* -4: Failed to parse ISO day.
|
||||
* -5, -6: Failure in iso_to_ymd
|
||||
*/
|
||||
const char *p = dtstr;
|
||||
p = parse_digits(p, year, 4);
|
||||
|
@ -711,8 +752,42 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
|
|||
return -1;
|
||||
}
|
||||
|
||||
if (*(p++) != '-') {
|
||||
return -2;
|
||||
const unsigned char uses_separator = (*p == '-');
|
||||
if (uses_separator) {
|
||||
++p;
|
||||
}
|
||||
|
||||
if(*p == 'W') {
|
||||
// This is an isocalendar-style date string
|
||||
p++;
|
||||
int iso_week = 0;
|
||||
int iso_day = 0;
|
||||
|
||||
p = parse_digits(p, &iso_week, 2);
|
||||
if (NULL == p) {
|
||||
return -3;
|
||||
}
|
||||
|
||||
assert(p > dtstr);
|
||||
if ((size_t)(p - dtstr) < len) {
|
||||
if (uses_separator && *(p++) != '-') {
|
||||
return -2;
|
||||
}
|
||||
|
||||
p = parse_digits(p, &iso_day, 1);
|
||||
if (NULL == p) {
|
||||
return -4;
|
||||
}
|
||||
} else {
|
||||
iso_day = 1;
|
||||
}
|
||||
|
||||
int rv = iso_to_ymd(*year, iso_week, iso_day, year, month, day);
|
||||
if (rv) {
|
||||
return -3 + rv;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
p = parse_digits(p, month, 2);
|
||||
|
@ -720,15 +795,13 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
|
|||
return -1;
|
||||
}
|
||||
|
||||
if (*(p++) != '-') {
|
||||
if (uses_separator && *(p++) != '-') {
|
||||
return -2;
|
||||
}
|
||||
|
||||
p = parse_digits(p, day, 2);
|
||||
if (p == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -736,11 +809,14 @@ static int
|
|||
parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
|
||||
int *minute, int *second, int *microsecond)
|
||||
{
|
||||
*hour = *minute = *second = *microsecond = 0;
|
||||
const char *p = tstr;
|
||||
const char *p_end = tstr_end;
|
||||
int *vals[3] = {hour, minute, second};
|
||||
// This is initialized to satisfy an erroneous compiler warning.
|
||||
unsigned char has_separator = 1;
|
||||
|
||||
// Parse [HH[:MM[:SS]]]
|
||||
// Parse [HH[:?MM[:?SS]]]
|
||||
for (size_t i = 0; i < 3; ++i) {
|
||||
p = parse_digits(p, vals[i], 2);
|
||||
if (NULL == p) {
|
||||
|
@ -748,33 +824,47 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
|
|||
}
|
||||
|
||||
char c = *(p++);
|
||||
if (i == 0) {
|
||||
has_separator = (c == ':');
|
||||
}
|
||||
|
||||
if (p >= p_end) {
|
||||
return c != '\0';
|
||||
}
|
||||
else if (c == ':') {
|
||||
else if (has_separator && (c == ':')) {
|
||||
continue;
|
||||
}
|
||||
else if (c == '.') {
|
||||
else if (c == '.' || c == ',') {
|
||||
break;
|
||||
}
|
||||
else {
|
||||
} else if (!has_separator) {
|
||||
--p;
|
||||
} else {
|
||||
return -4; // Malformed time separator
|
||||
}
|
||||
}
|
||||
|
||||
// Parse .fff[fff]
|
||||
// Parse fractional components
|
||||
size_t len_remains = p_end - p;
|
||||
if (!(len_remains == 6 || len_remains == 3)) {
|
||||
return -3;
|
||||
size_t to_parse = len_remains;
|
||||
if (len_remains >= 6) {
|
||||
to_parse = 6;
|
||||
}
|
||||
|
||||
p = parse_digits(p, microsecond, len_remains);
|
||||
p = parse_digits(p, microsecond, to_parse);
|
||||
if (NULL == p) {
|
||||
return -3;
|
||||
}
|
||||
|
||||
if (len_remains == 3) {
|
||||
*microsecond *= 1000;
|
||||
static int correction[] = {
|
||||
100000, 10000, 1000, 100, 10
|
||||
};
|
||||
|
||||
if (to_parse < 6) {
|
||||
*microsecond *= correction[to_parse-1];
|
||||
}
|
||||
|
||||
while (is_digit(*p)){
|
||||
++p; // skip truncated digits
|
||||
}
|
||||
|
||||
// Return 1 if it's not the end of the string
|
||||
|
@ -800,7 +890,7 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
|
|||
|
||||
const char *tzinfo_pos = p;
|
||||
do {
|
||||
if (*tzinfo_pos == '+' || *tzinfo_pos == '-') {
|
||||
if (*tzinfo_pos == 'Z' || *tzinfo_pos == '+' || *tzinfo_pos == '-') {
|
||||
break;
|
||||
}
|
||||
} while (++tzinfo_pos < p_end);
|
||||
|
@ -822,14 +912,16 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
|
|||
}
|
||||
}
|
||||
|
||||
// Parse time zone component
|
||||
// Valid formats are:
|
||||
// - +HH:MM (len 6)
|
||||
// - +HH:MM:SS (len 9)
|
||||
// - +HH:MM:SS.ffffff (len 16)
|
||||
size_t tzlen = p_end - tzinfo_pos;
|
||||
if (!(tzlen == 6 || tzlen == 9 || tzlen == 16)) {
|
||||
return -5;
|
||||
// Special case UTC / Zulu time.
|
||||
if (*tzinfo_pos == 'Z') {
|
||||
*tzoffset = 0;
|
||||
*tzmicrosecond = 0;
|
||||
|
||||
if (*(tzinfo_pos + 1) != '\0') {
|
||||
return -5;
|
||||
} else {
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
int tzsign = (*tzinfo_pos == '-') ? -1 : 1;
|
||||
|
@ -2983,8 +3075,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr)
|
|||
int year = 0, month = 0, day = 0;
|
||||
|
||||
int rv;
|
||||
if (len == 10) {
|
||||
rv = parse_isoformat_date(dt_ptr, &year, &month, &day);
|
||||
if (len == 7 || len == 8 || len == 10) {
|
||||
rv = parse_isoformat_date(dt_ptr, len, &year, &month, &day);
|
||||
}
|
||||
else {
|
||||
rv = -1;
|
||||
|
@ -3027,37 +3119,21 @@ date_fromisocalendar(PyObject *cls, PyObject *args, PyObject *kw)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (week <= 0 || week >= 53) {
|
||||
int out_of_range = 1;
|
||||
if (week == 53) {
|
||||
// ISO years have 53 weeks in it on years starting with a Thursday
|
||||
// and on leap years starting on Wednesday
|
||||
int first_weekday = weekday(year, 1, 1);
|
||||
if (first_weekday == 3 || (first_weekday == 2 && is_leap(year))) {
|
||||
out_of_range = 0;
|
||||
}
|
||||
}
|
||||
int month;
|
||||
int rv = iso_to_ymd(year, week, day, &year, &month, &day);
|
||||
|
||||
if (out_of_range) {
|
||||
PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (rv == -2) {
|
||||
PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (day <= 0 || day >= 8) {
|
||||
if (rv == -3) {
|
||||
PyErr_Format(PyExc_ValueError, "Invalid day: %d (range is [1, 7])",
|
||||
day);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Convert (Y, W, D) to (Y, M, D) in-place
|
||||
int day_1 = iso_week1_monday(year);
|
||||
|
||||
int month = week;
|
||||
int day_offset = (month - 1)*7 + day - 1;
|
||||
|
||||
ord_to_ymd(day_1 + day_offset, &year, &month, &day);
|
||||
|
||||
return new_date_subclass_ex(year, month, day, cls);
|
||||
}
|
||||
|
||||
|
@ -3489,7 +3565,7 @@ static PyMethodDef date_methods[] = {
|
|||
|
||||
{"fromisoformat", (PyCFunction)date_fromisoformat, METH_O |
|
||||
METH_CLASS,
|
||||
PyDoc_STR("str -> Construct a date from the output of date.isoformat()")},
|
||||
PyDoc_STR("str -> Construct a date from a string in ISO 8601 format.")},
|
||||
|
||||
{"fromisocalendar", _PyCFunction_CAST(date_fromisocalendar),
|
||||
METH_VARARGS | METH_KEYWORDS | METH_CLASS,
|
||||
|
@ -4564,6 +4640,14 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
|
|||
goto invalid_string_error;
|
||||
}
|
||||
|
||||
// The spec actually requires that time-only ISO 8601 strings start with
|
||||
// T, but the extended format allows this to be omitted as long as there
|
||||
// is no ambiguity with date strings.
|
||||
if (*p == 'T') {
|
||||
++p;
|
||||
len -= 1;
|
||||
}
|
||||
|
||||
int hour = 0, minute = 0, second = 0, microsecond = 0;
|
||||
int tzoffset, tzimicrosecond = 0;
|
||||
int rv = parse_isoformat_time(p, len,
|
||||
|
@ -4671,7 +4755,7 @@ static PyMethodDef time_methods[] = {
|
|||
PyDoc_STR("Return time with new specified fields.")},
|
||||
|
||||
{"fromisoformat", (PyCFunction)time_fromisoformat, METH_O | METH_CLASS,
|
||||
PyDoc_STR("string -> time from time.isoformat() output")},
|
||||
PyDoc_STR("string -> time from a string in ISO 8601 format")},
|
||||
|
||||
{"__reduce_ex__", (PyCFunction)time_reduce_ex, METH_VARARGS,
|
||||
PyDoc_STR("__reduce_ex__(proto) -> (cls, state)")},
|
||||
|
@ -5184,19 +5268,42 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
|
|||
static PyObject *
|
||||
_sanitize_isoformat_str(PyObject *dtstr)
|
||||
{
|
||||
Py_ssize_t len = PyUnicode_GetLength(dtstr);
|
||||
if (len < 7) { // All valid ISO 8601 strings are at least 7 characters long
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// `fromisoformat` allows surrogate characters in exactly one position,
|
||||
// the separator; to allow datetime_fromisoformat to make the simplifying
|
||||
// assumption that all valid strings can be encoded in UTF-8, this function
|
||||
// replaces any surrogate character separators with `T`.
|
||||
//
|
||||
// The result of this, if not NULL, returns a new reference
|
||||
Py_ssize_t len = PyUnicode_GetLength(dtstr);
|
||||
if (len < 0) {
|
||||
return NULL;
|
||||
const void* const unicode_data = PyUnicode_DATA(dtstr);
|
||||
const unsigned int kind = PyUnicode_KIND(dtstr);
|
||||
|
||||
// Depending on the format of the string, the separator can only ever be
|
||||
// in positions 7, 8 or 10. We'll check each of these for a surrogate and
|
||||
// if we find one, replace it with `T`. If there is more than one surrogate,
|
||||
// we don't have to bother sanitizing it, because the function will later
|
||||
// fail when we try to encode the string as ASCII.
|
||||
static const size_t potential_separators[3] = {7, 8, 10};
|
||||
size_t surrogate_separator = 0;
|
||||
for(size_t idx = 0;
|
||||
idx < sizeof(potential_separators) / sizeof(*potential_separators);
|
||||
++idx) {
|
||||
size_t pos = potential_separators[idx];
|
||||
if (pos > (size_t)len) {
|
||||
break;
|
||||
}
|
||||
|
||||
if(Py_UNICODE_IS_SURROGATE(PyUnicode_READ(kind, unicode_data, pos))) {
|
||||
surrogate_separator = pos;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (len <= 10 ||
|
||||
!Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
|
||||
if (surrogate_separator == 0) {
|
||||
Py_INCREF(dtstr);
|
||||
return dtstr;
|
||||
}
|
||||
|
@ -5206,7 +5313,7 @@ _sanitize_isoformat_str(PyObject *dtstr)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
|
||||
if (PyUnicode_WriteChar(str_out, surrogate_separator, (Py_UCS4)'T')) {
|
||||
Py_DECREF(str_out);
|
||||
return NULL;
|
||||
}
|
||||
|
@ -5214,6 +5321,106 @@ _sanitize_isoformat_str(PyObject *dtstr)
|
|||
return str_out;
|
||||
}
|
||||
|
||||
|
||||
static Py_ssize_t
|
||||
_find_isoformat_datetime_separator(const char *dtstr, Py_ssize_t len) {
|
||||
// The valid date formats can all be distinguished by characters 4 and 5
|
||||
// and further narrowed down by character
|
||||
// which tells us where to look for the separator character.
|
||||
// Format | As-rendered | Position
|
||||
// ---------------------------------------
|
||||
// %Y-%m-%d | YYYY-MM-DD | 10
|
||||
// %Y%m%d | YYYYMMDD | 8
|
||||
// %Y-W%V | YYYY-Www | 8
|
||||
// %YW%V | YYYYWww | 7
|
||||
// %Y-W%V-%u | YYYY-Www-d | 10
|
||||
// %YW%V%u | YYYYWwwd | 8
|
||||
// %Y-%j | YYYY-DDD | 8
|
||||
// %Y%j | YYYYDDD | 7
|
||||
//
|
||||
// Note that because we allow *any* character for the separator, in the
|
||||
// case where character 4 is W, it's not straightforward to determine where
|
||||
// the separator is — in the case of YYYY-Www-d, you have actual ambiguity,
|
||||
// e.g. 2020-W01-0000 could be YYYY-Www-D0HH or YYYY-Www-HHMM, when the
|
||||
// separator character is a number in the former case or a hyphen in the
|
||||
// latter case.
|
||||
//
|
||||
// The case of YYYYWww can be distinguished from YYYYWwwd by tracking ahead
|
||||
// to either the end of the string or the first non-numeric character —
|
||||
// since the time components all come in pairs YYYYWww#HH can be
|
||||
// distinguished from YYYYWwwd#HH by the fact that there will always be an
|
||||
// odd number of digits before the first non-digit character in the former
|
||||
// case.
|
||||
static const char date_separator = '-';
|
||||
static const char week_indicator = 'W';
|
||||
|
||||
if (len == 7) {
|
||||
return 7;
|
||||
}
|
||||
|
||||
if (dtstr[4] == date_separator) {
|
||||
// YYYY-???
|
||||
|
||||
if (dtstr[5] == week_indicator) {
|
||||
// YYYY-W??
|
||||
|
||||
if (len < 8) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (len > 8 && dtstr[8] == date_separator) {
|
||||
// YYYY-Www-D (10) or YYYY-Www-HH (8)
|
||||
if (len == 9) { return -1; }
|
||||
if (len > 10 && is_digit(dtstr[10])) {
|
||||
// This is as far as we'll try to go to resolve the
|
||||
// ambiguity for the moment — if we have YYYY-Www-##, the
|
||||
// separator is either a hyphen at 8 or a number at 10.
|
||||
//
|
||||
// We'll assume it's a hyphen at 8 because it's way more
|
||||
// likely that someone will use a hyphen as a separator
|
||||
// than a number, but at this point it's really best effort
|
||||
// because this is an extension of the spec anyway.
|
||||
return 8;
|
||||
}
|
||||
|
||||
return 10;
|
||||
} else {
|
||||
// YYYY-Www (8)
|
||||
return 8;
|
||||
}
|
||||
} else {
|
||||
// YYYY-MM-DD (10)
|
||||
return 10;
|
||||
}
|
||||
} else {
|
||||
// YYYY???
|
||||
if (dtstr[4] == week_indicator) {
|
||||
// YYYYWww (7) or YYYYWwwd (8)
|
||||
size_t idx = 7;
|
||||
for (; idx < (size_t)len; ++idx) {
|
||||
// Keep going until we run out of digits.
|
||||
if (!is_digit(dtstr[idx])) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (idx < 9) {
|
||||
return idx;
|
||||
}
|
||||
|
||||
if (idx % 2 == 0) {
|
||||
// If the index of the last number is even, it's YYYYWww
|
||||
return 7;
|
||||
} else {
|
||||
return 8;
|
||||
}
|
||||
} else {
|
||||
// YYYYMMDD (8)
|
||||
return 8;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static PyObject *
|
||||
datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
|
||||
{
|
||||
|
@ -5225,9 +5432,14 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
|
|||
return NULL;
|
||||
}
|
||||
|
||||
// We only need to sanitize this string if the separator is a surrogate
|
||||
// character. In the situation where the separator location is ambiguous,
|
||||
// we don't have to sanitize it anything because that can only happen when
|
||||
// the separator is either '-' or a number. This should mostly be a noop
|
||||
// but it makes the reference counting easier if we still sanitize.
|
||||
PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr);
|
||||
if (dtstr_clean == NULL) {
|
||||
goto error;
|
||||
goto invalid_string_error;
|
||||
}
|
||||
|
||||
Py_ssize_t len;
|
||||
|
@ -5243,30 +5455,35 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
|
|||
}
|
||||
}
|
||||
|
||||
const Py_ssize_t separator_location = _find_isoformat_datetime_separator(
|
||||
dt_ptr, len);
|
||||
|
||||
|
||||
const char *p = dt_ptr;
|
||||
|
||||
int year = 0, month = 0, day = 0;
|
||||
int hour = 0, minute = 0, second = 0, microsecond = 0;
|
||||
int tzoffset = 0, tzusec = 0;
|
||||
|
||||
// date has a fixed length of 10
|
||||
int rv = parse_isoformat_date(p, &year, &month, &day);
|
||||
// date runs up to separator_location
|
||||
int rv = parse_isoformat_date(p, separator_location, &year, &month, &day);
|
||||
|
||||
if (!rv && len > 10) {
|
||||
if (!rv && len > separator_location) {
|
||||
// In UTF-8, the length of multi-byte characters is encoded in the MSB
|
||||
if ((p[10] & 0x80) == 0) {
|
||||
p += 11;
|
||||
p += separator_location;
|
||||
if ((p[0] & 0x80) == 0) {
|
||||
p += 1;
|
||||
}
|
||||
else {
|
||||
switch (p[10] & 0xf0) {
|
||||
switch (p[0] & 0xf0) {
|
||||
case 0xe0:
|
||||
p += 13;
|
||||
p += 3;
|
||||
break;
|
||||
case 0xf0:
|
||||
p += 14;
|
||||
p += 4;
|
||||
break;
|
||||
default:
|
||||
p += 12;
|
||||
p += 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -6327,7 +6544,7 @@ static PyMethodDef datetime_methods[] = {
|
|||
|
||||
{"fromisoformat", (PyCFunction)datetime_fromisoformat,
|
||||
METH_O | METH_CLASS,
|
||||
PyDoc_STR("string -> datetime from datetime.isoformat() output")},
|
||||
PyDoc_STR("string -> datetime from a string in most ISO 8601 formats")},
|
||||
|
||||
/* Instance methods: */
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue