gh-80010: Expand fromisoformat to include most of ISO-8601 (#92177)

This expands `fromisoformat` to cover most of the common uses of ISO 8601. We may expand the scope more in the future.
This commit is contained in:
Paul Ganssle 2022-05-05 18:31:24 -06:00 committed by GitHub
parent ada8b6d1b1
commit 1303f8c927
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 778 additions and 178 deletions

View file

@ -395,6 +395,39 @@ iso_week1_monday(int year)
return week1_monday;
}
static int
iso_to_ymd(const int iso_year, const int iso_week, const int iso_day,
int *year, int *month, int *day) {
if (iso_week <= 0 || iso_week >= 53) {
int out_of_range = 1;
if (iso_week == 53) {
// ISO years have 53 weeks in it on years starting with a Thursday
// and on leap years starting on Wednesday
int first_weekday = weekday(iso_year, 1, 1);
if (first_weekday == 3 || (first_weekday == 2 && is_leap(iso_year))) {
out_of_range = 0;
}
}
if (out_of_range) {
return -2;
}
}
if (iso_day <= 0 || iso_day >= 8) {
return -3;
}
// Convert (Y, W, D) to (Y, M, D) in-place
int day_1 = iso_week1_monday(iso_year);
int day_offset = (iso_week - 1)*7 + iso_day - 1;
ord_to_ymd(day_1 + day_offset, year, month, day);
return 0;
}
/* ---------------------------------------------------------------------------
* Range checkers.
*/
@ -680,6 +713,11 @@ set_date_fields(PyDateTime_Date *self, int y, int m, int d)
* String parsing utilities and helper functions
*/
static unsigned char
is_digit(const char c) {
return ((unsigned int)(c - '0')) < 10;
}
static const char *
parse_digits(const char *ptr, int *var, size_t num_digits)
{
@ -696,14 +734,17 @@ parse_digits(const char *ptr, int *var, size_t num_digits)
}
static int
parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
parse_isoformat_date(const char *dtstr, const size_t len, int *year, int *month, int *day)
{
/* Parse the date components of the result of date.isoformat()
*
* Return codes:
* 0: Success
* -1: Failed to parse date component
* -2: Failed to parse dateseparator
* -2: Inconsistent date separator usage
* -3: Failed to parse ISO week.
* -4: Failed to parse ISO day.
* -5, -6: Failure in iso_to_ymd
*/
const char *p = dtstr;
p = parse_digits(p, year, 4);
@ -711,8 +752,42 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
return -1;
}
if (*(p++) != '-') {
return -2;
const unsigned char uses_separator = (*p == '-');
if (uses_separator) {
++p;
}
if(*p == 'W') {
// This is an isocalendar-style date string
p++;
int iso_week = 0;
int iso_day = 0;
p = parse_digits(p, &iso_week, 2);
if (NULL == p) {
return -3;
}
assert(p > dtstr);
if ((size_t)(p - dtstr) < len) {
if (uses_separator && *(p++) != '-') {
return -2;
}
p = parse_digits(p, &iso_day, 1);
if (NULL == p) {
return -4;
}
} else {
iso_day = 1;
}
int rv = iso_to_ymd(*year, iso_week, iso_day, year, month, day);
if (rv) {
return -3 + rv;
} else {
return 0;
}
}
p = parse_digits(p, month, 2);
@ -720,15 +795,13 @@ parse_isoformat_date(const char *dtstr, int *year, int *month, int *day)
return -1;
}
if (*(p++) != '-') {
if (uses_separator && *(p++) != '-') {
return -2;
}
p = parse_digits(p, day, 2);
if (p == NULL) {
return -1;
}
return 0;
}
@ -736,11 +809,14 @@ static int
parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
int *minute, int *second, int *microsecond)
{
*hour = *minute = *second = *microsecond = 0;
const char *p = tstr;
const char *p_end = tstr_end;
int *vals[3] = {hour, minute, second};
// This is initialized to satisfy an erroneous compiler warning.
unsigned char has_separator = 1;
// Parse [HH[:MM[:SS]]]
// Parse [HH[:?MM[:?SS]]]
for (size_t i = 0; i < 3; ++i) {
p = parse_digits(p, vals[i], 2);
if (NULL == p) {
@ -748,33 +824,47 @@ parse_hh_mm_ss_ff(const char *tstr, const char *tstr_end, int *hour,
}
char c = *(p++);
if (i == 0) {
has_separator = (c == ':');
}
if (p >= p_end) {
return c != '\0';
}
else if (c == ':') {
else if (has_separator && (c == ':')) {
continue;
}
else if (c == '.') {
else if (c == '.' || c == ',') {
break;
}
else {
} else if (!has_separator) {
--p;
} else {
return -4; // Malformed time separator
}
}
// Parse .fff[fff]
// Parse fractional components
size_t len_remains = p_end - p;
if (!(len_remains == 6 || len_remains == 3)) {
return -3;
size_t to_parse = len_remains;
if (len_remains >= 6) {
to_parse = 6;
}
p = parse_digits(p, microsecond, len_remains);
p = parse_digits(p, microsecond, to_parse);
if (NULL == p) {
return -3;
}
if (len_remains == 3) {
*microsecond *= 1000;
static int correction[] = {
100000, 10000, 1000, 100, 10
};
if (to_parse < 6) {
*microsecond *= correction[to_parse-1];
}
while (is_digit(*p)){
++p; // skip truncated digits
}
// Return 1 if it's not the end of the string
@ -800,7 +890,7 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
const char *tzinfo_pos = p;
do {
if (*tzinfo_pos == '+' || *tzinfo_pos == '-') {
if (*tzinfo_pos == 'Z' || *tzinfo_pos == '+' || *tzinfo_pos == '-') {
break;
}
} while (++tzinfo_pos < p_end);
@ -822,14 +912,16 @@ parse_isoformat_time(const char *dtstr, size_t dtlen, int *hour, int *minute,
}
}
// Parse time zone component
// Valid formats are:
// - +HH:MM (len 6)
// - +HH:MM:SS (len 9)
// - +HH:MM:SS.ffffff (len 16)
size_t tzlen = p_end - tzinfo_pos;
if (!(tzlen == 6 || tzlen == 9 || tzlen == 16)) {
return -5;
// Special case UTC / Zulu time.
if (*tzinfo_pos == 'Z') {
*tzoffset = 0;
*tzmicrosecond = 0;
if (*(tzinfo_pos + 1) != '\0') {
return -5;
} else {
return 1;
}
}
int tzsign = (*tzinfo_pos == '-') ? -1 : 1;
@ -2983,8 +3075,8 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr)
int year = 0, month = 0, day = 0;
int rv;
if (len == 10) {
rv = parse_isoformat_date(dt_ptr, &year, &month, &day);
if (len == 7 || len == 8 || len == 10) {
rv = parse_isoformat_date(dt_ptr, len, &year, &month, &day);
}
else {
rv = -1;
@ -3027,37 +3119,21 @@ date_fromisocalendar(PyObject *cls, PyObject *args, PyObject *kw)
return NULL;
}
if (week <= 0 || week >= 53) {
int out_of_range = 1;
if (week == 53) {
// ISO years have 53 weeks in it on years starting with a Thursday
// and on leap years starting on Wednesday
int first_weekday = weekday(year, 1, 1);
if (first_weekday == 3 || (first_weekday == 2 && is_leap(year))) {
out_of_range = 0;
}
}
int month;
int rv = iso_to_ymd(year, week, day, &year, &month, &day);
if (out_of_range) {
PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
return NULL;
}
if (rv == -2) {
PyErr_Format(PyExc_ValueError, "Invalid week: %d", week);
return NULL;
}
if (day <= 0 || day >= 8) {
if (rv == -3) {
PyErr_Format(PyExc_ValueError, "Invalid day: %d (range is [1, 7])",
day);
return NULL;
}
// Convert (Y, W, D) to (Y, M, D) in-place
int day_1 = iso_week1_monday(year);
int month = week;
int day_offset = (month - 1)*7 + day - 1;
ord_to_ymd(day_1 + day_offset, &year, &month, &day);
return new_date_subclass_ex(year, month, day, cls);
}
@ -3489,7 +3565,7 @@ static PyMethodDef date_methods[] = {
{"fromisoformat", (PyCFunction)date_fromisoformat, METH_O |
METH_CLASS,
PyDoc_STR("str -> Construct a date from the output of date.isoformat()")},
PyDoc_STR("str -> Construct a date from a string in ISO 8601 format.")},
{"fromisocalendar", _PyCFunction_CAST(date_fromisocalendar),
METH_VARARGS | METH_KEYWORDS | METH_CLASS,
@ -4564,6 +4640,14 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
goto invalid_string_error;
}
// The spec actually requires that time-only ISO 8601 strings start with
// T, but the extended format allows this to be omitted as long as there
// is no ambiguity with date strings.
if (*p == 'T') {
++p;
len -= 1;
}
int hour = 0, minute = 0, second = 0, microsecond = 0;
int tzoffset, tzimicrosecond = 0;
int rv = parse_isoformat_time(p, len,
@ -4671,7 +4755,7 @@ static PyMethodDef time_methods[] = {
PyDoc_STR("Return time with new specified fields.")},
{"fromisoformat", (PyCFunction)time_fromisoformat, METH_O | METH_CLASS,
PyDoc_STR("string -> time from time.isoformat() output")},
PyDoc_STR("string -> time from a string in ISO 8601 format")},
{"__reduce_ex__", (PyCFunction)time_reduce_ex, METH_VARARGS,
PyDoc_STR("__reduce_ex__(proto) -> (cls, state)")},
@ -5184,19 +5268,42 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
static PyObject *
_sanitize_isoformat_str(PyObject *dtstr)
{
Py_ssize_t len = PyUnicode_GetLength(dtstr);
if (len < 7) { // All valid ISO 8601 strings are at least 7 characters long
return NULL;
}
// `fromisoformat` allows surrogate characters in exactly one position,
// the separator; to allow datetime_fromisoformat to make the simplifying
// assumption that all valid strings can be encoded in UTF-8, this function
// replaces any surrogate character separators with `T`.
//
// The result of this, if not NULL, returns a new reference
Py_ssize_t len = PyUnicode_GetLength(dtstr);
if (len < 0) {
return NULL;
const void* const unicode_data = PyUnicode_DATA(dtstr);
const unsigned int kind = PyUnicode_KIND(dtstr);
// Depending on the format of the string, the separator can only ever be
// in positions 7, 8 or 10. We'll check each of these for a surrogate and
// if we find one, replace it with `T`. If there is more than one surrogate,
// we don't have to bother sanitizing it, because the function will later
// fail when we try to encode the string as ASCII.
static const size_t potential_separators[3] = {7, 8, 10};
size_t surrogate_separator = 0;
for(size_t idx = 0;
idx < sizeof(potential_separators) / sizeof(*potential_separators);
++idx) {
size_t pos = potential_separators[idx];
if (pos > (size_t)len) {
break;
}
if(Py_UNICODE_IS_SURROGATE(PyUnicode_READ(kind, unicode_data, pos))) {
surrogate_separator = pos;
break;
}
}
if (len <= 10 ||
!Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
if (surrogate_separator == 0) {
Py_INCREF(dtstr);
return dtstr;
}
@ -5206,7 +5313,7 @@ _sanitize_isoformat_str(PyObject *dtstr)
return NULL;
}
if (PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
if (PyUnicode_WriteChar(str_out, surrogate_separator, (Py_UCS4)'T')) {
Py_DECREF(str_out);
return NULL;
}
@ -5214,6 +5321,106 @@ _sanitize_isoformat_str(PyObject *dtstr)
return str_out;
}
static Py_ssize_t
_find_isoformat_datetime_separator(const char *dtstr, Py_ssize_t len) {
// The valid date formats can all be distinguished by characters 4 and 5
// and further narrowed down by character
// which tells us where to look for the separator character.
// Format | As-rendered | Position
// ---------------------------------------
// %Y-%m-%d | YYYY-MM-DD | 10
// %Y%m%d | YYYYMMDD | 8
// %Y-W%V | YYYY-Www | 8
// %YW%V | YYYYWww | 7
// %Y-W%V-%u | YYYY-Www-d | 10
// %YW%V%u | YYYYWwwd | 8
// %Y-%j | YYYY-DDD | 8
// %Y%j | YYYYDDD | 7
//
// Note that because we allow *any* character for the separator, in the
// case where character 4 is W, it's not straightforward to determine where
// the separator is — in the case of YYYY-Www-d, you have actual ambiguity,
// e.g. 2020-W01-0000 could be YYYY-Www-D0HH or YYYY-Www-HHMM, when the
// separator character is a number in the former case or a hyphen in the
// latter case.
//
// The case of YYYYWww can be distinguished from YYYYWwwd by tracking ahead
// to either the end of the string or the first non-numeric character —
// since the time components all come in pairs YYYYWww#HH can be
// distinguished from YYYYWwwd#HH by the fact that there will always be an
// odd number of digits before the first non-digit character in the former
// case.
static const char date_separator = '-';
static const char week_indicator = 'W';
if (len == 7) {
return 7;
}
if (dtstr[4] == date_separator) {
// YYYY-???
if (dtstr[5] == week_indicator) {
// YYYY-W??
if (len < 8) {
return -1;
}
if (len > 8 && dtstr[8] == date_separator) {
// YYYY-Www-D (10) or YYYY-Www-HH (8)
if (len == 9) { return -1; }
if (len > 10 && is_digit(dtstr[10])) {
// This is as far as we'll try to go to resolve the
// ambiguity for the moment — if we have YYYY-Www-##, the
// separator is either a hyphen at 8 or a number at 10.
//
// We'll assume it's a hyphen at 8 because it's way more
// likely that someone will use a hyphen as a separator
// than a number, but at this point it's really best effort
// because this is an extension of the spec anyway.
return 8;
}
return 10;
} else {
// YYYY-Www (8)
return 8;
}
} else {
// YYYY-MM-DD (10)
return 10;
}
} else {
// YYYY???
if (dtstr[4] == week_indicator) {
// YYYYWww (7) or YYYYWwwd (8)
size_t idx = 7;
for (; idx < (size_t)len; ++idx) {
// Keep going until we run out of digits.
if (!is_digit(dtstr[idx])) {
break;
}
}
if (idx < 9) {
return idx;
}
if (idx % 2 == 0) {
// If the index of the last number is even, it's YYYYWww
return 7;
} else {
return 8;
}
} else {
// YYYYMMDD (8)
return 8;
}
}
}
static PyObject *
datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
{
@ -5225,9 +5432,14 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
return NULL;
}
// We only need to sanitize this string if the separator is a surrogate
// character. In the situation where the separator location is ambiguous,
// we don't have to sanitize it anything because that can only happen when
// the separator is either '-' or a number. This should mostly be a noop
// but it makes the reference counting easier if we still sanitize.
PyObject *dtstr_clean = _sanitize_isoformat_str(dtstr);
if (dtstr_clean == NULL) {
goto error;
goto invalid_string_error;
}
Py_ssize_t len;
@ -5243,30 +5455,35 @@ datetime_fromisoformat(PyObject *cls, PyObject *dtstr)
}
}
const Py_ssize_t separator_location = _find_isoformat_datetime_separator(
dt_ptr, len);
const char *p = dt_ptr;
int year = 0, month = 0, day = 0;
int hour = 0, minute = 0, second = 0, microsecond = 0;
int tzoffset = 0, tzusec = 0;
// date has a fixed length of 10
int rv = parse_isoformat_date(p, &year, &month, &day);
// date runs up to separator_location
int rv = parse_isoformat_date(p, separator_location, &year, &month, &day);
if (!rv && len > 10) {
if (!rv && len > separator_location) {
// In UTF-8, the length of multi-byte characters is encoded in the MSB
if ((p[10] & 0x80) == 0) {
p += 11;
p += separator_location;
if ((p[0] & 0x80) == 0) {
p += 1;
}
else {
switch (p[10] & 0xf0) {
switch (p[0] & 0xf0) {
case 0xe0:
p += 13;
p += 3;
break;
case 0xf0:
p += 14;
p += 4;
break;
default:
p += 12;
p += 2;
break;
}
}
@ -6327,7 +6544,7 @@ static PyMethodDef datetime_methods[] = {
{"fromisoformat", (PyCFunction)datetime_fromisoformat,
METH_O | METH_CLASS,
PyDoc_STR("string -> datetime from datetime.isoformat() output")},
PyDoc_STR("string -> datetime from a string in most ISO 8601 formats")},
/* Instance methods: */