mirror of
https://github.com/python/cpython.git
synced 2025-11-13 07:26:31 +00:00
bpo-34454: fix .fromisoformat() methods crashing on inputs with surrogate code points (GH-8862)
The current C implementations **crash** if the input includes a surrogate
Unicode code point, which is not possible to encode in UTF-8.
Important notes:
1. It is possible to pass a string that cannot be encoded in UTF-8 (such as a
lone surrogate code point) as a separator to the `.isoformat()` methods.
2. The pure-Python `datetime.fromisoformat()` implementation accepts
strings with a surrogate as the separator.
In `datetime.fromisoformat()`, in the special case where the separator is a
surrogate code point (and so cannot be encoded in UTF-8), this implementation
will take a performance hit by making a copy of the input string and replacing
the separator with 'T'.
Co-authored-by: Alexey Izbyshev <izbyshev@ispras.ru>
Co-authored-by: Paul Ganssle <paul@ganssle.io>
This commit is contained in:
parent
c33bb5d401
commit
096329f0b2
3 changed files with 84 additions and 10 deletions
|
|
@ -1667,6 +1667,7 @@ class TestDate(HarmlessMixedComparison, unittest.TestCase):
|
||||||
# Test that fromisoformat() fails on invalid values
|
# Test that fromisoformat() fails on invalid values
|
||||||
bad_strs = [
|
bad_strs = [
|
||||||
'', # Empty string
|
'', # Empty string
|
||||||
|
'\ud800', # bpo-34454: Surrogate code point
|
||||||
'009-03-04', # Not 10 characters
|
'009-03-04', # Not 10 characters
|
||||||
'123456789', # Not a date
|
'123456789', # Not a date
|
||||||
'200a-12-04', # Invalid character in year
|
'200a-12-04', # Invalid character in year
|
||||||
|
|
@ -1675,6 +1676,7 @@ class TestDate(HarmlessMixedComparison, unittest.TestCase):
|
||||||
'2009-01-32', # Invalid day
|
'2009-01-32', # Invalid day
|
||||||
'2009-02-29', # Invalid leap day
|
'2009-02-29', # Invalid leap day
|
||||||
'20090228', # Valid ISO8601 output not from isoformat()
|
'20090228', # Valid ISO8601 output not from isoformat()
|
||||||
|
'2009\ud80002\ud80028', # Separators are surrogate codepoints
|
||||||
]
|
]
|
||||||
|
|
||||||
for bad_str in bad_strs:
|
for bad_str in bad_strs:
|
||||||
|
|
@ -2587,7 +2589,8 @@ class TestDateTime(TestDate):
|
||||||
' ', 'T', '\u007f', # 1-bit widths
|
' ', 'T', '\u007f', # 1-bit widths
|
||||||
'\u0080', 'ʁ', # 2-bit widths
|
'\u0080', 'ʁ', # 2-bit widths
|
||||||
'ᛇ', '時', # 3-bit widths
|
'ᛇ', '時', # 3-bit widths
|
||||||
'🐍' # 4-bit widths
|
'🐍', # 4-bit widths
|
||||||
|
'\ud800', # bpo-34454: Surrogate code point
|
||||||
]
|
]
|
||||||
|
|
||||||
for sep in separators:
|
for sep in separators:
|
||||||
|
|
@ -2639,6 +2642,7 @@ class TestDateTime(TestDate):
|
||||||
# Test that fromisoformat() fails on invalid values
|
# Test that fromisoformat() fails on invalid values
|
||||||
bad_strs = [
|
bad_strs = [
|
||||||
'', # Empty string
|
'', # Empty string
|
||||||
|
'\ud800', # bpo-34454: Surrogate code point
|
||||||
'2009.04-19T03', # Wrong first separator
|
'2009.04-19T03', # Wrong first separator
|
||||||
'2009-04.19T03', # Wrong second separator
|
'2009-04.19T03', # Wrong second separator
|
||||||
'2009-04-19T0a', # Invalid hours
|
'2009-04-19T0a', # Invalid hours
|
||||||
|
|
@ -2652,6 +2656,8 @@ class TestDateTime(TestDate):
|
||||||
'2009-04-19T03:15:45.123456+24:30', # Invalid time zone offset
|
'2009-04-19T03:15:45.123456+24:30', # Invalid time zone offset
|
||||||
'2009-04-19T03:15:45.123456-24:30', # Invalid negative offset
|
'2009-04-19T03:15:45.123456-24:30', # Invalid negative offset
|
||||||
'2009-04-10ᛇᛇᛇᛇᛇ12:15', # Too many unicode separators
|
'2009-04-10ᛇᛇᛇᛇᛇ12:15', # Too many unicode separators
|
||||||
|
'2009-04\ud80010T12:15', # Surrogate char in date
|
||||||
|
'2009-04-10T12\ud80015', # Surrogate char in time
|
||||||
'2009-04-19T1', # Incomplete hours
|
'2009-04-19T1', # Incomplete hours
|
||||||
'2009-04-19T12:3', # Incomplete minutes
|
'2009-04-19T12:3', # Incomplete minutes
|
||||||
'2009-04-19T12:30:4', # Incomplete seconds
|
'2009-04-19T12:30:4', # Incomplete seconds
|
||||||
|
|
@ -3521,6 +3527,7 @@ class TestTimeTZ(TestTime, TZInfoBase, unittest.TestCase):
|
||||||
def test_fromisoformat_fails(self):
|
def test_fromisoformat_fails(self):
|
||||||
bad_strs = [
|
bad_strs = [
|
||||||
'', # Empty string
|
'', # Empty string
|
||||||
|
'12\ud80000', # Invalid separator - surrogate char
|
||||||
'12:', # Ends on a separator
|
'12:', # Ends on a separator
|
||||||
'12:30:', # Ends on a separator
|
'12:30:', # Ends on a separator
|
||||||
'12:30:15.', # Ends on a separator
|
'12:30:15.', # Ends on a separator
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
Fix the .fromisoformat() methods of datetime types crashing when given
|
||||||
|
unicode with non-UTF-8-encodable code points. Specifically,
|
||||||
|
datetime.fromisoformat() now accepts surrogate unicode code points used as
|
||||||
|
the separator. Report and tests by Alexey Izbyshev, patch by Paul Ganssle.
|
||||||
|
|
@ -2883,6 +2883,9 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
|
||||||
Py_ssize_t len;
|
Py_ssize_t len;
|
||||||
|
|
||||||
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
|
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
|
||||||
|
if (dt_ptr == NULL) {
|
||||||
|
goto invalid_string_error;
|
||||||
|
}
|
||||||
|
|
||||||
int year = 0, month = 0, day = 0;
|
int year = 0, month = 0, day = 0;
|
||||||
|
|
||||||
|
|
@ -2894,12 +2897,15 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (rv < 0) {
|
if (rv < 0) {
|
||||||
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s",
|
goto invalid_string_error;
|
||||||
dt_ptr);
|
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return new_date_subclass_ex(year, month, day, cls);
|
return new_date_subclass_ex(year, month, day, cls);
|
||||||
|
|
||||||
|
invalid_string_error:
|
||||||
|
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R",
|
||||||
|
dtstr);
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -4258,6 +4264,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
|
||||||
Py_ssize_t len;
|
Py_ssize_t len;
|
||||||
const char *p = PyUnicode_AsUTF8AndSize(tstr, &len);
|
const char *p = PyUnicode_AsUTF8AndSize(tstr, &len);
|
||||||
|
|
||||||
|
if (p == NULL) {
|
||||||
|
goto invalid_string_error;
|
||||||
|
}
|
||||||
|
|
||||||
int hour = 0, minute = 0, second = 0, microsecond = 0;
|
int hour = 0, minute = 0, second = 0, microsecond = 0;
|
||||||
int tzoffset, tzimicrosecond = 0;
|
int tzoffset, tzimicrosecond = 0;
|
||||||
int rv = parse_isoformat_time(p, len,
|
int rv = parse_isoformat_time(p, len,
|
||||||
|
|
@ -4265,8 +4275,7 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
|
||||||
&tzoffset, &tzimicrosecond);
|
&tzoffset, &tzimicrosecond);
|
||||||
|
|
||||||
if (rv < 0) {
|
if (rv < 0) {
|
||||||
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p);
|
goto invalid_string_error;
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset,
|
PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset,
|
||||||
|
|
@ -4286,6 +4295,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
|
||||||
|
|
||||||
Py_DECREF(tzinfo);
|
Py_DECREF(tzinfo);
|
||||||
return t;
|
return t;
|
||||||
|
|
||||||
|
invalid_string_error:
|
||||||
|
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", tstr);
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -4839,6 +4852,33 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static PyObject *
|
||||||
|
_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
|
||||||
|
// `fromisoformat` allows surrogate characters in exactly one position,
|
||||||
|
// the separator; to allow datetime_fromisoformat to make the simplifying
|
||||||
|
// assumption that all valid strings can be encoded in UTF-8, this function
|
||||||
|
// replaces any surrogate character separators with `T`.
|
||||||
|
Py_ssize_t len = PyUnicode_GetLength(dtstr);
|
||||||
|
*needs_decref = 0;
|
||||||
|
if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
|
||||||
|
return dtstr;
|
||||||
|
}
|
||||||
|
|
||||||
|
PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr));
|
||||||
|
if (str_out == NULL) {
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 ||
|
||||||
|
PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
|
||||||
|
Py_DECREF(str_out);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
*needs_decref = 1;
|
||||||
|
return str_out;
|
||||||
|
}
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
|
datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
|
||||||
assert(dtstr != NULL);
|
assert(dtstr != NULL);
|
||||||
|
|
@ -4848,9 +4888,20 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int needs_decref = 0;
|
||||||
|
dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
|
||||||
|
if (dtstr == NULL) {
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
|
||||||
Py_ssize_t len;
|
Py_ssize_t len;
|
||||||
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
|
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
|
||||||
const char * p = dt_ptr;
|
|
||||||
|
if (dt_ptr == NULL) {
|
||||||
|
goto invalid_string_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *p = dt_ptr;
|
||||||
|
|
||||||
int year = 0, month = 0, day = 0;
|
int year = 0, month = 0, day = 0;
|
||||||
int hour = 0, minute = 0, second = 0, microsecond = 0;
|
int hour = 0, minute = 0, second = 0, microsecond = 0;
|
||||||
|
|
@ -4883,20 +4934,32 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
|
||||||
&tzoffset, &tzusec);
|
&tzoffset, &tzusec);
|
||||||
}
|
}
|
||||||
if (rv < 0) {
|
if (rv < 0) {
|
||||||
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr);
|
goto invalid_string_error;
|
||||||
return NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
|
PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
|
||||||
if (tzinfo == NULL) {
|
if (tzinfo == NULL) {
|
||||||
return NULL;
|
goto error;
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
|
PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
|
||||||
second, microsecond, tzinfo, cls);
|
second, microsecond, tzinfo, cls);
|
||||||
|
|
||||||
Py_DECREF(tzinfo);
|
Py_DECREF(tzinfo);
|
||||||
|
if (needs_decref) {
|
||||||
|
Py_DECREF(dtstr);
|
||||||
|
}
|
||||||
return dt;
|
return dt;
|
||||||
|
|
||||||
|
invalid_string_error:
|
||||||
|
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
|
||||||
|
|
||||||
|
error:
|
||||||
|
if (needs_decref) {
|
||||||
|
Py_DECREF(dtstr);
|
||||||
|
}
|
||||||
|
|
||||||
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue