[3.10] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (GH-134345)

If the error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer to the decoded data will become invalid
after destroying that temporary bytes object. So we need another way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().

_PyBytes_DecodeEscape() does not have such an issue, because it does not
use the error handlers registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
(cherry picked from commit 9f69a58623)
(cherry picked from commit 6279eb8c07)
(cherry picked from commit a75953b347)
(cherry picked from commit 0c33e5baed)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Serhiy Storchaka 2025-06-02 18:55:48 +03:00 committed by GitHub
parent f85e71a008
commit ab9893c406
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 163 additions and 40 deletions

View file

@ -1089,10 +1089,11 @@ _PyBytes_FormatEx(const char *format, Py_ssize_t format_len,
}
/* Unescape a backslash-escaped string. */
PyObject *_PyBytes_DecodeEscape(const char *s,
PyObject *_PyBytes_DecodeEscape2(const char *s,
Py_ssize_t len,
const char *errors,
const char **first_invalid_escape)
int *first_invalid_escape_char,
const char **first_invalid_escape_ptr)
{
int c;
char *p;
@ -1106,7 +1107,8 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
return NULL;
writer.overallocate = 1;
*first_invalid_escape = NULL;
*first_invalid_escape_char = -1;
*first_invalid_escape_ptr = NULL;
end = s + len;
while (s < end) {
@ -1181,9 +1183,10 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
break;
default:
if (*first_invalid_escape == NULL) {
*first_invalid_escape = s-1; /* Back up one char, since we've
already incremented s. */
if (*first_invalid_escape_char == -1) {
*first_invalid_escape_char = (unsigned char)s[-1];
/* Back up one char, since we've already incremented s. */
*first_invalid_escape_ptr = s - 1;
}
*p++ = '\\';
s--;
@ -1197,21 +1200,36 @@ PyObject *_PyBytes_DecodeEscape(const char *s,
return NULL;
}
// Export for binary compatibility.
/* Legacy four-argument entry point that forwards to _PyBytes_DecodeEscape2().
   That function reports the first invalid escape both as a character value
   and as a pointer; this wrapper receives the character into a local that is
   never read and hands only the pointer back through first_invalid_escape.
   The decoded object (or NULL) is returned unchanged. */
PyObject *_PyBytes_DecodeEscape(const char *s,
                                Py_ssize_t len,
                                const char *errors,
                                const char **first_invalid_escape)
{
    /* Written by the callee; intentionally discarded here. */
    int ignored_escape_char;
    PyObject *decoded = _PyBytes_DecodeEscape2(s, len, errors,
                                               &ignored_escape_char,
                                               first_invalid_escape);
    return decoded;
}
PyObject *PyBytes_DecodeEscape(const char *s,
Py_ssize_t len,
const char *errors,
Py_ssize_t Py_UNUSED(unicode),
const char *Py_UNUSED(recode_encoding))
{
const char* first_invalid_escape;
PyObject *result = _PyBytes_DecodeEscape(s, len, errors,
&first_invalid_escape);
int first_invalid_escape_char;
const char *first_invalid_escape_ptr;
PyObject *result = _PyBytes_DecodeEscape2(s, len, errors,
&first_invalid_escape_char,
&first_invalid_escape_ptr);
if (result == NULL)
return NULL;
if (first_invalid_escape != NULL) {
if (first_invalid_escape_char != -1) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"invalid escape sequence '\\%c'",
(unsigned char)*first_invalid_escape) < 0) {
first_invalid_escape_char) < 0) {
Py_DECREF(result);
return NULL;
}