[3.12] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (#134337)

If the error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer to the decoded data will became invalid
after destroying that temporary bytes object. So we need other way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().

_PyBytes_DecodeEscape() does not have such issue, because it does not
use the error handlers registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
(cherry picked from commit 9f69a58623)
(cherry picked from commit 6279eb8c07)
This commit is contained in:
Serhiy Storchaka 2025-05-26 06:33:22 +03:00 committed by GitHub
parent 310cd8943a
commit 4398b788ff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 194 additions and 57 deletions

View file

@ -6046,13 +6046,15 @@ PyUnicode_AsUTF16String(PyObject *unicode)
/* --- Unicode Escape Codec ----------------------------------------------- */
PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
_PyUnicode_DecodeUnicodeEscapeInternal2(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char **first_invalid_escape)
int *first_invalid_escape_char,
const char **first_invalid_escape_ptr)
{
const char *starts = s;
const char *initial_starts = starts;
_PyUnicodeWriter writer;
const char *end;
PyObject *errorHandler = NULL;
@ -6061,7 +6063,8 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
PyInterpreterState *interp = _PyInterpreterState_Get();
// so we can remember if we've seen an invalid escape char or not
*first_invalid_escape = NULL;
*first_invalid_escape_char = -1;
*first_invalid_escape_ptr = NULL;
if (size == 0) {
if (consumed) {
@ -6149,9 +6152,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
}
}
if (ch > 0377) {
if (*first_invalid_escape == NULL) {
*first_invalid_escape = s-3; /* Back up 3 chars, since we've
already incremented s. */
if (*first_invalid_escape_char == -1) {
*first_invalid_escape_char = ch;
if (starts == initial_starts) {
/* Back up 3 chars, since we've already incremented s. */
*first_invalid_escape_ptr = s - 3;
}
}
}
WRITE_CHAR(ch);
@ -6252,9 +6258,12 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
goto error;
default:
if (*first_invalid_escape == NULL) {
*first_invalid_escape = s-1; /* Back up one char, since we've
already incremented s. */
if (*first_invalid_escape_char == -1) {
*first_invalid_escape_char = c;
if (starts == initial_starts) {
/* Back up one char, since we've already incremented s. */
*first_invalid_escape_ptr = s - 1;
}
}
WRITE_ASCII_CHAR('\\');
WRITE_CHAR(c);
@ -6293,24 +6302,40 @@ _PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
return NULL;
}
// Export for binary compatibility.
PyObject *
_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed,
const char **first_invalid_escape)
{
int first_invalid_escape_char;
return _PyUnicode_DecodeUnicodeEscapeInternal2(
s, size, errors, consumed,
&first_invalid_escape_char,
first_invalid_escape);
}
PyObject *
_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
Py_ssize_t size,
const char *errors,
Py_ssize_t *consumed)
{
const char *first_invalid_escape;
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
int first_invalid_escape_char;
const char *first_invalid_escape_ptr;
PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal2(s, size, errors,
consumed,
&first_invalid_escape);
&first_invalid_escape_char,
&first_invalid_escape_ptr);
if (result == NULL)
return NULL;
if (first_invalid_escape != NULL) {
unsigned char c = *first_invalid_escape;
if ('4' <= c && c <= '7') {
if (first_invalid_escape_char != -1) {
if (first_invalid_escape_char > 0xff) {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"invalid octal escape sequence '\\%.3s'",
first_invalid_escape) < 0)
"invalid octal escape sequence '\\%o'",
first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;
@ -6319,7 +6344,7 @@ _PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
else {
if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
"invalid escape sequence '\\%c'",
c) < 0)
first_invalid_escape_char) < 0)
{
Py_DECREF(result);
return NULL;