[3.12] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133944) (#134337)

If the error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer to the decoded data will become invalid
after destroying that temporary bytes object. So we need another way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().

_PyBytes_DecodeEscape() does not have such an issue, because it does not
use the error handler registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
(cherry picked from commit 9f69a58623)
(cherry picked from commit 6279eb8c07)
This commit is contained in:
Serhiy Storchaka 2025-05-26 06:33:22 +03:00 committed by GitHub
parent 310cd8943a
commit 4398b788ff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 194 additions and 57 deletions

View file

@ -181,15 +181,18 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
len = p - buf;
s = buf;
const char *first_invalid_escape;
v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
int first_invalid_escape_char;
const char *first_invalid_escape_ptr;
v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
&first_invalid_escape_char,
&first_invalid_escape_ptr);
// HACK: later we can simply pass the line no, since we don't preserve the tokens
// when we are decoding the string but we preserve the line numbers.
if (v != NULL && first_invalid_escape != NULL && t != NULL) {
if (warn_invalid_escape_sequence(parser, s, first_invalid_escape, t) < 0) {
/* We have not decref u before because first_invalid_escape points
inside u. */
if (v != NULL && first_invalid_escape_ptr != NULL && t != NULL) {
if (warn_invalid_escape_sequence(parser, s, first_invalid_escape_ptr, t) < 0) {
/* We have not decref u before because first_invalid_escape_ptr
points inside u. */
Py_XDECREF(u);
Py_DECREF(v);
return NULL;
@ -202,14 +205,17 @@ decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
static PyObject *
decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
{
const char *first_invalid_escape;
PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
int first_invalid_escape_char;
const char *first_invalid_escape_ptr;
PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
&first_invalid_escape_char,
&first_invalid_escape_ptr);
if (result == NULL) {
return NULL;
}
if (first_invalid_escape != NULL) {
if (warn_invalid_escape_sequence(p, s, first_invalid_escape, t) < 0) {
if (first_invalid_escape_ptr != NULL) {
if (warn_invalid_escape_sequence(p, s, first_invalid_escape_ptr, t) < 0) {
Py_DECREF(result);
return NULL;
}