gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648)

If the error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer to the decoded data will become invalid
after that temporary bytes object is destroyed, so we need another way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().

_PyBytes_DecodeEscape() does not have such an issue, because it does not
use the error handler registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
This commit is contained in:
Serhiy Storchaka 2025-05-12 20:42:23 +03:00 committed by GitHub
parent 734e15b70d
commit 9f69a58623
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 160 additions and 63 deletions

View file

@ -2,6 +2,7 @@ from _codecs import _unregister_error as _codecs_unregister_error
import codecs
import html.entities
import itertools
import re
import sys
import unicodedata
import unittest
@ -1125,7 +1126,7 @@ class CodecCallbackTest(unittest.TestCase):
text = 'abc<def>ghi'*n
text.translate(charmap)
def test_mutatingdecodehandler(self):
def test_mutating_decode_handler(self):
baddata = [
("ascii", b"\xff"),
("utf-7", b"++"),
@ -1160,6 +1161,42 @@ class CodecCallbackTest(unittest.TestCase):
for (encoding, data) in baddata:
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
def test_mutating_decode_handler_unicode_escape(self):
# Regression test for gh-133767: decode with "unicode-escape" through an
# error handler that replaces exc.object, and check that the
# DeprecationWarning for the first invalid escape is still reported.
decode = codecs.unicode_escape_decode
def mutating(exc):
# Error handler. Looks up the input bytes up to exc.end in `data`; on a hit
# it swaps exc.object for the replacement prefix plus the unconsumed tail
# and returns a (replacement text, resume position) pair.
if isinstance(exc, UnicodeDecodeError):
r = data.get(exc.object[:exc.end])
if r is not None:
exc.object = r[0] + exc.object[exc.end:]
return ('\u0404', r[1])
# Anything not listed in `data` is unexpected in this test.
raise AssertionError("don't know how to handle %r" % exc)
codecs.register_error('test.mutating2', mutating)
# Maps a truncated \x escape to (replacement prefix, resume position).
# Every prefix ends with a backslash and the resume position points at that
# backslash, so the byte that followed the bad escape in the original input
# is decoded as an escape sequence in the replaced buffer.
data = {
br'\x0': (b'\\', 0),
br'\x3': (b'xxx\\', 3),
br'\x5': (b'x\\', 1),
}
def check(input, expected, msg):
# Decode `input` with the mutating handler and assert that the result is
# (expected, len(input)) and that a DeprecationWarning containing `msg`
# was issued.
with self.assertWarns(DeprecationWarning) as cm:
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
self.assertIn(msg, str(cm.warning))
# Replacement resumes at offset 0 of the new buffer.
check(br'\x0n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
check(br'\x0n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
check(br'\x0z', '\u0404\\z', r'"\z" is an invalid escape sequence')
# Replacement is b'xxx\\' and decoding resumes at offset 3.
check(br'\x3n\zr', '\u0404\n\\zr', r'"\z" is an invalid escape sequence')
check(br'\x3zr', '\u0404\\zr', r'"\z" is an invalid escape sequence')
check(br'\x3z5', '\u0404\\z5', r'"\z" is an invalid escape sequence')
# Same input passed as a sliced memoryview, so the underlying buffer holds
# extra bytes past the end of the data being decoded.
# NOTE(review): presumably guards against reading past the slice - confirm.
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r'"\z" is an invalid escape sequence')
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r'"\z" is an invalid escape sequence')
# Replacement is b'x\\' and decoding resumes at offset 1.
check(br'\x5n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
check(br'\x5n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
check(br'\x5z', '\u0404\\z', r'"\z" is an invalid escape sequence')
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r'"\z" is an invalid escape sequence')
# issue32583
def test_crashing_decode_handler(self):
# better generating one more character to fill the extra space slot