[3.14] gh-133767: Fix use-after-free in the unicode-escape decoder with an error handler (GH-129648) (GH-133942)

If the error handler is used, a new bytes object is created to set as
the object attribute of UnicodeDecodeError, and that bytes object then
replaces the original data. A pointer to the decoded data will become invalid
after destroying that temporary bytes object. So we need another way to return
the first invalid escape from _PyUnicode_DecodeUnicodeEscapeInternal().

_PyBytes_DecodeEscape() does not have this issue, because it does not
use the error handler registry, but it should be changed for compatibility
with _PyUnicode_DecodeUnicodeEscapeInternal().
(cherry picked from commit 9f69a58623)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
This commit is contained in:
Miss Islington (bot) 2025-05-13 15:25:08 +02:00 committed by GitHub
parent f0a7a6c2cc
commit 69b4387f78
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 160 additions and 80 deletions

View file

@ -2,6 +2,7 @@ from _codecs import _unregister_error as _codecs_unregister_error
import codecs
import html.entities
import itertools
import re
import sys
import unicodedata
import unittest
@ -1125,7 +1126,7 @@ class CodecCallbackTest(unittest.TestCase):
text = 'abc<def>ghi'*n
text.translate(charmap)
def test_mutatingdecodehandler(self):
def test_mutating_decode_handler(self):
baddata = [
("ascii", b"\xff"),
("utf-7", b"++"),
@ -1160,6 +1161,42 @@ class CodecCallbackTest(unittest.TestCase):
for (encoding, data) in baddata:
self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
def test_mutating_decode_handler_unicode_escape(self):
# Decode byte strings with codecs.unicode_escape_decode using an error
# handler that replaces exc.object, then verify both the decoded result
# and the text of the DeprecationWarning that is emitted.
# NOTE(review): leading indentation is missing in this rendering, so the
# nesting of the `raise` and of the final `assertIn` cannot be confirmed
# from here -- check against the real file before editing.
decode = codecs.unicode_escape_decode
def mutating(exc):
# Error handler: look up the input up to the end of the error in `data`
# (defined below and resolved when the handler is called).
if isinstance(exc, UnicodeDecodeError):
r = data.get(exc.object[:exc.end])
if r is not None:
# Swap the looked-up prefix for r[0] in the exception's object, and
# return U+0404 as the replacement text together with r[1] as the
# second element of the handler's result tuple.
exc.object = r[0] + exc.object[exc.end:]
return ('\u0404', r[1])
raise AssertionError("don't know how to handle %r" % exc)
codecs.register_error('test.mutating2', mutating)
# Maps the bytes seen in exc.object[:exc.end] to a pair:
# (bytes that replace them in exc.object, value returned as the position).
data = {
br'\x0': (b'\\', 0),
br'\x3': (b'xxx\\', 3),
br'\x5': (b'x\\', 1),
}
def check(input, expected, msg):
# Decode `input` with the 'test.mutating2' handler; require a
# DeprecationWarning, a result equal to (expected, len(input)), and
# `msg` to appear in the warning text.
with self.assertWarns(DeprecationWarning) as cm:
self.assertEqual(decode(input, 'test.mutating2'), (expected, len(input)))
self.assertIn(msg, str(cm.warning))
# Inputs starting with \x0 (data entry: prefix replaced by b'\\', position 0).
check(br'\x0n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
check(br'\x0n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
check(br'\x0z', '\u0404\\z', r'"\z" is an invalid escape sequence')
# Inputs starting with \x3 (data entry: prefix replaced by b'xxx\\', position 3).
check(br'\x3n\zr', '\u0404\n\\zr', r'"\z" is an invalid escape sequence')
check(br'\x3zr', '\u0404\\zr', r'"\z" is an invalid escape sequence')
check(br'\x3z5', '\u0404\\z5', r'"\z" is an invalid escape sequence')
# Same data passed as a sliced memoryview, so the underlying buffer holds
# extra trailing bytes that are not part of the input.
check(memoryview(br'\x3z5x')[:-1], '\u0404\\z5', r'"\z" is an invalid escape sequence')
check(memoryview(br'\x3z5xy')[:-2], '\u0404\\z5', r'"\z" is an invalid escape sequence')
# Inputs starting with \x5 (data entry: prefix replaced by b'x\\', position 1).
check(br'\x5n\z', '\u0404\n\\z', r'"\z" is an invalid escape sequence')
check(br'\x5n\501', '\u0404\n\u0141', r'"\501" is an invalid octal escape sequence')
check(br'\x5z', '\u0404\\z', r'"\z" is an invalid escape sequence')
check(memoryview(br'\x5zy')[:-1], '\u0404\\z', r'"\z" is an invalid escape sequence')
# issue32583
def test_crashing_decode_handler(self):
# better generating one more character to fill the extra space slot