mirror of
https://github.com/python/cpython.git
synced 2025-08-03 00:23:06 +00:00
Issue #24848: Fixed bugs in UTF-7 decoding of malformed data:
1. Non-ASCII bytes were accepted after a shift sequence. 2. A low surrogate could be emitted in case of an error in the high surrogate. 3. In some circumstances the '\xfd' character was produced instead of the replacement character '\ufffd' (due to a bug in _PyUnicodeWriter).
This commit is contained in:
commit
58c8f2bb6d
4 changed files with 75 additions and 11 deletions
|
@ -903,6 +903,32 @@ class CP65001Test(ReadTest, unittest.TestCase):
|
|||
class UTF7Test(ReadTest, unittest.TestCase):
|
||||
encoding = "utf-7"
|
||||
|
||||
def test_ascii(self):
|
||||
# Set D (directly encoded characters)
|
||||
set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
|
||||
'abcdefghijklmnopqrstuvwxyz'
|
||||
'0123456789'
|
||||
'\'(),-./:?')
|
||||
self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
|
||||
self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
|
||||
# Set O (optional direct characters)
|
||||
set_o = ' !"#$%&*;<=>@[]^_`{|}'
|
||||
self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
|
||||
self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
|
||||
# +
|
||||
self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
|
||||
self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
|
||||
# White spaces
|
||||
ws = ' \t\n\r'
|
||||
self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
|
||||
self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
|
||||
# Other ASCII characters
|
||||
other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
|
||||
set(set_d + set_o + '+' + ws)))
|
||||
self.assertEqual(other_ascii.encode(self.encoding),
|
||||
b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
|
||||
b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')
|
||||
|
||||
def test_partial(self):
|
||||
self.check_partial(
|
||||
'a+-b\x00c\x80d\u0100e\U00010000f',
|
||||
|
@ -944,7 +970,9 @@ class UTF7Test(ReadTest, unittest.TestCase):
|
|||
|
||||
def test_errors(self):
|
||||
tests = [
|
||||
(b'\xffb', '\ufffdb'),
|
||||
(b'a\xffb', 'a\ufffdb'),
|
||||
(b'a\xff\xffb', 'a\ufffd\ufffdb'),
|
||||
(b'a+IK', 'a\ufffd'),
|
||||
(b'a+IK-b', 'a\ufffdb'),
|
||||
(b'a+IK,b', 'a\ufffdb'),
|
||||
|
@ -960,6 +988,8 @@ class UTF7Test(ReadTest, unittest.TestCase):
|
|||
(b'a+//,+IKw-b', 'a\ufffd\u20acb'),
|
||||
(b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
|
||||
(b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
|
||||
(b'a+IKw-b\xff', 'a\u20acb\ufffd'),
|
||||
(b'a+IKw\xffb', 'a\u20ac\ufffdb'),
|
||||
]
|
||||
for raw, expected in tests:
|
||||
with self.subTest(raw=raw):
|
||||
|
@ -971,8 +1001,36 @@ class UTF7Test(ReadTest, unittest.TestCase):
|
|||
self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
|
||||
self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
|
||||
self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
|
||||
self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
|
||||
self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
|
||||
self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
|
||||
self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
|
||||
self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
|
||||
b'+IKwgrNgB3KA-')
|
||||
self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
|
||||
'\u20ac\u20ac\U000104A0')
|
||||
self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
|
||||
'\u20ac\u20ac\U000104A0')
|
||||
|
||||
test_lone_surrogates = None
|
||||
def test_lone_surrogates(self):
|
||||
tests = [
|
||||
(b'a+2AE-b', 'a\ud801b'),
|
||||
(b'a+2AE\xffb', 'a\ufffdb'),
|
||||
(b'a+2AE', 'a\ufffd'),
|
||||
(b'a+2AEA-b', 'a\ufffdb'),
|
||||
(b'a+2AH-b', 'a\ufffdb'),
|
||||
(b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
|
||||
(b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
|
||||
(b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
|
||||
(b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
|
||||
(b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
|
||||
(b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
|
||||
(b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
|
||||
(b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
|
||||
]
|
||||
for raw, expected in tests:
|
||||
with self.subTest(raw=raw):
|
||||
self.assertEqual(raw.decode('utf-7', 'replace'), expected)
|
||||
|
||||
|
||||
class UTF16ExTest(unittest.TestCase):
|
||||
|
|
|
@ -1553,7 +1553,7 @@ class UnicodeTest(string_tests.CommonTest,
|
|||
self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
|
||||
|
||||
# Issue #2242: crash on some Windows/MSVC versions
|
||||
self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')
|
||||
self.assertEqual(b'+\xc1'.decode('utf-7', 'ignore'), '')
|
||||
|
||||
# Direct encoded characters
|
||||
set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
|
||||
|
@ -1995,6 +1995,7 @@ class UnicodeTest(string_tests.CommonTest,
|
|||
self.assertRaises(UnicodeError, str, b'Andr\202 x', 'ascii', 'strict')
|
||||
self.assertEqual(str(b'Andr\202 x', 'ascii', 'ignore'), "Andr x")
|
||||
self.assertEqual(str(b'Andr\202 x', 'ascii', 'replace'), 'Andr\uFFFD x')
|
||||
self.assertEqual(str(b'\202 x', 'ascii', 'replace'), '\uFFFD x')
|
||||
|
||||
# Error handling (unknown character names)
|
||||
self.assertEqual(b"\\N{foo}xx".decode("unicode-escape", "ignore"), "xx")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue