Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder already accepts them).
This commit is contained in:
Antoine Pitrou 2011-11-15 01:44:16 +01:00
commit 78edf7576e
3 changed files with 21 additions and 12 deletions

View file

@@ -1108,10 +1108,18 @@ class UnicodeTest(string_tests.CommonTest,
         for (x, y) in utfTests:
             self.assertEqual(x.encode('utf-7'), y)
 
-        # Unpaired surrogates not supported
-        self.assertRaises(UnicodeError, str, b'+3ADYAA-', 'utf-7')
+        # Unpaired surrogates are passed through
+        self.assertEqual('\uD801'.encode('utf-7'), b'+2AE-')
+        self.assertEqual('\uD801x'.encode('utf-7'), b'+2AE-x')
+        self.assertEqual('\uDC01'.encode('utf-7'), b'+3AE-')
+        self.assertEqual('\uDC01x'.encode('utf-7'), b'+3AE-x')
+        self.assertEqual(b'+2AE-'.decode('utf-7'), '\uD801')
+        self.assertEqual(b'+2AE-x'.decode('utf-7'), '\uD801x')
+        self.assertEqual(b'+3AE-'.decode('utf-7'), '\uDC01')
+        self.assertEqual(b'+3AE-x'.decode('utf-7'), '\uDC01x')
 
-        self.assertEqual(str(b'+3ADYAA-', 'utf-7', 'replace'), '\ufffd\ufffd')
+        self.assertEqual('\uD801\U000abcde'.encode('utf-7'), b'+2AHab9ze-')
+        self.assertEqual(b'+2AHab9ze-'.decode('utf-7'), '\uD801\U000abcde')
 
         # Issue #2242: crash on some Windows/MSVC versions
         self.assertEqual(b'+\xc1'.decode('utf-7'), '\xc1')

View file

@@ -10,6 +10,9 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------
 
+- Issue #13333: The UTF-7 decoder now accepts lone surrogates (the encoder
+  already accepts them).
+
 - Issue #13389: Full garbage collection passes now clear the freelists for
   list and dict objects. They already cleared other freelists in the
   interpreter.

View file

@@ -3884,21 +3884,18 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
             if (unicode_putchar(&unicode, &outpos, ch2) < 0)
                 goto onError;
             surrogate = 0;
+            continue;
         }
         else {
+            if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
+                goto onError;
             surrogate = 0;
-            errmsg = "second surrogate missing";
-            goto utf7Error;
         }
     }
-    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
+    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
         /* first surrogate */
         surrogate = outCh;
     }
-    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
-        errmsg = "unexpected second surrogate";
-        goto utf7Error;
-    }
     else {
         if (unicode_putchar(&unicode, &outpos, outCh) < 0)
             goto onError;
@@ -3909,8 +3906,9 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
     inShift = 0;
     s++;
     if (surrogate) {
-        errmsg = "second surrogate missing at end of shift sequence";
-        goto utf7Error;
+        if (unicode_putchar(&unicode, &outpos, surrogate) < 0)
+            goto onError;
+        surrogate = 0;
     }
     if (base64bits > 0) { /* left-over bits */
         if (base64bits >= 6) {