Issue #9804: ascii() now always represents unicode surrogate pairs as
a single `\UXXXXXXXX`, regardless of whether the character is printable
or not.  Also, the "backslashreplace" error handler now joins surrogate
pairs into a single character on UCS-2 builds.
This commit is contained in:
Antoine Pitrou 2010-09-09 20:30:23 +00:00
parent ea99c5c949
commit e4a189274f
4 changed files with 72 additions and 17 deletions

View file

@ -179,6 +179,28 @@ class BuiltinTest(unittest.TestCase):
a = {} a = {}
a[0] = a a[0] = a
self.assertEqual(ascii(a), '{0: {...}}') self.assertEqual(ascii(a), '{0: {...}}')
# Advanced checks for unicode strings
def _check_uni(s):
self.assertEqual(ascii(s), repr(s))
_check_uni("'")
_check_uni('"')
_check_uni('"\'')
_check_uni('\0')
_check_uni('\r\n\t .')
# Unprintable non-ASCII characters
_check_uni('\x85')
_check_uni('\u1fff')
_check_uni('\U00012fff')
# Lone surrogates
_check_uni('\ud800')
_check_uni('\udfff')
# Issue #9804: surrogates should be joined even for printable
# wide characters (UCS-2 builds).
self.assertEqual(ascii('\U0001d121'), "'\\U0001d121'")
# All together
s = "'\0\"\n\r\t abcd\x85é\U00012fff\uD800\U0001D121xxx."
self.assertEqual(ascii(s),
r"""'\'\x00"\n\r\t abcd\x85\xe9\U00012fff\ud800\U0001d121xxx.'""")
def test_neg(self): def test_neg(self):
x = -sys.maxsize-1 x = -sys.maxsize-1

View file

@ -577,16 +577,30 @@ class CodecCallbackTest(unittest.TestCase):
UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
("\\uffff", 1) ("\\uffff", 1)
) )
if sys.maxunicode>0xffff: # 1 on UCS-4 builds, 2 on UCS-2
len_wide = len("\U00010000")
self.assertEquals( self.assertEquals(
codecs.backslashreplace_errors( codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")), UnicodeEncodeError("ascii", "\U00010000",
("\\U00010000", 1) 0, len_wide, "ouch")),
("\\U00010000", len_wide)
) )
self.assertEquals( self.assertEquals(
codecs.backslashreplace_errors( codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")), UnicodeEncodeError("ascii", "\U0010ffff",
("\\U0010ffff", 1) 0, len_wide, "ouch")),
("\\U0010ffff", len_wide)
)
# Lone surrogates (regardless of unicode width)
self.assertEquals(
codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
("\\ud800", 1)
)
self.assertEquals(
codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
("\\udfff", 1)
) )
def test_badhandlerresults(self): def test_badhandlerresults(self):

View file

@ -10,6 +10,11 @@ What's New in Python 3.2 Alpha 3?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #9804: ascii() now always represents unicode surrogate pairs as
a single ``\UXXXXXXXX``, regardless of whether the character is printable
or not. Also, the "backslashreplace" error handler now joins surrogate
pairs into a single character on UCS-2 builds.
- Issue #9757: memoryview objects get a release() method to release the - Issue #9757: memoryview objects get a release() method to release the
underlying buffer (previously this was only done when deallocating the underlying buffer (previously this was only done when deallocating the
memoryview), and gain support for the context management protocol. memoryview), and gain support for the context management protocol.

View file

@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{ {
#ifndef Py_UNICODE_WIDE
#define IS_SURROGATE_PAIR(p, end) \
(*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
*(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
#else
#define IS_SURROGATE_PAIR(p, end) 0
#endif
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
PyObject *restuple; PyObject *restuple;
PyObject *object; PyObject *object;
@ -702,6 +709,11 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
else else
#endif #endif
if (*p >= 0x100) { if (*p >= 0x100) {
if (IS_SURROGATE_PAIR(p, startp+end)) {
ressize += 1+1+8;
++p;
}
else
ressize += 1+1+4; ressize += 1+1+4;
} }
else else
@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
return NULL; return NULL;
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
p < startp+end; ++p) { p < startp+end; ++p) {
Py_UNICODE c = *p; Py_UCS4 c = (Py_UCS4) *p;
*outp++ = '\\'; *outp++ = '\\';
#ifdef Py_UNICODE_WIDE if (IS_SURROGATE_PAIR(p, startp+end)) {
c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
++p;
}
if (c >= 0x00010000) { if (c >= 0x00010000) {
*outp++ = 'U'; *outp++ = 'U';
*outp++ = hexdigits[(c>>28)&0xf]; *outp++ = hexdigits[(c>>28)&0xf];
@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
*outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf]; *outp++ = hexdigits[(c>>8)&0xf];
} }
else else if (c >= 0x100) {
#endif
if (c >= 0x100) {
*outp++ = 'u'; *outp++ = 'u';
*outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf]; *outp++ = hexdigits[(c>>8)&0xf];
@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
wrong_exception_type(exc); wrong_exception_type(exc);
return NULL; return NULL;
} }
#undef IS_SURROGATE_PAIR
} }
/* This handler is declared static until someone demonstrates /* This handler is declared static until someone demonstrates