mirror of
https://github.com/python/cpython.git
synced 2025-07-24 03:35:53 +00:00
Merged revisions 84655 via svnmerge from
svn+ssh://pythondev@svn.python.org/python/branches/py3k

........
  r84655 | antoine.pitrou | 2010-09-09 22:30:23 +0200 (jeu., 09 sept. 2010) | 6 lines

  Issue #9804: ascii() now always represents unicode surrogate pairs as
  a single `\UXXXXXXXX`, regardless of whether the character is printable
  or not. Also, the "backslashreplace" error handler now joins surrogate
  pairs into a single character on UCS-2 builds.
........
This commit is contained in:
parent
8e0bb6a1e2
commit
c9a8df24cc
4 changed files with 72 additions and 17 deletions
|
@ -174,6 +174,28 @@ class BuiltinTest(unittest.TestCase):
|
||||||
a = {}
|
a = {}
|
||||||
a[0] = a
|
a[0] = a
|
||||||
self.assertEqual(ascii(a), '{0: {...}}')
|
self.assertEqual(ascii(a), '{0: {...}}')
|
||||||
|
# Advanced checks for unicode strings
|
||||||
|
def _check_uni(s):
|
||||||
|
self.assertEqual(ascii(s), repr(s))
|
||||||
|
_check_uni("'")
|
||||||
|
_check_uni('"')
|
||||||
|
_check_uni('"\'')
|
||||||
|
_check_uni('\0')
|
||||||
|
_check_uni('\r\n\t .')
|
||||||
|
# Unprintable non-ASCII characters
|
||||||
|
_check_uni('\x85')
|
||||||
|
_check_uni('\u1fff')
|
||||||
|
_check_uni('\U00012fff')
|
||||||
|
# Lone surrogates
|
||||||
|
_check_uni('\ud800')
|
||||||
|
_check_uni('\udfff')
|
||||||
|
# Issue #9804: surrogates should be joined even for printable
|
||||||
|
# wide characters (UCS-2 builds).
|
||||||
|
self.assertEqual(ascii('\U0001d121'), "'\\U0001d121'")
|
||||||
|
# All together
|
||||||
|
s = "'\0\"\n\r\t abcd\x85é\U00012fff\uD800\U0001D121xxx."
|
||||||
|
self.assertEqual(ascii(s),
|
||||||
|
r"""'\'\x00"\n\r\t abcd\x85\xe9\U00012fff\ud800\U0001d121xxx.'""")
|
||||||
|
|
||||||
def test_neg(self):
|
def test_neg(self):
|
||||||
x = -sys.maxsize-1
|
x = -sys.maxsize-1
|
||||||
|
|
|
@ -577,17 +577,31 @@ class CodecCallbackTest(unittest.TestCase):
|
||||||
UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
|
UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
|
||||||
("\\uffff", 1)
|
("\\uffff", 1)
|
||||||
)
|
)
|
||||||
if sys.maxunicode>0xffff:
|
# 1 on UCS-4 builds, 2 on UCS-2
|
||||||
self.assertEquals(
|
len_wide = len("\U00010000")
|
||||||
codecs.backslashreplace_errors(
|
self.assertEquals(
|
||||||
UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")),
|
codecs.backslashreplace_errors(
|
||||||
("\\U00010000", 1)
|
UnicodeEncodeError("ascii", "\U00010000",
|
||||||
)
|
0, len_wide, "ouch")),
|
||||||
self.assertEquals(
|
("\\U00010000", len_wide)
|
||||||
codecs.backslashreplace_errors(
|
)
|
||||||
UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")),
|
self.assertEquals(
|
||||||
("\\U0010ffff", 1)
|
codecs.backslashreplace_errors(
|
||||||
)
|
UnicodeEncodeError("ascii", "\U0010ffff",
|
||||||
|
0, len_wide, "ouch")),
|
||||||
|
("\\U0010ffff", len_wide)
|
||||||
|
)
|
||||||
|
# Lone surrogates (regardless of unicode width)
|
||||||
|
self.assertEquals(
|
||||||
|
codecs.backslashreplace_errors(
|
||||||
|
UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
|
||||||
|
("\\ud800", 1)
|
||||||
|
)
|
||||||
|
self.assertEquals(
|
||||||
|
codecs.backslashreplace_errors(
|
||||||
|
UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
|
||||||
|
("\\udfff", 1)
|
||||||
|
)
|
||||||
|
|
||||||
def test_badhandlerresults(self):
|
def test_badhandlerresults(self):
|
||||||
results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
|
results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
|
||||||
|
|
|
@ -12,6 +12,11 @@ What's New in Python 3.1.3?
|
||||||
Core and Builtins
|
Core and Builtins
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
|
- Issue #9804: ascii() now always represents unicode surrogate pairs as
|
||||||
|
a single ``\UXXXXXXXX``, regardless of whether the character is printable
|
||||||
|
or not. Also, the "backslashreplace" error handler now joins surrogate
|
||||||
|
pairs into a single character on UCS-2 builds.
|
||||||
|
|
||||||
- Issue #9797: pystate.c wrongly assumed that zero couldn't be a valid
|
- Issue #9797: pystate.c wrongly assumed that zero couldn't be a valid
|
||||||
thread-local storage key.
|
thread-local storage key.
|
||||||
|
|
||||||
|
|
|
@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
|
||||||
|
|
||||||
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
||||||
{
|
{
|
||||||
|
#ifndef Py_UNICODE_WIDE
|
||||||
|
#define IS_SURROGATE_PAIR(p, end) \
|
||||||
|
(*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
|
||||||
|
*(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
|
||||||
|
#else
|
||||||
|
#define IS_SURROGATE_PAIR(p, end) 0
|
||||||
|
#endif
|
||||||
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
|
||||||
PyObject *restuple;
|
PyObject *restuple;
|
||||||
PyObject *object;
|
PyObject *object;
|
||||||
|
@ -702,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
if (*p >= 0x100) {
|
if (*p >= 0x100) {
|
||||||
ressize += 1+1+4;
|
if (IS_SURROGATE_PAIR(p, startp+end)) {
|
||||||
|
ressize += 1+1+8;
|
||||||
|
++p;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
ressize += 1+1+4;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
ressize += 1+1+2;
|
ressize += 1+1+2;
|
||||||
|
@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
||||||
return NULL;
|
return NULL;
|
||||||
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
|
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
|
||||||
p < startp+end; ++p) {
|
p < startp+end; ++p) {
|
||||||
Py_UNICODE c = *p;
|
Py_UCS4 c = (Py_UCS4) *p;
|
||||||
*outp++ = '\\';
|
*outp++ = '\\';
|
||||||
#ifdef Py_UNICODE_WIDE
|
if (IS_SURROGATE_PAIR(p, startp+end)) {
|
||||||
|
c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
|
||||||
|
++p;
|
||||||
|
}
|
||||||
if (c >= 0x00010000) {
|
if (c >= 0x00010000) {
|
||||||
*outp++ = 'U';
|
*outp++ = 'U';
|
||||||
*outp++ = hexdigits[(c>>28)&0xf];
|
*outp++ = hexdigits[(c>>28)&0xf];
|
||||||
|
@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
||||||
*outp++ = hexdigits[(c>>12)&0xf];
|
*outp++ = hexdigits[(c>>12)&0xf];
|
||||||
*outp++ = hexdigits[(c>>8)&0xf];
|
*outp++ = hexdigits[(c>>8)&0xf];
|
||||||
}
|
}
|
||||||
else
|
else if (c >= 0x100) {
|
||||||
#endif
|
|
||||||
if (c >= 0x100) {
|
|
||||||
*outp++ = 'u';
|
*outp++ = 'u';
|
||||||
*outp++ = hexdigits[(c>>12)&0xf];
|
*outp++ = hexdigits[(c>>12)&0xf];
|
||||||
*outp++ = hexdigits[(c>>8)&0xf];
|
*outp++ = hexdigits[(c>>8)&0xf];
|
||||||
|
@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
||||||
wrong_exception_type(exc);
|
wrong_exception_type(exc);
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
#undef IS_SURROGATE_PAIR
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This handler is declared static until someone demonstrates
|
/* This handler is declared static until someone demonstrates
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue