Issue #9804: ascii() now always represents Unicode surrogate pairs as a single `\UXXXXXXXX`, regardless of whether the character is printable or not. Also, the "backslashreplace" error handler now joins surrogate pairs into a single character on UCS-2 builds.
2025-09-27 18:59:43 +00:00 · 2010-09-09 20:30:23 +00:00 · 2010-09-09 20:30:23 +00:00 · e4a189274f
commit e4a189274f
parent ea99c5c949
4 changed files with 72 additions and 17 deletions
--- a/Lib/test/test_builtin.py
+++ b/Lib/test/test_builtin.py
@@ -179,6 +179,28 @@ class BuiltinTest(unittest.TestCase):
        a = {}
        a[0] = a
        self.assertEqual(ascii(a), '{0: {...}}')
        # Advanced checks for unicode strings
        def _check_uni(s):
            self.assertEqual(ascii(s), repr(s))
        _check_uni("'")
        _check_uni('"')
        _check_uni('"\'')
        _check_uni('\0')
        _check_uni('\r\n\t .')
        # Unprintable non-ASCII characters
        _check_uni('\x85')
        _check_uni('\u1fff')
        _check_uni('\U00012fff')
        # Lone surrogates
        _check_uni('\ud800')
        _check_uni('\udfff')
        # Issue #9804: surrogates should be joined even for printable
        # wide characters (UCS-2 builds).
        self.assertEqual(ascii('\U0001d121'), "'\\U0001d121'")
        # All together
        s = "'\0\"\n\r\t abcd\x85é\U00012fff\uD800\U0001D121xxx."
        self.assertEqual(ascii(s),
            r"""'\'\x00"\n\r\t abcd\x85\xe9\U00012fff\ud800\U0001d121xxx.'""")
    def test_neg(self):
        x = -sys.maxsize-1
--- a/Lib/test/test_codeccallbacks.py
+++ b/Lib/test/test_codeccallbacks.py
@@ -577,16 +577,30 @@ class CodecCallbackTest(unittest.TestCase):
                UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
            ("\\uffff", 1)
        )
-        if sys.maxunicode>0xffff:
+        # 1 on UCS-4 builds, 2 on UCS-2
        len_wide = len("\U00010000")
        self.assertEquals(
            codecs.backslashreplace_errors(
-                    UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")),
+                UnicodeEncodeError("ascii", "\U00010000",
-                ("\\U00010000", 1)
+                                   0, len_wide, "ouch")),
            ("\\U00010000", len_wide)
        )
        self.assertEquals(
            codecs.backslashreplace_errors(
-                    UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")),
+                UnicodeEncodeError("ascii", "\U0010ffff",
-                ("\\U0010ffff", 1)
+                                   0, len_wide, "ouch")),
            ("\\U0010ffff", len_wide)
        )
        # Lone surrogates (regardless of unicode width)
        self.assertEquals(
            codecs.backslashreplace_errors(
                UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
            ("\\ud800", 1)
        )
        self.assertEquals(
            codecs.backslashreplace_errors(
                UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
            ("\\udfff", 1)
        )
    def test_badhandlerresults(self):
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,11 @@ What's New in Python 3.2 Alpha 3?
 Core and Builtins
 -----------------
 - Issue #9804: ascii() now always represents unicode surrogate pairs as
  a single ``\UXXXXXXXX``, regardless of whether the character is printable
  or not.  Also, the "backslashreplace" error handler now joins surrogate
  pairs into a single character on UCS-2 builds.
 - Issue #9757: memoryview objects get a release() method to release the
  underlying buffer (previously this was only done when deallocating the
  memoryview), and gain support for the context management protocol.
--- a/Python/codecs.c
+++ b/Python/codecs.c
@@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
 PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
 {
 #ifndef Py_UNICODE_WIDE
 #define IS_SURROGATE_PAIR(p, end) \
    (*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
     *(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
 #else
 #define IS_SURROGATE_PAIR(p, end) 0
 #endif
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        PyObject *restuple;
        PyObject *object;
@@ -702,6 +709,11 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
            else
 #endif
            if (*p >= 0x100) {
                if (IS_SURROGATE_PAIR(p, startp+end)) {
                    ressize += 1+1+8;
                    ++p;
                }
                else
                    ressize += 1+1+4;
            }
            else
@@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
            return NULL;
        for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
            p < startp+end; ++p) {
-            Py_UNICODE c = *p;
+            Py_UCS4 c = (Py_UCS4) *p;
            *outp++ = '\\';
-#ifdef Py_UNICODE_WIDE
+            if (IS_SURROGATE_PAIR(p, startp+end)) {
                c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
                ++p;
            }
            if (c >= 0x00010000) {
                *outp++ = 'U';
                *outp++ = hexdigits[(c>>28)&0xf];
@@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
                *outp++ = hexdigits[(c>>12)&0xf];
                *outp++ = hexdigits[(c>>8)&0xf];
            }
-            else
+            else if (c >= 0x100) {
 #endif
            if (c >= 0x100) {
                *outp++ = 'u';
                *outp++ = hexdigits[(c>>12)&0xf];
                *outp++ = hexdigits[(c>>8)&0xf];
@@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
        wrong_exception_type(exc);
        return NULL;
    }
 #undef IS_SURROGATE_PAIR
 }
 /* This handler is declared static until someone demonstrates