Change PyUnicode_EncodeUnicodeEscape() to return a bytes object.

However, PyUnicode_AsUnicodeEscapeString() (which is used by Objects/fileobject.c::file_repr()) still returns a str8 object. Give unicode_repr() its own implementation, which returns a str8 object, because once repr() is required to return unicode objects it will need its own implementation anyway. (It formerly just called unicodeescape_string(), which was also used to implement PyUnicode_EncodeUnicodeEscape().)
2025-10-22 14:42:22 +00:00 · 2007-05-12 11:08:06 +00:00 · 2007-05-12 11:08:06 +00:00 · 79e913eac7
commit 79e913eac7
parent 1324c6f5e7
1 changed files with 179 additions and 53 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -2094,16 +2094,14 @@ Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
    return NULL;
 }
-static
+static const char *hexdigits = "0123456789abcdef";
-PyObject *unicodeescape_string(const Py_UNICODE *s,
+
-                               Py_ssize_t size,
+PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
-                               int quotes)
+					Py_ssize_t size)
 {
    PyObject *repr;
    char *p;
    static const char *hexdigit = "0123456789abcdef";
    /* XXX(nnorwitz): rather than over-allocating, it would be
       better to choose a different scheme.  Perhaps scan the
       first N-chars of the string and allocate based on that size.
@ -2122,8 +2120,7 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
       escape.
    */
-    repr = PyString_FromStringAndSize(NULL,
+    repr = PyBytes_FromStringAndSize(NULL,
        2
 #ifdef Py_UNICODE_WIDE
        + 10*size
 #else
@ -2133,18 +2130,13 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
    if (repr == NULL)
        return NULL;
-    p = PyString_AS_STRING(repr);
+    p = PyBytes_AS_STRING(repr);
    if (quotes) {
        *p++ = (findchar(s, size, '\'') &&
                !findchar(s, size, '"')) ? '"' : '\'';
    }
    while (size-- > 0) {
        Py_UNICODE ch = *s++;
-        /* Escape quotes and backslashes */
+        /* Escape backslashes */
-        if ((quotes &&
+        if (ch == '\\') {
 	     ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || ch == '\\') {
            *p++ = '\\';
            *p++ = (char) ch;
            continue;
@ -2155,14 +2147,14 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
        else if (ch >= 0x10000) {
            *p++ = '\\';
            *p++ = 'U';
-            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
+            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
+            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
+            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
+            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
+            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
+            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
-            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
+            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
-            *p++ = hexdigit[ch & 0x0000000F];
+            *p++ = hexdigits[ch & 0x0000000F];
 	    continue;
        }
 #else
@ -2177,14 +2169,14 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
 		*p++ = '\\';
 		*p++ = 'U';
-		*p++ = hexdigit[(ucs >> 28) & 0x0000000F];
+		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
-		*p++ = hexdigit[(ucs >> 24) & 0x0000000F];
+		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
-		*p++ = hexdigit[(ucs >> 20) & 0x0000000F];
+		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
-		*p++ = hexdigit[(ucs >> 16) & 0x0000000F];
+		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
-		*p++ = hexdigit[(ucs >> 12) & 0x0000000F];
+		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
-		*p++ = hexdigit[(ucs >> 8) & 0x0000000F];
+		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
-		*p++ = hexdigit[(ucs >> 4) & 0x0000000F];
+		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
-		*p++ = hexdigit[ucs & 0x0000000F];
+		*p++ = hexdigits[ucs & 0x0000000F];
 		continue;
 	    }
 	    /* Fall through: isolated surrogates are copied as-is */
@ -2197,10 +2189,10 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
        if (ch >= 256) {
            *p++ = '\\';
            *p++ = 'u';
-            *p++ = hexdigit[(ch >> 12) & 0x000F];
+            *p++ = hexdigits[(ch >> 12) & 0x000F];
-            *p++ = hexdigit[(ch >> 8) & 0x000F];
+            *p++ = hexdigits[(ch >> 8) & 0x000F];
-            *p++ = hexdigit[(ch >> 4) & 0x000F];
+            *p++ = hexdigits[(ch >> 4) & 0x000F];
-            *p++ = hexdigit[ch & 0x000F];
+            *p++ = hexdigits[ch & 0x000F];
        }
        /* Map special whitespace to '\t', \n', '\r' */
@ -2221,36 +2213,39 @@ PyObject *unicodeescape_string(const Py_UNICODE *s,
        else if (ch < ' ' || ch >= 0x7F) {
            *p++ = '\\';
            *p++ = 'x';
-            *p++ = hexdigit[(ch >> 4) & 0x000F];
+            *p++ = hexdigits[(ch >> 4) & 0x000F];
-            *p++ = hexdigit[ch & 0x000F];
+            *p++ = hexdigits[ch & 0x000F];
        }
        /* Copy everything else as-is */
        else
            *p++ = (char) ch;
    }
    if (quotes)
        *p++ = PyString_AS_STRING(repr)[0];
    *p = '\0';
-    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
+    if (PyBytes_Resize(repr, p - PyBytes_AS_STRING(repr))) {
        Py_DECREF(repr);
        return NULL;
    }
    return repr;
 }
 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
 					Py_ssize_t size)
 {
    return unicodeescape_string(s, size, 0);
 }
 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
 {
    PyObject *s, *result;
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
-    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
+    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
                                      PyUnicode_GET_SIZE(unicode));
    if (!s)
        return NULL;
    result = PyString_FromStringAndSize(PyBytes_AS_STRING(s),
                                        PyBytes_GET_SIZE(s));
    Py_DECREF(s);
    return result;
 }
 /* --- Raw Unicode Escape Codec ------------------------------------------- */
@ -6521,9 +6516,140 @@ unicode_replace(PyUnicodeObject *self, PyObject *args)
 static
 PyObject *unicode_repr(PyObject *unicode)
 {
-    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
+    PyObject *repr;
-				PyUnicode_GET_SIZE(unicode),
+    char *p;
-				1);
+    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
    /* XXX(nnorwitz): rather than over-allocating, it would be
       better to choose a different scheme.  Perhaps scan the
       first N-chars of the string and allocate based on that size.
    */
    /* Initial allocation is based on the longest-possible unichr
       escape.
       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
       unichr, so in this case it's the longest unichr escape. In
       narrow (UTF-16) builds this is five chars per source unichr
       since there are two unichrs in the surrogate pair, so in narrow
       (UTF-16) builds it's not the longest unichr escape.
       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
       so in the narrow (UTF-16) build case it's the longest unichr
       escape.
    */
    repr = PyString_FromStringAndSize(NULL,
        2 /* quotes */
 #ifdef Py_UNICODE_WIDE
        + 10*size
 #else
        + 6*size
 #endif
        + 1);
    if (repr == NULL)
        return NULL;
    p = PyString_AS_STRING(repr);
    /* Add quote */
    *p++ = (findchar(s, size, '\'') &&
            !findchar(s, size, '"')) ? '"' : '\'';
    while (size-- > 0) {
        Py_UNICODE ch = *s++;
        /* Escape quotes and backslashes */
        if ((ch == (Py_UNICODE) PyString_AS_STRING(repr)[0]) || (ch == '\\')) {
            *p++ = '\\';
            *p++ = (char) ch;
            continue;
        }
 #ifdef Py_UNICODE_WIDE
        /* Map 21-bit characters to '\U00xxxxxx' */
        else if (ch >= 0x10000) {
            *p++ = '\\';
            *p++ = 'U';
            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
            *p++ = hexdigits[ch & 0x0000000F];
 	    continue;
        }
 #else
 	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
 	else if (ch >= 0xD800 && ch < 0xDC00) {
 	    Py_UNICODE ch2;
 	    Py_UCS4 ucs;
 	    ch2 = *s++;
 	    size--;
 	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
 		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
 		*p++ = '\\';
 		*p++ = 'U';
 		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
 		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
 		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
 		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
 		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
 		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
 		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
 		*p++ = hexdigits[ucs & 0x0000000F];
 		continue;
 	    }
 	    /* Fall through: isolated surrogates are copied as-is */
 	    s--;
 	    size++;
 	}
 #endif
        /* Map 16-bit characters to '\uxxxx' */
        if (ch >= 256) {
            *p++ = '\\';
            *p++ = 'u';
            *p++ = hexdigits[(ch >> 12) & 0x000F];
            *p++ = hexdigits[(ch >> 8) & 0x000F];
            *p++ = hexdigits[(ch >> 4) & 0x000F];
            *p++ = hexdigits[ch & 0x000F];
        }
        /* Map special whitespace to '\t', \n', '\r' */
        else if (ch == '\t') {
            *p++ = '\\';
            *p++ = 't';
        }
        else if (ch == '\n') {
            *p++ = '\\';
            *p++ = 'n';
        }
        else if (ch == '\r') {
            *p++ = '\\';
            *p++ = 'r';
        }
        /* Map non-printable US ASCII to '\xhh' */
        else if (ch < ' ' || ch >= 0x7F) {
            *p++ = '\\';
            *p++ = 'x';
            *p++ = hexdigits[(ch >> 4) & 0x000F];
            *p++ = hexdigits[ch & 0x000F];
        }
        /* Copy everything else as-is */
        else
            *p++ = (char) ch;
    }
    /* Add quote */
    *p++ = PyString_AS_STRING(repr)[0];
    *p = '\0';
    _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
    return repr;
 }
 PyDoc_STRVAR(rfind__doc__,