#2630: Implement PEP 3138.

The repr() of a string now contains printable Unicode characters unescaped. The new ascii() builtin can be used to get a repr() with only ASCII characters in it. PEP and patch were written by Atsuo Ishimoto.
2025-08-03 16:39:00 +00:00 · 2008-06-11 18:37:52 +00:00 · 2008-06-11 18:37:52 +00:00 · 559e5d7f4d
commit 559e5d7f4d
parent ea6d58d9d3
25 changed files with 1271 additions and 974 deletions
--- a/Objects/object.c
+++ b/Objects/object.c
@ -425,6 +425,33 @@ PyObject_Str(PyObject *v)
 	return res;
 }

+PyObject *
+PyObject_ASCII(PyObject *v)
+{
+	PyObject *repr, *ascii, *res;
+	
+	repr = PyObject_Repr(v);
+	if (repr == NULL)
+		return NULL;
+
+	/* repr is guaranteed to be a PyUnicode object by PyObject_Repr */
+	ascii = PyUnicode_EncodeASCII(
+		PyUnicode_AS_UNICODE(repr),
+		PyUnicode_GET_SIZE(repr),
+		"backslashreplace");
+
+	Py_DECREF(repr);
+	if (ascii == NULL) 
+		return NULL;
+
+	res = PyUnicode_DecodeASCII(
+		PyBytes_AS_STRING(ascii),
+		PyBytes_GET_SIZE(ascii),
+		NULL);
+
+	Py_DECREF(ascii);
+	return res;
+}

 /* The new comparison philosophy is: we completely separate three-way
   comparison from rich comparison.  That is, PyObject_Compare() and
--- a/Objects/stringlib/string_format.h
+++ b/Objects/stringlib/string_format.h
@ -766,6 +766,10 @@ do_conversion(PyObject *obj, STRINGLIB_CHAR conversion)
        return PyObject_Repr(obj);
    case 's':
        return STRINGLIB_TOSTR(obj);
+#if PY_VERSION_HEX >= 0x03000000
+    case 'a':
+        return STRINGLIB_TOASCII(obj);
+#endif
    default:
 	if (conversion > 32 && conversion < 127) {
 		/* It's the ASCII subrange; casting to char is safe
--- a/Objects/stringlib/stringdefs.h
+++ b/Objects/stringlib/stringdefs.h
@ -24,5 +24,5 @@
 #define STRINGLIB_CMP            memcmp
 #define STRINGLIB_TOSTR          PyObject_Str
 #define STRINGLIB_GROUPING       _PyBytes_InsertThousandsGrouping
-
+#define STRINGLIB_TOASCII        PyObject_Repr
 #endif /* !STRINGLIB_STRINGDEFS_H */
--- a/Objects/stringlib/unicodedefs.h
+++ b/Objects/stringlib/unicodedefs.h
@ -25,8 +25,10 @@

 #if PY_VERSION_HEX < 0x03000000
 #define STRINGLIB_TOSTR          PyObject_Unicode
+#define STRINGLIB_TOASCII        PyObject_Repr
 #else
 #define STRINGLIB_TOSTR          PyObject_Str
+#define STRINGLIB_TOASCII        PyObject_ASCII
 #endif

 #define STRINGLIB_WANT_CONTAINS_OBJ 1
--- a/Objects/unicodectype.c
+++ b/Objects/unicodectype.c
@ -21,6 +21,7 @@
 #define UPPER_MASK 0x80
 #define XID_START_MASK 0x100
 #define XID_CONTINUE_MASK 0x200
+#define NONPRINTABLE_MASK 0x400

 typedef struct {
    const Py_UNICODE upper;
@ -675,6 +676,26 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
    return _PyUnicode_ToNumeric(ch) != -1.0;
 }

+/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
+   0 otherwise.
+   All characters except those characters defined in the Unicode character
+   database as following categories are considered printable.
+      * Cc (Other, Control)
+      * Cf (Other, Format)
+      * Cs (Other, Surrogate)
+      * Co (Other, Private Use)
+      * Cn (Other, Not Assigned)
+      * Zl Separator, Line ('\u2028', LINE SEPARATOR)
+      * Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
+      * Zs (Separator, Space) other than ASCII space('\x20').
+*/
+int _PyUnicode_IsPrintable(Py_UNICODE ch)
+{
+    const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
+
+    return (ctype->flags & NONPRINTABLE_MASK) == 0;
+}
+
 #ifndef WANT_WCTYPE_FUNCTIONS

 /* Returns 1 for Unicode characters having the bidirectional type
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -645,11 +645,12 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
 	count = vargs;
 #endif
 #endif
-	/* step 1: count the number of %S/%R format specifications
-	 * (we call PyObject_Str()/PyObject_Repr() for these objects
-	 * once during step 3 and put the result in an array) */
+	/* step 1: count the number of %S/%R/%A format specifications
+	 * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for 
+	 * these objects once during step 3 and put the result in 
+	   an array) */
 	for (f = format; *f; f++) {
-		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
+		if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
 			++callcount;
 	}
 	/* step 2: allocate memory for the results of
@ -778,6 +779,19 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
 				*callresult++ = repr;
 				break;
 			}
+			case 'A':
+			{
+				PyObject *obj = va_arg(count, PyObject *);
+				PyObject *ascii;
+				assert(obj);
+				ascii = PyObject_ASCII(obj);
+				if (!ascii)
+					goto fail;
+				n += PyUnicode_GET_SIZE(ascii);
+				/* Remember the repr and switch to the next slot */
+				*callresult++ = ascii;
+				break;
+			}
 			case 'p':
 				(void) va_arg(count, int);
 				/* maximum 64-bit pointer representation:
@ -7231,6 +7245,32 @@ unicode_isidentifier(PyObject *self)
    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
 }

+PyDoc_STRVAR(isprintable__doc__,
+"S.isprintable() -> bool\n\
+\n\
+Return True if all characters in S are considered\n\
+printable in repr() or S is empty, False otherwise.");
+
+static PyObject*
+unicode_isprintable(PyObject *self)
+{
+    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
+    register const Py_UNICODE *e;
+
+    /* Shortcut for single character strings */
+    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
+        Py_RETURN_TRUE;
+    }
+
+    e = p + PyUnicode_GET_SIZE(self);
+    for (; p < e; p++) {
+        if (!Py_UNICODE_ISPRINTABLE(*p)) {
+            Py_RETURN_FALSE;
+        }
+    }
+    Py_RETURN_TRUE;
+}
+
 PyDoc_STRVAR(join__doc__,
 "S.join(sequence) -> str\n\
 \n\
@ -7608,61 +7648,8 @@ PyObject *unicode_repr(PyObject *unicode)
            continue;
        }

-#ifdef Py_UNICODE_WIDE
-        /* Map 21-bit characters to '\U00xxxxxx' */
-        else if (ch >= 0x10000) {
-            *p++ = '\\';
-            *p++ = 'U';
-            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
-            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
-            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
-            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
-            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
-            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
-            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
-            *p++ = hexdigits[ch & 0x0000000F];
-	    continue;
-        }
-#else
-	/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
-	else if (ch >= 0xD800 && ch < 0xDC00) {
-	    Py_UNICODE ch2;
-	    Py_UCS4 ucs;
-
-	    ch2 = *s++;
-	    size--;
-	    if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
-		ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
-		*p++ = '\\';
-		*p++ = 'U';
-		*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
-		*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
-		*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
-		*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
-		*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
-		*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
-		*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
-		*p++ = hexdigits[ucs & 0x0000000F];
-		continue;
-	    }
-	    /* Fall through: isolated surrogates are copied as-is */
-	    s--;
-	    size++;
-	}
-#endif
-
-        /* Map 16-bit characters to '\uxxxx' */
-        if (ch >= 256) {
-            *p++ = '\\';
-            *p++ = 'u';
-            *p++ = hexdigits[(ch >> 12) & 0x000F];
-            *p++ = hexdigits[(ch >> 8) & 0x000F];
-            *p++ = hexdigits[(ch >> 4) & 0x000F];
-            *p++ = hexdigits[ch & 0x000F];
-        }
-
-        /* Map special whitespace to '\t', \n', '\r' */
-        else if (ch == '\t') {
+	/* Map special whitespace to '\t', \n', '\r' */
+        if (ch == '\t') {
            *p++ = '\\';
            *p++ = 't';
        }
@ -7676,16 +7663,79 @@ PyObject *unicode_repr(PyObject *unicode)
        }

        /* Map non-printable US ASCII to '\xhh' */
-        else if (ch < ' ' || ch >= 0x7F) {
+        else if (ch < ' ' || ch == 0x7F) {
            *p++ = '\\';
            *p++ = 'x';
            *p++ = hexdigits[(ch >> 4) & 0x000F];
            *p++ = hexdigits[ch & 0x000F];
        }

-        /* Copy everything else as-is */
-        else
-            *p++ = (char) ch;
+        /* Copy ASCII characters as-is */
+        else if (ch < 0x7F) {
+            *p++ = ch;
+        }
+
+	/* Non-ASCII characters */
+        else {
+            Py_UCS4 ucs = ch;
+
+#ifndef Py_UNICODE_WIDE
+            Py_UNICODE ch2 = 0;
+            /* Get code point from surrogate pair */
+            if (size > 0) {
+                ch2 = *s;
+                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
+                            && ch2 <= 0xDFFF) {
+                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) 
+                            + 0x00010000;
+                    s++; 
+                    size--;
+                }
+            }
+#endif
+            /* Map Unicode whitespace and control characters 
+               (categories Z* and C* except ASCII space)
+            */
+            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
+                /* Map 8-bit characters to '\xhh' */
+                if (ucs <= 0xff) {
+                    *p++ = '\\';
+                    *p++ = 'x';
+                    *p++ = hexdigits[(ch >> 4) & 0x000F];
+                    *p++ = hexdigits[ch & 0x000F];
+                }
+                /* Map 21-bit characters to '\U00xxxxxx' */
+                else if (ucs >= 0x10000) {
+                    *p++ = '\\';
+                    *p++ = 'U';
+                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
+                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
+                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
+                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
+                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
+                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
+                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
+                    *p++ = hexdigits[ucs & 0x0000000F];
+                }
+                /* Map 16-bit characters to '\uxxxx' */
+                else {
+                    *p++ = '\\';
+                    *p++ = 'u';
+                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
+                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
+                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
+                    *p++ = hexdigits[ucs & 0x000F];
+                }
+            }
+            /* Copy characters as-is */
+            else {
+                *p++ = ch;
+#ifndef Py_UNICODE_WIDE
+                if (ucs >= 0x10000)
+                    *p++ = ch2;
+#endif
+            }
+        }
    }
    /* Add quote */
    *p++ = PyUnicode_AS_UNICODE(repr)[0];
@ -8372,6 +8422,7 @@ static PyMethodDef unicode_methods[] = {
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
+    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
@ -8958,6 +9009,7 @@ PyObject *PyUnicode_Format(PyObject *format,

 	    case 's':
 	    case 'r':
+	    case 'a':
 		if (PyUnicode_Check(v) && c == 's') {
 		    temp = v;
 		    Py_INCREF(temp);
@ -8965,8 +9017,10 @@ PyObject *PyUnicode_Format(PyObject *format,
 		else {
 		    if (c == 's')
 			temp = PyObject_Str(v);
-		    else
+		    else if (c == 'r')
 			temp = PyObject_Repr(v);
+		    else
+			temp = PyObject_ASCII(v);
 		    if (temp == NULL)
 			goto onError;
                    if (PyUnicode_Check(temp))
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h