mirror of
https://github.com/python/cpython.git
synced 2025-08-03 16:39:00 +00:00
#2630: Implement PEP 3138.
The repr() of a string now contains printable Unicode characters unescaped. The new ascii() builtin can be used to get a repr() with only ASCII characters in it. PEP and patch were written by Atsuo Ishimoto.
This commit is contained in:
parent
ea6d58d9d3
commit
559e5d7f4d
25 changed files with 1271 additions and 974 deletions
|
@ -425,6 +425,33 @@ PyObject_Str(PyObject *v)
|
|||
return res;
|
||||
}
|
||||
|
||||
/* Build an ASCII-only representation of v.

   The repr() of v is encoded to ASCII using the "backslashreplace" error
   mode and the resulting bytes are decoded back into a unicode object.
   Returns NULL if PyObject_Repr() or either conversion step returns NULL. */
PyObject *
PyObject_ASCII(PyObject *v)
{
    PyObject *text;
    PyObject *encoded;
    PyObject *result;

    text = PyObject_Repr(v);
    if (!text)
        return NULL;

    /* PyObject_Repr guarantees that text is a PyUnicode object */
    encoded = PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(text),
                                    PyUnicode_GET_SIZE(text),
                                    "backslashreplace");
    /* The repr is no longer needed whether or not encoding succeeded */
    Py_DECREF(text);
    if (!encoded)
        return NULL;

    result = PyUnicode_DecodeASCII(PyBytes_AS_STRING(encoded),
                                   PyBytes_GET_SIZE(encoded),
                                   NULL);
    Py_DECREF(encoded);

    /* result may be NULL here; it is handed to the caller unchanged */
    return result;
}
|
||||
|
||||
/* The new comparison philosophy is: we completely separate three-way
|
||||
comparison from rich comparison. That is, PyObject_Compare() and
|
||||
|
|
|
@ -766,6 +766,10 @@ do_conversion(PyObject *obj, STRINGLIB_CHAR conversion)
|
|||
return PyObject_Repr(obj);
|
||||
case 's':
|
||||
return STRINGLIB_TOSTR(obj);
|
||||
#if PY_VERSION_HEX >= 0x03000000
|
||||
case 'a':
|
||||
return STRINGLIB_TOASCII(obj);
|
||||
#endif
|
||||
default:
|
||||
if (conversion > 32 && conversion < 127) {
|
||||
/* It's the ASCII subrange; casting to char is safe
|
||||
|
|
|
@ -24,5 +24,5 @@
|
|||
#define STRINGLIB_CMP memcmp
|
||||
#define STRINGLIB_TOSTR PyObject_Str
|
||||
#define STRINGLIB_GROUPING _PyBytes_InsertThousandsGrouping
|
||||
|
||||
#define STRINGLIB_TOASCII PyObject_Repr
|
||||
#endif /* !STRINGLIB_STRINGDEFS_H */
|
||||
|
|
|
@ -25,8 +25,10 @@
|
|||
|
||||
#if PY_VERSION_HEX < 0x03000000
|
||||
#define STRINGLIB_TOSTR PyObject_Unicode
|
||||
#define STRINGLIB_TOASCII PyObject_Repr
|
||||
#else
|
||||
#define STRINGLIB_TOSTR PyObject_Str
|
||||
#define STRINGLIB_TOASCII PyObject_ASCII
|
||||
#endif
|
||||
|
||||
#define STRINGLIB_WANT_CONTAINS_OBJ 1
|
||||
|
|
|
@ -21,6 +21,7 @@
|
|||
#define UPPER_MASK 0x80
|
||||
#define XID_START_MASK 0x100
|
||||
#define XID_CONTINUE_MASK 0x200
|
||||
#define NONPRINTABLE_MASK 0x400
|
||||
|
||||
typedef struct {
|
||||
const Py_UNICODE upper;
|
||||
|
@ -675,6 +676,26 @@ int _PyUnicode_IsNumeric(Py_UNICODE ch)
|
|||
return _PyUnicode_ToNumeric(ch) != -1.0;
|
||||
}
|
||||
|
||||
/* Returns 1 for Unicode characters to be hex-escaped when repr()ed,
|
||||
0 otherwise.
|
||||
All characters except those characters defined in the Unicode character
|
||||
database as following categories are considered printable.
|
||||
* Cc (Other, Control)
|
||||
* Cf (Other, Format)
|
||||
* Cs (Other, Surrogate)
|
||||
* Co (Other, Private Use)
|
||||
* Cn (Other, Not Assigned)
|
||||
* Zl Separator, Line ('\u2028', LINE SEPARATOR)
|
||||
* Zp Separator, Paragraph ('\u2029', PARAGRAPH SEPARATOR)
|
||||
* Zs (Separator, Space) other than ASCII space('\x20').
|
||||
*/
|
||||
int _PyUnicode_IsPrintable(Py_UNICODE ch)
|
||||
{
|
||||
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
|
||||
|
||||
return (ctype->flags & NONPRINTABLE_MASK) == 0;
|
||||
}
|
||||
|
||||
#ifndef WANT_WCTYPE_FUNCTIONS
|
||||
|
||||
/* Returns 1 for Unicode characters having the bidirectional type
|
||||
|
|
|
@ -645,11 +645,12 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
|
|||
count = vargs;
|
||||
#endif
|
||||
#endif
|
||||
/* step 1: count the number of %S/%R format specifications
|
||||
* (we call PyObject_Str()/PyObject_Repr() for these objects
|
||||
* once during step 3 and put the result in an array) */
|
||||
/* step 1: count the number of %S/%R/%A format specifications
|
||||
* (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII() for
|
||||
* these objects once during step 3 and put the result in
|
||||
an array) */
|
||||
for (f = format; *f; f++) {
|
||||
if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R'))
|
||||
if (*f == '%' && (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A'))
|
||||
++callcount;
|
||||
}
|
||||
/* step 2: allocate memory for the results of
|
||||
|
@ -778,6 +779,19 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
|
|||
*callresult++ = repr;
|
||||
break;
|
||||
}
|
||||
case 'A':
|
||||
{
|
||||
PyObject *obj = va_arg(count, PyObject *);
|
||||
PyObject *ascii;
|
||||
assert(obj);
|
||||
ascii = PyObject_ASCII(obj);
|
||||
if (!ascii)
|
||||
goto fail;
|
||||
n += PyUnicode_GET_SIZE(ascii);
|
||||
/* Remember the repr and switch to the next slot */
|
||||
*callresult++ = ascii;
|
||||
break;
|
||||
}
|
||||
case 'p':
|
||||
(void) va_arg(count, int);
|
||||
/* maximum 64-bit pointer representation:
|
||||
|
@ -7231,6 +7245,32 @@ unicode_isidentifier(PyObject *self)
|
|||
return PyBool_FromLong(PyUnicode_IsIdentifier(self));
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(isprintable__doc__,
|
||||
"S.isprintable() -> bool\n\
|
||||
\n\
|
||||
Return True if all characters in S are considered\n\
|
||||
printable in repr() or S is empty, False otherwise.");
|
||||
|
||||
static PyObject*
|
||||
unicode_isprintable(PyObject *self)
|
||||
{
|
||||
register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
|
||||
register const Py_UNICODE *e;
|
||||
|
||||
/* Shortcut for single character strings */
|
||||
if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
|
||||
Py_RETURN_TRUE;
|
||||
}
|
||||
|
||||
e = p + PyUnicode_GET_SIZE(self);
|
||||
for (; p < e; p++) {
|
||||
if (!Py_UNICODE_ISPRINTABLE(*p)) {
|
||||
Py_RETURN_FALSE;
|
||||
}
|
||||
}
|
||||
Py_RETURN_TRUE;
|
||||
}
|
||||
|
||||
PyDoc_STRVAR(join__doc__,
|
||||
"S.join(sequence) -> str\n\
|
||||
\n\
|
||||
|
@ -7608,61 +7648,8 @@ PyObject *unicode_repr(PyObject *unicode)
|
|||
continue;
|
||||
}
|
||||
|
||||
#ifdef Py_UNICODE_WIDE
|
||||
/* Map 21-bit characters to '\U00xxxxxx' */
|
||||
else if (ch >= 0x10000) {
|
||||
*p++ = '\\';
|
||||
*p++ = 'U';
|
||||
*p++ = hexdigits[(ch >> 28) & 0x0000000F];
|
||||
*p++ = hexdigits[(ch >> 24) & 0x0000000F];
|
||||
*p++ = hexdigits[(ch >> 20) & 0x0000000F];
|
||||
*p++ = hexdigits[(ch >> 16) & 0x0000000F];
|
||||
*p++ = hexdigits[(ch >> 12) & 0x0000000F];
|
||||
*p++ = hexdigits[(ch >> 8) & 0x0000000F];
|
||||
*p++ = hexdigits[(ch >> 4) & 0x0000000F];
|
||||
*p++ = hexdigits[ch & 0x0000000F];
|
||||
continue;
|
||||
}
|
||||
#else
|
||||
/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
|
||||
else if (ch >= 0xD800 && ch < 0xDC00) {
|
||||
Py_UNICODE ch2;
|
||||
Py_UCS4 ucs;
|
||||
|
||||
ch2 = *s++;
|
||||
size--;
|
||||
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
|
||||
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
|
||||
*p++ = '\\';
|
||||
*p++ = 'U';
|
||||
*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
|
||||
*p++ = hexdigits[ucs & 0x0000000F];
|
||||
continue;
|
||||
}
|
||||
/* Fall through: isolated surrogates are copied as-is */
|
||||
s--;
|
||||
size++;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Map 16-bit characters to '\uxxxx' */
|
||||
if (ch >= 256) {
|
||||
*p++ = '\\';
|
||||
*p++ = 'u';
|
||||
*p++ = hexdigits[(ch >> 12) & 0x000F];
|
||||
*p++ = hexdigits[(ch >> 8) & 0x000F];
|
||||
*p++ = hexdigits[(ch >> 4) & 0x000F];
|
||||
*p++ = hexdigits[ch & 0x000F];
|
||||
}
|
||||
|
||||
/* Map special whitespace to '\t', '\n', '\r' */
|
||||
else if (ch == '\t') {
|
||||
/* Map special whitespace to '\t', '\n', '\r' */
|
||||
if (ch == '\t') {
|
||||
*p++ = '\\';
|
||||
*p++ = 't';
|
||||
}
|
||||
|
@ -7676,16 +7663,79 @@ PyObject *unicode_repr(PyObject *unicode)
|
|||
}
|
||||
|
||||
/* Map non-printable US ASCII to '\xhh' */
|
||||
else if (ch < ' ' || ch >= 0x7F) {
|
||||
else if (ch < ' ' || ch == 0x7F) {
|
||||
*p++ = '\\';
|
||||
*p++ = 'x';
|
||||
*p++ = hexdigits[(ch >> 4) & 0x000F];
|
||||
*p++ = hexdigits[ch & 0x000F];
|
||||
}
|
||||
|
||||
/* Copy everything else as-is */
|
||||
else
|
||||
*p++ = (char) ch;
|
||||
/* Copy ASCII characters as-is */
|
||||
else if (ch < 0x7F) {
|
||||
*p++ = ch;
|
||||
}
|
||||
|
||||
/* Non-ASCII characters */
|
||||
else {
|
||||
Py_UCS4 ucs = ch;
|
||||
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
Py_UNICODE ch2 = 0;
|
||||
/* Get code point from surrogate pair */
|
||||
if (size > 0) {
|
||||
ch2 = *s;
|
||||
if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
|
||||
&& ch2 <= 0xDFFF) {
|
||||
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
|
||||
+ 0x00010000;
|
||||
s++;
|
||||
size--;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
/* Map Unicode whitespace and control characters
|
||||
(categories Z* and C* except ASCII space)
|
||||
*/
|
||||
if (!Py_UNICODE_ISPRINTABLE(ucs)) {
|
||||
/* Map 8-bit characters to '\xhh' */
|
||||
if (ucs <= 0xff) {
|
||||
*p++ = '\\';
|
||||
*p++ = 'x';
|
||||
*p++ = hexdigits[(ch >> 4) & 0x000F];
|
||||
*p++ = hexdigits[ch & 0x000F];
|
||||
}
|
||||
/* Map 21-bit characters to '\U00xxxxxx' */
|
||||
else if (ucs >= 0x10000) {
|
||||
*p++ = '\\';
|
||||
*p++ = 'U';
|
||||
*p++ = hexdigits[(ucs >> 28) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 24) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 20) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 16) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 12) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 8) & 0x0000000F];
|
||||
*p++ = hexdigits[(ucs >> 4) & 0x0000000F];
|
||||
*p++ = hexdigits[ucs & 0x0000000F];
|
||||
}
|
||||
/* Map 16-bit characters to '\uxxxx' */
|
||||
else {
|
||||
*p++ = '\\';
|
||||
*p++ = 'u';
|
||||
*p++ = hexdigits[(ucs >> 12) & 0x000F];
|
||||
*p++ = hexdigits[(ucs >> 8) & 0x000F];
|
||||
*p++ = hexdigits[(ucs >> 4) & 0x000F];
|
||||
*p++ = hexdigits[ucs & 0x000F];
|
||||
}
|
||||
}
|
||||
/* Copy characters as-is */
|
||||
else {
|
||||
*p++ = ch;
|
||||
#ifndef Py_UNICODE_WIDE
|
||||
if (ucs >= 0x10000)
|
||||
*p++ = ch2;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Add quote */
|
||||
*p++ = PyUnicode_AS_UNICODE(repr)[0];
|
||||
|
@ -8372,6 +8422,7 @@ static PyMethodDef unicode_methods[] = {
|
|||
{"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
|
||||
{"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
|
||||
{"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
|
||||
{"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
|
||||
{"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
|
||||
{"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
|
||||
{"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
|
||||
|
@ -8958,6 +9009,7 @@ PyObject *PyUnicode_Format(PyObject *format,
|
|||
|
||||
case 's':
|
||||
case 'r':
|
||||
case 'a':
|
||||
if (PyUnicode_Check(v) && c == 's') {
|
||||
temp = v;
|
||||
Py_INCREF(temp);
|
||||
|
@ -8965,8 +9017,10 @@ PyObject *PyUnicode_Format(PyObject *format,
|
|||
else {
|
||||
if (c == 's')
|
||||
temp = PyObject_Str(v);
|
||||
else
|
||||
else if (c == 'r')
|
||||
temp = PyObject_Repr(v);
|
||||
else
|
||||
temp = PyObject_ASCII(v);
|
||||
if (temp == NULL)
|
||||
goto onError;
|
||||
if (PyUnicode_Check(temp))
|
||||
|
|
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue