mirror of
https://github.com/python/cpython.git
synced 2025-08-31 14:07:50 +00:00
Issue #25353: Optimize unicode escape and raw unicode escape encoders to use
the new _PyBytesWriter API.
This commit is contained in:
parent
d65e4f4eea
commit
358af13526
2 changed files with 93 additions and 64 deletions
|
@ -2110,38 +2110,35 @@ save_bytes(PicklerObject *self, PyObject *obj)
|
||||||
static PyObject *
|
static PyObject *
|
||||||
raw_unicode_escape(PyObject *obj)
|
raw_unicode_escape(PyObject *obj)
|
||||||
{
|
{
|
||||||
PyObject *repr;
|
|
||||||
char *p;
|
char *p;
|
||||||
Py_ssize_t i, size;
|
Py_ssize_t i, size;
|
||||||
size_t expandsize;
|
|
||||||
void *data;
|
void *data;
|
||||||
unsigned int kind;
|
unsigned int kind;
|
||||||
|
_PyBytesWriter writer;
|
||||||
|
|
||||||
if (PyUnicode_READY(obj))
|
if (PyUnicode_READY(obj))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
_PyBytesWriter_Init(&writer);
|
||||||
|
|
||||||
size = PyUnicode_GET_LENGTH(obj);
|
size = PyUnicode_GET_LENGTH(obj);
|
||||||
data = PyUnicode_DATA(obj);
|
data = PyUnicode_DATA(obj);
|
||||||
kind = PyUnicode_KIND(obj);
|
kind = PyUnicode_KIND(obj);
|
||||||
if (kind == PyUnicode_4BYTE_KIND)
|
|
||||||
expandsize = 10;
|
|
||||||
else
|
|
||||||
expandsize = 6;
|
|
||||||
|
|
||||||
if ((size_t)size > (size_t)PY_SSIZE_T_MAX / expandsize)
|
p = _PyBytesWriter_Alloc(&writer, size);
|
||||||
return PyErr_NoMemory();
|
if (p == NULL)
|
||||||
repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
|
goto error;
|
||||||
if (repr == NULL)
|
writer.overallocate = 1;
|
||||||
return NULL;
|
|
||||||
if (size == 0)
|
|
||||||
return repr;
|
|
||||||
assert(Py_REFCNT(repr) == 1);
|
|
||||||
|
|
||||||
p = PyBytes_AS_STRING(repr);
|
|
||||||
for (i=0; i < size; i++) {
|
for (i=0; i < size; i++) {
|
||||||
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||||
/* Map 32-bit characters to '\Uxxxxxxxx' */
|
/* Map 32-bit characters to '\Uxxxxxxxx' */
|
||||||
if (ch >= 0x10000) {
|
if (ch >= 0x10000) {
|
||||||
|
/* -1: subtract 1 preallocated byte */
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 10-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'U';
|
*p++ = 'U';
|
||||||
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
|
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
|
||||||
|
@ -2153,8 +2150,13 @@ raw_unicode_escape(PyObject *obj)
|
||||||
*p++ = Py_hexdigits[(ch >> 4) & 0xf];
|
*p++ = Py_hexdigits[(ch >> 4) & 0xf];
|
||||||
*p++ = Py_hexdigits[ch & 15];
|
*p++ = Py_hexdigits[ch & 15];
|
||||||
}
|
}
|
||||||
/* Map 16-bit characters to '\uxxxx' */
|
/* Map 16-bit characters, '\\' and '\n' to '\uxxxx' */
|
||||||
else if (ch >= 256 || ch == '\\' || ch == '\n') {
|
else if (ch >= 256 || ch == '\\' || ch == '\n') {
|
||||||
|
/* -1: subtract 1 preallocated byte */
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 6-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'u';
|
*p++ = 'u';
|
||||||
*p++ = Py_hexdigits[(ch >> 12) & 0xf];
|
*p++ = Py_hexdigits[(ch >> 12) & 0xf];
|
||||||
|
@ -2166,10 +2168,12 @@ raw_unicode_escape(PyObject *obj)
|
||||||
else
|
else
|
||||||
*p++ = (char) ch;
|
*p++ = (char) ch;
|
||||||
}
|
}
|
||||||
size = p - PyBytes_AS_STRING(repr);
|
|
||||||
if (_PyBytes_Resize(&repr, size) < 0)
|
return _PyBytesWriter_Finish(&writer, p);
|
||||||
|
|
||||||
|
error:
|
||||||
|
_PyBytesWriter_Dealloc(&writer);
|
||||||
return NULL;
|
return NULL;
|
||||||
return repr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
|
|
|
@ -6052,11 +6052,10 @@ PyObject *
|
||||||
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
|
PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
|
||||||
{
|
{
|
||||||
Py_ssize_t i, len;
|
Py_ssize_t i, len;
|
||||||
PyObject *repr;
|
|
||||||
char *p;
|
char *p;
|
||||||
int kind;
|
int kind;
|
||||||
void *data;
|
void *data;
|
||||||
Py_ssize_t expandsize = 0;
|
_PyBytesWriter writer;
|
||||||
|
|
||||||
/* Initial allocation is based on the longest-possible character
|
/* Initial allocation is based on the longest-possible character
|
||||||
escape.
|
escape.
|
||||||
|
@ -6072,35 +6071,28 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
|
||||||
}
|
}
|
||||||
if (PyUnicode_READY(unicode) == -1)
|
if (PyUnicode_READY(unicode) == -1)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
_PyBytesWriter_Init(&writer);
|
||||||
|
|
||||||
len = PyUnicode_GET_LENGTH(unicode);
|
len = PyUnicode_GET_LENGTH(unicode);
|
||||||
kind = PyUnicode_KIND(unicode);
|
kind = PyUnicode_KIND(unicode);
|
||||||
data = PyUnicode_DATA(unicode);
|
data = PyUnicode_DATA(unicode);
|
||||||
switch (kind) {
|
|
||||||
case PyUnicode_1BYTE_KIND: expandsize = 4; break;
|
|
||||||
case PyUnicode_2BYTE_KIND: expandsize = 6; break;
|
|
||||||
case PyUnicode_4BYTE_KIND: expandsize = 10; break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (len == 0)
|
p = _PyBytesWriter_Alloc(&writer, len);
|
||||||
return PyBytes_FromStringAndSize(NULL, 0);
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
|
writer.overallocate = 1;
|
||||||
return PyErr_NoMemory();
|
|
||||||
|
|
||||||
repr = PyBytes_FromStringAndSize(NULL,
|
|
||||||
2
|
|
||||||
+ expandsize*len
|
|
||||||
+ 1);
|
|
||||||
if (repr == NULL)
|
|
||||||
return NULL;
|
|
||||||
|
|
||||||
p = PyBytes_AS_STRING(repr);
|
|
||||||
|
|
||||||
for (i = 0; i < len; i++) {
|
for (i = 0; i < len; i++) {
|
||||||
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
Py_UCS4 ch = PyUnicode_READ(kind, data, i);
|
||||||
|
|
||||||
/* Escape backslashes */
|
/* Escape backslashes */
|
||||||
if (ch == '\\') {
|
if (ch == '\\') {
|
||||||
|
/* -1: subtract 1 preallocated byte */
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 2-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = (char) ch;
|
*p++ = (char) ch;
|
||||||
continue;
|
continue;
|
||||||
|
@ -6109,6 +6101,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
|
||||||
/* Map 21-bit characters to '\U00xxxxxx' */
|
/* Map 21-bit characters to '\U00xxxxxx' */
|
||||||
else if (ch >= 0x10000) {
|
else if (ch >= 0x10000) {
|
||||||
assert(ch <= MAX_UNICODE);
|
assert(ch <= MAX_UNICODE);
|
||||||
|
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 10-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'U';
|
*p++ = 'U';
|
||||||
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
|
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
|
||||||
|
@ -6124,6 +6121,10 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
|
||||||
|
|
||||||
/* Map 16-bit characters to '\uxxxx' */
|
/* Map 16-bit characters to '\uxxxx' */
|
||||||
if (ch >= 256) {
|
if (ch >= 256) {
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 6-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'u';
|
*p++ = 'u';
|
||||||
*p++ = Py_hexdigits[(ch >> 12) & 0x000F];
|
*p++ = Py_hexdigits[(ch >> 12) & 0x000F];
|
||||||
|
@ -6134,20 +6135,37 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
|
||||||
|
|
||||||
/* Map special whitespace to '\t', '\n', '\r' */
|
/* Map special whitespace to '\t', '\n', '\r' */
|
||||||
else if (ch == '\t') {
|
else if (ch == '\t') {
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 2-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 't';
|
*p++ = 't';
|
||||||
}
|
}
|
||||||
else if (ch == '\n') {
|
else if (ch == '\n') {
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 2-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'n';
|
*p++ = 'n';
|
||||||
}
|
}
|
||||||
else if (ch == '\r') {
|
else if (ch == '\r') {
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 2-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'r';
|
*p++ = 'r';
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Map non-printable US ASCII to '\xhh' */
|
/* Map non-printable US ASCII to '\xhh' */
|
||||||
else if (ch < ' ' || ch >= 0x7F) {
|
else if (ch < ' ' || ch >= 0x7F) {
|
||||||
|
/* -1: subtract 1 preallocated byte */
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 4-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'x';
|
*p++ = 'x';
|
||||||
*p++ = Py_hexdigits[(ch >> 4) & 0x000F];
|
*p++ = Py_hexdigits[(ch >> 4) & 0x000F];
|
||||||
|
@ -6159,10 +6177,11 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
|
||||||
*p++ = (char) ch;
|
*p++ = (char) ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(p - PyBytes_AS_STRING(repr) > 0);
|
return _PyBytesWriter_Finish(&writer, p);
|
||||||
if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
|
|
||||||
|
error:
|
||||||
|
_PyBytesWriter_Dealloc(&writer);
|
||||||
return NULL;
|
return NULL;
|
||||||
return repr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
|
@ -6291,13 +6310,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
|
||||||
PyObject *
|
PyObject *
|
||||||
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
|
PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
|
||||||
{
|
{
|
||||||
PyObject *repr;
|
|
||||||
char *p;
|
char *p;
|
||||||
char *q;
|
Py_ssize_t pos;
|
||||||
Py_ssize_t expandsize, pos;
|
|
||||||
int kind;
|
int kind;
|
||||||
void *data;
|
void *data;
|
||||||
Py_ssize_t len;
|
Py_ssize_t len;
|
||||||
|
_PyBytesWriter writer;
|
||||||
|
|
||||||
if (!PyUnicode_Check(unicode)) {
|
if (!PyUnicode_Check(unicode)) {
|
||||||
PyErr_BadArgument();
|
PyErr_BadArgument();
|
||||||
|
@ -6305,28 +6323,29 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
|
||||||
}
|
}
|
||||||
if (PyUnicode_READY(unicode) == -1)
|
if (PyUnicode_READY(unicode) == -1)
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
_PyBytesWriter_Init(&writer);
|
||||||
|
|
||||||
kind = PyUnicode_KIND(unicode);
|
kind = PyUnicode_KIND(unicode);
|
||||||
data = PyUnicode_DATA(unicode);
|
data = PyUnicode_DATA(unicode);
|
||||||
len = PyUnicode_GET_LENGTH(unicode);
|
len = PyUnicode_GET_LENGTH(unicode);
|
||||||
/* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
|
|
||||||
bytes, and 1 byte characters 4. */
|
|
||||||
expandsize = kind * 2 + 2;
|
|
||||||
|
|
||||||
if (len > PY_SSIZE_T_MAX / expandsize)
|
p = _PyBytesWriter_Alloc(&writer, len);
|
||||||
return PyErr_NoMemory();
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
writer.overallocate = 1;
|
||||||
|
|
||||||
repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
|
|
||||||
if (repr == NULL)
|
|
||||||
return NULL;
|
|
||||||
if (len == 0)
|
|
||||||
return repr;
|
|
||||||
|
|
||||||
p = q = PyBytes_AS_STRING(repr);
|
|
||||||
for (pos = 0; pos < len; pos++) {
|
for (pos = 0; pos < len; pos++) {
|
||||||
Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
|
Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
|
||||||
/* Map 32-bit characters to '\Uxxxxxxxx' */
|
/* Map 32-bit characters to '\Uxxxxxxxx' */
|
||||||
if (ch >= 0x10000) {
|
if (ch >= 0x10000) {
|
||||||
assert(ch <= MAX_UNICODE);
|
assert(ch <= MAX_UNICODE);
|
||||||
|
|
||||||
|
/* -1: subtract 1 preallocated byte */
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 10-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'U';
|
*p++ = 'U';
|
||||||
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
|
*p++ = Py_hexdigits[(ch >> 28) & 0xf];
|
||||||
|
@ -6340,6 +6359,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
|
||||||
}
|
}
|
||||||
/* Map 16-bit characters to '\uxxxx' */
|
/* Map 16-bit characters to '\uxxxx' */
|
||||||
else if (ch >= 256) {
|
else if (ch >= 256) {
|
||||||
|
/* -1: subtract 1 preallocated byte */
|
||||||
|
p = _PyBytesWriter_Prepare(&writer, p, 6-1);
|
||||||
|
if (p == NULL)
|
||||||
|
goto error;
|
||||||
|
|
||||||
*p++ = '\\';
|
*p++ = '\\';
|
||||||
*p++ = 'u';
|
*p++ = 'u';
|
||||||
*p++ = Py_hexdigits[(ch >> 12) & 0xf];
|
*p++ = Py_hexdigits[(ch >> 12) & 0xf];
|
||||||
|
@ -6352,10 +6376,11 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
|
||||||
*p++ = (char) ch;
|
*p++ = (char) ch;
|
||||||
}
|
}
|
||||||
|
|
||||||
assert(p > q);
|
return _PyBytesWriter_Finish(&writer, p);
|
||||||
if (_PyBytes_Resize(&repr, p - q) < 0)
|
|
||||||
|
error:
|
||||||
|
_PyBytesWriter_Dealloc(&writer);
|
||||||
return NULL;
|
return NULL;
|
||||||
return repr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
PyObject *
|
PyObject *
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue