Optimize built-in unicode codecs by avoiding unnecessary copying.

The approach used is similar to what is currently used in the version
of unicodeobject.c in Python 2.x. The only difference is that we use
_PyBytes_Resize instead of _PyString_Resize.
This commit is contained in:
Alexandre Vassalotti 2008-12-27 09:16:49 +00:00
parent 9cb6f7f7a5
commit 44531cb2db

View file

@ -1873,7 +1873,7 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
int encodeWhiteSpace, int encodeWhiteSpace,
const char *errors) const char *errors)
{ {
PyObject *v, *result; PyObject *v;
/* It might be possible to tighten this worst case */ /* It might be possible to tighten this worst case */
Py_ssize_t cbAllocated = 5 * size; Py_ssize_t cbAllocated = 5 * size;
int inShift = 0; int inShift = 0;
@ -1889,11 +1889,11 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
if (cbAllocated / 5 != size) if (cbAllocated / 5 != size)
return PyErr_NoMemory(); return PyErr_NoMemory();
v = PyByteArray_FromStringAndSize(NULL, cbAllocated); v = PyBytes_FromStringAndSize(NULL, cbAllocated);
if (v == NULL) if (v == NULL)
return NULL; return NULL;
start = out = PyByteArray_AS_STRING(v); start = out = PyBytes_AS_STRING(v);
for (;i < size; ++i) { for (;i < size; ++i) {
Py_UNICODE ch = s[i]; Py_UNICODE ch = s[i];
@ -1958,10 +1958,9 @@ PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
*out++= B64(charsleft << (6-bitsleft) ); *out++= B64(charsleft << (6-bitsleft) );
*out++ = '-'; *out++ = '-';
} }
if (_PyBytes_Resize(&v, out - start) < 0)
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), out - start); return NULL;
Py_DECREF(v); return v;
return result;
} }
#undef SPECIAL #undef SPECIAL
@ -2479,7 +2478,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
const char *errors, const char *errors,
int byteorder) int byteorder)
{ {
PyObject *v, *result; PyObject *v;
unsigned char *p; unsigned char *p;
Py_ssize_t nsize, bytesize; Py_ssize_t nsize, bytesize;
#ifndef Py_UNICODE_WIDE #ifndef Py_UNICODE_WIDE
@ -2515,11 +2514,11 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
bytesize = nsize * 4; bytesize = nsize * 4;
if (bytesize / 4 != nsize) if (bytesize / 4 != nsize)
return PyErr_NoMemory(); return PyErr_NoMemory();
v = PyByteArray_FromStringAndSize(NULL, bytesize); v = PyBytes_FromStringAndSize(NULL, bytesize);
if (v == NULL) if (v == NULL)
return NULL; return NULL;
p = (unsigned char *)PyByteArray_AS_STRING(v); p = (unsigned char *)PyBytes_AS_STRING(v);
if (byteorder == 0) if (byteorder == 0)
STORECHAR(0xFEFF); STORECHAR(0xFEFF);
if (size == 0) if (size == 0)
@ -2556,9 +2555,7 @@ PyUnicode_EncodeUTF32(const Py_UNICODE *s,
} }
done: done:
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); return v;
Py_DECREF(v);
return result;
#undef STORECHAR #undef STORECHAR
} }
@ -2757,7 +2754,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
const char *errors, const char *errors,
int byteorder) int byteorder)
{ {
PyObject *v, *result; PyObject *v;
unsigned char *p; unsigned char *p;
Py_ssize_t nsize, bytesize; Py_ssize_t nsize, bytesize;
#ifdef Py_UNICODE_WIDE #ifdef Py_UNICODE_WIDE
@ -2792,11 +2789,11 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
bytesize = nsize * 2; bytesize = nsize * 2;
if (bytesize / 2 != nsize) if (bytesize / 2 != nsize)
return PyErr_NoMemory(); return PyErr_NoMemory();
v = PyByteArray_FromStringAndSize(NULL, bytesize); v = PyBytes_FromStringAndSize(NULL, bytesize);
if (v == NULL) if (v == NULL)
return NULL; return NULL;
p = (unsigned char *)PyByteArray_AS_STRING(v); p = (unsigned char *)PyBytes_AS_STRING(v);
if (byteorder == 0) if (byteorder == 0)
STORECHAR(0xFEFF); STORECHAR(0xFEFF);
if (size == 0) if (size == 0)
@ -2828,9 +2825,7 @@ PyUnicode_EncodeUTF16(const Py_UNICODE *s,
} }
done: done:
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v)); return v;
Py_DECREF(v);
return result;
#undef STORECHAR #undef STORECHAR
} }
@ -3120,7 +3115,7 @@ static const char *hexdigits = "0123456789abcdef";
PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s, PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
Py_ssize_t size) Py_ssize_t size)
{ {
PyObject *repr, *result; PyObject *repr;
char *p; char *p;
#ifdef Py_UNICODE_WIDE #ifdef Py_UNICODE_WIDE
@ -3147,17 +3142,20 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
escape. escape.
*/ */
if (size == 0)
return PyBytes_FromStringAndSize(NULL, 0);
if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize) if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
return PyErr_NoMemory(); return PyErr_NoMemory();
repr = PyByteArray_FromStringAndSize(NULL, repr = PyBytes_FromStringAndSize(NULL,
2 2
+ expandsize*size + expandsize*size
+ 1); + 1);
if (repr == NULL) if (repr == NULL)
return NULL; return NULL;
p = PyByteArray_AS_STRING(repr); p = PyBytes_AS_STRING(repr);
while (size-- > 0) { while (size-- > 0) {
Py_UNICODE ch = *s++; Py_UNICODE ch = *s++;
@ -3249,13 +3247,13 @@ PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
*p++ = (char) ch; *p++ = (char) ch;
} }
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), assert(p - PyBytes_AS_STRING(repr) > 0);
p - PyByteArray_AS_STRING(repr)); if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
Py_DECREF(repr); return NULL;
return result; return repr;
} }
PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode) PyObject *PyUnicodeAsUnicodeEscapeString(PyObject *unicode)
{ {
PyObject *s; PyObject *s;
if (!PyUnicode_Check(unicode)) { if (!PyUnicode_Check(unicode)) {
@ -3389,7 +3387,7 @@ PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s, PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
Py_ssize_t size) Py_ssize_t size)
{ {
PyObject *repr, *result; PyObject *repr;
char *p; char *p;
char *q; char *q;
@ -3402,13 +3400,13 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
if (size > PY_SSIZE_T_MAX / expandsize) if (size > PY_SSIZE_T_MAX / expandsize)
return PyErr_NoMemory(); return PyErr_NoMemory();
repr = PyByteArray_FromStringAndSize(NULL, expandsize * size); repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
if (repr == NULL) if (repr == NULL)
return NULL; return NULL;
if (size == 0) if (size == 0)
goto done; return repr;
p = q = PyByteArray_AS_STRING(repr); p = q = PyBytes_AS_STRING(repr);
while (size-- > 0) { while (size-- > 0) {
Py_UNICODE ch = *s++; Py_UNICODE ch = *s++;
#ifdef Py_UNICODE_WIDE #ifdef Py_UNICODE_WIDE
@ -3468,10 +3466,10 @@ PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
} }
size = p - q; size = p - q;
done: assert(size > 0);
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size); if (_PyBytes_Resize(&repr, size) < 0)
Py_DECREF(repr); return NULL;
return result; return repr;
} }
PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
@ -3706,7 +3704,6 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)"; const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
PyObject *result = NULL;
/* the following variable is used for caching string comparisons /* the following variable is used for caching string comparisons
* -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */ * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
int known_errorHandler = -1; int known_errorHandler = -1;
@ -3715,10 +3712,10 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
replacements, if we need more, we'll resize */ replacements, if we need more, we'll resize */
if (size == 0) if (size == 0)
return PyBytes_FromStringAndSize(NULL, 0); return PyBytes_FromStringAndSize(NULL, 0);
res = PyByteArray_FromStringAndSize(NULL, size); res = PyBytes_FromStringAndSize(NULL, size);
if (res == NULL) if (res == NULL)
return NULL; return NULL;
str = PyByteArray_AS_STRING(res); str = PyBytes_AS_STRING(res);
ressize = size; ressize = size;
while (p<endp) { while (p<endp) {
@ -3768,7 +3765,7 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
p = collend; p = collend;
break; break;
case 4: /* xmlcharrefreplace */ case 4: /* xmlcharrefreplace */
respos = str - PyByteArray_AS_STRING(res); respos = str - PyBytes_AS_STRING(res);
/* determine replacement size (temporarily (mis)uses p) */ /* determine replacement size (temporarily (mis)uses p) */
for (p = collstart, repsize = 0; p < collend; ++p) { for (p = collstart, repsize = 0; p < collend; ++p) {
if (*p<10) if (*p<10)
@ -3795,9 +3792,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
if (requiredsize > ressize) { if (requiredsize > ressize) {
if (requiredsize<2*ressize) if (requiredsize<2*ressize)
requiredsize = 2*ressize; requiredsize = 2*ressize;
if (PyByteArray_Resize(res, requiredsize)) if (_PyBytes_Resize(&res, requiredsize))
goto onError; goto onError;
str = PyByteArray_AS_STRING(res) + respos; str = PyBytes_AS_STRING(res) + respos;
ressize = requiredsize; ressize = requiredsize;
} }
/* generate replacement (temporarily (mis)uses p) */ /* generate replacement (temporarily (mis)uses p) */
@ -3815,17 +3812,17 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
/* need more space? (at least enough for what we /* need more space? (at least enough for what we
have+the replacement+the rest of the string, so have+the replacement+the rest of the string, so
we won't have to check space for encodable characters) */ we won't have to check space for encodable characters) */
respos = str - PyByteArray_AS_STRING(res); respos = str - PyBytes_AS_STRING(res);
repsize = PyUnicode_GET_SIZE(repunicode); repsize = PyUnicode_GET_SIZE(repunicode);
requiredsize = respos+repsize+(endp-collend); requiredsize = respos+repsize+(endp-collend);
if (requiredsize > ressize) { if (requiredsize > ressize) {
if (requiredsize<2*ressize) if (requiredsize<2*ressize)
requiredsize = 2*ressize; requiredsize = 2*ressize;
if (PyByteArray_Resize(res, requiredsize)) { if (_PyBytes_Resize(&res, requiredsize)) {
Py_DECREF(repunicode); Py_DECREF(repunicode);
goto onError; goto onError;
} }
str = PyByteArray_AS_STRING(res) + respos; str = PyBytes_AS_STRING(res) + respos;
ressize = requiredsize; ressize = requiredsize;
} }
/* check if there is anything unencodable in the replacement /* check if there is anything unencodable in the replacement
@ -3845,13 +3842,23 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
} }
} }
} }
result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(res), /* Resize if we allocated to much */
str - PyByteArray_AS_STRING(res)); size = str - PyBytes_AS_STRING(res);
onError: if (size < ressize) { /* If this falls res will be NULL */
Py_DECREF(res); assert(size > 0);
if (_PyBytes_Resize(&res, size) < 0)
goto onError;
}
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
return result; return res;
onError:
Py_XDECREF(res);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
} }
PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p, PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
@ -4104,7 +4111,7 @@ static int encode_mbcs(PyObject **repr,
else { else {
/* Extend string object */ /* Extend string object */
n = PyBytes_Size(*repr); n = PyBytes_Size(*repr);
if (_PyBytes_Resize(repr, n + mbcssize) < 0) if (_PyBytes_Resize(&repr, n + mbcssize) < 0)
return -1; return -1;
} }
@ -4834,7 +4841,8 @@ PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
/* Resize if we allocated to much */ /* Resize if we allocated to much */
if (respos<PyBytes_GET_SIZE(res)) if (respos<PyBytes_GET_SIZE(res))
_PyBytes_Resize(&res, respos); if (_PyBytes_Resize(&res, respos) < 0)
goto onError;
Py_XDECREF(exc); Py_XDECREF(exc);
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);