Rewrite PyUnicode_Resize()

* Rename _PyUnicode_Resize() to unicode_resize()
 * unicode_resize() creates a copy if the string cannot be resized instead
   of failing
 * Optimize resize_copy() for wstr strings
 * Temporarily disable resize_inplace()
This commit is contained in:
Victor Stinner 2011-10-03 03:52:20 +02:00
parent 829c0adca9
commit fe226c0d37

View file

@ -193,6 +193,8 @@ const unsigned char _Py_ascii_whitespace[] = {
0, 0, 0, 0, 0, 0, 0, 0 0, 0, 0, 0, 0, 0, 0, 0
}; };
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject * static PyObject *
unicode_encode_call_errorhandler(const char *errors, unicode_encode_call_errorhandler(const char *errors,
PyObject **errorHandler,const char *encoding, const char *reason, PyObject **errorHandler,const char *encoding, const char *reason,
@ -320,41 +322,94 @@ Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
return NULL; return NULL;
} }
/* Resize a ready, compact string by reallocating the object itself.

   unicode: the string to resize. unicode_resize() only gets here after
            unicode_resizable() accepted the object, i.e. its reference
            count is 1.
   length:  the new length in characters (the terminating null character is
            added on top of it).

   Returns the (possibly moved) object on success; the pointer passed in
   must not be used afterwards. Returns NULL with MemoryError set on
   failure; in that case the original object is left untouched and valid. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;

    assert(PyUnicode_IS_READY(unicode));

    char_size = PyUnicode_CHARACTER_SIZE(unicode);
    if (PyUnicode_IS_COMPACT_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* Remember whether wstr aliases the character buffer: the buffer may
       move during the reallocation and wstr has to follow it. */
    share_wstr = (_PyUnicode_WSTR(unicode) == PyUnicode_DATA(unicode));

    /* Validate the size before touching the reference bookkeeping, so this
       error path leaves the object exactly as it was. */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    new_size = (struct_size + (length + 1) * char_size);

    _Py_DEC_REFTOTAL;
    _Py_ForgetReference(unicode);

    /* Keep the old pointer until the reallocation is known to have
       succeeded: on failure the original block is still allocated. */
    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
    if (new_unicode == NULL) {
        /* Undo the bookkeeping above; the reference count was 1, which is
           what _Py_NewReference() restores. */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        /* Non-ASCII compact objects store the wstr length in a separate
           field; keep it in sync with the shared buffer. Compact ASCII
           objects have no such field. */
        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    /* Write the terminating null character. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    return unicode;
}
static int static int
unicode_resize(register PyUnicodeObject *unicode, resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
Py_ssize_t length)
{ {
void *oldstr; void *oldstr;
/* Resizing is only supported for old unicode objects. */
assert(!PyUnicode_IS_COMPACT(unicode)); assert(!PyUnicode_IS_COMPACT(unicode));
assert(_PyUnicode_WSTR(unicode) != NULL);
/* ... and only if they have not been readied yet, because assert(Py_REFCNT(unicode) == 1);
callees usually rely on the wstr representation when resizing. */ _PyUnicode_DIRTY(unicode);
assert(unicode->data.any == NULL);
/* Shortcut if there's nothing much to do. */ if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
if (_PyUnicode_WSTR_LENGTH(unicode) == length) {
goto reset; PyObject_DEL(_PyUnicode_UTF8(unicode));
_PyUnicode_UTF8(unicode) = NULL;
/* Resizing shared object (unicode_empty or single character
objects) in-place is not allowed. Use PyUnicode_Resize()
instead ! */
if (unicode == unicode_empty ||
(_PyUnicode_WSTR_LENGTH(unicode) == 1 &&
_PyUnicode_WSTR(unicode)[0] < 256U &&
unicode_latin1[_PyUnicode_WSTR(unicode)[0]] == unicode)) {
PyErr_SetString(PyExc_SystemError,
"can't resize shared str objects");
return -1;
} }
/* We allocate one more byte to make sure the string is Ux0000 terminated. if (PyUnicode_IS_READY(unicode)) {
The overallocation is also used by fastsearch, which assumes that it's Py_ssize_t char_size;
safe to look at str[length] (without making any assumptions about what Py_ssize_t new_size;
it contains). */ int share_wstr;
void *data;
data = _PyUnicode_DATA_ANY(unicode);
assert(data != NULL);
char_size = PyUnicode_CHARACTER_SIZE(unicode);
share_wstr = (_PyUnicode_WSTR(unicode) == data);
if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
PyErr_NoMemory();
return -1;
}
new_size = (length + 1) * char_size;
data = (PyObject *)PyObject_REALLOC(data, new_size);
if (data == NULL) {
PyErr_NoMemory();
return -1;
}
_PyUnicode_DATA_ANY(unicode) = data;
if (share_wstr)
_PyUnicode_WSTR(unicode) = data;
_PyUnicode_LENGTH(unicode) = length;
PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
if (share_wstr)
return 0;
}
if (_PyUnicode_WSTR(unicode) != NULL) {
assert(_PyUnicode_WSTR(unicode) != NULL);
oldstr = _PyUnicode_WSTR(unicode); oldstr = _PyUnicode_WSTR(unicode);
_PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode), _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
@ -366,25 +421,45 @@ unicode_resize(register PyUnicodeObject *unicode,
} }
_PyUnicode_WSTR(unicode)[length] = 0; _PyUnicode_WSTR(unicode)[length] = 0;
_PyUnicode_WSTR_LENGTH(unicode) = length; _PyUnicode_WSTR_LENGTH(unicode) = length;
reset:
if (unicode->data.any != NULL) {
PyObject_FREE(unicode->data.any);
if (_PyUnicode_UTF8(unicode) && _PyUnicode_UTF8(unicode) != unicode->data.any) {
PyObject_FREE(_PyUnicode_UTF8(unicode));
} }
_PyUnicode_UTF8(unicode) = NULL;
_PyUnicode_UTF8_LENGTH(unicode) = 0;
unicode->data.any = NULL;
_PyUnicode_LENGTH(unicode) = 0;
_PyUnicode_STATE(unicode).interned = _PyUnicode_STATE(unicode).interned;
_PyUnicode_STATE(unicode).kind = PyUnicode_WCHAR_KIND;
}
_PyUnicode_DIRTY(unicode);
return 0; return 0;
} }
/* Return a new string of `length` characters whose leading characters are
   copied from `unicode` (as many as fit in both strings). The source string
   is not modified. Returns NULL if the new string cannot be created or the
   characters cannot be copied. */
static PyObject*
resize_copy(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t ncopy;

    if (!PyUnicode_IS_COMPACT(unicode)) {
        /* Non-compact string that only has a wstr buffer: copy the
           Py_UNICODE characters into a new string of the same kind. */
        PyUnicodeObject *fresh;

        assert(_PyUnicode_WSTR(unicode) != NULL);
        assert(_PyUnicode_DATA_ANY(unicode) == NULL);

        fresh = _PyUnicode_New(length);
        if (fresh == NULL)
            return NULL;

        ncopy = _PyUnicode_WSTR_LENGTH(unicode);
        ncopy = Py_MIN(ncopy, length);
        Py_UNICODE_COPY(_PyUnicode_WSTR(fresh), _PyUnicode_WSTR(unicode),
                        ncopy);
        return (PyObject *)fresh;
    }
    else {
        /* Compact string: create a string able to hold the same maximum
           character value and copy the characters over. */
        PyObject *result;

        assert(PyUnicode_IS_READY(unicode));

        result = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
        if (result == NULL)
            return NULL;

        ncopy = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
        if (PyUnicode_CopyCharacters(result, 0, unicode, 0, ncopy) < 0) {
            Py_DECREF(result);
            return NULL;
        }
        return result;
    }
}
/* We allocate one more byte to make sure the string is /* We allocate one more byte to make sure the string is
Ux0000 terminated; some code (e.g. new_identifier) Ux0000 terminated; some code (e.g. new_identifier)
relies on that. relies on that.
@ -690,7 +765,6 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
_PyUnicode_GET_LENGTH(unicode))); _PyUnicode_GET_LENGTH(unicode)));
return 0;
} }
#endif #endif
@ -1044,50 +1118,84 @@ unicode_dealloc(register PyUnicodeObject *unicode)
} }
static int static int
_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length) unicode_resizable(PyObject *unicode)
{ {
register PyUnicodeObject *v; if (Py_REFCNT(unicode) != 1)
return 0;
/* Argument checks */ if (PyUnicode_CHECK_INTERNED(unicode))
if (unicode == NULL) { return 0;
PyErr_BadInternalCall(); if (unicode == unicode_empty)
return -1; return 0;
} if (PyUnicode_WSTR_LENGTH(unicode) == 1) {
v = *unicode; Py_UCS4 ch;
if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0 || if (PyUnicode_IS_COMPACT(unicode))
PyUnicode_IS_COMPACT(v) || _PyUnicode_WSTR(v) == NULL) { ch = PyUnicode_READ_CHAR(unicode, 0);
PyErr_BadInternalCall(); else
return -1; ch = _PyUnicode_WSTR(unicode)[0];
if (ch < 256 && unicode_latin1[ch] == unicode)
return 0;
} }
/* FIXME: reenable resize_inplace */
if (!PyUnicode_IS_COMPACT(unicode))
return 0;
return 1;
}
/* Resizing unicode_empty and single character objects is not static int
possible since these are being shared. unicode_resize(PyObject **p_unicode, Py_ssize_t length)
The same goes for new-representation unicode objects or objects which {
have already been readied. PyObject *unicode;
For these, we simply return a fresh copy with the same Unicode content. Py_ssize_t old_length;
*/
if ((_PyUnicode_WSTR_LENGTH(v) != length && assert(p_unicode != NULL);
(v == unicode_empty || _PyUnicode_WSTR_LENGTH(v) == 1)) || unicode = *p_unicode;
PyUnicode_IS_COMPACT(v) || v->data.any) {
PyUnicodeObject *w = _PyUnicode_New(length); assert(unicode != NULL);
if (w == NULL) assert(PyUnicode_Check(unicode));
assert(0 <= length);
if (!PyUnicode_IS_COMPACT(unicode) && !PyUnicode_IS_READY(unicode))
old_length = PyUnicode_WSTR_LENGTH(unicode);
else
old_length = PyUnicode_GET_LENGTH(unicode);
if (old_length == length)
return 0;
/* FIXME: really create a new object? */
if (!unicode_resizable(unicode)) {
PyObject *copy = resize_copy(unicode, length);
if (copy == NULL)
return -1; return -1;
Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(v), Py_DECREF(*p_unicode);
length < _PyUnicode_WSTR_LENGTH(v) ? length : _PyUnicode_WSTR_LENGTH(v)); *p_unicode = copy;
Py_DECREF(*unicode);
*unicode = w;
return 0; return 0;
} }
/* Note that we don't have to modify *unicode for unshared Unicode if (PyUnicode_IS_COMPACT(unicode)) {
objects, since we can modify them in-place. */ *p_unicode = resize_compact(unicode, length);
return unicode_resize(v, length); if (*p_unicode == NULL)
return -1;
return 0;
} else
return resize_inplace((PyUnicodeObject*)unicode, length);
} }
int int
PyUnicode_Resize(PyObject **unicode, Py_ssize_t length) PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
{ {
return _PyUnicode_Resize((PyUnicodeObject **)unicode, length); PyObject *unicode;
if (p_unicode == NULL) {
PyErr_BadInternalCall();
return -1;
}
unicode = *p_unicode;
if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
|| _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
{
PyErr_BadInternalCall();
return -1;
}
return unicode_resize(p_unicode, length);
} }
static PyObject* static PyObject*
@ -3085,7 +3193,7 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
if (requiredsize > outsize) { if (requiredsize > outsize) {
if (requiredsize<2*outsize) if (requiredsize<2*outsize)
requiredsize = 2*outsize; requiredsize = 2*outsize;
if (_PyUnicode_Resize(output, requiredsize) < 0) if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
goto onError; goto onError;
*outptr = PyUnicode_AS_UNICODE(*output) + *outpos; *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
} }
@ -3375,7 +3483,7 @@ utf7Error:
} }
} }
if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
@ -3944,7 +4052,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
/* Adjust length and ready string when it contained errors and /* Adjust length and ready string when it contained errors and
is of the old resizable kind. */ is of the old resizable kind. */
if (kind == PyUnicode_WCHAR_KIND) { if (kind == PyUnicode_WCHAR_KIND) {
if (_PyUnicode_Resize(&unicode, i) < 0 || if (PyUnicode_Resize((PyObject**)&unicode, i) < 0 ||
PyUnicode_READY(unicode) == -1) PyUnicode_READY(unicode) == -1)
goto onError; goto onError;
} }
@ -4449,7 +4557,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
*consumed = (const char *)q-starts; *consumed = (const char *)q-starts;
/* Adjust length */ /* Adjust length */
if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
@ -4847,7 +4955,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
*consumed = (const char *)q-starts; *consumed = (const char *)q-starts;
/* Adjust length */ /* Adjust length */
if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
@ -5304,9 +5412,13 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
/* Ensure the length prediction worked in case of ASCII strings */ /* Ensure the length prediction worked in case of ASCII strings */
assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length); assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
if (kind == PyUnicode_WCHAR_KIND && (_PyUnicode_Resize(&v, i) < 0 || if (kind == PyUnicode_WCHAR_KIND)
PyUnicode_READY(v) == -1)) {
if (PyUnicode_Resize((PyObject**)&v, i) < 0)
goto onError; goto onError;
if (PyUnicode_READY(v) == -1)
goto onError;
}
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
return (PyObject *)v; return (PyObject *)v;
@ -5602,7 +5714,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
nextByte: nextByte:
; ;
} }
if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
@ -5790,7 +5902,7 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
} }
} }
if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
@ -6216,7 +6328,7 @@ PyUnicode_DecodeASCII(const char *s,
} }
} }
if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
@ -6343,7 +6455,7 @@ decode_mbcs(PyUnicodeObject **v,
else { else {
/* Extend unicode object */ /* Extend unicode object */
n = PyUnicode_GET_SIZE(*v); n = PyUnicode_GET_SIZE(*v);
if (_PyUnicode_Resize(v, n + usize) < 0) if (PyUnicode_Resize(v, n + usize) < 0)
return -1; return -1;
} }
@ -6682,7 +6794,7 @@ PyUnicode_DecodeCharmap(const char *s,
(targetsize << 2); (targetsize << 2);
extrachars += needed; extrachars += needed;
/* XXX overflow detection missing */ /* XXX overflow detection missing */
if (_PyUnicode_Resize(&v, if (PyUnicode_Resize((PyObject**)&v,
PyUnicode_GET_SIZE(v) + needed) < 0) { PyUnicode_GET_SIZE(v) + needed) < 0) {
Py_DECREF(x); Py_DECREF(x);
goto onError; goto onError;
@ -6709,7 +6821,7 @@ PyUnicode_DecodeCharmap(const char *s,
} }
} }
if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);