mirror of
https://github.com/python/cpython.git
synced 2025-09-08 09:51:34 +00:00
Rewrite PyUnicode_Append(); unicode_modifiable() is more strict
* Rename unicode_resizable() to unicode_modifiable()
* Rename _PyUnicode_Dirty() to unicode_check_modifiable() to make it clear that the function is private
* Inline PyUnicode_Concat() and unicode_append_inplace() into PyUnicode_Append() to simplify the code
* unicode_modifiable() returns 0 if the hash has been computed or if the string is not an exact unicode string
* Remove _PyUnicode_DIRTY(): there is no need to reset the hash anymore, because once the hash has been computed, the string can no longer be modified in place
* PyUnicode_Concat() checks for integer overflow
This commit is contained in:
parent
24c74be9a3
commit
488fa49acf
1 changed files with 84 additions and 79 deletions
|
@ -165,9 +165,6 @@ extern "C" {
|
||||||
*_to++ = (to_type) *_iter++; \
|
*_to++ = (to_type) *_iter++; \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
/* The Unicode string has been modified: reset the hash */
|
|
||||||
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
|
|
||||||
|
|
||||||
/* This dictionary holds all interned unicode strings. Note that references
|
/* This dictionary holds all interned unicode strings. Note that references
|
||||||
to strings in this dictionary are *not* counted in the string's ob_refcnt.
|
to strings in this dictionary are *not* counted in the string's ob_refcnt.
|
||||||
When the interned string reaches a refcnt of 0 the string deallocation
|
When the interned string reaches a refcnt of 0 the string deallocation
|
||||||
|
@ -226,6 +223,8 @@ static void copy_characters(
|
||||||
PyObject *to, Py_ssize_t to_start,
|
PyObject *to, Py_ssize_t to_start,
|
||||||
PyObject *from, Py_ssize_t from_start,
|
PyObject *from, Py_ssize_t from_start,
|
||||||
Py_ssize_t how_many);
|
Py_ssize_t how_many);
|
||||||
|
static int unicode_modifiable(PyObject *unicode);
|
||||||
|
|
||||||
|
|
||||||
static PyObject *
|
static PyObject *
|
||||||
unicode_fromascii(const unsigned char *s, Py_ssize_t size);
|
unicode_fromascii(const unsigned char *s, Py_ssize_t size);
|
||||||
|
@ -645,10 +644,11 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
|
||||||
Py_ssize_t new_size;
|
Py_ssize_t new_size;
|
||||||
int share_wstr;
|
int share_wstr;
|
||||||
PyObject *new_unicode;
|
PyObject *new_unicode;
|
||||||
|
|
||||||
assert(PyUnicode_IS_READY(unicode));
|
assert(PyUnicode_IS_READY(unicode));
|
||||||
|
assert(PyUnicode_IS_COMPACT(unicode));
|
||||||
|
|
||||||
char_size = PyUnicode_KIND(unicode);
|
char_size = PyUnicode_KIND(unicode);
|
||||||
if (PyUnicode_IS_COMPACT_ASCII(unicode))
|
if (PyUnicode_IS_ASCII(unicode))
|
||||||
struct_size = sizeof(PyASCIIObject);
|
struct_size = sizeof(PyASCIIObject);
|
||||||
else
|
else
|
||||||
struct_size = sizeof(PyCompactUnicodeObject);
|
struct_size = sizeof(PyCompactUnicodeObject);
|
||||||
|
@ -676,7 +676,7 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
|
||||||
_PyUnicode_LENGTH(unicode) = length;
|
_PyUnicode_LENGTH(unicode) = length;
|
||||||
if (share_wstr) {
|
if (share_wstr) {
|
||||||
_PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
|
_PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
|
||||||
if (!PyUnicode_IS_COMPACT_ASCII(unicode))
|
if (!PyUnicode_IS_ASCII(unicode))
|
||||||
_PyUnicode_WSTR_LENGTH(unicode) = length;
|
_PyUnicode_WSTR_LENGTH(unicode) = length;
|
||||||
}
|
}
|
||||||
PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
|
PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
|
||||||
|
@ -691,8 +691,6 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
|
||||||
assert(!PyUnicode_IS_COMPACT(unicode));
|
assert(!PyUnicode_IS_COMPACT(unicode));
|
||||||
assert(Py_REFCNT(unicode) == 1);
|
assert(Py_REFCNT(unicode) == 1);
|
||||||
|
|
||||||
_PyUnicode_DIRTY(unicode);
|
|
||||||
|
|
||||||
if (PyUnicode_IS_READY(unicode)) {
|
if (PyUnicode_IS_READY(unicode)) {
|
||||||
Py_ssize_t char_size;
|
Py_ssize_t char_size;
|
||||||
Py_ssize_t new_size;
|
Py_ssize_t new_size;
|
||||||
|
@ -1115,15 +1113,13 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int
|
static int
|
||||||
_PyUnicode_Dirty(PyObject *unicode)
|
unicode_check_modifiable(PyObject *unicode)
|
||||||
{
|
{
|
||||||
assert(_PyUnicode_CHECK(unicode));
|
if (!unicode_modifiable(unicode)) {
|
||||||
if (Py_REFCNT(unicode) != 1) {
|
|
||||||
PyErr_SetString(PyExc_SystemError,
|
PyErr_SetString(PyExc_SystemError,
|
||||||
"Cannot modify a string having more than 1 reference");
|
"Cannot modify a string currently used");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
_PyUnicode_DIRTY(unicode);
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1289,7 +1285,7 @@ PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
|
||||||
if (how_many == 0)
|
if (how_many == 0)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
if (_PyUnicode_Dirty(to))
|
if (unicode_check_modifiable(to))
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
err = _copy_characters(to, to_start, from, from_start, how_many, 1);
|
err = _copy_characters(to, to_start, from, from_start, how_many, 1);
|
||||||
|
@ -1537,12 +1533,17 @@ unicode_is_singleton(PyObject *unicode)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int
|
static int
|
||||||
unicode_resizable(PyObject *unicode)
|
unicode_modifiable(PyObject *unicode)
|
||||||
{
|
{
|
||||||
|
assert(_PyUnicode_CHECK(unicode));
|
||||||
if (Py_REFCNT(unicode) != 1)
|
if (Py_REFCNT(unicode) != 1)
|
||||||
return 0;
|
return 0;
|
||||||
|
if (_PyUnicode_HASH(unicode) != -1)
|
||||||
|
return 0;
|
||||||
if (PyUnicode_CHECK_INTERNED(unicode))
|
if (PyUnicode_CHECK_INTERNED(unicode))
|
||||||
return 0;
|
return 0;
|
||||||
|
if (!PyUnicode_CheckExact(unicode))
|
||||||
|
return 0;
|
||||||
#ifdef Py_DEBUG
|
#ifdef Py_DEBUG
|
||||||
/* singleton refcount is greater than 1 */
|
/* singleton refcount is greater than 1 */
|
||||||
assert(!unicode_is_singleton(unicode));
|
assert(!unicode_is_singleton(unicode));
|
||||||
|
@ -1577,7 +1578,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!unicode_resizable(unicode)) {
|
if (!unicode_modifiable(unicode)) {
|
||||||
PyObject *copy = resize_copy(unicode, length);
|
PyObject *copy = resize_copy(unicode, length);
|
||||||
if (copy == NULL)
|
if (copy == NULL)
|
||||||
return -1;
|
return -1;
|
||||||
|
@ -3591,11 +3592,12 @@ PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
|
||||||
PyErr_BadArgument();
|
PyErr_BadArgument();
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
assert(PyUnicode_IS_READY(unicode));
|
||||||
if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
|
if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
|
||||||
PyErr_SetString(PyExc_IndexError, "string index out of range");
|
PyErr_SetString(PyExc_IndexError, "string index out of range");
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
if (_PyUnicode_Dirty(unicode))
|
if (unicode_check_modifiable(unicode))
|
||||||
return -1;
|
return -1;
|
||||||
PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
|
PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
|
||||||
index, ch);
|
index, ch);
|
||||||
|
@ -10566,6 +10568,7 @@ PyUnicode_Concat(PyObject *left, PyObject *right)
|
||||||
{
|
{
|
||||||
PyObject *u = NULL, *v = NULL, *w;
|
PyObject *u = NULL, *v = NULL, *w;
|
||||||
Py_UCS4 maxchar, maxchar2;
|
Py_UCS4 maxchar, maxchar2;
|
||||||
|
Py_ssize_t u_len, v_len, new_len;
|
||||||
|
|
||||||
/* Coerce the two arguments */
|
/* Coerce the two arguments */
|
||||||
u = PyUnicode_FromObject(left);
|
u = PyUnicode_FromObject(left);
|
||||||
|
@ -10585,18 +10588,25 @@ PyUnicode_Concat(PyObject *left, PyObject *right)
|
||||||
return v;
|
return v;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
u_len = PyUnicode_GET_LENGTH(u);
|
||||||
|
v_len = PyUnicode_GET_LENGTH(v);
|
||||||
|
if (u_len > PY_SSIZE_T_MAX - v_len) {
|
||||||
|
PyErr_SetString(PyExc_OverflowError,
|
||||||
|
"strings are too large to concat");
|
||||||
|
goto onError;
|
||||||
|
}
|
||||||
|
new_len = u_len + v_len;
|
||||||
|
|
||||||
maxchar = PyUnicode_MAX_CHAR_VALUE(u);
|
maxchar = PyUnicode_MAX_CHAR_VALUE(u);
|
||||||
maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
|
maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
|
||||||
maxchar = Py_MAX(maxchar, maxchar2);
|
maxchar = Py_MAX(maxchar, maxchar2);
|
||||||
|
|
||||||
/* Concat the two Unicode strings */
|
/* Concat the two Unicode strings */
|
||||||
w = PyUnicode_New(
|
w = PyUnicode_New(new_len, maxchar);
|
||||||
PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
|
|
||||||
maxchar);
|
|
||||||
if (w == NULL)
|
if (w == NULL)
|
||||||
goto onError;
|
goto onError;
|
||||||
copy_characters(w, 0, u, 0, PyUnicode_GET_LENGTH(u));
|
copy_characters(w, 0, u, 0, u_len);
|
||||||
copy_characters(w, PyUnicode_GET_LENGTH(u), v, 0, PyUnicode_GET_LENGTH(v));
|
copy_characters(w, u_len, v, 0, v_len);
|
||||||
Py_DECREF(u);
|
Py_DECREF(u);
|
||||||
Py_DECREF(v);
|
Py_DECREF(v);
|
||||||
assert(_PyUnicode_CheckConsistency(w, 1));
|
assert(_PyUnicode_CheckConsistency(w, 1));
|
||||||
|
@ -10608,49 +10618,12 @@ PyUnicode_Concat(PyObject *left, PyObject *right)
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
|
||||||
unicode_append_inplace(PyObject **p_left, PyObject *right)
|
|
||||||
{
|
|
||||||
Py_ssize_t left_len, right_len, new_len;
|
|
||||||
|
|
||||||
assert(PyUnicode_IS_READY(*p_left));
|
|
||||||
assert(PyUnicode_IS_READY(right));
|
|
||||||
|
|
||||||
left_len = PyUnicode_GET_LENGTH(*p_left);
|
|
||||||
right_len = PyUnicode_GET_LENGTH(right);
|
|
||||||
if (left_len > PY_SSIZE_T_MAX - right_len) {
|
|
||||||
PyErr_SetString(PyExc_OverflowError,
|
|
||||||
"strings are too large to concat");
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
new_len = left_len + right_len;
|
|
||||||
|
|
||||||
/* Now we own the last reference to 'left', so we can resize it
|
|
||||||
* in-place.
|
|
||||||
*/
|
|
||||||
if (unicode_resize(p_left, new_len) != 0) {
|
|
||||||
/* XXX if _PyUnicode_Resize() fails, 'left' has been
|
|
||||||
* deallocated so it cannot be put back into
|
|
||||||
* 'variable'. The MemoryError is raised when there
|
|
||||||
* is no value in 'variable', which might (very
|
|
||||||
* remotely) be a cause of incompatibilities.
|
|
||||||
*/
|
|
||||||
goto error;
|
|
||||||
}
|
|
||||||
/* copy 'right' into the newly allocated area of 'left' */
|
|
||||||
copy_characters(*p_left, left_len, right, 0, right_len);
|
|
||||||
_PyUnicode_DIRTY(*p_left);
|
|
||||||
return;
|
|
||||||
|
|
||||||
error:
|
|
||||||
Py_DECREF(*p_left);
|
|
||||||
*p_left = NULL;
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
PyUnicode_Append(PyObject **p_left, PyObject *right)
|
PyUnicode_Append(PyObject **p_left, PyObject *right)
|
||||||
{
|
{
|
||||||
PyObject *left, *res;
|
PyObject *left, *res;
|
||||||
|
Py_UCS4 maxchar, maxchar2;
|
||||||
|
Py_ssize_t left_len, right_len, new_len;
|
||||||
|
|
||||||
if (p_left == NULL) {
|
if (p_left == NULL) {
|
||||||
if (!PyErr_Occurred())
|
if (!PyErr_Occurred())
|
||||||
|
@ -10669,34 +10642,66 @@ PyUnicode_Append(PyObject **p_left, PyObject *right)
|
||||||
if (PyUnicode_READY(right))
|
if (PyUnicode_READY(right))
|
||||||
goto error;
|
goto error;
|
||||||
|
|
||||||
if (PyUnicode_CheckExact(left) && left != unicode_empty
|
/* Shortcuts */
|
||||||
&& PyUnicode_CheckExact(right) && right != unicode_empty
|
if (left == unicode_empty) {
|
||||||
&& unicode_resizable(left)
|
Py_DECREF(left);
|
||||||
&& (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
|
Py_INCREF(right);
|
||||||
|| _PyUnicode_WSTR(left) != NULL))
|
*p_left = right;
|
||||||
{
|
return;
|
||||||
|
}
|
||||||
|
if (right == unicode_empty)
|
||||||
|
return;
|
||||||
|
|
||||||
|
left_len = PyUnicode_GET_LENGTH(left);
|
||||||
|
right_len = PyUnicode_GET_LENGTH(right);
|
||||||
|
if (left_len > PY_SSIZE_T_MAX - right_len) {
|
||||||
|
PyErr_SetString(PyExc_OverflowError,
|
||||||
|
"strings are too large to concat");
|
||||||
|
goto error;
|
||||||
|
}
|
||||||
|
new_len = left_len + right_len;
|
||||||
|
|
||||||
|
if (unicode_modifiable(left)
|
||||||
|
&& PyUnicode_CheckExact(right)
|
||||||
|
&& PyUnicode_KIND(right) <= PyUnicode_KIND(left)
|
||||||
/* Don't resize for ascii += latin1. Convert ascii to latin1 requires
|
/* Don't resize for ascii += latin1. Convert ascii to latin1 requires
|
||||||
to change the structure size, but characters are stored just after
|
to change the structure size, but characters are stored just after
|
||||||
the structure, and so it requires to move all characters which is
|
the structure, and so it requires to move all characters which is
|
||||||
not so different than duplicating the string. */
|
not so different than duplicating the string. */
|
||||||
if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
|
&& !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
|
||||||
{
|
{
|
||||||
unicode_append_inplace(p_left, right);
|
/* append inplace */
|
||||||
assert(p_left == NULL || _PyUnicode_CheckConsistency(*p_left, 1));
|
if (unicode_resize(p_left, new_len) != 0) {
|
||||||
return;
|
/* XXX if _PyUnicode_Resize() fails, 'left' has been
|
||||||
|
* deallocated so it cannot be put back into
|
||||||
|
* 'variable'. The MemoryError is raised when there
|
||||||
|
* is no value in 'variable', which might (very
|
||||||
|
* remotely) be a cause of incompatibilities.
|
||||||
|
*/
|
||||||
|
goto error;
|
||||||
}
|
}
|
||||||
|
/* copy 'right' into the newly allocated area of 'left' */
|
||||||
|
copy_characters(*p_left, left_len, right, 0, right_len);
|
||||||
}
|
}
|
||||||
|
else {
|
||||||
|
maxchar = PyUnicode_MAX_CHAR_VALUE(left);
|
||||||
|
maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
|
||||||
|
maxchar = Py_MAX(maxchar, maxchar2);
|
||||||
|
|
||||||
res = PyUnicode_Concat(left, right);
|
/* Concat the two Unicode strings */
|
||||||
|
res = PyUnicode_New(new_len, maxchar);
|
||||||
if (res == NULL)
|
if (res == NULL)
|
||||||
goto error;
|
goto error;
|
||||||
|
copy_characters(res, 0, left, 0, left_len);
|
||||||
|
copy_characters(res, left_len, right, 0, right_len);
|
||||||
Py_DECREF(left);
|
Py_DECREF(left);
|
||||||
*p_left = res;
|
*p_left = res;
|
||||||
|
}
|
||||||
|
assert(_PyUnicode_CheckConsistency(*p_left, 1));
|
||||||
return;
|
return;
|
||||||
|
|
||||||
error:
|
error:
|
||||||
Py_DECREF(*p_left);
|
Py_CLEAR(*p_left);
|
||||||
*p_left = NULL;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue