mirror of
https://github.com/python/cpython.git
synced 2025-10-07 07:31:46 +00:00
Add _PyUnicode_CheckConsistency() macro to help debugging
* Document Unicode string states * Use _PyUnicode_CheckConsistency() to ensure that objects are always consistent.
This commit is contained in:
parent
4fae54cb0e
commit
910337b42e
2 changed files with 144 additions and 37 deletions
|
@ -206,6 +206,52 @@ extern "C" {
|
||||||
immediately follow the structure. utf8_length and wstr_length can be found
|
immediately follow the structure. utf8_length and wstr_length can be found
|
||||||
in the length field; the utf8 pointer is equal to the data pointer. */
|
in the length field; the utf8 pointer is equal to the data pointer. */
|
||||||
typedef struct {
|
typedef struct {
|
||||||
|
/* Unicode strings can be in 4 states:
|
||||||
|
|
||||||
|
- compact ascii:
|
||||||
|
|
||||||
|
* structure = PyASCIIObject
|
||||||
|
* kind = PyUnicode_1BYTE_KIND
|
||||||
|
* compact = 1
|
||||||
|
* ascii = 1
|
||||||
|
* ready = 1
|
||||||
|
* utf8 = data
|
||||||
|
|
||||||
|
- compact:
|
||||||
|
|
||||||
|
* structure = PyCompactUnicodeObject
|
||||||
|
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
|
||||||
|
PyUnicode_4BYTE_KIND
|
||||||
|
* compact = 1
|
||||||
|
* ready = 1
|
||||||
|
* (ascii = 0)
|
||||||
|
|
||||||
|
- string created by the legacy API (not ready):
|
||||||
|
|
||||||
|
* structure = PyUnicodeObject
|
||||||
|
* kind = PyUnicode_WCHAR_KIND
|
||||||
|
* compact = 0
|
||||||
|
* ready = 0
|
||||||
|
* wstr is not NULL
|
||||||
|
* data.any is NULL
|
||||||
|
* utf8 is NULL
|
||||||
|
* interned = SSTATE_NOT_INTERNED
|
||||||
|
* (ascii = 0)
|
||||||
|
|
||||||
|
- string created by the legacy API, ready:
|
||||||
|
|
||||||
|
* structure = PyUnicodeObject
|
||||||
|
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
|
||||||
|
PyUnicode_4BYTE_KIND
|
||||||
|
* compact = 0
|
||||||
|
* ready = 1
|
||||||
|
* data.any is not NULL
|
||||||
|
* (ascii = 0)
|
||||||
|
|
||||||
|
A string created by the legacy API becomes ready when
|
||||||
|
PyUnicode_READY() is called on it.
|
||||||
|
|
||||||
|
See also _PyUnicode_CheckConsistency(). */
|
||||||
PyObject_HEAD
|
PyObject_HEAD
|
||||||
Py_ssize_t length; /* Number of code points in the string */
|
Py_ssize_t length; /* Number of code points in the string */
|
||||||
Py_hash_t hash; /* Hash value; -1 if not set */
|
Py_hash_t hash; /* Hash value; -1 if not set */
|
||||||
|
|
|
@ -89,6 +89,55 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
/* _PyUnicode_CHECK(op): object check meant to be wrapped in assert().
   With Py_DEBUG defined it expands to _PyUnicode_CheckConsistency(op);
   otherwise it expands to PyUnicode_Check(op). */
#ifdef Py_DEBUG
|
||||||
|
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
|
||||||
|
#else
|
||||||
|
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Unchecked access to the utf8 field, viewing op as a
   PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
|
||||||
|
(((PyCompactUnicodeObject*)(op))->utf8)
|
||||||
|
/* Checked accessor: asserts _PyUnicode_CHECK(op) and PyUnicode_IS_READY(op),
   then evaluates to the address just past the PyASCIIObject header when op
   is compact ASCII, or to the utf8 field otherwise.  op is evaluated more
   than once. */
#define PyUnicode_UTF8(op) \
|
||||||
|
(assert(_PyUnicode_CHECK(op)), \
|
||||||
|
assert(PyUnicode_IS_READY(op)), \
|
||||||
|
PyUnicode_IS_COMPACT_ASCII(op) ? \
|
||||||
|
((char*)((PyASCIIObject*)(op) + 1)) : \
|
||||||
|
_PyUnicode_UTF8(op))
|
||||||
|
/* Unchecked access to the utf8_length field, viewing op as a
   PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
|
||||||
|
(((PyCompactUnicodeObject*)(op))->utf8_length)
|
||||||
|
/* Checked accessor: asserts _PyUnicode_CHECK(op) and PyUnicode_IS_READY(op),
   then evaluates to the PyASCIIObject length field when op is compact ASCII,
   or to the utf8_length field otherwise.  op is evaluated more than once. */
#define PyUnicode_UTF8_LENGTH(op) \
|
||||||
|
(assert(_PyUnicode_CHECK(op)), \
|
||||||
|
assert(PyUnicode_IS_READY(op)), \
|
||||||
|
PyUnicode_IS_COMPACT_ASCII(op) ? \
|
||||||
|
((PyASCIIObject*)(op))->length : \
|
||||||
|
_PyUnicode_UTF8_LENGTH(op))
|
||||||
|
/* Field accessors.  Each macro casts op to the structure type that declares
   the field and yields the field itself.  Only _PyUnicode_KIND and
   _PyUnicode_GET_LENGTH assert _PyUnicode_CHECK(op) first; the others
   perform no check. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
|
||||||
|
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
|
||||||
|
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
|
||||||
|
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
|
||||||
|
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
|
||||||
|
/* Checked read of state.kind. */
#define _PyUnicode_KIND(op) \
|
||||||
|
(assert(_PyUnicode_CHECK(op)), \
|
||||||
|
((PyASCIIObject *)(op))->state.kind)
|
||||||
|
/* Checked read of the length field. */
#define _PyUnicode_GET_LENGTH(op) \
|
||||||
|
(assert(_PyUnicode_CHECK(op)), \
|
||||||
|
((PyASCIIObject *)(op))->length)
|
||||||
|
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
|
||||||
|
|
||||||
|
/* Replace any earlier definition of PyUnicode_READY with one that asserts
   _PyUnicode_CHECK(op) first.  Evaluates to 0 when PyUnicode_IS_READY(op)
   is true, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
|
||||||
|
#define PyUnicode_READY(op) \
|
||||||
|
(assert(_PyUnicode_CHECK(op)), \
|
||||||
|
(PyUnicode_IS_READY(op) ? \
|
||||||
|
0 : _PyUnicode_Ready((PyObject *)(op))))
|
||||||
|
|
||||||
|
/* True if the Unicode object has an allocated UTF-8 memory block that is
   not shared with other data: op is not compact ASCII, its utf8 field is
   non-NULL, and that pointer differs from PyUnicode_DATA(op).
   Asserts _PyUnicode_CHECK(op) first. */
|
||||||
|
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
|
||||||
|
(assert(_PyUnicode_CHECK(op)), \
|
||||||
|
(!PyUnicode_IS_COMPACT_ASCII(op) \
|
||||||
|
&& _PyUnicode_UTF8(op) \
|
||||||
|
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
|
||||||
|
|
||||||
/* Generic helper macro to convert characters of different types.
|
/* Generic helper macro to convert characters of different types.
|
||||||
from_type and to_type have to be valid type names, begin and end
|
from_type and to_type have to be valid type names, begin and end
|
||||||
are pointers to the source characters which should be of type
|
are pointers to the source characters which should be of type
|
||||||
|
@ -104,44 +153,6 @@ extern "C" {
|
||||||
} \
|
} \
|
||||||
} while (0)
|
} while (0)
|
||||||
|
|
||||||
#define _PyUnicode_UTF8(op) \
|
|
||||||
(((PyCompactUnicodeObject*)(op))->utf8)
|
|
||||||
#define PyUnicode_UTF8(op) \
|
|
||||||
(assert(PyUnicode_Check(op)), \
|
|
||||||
assert(PyUnicode_IS_READY(op)), \
|
|
||||||
PyUnicode_IS_COMPACT_ASCII(op) ? \
|
|
||||||
((char*)((PyASCIIObject*)(op) + 1)) : \
|
|
||||||
_PyUnicode_UTF8(op))
|
|
||||||
#define _PyUnicode_UTF8_LENGTH(op) \
|
|
||||||
(((PyCompactUnicodeObject*)(op))->utf8_length)
|
|
||||||
#define PyUnicode_UTF8_LENGTH(op) \
|
|
||||||
(assert(PyUnicode_Check(op)), \
|
|
||||||
assert(PyUnicode_IS_READY(op)), \
|
|
||||||
PyUnicode_IS_COMPACT_ASCII(op) ? \
|
|
||||||
((PyASCIIObject*)(op))->length : \
|
|
||||||
_PyUnicode_UTF8_LENGTH(op))
|
|
||||||
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
|
|
||||||
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
|
|
||||||
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
|
|
||||||
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
|
|
||||||
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
|
|
||||||
#define _PyUnicode_KIND(op) \
|
|
||||||
(assert(PyUnicode_Check(op)), \
|
|
||||||
((PyASCIIObject *)(op))->state.kind)
|
|
||||||
#define _PyUnicode_GET_LENGTH(op) \
|
|
||||||
(assert(PyUnicode_Check(op)), \
|
|
||||||
((PyASCIIObject *)(op))->length)
|
|
||||||
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
|
|
||||||
|
|
||||||
/* true if the Unicode object has an allocated UTF-8 memory block
|
|
||||||
(not shared with other data) */
|
|
||||||
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
|
|
||||||
(assert(PyUnicode_Check(op)), \
|
|
||||||
(!PyUnicode_IS_COMPACT_ASCII(op) \
|
|
||||||
&& _PyUnicode_UTF8(op) \
|
|
||||||
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
|
|
||||||
|
|
||||||
|
|
||||||
/* The Unicode string has been modified: reset the hash */
|
/* The Unicode string has been modified: reset the hash */
|
||||||
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
|
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
|
||||||
|
|
||||||
|
@ -250,6 +261,57 @@ PyUnicode_GetMax(void)
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef Py_DEBUG
|
||||||
|
static int
|
||||||
|
_PyUnicode_CheckConsistency(void *op)
|
||||||
|
{
|
||||||
|
PyASCIIObject *ascii;
|
||||||
|
unsigned int kind;
|
||||||
|
|
||||||
|
assert(PyUnicode_Check(op));
|
||||||
|
|
||||||
|
ascii = (PyASCIIObject *)op;
|
||||||
|
kind = ascii->state.kind;
|
||||||
|
|
||||||
|
if (ascii->state.ascii == 1) {
|
||||||
|
assert(kind == PyUnicode_1BYTE_KIND);
|
||||||
|
assert(ascii->state.compact == 1);
|
||||||
|
assert(ascii->state.ready == 1);
|
||||||
|
}
|
||||||
|
else if (ascii->state.compact == 1) {
|
||||||
|
assert(kind == PyUnicode_1BYTE_KIND
|
||||||
|
|| kind == PyUnicode_2BYTE_KIND
|
||||||
|
|| kind == PyUnicode_4BYTE_KIND);
|
||||||
|
assert(ascii->state.compact == 1);
|
||||||
|
assert(ascii->state.ascii == 0);
|
||||||
|
assert(ascii->state.ready == 1);
|
||||||
|
} else {
|
||||||
|
PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
|
||||||
|
PyUnicodeObject *unicode = (PyUnicodeObject *)op;
|
||||||
|
|
||||||
|
if (kind == PyUnicode_WCHAR_KIND) {
|
||||||
|
assert(!ascii->state.compact == 1);
|
||||||
|
assert(ascii->state.ascii == 0);
|
||||||
|
assert(!ascii->state.ready == 1);
|
||||||
|
assert(ascii->wstr != NULL);
|
||||||
|
assert(unicode->data.any == NULL);
|
||||||
|
assert(compact->utf8 == NULL);
|
||||||
|
assert(ascii->state.interned == SSTATE_NOT_INTERNED);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert(kind == PyUnicode_1BYTE_KIND
|
||||||
|
|| kind == PyUnicode_2BYTE_KIND
|
||||||
|
|| kind == PyUnicode_4BYTE_KIND);
|
||||||
|
assert(!ascii->state.compact == 1);
|
||||||
|
assert(ascii->state.ready == 1);
|
||||||
|
assert(unicode->data.any != NULL);
|
||||||
|
assert(ascii->state.ascii == 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
/* --- Bloom Filters ----------------------------------------------------- */
|
/* --- Bloom Filters ----------------------------------------------------- */
|
||||||
|
|
||||||
/* stuff to implement simple "bloom filters" for Unicode characters.
|
/* stuff to implement simple "bloom filters" for Unicode characters.
|
||||||
|
@ -542,7 +604,7 @@ _PyUnicode_New(Py_ssize_t length)
|
||||||
static const char*
|
static const char*
|
||||||
unicode_kind_name(PyObject *unicode)
|
unicode_kind_name(PyObject *unicode)
|
||||||
{
|
{
|
||||||
assert(PyUnicode_Check(unicode));
|
assert(_PyUnicode_CHECK(unicode));
|
||||||
if (!PyUnicode_IS_COMPACT(unicode))
|
if (!PyUnicode_IS_COMPACT(unicode))
|
||||||
{
|
{
|
||||||
if (!PyUnicode_IS_READY(unicode))
|
if (!PyUnicode_IS_READY(unicode))
|
||||||
|
@ -744,7 +806,8 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
|
||||||
const wchar_t *iter;
|
const wchar_t *iter;
|
||||||
Py_UCS4 *ucs4_out;
|
Py_UCS4 *ucs4_out;
|
||||||
|
|
||||||
assert(unicode && PyUnicode_Check(unicode));
|
assert(unicode != NULL);
|
||||||
|
assert(_PyUnicode_CHECK(unicode));
|
||||||
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
|
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
|
||||||
ucs4_out = PyUnicode_4BYTE_DATA(unicode);
|
ucs4_out = PyUnicode_4BYTE_DATA(unicode);
|
||||||
|
|
||||||
|
@ -771,7 +834,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
|
||||||
static int
|
static int
|
||||||
_PyUnicode_Dirty(PyObject *unicode)
|
_PyUnicode_Dirty(PyObject *unicode)
|
||||||
{
|
{
|
||||||
assert(PyUnicode_Check(unicode));
|
assert(_PyUnicode_CHECK(unicode));
|
||||||
if (Py_REFCNT(unicode) != 1) {
|
if (Py_REFCNT(unicode) != 1) {
|
||||||
PyErr_SetString(PyExc_ValueError,
|
PyErr_SetString(PyExc_ValueError,
|
||||||
"Cannot modify a string having more than 1 reference");
|
"Cannot modify a string having more than 1 reference");
|
||||||
|
@ -966,10 +1029,8 @@ _PyUnicode_Ready(PyObject *obj)
|
||||||
strings were created using _PyObject_New() and where no canonical
|
strings were created using _PyObject_New() and where no canonical
|
||||||
representation (the str field) has been set yet aka strings
|
representation (the str field) has been set yet aka strings
|
||||||
which are not yet ready. */
|
which are not yet ready. */
|
||||||
assert(PyUnicode_Check(obj));
|
assert(_PyUnicode_CHECK(unicode));
|
||||||
assert(!PyUnicode_IS_READY(obj));
|
assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
|
||||||
assert(!PyUnicode_IS_COMPACT(obj));
|
|
||||||
assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
|
|
||||||
assert(_PyUnicode_WSTR(unicode) != NULL);
|
assert(_PyUnicode_WSTR(unicode) != NULL);
|
||||||
assert(_PyUnicode_DATA_ANY(unicode) == NULL);
|
assert(_PyUnicode_DATA_ANY(unicode) == NULL);
|
||||||
assert(_PyUnicode_UTF8(unicode) == NULL);
|
assert(_PyUnicode_UTF8(unicode) == NULL);
|
||||||
|
@ -1154,7 +1215,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
|
||||||
assert(PyUnicode_Check(unicode));
|
assert(PyUnicode_Check(unicode));
|
||||||
assert(0 <= length);
|
assert(0 <= length);
|
||||||
|
|
||||||
if (!PyUnicode_IS_COMPACT(unicode) && !PyUnicode_IS_READY(unicode))
|
if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
|
||||||
old_length = PyUnicode_WSTR_LENGTH(unicode);
|
old_length = PyUnicode_WSTR_LENGTH(unicode);
|
||||||
else
|
else
|
||||||
old_length = PyUnicode_GET_LENGTH(unicode);
|
old_length = PyUnicode_GET_LENGTH(unicode);
|
||||||
|
@ -1907,7 +1968,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
|
||||||
case 'U':
|
case 'U':
|
||||||
{
|
{
|
||||||
PyObject *obj = va_arg(count, PyObject *);
|
PyObject *obj = va_arg(count, PyObject *);
|
||||||
assert(obj && PyUnicode_Check(obj));
|
assert(obj && _PyUnicode_CHECK(obj));
|
||||||
if (PyUnicode_READY(obj) == -1)
|
if (PyUnicode_READY(obj) == -1)
|
||||||
goto fail;
|
goto fail;
|
||||||
argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
|
argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
|
||||||
|
@ -1921,7 +1982,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
|
||||||
const char *str = va_arg(count, const char *);
|
const char *str = va_arg(count, const char *);
|
||||||
PyObject *str_obj;
|
PyObject *str_obj;
|
||||||
assert(obj || str);
|
assert(obj || str);
|
||||||
assert(!obj || PyUnicode_Check(obj));
|
assert(!obj || _PyUnicode_CHECK(obj));
|
||||||
if (obj) {
|
if (obj) {
|
||||||
if (PyUnicode_READY(obj) == -1)
|
if (PyUnicode_READY(obj) == -1)
|
||||||
goto fail;
|
goto fail;
|
||||||
|
@ -9570,7 +9631,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
|
||||||
void *data;
|
void *data;
|
||||||
Py_UCS4 chr;
|
Py_UCS4 chr;
|
||||||
|
|
||||||
assert(PyUnicode_Check(uni));
|
assert(_PyUnicode_CHECK(uni));
|
||||||
if (PyUnicode_READY(uni) == -1)
|
if (PyUnicode_READY(uni) == -1)
|
||||||
return -1;
|
return -1;
|
||||||
kind = PyUnicode_KIND(uni);
|
kind = PyUnicode_KIND(uni);
|
||||||
|
@ -12698,7 +12759,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
|
||||||
unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
|
unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
|
||||||
if (unicode == NULL)
|
if (unicode == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
assert(PyUnicode_Check(unicode));
|
assert(_PyUnicode_CHECK(unicode));
|
||||||
if (PyUnicode_READY(unicode))
|
if (PyUnicode_READY(unicode))
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
||||||
|
@ -13054,7 +13115,7 @@ unicodeiter_next(unicodeiterobject *it)
|
||||||
seq = it->it_seq;
|
seq = it->it_seq;
|
||||||
if (seq == NULL)
|
if (seq == NULL)
|
||||||
return NULL;
|
return NULL;
|
||||||
assert(PyUnicode_Check(seq));
|
assert(_PyUnicode_CHECK(seq));
|
||||||
|
|
||||||
if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
|
if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
|
||||||
int kind = PyUnicode_KIND(seq);
|
int kind = PyUnicode_KIND(seq);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue