Add _PyUnicode_CheckConsistency() macro to help debugging

* Document Unicode string states
 * Use _PyUnicode_CheckConsistency() to ensure that objects are always
   consistent.
This commit is contained in:
Victor Stinner 2011-10-03 03:20:16 +02:00
parent 4fae54cb0e
commit 910337b42e
2 changed files with 144 additions and 37 deletions

View file

@ -206,6 +206,52 @@ extern "C" {
immediately follow the structure. utf8_length and wstr_length can be found immediately follow the structure. utf8_length and wstr_length can be found
in the length field; the utf8 pointer is equal to the data pointer. */ in the length field; the utf8 pointer is equal to the data pointer. */
typedef struct { typedef struct {
/* Unicode strings can be in 4 states:
- compact ascii:
* structure = PyASCIIObject
* kind = PyUnicode_1BYTE_KIND
* compact = 1
* ascii = 1
* ready = 1
* utf8 = data
- compact:
* structure = PyCompactUnicodeObject
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 1
* ready = 1
* (ascii = 0)
- string created by the legacy API (not ready):
* structure = PyUnicodeObject
* kind = PyUnicode_WCHAR_KIND
* compact = 0
* ready = 0
* wstr is not NULL
* data.any is NULL
* utf8 is NULL
* interned = SSTATE_NOT_INTERNED
* (ascii = 0)
- string created by the legacy API, ready:
* structure = PyUnicodeObject
* kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
PyUnicode_4BYTE_KIND
* compact = 0
* ready = 1
* data.any is not NULL
* (ascii = 0)
A string created by the legacy API becomes ready when
PyUnicode_READY() is called on it.
See also _PyUnicode_CheckConsistency(). */
PyObject_HEAD PyObject_HEAD
Py_ssize_t length; /* Number of code points in the string */ Py_ssize_t length; /* Number of code points in the string */
Py_hash_t hash; /* Hash value; -1 if not set */ Py_hash_t hash; /* Hash value; -1 if not set */

View file

@ -89,6 +89,55 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
extern "C" { extern "C" {
#endif #endif
/* _PyUnicode_CHECK(op): check used inside assert() by the macros and
   functions of this file.  With Py_DEBUG defined it expands to
   _PyUnicode_CheckConsistency(op); otherwise it expands to the plain type
   check PyUnicode_Check(op). */
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
/* Raw access to the utf8 field through a PyCompactUnicodeObject cast.
   Performs no assertion on op. */
#define _PyUnicode_UTF8(op) \
(((PyCompactUnicodeObject*)(op))->utf8)
/* Checked variant: asserts that op passes _PyUnicode_CHECK and is ready.
   For a compact ASCII string it yields the address immediately following
   the PyASCIIObject structure; otherwise it yields the utf8 field.
   op appears several times in the expansion, so it should not have side
   effects. */
#define PyUnicode_UTF8(op) \
(assert(_PyUnicode_CHECK(op)), \
assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((char*)((PyASCIIObject*)(op) + 1)) : \
_PyUnicode_UTF8(op))
/* Raw access to the utf8_length field; performs no assertion on op. */
#define _PyUnicode_UTF8_LENGTH(op) \
(((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked variant: asserts that op passes _PyUnicode_CHECK and is ready.
   For a compact ASCII string it yields the length field of the
   PyASCIIObject; otherwise it yields the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
(assert(_PyUnicode_CHECK(op)), \
assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((PyASCIIObject*)(op))->length : \
_PyUnicode_UTF8_LENGTH(op))
/* Raw field accessors (wstr, wstr_length, length, state, hash).  They only
   cast op to the structure holding the field and perform no assertion. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* state.kind of op, after asserting _PyUnicode_CHECK(op).  Unlike
   PyUnicode_UTF8(), this does not assert that op is ready. */
#define _PyUnicode_KIND(op) \
(assert(_PyUnicode_CHECK(op)), \
((PyASCIIObject *)(op))->state.kind)
/* length field of op, after asserting _PyUnicode_CHECK(op).  Does not
   assert that op is ready. */
#define _PyUnicode_GET_LENGTH(op) \
(assert(_PyUnicode_CHECK(op)), \
((PyASCIIObject *)(op))->length)
/* Raw access to the data.any field through a PyUnicodeObject cast;
   performs no assertion on op. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
/* Replace any earlier definition of PyUnicode_READY with one that asserts
   _PyUnicode_CHECK(op) first.  The expression evaluates to 0 when op is
   already ready, and otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
(assert(_PyUnicode_CHECK(op)), \
(PyUnicode_IS_READY(op) ? \
0 : _PyUnicode_Ready((PyObject *)(op))))
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): op is not compact ASCII, its utf8 field is
   not NULL, and that field differs from PyUnicode_DATA(op). */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
(assert(_PyUnicode_CHECK(op)), \
(!PyUnicode_IS_COMPACT_ASCII(op) \
&& _PyUnicode_UTF8(op) \
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
/* Generic helper macro to convert characters of different types. /* Generic helper macro to convert characters of different types.
from_type and to_type have to be valid type names, begin and end from_type and to_type have to be valid type names, begin and end
are pointers to the source characters which should be of type are pointers to the source characters which should be of type
@ -104,44 +153,6 @@ extern "C" {
} \ } \
} while (0) } while (0)
/* Raw access to the utf8 field through a PyCompactUnicodeObject cast.
   Performs no assertion on op. */
#define _PyUnicode_UTF8(op) \
(((PyCompactUnicodeObject*)(op))->utf8)
/* Checked variant: asserts PyUnicode_Check(op) and that op is ready.
   For a compact ASCII string it yields the address immediately following
   the PyASCIIObject structure; otherwise it yields the utf8 field. */
#define PyUnicode_UTF8(op) \
(assert(PyUnicode_Check(op)), \
assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((char*)((PyASCIIObject*)(op) + 1)) : \
_PyUnicode_UTF8(op))
/* Raw access to the utf8_length field; performs no assertion on op. */
#define _PyUnicode_UTF8_LENGTH(op) \
(((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked variant: asserts PyUnicode_Check(op) and that op is ready.
   For a compact ASCII string it yields the length field of the
   PyASCIIObject; otherwise it yields the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op) \
(assert(PyUnicode_Check(op)), \
assert(PyUnicode_IS_READY(op)), \
PyUnicode_IS_COMPACT_ASCII(op) ? \
((PyASCIIObject*)(op))->length : \
_PyUnicode_UTF8_LENGTH(op))
/* Raw field accessors (wstr, wstr_length, length, state, hash).  They only
   cast op to the structure holding the field and perform no assertion. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* state.kind of op, after asserting PyUnicode_Check(op).  Does not assert
   that op is ready. */
#define _PyUnicode_KIND(op) \
(assert(PyUnicode_Check(op)), \
((PyASCIIObject *)(op))->state.kind)
/* length field of op, after asserting PyUnicode_Check(op).  Does not
   assert that op is ready. */
#define _PyUnicode_GET_LENGTH(op) \
(assert(PyUnicode_Check(op)), \
((PyASCIIObject *)(op))->length)
/* Raw access to the data.any field through a PyUnicodeObject cast;
   performs no assertion on op. */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): op is not compact ASCII, its utf8 field is
   not NULL, and that field differs from PyUnicode_DATA(op). */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
(assert(PyUnicode_Check(op)), \
(!PyUnicode_IS_COMPACT_ASCII(op) \
&& _PyUnicode_UTF8(op) \
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
/* The Unicode string has been modified: reset the hash */ /* The Unicode string has been modified: reset the hash */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0) #define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
@ -250,6 +261,57 @@ PyUnicode_GetMax(void)
#endif #endif
} }
#ifdef Py_DEBUG
/* Debug helper: assert that the internal state of a Unicode object is one
   of the supported combinations.

   op must point to a Unicode object (asserted with PyUnicode_Check).  The
   accepted combinations are:

     - ascii flag set:
         kind is PyUnicode_1BYTE_KIND, compact == 1, ready == 1
     - compact flag set, ascii flag clear:
         kind is 1, 2 or 4 byte, ready == 1
     - neither flag set, kind is PyUnicode_WCHAR_KIND:
         ready == 0, wstr != NULL, data.any == NULL, utf8 == NULL,
         interned == SSTATE_NOT_INTERNED
     - neither flag set, any other kind:
         kind is 1, 2 or 4 byte, ready == 1, data.any != NULL

   An inconsistency is reported by a failing assert() inside this function.
   The function itself always returns 1, which lets callers write
   assert(_PyUnicode_CheckConsistency(op)). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1) {
        /* compact ascii */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        /* compact, non-ascii; compact == 1 and ascii != 1 are already
           established by the branch conditions */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
    }
    else {
        /* Non-compact object: the utf8 and data fields live in the larger
           structures, so view op through those types as well. */
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;

        if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string, not ready */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* legacy string, ready */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
            assert(ascii->state.ascii == 0);
        }
    }
    return 1;
}
#endif
/* --- Bloom Filters ----------------------------------------------------- */ /* --- Bloom Filters ----------------------------------------------------- */
/* stuff to implement simple "bloom filters" for Unicode characters. /* stuff to implement simple "bloom filters" for Unicode characters.
@ -542,7 +604,7 @@ _PyUnicode_New(Py_ssize_t length)
static const char* static const char*
unicode_kind_name(PyObject *unicode) unicode_kind_name(PyObject *unicode)
{ {
assert(PyUnicode_Check(unicode)); assert(_PyUnicode_CHECK(unicode));
if (!PyUnicode_IS_COMPACT(unicode)) if (!PyUnicode_IS_COMPACT(unicode))
{ {
if (!PyUnicode_IS_READY(unicode)) if (!PyUnicode_IS_READY(unicode))
@ -744,7 +806,8 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
const wchar_t *iter; const wchar_t *iter;
Py_UCS4 *ucs4_out; Py_UCS4 *ucs4_out;
assert(unicode && PyUnicode_Check(unicode)); assert(unicode != NULL);
assert(_PyUnicode_CHECK(unicode));
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
ucs4_out = PyUnicode_4BYTE_DATA(unicode); ucs4_out = PyUnicode_4BYTE_DATA(unicode);
@ -771,7 +834,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
static int static int
_PyUnicode_Dirty(PyObject *unicode) _PyUnicode_Dirty(PyObject *unicode)
{ {
assert(PyUnicode_Check(unicode)); assert(_PyUnicode_CHECK(unicode));
if (Py_REFCNT(unicode) != 1) { if (Py_REFCNT(unicode) != 1) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"Cannot modify a string having more than 1 reference"); "Cannot modify a string having more than 1 reference");
@ -966,10 +1029,8 @@ _PyUnicode_Ready(PyObject *obj)
strings were created using _PyObject_New() and where no canonical strings were created using _PyObject_New() and where no canonical
representation (the str field) has been set yet aka strings representation (the str field) has been set yet aka strings
which are not yet ready. */ which are not yet ready. */
assert(PyUnicode_Check(obj)); assert(_PyUnicode_CHECK(unicode));
assert(!PyUnicode_IS_READY(obj)); assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
assert(!PyUnicode_IS_COMPACT(obj));
assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
assert(_PyUnicode_WSTR(unicode) != NULL); assert(_PyUnicode_WSTR(unicode) != NULL);
assert(_PyUnicode_DATA_ANY(unicode) == NULL); assert(_PyUnicode_DATA_ANY(unicode) == NULL);
assert(_PyUnicode_UTF8(unicode) == NULL); assert(_PyUnicode_UTF8(unicode) == NULL);
@ -1154,7 +1215,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
assert(PyUnicode_Check(unicode)); assert(PyUnicode_Check(unicode));
assert(0 <= length); assert(0 <= length);
if (!PyUnicode_IS_COMPACT(unicode) && !PyUnicode_IS_READY(unicode)) if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
old_length = PyUnicode_WSTR_LENGTH(unicode); old_length = PyUnicode_WSTR_LENGTH(unicode);
else else
old_length = PyUnicode_GET_LENGTH(unicode); old_length = PyUnicode_GET_LENGTH(unicode);
@ -1907,7 +1968,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
case 'U': case 'U':
{ {
PyObject *obj = va_arg(count, PyObject *); PyObject *obj = va_arg(count, PyObject *);
assert(obj && PyUnicode_Check(obj)); assert(obj && _PyUnicode_CHECK(obj));
if (PyUnicode_READY(obj) == -1) if (PyUnicode_READY(obj) == -1)
goto fail; goto fail;
argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj); argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
@ -1921,7 +1982,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
const char *str = va_arg(count, const char *); const char *str = va_arg(count, const char *);
PyObject *str_obj; PyObject *str_obj;
assert(obj || str); assert(obj || str);
assert(!obj || PyUnicode_Check(obj)); assert(!obj || _PyUnicode_CHECK(obj));
if (obj) { if (obj) {
if (PyUnicode_READY(obj) == -1) if (PyUnicode_READY(obj) == -1)
goto fail; goto fail;
@ -9570,7 +9631,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
void *data; void *data;
Py_UCS4 chr; Py_UCS4 chr;
assert(PyUnicode_Check(uni)); assert(_PyUnicode_CHECK(uni));
if (PyUnicode_READY(uni) == -1) if (PyUnicode_READY(uni) == -1)
return -1; return -1;
kind = PyUnicode_KIND(uni); kind = PyUnicode_KIND(uni);
@ -12698,7 +12759,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds); unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
if (unicode == NULL) if (unicode == NULL)
return NULL; return NULL;
assert(PyUnicode_Check(unicode)); assert(_PyUnicode_CHECK(unicode));
if (PyUnicode_READY(unicode)) if (PyUnicode_READY(unicode))
return NULL; return NULL;
@ -13054,7 +13115,7 @@ unicodeiter_next(unicodeiterobject *it)
seq = it->it_seq; seq = it->it_seq;
if (seq == NULL) if (seq == NULL)
return NULL; return NULL;
assert(PyUnicode_Check(seq)); assert(_PyUnicode_CHECK(seq));
if (it->it_index < PyUnicode_GET_LENGTH(seq)) { if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
int kind = PyUnicode_KIND(seq); int kind = PyUnicode_KIND(seq);