Add _PyUnicode_CheckConsistency() function to help debugging

* Document Unicode string states
* Use _PyUnicode_CheckConsistency() to ensure that objects are always consistent.
2025-10-07 07:31:46 +00:00 · 2011-10-03 03:20:16 +02:00 · 2011-10-03 03:20:16 +02:00 · 910337b42e
commit 910337b42e
parent 4fae54cb0e
2 changed files with 144 additions and 37 deletions
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@ -206,6 +206,52 @@ extern "C" {
   immediately follow the structure. utf8_length and wstr_length can be found
   in the length field; the utf8 pointer is equal to the data pointer. */
 typedef struct {
    /* Unicode strings can be in 4 states:
       - compact ascii:
         * structure = PyASCIIObject
         * kind = PyUnicode_1BYTE_KIND
         * compact = 1
         * ascii = 1
         * ready = 1
         * utf8 = data
       - compact:
         * structure = PyCompactUnicodeObject
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 1
         * ready = 1
         * (ascii = 0)
       - string created by the legacy API (not ready):
         * structure = PyUnicodeObject
         * kind = PyUnicode_WCHAR_KIND
         * compact = 0
         * ready = 0
         * wstr is not NULL
         * data.any is NULL
         * utf8 is NULL
         * interned = SSTATE_NOT_INTERNED
         * (ascii = 0)
       - string created by the legacy API, ready:
         * structure = PyUnicodeObject
         * kind = PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
           PyUnicode_4BYTE_KIND
         * compact = 0
         * ready = 1
         * data.any is not NULL
         * (ascii = 0)
       A string created by the legacy API becomes ready when
       PyUnicode_READY() is called on it.
       See also _PyUnicode_CheckConsistency(). */
    PyObject_HEAD
    Py_ssize_t length;          /* Number of code points in the string */
    Py_hash_t hash;             /* Hash value; -1 if not set */
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -89,6 +89,55 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 extern "C" {
 #endif
 /* _PyUnicode_CHECK(op): type check used by the assertions below.
    In Py_DEBUG builds it expands to _PyUnicode_CheckConsistency(op);
    otherwise it expands to PyUnicode_Check(op).

    Note: the accessor macros in this group expand `op` more than once,
    so `op` should be an expression without side effects. */
 #ifdef Py_DEBUG
 #  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
 #else
 #  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
 #endif
 /* Raw access to the utf8 field of PyCompactUnicodeObject (no checks). */
 #define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
 /* Checked access to the UTF-8 buffer: asserts that op passes
    _PyUnicode_CHECK and is ready.  For a compact ASCII string the result
    is the memory immediately following the PyASCIIObject structure;
    otherwise it is the utf8 field. */
 #define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
 /* Raw access to the utf8_length field of PyCompactUnicodeObject
    (no checks). */
 #define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
 /* Checked access to the UTF-8 length: asserts that op passes
    _PyUnicode_CHECK and is ready.  For a compact ASCII string the result
    is the length field of PyASCIIObject; otherwise it is the utf8_length
    field. */
 #define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
 /* Unchecked field accessors: each one casts op to the structure that
    declares the field and yields the field itself. */
 #define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
 #define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
 #define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
 #define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
 #define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
 /* Same as reading state.kind directly, but asserts _PyUnicode_CHECK(op)
    first. */
 #define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
 /* Same as reading the length field directly, but asserts
    _PyUnicode_CHECK(op) first. */
 #define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
 /* Raw access to the data.any pointer of PyUnicodeObject (no checks). */
 #define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
 /* Replace any earlier definition of PyUnicode_READY (presumably the one
    from the public header -- TODO confirm) with a version that asserts
    _PyUnicode_CHECK(op).  Evaluates to 0 when op is already ready,
    otherwise to the result of _PyUnicode_Ready(). */
 #undef PyUnicode_READY
 #define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
 /* true if the Unicode object has an allocated UTF-8 memory block
    (not shared with other data) */
 #define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
 /* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
@ -104,44 +153,6 @@ extern "C" {
        }                                               \
    } while (0)
 /* Raw access to the utf8 field of PyCompactUnicodeObject (no checks).
    Note: the macros in this group expand `op` more than once, so `op`
    should be an expression without side effects. */
 #define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
 /* Checked access to the UTF-8 buffer: asserts PyUnicode_Check(op) and
    that op is ready.  For a compact ASCII string the result is the memory
    immediately following the PyASCIIObject structure; otherwise it is the
    utf8 field. */
 #define PyUnicode_UTF8(op)                              \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
 /* Raw access to the utf8_length field of PyCompactUnicodeObject
    (no checks). */
 #define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
 /* Checked access to the UTF-8 length: for a compact ASCII string the
    result is the length field of PyASCIIObject; otherwise it is the
    utf8_length field. */
 #define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(PyUnicode_Check(op)),                       \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
 /* Unchecked field accessors: each one casts op to the structure that
    declares the field and yields the field itself. */
 #define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
 #define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
 #define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
 #define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
 #define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
 /* Same as reading state.kind directly, but asserts PyUnicode_Check(op)
    first. */
 #define _PyUnicode_KIND(op) \
    (assert(PyUnicode_Check(op)), \
     ((PyASCIIObject *)(op))->state.kind)
 /* Same as reading the length field directly, but asserts
    PyUnicode_Check(op) first. */
 #define _PyUnicode_GET_LENGTH(op)                \
    (assert(PyUnicode_Check(op)),               \
     ((PyASCIIObject *)(op))->length)
 /* Raw access to the data.any pointer of PyUnicodeObject (no checks). */
 #define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
 /* true if the Unicode object has an allocated UTF-8 memory block
    (not shared with other data) */
 #define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(PyUnicode_Check(op)),                       \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
 /* The Unicode string has been modified: reset the hash */
 #define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
@ -250,6 +261,57 @@ PyUnicode_GetMax(void)
 #endif
 }
 #ifdef Py_DEBUG
 /* Debug helper: assert that the state flags and pointers of a Unicode
    object describe one of the four supported string states (compact ASCII,
    compact, legacy not ready, legacy ready).

    op must point to an object for which PyUnicode_Check() is true.

    Any inconsistency trips an assert(); otherwise the function returns 1,
    so the call itself can be wrapped in assert(). */
 static int
 _PyUnicode_CheckConsistency(void *op)
 {
    PyASCIIObject *ascii;
    unsigned int kind;
    assert(PyUnicode_Check(op));
    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;
    if (ascii->state.ascii == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ready == 1);
    }
    else if (ascii->state.compact == 1) {
        /* compact, non-ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.compact == 1);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
    } else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        PyUnicodeObject *unicode = (PyUnicodeObject *)op;
        if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string, not ready: only the wstr buffer is set.
               The flags are compared with 0 explicitly; the earlier
               spelling `!flag == 1` parsed as `(!flag) == 1`, which is
               equivalent but easy to misread as `!(flag == 1)`. */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(unicode->data.any == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
        else {
            /* legacy string, ready: the canonical data pointer is set */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(unicode->data.any != NULL);
            assert(ascii->state.ascii == 0);
        }
    }
    return 1;
 }
 #endif
 /* --- Bloom Filters ----------------------------------------------------- */
 /* stuff to implement simple "bloom filters" for Unicode characters.
@ -542,7 +604,7 @@ _PyUnicode_New(Py_ssize_t length)
 static const char*
 unicode_kind_name(PyObject *unicode)
 {
-    assert(PyUnicode_Check(unicode));
+    assert(_PyUnicode_CHECK(unicode));
    if (!PyUnicode_IS_COMPACT(unicode))
    {
        if (!PyUnicode_IS_READY(unicode))
@ -744,7 +806,8 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
    const wchar_t *iter;
    Py_UCS4 *ucs4_out;
-    assert(unicode && PyUnicode_Check(unicode));
+    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
@ -771,7 +834,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
 static int
 _PyUnicode_Dirty(PyObject *unicode)
 {
-    assert(PyUnicode_Check(unicode));
+    assert(_PyUnicode_CHECK(unicode));
    if (Py_REFCNT(unicode) != 1) {
        PyErr_SetString(PyExc_ValueError,
                        "Cannot modify a string having more than 1 reference");
@ -966,10 +1029,8 @@ _PyUnicode_Ready(PyObject *obj)
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
-    assert(PyUnicode_Check(obj));
+    assert(_PyUnicode_CHECK(unicode));
-    assert(!PyUnicode_IS_READY(obj));
+    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(!PyUnicode_IS_COMPACT(obj));
    assert(_PyUnicode_KIND(obj) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
@ -1154,7 +1215,7 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
    assert(PyUnicode_Check(unicode));
    assert(0 <= length);
-    if (!PyUnicode_IS_COMPACT(unicode) && !PyUnicode_IS_READY(unicode))
+    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
        old_length = PyUnicode_WSTR_LENGTH(unicode);
    else
        old_length = PyUnicode_GET_LENGTH(unicode);
@ -1907,7 +1968,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
            case 'U':
            {
                PyObject *obj = va_arg(count, PyObject *);
-                assert(obj && PyUnicode_Check(obj));
+                assert(obj && _PyUnicode_CHECK(obj));
                if (PyUnicode_READY(obj) == -1)
                    goto fail;
                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
@ -1921,7 +1982,7 @@ PyUnicode_FromFormatV(const char *format, va_list vargs)
                const char *str = va_arg(count, const char *);
                PyObject *str_obj;
                assert(obj || str);
-                assert(!obj || PyUnicode_Check(obj));
+                assert(!obj || _PyUnicode_CHECK(obj));
                if (obj) {
                    if (PyUnicode_READY(obj) == -1)
                        goto fail;
@ -9570,7 +9631,7 @@ PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
    void *data;
    Py_UCS4 chr;
-    assert(PyUnicode_Check(uni));
+    assert(_PyUnicode_CHECK(uni));
    if (PyUnicode_READY(uni) == -1)
        return -1;
    kind = PyUnicode_KIND(uni);
@ -12698,7 +12759,7 @@ unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
    unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
    if (unicode == NULL)
        return NULL;
-    assert(PyUnicode_Check(unicode));
+    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode))
        return NULL;
@ -13054,7 +13115,7 @@ unicodeiter_next(unicodeiterobject *it)
    seq = it->it_seq;
    if (seq == NULL)
        return NULL;
-    assert(PyUnicode_Check(seq));
+    assert(_PyUnicode_CHECK(seq));
    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
        int kind = PyUnicode_KIND(seq);