[3.13] gh-128013: fix data race in PyUnicode_AsUTF8AndSize on free-threading (#128021) (#128417)

This commit is contained in:
Kumar Aditya 2025-01-02 22:10:17 +05:30 committed by GitHub
parent 24cddab4a2
commit fa6c48e4b3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 126 additions and 60 deletions

View file

@ -1,7 +1,7 @@
import unittest import unittest
import sys import sys
from test import support from test import support
from test.support import import_helper from test.support import threading_helper, import_helper
try: try:
import _testcapi import _testcapi
@ -959,6 +959,24 @@ class CAPITest(unittest.TestCase):
self.assertRaises(TypeError, unicode_asutf8, [], 0) self.assertRaises(TypeError, unicode_asutf8, [], 0)
# CRASHES unicode_asutf8(NULL, 0) # CRASHES unicode_asutf8(NULL, 0)
@unittest.skipIf(_testcapi is None, 'need _testcapi module')
@threading_helper.requires_working_threading()
def test_asutf8_race(self):
"""Test that there's no race condition in PyUnicode_AsUTF8()"""
unicode_asutf8 = _testcapi.unicode_asutf8
from threading import Thread
data = "😊"
def worker():
for _ in range(1000):
self.assertEqual(unicode_asutf8(data, 5), b'\xf0\x9f\x98\x8a\0')
threads = [Thread(target=worker) for _ in range(10)]
with threading_helper.start_threads(threads):
pass
@support.cpython_only @support.cpython_only
@unittest.skipIf(_testlimitedcapi is None, 'need _testlimitedcapi module') @unittest.skipIf(_testlimitedcapi is None, 'need _testlimitedcapi module')
def test_asutf8andsize(self): def test_asutf8andsize(self):

View file

@ -111,20 +111,42 @@ NOTE: In the interpreter's initialization phase, some globals are currently
# define _PyUnicode_CHECK(op) PyUnicode_Check(op) # define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif #endif
#define _PyUnicode_UTF8(op) \ static inline char* _PyUnicode_UTF8(PyObject *op)
(_PyCompactUnicodeObject_CAST(op)->utf8) {
#define PyUnicode_UTF8(op) \ return FT_ATOMIC_LOAD_PTR_ACQUIRE(_PyCompactUnicodeObject_CAST(op)->utf8);
(assert(_PyUnicode_CHECK(op)), \ }
PyUnicode_IS_COMPACT_ASCII(op) ? \
((char*)(_PyASCIIObject_CAST(op) + 1)) : \ static inline char* PyUnicode_UTF8(PyObject *op)
_PyUnicode_UTF8(op)) {
#define _PyUnicode_UTF8_LENGTH(op) \ assert(_PyUnicode_CHECK(op));
(_PyCompactUnicodeObject_CAST(op)->utf8_length) if (PyUnicode_IS_COMPACT_ASCII(op)) {
#define PyUnicode_UTF8_LENGTH(op) \ return ((char*)(_PyASCIIObject_CAST(op) + 1));
(assert(_PyUnicode_CHECK(op)), \ }
PyUnicode_IS_COMPACT_ASCII(op) ? \ else {
_PyASCIIObject_CAST(op)->length : \ return _PyUnicode_UTF8(op);
_PyUnicode_UTF8_LENGTH(op)) }
}
static inline void PyUnicode_SET_UTF8(PyObject *op, char *utf8)
{
FT_ATOMIC_STORE_PTR_RELEASE(_PyCompactUnicodeObject_CAST(op)->utf8, utf8);
}
static inline Py_ssize_t PyUnicode_UTF8_LENGTH(PyObject *op)
{
assert(_PyUnicode_CHECK(op));
if (PyUnicode_IS_COMPACT_ASCII(op)) {
return _PyASCIIObject_CAST(op)->length;
}
else {
return _PyCompactUnicodeObject_CAST(op)->utf8_length;
}
}
static inline void PyUnicode_SET_UTF8_LENGTH(PyObject *op, Py_ssize_t length)
{
_PyCompactUnicodeObject_CAST(op)->utf8_length = length;
}
#define _PyUnicode_LENGTH(op) \ #define _PyUnicode_LENGTH(op) \
(_PyASCIIObject_CAST(op)->length) (_PyASCIIObject_CAST(op)->length)
@ -132,26 +154,37 @@ NOTE: In the interpreter's initialization phase, some globals are currently
(_PyASCIIObject_CAST(op)->state) (_PyASCIIObject_CAST(op)->state)
#define _PyUnicode_HASH(op) \ #define _PyUnicode_HASH(op) \
(_PyASCIIObject_CAST(op)->hash) (_PyASCIIObject_CAST(op)->hash)
#define _PyUnicode_KIND(op) \
(assert(_PyUnicode_CHECK(op)), \ static inline Py_hash_t PyUnicode_HASH(PyObject *op)
_PyASCIIObject_CAST(op)->state.kind) {
#define _PyUnicode_GET_LENGTH(op) \ assert(_PyUnicode_CHECK(op));
(assert(_PyUnicode_CHECK(op)), \ return FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash);
_PyASCIIObject_CAST(op)->length) }
static inline void PyUnicode_SET_HASH(PyObject *op, Py_hash_t hash)
{
FT_ATOMIC_STORE_SSIZE_RELAXED(_PyASCIIObject_CAST(op)->hash, hash);
}
#define _PyUnicode_DATA_ANY(op) \ #define _PyUnicode_DATA_ANY(op) \
(_PyUnicodeObject_CAST(op)->data.any) (_PyUnicodeObject_CAST(op)->data.any)
#define _PyUnicode_SHARE_UTF8(op) \ static inline int _PyUnicode_SHARE_UTF8(PyObject *op)
(assert(_PyUnicode_CHECK(op)), \ {
assert(!PyUnicode_IS_COMPACT_ASCII(op)), \ assert(_PyUnicode_CHECK(op));
(_PyUnicode_UTF8(op) == PyUnicode_DATA(op))) assert(!PyUnicode_IS_COMPACT_ASCII(op));
return (_PyUnicode_UTF8(op) == PyUnicode_DATA(op));
}
/* true if the Unicode object has an allocated UTF-8 memory block /* true if the Unicode object has an allocated UTF-8 memory block
(not shared with other data) */ (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \ static inline int _PyUnicode_HAS_UTF8_MEMORY(PyObject *op)
((!PyUnicode_IS_COMPACT_ASCII(op) \ {
&& _PyUnicode_UTF8(op) \ return (!PyUnicode_IS_COMPACT_ASCII(op)
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op))) && _PyUnicode_UTF8(op) != NULL
&& _PyUnicode_UTF8(op) != PyUnicode_DATA(op));
}
/* Generic helper macro to convert characters of different types. /* Generic helper macro to convert characters of different types.
from_type and to_type have to be valid type names, begin and end from_type and to_type have to be valid type names, begin and end
@ -650,7 +683,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
|| kind == PyUnicode_2BYTE_KIND || kind == PyUnicode_2BYTE_KIND
|| kind == PyUnicode_4BYTE_KIND); || kind == PyUnicode_4BYTE_KIND);
CHECK(ascii->state.ascii == 0); CHECK(ascii->state.ascii == 0);
CHECK(compact->utf8 != data); CHECK(_PyUnicode_UTF8(op) != data);
} }
else { else {
PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op); PyUnicodeObject *unicode = _PyUnicodeObject_CAST(op);
@ -662,16 +695,17 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
CHECK(ascii->state.compact == 0); CHECK(ascii->state.compact == 0);
CHECK(data != NULL); CHECK(data != NULL);
if (ascii->state.ascii) { if (ascii->state.ascii) {
CHECK(compact->utf8 == data); CHECK(_PyUnicode_UTF8(op) == data);
CHECK(compact->utf8_length == ascii->length); CHECK(compact->utf8_length == ascii->length);
} }
else { else {
CHECK(compact->utf8 != data); CHECK(_PyUnicode_UTF8(op) != data);
} }
} }
#ifndef Py_GIL_DISABLED
if (compact->utf8 == NULL) if (_PyUnicode_UTF8(op) == NULL)
CHECK(compact->utf8_length == 0); CHECK(compact->utf8_length == 0);
#endif
} }
/* check that the best kind is used: O(n) operation */ /* check that the best kind is used: O(n) operation */
@ -1115,8 +1149,8 @@ resize_compact(PyObject *unicode, Py_ssize_t length)
if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) { if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
PyMem_Free(_PyUnicode_UTF8(unicode)); PyMem_Free(_PyUnicode_UTF8(unicode));
_PyUnicode_UTF8(unicode) = NULL; PyUnicode_SET_UTF8_LENGTH(unicode, 0);
_PyUnicode_UTF8_LENGTH(unicode) = 0; PyUnicode_SET_UTF8(unicode, NULL);
} }
#ifdef Py_TRACE_REFS #ifdef Py_TRACE_REFS
_Py_ForgetReference(unicode); _Py_ForgetReference(unicode);
@ -1169,8 +1203,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode)) if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
{ {
PyMem_Free(_PyUnicode_UTF8(unicode)); PyMem_Free(_PyUnicode_UTF8(unicode));
_PyUnicode_UTF8(unicode) = NULL; PyUnicode_SET_UTF8_LENGTH(unicode, 0);
_PyUnicode_UTF8_LENGTH(unicode) = 0; PyUnicode_SET_UTF8(unicode, NULL);
} }
data = (PyObject *)PyObject_Realloc(data, new_size); data = (PyObject *)PyObject_Realloc(data, new_size);
@ -1180,8 +1214,8 @@ resize_inplace(PyObject *unicode, Py_ssize_t length)
} }
_PyUnicode_DATA_ANY(unicode) = data; _PyUnicode_DATA_ANY(unicode) = data;
if (share_utf8) { if (share_utf8) {
_PyUnicode_UTF8(unicode) = data; PyUnicode_SET_UTF8_LENGTH(unicode, length);
_PyUnicode_UTF8_LENGTH(unicode) = length; PyUnicode_SET_UTF8(unicode, data);
} }
_PyUnicode_LENGTH(unicode) = length; _PyUnicode_LENGTH(unicode) = length;
PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0); PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
@ -1411,12 +1445,12 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
assert(unicode != NULL); assert(unicode != NULL);
assert(_PyUnicode_CHECK(unicode)); assert(_PyUnicode_CHECK(unicode));
assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND); assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
ucs4_out = PyUnicode_4BYTE_DATA(unicode); ucs4_out = PyUnicode_4BYTE_DATA(unicode);
for (iter = begin; iter < end; ) { for (iter = begin; iter < end; ) {
assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) + assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
_PyUnicode_GET_LENGTH(unicode))); PyUnicode_GET_LENGTH(unicode)));
if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
&& (iter+1) < end && (iter+1) < end
&& Py_UNICODE_IS_LOW_SURROGATE(iter[1])) && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
@ -1430,7 +1464,7 @@ unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
} }
} }
assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) + assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
_PyUnicode_GET_LENGTH(unicode))); PyUnicode_GET_LENGTH(unicode)));
} }
#endif #endif
@ -1801,7 +1835,7 @@ unicode_modifiable(PyObject *unicode)
assert(_PyUnicode_CHECK(unicode)); assert(_PyUnicode_CHECK(unicode));
if (Py_REFCNT(unicode) != 1) if (Py_REFCNT(unicode) != 1)
return 0; return 0;
if (FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(unicode)) != -1) if (PyUnicode_HASH(unicode) != -1)
return 0; return 0;
if (PyUnicode_CHECK_INTERNED(unicode)) if (PyUnicode_CHECK_INTERNED(unicode))
return 0; return 0;
@ -4052,6 +4086,21 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
static int unicode_fill_utf8(PyObject *unicode); static int unicode_fill_utf8(PyObject *unicode);
static int
unicode_ensure_utf8(PyObject *unicode)
{
int err = 0;
if (PyUnicode_UTF8(unicode) == NULL) {
Py_BEGIN_CRITICAL_SECTION(unicode);
if (PyUnicode_UTF8(unicode) == NULL) {
err = unicode_fill_utf8(unicode);
}
Py_END_CRITICAL_SECTION();
}
return err;
}
const char * const char *
PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize) PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
{ {
@ -4063,13 +4112,11 @@ PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
return NULL; return NULL;
} }
if (PyUnicode_UTF8(unicode) == NULL) { if (unicode_ensure_utf8(unicode) == -1) {
if (unicode_fill_utf8(unicode) == -1) { if (psize) {
if (psize) { *psize = -1;
*psize = -1;
}
return NULL;
} }
return NULL;
} }
if (psize) { if (psize) {
@ -5401,6 +5448,7 @@ unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
static int static int
unicode_fill_utf8(PyObject *unicode) unicode_fill_utf8(PyObject *unicode)
{ {
_Py_CRITICAL_SECTION_ASSERT_OBJECT_LOCKED(unicode);
/* the string cannot be ASCII, or PyUnicode_UTF8() would be set */ /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
assert(!PyUnicode_IS_ASCII(unicode)); assert(!PyUnicode_IS_ASCII(unicode));
@ -5442,10 +5490,10 @@ unicode_fill_utf8(PyObject *unicode)
PyErr_NoMemory(); PyErr_NoMemory();
return -1; return -1;
} }
_PyUnicode_UTF8(unicode) = cache;
_PyUnicode_UTF8_LENGTH(unicode) = len;
memcpy(cache, start, len); memcpy(cache, start, len);
cache[len] = '\0'; cache[len] = '\0';
PyUnicode_SET_UTF8_LENGTH(unicode, len);
PyUnicode_SET_UTF8(unicode, cache);
_PyBytesWriter_Dealloc(&writer); _PyBytesWriter_Dealloc(&writer);
return 0; return 0;
} }
@ -10996,9 +11044,9 @@ _PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
return 0; return 0;
} }
Py_hash_t right_hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(right_uni)); Py_hash_t right_hash = PyUnicode_HASH(right_uni);
assert(right_hash != -1); assert(right_hash != -1);
Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(left)); Py_hash_t hash = PyUnicode_HASH(left);
if (hash != -1 && hash != right_hash) { if (hash != -1 && hash != right_hash) {
return 0; return 0;
} }
@ -11484,14 +11532,14 @@ unicode_hash(PyObject *self)
#ifdef Py_DEBUG #ifdef Py_DEBUG
assert(_Py_HashSecret_Initialized); assert(_Py_HashSecret_Initialized);
#endif #endif
Py_hash_t hash = FT_ATOMIC_LOAD_SSIZE_RELAXED(_PyUnicode_HASH(self)); Py_hash_t hash = PyUnicode_HASH(self);
if (hash != -1) { if (hash != -1) {
return hash; return hash;
} }
x = _Py_HashBytes(PyUnicode_DATA(self), x = _Py_HashBytes(PyUnicode_DATA(self),
PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self)); PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
FT_ATOMIC_STORE_SSIZE_RELAXED(_PyUnicode_HASH(self), x); PyUnicode_SET_HASH(self, x);
return x; return x;
} }
@ -14888,8 +14936,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
_PyUnicode_STATE(self).compact = 0; _PyUnicode_STATE(self).compact = 0;
_PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii; _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
_PyUnicode_STATE(self).statically_allocated = 0; _PyUnicode_STATE(self).statically_allocated = 0;
_PyUnicode_UTF8_LENGTH(self) = 0; PyUnicode_SET_UTF8_LENGTH(self, 0);
_PyUnicode_UTF8(self) = NULL; PyUnicode_SET_UTF8(self, NULL);
_PyUnicode_DATA_ANY(self) = NULL; _PyUnicode_DATA_ANY(self) = NULL;
share_utf8 = 0; share_utf8 = 0;
@ -14919,8 +14967,8 @@ unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
_PyUnicode_DATA_ANY(self) = data; _PyUnicode_DATA_ANY(self) = data;
if (share_utf8) { if (share_utf8) {
_PyUnicode_UTF8_LENGTH(self) = length; PyUnicode_SET_UTF8_LENGTH(self, length);
_PyUnicode_UTF8(self) = data; PyUnicode_SET_UTF8(self, data);
} }
memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1)); memcpy(data, PyUnicode_DATA(unicode), kind * (length + 1));