bpo-39087: Add _PyUnicode_GetUTF8Buffer() (GH-17659)

Co-authored-by: Victor Stinner <vstinner@python.org>
2025-10-09 16:34:44 +00:00 · 2020-03-14 12:43:18 +09:00 · 2020-03-14 12:43:18 +09:00 · c7ad974d34
commit c7ad974d34
parent 8fb02b6e19
5 changed files with 284 additions and 6 deletions
--- a/Include/cpython/unicodeobject.h
+++ b/Include/cpython/unicodeobject.h
@ -734,6 +734,19 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
 /* --- Manage the default encoding ---------------------------------------- */
 /* Get a buffer to the UTF-8 encoding of the Unicode object unicode.
   Returns 0 on success and -1 on error.
   On success, view->obj is either unicode itself (when its existing UTF-8
   representation is exposed directly) or a bytes object holding the
   encoded data.  Unlike PyUnicode_AsUTF8AndSize(), this function does not
   create a UTF-8 cache on the Unicode object.
   Each successful call must be paired with a call to PyBuffer_Release().
 */
 PyAPI_FUNC(int) _PyUnicode_GetUTF8Buffer(
    PyObject *unicode,      /* Unicode object */
    const char *errors,     /* error handler name passed to the encoder; may be NULL */
    Py_buffer *view         /* (out) buffer to the UTF-8 encoding */
    );
 /* Returns a pointer to the default encoding (UTF-8) of the
   Unicode object unicode and the size of the encoded representation
   in bytes stored in *size.
@ -746,12 +759,6 @@ PyAPI_FUNC(void*) _PyUnicode_AsKind(PyObject *s, unsigned int kind);
   _PyUnicode_AsStringAndSize is a #define for PyUnicode_AsUTF8AndSize to
   support the previous internal function with the same behaviour.
   *** This API is for interpreter INTERNAL USE ONLY and will likely
   *** be removed or changed in the future.
   *** If you need to access the Unicode object as UTF-8 bytes string,
   *** please use PyUnicode_AsUTF8String() instead.
 */
 PyAPI_FUNC(const char *) PyUnicode_AsUTF8AndSize(
--- a/Lib/test/test_unicode.py
+++ b/Lib/test/test_unicode.py
@ -2830,6 +2830,28 @@ class CAPITest(unittest.TestCase):
            self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
            self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
    # Test _PyUnicode_GetUTF8Buffer()
    @support.cpython_only
    def test_getutf8buffer(self):
        from _testcapi import unicode_getutf8buffer, unicode_test_getutf8buffer
        # Run the checks implemented in C; they raise an error on failure.
        unicode_test_getutf8buffer()
        # (input string, expected UTF-8 bytes) pairs, checked in order.
        encodable_cases = (
            ("foo", b'foo'),                        # ASCII
            ('\u0100', b'\xc4\x80'),                # BMP, two bytes
            ('\uffff', b'\xef\xbf\xbf'),            # BMP, three bytes
            (chr(0x10ffff), b'\xf4\x8f\xbf\xbf'),   # non-BMP, four bytes
        )
        for text, expected in encodable_cases:
            self.assertEqual(unicode_getutf8buffer(text), expected)
        # Lone surrogates are rejected by the default error handler and
        # encoded as-is with "surrogatepass".
        lone_surrogates = 'a\ud800b\udfffc'
        with self.assertRaises(UnicodeEncodeError):
            unicode_getutf8buffer(lone_surrogates)
        self.assertEqual(
            unicode_getutf8buffer(lone_surrogates, "surrogatepass"),
            b'a\xed\xa0\x80b\xed\xbf\xbfc')
    # Test PyUnicode_AsUTF8()
    @support.cpython_only
    def test_asutf8(self):
--- a/API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst
+++ b/API/2019-12-19-21-19-53.bpo-39087.l4A11-.rst
@ -0,0 +1,2 @@
 Add new ``_PyUnicode_GetUTF8Buffer`` private API to get the UTF-8 encoding
 of a unicode object without cache or extra allocation.
--- a/Modules/_testcapimodule.c
+++ b/Modules/_testcapimodule.c
@ -1967,6 +1967,216 @@ unicode_asutf8andsize(PyObject *self, PyObject *args)
    return Py_BuildValue("(Nn)", result, utf8_len);
 }
 /* Test helper exposing _PyUnicode_GetUTF8Buffer() to Python.
    Arguments: (unicode[, errors]).  Returns a new bytes object copied from
    the buffer, or NULL if argument parsing or the call itself fails. */
 static PyObject *
 unicode_getutf8buffer(PyObject *self, PyObject *args)
 {
    PyObject *text;
    const char *error_handler = NULL;
    Py_buffer view;
    PyObject *encoded;
    int status;
    if (!PyArg_ParseTuple(args, "O|s", &text, &error_handler)) {
        return NULL;
    }
    status = _PyUnicode_GetUTF8Buffer(text, error_handler, &view);
    if (status < 0) {
        return NULL;
    }
    // The exporter is either the string itself or an exact bytes object.
    assert(view.obj != NULL);
    assert(view.obj == text || PyBytes_CheckExact(view.obj));
    // Copy the data out before the buffer is released.
    encoded = PyBytes_FromStringAndSize(view.buf, view.len);
    PyBuffer_Release(&view);
    return encoded;
 }
 /* C-level checks for _PyUnicode_GetUTF8Buffer().
    Three cases are exercised:
      1. an ASCII string: the buffer must point at the string's own data
         and hold one extra reference to the string;
      2. a non-ASCII string: the buffer must be exported by a bytes object
         and must not change the string's reference count;
      3. the same string after PyUnicode_AsUTF8() has been called: the
         buffer must point at the pointer PyUnicode_AsUTF8() returned.
    Returns None when every check passes, otherwise NULL with an exception
    set (TestError for failed checks). */
 static PyObject *
 unicode_test_getutf8buffer(PyObject *self, PyObject *Py_UNUSED(ignored))
 {
    Py_buffer buf;
    // Test 1: ASCII string
    PyObject *str = PyUnicode_FromString("hello");
    if (str == NULL) {
        return NULL;
    }
    Py_ssize_t refcnt = Py_REFCNT(str);
    // _PyUnicode_GetUTF8Buffer() must not fail for ASCII string.
    // Checked at runtime instead of with assert(): when asserts are
    // compiled out, a failure would leave buf uninitialized below.
    if (_PyUnicode_GetUTF8Buffer(str, NULL,  &buf) != 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }
    if (buf.obj != str) {
        PyErr_Format(TestError,
                     "buf.obj must be equal to str. (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    if (buf.len != PyUnicode_GET_LENGTH(str)) {
        PyErr_Format(TestError,
                     "buf.len must be equal to len(str). (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    // The exported data must be NUL-terminated.
    assert(((const char*)buf.buf)[5] == '\0');
    if ((Py_UCS1*)buf.buf != PyUnicode_1BYTE_DATA(str)) {
        PyErr_Format(TestError,
                     "buf.buf must be equal to PyUnicode_1BYTE_DATA(str). (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    // While the buffer is held, it owns one reference to str.
    if (refcnt + 1 != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt + 1, Py_REFCNT(str),
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    PyBuffer_Release(&buf);
    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt, Py_REFCNT(str),
                     __FILE__, __LINE__);
        Py_DECREF(str);
        return NULL;
    }
    Py_DECREF(str);
    // Test 2: non-ASCII string
    // "hello" in Japanese.  len(str)==5, len(str.encode()) == 15.
    str = PyUnicode_FromString("\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf");
    if (str == NULL) {
        return NULL;
    }
    refcnt = Py_REFCNT(str);
    assert(PyUnicode_GET_LENGTH(str) == 5);
    if (_PyUnicode_GetUTF8Buffer(str, NULL,  &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }
    if (!PyBytes_CheckExact(buf.obj)) {
        PyErr_Format(TestError,
                     "buf.obj must be a bytes object, got %R (%s:%d)",
                     buf.obj, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    if (buf.len != 15) {
        PyErr_Format(TestError,
                     "Expected buf.len == 15, actual %zd (%s:%d)",
                     buf.len, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    assert(((const char*)buf.buf)[15] == '\0');
    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str) must not be changed. (%s:%d)",
                     __FILE__, __LINE__);
        // buf.obj was verified above to be a bytes object, so releasing
        // the buffer does not touch str and avoids leaking the bytes.
        PyBuffer_Release(&buf);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }
    PyBuffer_Release(&buf);
    // Test 3: There is a UTF-8 cache
    // Reuse str of the previous test.
    const char *cache = PyUnicode_AsUTF8(str);
    if (cache == NULL) {
        // str is still owned by this function; release it before bailing.
        Py_DECREF(str);
        return NULL;
    }
    if (_PyUnicode_GetUTF8Buffer(str, NULL,  &buf) < 0) {
        Py_DECREF(str);
        if (!PyErr_Occurred()) {
            PyErr_Format(TestError,
                         "_PyUnicode_GetUTF8Buffer() returned nonzero "
                         "without exception set. (%s:%d)",
                         __FILE__, __LINE__);
        }
        return NULL;
    }
    if (buf.obj != str) {
        PyErr_Format(TestError,
                     "buf.obj must be equal to str. (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    if (buf.buf != cache) {
        PyErr_Format(TestError,
                     "buf.buf must be equal to the UTF-8 cache (%s:%d)",
                     __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    if (buf.len != 15) {
        PyErr_Format(TestError,
                     "Expected buf.len == 15, actual %zd (%s:%d)",
                     buf.len, __FILE__, __LINE__);
        PyBuffer_Release(&buf);
        Py_DECREF(str);
        return NULL;
    }
    assert(((const char*)buf.buf)[15] == '\0');
    if (refcnt + 1 != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt + 1, Py_REFCNT(str),
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }
    PyBuffer_Release(&buf);
    if (refcnt != Py_REFCNT(str)) {
        PyErr_Format(TestError,
                     "Py_REFCNT(str); expected %zd, got %zd. (%s:%d)",
                     refcnt, Py_REFCNT(str),
                     __FILE__, __LINE__);
        // Do not DECREF here because refcnt is broken.
        return NULL;
    }
    Py_DECREF(str);
    Py_RETURN_NONE;
 }
 static PyObject *
 unicode_findchar(PyObject *self, PyObject *args)
 {
@ -5392,6 +5602,8 @@ static PyMethodDef TestMethods[] = {
    {"unicode_asucs4",          unicode_asucs4,                  METH_VARARGS},
    {"unicode_asutf8",          unicode_asutf8,                  METH_VARARGS},
    {"unicode_asutf8andsize",   unicode_asutf8andsize,           METH_VARARGS},
    {"unicode_getutf8buffer",   unicode_getutf8buffer,           METH_VARARGS},
    {"unicode_test_getutf8buffer", unicode_test_getutf8buffer,   METH_NOARGS},
    {"unicode_findchar",        unicode_findchar,                METH_VARARGS},
    {"unicode_copycharacters",  unicode_copycharacters,          METH_VARARGS},
    {"unicode_encodedecimal",   unicode_encodedecimal,           METH_VARARGS},
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -3991,6 +3991,41 @@ PyUnicode_FSDecoder(PyObject* arg, void* addr)
 }
 /* Fill *view with a buffer over the UTF-8 encoding of unicode.
    If the object already has a UTF-8 representation, that memory is
    exposed directly and view->obj is unicode itself.  Otherwise the string
    is encoded (using the errors handler) into a new bytes object and
    view->obj is that bytes object; no UTF-8 cache is created.
    Returns 0 on success and -1 on error.  Successful calls must be paired
    with PyBuffer_Release(). */
 int
 _PyUnicode_GetUTF8Buffer(PyObject *unicode, const char *errors,
                         Py_buffer *view)
 {
    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return -1;
    }
    if (PyUnicode_READY(unicode) == -1) {
        return -1;
    }
    // NOTE(review): the tp_as_buffer check presumably keeps types that
    // implement their own buffer protocol off this fast path -- confirm.
    if (PyUnicode_UTF8(unicode) != NULL
            && Py_TYPE(unicode)->tp_as_buffer == NULL) {
        return PyBuffer_FillInfo(view, unicode,
                PyUnicode_UTF8(unicode),
                PyUnicode_UTF8_LENGTH(unicode),
                /* readonly */ 1, PyBUF_SIMPLE);
    }
    // Unlike PyUnicode_AsUTF8AndSize(), this function doesn't
    // create a UTF-8 cache for speed and efficiency.
    PyObject *bytes = _PyUnicode_AsUTF8String(unicode, errors);
    if (bytes == NULL) {
        return -1;
    }
    assert(PyBytes_CheckExact(bytes));
    // A successful PyObject_GetBuffer() stores its own new reference to
    // the exporter in view->obj.  Our reference must therefore be dropped
    // on success as well as on failure; keeping it would leak the bytes
    // object once the caller runs PyBuffer_Release().
    int rc = PyObject_GetBuffer(bytes, view, PyBUF_SIMPLE);
    Py_DECREF(bytes);
    return rc < 0 ? -1 : 0;
 }
 static int unicode_fill_utf8(PyObject *unicode);
 const char *
		`@ -0,0 +1,2 @@`
							Add new ``_PyUnicode_GetUTF8Buffer`` private API to get the UTF-8 encoding
							of a unicode object without cache or extra allocation.