Apply SF patch #1775604: This adds three new codecs (utf-32, utf-32-le and

ut-32-be). On narrow builds the codecs combine surrogate pairs in the unicode object into one codepoint on encoding and create surrogate pairs for codepoints outside the BMP on decoding. Lone surrogates are passed through unchanged in all cases. Backport to the trunk will follow.
2025-10-15 19:29:46 +00:00 · 2007-08-16 21:55:45 +00:00 · 2007-08-16 21:55:45 +00:00 · 41980caf64
commit 41980caf64
parent 066100909a
12 changed files with 1001 additions and 2 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1992,6 +1992,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 				NULL);
 }

+/* --- UTF-32 Codec ------------------------------------------------------- */
+
+PyObject *
+PyUnicode_DecodeUTF32(const char *s,
+		      Py_ssize_t size,
+		      const char *errors,
+		      int *byteorder)
+{
+    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
+}
+
+PyObject *
+PyUnicode_DecodeUTF32Stateful(const char *s,
+			      Py_ssize_t size,
+			      const char *errors,
+			      int *byteorder,
+			      Py_ssize_t *consumed)
+{
+    const char *starts = s;
+    Py_ssize_t startinpos;
+    Py_ssize_t endinpos;
+    Py_ssize_t outpos;
+    PyUnicodeObject *unicode;
+    Py_UNICODE *p;
+#ifndef Py_UNICODE_WIDE
+    int i, pairs;
+#else
+    const int pairs = 0;
+#endif
+    const unsigned char *q, *e;
+    int bo = 0;       /* assume native ordering by default */
+    const char *errmsg = "";
+    /* On narrow builds we split characters outside the BMP into two
+       codepoints => count how much extra space we need. */
+#ifndef Py_UNICODE_WIDE
+    for (i = pairs = 0; i < size/4; i++)
+	if (((Py_UCS4 *)s)[i] >= 0x10000)
+	    pairs++;
+#endif
+    /* Offsets from q for retrieving bytes in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int iorder[] = {0, 1, 2, 3};
+#else
+    int iorder[] = {3, 2, 1, 0};
+#endif
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    /* This might be one to much, because of a BOM */
+    unicode = _PyUnicode_New((size+3)/4+pairs);
+    if (!unicode)
+        return NULL;
+    if (size == 0)
+        return (PyObject *)unicode;
+
+    /* Unpack UTF-32 encoded data */
+    p = unicode->str;
+    q = (unsigned char *)s;
+    e = q + size;
+
+    if (byteorder)
+        bo = *byteorder;
+
+    /* Check for BOM marks (U+FEFF) in the input and adjust current
+       byte order setting accordingly. In native mode, the leading BOM
+       mark is skipped, in all other modes, it is copied to the output
+       stream as-is (giving a ZWNBSP character). */
+    if (bo == 0) {
+        if (size >= 4) {
+            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
+                                (q[iorder[1]] << 8) | q[iorder[0]];
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+	    if (bom == 0x0000FEFF) {
+		q += 4;
+		bo = -1;
+	    }
+	    else if (bom == 0xFFFE0000) {
+		q += 4;
+		bo = 1;
+	    }
+#else
+	    if (bom == 0x0000FEFF) {
+		q += 4;
+		bo = 1;
+	    }
+	    else if (bom == 0xFFFE0000) {
+		q += 4;
+		bo = -1;
+	    }
+#endif
+	}
+    }
+
+    if (bo == -1) {
+        /* force LE */
+        iorder[0] = 0;
+        iorder[1] = 1;
+        iorder[2] = 2;
+        iorder[3] = 3;
+    }
+    else if (bo == 1) {
+        /* force BE */
+        iorder[0] = 3;
+        iorder[1] = 2;
+        iorder[2] = 1;
+        iorder[3] = 0;
+    }
+
+    while (q < e) {
+	Py_UCS4 ch;
+	/* remaining bytes at the end? (size should be divisible by 4) */
+	if (e-q<4) {
+	    if (consumed)
+		break;
+	    errmsg = "truncated data";
+	    startinpos = ((const char *)q)-starts;
+	    endinpos = ((const char *)e)-starts;
+	    goto utf32Error;
+	    /* The remaining input chars are ignored if the callback
+	       chooses to skip the input */
+	}
+	ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
+	     (q[iorder[1]] << 8) | q[iorder[0]];
+
+	if (ch >= 0x110000)
+	{
+	    errmsg = "codepoint not in range(0x110000)";
+	    startinpos = ((const char *)q)-starts;
+	    endinpos = startinpos+4;
+	    goto utf32Error;
+	}
+#ifndef Py_UNICODE_WIDE
+	if (ch >= 0x10000)
+	{
+	    *p++ = 0xD800 | ((ch-0x10000) >> 10);
+	    *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
+	}
+	else
+#endif
+	    *p++ = ch;
+	q += 4;
+	continue;
+    utf32Error:
+	outpos = p-PyUnicode_AS_UNICODE(unicode);
+	if (unicode_decode_call_errorhandler(
+	         errors, &errorHandler,
+	         "utf32", errmsg,
+	         &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
+	         (PyObject **)&unicode, &outpos, &p))
+	    goto onError;
+    }
+
+    if (byteorder)
+        *byteorder = bo;
+
+    if (consumed)
+	*consumed = (const char *)q-starts;
+
+    /* Adjust length */
+    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
+        goto onError;
+
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)unicode;
+
+onError:
+    Py_DECREF(unicode);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
+PyObject *
+PyUnicode_EncodeUTF32(const Py_UNICODE *s,
+		      Py_ssize_t size,
+		      const char *errors,
+		      int byteorder)
+{
+    PyObject *v;
+    unsigned char *p;
+#ifndef Py_UNICODE_WIDE
+    int i, pairs;
+#else
+    const int pairs = 0;
+#endif
+    /* Offsets from p for storing byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int iorder[] = {0, 1, 2, 3};
+#else
+    int iorder[] = {3, 2, 1, 0};
+#endif
+
+#define STORECHAR(CH)                       \
+    do {                                    \
+        p[iorder[3]] = ((CH) >> 24) & 0xff; \
+        p[iorder[2]] = ((CH) >> 16) & 0xff; \
+        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
+        p[iorder[0]] = (CH) & 0xff;         \
+        p += 4;                             \
+    } while(0)
+
+    /* In narrow builds we can output surrogate pairs as one codepoint,
+       so we need less space. */
+#ifndef Py_UNICODE_WIDE
+    for (i = pairs = 0; i < size-1; i++)
+	if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
+	    0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
+	    pairs++;
+#endif
+    v = PyBytes_FromStringAndSize(NULL,
+		  4 * (size - pairs + (byteorder == 0)));
+    if (v == NULL)
+        return NULL;
+
+    p = (unsigned char *)PyBytes_AS_STRING(v);
+    if (byteorder == 0)
+	STORECHAR(0xFEFF);
+    if (size == 0)
+        return v;
+
+    if (byteorder == -1) {
+        /* force LE */
+        iorder[0] = 0;
+        iorder[1] = 1;
+        iorder[2] = 2;
+        iorder[3] = 3;
+    }
+    else if (byteorder == 1) {
+        /* force BE */
+        iorder[0] = 3;
+        iorder[1] = 2;
+        iorder[2] = 1;
+        iorder[3] = 0;
+    }
+
+    while (size-- > 0) {
+	Py_UCS4 ch = *s++;
+#ifndef Py_UNICODE_WIDE
+	if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
+	    Py_UCS4 ch2 = *s;
+	    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+		ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+		s++;
+		size--;
+	    }
+	}
+#endif
+        STORECHAR(ch);
+    }
+    return v;
+#undef STORECHAR
+}
+
+PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
+{
+    if (!PyUnicode_Check(unicode)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
+				 PyUnicode_GET_SIZE(unicode),
+				 NULL,
+				 0);
+}
+
 /* --- UTF-16 Codec ------------------------------------------------------- */

 PyObject *