Backport r57105 and r57145 from the py3k branch: UTF-32 codecs.

2025-11-13 07:26:31 +00:00 · 2007-08-17 16:41:28 +00:00 · 2007-08-17 16:41:28 +00:00 · 6e39080649
commit 6e39080649
parent 437e6a3b15
12 changed files with 999 additions and 2 deletions
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@ -1504,6 +1504,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
 				NULL);
 }

+/* --- UTF-32 Codec ------------------------------------------------------- */
+
+/* One-shot UTF-32 decoder: decode all size bytes at s.
+
+   This simply forwards to PyUnicode_DecodeUTF32Stateful() with a NULL
+   `consumed` pointer, so a trailing incomplete code unit is handed to
+   the error handler as "truncated data" instead of being left for a
+   later call.  s, size, errors and byteorder are passed through
+   unchanged. */
+PyObject *
+PyUnicode_DecodeUTF32(const char *s,
+                      Py_ssize_t size,
+                      const char *errors,
+                      int *byteorder)
+{
+    /* No partial-input bookkeeping for the one-shot variant. */
+    Py_ssize_t *const no_consumed = NULL;
+
+    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder,
+                                         no_consumed);
+}
+
+/* Decode size bytes of UTF-32 data starting at s into a unicode object.
+
+   errors is handed unchanged to the decoding error handler.
+
+   byteorder, if non-NULL, selects the byte order on entry (-1: little
+   endian, 1: big endian, 0: native order unless a leading BOM says
+   otherwise) and receives the byte order actually used on success.
+
+   consumed, if non-NULL, receives the number of input bytes processed;
+   in that case a trailing incomplete code unit (fewer than 4 bytes) is
+   left unconsumed instead of being reported as "truncated data".
+
+   For empty input an empty unicode object is returned and neither
+   *byteorder nor *consumed is written.
+
+   Returns the decoded unicode object, or NULL on failure. */
+PyObject *
+PyUnicode_DecodeUTF32Stateful(const char *s,
+                              Py_ssize_t size,
+                              const char *errors,
+                              int *byteorder,
+                              Py_ssize_t *consumed)
+{
+    const char *starts = s;
+    Py_ssize_t startinpos;
+    Py_ssize_t endinpos;
+    Py_ssize_t outpos;
+    PyUnicodeObject *unicode;
+    Py_UNICODE *p;
+#ifndef Py_UNICODE_WIDE
+    Py_ssize_t pairs = 0;
+    const unsigned char *qq;
+#else
+    const Py_ssize_t pairs = 0;
+#endif
+    const unsigned char *q, *e;
+    int bo = 0;       /* assume native ordering by default */
+    const char *errmsg = "";
+    /* Offsets from q for retrieving bytes in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int iorder[] = {0, 1, 2, 3};
+#else
+    int iorder[] = {3, 2, 1, 0};
+#endif
+    PyObject *errorHandler = NULL;
+    PyObject *exc = NULL;
+
+    /* Nothing to decode: return an empty string right away. */
+    if (size == 0)
+        return (PyObject *)_PyUnicode_New(0);
+
+    q = (const unsigned char *)s;
+    e = q + size;
+
+    if (byteorder)
+        bo = *byteorder;
+
+    /* Check for BOM marks (U+FEFF) in the input and adjust current
+       byte order setting accordingly. In native mode, the leading BOM
+       mark is skipped, in all other modes, it is copied to the output
+       stream as-is (giving a ZWNBSP character). */
+    if (bo == 0 && size >= 4) {
+        /* Widen each byte before shifting so that the shift by 24 is
+           done in an unsigned type. */
+        const Py_UCS4 bom = ((Py_UCS4)q[iorder[3]] << 24) |
+                            ((Py_UCS4)q[iorder[2]] << 16) |
+                            ((Py_UCS4)q[iorder[1]] << 8) |
+                            (Py_UCS4)q[iorder[0]];
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+        if (bom == 0x0000FEFF) {
+            q += 4;
+            bo = -1;
+        }
+        else if (bom == 0xFFFE0000) {
+            q += 4;
+            bo = 1;
+        }
+#else
+        if (bom == 0x0000FEFF) {
+            q += 4;
+            bo = 1;
+        }
+        else if (bom == 0xFFFE0000) {
+            q += 4;
+            bo = -1;
+        }
+#endif
+    }
+
+    if (bo == -1) {
+        /* force LE */
+        iorder[0] = 0;
+        iorder[1] = 1;
+        iorder[2] = 2;
+        iorder[3] = 3;
+    }
+    else if (bo == 1) {
+        /* force BE */
+        iorder[0] = 3;
+        iorder[1] = 2;
+        iorder[2] = 1;
+        iorder[3] = 0;
+    }
+
+    /* On narrow builds we split characters outside the BMP into two
+       codepoints => count how much extra space we need.  This has to
+       happen after the byte order is known and must read the input
+       bytewise: the buffer need not be aligned for Py_UCS4 access and
+       may be in non-native byte order.  A code unit is >= 0x10000
+       exactly when one of its two most significant bytes is non-zero. */
+#ifndef Py_UNICODE_WIDE
+    for (qq = q; e - qq >= 4; qq += 4) {
+        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
+            pairs++;
+    }
+#endif
+
+    /* This may be one more than needed when a BOM was skipped; the
+       result is resized to its real length below. */
+    unicode = _PyUnicode_New((size+3)/4+pairs);
+    if (!unicode)
+        return NULL;
+
+    /* Unpack UTF-32 encoded data */
+    p = unicode->str;
+
+    while (q < e) {
+        Py_UCS4 ch;
+        /* remaining bytes at the end? (size should be divisible by 4) */
+        if (e-q<4) {
+            if (consumed)
+                break;
+            errmsg = "truncated data";
+            startinpos = ((const char *)q)-starts;
+            endinpos = ((const char *)e)-starts;
+            goto utf32Error;
+            /* The remaining input chars are ignored if the callback
+               chooses to skip the input */
+        }
+        ch = ((Py_UCS4)q[iorder[3]] << 24) |
+             ((Py_UCS4)q[iorder[2]] << 16) |
+             ((Py_UCS4)q[iorder[1]] << 8) |
+             (Py_UCS4)q[iorder[0]];
+
+        if (ch >= 0x110000)
+        {
+            errmsg = "codepoint not in range(0x110000)";
+            startinpos = ((const char *)q)-starts;
+            endinpos = startinpos+4;
+            goto utf32Error;
+        }
+#ifndef Py_UNICODE_WIDE
+        if (ch >= 0x10000)
+        {
+            *p++ = 0xD800 | ((ch-0x10000) >> 10);
+            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
+        }
+        else
+#endif
+            *p++ = ch;
+        q += 4;
+        continue;
+    utf32Error:
+        outpos = p-PyUnicode_AS_UNICODE(unicode);
+        /* Hand the handler our read pointer q (not s): the loop resumes
+           from whatever position the handler stores there, so passing
+           any other pointer would retry the same bad bytes forever. */
+        if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "utf32", errmsg,
+                starts, size, &startinpos, &endinpos, &exc,
+                (const char **)&q,
+                (PyObject **)&unicode, &outpos, &p))
+            goto onError;
+    }
+
+    if (byteorder)
+        *byteorder = bo;
+
+    if (consumed)
+        *consumed = (const char *)q-starts;
+
+    /* Adjust length */
+    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
+        goto onError;
+
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return (PyObject *)unicode;
+
+onError:
+    Py_DECREF(unicode);
+    Py_XDECREF(errorHandler);
+    Py_XDECREF(exc);
+    return NULL;
+}
+
+/* Encode the size Py_UNICODE code units at s as UTF-32 and return the
+   bytes as a string object.
+
+   byteorder selects the output byte order: -1 for little endian, 1 for
+   big endian, and 0 for native order with a BOM (U+FEFF) written first.
+   errors is accepted for interface symmetry but is not used by this
+   function.
+
+   On narrow builds a high surrogate immediately followed by a low
+   surrogate is combined into a single code point; any other surrogate
+   is written out unchanged.
+
+   Returns the new string object, or NULL on failure. */
+PyObject *
+PyUnicode_EncodeUTF32(const Py_UNICODE *s,
+                      Py_ssize_t size,
+                      const char *errors,
+                      int byteorder)
+{
+    PyObject *v;
+    unsigned char *p;
+    Py_ssize_t nsize;   /* number of 4-byte units to emit, BOM included */
+#ifndef Py_UNICODE_WIDE
+    /* Same width as size, so long inputs cannot overflow the counters. */
+    Py_ssize_t i, pairs;
+#else
+    const Py_ssize_t pairs = 0;
+#endif
+    /* Offsets from p for storing byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int iorder[] = {0, 1, 2, 3};
+#else
+    int iorder[] = {3, 2, 1, 0};
+#endif
+
+#define STORECHAR(CH)                       \
+    do {                                    \
+        p[iorder[3]] = ((CH) >> 24) & 0xff; \
+        p[iorder[2]] = ((CH) >> 16) & 0xff; \
+        p[iorder[1]] = ((CH) >> 8) & 0xff;  \
+        p[iorder[0]] = (CH) & 0xff;         \
+        p += 4;                             \
+    } while(0)
+
+    /* In narrow builds we can output surrogate pairs as one codepoint,
+       so we need less space. */
+#ifndef Py_UNICODE_WIDE
+    for (i = pairs = 0; i < size-1; i++) {
+        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
+            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
+            pairs++;
+    }
+#endif
+    nsize = size - pairs + (byteorder == 0);
+    /* Refuse sizes whose byte count would not fit in a Py_ssize_t
+       instead of allocating a too-small buffer. */
+    if (nsize > PY_SSIZE_T_MAX / 4)
+        return PyErr_NoMemory();
+    v = PyString_FromStringAndSize(NULL, 4 * nsize);
+    if (v == NULL)
+        return NULL;
+
+    p = (unsigned char *)PyString_AS_STRING(v);
+    /* The BOM is only written in native mode, where iorder still holds
+       the native byte order. */
+    if (byteorder == 0)
+        STORECHAR(0xFEFF);
+    if (size == 0)
+        return v;
+
+    if (byteorder == -1) {
+        /* force LE */
+        iorder[0] = 0;
+        iorder[1] = 1;
+        iorder[2] = 2;
+        iorder[3] = 3;
+    }
+    else if (byteorder == 1) {
+        /* force BE */
+        iorder[0] = 3;
+        iorder[1] = 2;
+        iorder[2] = 1;
+        iorder[3] = 0;
+    }
+
+    while (size-- > 0) {
+        Py_UCS4 ch = *s++;
+#ifndef Py_UNICODE_WIDE
+        /* Combine a high surrogate with a directly following low
+           surrogate; size > 0 guarantees *s is still inside the input. */
+        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
+            Py_UCS4 ch2 = *s;
+            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
+                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
+                s++;
+                size--;
+            }
+        }
+#endif
+        STORECHAR(ch);
+    }
+    return v;
+#undef STORECHAR
+}
+
+/* Encode a unicode object as UTF-32 in native byte order (byteorder 0,
+   so the encoder prepends a BOM), passing NULL for errors.
+
+   If the argument is not a unicode object, PyErr_BadArgument() is
+   called and NULL is returned. */
+PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
+{
+    if (PyUnicode_Check(unicode)) {
+        const Py_UNICODE *data = PyUnicode_AS_UNICODE(unicode);
+        Py_ssize_t length = PyUnicode_GET_SIZE(unicode);
+
+        return PyUnicode_EncodeUTF32(data, length, NULL, 0);
+    }
+
+    PyErr_BadArgument();
+    return NULL;
+}
+
 /* --- UTF-16 Codec ------------------------------------------------------- */

 PyObject *