Apply SF patch #1775604: This adds three new codecs (utf-32, utf-32-le and

ut-32-be). On narrow builds the codecs combine surrogate pairs in the unicode
object into one codepoint on encoding and create surrogate pairs for
codepoints outside the BMP on decoding. Lone surrogates are passed through
unchanged in all cases.

Backport to the trunk will follow.
This commit is contained in:
Walter Dörwald 2007-08-16 21:55:45 +00:00
parent 066100909a
commit 41980caf64
12 changed files with 1001 additions and 2 deletions

View file

@ -1992,6 +1992,272 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
NULL);
}
/* --- UTF-32 Codec ------------------------------------------------------- */
PyObject *
PyUnicode_DecodeUTF32(const char *s,
Py_ssize_t size,
const char *errors,
int *byteorder)
{
return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
}
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t size,
const char *errors,
int *byteorder,
Py_ssize_t *consumed)
{
const char *starts = s;
Py_ssize_t startinpos;
Py_ssize_t endinpos;
Py_ssize_t outpos;
PyUnicodeObject *unicode;
Py_UNICODE *p;
#ifndef Py_UNICODE_WIDE
int i, pairs;
#else
const int pairs = 0;
#endif
const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */
const char *errmsg = "";
/* On narrow builds we split characters outside the BMP into two
codepoints => count how much extra space we need. */
#ifndef Py_UNICODE_WIDE
for (i = pairs = 0; i < size/4; i++)
if (((Py_UCS4 *)s)[i] >= 0x10000)
pairs++;
#endif
/* Offsets from q for retrieving bytes in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int iorder[] = {0, 1, 2, 3};
#else
int iorder[] = {3, 2, 1, 0};
#endif
PyObject *errorHandler = NULL;
PyObject *exc = NULL;
/* This might be one to much, because of a BOM */
unicode = _PyUnicode_New((size+3)/4+pairs);
if (!unicode)
return NULL;
if (size == 0)
return (PyObject *)unicode;
/* Unpack UTF-32 encoded data */
p = unicode->str;
q = (unsigned char *)s;
e = q + size;
if (byteorder)
bo = *byteorder;
/* Check for BOM marks (U+FEFF) in the input and adjust current
byte order setting accordingly. In native mode, the leading BOM
mark is skipped, in all other modes, it is copied to the output
stream as-is (giving a ZWNBSP character). */
if (bo == 0) {
if (size >= 4) {
const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
(q[iorder[1]] << 8) | q[iorder[0]];
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
if (bom == 0x0000FEFF) {
q += 4;
bo = -1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = 1;
}
#else
if (bom == 0x0000FEFF) {
q += 4;
bo = 1;
}
else if (bom == 0xFFFE0000) {
q += 4;
bo = -1;
}
#endif
}
}
if (bo == -1) {
/* force LE */
iorder[0] = 0;
iorder[1] = 1;
iorder[2] = 2;
iorder[3] = 3;
}
else if (bo == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
}
while (q < e) {
Py_UCS4 ch;
/* remaining bytes at the end? (size should be divisible by 4) */
if (e-q<4) {
if (consumed)
break;
errmsg = "truncated data";
startinpos = ((const char *)q)-starts;
endinpos = ((const char *)e)-starts;
goto utf32Error;
/* The remaining input chars are ignored if the callback
chooses to skip the input */
}
ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
(q[iorder[1]] << 8) | q[iorder[0]];
if (ch >= 0x110000)
{
errmsg = "codepoint not in range(0x110000)";
startinpos = ((const char *)q)-starts;
endinpos = startinpos+4;
goto utf32Error;
}
#ifndef Py_UNICODE_WIDE
if (ch >= 0x10000)
{
*p++ = 0xD800 | ((ch-0x10000) >> 10);
*p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
}
else
#endif
*p++ = ch;
q += 4;
continue;
utf32Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler(
errors, &errorHandler,
"utf32", errmsg,
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
(PyObject **)&unicode, &outpos, &p))
goto onError;
}
if (byteorder)
*byteorder = bo;
if (consumed)
*consumed = (const char *)q-starts;
/* Adjust length */
if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
goto onError;
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return (PyObject *)unicode;
onError:
Py_DECREF(unicode);
Py_XDECREF(errorHandler);
Py_XDECREF(exc);
return NULL;
}
PyObject *
PyUnicode_EncodeUTF32(const Py_UNICODE *s,
Py_ssize_t size,
const char *errors,
int byteorder)
{
PyObject *v;
unsigned char *p;
#ifndef Py_UNICODE_WIDE
int i, pairs;
#else
const int pairs = 0;
#endif
/* Offsets from p for storing byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
int iorder[] = {0, 1, 2, 3};
#else
int iorder[] = {3, 2, 1, 0};
#endif
#define STORECHAR(CH) \
do { \
p[iorder[3]] = ((CH) >> 24) & 0xff; \
p[iorder[2]] = ((CH) >> 16) & 0xff; \
p[iorder[1]] = ((CH) >> 8) & 0xff; \
p[iorder[0]] = (CH) & 0xff; \
p += 4; \
} while(0)
/* In narrow builds we can output surrogate pairs as one codepoint,
so we need less space. */
#ifndef Py_UNICODE_WIDE
for (i = pairs = 0; i < size-1; i++)
if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
pairs++;
#endif
v = PyBytes_FromStringAndSize(NULL,
4 * (size - pairs + (byteorder == 0)));
if (v == NULL)
return NULL;
p = (unsigned char *)PyBytes_AS_STRING(v);
if (byteorder == 0)
STORECHAR(0xFEFF);
if (size == 0)
return v;
if (byteorder == -1) {
/* force LE */
iorder[0] = 0;
iorder[1] = 1;
iorder[2] = 2;
iorder[3] = 3;
}
else if (byteorder == 1) {
/* force BE */
iorder[0] = 3;
iorder[1] = 2;
iorder[2] = 1;
iorder[3] = 0;
}
while (size-- > 0) {
Py_UCS4 ch = *s++;
#ifndef Py_UNICODE_WIDE
if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
Py_UCS4 ch2 = *s;
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
s++;
size--;
}
}
#endif
STORECHAR(ch);
}
return v;
#undef STORECHAR
}
PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
{
if (!PyUnicode_Check(unicode)) {
PyErr_BadArgument();
return NULL;
}
return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode),
NULL,
0);
}
/* --- UTF-16 Codec ------------------------------------------------------- */
PyObject *