Apply SF patch #1775604: This adds three new codecs (utf-32, utf-32-le and

utf-32-be). On narrow builds the codecs combine surrogate pairs in the unicode
object into one codepoint on encoding and create surrogate pairs for
codepoints outside the BMP on decoding. Lone surrogates are passed through
unchanged in all cases.

Backport to the trunk will follow.
This commit is contained in:
Walter Dörwald 2007-08-16 21:55:45 +00:00
parent 066100909a
commit 41980caf64
12 changed files with 1001 additions and 2 deletions

View file

@ -412,6 +412,126 @@ utf_16_ex_decode(PyObject *self,
return tuple;
}
/* Decode UTF-32 data with the byteorder initially set to 0.

   Parsed arguments: data (with its length), an optional errors string
   (may be None) and an optional integer flag final (default 0).
   Unless final is true the decoder is handed a counter it may overwrite
   with the number of bytes consumed; otherwise the full input size is
   reported.  The return value is built by codec_tuple() from the decoded
   object and that count; NULL is returned on failure.
*/
static PyObject *
utf_32_decode(PyObject *self,
              PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;
    int byteorder = 0;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_decode",
                          &data, &size, &errors, &final))
        return NULL;

    if (size < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }

    /* Value reported when final is true and no counter is passed down. */
    consumed = size;
    if (final)
        decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
                                                &byteorder, NULL);
    else
        decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
                                                &byteorder, &consumed);

    if (decoded == NULL)
        return NULL;

    return codec_tuple(decoded, consumed);
}
/* Decode UTF-32 data with the byteorder preset to -1 (the "le" variant).

   Parsed arguments: data (with its length), an optional errors string
   (may be None) and an optional integer flag final (default 0).
   Unless final is true the decoder is handed a counter it may overwrite
   with the number of bytes consumed; otherwise the full input size is
   reported.  The return value is built by codec_tuple() from the decoded
   object and that count; NULL is returned on failure.
*/
static PyObject *
utf_32_le_decode(PyObject *self,
                 PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;
    int byteorder = -1;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_le_decode",
                          &data, &size, &errors, &final))
        return NULL;

    if (size < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }

    /* Value reported when final is true and no counter is passed down. */
    consumed = size;
    if (final)
        decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
                                                &byteorder, NULL);
    else
        decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
                                                &byteorder, &consumed);

    if (decoded == NULL)
        return NULL;

    return codec_tuple(decoded, consumed);
}
/* Decode UTF-32 data with the byteorder preset to 1 (the "be" variant).

   Parsed arguments: data (with its length), an optional errors string
   (may be None) and an optional integer flag final (default 0).
   Unless final is true the decoder is handed a counter it may overwrite
   with the number of bytes consumed; otherwise the full input size is
   reported.  The return value is built by codec_tuple() from the decoded
   object and that count; NULL is returned on failure.
*/
static PyObject *
utf_32_be_decode(PyObject *self,
                 PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;
    int byteorder = 1;
    int final = 0;
    Py_ssize_t consumed;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|zi:utf_32_be_decode",
                          &data, &size, &errors, &final))
        return NULL;

    if (size < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }

    /* Value reported when final is true and no counter is passed down. */
    consumed = size;
    if (final)
        decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
                                                &byteorder, NULL);
    else
        decoded = PyUnicode_DecodeUTF32Stateful(data, size, errors,
                                                &byteorder, &consumed);

    if (decoded == NULL)
        return NULL;

    return codec_tuple(decoded, consumed);
}
/* Non-standard decoder variant that exposes the byteorder parameter of
   the builtin UTF-32 codec to the caller.

   Parsed arguments: data (with its length), an optional errors string
   (may be None), an optional integer byteorder (default 0) and an
   optional integer flag final (default 0).

   The result is a tuple (unicode, bytesread, byteorder), where byteorder
   is the value in effect at the end of data.  NULL is returned on
   failure.
*/
static PyObject *
utf_32_ex_decode(PyObject *self,
                 PyObject *args)
{
    const char *data;
    Py_ssize_t size;
    const char *errors = NULL;
    int byteorder = 0;
    PyObject *unicode, *tuple;
    int final = 0;
    Py_ssize_t consumed;

    if (!PyArg_ParseTuple(args, "t#|zii:utf_32_ex_decode",
                          &data, &size, &errors, &byteorder, &final))
        return NULL;

    if (size < 0) {
        PyErr_SetString(PyExc_ValueError, "negative argument");
        return NULL;
    }

    /* Value reported when final is true and no counter is passed down. */
    consumed = size;
    if (final)
        unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors,
                                                &byteorder, NULL);
    else
        unicode = PyUnicode_DecodeUTF32Stateful(data, size, errors,
                                                &byteorder, &consumed);

    if (unicode == NULL)
        return NULL;

    /* "O" makes the tuple take its own reference to the decoded object,
       so the local reference is dropped whether or not building the
       tuple succeeded. */
    tuple = Py_BuildValue("Oni", unicode, consumed, byteorder);
    Py_DECREF(unicode);
    return tuple;
}
static PyObject *
unicode_escape_decode(PyObject *self,
PyObject *args)
@ -700,6 +820,83 @@ utf_16_be_encode(PyObject *self,
return v;
}
/* UTF-32 encoder that exposes the byteorder parameter of the builtin
   UTF-32 codecs as an optional third argument.  The default of 0 means:
   use the native byte order and prepend the data with a BOM mark.

   Parsed arguments: the object to encode, an optional errors string
   (may be None) and the optional integer byteorder.  The return value is
   built by codec_tuple() from the encoded result and the length of the
   unicode object; NULL is returned if the argument cannot be converted
   by PyUnicode_FromObject().
*/
static PyObject *
utf_32_encode(PyObject *self,
              PyObject *args)
{
    PyObject *str, *v;
    const char *errors = NULL;
    int byteorder = 0;

    if (!PyArg_ParseTuple(args, "O|zi:utf_32_encode",
                          &str, &errors, &byteorder))
        return NULL;

    /* From here on str is a reference owned by this function. */
    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return NULL;

    /* Encode first, then let codec_tuple() pair the result with the
       number of code units in the input. */
    v = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
                              PyUnicode_GET_SIZE(str),
                              errors,
                              byteorder);
    v = codec_tuple(v, PyUnicode_GET_SIZE(str));

    Py_DECREF(str);
    return v;
}
/* Encode to UTF-32 with the byteorder fixed at -1 (the "le" variant).

   Parsed arguments: the object to encode and an optional errors string
   (may be None).  The return value is built by codec_tuple() from the
   encoded result and the length of the unicode object; NULL is returned
   if the argument cannot be converted by PyUnicode_FromObject().
*/
static PyObject *
utf_32_le_encode(PyObject *self,
                 PyObject *args)
{
    PyObject *str, *v;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_le_encode",
                          &str, &errors))
        return NULL;

    /* From here on str is a reference owned by this function. */
    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return NULL;

    /* Encode first, then let codec_tuple() pair the result with the
       number of code units in the input. */
    v = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
                              PyUnicode_GET_SIZE(str),
                              errors,
                              -1);
    v = codec_tuple(v, PyUnicode_GET_SIZE(str));

    Py_DECREF(str);
    return v;
}
/* Encode to UTF-32 with the byteorder fixed at +1 (the "be" variant).

   Parsed arguments: the object to encode and an optional errors string
   (may be None).  The return value is built by codec_tuple() from the
   encoded result and the length of the unicode object; NULL is returned
   if the argument cannot be converted by PyUnicode_FromObject().
*/
static PyObject *
utf_32_be_encode(PyObject *self,
                 PyObject *args)
{
    PyObject *str, *v;
    const char *errors = NULL;

    if (!PyArg_ParseTuple(args, "O|z:utf_32_be_encode",
                          &str, &errors))
        return NULL;

    /* From here on str is a reference owned by this function. */
    str = PyUnicode_FromObject(str);
    if (str == NULL)
        return NULL;

    /* Encode first, then let codec_tuple() pair the result with the
       number of code units in the input. */
    v = PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(str),
                              PyUnicode_GET_SIZE(str),
                              errors,
                              +1);
    v = codec_tuple(v, PyUnicode_GET_SIZE(str));

    Py_DECREF(str);
    return v;
}
static PyObject *
unicode_escape_encode(PyObject *self,
PyObject *args)
@ -916,6 +1113,13 @@ static PyMethodDef _codecs_functions[] = {
{"utf_16_le_decode", utf_16_le_decode, METH_VARARGS},
{"utf_16_be_decode", utf_16_be_decode, METH_VARARGS},
{"utf_16_ex_decode", utf_16_ex_decode, METH_VARARGS},
{"utf_32_encode", utf_32_encode, METH_VARARGS},
{"utf_32_le_encode", utf_32_le_encode, METH_VARARGS},
{"utf_32_be_encode", utf_32_be_encode, METH_VARARGS},
{"utf_32_decode", utf_32_decode, METH_VARARGS},
{"utf_32_le_decode", utf_32_le_decode, METH_VARARGS},
{"utf_32_be_decode", utf_32_be_decode, METH_VARARGS},
{"utf_32_ex_decode", utf_32_ex_decode, METH_VARARGS},
{"unicode_escape_encode", unicode_escape_encode, METH_VARARGS},
{"unicode_escape_decode", unicode_escape_decode, METH_VARARGS},
{"unicode_internal_encode", unicode_internal_encode, METH_VARARGS},