mirror of
https://github.com/python/cpython.git
synced 2025-10-24 07:26:11 +00:00

The attached patch set includes a workaround to get Python with Unicode to compile on BSDI 4.x (courtesy of Thomas Wouters; the cause is a bug in the BSDI wchar.h header file) and Python interfaces for the MBCS codec donated by Mark Hammond. Also included are some minor corrections with respect to the docs of the new "es" and "es#" parser markers (use PyMem_Free() instead of free(); thanks to Mark Hammond for finding these). The unicodedata tests are now in a separate file (test_unicodedata.py) to avoid problems if the module cannot be found.
575 lines
13 KiB
C
575 lines
13 KiB
C
/* ------------------------------------------------------------------------
|
|
|
|
_codecs -- Provides access to the codec registry and the builtin
|
|
codecs.
|
|
|
|
This module should never be imported directly. The standard library
|
|
module "codecs" wraps this builtin module for use within Python.
|
|
|
|
The codec registry is accessible via:
|
|
|
|
register(search_function) -> None
|
|
|
|
lookup(encoding) -> (encoder, decoder, stream_reader, stream_writer)
|
|
|
|
The builtin Unicode codecs use the following interface:
|
|
|
|
<encoding>_encode(Unicode_object[,errors='strict']) ->
|
|
(string object, bytes consumed)
|
|
|
|
<encoding>_decode(char_buffer_obj[,errors='strict']) ->
|
|
(Unicode object, bytes consumed)
|
|
|
|
These <encoding>s are available: utf_8, utf_16 (also utf_16_le and
utf_16_be), unicode_escape, raw_unicode_escape, unicode_internal,
latin_1, ascii (7-bit), charmap, and mbcs (MS_WIN32 builds only)
|
|
|
|
Written by Marc-Andre Lemburg (mal@lemburg.com).
|
|
|
|
(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
|
|
|
|
------------------------------------------------------------------------ */
|
|
|
|
#include "Python.h"
|
|
|
|
/* --- Registry ----------------------------------------------------------- */
|
|
|
|
static
|
|
PyObject *codecregister(PyObject *self, PyObject *args)
|
|
{
|
|
PyObject *search_function;
|
|
|
|
if (!PyArg_ParseTuple(args, "O:register", &search_function))
|
|
goto onError;
|
|
|
|
if (PyCodec_Register(search_function))
|
|
goto onError;
|
|
|
|
Py_INCREF(Py_None);
|
|
return Py_None;
|
|
|
|
onError:
|
|
return NULL;
|
|
}
|
|
|
|
static
|
|
PyObject *codeclookup(PyObject *self, PyObject *args)
|
|
{
|
|
char *encoding;
|
|
|
|
if (!PyArg_ParseTuple(args, "s:lookup", &encoding))
|
|
goto onError;
|
|
|
|
return _PyCodec_Lookup(encoding);
|
|
|
|
onError:
|
|
return NULL;
|
|
}
|
|
|
|
/* --- Helpers ------------------------------------------------------------ */
|
|
|
|
static
|
|
PyObject *codec_tuple(PyObject *unicode,
|
|
int len)
|
|
{
|
|
PyObject *v,*w;
|
|
|
|
if (unicode == NULL)
|
|
return NULL;
|
|
v = PyTuple_New(2);
|
|
if (v == NULL) {
|
|
Py_DECREF(unicode);
|
|
return NULL;
|
|
}
|
|
PyTuple_SET_ITEM(v,0,unicode);
|
|
w = PyInt_FromLong(len);
|
|
if (w == NULL) {
|
|
Py_DECREF(v);
|
|
return NULL;
|
|
}
|
|
PyTuple_SET_ITEM(v,1,w);
|
|
return v;
|
|
}
|
|
|
|
/* --- Decoder ------------------------------------------------------------ */
|
|
|
|
/* unicode_internal_decode(data[, errors]) -> (unicode, size)

   Reinterprets the raw bytes of `data` as an array of Py_UNICODE and
   copies them into a new Unicode object.  `errors` is parsed but not
   used by this function. */
static PyObject *
unicode_internal_decode(PyObject *self, PyObject *args)
{
    const char *raw;
    int raw_len;
    const char *errors = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "s#|z:unicode_internal_decode",
                          &raw, &raw_len, &errors))
        return NULL;

    /* The division truncates: bytes beyond the last whole Py_UNICODE
       unit are not copied, yet the full byte count is still reported
       as the second tuple item. */
    decoded = PyUnicode_FromUnicode((Py_UNICODE *)raw,
                                    raw_len / sizeof(Py_UNICODE));
    return codec_tuple(decoded, raw_len);
}
|
|
|
|
/* utf_8_decode(data[, errors]) -> (unicode, size)

   Decodes the character buffer `data` with PyUnicode_DecodeUTF8() and
   returns the result together with the input length in bytes. */
static PyObject *
utf_8_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:utf_8_decode",
                          &input, &input_len, &errors))
        return NULL;

    decoded = PyUnicode_DecodeUTF8(input, input_len, errors);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
/* utf_16_decode(data[, errors]) -> (unicode, size)

   Decodes `data` with PyUnicode_DecodeUTF16(), starting from a
   byteorder value of 0.  The byteorder written back by the decoder is
   discarded. */
static PyObject *
utf_16_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    int order = 0;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:utf_16_decode",
                          &input, &input_len, &errors))
        return NULL;

    decoded = PyUnicode_DecodeUTF16(input, input_len, errors, &order);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
/* utf_16_le_decode(data[, errors]) -> (unicode, size)

   Decodes `data` with PyUnicode_DecodeUTF16(), starting from a
   byteorder value of -1 (the value this module uses for its "le"
   variants). */
static PyObject *
utf_16_le_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    int order = -1;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:utf_16_le_decode",
                          &input, &input_len, &errors))
        return NULL;

    decoded = PyUnicode_DecodeUTF16(input, input_len, errors, &order);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
/* utf_16_be_decode(data[, errors]) -> (unicode, size)

   Decodes `data` with PyUnicode_DecodeUTF16(), starting from a
   byteorder value of 1 (the value this module uses for its "be"
   variants). */
static PyObject *
utf_16_be_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    int order = 1;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:utf_16_be_decode",
                          &input, &input_len, &errors))
        return NULL;

    decoded = PyUnicode_DecodeUTF16(input, input_len, errors, &order);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
/* This non-standard version also provides access to the byteorder
|
|
parameter of the builtin UTF-16 codec.
|
|
|
|
It returns a tuple (unicode, bytesread, byteorder) with byteorder
|
|
being the value in effect at the end of data.
|
|
|
|
*/
|
|
|
|
static PyObject *
|
|
utf_16_ex_decode(PyObject *self,
|
|
PyObject *args)
|
|
{
|
|
const char *data;
|
|
int size;
|
|
const char *errors = NULL;
|
|
int byteorder = 0;
|
|
PyObject *unicode, *tuple;
|
|
|
|
if (!PyArg_ParseTuple(args, "t#|zi:utf_16_ex_decode",
|
|
&data, &size, &errors, &byteorder))
|
|
return NULL;
|
|
|
|
unicode = PyUnicode_DecodeUTF16(data, size, errors, &byteorder);
|
|
if (unicode == NULL)
|
|
return NULL;
|
|
tuple = Py_BuildValue("Oii", unicode, size, byteorder);
|
|
Py_DECREF(unicode);
|
|
return tuple;
|
|
}
|
|
|
|
/* unicode_escape_decode(data[, errors]) -> (unicode, size)

   Decodes `data` with PyUnicode_DecodeUnicodeEscape() and returns the
   result together with the input length in bytes. */
static PyObject *
unicode_escape_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:unicode_escape_decode",
                          &input, &input_len, &errors))
        return NULL;

    decoded = PyUnicode_DecodeUnicodeEscape(input, input_len, errors);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
/* raw_unicode_escape_decode(data[, errors]) -> (unicode, size)

   Decodes `data` with PyUnicode_DecodeRawUnicodeEscape() and returns
   the result together with the input length in bytes. */
static PyObject *
raw_unicode_escape_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:raw_unicode_escape_decode",
                          &input, &input_len, &errors))
        return NULL;

    decoded = PyUnicode_DecodeRawUnicodeEscape(input, input_len, errors);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
/* latin_1_decode(data[, errors]) -> (unicode, size)

   Decodes `data` with PyUnicode_DecodeLatin1() and returns the result
   together with the input length in bytes. */
static PyObject *
latin_1_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:latin_1_decode",
                          &input, &input_len, &errors))
        return NULL;

    decoded = PyUnicode_DecodeLatin1(input, input_len, errors);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
/* ascii_decode(data[, errors]) -> (unicode, size)

   Decodes `data` with PyUnicode_DecodeASCII() and returns the result
   together with the input length in bytes. */
static PyObject *
ascii_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:ascii_decode",
                          &input, &input_len, &errors))
        return NULL;

    decoded = PyUnicode_DecodeASCII(input, input_len, errors);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
/* charmap_decode(data[, errors[, mapping]]) -> (unicode, size)

   Decodes `data` with PyUnicode_DecodeCharmap().  A missing mapping and
   an explicit None are treated alike: both reach the decoder as NULL. */
static PyObject *
charmap_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    PyObject *mapping = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|zO:charmap_decode",
                          &input, &input_len, &errors, &mapping))
        return NULL;

    decoded = PyUnicode_DecodeCharmap(input, input_len,
                                      (mapping == Py_None) ? NULL : mapping,
                                      errors);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
#ifdef MS_WIN32
|
|
|
|
/* mbcs_decode(data[, errors]) -> (unicode, size)

   Decodes `data` with PyUnicode_DecodeMBCS() and returns the result
   together with the input length in bytes.  Only compiled when
   MS_WIN32 is defined. */
static PyObject *
mbcs_decode(PyObject *self, PyObject *args)
{
    const char *input;
    int input_len;
    const char *errors = NULL;
    PyObject *decoded;

    if (!PyArg_ParseTuple(args, "t#|z:mbcs_decode",
                          &input, &input_len, &errors))
        return NULL;

    decoded = PyUnicode_DecodeMBCS(input, input_len, errors);
    return codec_tuple(decoded, input_len);
}
|
|
|
|
#endif /* MS_WIN32 */
|
|
|
|
/* --- Encoder ------------------------------------------------------------ */
|
|
|
|
/* readbuffer_encode(data[, errors]) -> (string, size)

   Copies the bytes obtained through the "s#" format into a new string
   object and returns it together with the byte count.  `errors` is
   parsed but not used.  The method table also exposes this function
   under the name "unicode_internal_encode". */
static PyObject *
readbuffer_encode(PyObject *self, PyObject *args)
{
    const char *bytes;
    int nbytes;
    const char *errors = NULL;
    PyObject *copy;

    if (!PyArg_ParseTuple(args, "s#|z:readbuffer_encode",
                          &bytes, &nbytes, &errors))
        return NULL;

    copy = PyString_FromStringAndSize(bytes, nbytes);
    return codec_tuple(copy, nbytes);
}
|
|
|
|
/* charbuffer_encode(data[, errors]) -> (string, size)

   Copies the bytes obtained through the "t#" format into a new string
   object and returns it together with the byte count.  `errors` is
   parsed but not used. */
static PyObject *
charbuffer_encode(PyObject *self, PyObject *args)
{
    const char *bytes;
    int nbytes;
    const char *errors = NULL;
    PyObject *copy;

    if (!PyArg_ParseTuple(args, "t#|z:charbuffer_encode",
                          &bytes, &nbytes, &errors))
        return NULL;

    copy = PyString_FromStringAndSize(bytes, nbytes);
    return codec_tuple(copy, nbytes);
}
|
|
|
|
/* utf_8_encode(unicode[, errors]) -> (string, length)

   Encodes the Unicode object with PyUnicode_EncodeUTF8().  The second
   tuple item is PyUnicode_GET_SIZE() of the input, i.e. the length of
   the Unicode object rather than of the encoded output. */
static PyObject *
utf_8_encode(PyObject *self, PyObject *args)
{
    PyObject *ustr;
    const char *errors = NULL;
    PyObject *encoded;

    if (!PyArg_ParseTuple(args, "U|z:utf_8_encode", &ustr, &errors))
        return NULL;

    encoded = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(ustr),
                                   PyUnicode_GET_SIZE(ustr),
                                   errors);
    return codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
}
|
|
|
|
/* This version provides access to the byteorder parameter of the
|
|
builtin UTF-16 codecs as optional third argument. It defaults to 0
|
|
which means: use the native byte order and prepend the data with a
|
|
BOM mark.
|
|
|
|
*/
|
|
|
|
static PyObject *
|
|
utf_16_encode(PyObject *self,
|
|
PyObject *args)
|
|
{
|
|
PyObject *str;
|
|
const char *errors = NULL;
|
|
int byteorder = 0;
|
|
|
|
if (!PyArg_ParseTuple(args, "U|zi:utf_16_encode",
|
|
&str, &errors, &byteorder))
|
|
return NULL;
|
|
|
|
return codec_tuple(PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(str),
|
|
PyUnicode_GET_SIZE(str),
|
|
errors,
|
|
byteorder),
|
|
PyUnicode_GET_SIZE(str));
|
|
}
|
|
|
|
/* utf_16_le_encode(unicode[, errors]) -> (string, length)

   Encodes the Unicode object with PyUnicode_EncodeUTF16() using a fixed
   byteorder of -1 (little-endian).  The second tuple item is
   PyUnicode_GET_SIZE() of the input.

   Returns NULL if the arguments cannot be parsed or encoding fails. */
static PyObject *
utf_16_le_encode(PyObject *self, PyObject *args)
{
    PyObject *ustr;
    const char *errors = NULL;
    PyObject *encoded;

    /* The format must list exactly the values we supply storage for.
       A stray "i" unit here would make PyArg_ParseTuple() write an int
       through a pointer that was never passed whenever a caller gives
       a third argument.  The byteorder is fixed for this variant, so
       only (unicode[, errors]) is accepted. */
    if (!PyArg_ParseTuple(args, "U|z:utf_16_le_encode", &ustr, &errors))
        return NULL;

    encoded = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(ustr),
                                    PyUnicode_GET_SIZE(ustr),
                                    errors,
                                    -1);
    return codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
}
|
|
|
|
/* utf_16_be_encode(unicode[, errors]) -> (string, length)

   Encodes the Unicode object with PyUnicode_EncodeUTF16() using a fixed
   byteorder of +1 (big-endian).  The second tuple item is
   PyUnicode_GET_SIZE() of the input.

   Returns NULL if the arguments cannot be parsed or encoding fails. */
static PyObject *
utf_16_be_encode(PyObject *self, PyObject *args)
{
    PyObject *ustr;
    const char *errors = NULL;
    PyObject *encoded;

    /* The format must list exactly the values we supply storage for.
       A stray "i" unit here would make PyArg_ParseTuple() write an int
       through a pointer that was never passed whenever a caller gives
       a third argument.  The byteorder is fixed for this variant, so
       only (unicode[, errors]) is accepted. */
    if (!PyArg_ParseTuple(args, "U|z:utf_16_be_encode", &ustr, &errors))
        return NULL;

    encoded = PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(ustr),
                                    PyUnicode_GET_SIZE(ustr),
                                    errors,
                                    1);
    return codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
}
|
|
|
|
/* unicode_escape_encode(unicode[, errors]) -> (string, length)

   Encodes the Unicode object with PyUnicode_EncodeUnicodeEscape().
   `errors` is accepted for interface uniformity but is not passed to
   the encoder.  The second tuple item is PyUnicode_GET_SIZE() of the
   input. */
static PyObject *
unicode_escape_encode(PyObject *self, PyObject *args)
{
    PyObject *ustr;
    const char *errors = NULL;
    PyObject *encoded;

    if (!PyArg_ParseTuple(args, "U|z:unicode_escape_encode",
                          &ustr, &errors))
        return NULL;

    encoded = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(ustr),
                                            PyUnicode_GET_SIZE(ustr));
    return codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
}
|
|
|
|
/* raw_unicode_escape_encode(unicode[, errors]) -> (string, length)

   Encodes the Unicode object with PyUnicode_EncodeRawUnicodeEscape().
   `errors` is accepted for interface uniformity but is not passed to
   the encoder.  The second tuple item is PyUnicode_GET_SIZE() of the
   input. */
static PyObject *
raw_unicode_escape_encode(PyObject *self, PyObject *args)
{
    PyObject *ustr;
    const char *errors = NULL;
    PyObject *encoded;

    if (!PyArg_ParseTuple(args, "U|z:raw_unicode_escape_encode",
                          &ustr, &errors))
        return NULL;

    encoded = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(ustr),
                                               PyUnicode_GET_SIZE(ustr));
    return codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
}
|
|
|
|
/* latin_1_encode(unicode[, errors]) -> (string, length)

   Encodes the Unicode object with PyUnicode_EncodeLatin1().  The second
   tuple item is PyUnicode_GET_SIZE() of the input. */
static PyObject *
latin_1_encode(PyObject *self, PyObject *args)
{
    PyObject *ustr;
    const char *errors = NULL;
    PyObject *encoded;

    if (!PyArg_ParseTuple(args, "U|z:latin_1_encode", &ustr, &errors))
        return NULL;

    encoded = PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(ustr),
                                     PyUnicode_GET_SIZE(ustr),
                                     errors);
    return codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
}
|
|
|
|
/* ascii_encode(unicode[, errors]) -> (string, length)

   Encodes the Unicode object with PyUnicode_EncodeASCII().  The second
   tuple item is PyUnicode_GET_SIZE() of the input. */
static PyObject *
ascii_encode(PyObject *self, PyObject *args)
{
    PyObject *ustr;
    const char *errors = NULL;
    PyObject *encoded;

    if (!PyArg_ParseTuple(args, "U|z:ascii_encode", &ustr, &errors))
        return NULL;

    encoded = PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(ustr),
                                    PyUnicode_GET_SIZE(ustr),
                                    errors);
    return codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
}
|
|
|
|
/* charmap_encode(unicode[, errors[, mapping]]) -> (string, length)

   Encodes the Unicode object with PyUnicode_EncodeCharmap().  A missing
   mapping and an explicit None are treated alike: both reach the
   encoder as NULL.  The second tuple item is PyUnicode_GET_SIZE() of
   the input. */
static PyObject *
charmap_encode(PyObject *self, PyObject *args)
{
    PyObject *ustr;
    const char *errors = NULL;
    PyObject *mapping = NULL;
    PyObject *encoded;

    if (!PyArg_ParseTuple(args, "U|zO:charmap_encode",
                          &ustr, &errors, &mapping))
        return NULL;

    encoded = PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(ustr),
                                      PyUnicode_GET_SIZE(ustr),
                                      (mapping == Py_None) ? NULL : mapping,
                                      errors);
    return codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
}
|
|
|
|
#ifdef MS_WIN32
|
|
|
|
/* mbcs_encode(unicode[, errors]) -> (string, length)

   Encodes the Unicode object with PyUnicode_EncodeMBCS().  The second
   tuple item is PyUnicode_GET_SIZE() of the input.  Only compiled when
   MS_WIN32 is defined. */
static PyObject *
mbcs_encode(PyObject *self, PyObject *args)
{
    PyObject *ustr;
    const char *errors = NULL;
    PyObject *encoded;

    if (!PyArg_ParseTuple(args, "U|z:mbcs_encode", &ustr, &errors))
        return NULL;

    encoded = PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(ustr),
                                   PyUnicode_GET_SIZE(ustr),
                                   errors);
    return codec_tuple(encoded, PyUnicode_GET_SIZE(ustr));
}
|
|
|
|
#endif /* MS_WIN32 */
|
|
|
|
/* --- Module API --------------------------------------------------------- */
|
|
|
|
static PyMethodDef _codecs_functions[] = {
|
|
{"register", codecregister, 1},
|
|
{"lookup", codeclookup, 1},
|
|
{"utf_8_encode", utf_8_encode, 1},
|
|
{"utf_8_decode", utf_8_decode, 1},
|
|
{"utf_16_encode", utf_16_encode, 1},
|
|
{"utf_16_le_encode", utf_16_le_encode, 1},
|
|
{"utf_16_be_encode", utf_16_be_encode, 1},
|
|
{"utf_16_decode", utf_16_decode, 1},
|
|
{"utf_16_le_decode", utf_16_le_decode, 1},
|
|
{"utf_16_be_decode", utf_16_be_decode, 1},
|
|
{"utf_16_ex_decode", utf_16_ex_decode, 1},
|
|
{"unicode_escape_encode", unicode_escape_encode, 1},
|
|
{"unicode_escape_decode", unicode_escape_decode, 1},
|
|
{"unicode_internal_encode", readbuffer_encode, 1},
|
|
{"unicode_internal_decode", unicode_internal_decode, 1},
|
|
{"raw_unicode_escape_encode", raw_unicode_escape_encode, 1},
|
|
{"raw_unicode_escape_decode", raw_unicode_escape_decode, 1},
|
|
{"latin_1_encode", latin_1_encode, 1},
|
|
{"latin_1_decode", latin_1_decode, 1},
|
|
{"ascii_encode", ascii_encode, 1},
|
|
{"ascii_decode", ascii_decode, 1},
|
|
{"charmap_encode", charmap_encode, 1},
|
|
{"charmap_decode", charmap_decode, 1},
|
|
{"readbuffer_encode", readbuffer_encode, 1},
|
|
{"charbuffer_encode", charbuffer_encode, 1},
|
|
#ifdef MS_WIN32
|
|
{"mbcs_encode", mbcs_encode, 1},
|
|
{"mbcs_decode", mbcs_decode, 1},
|
|
#endif
|
|
{NULL, NULL} /* sentinel */
|
|
};
|
|
|
|
DL_EXPORT(void)
|
|
init_codecs()
|
|
{
|
|
Py_InitModule("_codecs", _codecs_functions);
|
|
}
|