cpython/Modules/_iconv_codec.c
Walter Dörwald 2e0b18af30 Change the treatment of positions returned by PEP293
error handlers in the Unicode codecs: Negative
positions are treated as being relative to the end of
the input and out of bounds positions result in an
IndexError.

Also update the PEP and include an explanation of
this in the documentation for codecs.register_error.

Fixes a small bug in iconv_codecs: if the position
from the callback is negative *add* it to the size
instead of subtracting it.

From SF patch #677429.
2003-01-31 17:19:08 +00:00

707 lines
24 KiB
C

/*
* _iconv_codec.c
*
* libiconv adaptor for Python iconvcodec
*
* Author : Hye-Shik Chang <perky@FreeBSD.org>
* Created : 17 January 2003
*/
#include "Python.h"
#include <string.h>
#include <iconv.h>
/* Revision keyword string; exported as the module's `__version__'
 * constant by init_iconv_codec(). */
static const char *__version__ = "$Revision$";

/*
 * Select the iconv encoding name that matches Python's internal
 * Py_UNICODE representation:
 *
 *   UNICODE_ENCODING      name handed to iconv_open() for the Py_UNICODE
 *                         side of every conversion
 *   MBENCODED_LENGTH_MAX  per-character factor used to size the initial
 *                         output buffer when encoding
 *
 * A different name is used depending on whether __GNU_LIBRARY__ is
 * defined.  Whether the selected encoding actually yields native byte
 * order is probed at module initialisation (see `byteswap').
 */
#if Py_USING_UNICODE
# if Py_UNICODE_SIZE == 2
#  ifdef __GNU_LIBRARY__
#   define UNICODE_ENCODING "ucs-2"
#  else
#   define UNICODE_ENCODING "ucs-2-internal"
#  endif
#  define MBENCODED_LENGTH_MAX 4
# elif Py_UNICODE_SIZE == 4
#  ifdef __GNU_LIBRARY__
#   define UNICODE_ENCODING "ucs-4"
#  else
#   define UNICODE_ENCODING "ucs-4-internal"
#  endif
#  define MBENCODED_LENGTH_MAX 6
# else
   /* Fail here with a clear message instead of later with an
    * "undeclared UNICODE_ENCODING" error. */
#  error "unsupported Py_UNICODE_SIZE (expected 2 or 4)"
# endif
#else
# error "Unicode is not available"
#endif
/*
 * Instance layout of an iconvcodec object.  The two iconv handles hold
 * (iconv_t)-1 while unopened; `encoding' is a PyMem_Malloc'ed,
 * NUL-terminated copy of the encoding name (NULL until set).
 */
typedef struct {
    PyObject_HEAD
    iconv_t enchdl;     /* UNICODE_ENCODING -> encoding (used by encode) */
    iconv_t dechdl;     /* encoding -> UNICODE_ENCODING (used by decode) */
    char *encoding;     /* name of the external encoding */
} iconvcodecObject;
/* Docstring installed as tp_doc of the iconvcodec type. */
PyDoc_STRVAR(iconvcodec_doc, "iconvcodec object");
/* Forward declaration; the type object is defined after its methods. */
staticforward PyTypeObject iconvcodec_Type;
/* Does the chosen internal encoding require
 * byteswapping to get native endianness?
 * 0=no, 1=yes, -1=unknown
 *
 * -1 is only the initial value: init_iconv_codec() probes iconv and
 * stores 0 or 1 (or aborts via Py_FatalError).  The encode and decode
 * methods test this flag for truth, so -1 would be treated like 1. */
static int byteswap = -1;
#define ERROR_STRICT (PyObject *)(1)
#define ERROR_IGNORE (PyObject *)(2)
#define ERROR_REPLACE (PyObject *)(3)
#define ERROR_MAX ERROR_REPLACE
#define REPLACEMENT_CHAR_DECODE 0xFFFD
#define REPLACEMENT_CHAR_ENCODE '?'
#define DEFAULT_ENCODING "utf-8"
/*
 * Translate an `errors' argument into an error handler.
 *
 * errors: handler name, or NULL (treated like "strict").
 *
 * Returns one of the ERROR_STRICT / ERROR_IGNORE / ERROR_REPLACE sentinels
 * for the three built-in names; for any other name returns whatever
 * PyCodec_LookupError() returns (NULL when the lookup fails).  Callers in
 * this file Py_DECREF a result that compares greater than ERROR_MAX.
 */
static PyObject *
get_errorcallback(const char *errors)
{
    if (errors == NULL)
        return ERROR_STRICT;
    if (strcmp(errors, "strict") == 0)
        return ERROR_STRICT;
    if (strcmp(errors, "ignore") == 0)
        return ERROR_IGNORE;
    if (strcmp(errors, "replace") == 0)
        return ERROR_REPLACE;

    /* Not a built-in scheme: ask the codec error registry. */
    return PyCodec_LookupError(errors);
}
/* Docstring of iconvcodec.encode (referenced from iconvcodec_methods). */
PyDoc_STRVAR(iconvcodec_encode__doc__,
    "I.encode(unicode, [,errors]) -> (string, length consumed)\n"
    "\n"
    "Return an encoded string version of `unicode'. errors may be given to\n"
    "set a different error handling scheme. Default is 'strict' meaning that\n"
    "encoding errors raise a UnicodeEncodeError. Other possible values are\n"
    "'ignore', 'replace' and 'xmlcharrefreplace' as well as any other name\n"
    "registered with codecs.register_error that can handle UnicodeEncodeErrors.");
/*
 * iconvcodec.encode(input [, errors]) -> (string, length consumed)
 *
 * Converts the Py_UNICODE buffer `input' to self->encoding using the
 * self->enchdl iconv handle.
 *
 * Arguments (parsed with "u#|s"):
 *   input   unicode data to encode
 *   errors  error handling scheme name; NULL means strict
 *
 * Returns a new 2-tuple (encoded string, number of input characters), or
 * NULL with an exception set.
 *
 * Error handling per failing iconv() call:
 *   E2BIG            grow the output buffer and retry
 *   ignore/replace   skip one Py_UNICODE (writing '?' for replace); stop
 *                    when iconv reported EINVAL
 *   strict           raise UnicodeEncodeError
 *   callback         call it with the exception object; it must return
 *                    (unicode, int).  The unicode part is encoded with
 *                    self->encoding and appended, and conversion resumes at
 *                    the returned position (negative values are relative to
 *                    the end; out of range raises IndexError).
 */
static PyObject *
iconvcodec_encode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
{
    static char *kwlist[] = { "input", "errors", NULL };
    Py_UNICODE *input;
    int inputlen;
    char *errors = NULL/*strict*/, *out, *out_top;
    const char *inp, *inp_top;
    size_t inplen, inplen_total, outlen, outlen_total, estep;
    PyObject *outputobj = NULL, *errorcb = NULL, *exceptionobj = NULL;
    /* Temporaries owned by the error-callback path.  They live at function
     * scope so that every `goto errorexit' -- including the one hidden in
     * RESIZE_OUTBUFFER -- releases them. */
    PyObject *retobj = NULL, *retstr = NULL;
    Py_UNICODE *swappedinput = NULL;
    int swapi;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "u#|s:encode",
                                     kwlist, &input, &inputlen, &errors))
        return NULL; /* TypeError */

    errorcb = get_errorcallback(errors);
    if (errorcb == NULL)
        return NULL; /* LookupError or something else from error handler */

    inp = inp_top = (char *)input;
    inplen = inplen_total = (size_t)inputlen * Py_UNICODE_SIZE;

    outlen = (size_t)inputlen * MBENCODED_LENGTH_MAX;
    if (outlen < 16)
        outlen = 16; /* for iso-2022 codecs */

    outputobj = PyString_FromStringAndSize(NULL, outlen);
    if (outputobj == NULL)
        goto errorexit; /* errorcb may be a real object that must be released */
    out = out_top = PyString_AS_STRING(outputobj);
    outlen_total = outlen;

    /* amount by which the output buffer grows on E2BIG */
    estep = inputlen * Py_UNICODE_SIZE / 2;

/* Grow the output string by `size' bytes and re-base out/out_top.  The
 * write offset is captured before the resize because the old buffer
 * pointer is invalid afterwards. */
#define RESIZE_OUTBUFFER(size) do {                                     \
        size_t toadd = (size);                                          \
        size_t used = (size_t)(out - out_top);                          \
        outlen_total += toadd;                                          \
        outlen += toadd;                                                \
        if (_PyString_Resize(&outputobj, outlen_total) == -1)           \
            goto errorexit;                                             \
        out_top = PyString_AS_STRING(outputobj);                        \
        out = out_top + used;                                           \
    } while (0)

    if (byteswap) {
        swappedinput = PyMem_Malloc(inplen);
        if (swappedinput == NULL) {
            PyErr_NoMemory();
            goto errorexit;
        }
        for (swapi = 0; swapi < inputlen; ++swapi) {
            /* Swap arithmetically on an unsigned value: independent of
             * host byte order and of the signedness of plain char. */
            unsigned long u = (unsigned long)input[swapi];
#if Py_UNICODE_SIZE == 2
            u = ((u & 0x00FFUL) << 8) | ((u >> 8) & 0x00FFUL);
#else
            u = ((u & 0x000000FFUL) << 24) | ((u & 0x0000FF00UL) << 8) |
                ((u >> 8) & 0x0000FF00UL) | ((u >> 24) & 0x000000FFUL);
#endif
            swappedinput[swapi] = (Py_UNICODE)u;
        }
        inp = inp_top = (char *)swappedinput;
    }

    while (inplen > 0) {
        char reason[128];
        int errpos;
        int iconv_errno;

        if (iconv(self->enchdl, (char**)&inp, &inplen, &out, &outlen)
                != (size_t)-1)
            break; /* everything converted */

        /* Save errno now: the Python API calls below may change it. */
        iconv_errno = errno;

        if (iconv_errno == E2BIG) {
            RESIZE_OUTBUFFER(estep);
            continue;
        }

        if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
            /* skip the offending character */
            inplen -= Py_UNICODE_SIZE;
            inp += Py_UNICODE_SIZE;
            if (errorcb == ERROR_REPLACE) {
                if (outlen < 1)
                    RESIZE_OUTBUFFER(iconv_errno == EINVAL ? 1 : estep);
                outlen--;
                *out++ = REPLACEMENT_CHAR_ENCODE;
            }
            if (iconv_errno == EINVAL)
                break;
            else
                continue;
        }

        errpos = (int)(inp - inp_top) / Py_UNICODE_SIZE;
        /* Report the character from the caller's buffer: `inp' may point
         * into the byte-swapped copy. */
        sprintf(reason, "Undefined character map from "
#if Py_UNICODE_SIZE == 2
                "\\u%04x"
#elif Py_UNICODE_SIZE == 4
                "\\u%08x"
#endif
                , (unsigned int)input[errpos]);

        if (exceptionobj == NULL) {
            if ((exceptionobj = PyUnicodeEncodeError_Create(
                        self->encoding, input, inputlen,
                        errpos, errpos + 1, reason)) == NULL)
                goto errorexit;
        } else {
            /* reuse the exception object from an earlier error */
            if (PyUnicodeEncodeError_SetStart(exceptionobj, errpos) != 0)
                goto errorexit;
            if (PyUnicodeEncodeError_SetEnd(exceptionobj, errpos + 1) != 0)
                goto errorexit;
            if (PyUnicodeEncodeError_SetReason(exceptionobj, reason) != 0)
                goto errorexit;
        }

        if (errorcb == ERROR_STRICT) {
            PyCodec_StrictErrors(exceptionobj);
            goto errorexit;
        } else {
            PyObject *argsobj, *retuni;
            long newpos;

            argsobj = PyTuple_New(1);
            if (argsobj == NULL)
                goto errorexit;
            PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
            Py_INCREF(exceptionobj); /* the tuple stole our reference */

            retobj = PyObject_CallObject(errorcb, argsobj);
            Py_DECREF(argsobj);
            if (retobj == NULL)
                goto errorexit;

            if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
                !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
                !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
                PyErr_SetString(PyExc_ValueError, "encoding error handler "
                                "must return (unicode, int) tuple");
                goto errorexit;
            }

            if (PyUnicode_GET_SIZE(retuni) > 0) {
                size_t retstrsize;

                retstr = PyUnicode_AsEncodedString(
                                retuni, self->encoding, NULL);
                if (retstr == NULL || !PyString_Check(retstr))
                    goto errorexit;

                retstrsize = (size_t)PyString_GET_SIZE(retstr);
                if (outlen < retstrsize)
                    RESIZE_OUTBUFFER(
                        iconv_errno == EINVAL || retstrsize > estep
                            ? retstrsize - outlen : estep);

                memcpy(out, PyString_AS_STRING(retstr), retstrsize);
                out += retstrsize;
                outlen -= retstrsize;

                Py_DECREF(retstr);
                retstr = NULL;
            }

            newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
            Py_DECREF(retobj);
            retobj = NULL;

            if (newpos < 0)
                newpos = inputlen + newpos; /* relative to the end */
            if (newpos < 0 || newpos > inputlen) {
                PyErr_Format(PyExc_IndexError, "position %ld from error handler"
                             " out of bounds", newpos);
                goto errorexit;
            }
            if (newpos == inputlen)
                break;
            inp = inp_top + Py_UNICODE_SIZE * newpos;
            inplen = inplen_total - Py_UNICODE_SIZE * newpos;
        }
    }
#undef RESIZE_OUTBUFFER

    {
        PyObject *rettup, *consumed;
        int finalsize;

        /* trim the output string to the bytes actually written */
        finalsize = (int)(out - out_top);
        if ((size_t)finalsize != outlen_total) {
            if (_PyString_Resize(&outputobj, finalsize) == -1)
                goto errorexit;
        }

        consumed = PyInt_FromLong(inputlen);
        if (consumed == NULL)
            goto errorexit;
        rettup = PyTuple_New(2);
        if (rettup == NULL) {
            Py_DECREF(consumed);
            goto errorexit;
        }
        PyTuple_SET_ITEM(rettup, 0, outputobj);
        PyTuple_SET_ITEM(rettup, 1, consumed);

        if (errorcb > ERROR_MAX) {
            Py_DECREF(errorcb);
        }
        Py_XDECREF(exceptionobj);
        PyMem_Free(swappedinput); /* no-op when NULL */
        return rettup;
    }

errorexit:
    Py_XDECREF(retobj);
    Py_XDECREF(retstr);
    Py_XDECREF(outputobj);
    if (errorcb > ERROR_MAX) {
        Py_DECREF(errorcb);
    }
    Py_XDECREF(exceptionobj);
    PyMem_Free(swappedinput); /* no-op when NULL */
    return NULL;
}
/* Docstring of iconvcodec.decode (referenced from iconvcodec_methods). */
PyDoc_STRVAR(iconvcodec_decode__doc__,
    "I.decode(string, [,errors]) -> (unicodeobject, length consumed)\n"
    "\n"
    "Decodes `string' using I, an iconvcodec instance. errors may be given\n"
    "to set a different error handling scheme. Default is 'strict' meaning\n"
    "that encoding errors raise a UnicodeDecodeError. Other possible values\n"
    "are 'ignore' and 'replace' as well as any other name registerd with\n"
    "codecs.register_error that is able to handle UnicodeDecodeErrors.");
/*
 * iconvcodec.decode(input [, errors]) -> (unicodeobject, length consumed)
 *
 * Converts the byte string `input' from self->encoding to Py_UNICODE using
 * the self->dechdl iconv handle.  When `byteswap' is set, every chunk
 * written by iconv() is byte-swapped in place.
 *
 * Arguments (parsed with "s#|s"):
 *   input   string data to decode
 *   errors  error handling scheme name; NULL means strict
 *
 * Returns a new 2-tuple (unicode object, number of input bytes), or NULL
 * with an exception set.
 *
 * Error handling per failing iconv() call:
 *   E2BIG            grow the output buffer and retry
 *   ignore/replace   skip one input byte (writing U+FFFD for replace); stop
 *                    when iconv reported EINVAL
 *   strict           raise UnicodeDecodeError
 *   callback         call it with the exception object; it must return
 *                    (unicode, int).  The unicode part is appended to the
 *                    output and conversion resumes at the returned position
 *                    (negative values are relative to the end; out of range
 *                    raises IndexError).
 */
static PyObject *
iconvcodec_decode(iconvcodecObject *self, PyObject *args, PyObject *kwargs)
{
    static char *kwlist[] = { "input", "errors", NULL };
    char *errors = NULL/*strict*/, *out, *out_top;
    const char *inp, *inp_top;
    int inplen_int;
    size_t inplen, inplen_total, outlen, outlen_total, estep;
    PyObject *outputobj = NULL, *errorcb = NULL, *exceptionobj = NULL;
    /* Result of the error callback; at function scope so that every
     * `goto errorexit' (including the one in RESIZE_OUTBUFFER) releases it. */
    PyObject *retobj = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s#|s:decode",
                                     kwlist, &inp, &inplen_int, &errors))
        return NULL; /* TypeError */

    errorcb = get_errorcallback(errors);
    if (errorcb == NULL)
        return NULL; /* LookupError or something else from error handler */

    inp_top = inp;
    inplen_total = inplen = (size_t)inplen_int;

    /* start with one Py_UNICODE per input byte */
    outputobj = PyUnicode_FromUnicode(NULL, inplen);
    if (outputobj == NULL)
        goto errorexit; /* errorcb may be a real object that must be released */
    outlen_total = outlen = PyUnicode_GET_DATA_SIZE(outputobj);
    out = out_top = (char *)PyUnicode_AS_UNICODE(outputobj);

    /* amount (in bytes) by which the output buffer grows on E2BIG */
    estep = outlen / 2;

/* Grow the output buffer by `size' bytes and re-base out/out_top.  The
 * write offset is captured before the resize because the old buffer
 * pointer is invalid afterwards. */
#define RESIZE_OUTBUFFER(size) do {                                           \
        size_t toadd = (size);                                                \
        size_t used = (size_t)(out - out_top);                                \
        outlen_total += toadd;                                                \
        outlen += toadd;                                                      \
        if (PyUnicode_Resize(&outputobj, outlen_total/Py_UNICODE_SIZE) == -1) \
            goto errorexit;                                                   \
        out_top = (char *)PyUnicode_AS_UNICODE(outputobj);                    \
        out = out_top + used;                                                 \
    } while (0)

    while (inplen > 0) {
        char *oldout = out;
        /* iconv() returns size_t; keep the full width so that the
         * (size_t)-1 failure value is recognised on every platform. */
        size_t res = iconv(self->dechdl, (char**)&inp, &inplen, &out, &outlen);
        /* Save errno now: the Python API calls below may change it. */
        int iconv_errno = errno;
        char reason[128], *reasonpos = (char *)reason;
        int errpos;

        if (byteswap) {
            /* swap whatever this iconv() call produced, even on failure */
            while (oldout < out) {
                char c0 = oldout[0];
#if Py_UNICODE_SIZE == 2
                oldout[0] = oldout[1];
                oldout[1] = c0;
#else
                char c1 = oldout[1];
                oldout[0] = oldout[3];
                oldout[1] = oldout[2];
                oldout[2] = c1;
                oldout[3] = c0;
#endif
                oldout += sizeof(Py_UNICODE);
            }
        }

        if (res != (size_t)-1)
            break; /* everything converted */

        if (iconv_errno == E2BIG) {
            RESIZE_OUTBUFFER(estep);
            continue;
        }

        if (errorcb == ERROR_IGNORE || errorcb == ERROR_REPLACE) {
            /* skip one offending byte */
            inplen--; inp++;
            if (errorcb == ERROR_REPLACE) {
                Py_UNICODE *replp;

                if (outlen < Py_UNICODE_SIZE)
                    RESIZE_OUTBUFFER(
                        iconv_errno == EINVAL || Py_UNICODE_SIZE > estep
                            ? Py_UNICODE_SIZE : estep);

                /* some compilers hate casted lvalue */
                replp = (Py_UNICODE *)out;
                assert((long)replp % Py_UNICODE_SIZE == 0);/* aligned? */
                *replp = REPLACEMENT_CHAR_DECODE;
                out += Py_UNICODE_SIZE;
                outlen -= Py_UNICODE_SIZE;
            }
            if (iconv_errno == EINVAL)
                break;
            else
                continue;
        }

        errpos = (int)(inp - inp_top);
        /* show up to three bytes of the offending sequence */
        reasonpos += sprintf(reason, "Invalid multibyte sequence \\x%02x",
                             (unsigned char)*inp);
        if (inplen > 1) {
            reasonpos += sprintf(reasonpos,
                                 "\\x%02x", (unsigned char)*(inp+1));
            if (inplen > 2)
                sprintf(reasonpos, "\\x%02x", (unsigned char)*(inp+2));
        }

        if (exceptionobj == NULL) {
            exceptionobj = PyUnicodeDecodeError_Create(
                                self->encoding, inp_top, inplen_total,
                                errpos, errpos + 1, reason);
            if (exceptionobj == NULL)
                goto errorexit;
        } else {
            /* reuse the exception object from an earlier error */
            if (PyUnicodeDecodeError_SetStart(exceptionobj, errpos) != 0)
                goto errorexit;
            if (PyUnicodeDecodeError_SetEnd(exceptionobj, errpos + 1) != 0)
                goto errorexit;
            if (PyUnicodeDecodeError_SetReason(exceptionobj, reason) != 0)
                goto errorexit;
        }

        if (errorcb == ERROR_STRICT) {
            PyCodec_StrictErrors(exceptionobj);
            goto errorexit;
        } else {
            PyObject *argsobj, *retuni;
            long newpos;

            argsobj = PyTuple_New(1);
            if (argsobj == NULL)
                goto errorexit;
            PyTuple_SET_ITEM(argsobj, 0, exceptionobj);
            Py_INCREF(exceptionobj); /* the tuple stole our reference */

            retobj = PyObject_CallObject(errorcb, argsobj);
            Py_DECREF(argsobj);
            if (retobj == NULL)
                goto errorexit;

            if (!PyTuple_Check(retobj) || PyTuple_GET_SIZE(retobj) != 2 ||
                !PyUnicode_Check((retuni = PyTuple_GET_ITEM(retobj, 0))) ||
                !PyInt_Check(PyTuple_GET_ITEM(retobj, 1))) {
                PyErr_SetString(PyExc_ValueError, "decoding error handler "
                                "must return (unicode, int) tuple");
                goto errorexit;
            }

            if (PyUnicode_GET_SIZE(retuni) > 0) {
                size_t retunisize;

                retunisize = PyUnicode_GET_DATA_SIZE(retuni);
                if (outlen < retunisize)
                    RESIZE_OUTBUFFER(
                        iconv_errno == EINVAL || retunisize > estep
                            ? retunisize - outlen : estep);

                memcpy(out, PyUnicode_AS_DATA(retuni), retunisize);
                out += retunisize;
                outlen -= retunisize;
            }

            newpos = PyInt_AS_LONG(PyTuple_GET_ITEM(retobj, 1));
            Py_DECREF(retobj);
            retobj = NULL;

            if (newpos < 0)
                newpos += (long)inplen_total; /* relative to the end */
            if (newpos < 0 || (size_t)newpos > inplen_total) {
                PyErr_Format(PyExc_IndexError, "position %ld from error handler"
                             " out of bounds", newpos);
                goto errorexit;
            }
            if ((size_t)newpos == inplen_total)
                break;
            inp = inp_top + newpos;
            inplen = inplen_total - newpos;
        }
    }
#undef RESIZE_OUTBUFFER

    {
        PyObject *rettup, *consumed;
        int finalsize;

        /* trim the unicode object to the characters actually written */
        finalsize = (int)(out - out_top);
        if ((size_t)finalsize != outlen_total) {
            if (PyUnicode_Resize(&outputobj, finalsize / Py_UNICODE_SIZE) == -1)
                goto errorexit;
        }

        consumed = PyInt_FromLong((long)inplen_total);
        if (consumed == NULL)
            goto errorexit;
        rettup = PyTuple_New(2);
        if (rettup == NULL) {
            Py_DECREF(consumed);
            goto errorexit;
        }
        PyTuple_SET_ITEM(rettup, 0, outputobj);
        PyTuple_SET_ITEM(rettup, 1, consumed);

        if (errorcb > ERROR_MAX) {
            Py_DECREF(errorcb);
        }
        Py_XDECREF(exceptionobj);
        return rettup;
    }

errorexit:
    Py_XDECREF(retobj);
    Py_XDECREF(outputobj);
    if (errorcb > ERROR_MAX) {
        Py_DECREF(errorcb);
    }
    Py_XDECREF(exceptionobj);
    return NULL;
}
/* Method table of the iconvcodec type; both methods accept positional and
 * keyword arguments. */
static struct PyMethodDef iconvcodec_methods[] = {
    {
        "encode",
        (PyCFunction)iconvcodec_encode,
        METH_VARARGS | METH_KEYWORDS,
        iconvcodec_encode__doc__
    },
    {
        "decode",
        (PyCFunction)iconvcodec_decode,
        METH_VARARGS | METH_KEYWORDS,
        iconvcodec_decode__doc__
    },
    { NULL, NULL, 0, NULL }     /* sentinel */
};
/*
 * tp_new of the iconvcodec type.
 *
 * Allocates the instance, copies the encoding name from the instance's
 * `encoding' attribute (falling back to DEFAULT_ENCODING when the
 * attribute lookup fails) and opens one iconv handle per direction.
 * `args' and `kwargs' are not examined.
 *
 * Returns the new object, or NULL with an exception set:
 *   TypeError    `encoding' attribute is not a string
 *   ValueError   iconv_open() rejected the encoding in either direction
 *   MemoryError  the encoding name could not be copied
 *
 * On failure the half-built object is released; iconvcodec_dealloc() copes
 * with unopened handles and a NULL encoding.
 */
static PyObject *
iconvcodec_new(PyTypeObject *type, PyObject *args, PyObject *kwargs)
{
    PyObject *encobj = NULL;
    iconvcodecObject *self = NULL;

    self = (iconvcodecObject *)type->tp_alloc(type, 0);
    if (self == NULL)
        return NULL;

    /* mark everything as "not acquired" for the dealloc on error paths */
    self->encoding = NULL;
    self->enchdl = self->dechdl = (iconv_t)(-1);

    encobj = PyObject_GetAttrString((PyObject *)self, "encoding");
    if (encobj == NULL) {
        /* no such attribute: fall back to the default */
        PyErr_Clear();
        self->encoding = PyMem_Malloc(sizeof(DEFAULT_ENCODING));
        if (self->encoding == NULL) {
            PyErr_NoMemory();
            goto errorexit;
        }
        strcpy(self->encoding, DEFAULT_ENCODING);
    } else if (!PyString_Check(encobj)) {
        Py_DECREF(encobj);
        PyErr_SetString(PyExc_TypeError,
                        "`encoding' attribute must be a string.");
        goto errorexit;
    } else {
        self->encoding = PyMem_Malloc(PyString_GET_SIZE(encobj) + 1);
        if (self->encoding == NULL) {
            Py_DECREF(encobj);
            PyErr_NoMemory();
            goto errorexit;
        }
        strcpy(self->encoding, PyString_AS_STRING(encobj));
        Py_DECREF(encobj);
    }

    self->dechdl = iconv_open(UNICODE_ENCODING, self->encoding);
    if (self->dechdl == (iconv_t)(-1)) {
        PyErr_SetString(PyExc_ValueError, "unsupported decoding");
        goto errorexit;
    }

    self->enchdl = iconv_open(self->encoding, UNICODE_ENCODING);
    if (self->enchdl == (iconv_t)(-1)) {
        PyErr_SetString(PyExc_ValueError, "unsupported encoding");
        iconv_close(self->dechdl);
        self->dechdl = (iconv_t)(-1);
        goto errorexit;
    }

    return (PyObject *)self;

errorexit:
    Py_XDECREF(self);
    return NULL;
}
/*
 * tp_dealloc of the iconvcodec type: closes whichever iconv handles were
 * opened, frees the encoding name copy if there is one, then releases the
 * object through its type's tp_free.
 */
static void
iconvcodec_dealloc(iconvcodecObject *self)
{
    const iconv_t unopened = (iconv_t)-1;

    if (self->enchdl != unopened) {
        iconv_close(self->enchdl);
    }
    if (self->dechdl != unopened) {
        iconv_close(self->dechdl);
    }
    if (self->encoding != NULL) {
        PyMem_Free(self->encoding);
    }

    self->ob_type->tp_free((PyObject *)self);
}
/* tp_repr of the iconvcodec type: "<iconvcodec encoding='NAME'>". */
static PyObject *
iconvcodec_repr(PyObject *self)
{
    iconvcodecObject *codec = (iconvcodecObject *)self;

    return PyString_FromFormat("<iconvcodec encoding='%s'>", codec->encoding);
}
/*
 * Type object for iconvcodec.  The initializer is positional: each value
 * fills the slot named in the comment beside it, so entries must not be
 * reordered.  The type allows subclassing (Py_TPFLAGS_BASETYPE);
 * iconvcodec_new() reads the `encoding' attribute from the new instance.
 */
statichere PyTypeObject iconvcodec_Type = {
PyObject_HEAD_INIT(&PyType_Type)
0, /* Number of items for varobject */
"iconvcodec", /* Name of this type */
sizeof(iconvcodecObject), /* Basic object size */
0, /* Item size for varobject */
(destructor)iconvcodec_dealloc, /* tp_dealloc */
0, /* tp_print */
0, /* tp_getattr */
0, /* tp_setattr */
0, /* tp_compare */
iconvcodec_repr, /* tp_repr */
0, /* tp_as_number */
0, /* tp_as_sequence */
0, /* tp_as_mapping */
0, /* tp_hash */
0, /* tp_call */
0, /* tp_str */
PyObject_GenericGetAttr, /* tp_getattro */
0, /* tp_setattro */
0, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
iconvcodec_doc, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
iconvcodec_methods, /* tp_methods */
0, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
0, /* tp_init */
PyType_GenericAlloc, /* tp_alloc */
iconvcodec_new, /* tp_new */
PyObject_Del, /* tp_free */
};
/* Module-level function table: the module defines no functions, so this
 * holds only the terminating sentinel. */
static struct PyMethodDef _iconv_codec_methods[] = {
    { NULL, NULL, 0, NULL }     /* sentinel */
};
/*
 * Module initialisation for `_iconv_codec'.
 *
 * First probes iconv: the single byte 0x01 is converted from "ASCII" to
 * UNICODE_ENCODING and the resulting Py_UNICODE is inspected to decide
 * whether iconv's output is in native byte order (byteswap = 0) or
 * reversed (byteswap = 1).  Any other result, or a failing iconv call,
 * aborts the interpreter via Py_FatalError.
 *
 * Then creates the module and adds `__version__', the `iconvcodec' type
 * and `internal_encoding'.
 */
void
init_iconv_codec(void)
{
    PyObject *m;
    char in = 1;
    char *inptr = &in;
    /* iconv() takes size_t * for both byte counts */
    size_t insize = 1;
    Py_UNICODE out = 0;
    char *outptr = (char *)&out;
    size_t outsize = sizeof(out);
    size_t res;
    iconv_t hdl = iconv_open(UNICODE_ENCODING, "ASCII");

    if (hdl == (iconv_t)-1)
        Py_FatalError("can't initialize the _iconv_codec module: iconv_open() failed");

    res = iconv(hdl, &inptr, &insize, &outptr, &outsize);
    if (res == (size_t)-1)
        Py_FatalError("can't initialize the _iconv_codec module: iconv() failed");

    /* Check whether iconv() returned native endianness or not for the chosen encoding */
    if (out == 0x1)
        byteswap = 0;
#if Py_UNICODE_SIZE == 2
    else if (out == 0x0100)
#else
    else if (out == 0x01000000)
#endif
        byteswap = 1;
    else
        Py_FatalError("can't initialize the _iconv_codec module: mixed endianess");
    iconv_close(hdl);

    m = Py_InitModule("_iconv_codec", _iconv_codec_methods);
    if (m == NULL)
        return; /* exception already set; do not touch a NULL module */

    PyModule_AddStringConstant(m, "__version__", (char*)__version__);
    Py_INCREF(&iconvcodec_Type);
    PyModule_AddObject(m, "iconvcodec", (PyObject *)(&iconvcodec_Type));
    PyModule_AddStringConstant(m, "internal_encoding", UNICODE_ENCODING);

    if (PyErr_Occurred())
        Py_FatalError("can't initialize the _iconv_codec module");
}
/*
* ex: ts=8 sts=4 et
* $Id$
*/