Issue #3672: Reject surrogates in utf-8 codec; add surrogates error

handler.
This commit is contained in:
Martin v. Löwis 2009-05-02 18:52:14 +00:00
parent 02953d244f
commit db12d454e6
9 changed files with 202 additions and 21 deletions

View file

@ -748,6 +748,85 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
}
}
PyObject *PyCodec_SurrogateErrors(PyObject *exc)
{
PyObject *restuple;
PyObject *object;
Py_ssize_t start;
Py_ssize_t end;
PyObject *res;
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
Py_UNICODE *p;
Py_UNICODE *startp;
char *outp;
if (PyUnicodeEncodeError_GetStart(exc, &start))
return NULL;
if (PyUnicodeEncodeError_GetEnd(exc, &end))
return NULL;
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
startp = PyUnicode_AS_UNICODE(object);
res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
if (!res) {
Py_DECREF(object);
return NULL;
}
outp = PyBytes_AsString(res);
for (p = startp+start; p < startp+end; p++) {
Py_UNICODE ch = *p;
if (ch < 0xd800 || ch > 0xdfff) {
/* Not a surrogate, fail with original exception */
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
Py_DECREF(res);
Py_DECREF(object);
return NULL;
}
*outp++ = (char)(0xe0 | (ch >> 12));
*outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*outp++ = (char)(0x80 | (ch & 0x3f));
}
restuple = Py_BuildValue("(On)", res, end);
Py_DECREF(res);
Py_DECREF(object);
return restuple;
}
else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
unsigned char *p;
Py_UNICODE ch = 0;
if (PyUnicodeDecodeError_GetStart(exc, &start))
return NULL;
if (!(object = PyUnicodeDecodeError_GetObject(exc)))
return NULL;
if (!(p = (unsigned char*)PyBytes_AsString(object))) {
Py_DECREF(object);
return NULL;
}
/* Try decoding a single surrogate character. If
there are more, let the codec call us again. */
p += start;
if ((p[0] & 0xf0) == 0xe0 ||
(p[1] & 0xc0) == 0x80 ||
(p[2] & 0xc0) == 0x80) {
/* it's a three-byte code */
ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
if (ch < 0xd800 || ch > 0xdfff)
/* it's not a surrogate - fail */
ch = 0;
}
Py_DECREF(object);
if (ch == 0) {
PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
return NULL;
}
return Py_BuildValue("(u#n)", &ch, 1, start+3);
}
else {
wrong_exception_type(exc);
return NULL;
}
}
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
return PyCodec_StrictErrors(exc);
@ -777,6 +856,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
return PyCodec_BackslashReplaceErrors(exc);
}
static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
{
return PyCodec_SurrogateErrors(exc);
}
static int _PyCodecRegistry_Init(void)
{
static struct {
@ -823,6 +907,14 @@ static int _PyCodecRegistry_Init(void)
backslashreplace_errors,
METH_O
}
},
{
"surrogates",
{
"surrogates",
surrogates_errors,
METH_O
}
}
};