Issue #3672: Reject surrogates in utf-8 codec; add surrogates error handler.
This commit is contained in:
Martin v. Löwis 2009-05-02 18:52:14 +00:00
parent 02953d244f
commit db12d454e6
9 changed files with 202 additions and 21 deletions

View file

@ -748,6 +748,85 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
}
}
/* Codec error callback for lone surrogate code points (0xD800..0xDFFF).

   exc must be a UnicodeEncodeError or UnicodeDecodeError instance.

   Encoding: every code unit in [start, end) of the failing string must be
   a surrogate; each is written out as a three-byte sequence and the
   result is the tuple (bytes, end).

   Decoding: if the three bytes at `start` form a three-byte sequence whose
   value is a surrogate, the result is the tuple (one-character string,
   start+3).

   In every other case the original exception is re-raised (or a
   wrong-exception-type error is reported through wrong_exception_type())
   and NULL is returned.  Returns a new reference on success. */
PyObject *PyCodec_SurrogateErrors(PyObject *exc)
{
    PyObject *restuple;
    PyObject *object;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;
    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        Py_UNICODE *p;
        Py_UNICODE *startp;
        char *outp;
        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        startp = PyUnicode_AS_UNICODE(object);
        /* Each surrogate expands to exactly three output bytes. */
        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
        if (!res) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyBytes_AsString(res);
        for (p = startp+start; p < startp+end; p++) {
            Py_UNICODE ch = *p;
            if (ch < 0xd800 || ch > 0xdfff) {
                /* Not a surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(res);
                Py_DECREF(object);
                return NULL;
            }
            /* Three-byte layout: 1110xxxx 10xxxxxx 10xxxxxx */
            *outp++ = (char)(0xe0 | (ch >> 12));
            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *outp++ = (char)(0x80 | (ch & 0x3f));
        }
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        Py_DECREF(object);
        return restuple;
    }
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *p;
        Py_ssize_t size;
        Py_UNICODE ch = 0;
        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
            Py_DECREF(object);
            return NULL;
        }
        /* PyBytes_AsString succeeded, so object is a bytes object and the
           size macro is safe to use. */
        size = PyBytes_GET_SIZE(object);
        /* Try decoding a single surrogate character. If
           there are more, let the codec call us again.
           Only look at p[1] and p[2] when three bytes are actually
           available at `start`; otherwise we would read past the end of
           the input. */
        if (start >= 0 && size - start >= 3) {
            p += start;
            /* All three bytes must match the pattern
               1110xxxx 10xxxxxx 10xxxxxx; a single matching byte is not
               enough to call this a three-byte code. */
            if ((p[0] & 0xf0) == 0xe0 &&
                (p[1] & 0xc0) == 0x80 &&
                (p[2] & 0xc0) == 0x80) {
                /* it's a three-byte code */
                ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
                if (ch < 0xd800 || ch > 0xdfff)
                    /* it's not a surrogate - fail */
                    ch = 0;
            }
        }
        Py_DECREF(object);
        if (ch == 0) {
            /* Nothing we can handle: re-raise the original exception. */
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        return Py_BuildValue("(u#n)", &ch, 1, start+3);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
static PyObject *strict_errors(PyObject *self, PyObject *exc)
{
return PyCodec_StrictErrors(exc);
@ -777,6 +856,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
return PyCodec_BackslashReplaceErrors(exc);
}
/* Callable wrapper for the "surrogates" error handler: hands the exception
   object to PyCodec_SurrogateErrors() and returns whatever it produces.
   The self argument is not used. */
static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
{
    PyObject *handled = PyCodec_SurrogateErrors(exc);
    return handled;
}
static int _PyCodecRegistry_Init(void)
{
static struct {
@ -823,6 +907,14 @@ static int _PyCodecRegistry_Init(void)
backslashreplace_errors,
METH_O
}
},
{
"surrogates",
{
"surrogates",
surrogates_errors,
METH_O
}
}
};

View file

@ -312,7 +312,9 @@ w_object(PyObject *v, WFILE *p)
}
else if (PyUnicode_CheckExact(v)) {
PyObject *utf8;
utf8 = PyUnicode_AsUTF8String(v);
utf8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(v),
PyUnicode_GET_SIZE(v),
"surrogates");
if (utf8 == NULL) {
p->depth--;
p->error = WFERR_UNMARSHALLABLE;
@ -810,7 +812,7 @@ r_object(RFILE *p)
retval = NULL;
break;
}
v = PyUnicode_DecodeUTF8(buffer, n, NULL);
v = PyUnicode_DecodeUTF8(buffer, n, "surrogates");
PyMem_DEL(buffer);
retval = v;
break;