mirror of
https://github.com/python/cpython.git
synced 2025-07-24 11:44:31 +00:00
Issue #3672: Reject surrogates in utf-8 codec; add surrogates error
handler.
This commit is contained in:
parent
02953d244f
commit
db12d454e6
9 changed files with 202 additions and 21 deletions
|
@ -748,6 +748,85 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
|
|||
}
|
||||
}
|
||||
|
||||
/* Codec error handler that lets lone surrogate code points (0xD800..0xDFFF)
   pass through a UTF-8 style codec as three-byte sequences.

   exc is expected to be a UnicodeEncodeError or UnicodeDecodeError instance.

   Encoding: every code unit in [start, end) of the offending string must be
   a surrogate; each one is written as three bytes (1110xxxx 10xxxxxx
   10xxxxxx).  Returns a new tuple (bytes, end).

   Decoding: tries to decode exactly one three-byte sequence located at
   `start` in the offending bytes object.  Returns a new tuple
   (one-character string, start+3).

   Returns NULL with an exception set on failure.  When the input is not a
   surrogate (or not a well-formed three-byte sequence), the exception that
   was passed in is set again so the caller sees the original error.  For any
   other exception type, wrong_exception_type() is called and NULL returned. */
PyObject *PyCodec_SurrogateErrors(PyObject *exc)
{
    PyObject *restuple;
    PyObject *object;
    Py_ssize_t start;
    Py_ssize_t end;
    PyObject *res;

    if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
        Py_UNICODE *p;
        Py_UNICODE *startp;
        char *outp;

        if (PyUnicodeEncodeError_GetStart(exc, &start))
            return NULL;
        if (PyUnicodeEncodeError_GetEnd(exc, &end))
            return NULL;
        if (!(object = PyUnicodeEncodeError_GetObject(exc)))
            return NULL;
        startp = PyUnicode_AS_UNICODE(object);
        /* Each surrogate becomes exactly three output bytes. */
        res = PyBytes_FromStringAndSize(NULL, 3*(end-start));
        if (!res) {
            Py_DECREF(object);
            return NULL;
        }
        outp = PyBytes_AsString(res);
        for (p = startp+start; p < startp+end; p++) {
            Py_UNICODE ch = *p;
            if (ch < 0xd800 || ch > 0xdfff) {
                /* Not a surrogate, fail with original exception */
                PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
                Py_DECREF(res);
                Py_DECREF(object);
                return NULL;
            }
            *outp++ = (char)(0xe0 | (ch >> 12));
            *outp++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *outp++ = (char)(0x80 | (ch & 0x3f));
        }
        /* "O" takes its own reference to res, so ours is dropped below. */
        restuple = Py_BuildValue("(On)", res, end);
        Py_DECREF(res);
        Py_DECREF(object);
        return restuple;
    }
    else if (PyObject_IsInstance(exc, PyExc_UnicodeDecodeError)) {
        unsigned char *p;
        Py_UNICODE ch = 0;

        if (PyUnicodeDecodeError_GetStart(exc, &start))
            return NULL;
        if (!(object = PyUnicodeDecodeError_GetObject(exc)))
            return NULL;
        if (!(p = (unsigned char*)PyBytes_AsString(object))) {
            Py_DECREF(object);
            return NULL;
        }
        /* Try decoding a single surrogate character. If
           there are more, let the codec call us again. */
        p += start;
        /* All three bytes must be present and must ALL match the
           1110xxxx 10xxxxxx 10xxxxxx pattern.  Checking the remaining
           length first keeps p[1] and p[2] inside the buffer. */
        if (PyBytes_GET_SIZE(object) - start >= 3 &&
            (p[0] & 0xf0) == 0xe0 &&
            (p[1] & 0xc0) == 0x80 &&
            (p[2] & 0xc0) == 0x80) {
            /* it's a three-byte code */
            ch = ((p[0] & 0x0f) << 12) + ((p[1] & 0x3f) << 6) + (p[2] & 0x3f);
            if (ch < 0xd800 || ch > 0xdfff)
                /* it's not a surrogate - fail */
                ch = 0;
        }
        Py_DECREF(object);
        /* ch == 0 doubles as the "could not decode a surrogate" marker. */
        if (ch == 0) {
            PyErr_SetObject(PyExceptionInstance_Class(exc), exc);
            return NULL;
        }
        return Py_BuildValue("(u#n)", &ch, 1, start+3);
    }
    else {
        wrong_exception_type(exc);
        return NULL;
    }
}
|
||||
|
||||
|
||||
static PyObject *strict_errors(PyObject *self, PyObject *exc)
|
||||
{
|
||||
return PyCodec_StrictErrors(exc);
|
||||
|
@ -777,6 +856,11 @@ static PyObject *backslashreplace_errors(PyObject *self, PyObject *exc)
|
|||
return PyCodec_BackslashReplaceErrors(exc);
|
||||
}
|
||||
|
||||
/* Adapter with the (self, arg) calling shape used for the "surrogates"
   handler entry: self is ignored and exc is forwarded to
   PyCodec_SurrogateErrors(), whose result is returned unchanged. */
static PyObject *surrogates_errors(PyObject *self, PyObject *exc)
{
    PyObject *handled = PyCodec_SurrogateErrors(exc);

    return handled;
}
|
||||
|
||||
static int _PyCodecRegistry_Init(void)
|
||||
{
|
||||
static struct {
|
||||
|
@ -823,6 +907,14 @@ static int _PyCodecRegistry_Init(void)
|
|||
backslashreplace_errors,
|
||||
METH_O
|
||||
}
|
||||
},
|
||||
{
|
||||
"surrogates",
|
||||
{
|
||||
"surrogates",
|
||||
surrogates_errors,
|
||||
METH_O
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
|
|
@ -312,7 +312,9 @@ w_object(PyObject *v, WFILE *p)
|
|||
}
|
||||
else if (PyUnicode_CheckExact(v)) {
|
||||
PyObject *utf8;
|
||||
utf8 = PyUnicode_AsUTF8String(v);
|
||||
utf8 = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(v),
|
||||
PyUnicode_GET_SIZE(v),
|
||||
"surrogates");
|
||||
if (utf8 == NULL) {
|
||||
p->depth--;
|
||||
p->error = WFERR_UNMARSHALLABLE;
|
||||
|
@ -810,7 +812,7 @@ r_object(RFILE *p)
|
|||
retval = NULL;
|
||||
break;
|
||||
}
|
||||
v = PyUnicode_DecodeUTF8(buffer, n, NULL);
|
||||
v = PyUnicode_DecodeUTF8(buffer, n, "surrogates");
|
||||
PyMem_DEL(buffer);
|
||||
retval = v;
|
||||
break;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue